sap-contributions · alexanderstephan · May 8, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/reg-tests/mcli/mcli_master_socket_leak.vtc b/reg-tests/mcli/mcli_master_socket_leak.vtc
@@ -0,0 +1,66 @@
+varnishtest "Bug fix: master CLI connection slots freed on client disconnect"
+
+# Regression test for a master CLI socket connection leak in master-worker mode.
+#
+# mworker_proxy has a fixed maxconn of 10.  When clients connect to the master
+# socket, send a command that is forwarded to a busy worker, and then close
+# the connection due to a client-side receive timeout, haproxy must free each
+# slot as the client disconnects.
+#
+# Without the fix: slots remain occupied after the client disconnects, so the
+# master CLI becomes unreachable once all 10 slots are filled.
+# With the fix: slots are freed on client disconnect and the master CLI keeps
+# accepting new connections.
+#
+# The worker is made unresponsive using "debug dev delay" (expert-mode) with
+# nbthread 1 so a single delay blocks the entire worker CLI.  This avoids
+# using SIGSTOP/SIGCONT which can be unreliable in CI environments.
+
+#REGTEST_TYPE=bug
+
+feature cmd "command -v socat"
+feature cmd "command -v timeout"
+feature ignore_unknown_macro
+
+server s1 {
+} -start
+
+haproxy h1 -W -S -conf {
+    global
+        nbthread 1  # single thread: one debug dev delay blocks the whole worker CLI
+
+    defaults
+        mode http
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client  "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout server  "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    frontend fe
+        bind "fd@${fe}"
+        default_backend be
+
+    backend be
+        server s1 ${s1_addr}:${s1_port}
+} -start
+
+# Fill all 10 master CLI slots (mworker_proxy->maxconn is hardcoded to 10).
+# Each socat sends "expert-mode on" followed by "@1 debug dev delay 10000"
+# which blocks the single worker thread for 10 s.  After 2 s the timeout(1)
+# wrapper kills socat, simulating a client-side receive timeout.  "wait"
+# ensures all background processes have exited before proceeding.
+shell {
+    # One backgrounded client per master CLI slot; timeout(1) kills each one.
+    for slot in 1 2 3 4 5 6 7 8 9 10; do
+        { printf "expert-mode on\n@1 debug dev delay 10000\n" |
+          timeout 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null; } &
+    done; wait  # barrier: every client has exited before the test continues
+}
+
+# Key assertion: after all 10 clients disconnect, a new master CLI connection
+# must succeed (with the bug it is refused or times out).  The version is
+# matched generically so the test does not break on a major version bump.
+haproxy h1 -mcli {
+    send "show version"
+    expect ~ "[0-9]+[.][0-9]+"
+}
+
diff --git a/reg-tests/mcli/mcli_master_socket_leak_sigstop.vtc b/reg-tests/mcli/mcli_master_socket_leak_sigstop.vtc
@@ -0,0 +1,106 @@
+varnishtest "Master CLI slot leak: frozen worker (SIGSTOP) — fix verified"
+
+# Companion to mcli_master_socket_leak.vtc.  That test uses
+# "debug dev delay" to block the worker INSIDE its CLI handler.  This
+# test goes further: it freezes the worker process at the OS level via
+# SIGSTOP BEFORE any byte flows back through the master->worker
+# sockpair.  This was raised as a possible gap in the fix, on the
+# theory that the backend stconn's `lra` (last-read-activity) field
+# might still be TICK_ETERNITY when the client disconnects, in which
+# case tick_add(lra, ioto) == TICK_ETERNITY and timeout server-fin
+# never fires, so the slot would still leak.
+#
+# Empirical result (run locally against fix f218e2252 on top of
+# 3.4-dev9): slots are freed within ~2s of client disconnect even with
+# the worker SIGSTOPped before any traffic.  The 11th connection
+# attempt while the worker was still frozen succeeded in ~0.5s.
+#
+# Likely explanation: `lra` is updated when the backend stconn enters
+# the EST state during sockpair setup, which happens at the kernel
+# level immediately (sockpairs are pre-connected) — independently of
+# the worker actually reading any byte.  SIGSTOP'ing the worker does
+# not unwind that master-side state, so by the time sc_set_hcto()
+# arms timeout server-fin on client disconnect, lra is already finite
+# and tick_add() yields a real expiry ~1s in the future.
+#
+# This test is kept as a local diagnostic.  It is NOT recommended for
+# upstream submission: mcli_master_socket_leak.vtc explicitly avoids
+# SIGSTOP/SIGCONT for CI reliability reasons.
+
+#REGTEST_TYPE=bug
+
+feature cmd "command -v socat"
+feature cmd "command -v timeout"
+feature cmd "command -v awk"
+feature ignore_unknown_macro
+
+server s1 {
+} -start
+
+haproxy h1 -W -S -conf {
+    global
+        nbthread 1  # NOTE(review): presumably kept for parity with mcli_master_socket_leak.vtc - confirm
+
+    defaults
+        mode http
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client  "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout server  "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    frontend fe
+        bind "fd@${fe}"
+        default_backend be
+
+    backend be
+        server s1 ${s1_addr}:${s1_port}
+} -start
+
+# Discover the worker PID via the master CLI, freeze it with SIGSTOP
+# BEFORE any client traffic, fill all 10 slots with clients that get
+# killed after 2s, then verify the master CLI is still reachable while
+# the worker is still frozen.  The trap ensures we never leave a
+# stopped worker behind.
+shell {
+    set -e
+    WORKER_PID=$(printf "show proc\n" \
+        | socat -t2 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null \
+        | awk '$1 ~ /^[0-9]+$/ && $0 ~ /worker/ { print $1; exit }')
+    test -n "$WORKER_PID" || { echo "FAIL: could not find worker pid"; exit 1; }
+    echo "worker pid: $WORKER_PID"
+
+    trap 'kill -CONT '"$WORKER_PID"' 2>/dev/null || true' EXIT INT TERM
+
+    kill -STOP "$WORKER_PID"
+
+    SOCAT_PIDS=""
+    for i in $(seq 1 10); do
+        (printf "@1 show info\n" \
+         | timeout --kill-after=1 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - \
+           2>/dev/null) &
+        SOCAT_PIDS="$SOCAT_PIDS $!"
+    done
+    for pid in $SOCAT_PIDS; do
+        wait "$pid" 2>/dev/null || true
+    done
+
+    # Key assertion: the 11th connection MUST succeed while the worker is
+    # still SIGSTOPped.  "|| true" lets a timeout reach the FAIL message;
+    # "^" keeps an IP address in a socat error (2>&1) from matching.
+    RESULT=$(printf "show version\n" \
+        | timeout --kill-after=1 5 socat -t4 TCP:${h1_mcli_addr}:${h1_mcli_port} - \
+          2>&1) || true
+    echo "11th connection result: $RESULT"
+    echo "$RESULT" | grep -qE '^[0-9]+\.[0-9]+' || {
+        echo "FAIL: master CLI unreachable while worker still SIGSTOPped"
+        exit 1
+    }
+
+    kill -CONT "$WORKER_PID"
+    trap - EXIT INT TERM
+}
+
+# Sanity check: master CLI still returns a version string after resume.
+haproxy h1 -mcli {
+    send "show version"
+    expect ~ "[0-9]+[.][0-9]+"
+}
diff --git a/reg-tests/mcli/mcli_reload_no_timeout.lua b/reg-tests/mcli/mcli_reload_no_timeout.lua
@@ -0,0 +1,5 @@
+-- Stall worker init for 2s so a reload outlasts the serverfin timeout under test.
+local function stall_init()
+    os.execute("sleep 2")
+end
+core.register_init(stall_init)
diff --git a/reg-tests/mcli/mcli_reload_no_timeout.vtc b/reg-tests/mcli/mcli_reload_no_timeout.vtc
@@ -0,0 +1,65 @@
+varnishtest "Verify reload via master CLI is not affected by serverfin timeout"
+
+# Regression test: the timeout server-fin on the MASTER proxy must not
+# apply to locally-handled commands (applets).  A reload command is
+# forwarded to the master applet and may take longer than 1s while the
+# new worker is starting.  Without the fix, the pcli response analyser
+# would fire a read timeout and return "Can't connect to the target CLI!"
+# instead of the reload status.
+
+#REQUIRE_OPTIONS=LUA
+#REGTEST_TYPE=bug
+
+feature cmd "command -v socat"
+feature cmd "$HAPROXY_PROGRAM -vv | grep -q '+LUA'"
+feature ignore_unknown_macro
+
+server s1 {
+    rxreq
+    txresp
+} -start
+
+haproxy h1 -W -S -conf {
+    global
+        tune.lua.bool-sample-conversion normal  # NOTE(review): presumably avoids a startup warning with lua-load - confirm
+        # Lua init hook that sleeps 2s, delaying worker readiness on reload
+        lua-load ${testdir}/mcli_reload_no_timeout.lua
+
+    defaults
+        mode http
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client  "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout server  "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    frontend fe
+        bind "fd@${fe}"
+        default_backend be
+
+    backend be
+        server s1 ${s1_addr}:${s1_port}
+} -start
+
+# Issue a reload via socat as a second command (after "show version").
+# The first command causes CF_AUTO_CLOSE to be set on the request channel.
+# When socat half-closes its write side after sending, process_stream()
+# triggers sc_shutdown(scb) -> sc_set_hcto() which applies serverfin=1s
+# to the backend applet.  Without the fix, the reload response (which comes
+# from the master applet) would be killed by this 1s timeout if the new
+# worker takes time to start, returning "Can't connect to the target CLI!".
+shell {
+    RESULT=$(printf "show version\nreload\n" | socat -t10 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
+    echo "Got: $RESULT"
+    echo "$RESULT" | grep -q "Success=1" || {
+        echo "FAIL: reload did not succeed. Got: $RESULT"
+        exit 1
+    }
+}
+
+# Verify the master CLI is still functional after reload
+shell {
+    RESULT=$(printf "show version\n" | socat -t5 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
+    echo "$RESULT" | grep -q "3\." || {
+        echo "FAIL: show version failed. Got: $RESULT"
+        exit 1
+    }
+}