Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions reg-tests/mcli/mcli_master_socket_leak.vtc
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
varnishtest "Bug fix: master CLI connection slots freed on client disconnect"

# Regression test for a master CLI socket connection leak in master-worker mode.
#
# mworker_proxy has a fixed maxconn of 10. When clients connect to the master
# socket, send a command that is forwarded to a busy worker, and then close
# the connection due to a client-side receive timeout, haproxy must free each
# slot as the client disconnects.
#
# Without the fix: slots remain occupied after the client disconnects, so the
# master CLI becomes unreachable once all 10 slots are filled.
# With the fix: slots are freed on client disconnect and the master CLI keeps
# accepting new connections.
#
# The worker is made unresponsive using "debug dev delay" (expert-mode) with
# nbthread 1 so a single delay blocks the entire worker CLI. This avoids
# using SIGSTOP/SIGCONT which can be unreliable in CI environments.

#REGTEST_TYPE=bug

# Skip the test unless the external tools used by the shell block exist.
feature cmd "command -v socat"
feature cmd "command -v timeout"
# Leave dollar-brace sequences that are not vtest macros untouched, such as
# the HAPROXY_TEST_TIMEOUT expansions in the haproxy configuration below.
feature ignore_unknown_macro

# Target of backend "be"; it carries no request/response script.
server s1 {
} -start

# haproxy instance under test. NOTE(review): -W / -S presumably select
# master-worker mode and the master CLI that the h1_mcli macros point at --
# confirm against the vtest documentation.
haproxy h1 -W -S -conf {
global
# One thread only, so a single "debug dev delay" stalls the whole worker CLI.
nbthread 1

defaults
mode http
# Each timeout falls back to 5s when HAPROXY_TEST_TIMEOUT is not set.
timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
timeout server "${HAPROXY_TEST_TIMEOUT-5s}"

frontend fe
bind "fd@${fe}"
default_backend be

backend be
server s1 ${s1_addr}:${s1_port}
} -start

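# Fill all 10 master CLI slots (mworker_proxy->maxconn is hardcoded to 10).
# Each socat sends "expert-mode on" followed by "@1 debug dev delay 10000",
# which blocks the single worker thread for 10 s. The timeout(1) wrapper
# caps each client at 2 s, simulating a client-side receive timeout.
# NOTE(review): socat's default -t is 0.5 s, so once stdin reaches EOF socat
# probably exits on its own before timeout(1) fires -- confirm whether the
# 2 s kill is ever reached. "wait" ensures all background processes have
# exited before proceeding.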
shell {
    # Start ten clients in parallel, one subshell each, using a plain
    # counter loop.
    n=0
    while [ "$n" -lt 10 ]; do
        n=$((n + 1))
        (
            printf "expert-mode on\n@1 debug dev delay 10000\n" \
            | timeout 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null
        ) &
    done

    # Barrier: do not go on until every background client has exited.
    wait
}

# This is the key assertion: after all 10 clients have disconnected, a new
# connection to the master CLI must succeed. With the bug all 10 slots are
# still marked occupied and this connect is refused or times out.
# Accept any "<major>.<minor>" version string rather than a literal "3",
# so the check does not depend on the major version of the haproxy binary
# under test. "[.]" is used for a literal dot to avoid backslash escapes
# inside a quoted vtest string.
haproxy h1 -mcli {
    send "show version"
    expect ~ "[0-9]+[.][0-9]+"
}

106 changes: 106 additions & 0 deletions reg-tests/mcli/mcli_master_socket_leak_sigstop.vtc
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
varnishtest "Master CLI slot leak: frozen worker (SIGSTOP) — fix verified"

# Companion to mcli_master_socket_leak.vtc. That test uses
# "debug dev delay" to block the worker INSIDE its CLI handler. This
# test goes further: it freezes the worker process at the OS level via
# SIGSTOP BEFORE any byte flows back through the master->worker
# sockpair. This was raised as a possible gap in the fix, on the
# theory that the backend stconn's `lra` (last-read-activity) field
# might still be TICK_ETERNITY when the client disconnects, in which
# case tick_add(lra, ioto) == TICK_ETERNITY and timeout server-fin
# never fires, so the slot would still leak.
#
# Empirical result (run locally against fix f218e2252 on top of
# 3.4-dev9): slots are freed within ~2s of client disconnect even with
# the worker SIGSTOPped before any traffic. The 11th connection
# attempt while the worker was still frozen succeeded in ~0.5s.
#
# Likely explanation: `lra` is updated when the backend stconn enters
# the EST state during sockpair setup, which happens at the kernel
# level immediately (sockpairs are pre-connected) — independently of
# the worker actually reading any byte. SIGSTOP'ing the worker does
# not unwind that master-side state, so by the time sc_set_hcto()
# arms timeout server-fin on client disconnect, lra is already finite
# and tick_add() yields a real expiry ~1s in the future.
#
# This test is kept as a local diagnostic. It is NOT recommended for
# upstream submission: the existing master_socket_leak test explicitly
# avoids SIGSTOP/SIGCONT for CI reliability reasons.

#REGTEST_TYPE=bug

# Skip the test unless the external tools used by the shell block exist.
feature cmd "command -v socat"
feature cmd "command -v timeout"
feature cmd "command -v awk"
# Leave dollar-brace sequences that are not vtest macros untouched, such as
# the HAPROXY_TEST_TIMEOUT expansions in the haproxy configuration below.
feature ignore_unknown_macro

# Target of backend "be"; it carries no request/response script.
server s1 {
} -start

# haproxy instance under test. NOTE(review): -W / -S presumably select
# master-worker mode and the master CLI that the h1_mcli macros point at --
# confirm against the vtest documentation.
haproxy h1 -W -S -conf {
global
# NOTE(review): presumably one thread to mirror the companion
# mcli_master_socket_leak test -- confirm it is still needed here.
nbthread 1

defaults
mode http
# Each timeout falls back to 5s when HAPROXY_TEST_TIMEOUT is not set.
timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
timeout server "${HAPROXY_TEST_TIMEOUT-5s}"

frontend fe
bind "fd@${fe}"
default_backend be

backend be
server s1 ${s1_addr}:${s1_port}
} -start

# Discover the worker PID via the master CLI, freeze it with SIGSTOP
# BEFORE any client traffic, fill all 10 slots with clients that get
# killed after 2s, then verify the master CLI is still reachable while
# the worker is still frozen. The trap ensures we never leave a
# stopped worker behind.
shell {
    set -e

    # Ask the master for its process table and keep the first numeric PID
    # found on a line that mentions "worker".
    WORKER_PID=$(printf "show proc\n" \
        | socat -t2 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null \
        | awk '$1 ~ /^[0-9]+$/ && $0 ~ /worker/ { print $1; exit }')
    test -n "$WORKER_PID" || { echo "FAIL: could not find worker pid"; exit 1; }
    echo "worker pid: $WORKER_PID"

    # Installed before the STOP so that every exit path resumes the worker.
    trap 'kill -CONT '"$WORKER_PID"' 2>/dev/null || true' EXIT INT TERM

    kill -STOP "$WORKER_PID"

    # Fill the 10 slots with clients that are killed after 2s at the latest
    # (one extra second of grace before SIGKILL).
    SOCAT_PIDS=""
    for i in $(seq 1 10); do
        (printf "@1 show info\n" \
            | timeout --kill-after=1 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - \
            2>/dev/null) &
        SOCAT_PIDS="$SOCAT_PIDS $!"
    done
    # A client ending with a non-zero status is expected here and must not
    # trip "set -e".
    for pid in $SOCAT_PIDS; do
        wait "$pid" 2>/dev/null || true
    done

    # The key assertion: 11th connection MUST succeed while worker is
    # still SIGSTOPped. If the lra==TICK_ETERNITY concern were real,
    # this would hang or fail.
    #
    # stderr is deliberately not captured into RESULT: socat diagnostics
    # typically embed the dotted target address, which would itself satisfy
    # the version regex below. It is left on the shell's stderr instead.
    # The trailing "|| true" keeps "set -e" from aborting on a failed or
    # timed-out socat before the FAIL diagnostic can be printed.
    RESULT=$(printf "show version\n" \
        | timeout --kill-after=1 5 socat -t4 TCP:${h1_mcli_addr}:${h1_mcli_port} -) || true
    echo "11th connection result: $RESULT"
    echo "$RESULT" | grep -qE '[0-9]+\.[0-9]+' || {
        echo "FAIL: master CLI unreachable while worker still SIGSTOPped"
        exit 1
    }

    # Normal path: resume the worker explicitly, then drop the safety trap.
    kill -CONT "$WORKER_PID"
    trap - EXIT INT TERM
}

# Sanity check: master CLI works after worker resumed.
# Require a "<major>.<minor>" version string in the reply. A bare "." would
# match any non-empty answer, including an error message. "[.]" is used for
# a literal dot to avoid backslash escapes inside a quoted vtest string.
haproxy h1 -mcli {
    send "show version"
    expect ~ "[0-9]+[.][0-9]+"
}
5 changes: 5 additions & 0 deletions reg-tests/mcli/mcli_reload_no_timeout.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Init hook that holds worker start-up back by 2s, so the test can check
-- that serverfin does not kill the reload command while the new worker
-- is still starting.
local function stall_startup()
    os.execute("sleep 2")
end

core.register_init(stall_startup)
65 changes: 65 additions & 0 deletions reg-tests/mcli/mcli_reload_no_timeout.vtc
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
varnishtest "Verify reload via master CLI is not affected by serverfin timeout"

# Regression test: the timeout server-fin on the MASTER proxy must not
# apply to locally-handled commands (applets). A reload command is
# forwarded to the master applet and may take longer than 1s while the
# new worker is starting. Without the fix, the pcli response analyser
# would fire a read timeout and return "Can't connect to the target CLI!"
# instead of the reload status.

#REQUIRE_OPTIONS=LUA
#REGTEST_TYPE=bug

# Skip the test unless socat exists and the haproxy binary reports +LUA in
# its -vv output.
feature cmd "command -v socat"
feature cmd "$HAPROXY_PROGRAM -vv | grep -q '+LUA'"
# Leave dollar-brace sequences that are not vtest macros untouched, such as
# the HAPROXY_TEST_TIMEOUT expansions in the haproxy configuration below.
feature ignore_unknown_macro

# Target of backend "be": scripted to receive one request and send one reply.
server s1 {
rxreq
txresp
} -start

# haproxy instance under test. NOTE(review): -W / -S presumably select
# master-worker mode and the master CLI that the h1_mcli macros point at --
# confirm against the vtest documentation.
haproxy h1 -W -S -conf {
global
# NOTE(review): presumably set explicitly to avoid a start-up warning when
# Lua is loaded -- confirm.
tune.lua.bool-sample-conversion normal
# Lua init hook that sleeps 2s, delaying worker readiness on reload
lua-load ${testdir}/mcli_reload_no_timeout.lua

defaults
mode http
# Each timeout falls back to 5s when HAPROXY_TEST_TIMEOUT is not set.
timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
timeout server "${HAPROXY_TEST_TIMEOUT-5s}"

frontend fe
bind "fd@${fe}"
default_backend be

backend be
server s1 ${s1_addr}:${s1_port}
} -start

# Issue a reload via socat as a second command (after "show version").
# The first command causes CF_AUTO_CLOSE to be set on the request channel.
# When socat half-closes its write side after sending, process_stream()
# triggers sc_shutdown(scb) -> sc_set_hcto() which applies serverfin=1s
# to the backend applet. Without the fix, the reload response (which comes
# from the master applet) would be killed by this 1s timeout if the new
# worker takes time to start, returning "Can't connect to the target CLI!".
shell {
    # Send both commands over one connection and collect the whole reply;
    # socat waits up to 10s for the answer once its input is exhausted.
    reply=$(printf "show version\nreload\n" | socat -t10 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
    echo "Got: $reply"

    # The reload status must report success.
    if ! echo "$reply" | grep -q "Success=1"; then
        echo "FAIL: reload did not succeed. Got: $reply"
        exit 1
    fi
}

# Verify the master CLI is still functional after reload
shell {
    RESULT=$(printf "show version\n" | socat -t5 TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null)
    # Accept any "<major>.<minor>" version string rather than a literal "3.",
    # so the check does not depend on the major version of the haproxy
    # binary under test.
    echo "$RESULT" | grep -qE '[0-9]+\.[0-9]+' || {
        echo "FAIL: show version failed. Got: $RESULT"
        exit 1
    }
}
Loading
Loading