From 53b9d779acc7f9c46f497511a1218ddfbdcce5da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E6=99=8F=E4=B8=9E?=
 <133100427+YanChen0819@users.noreply.github.com>
Date: Tue, 26 May 2026 12:03:44 +0800
Subject: [PATCH] fix(daemon): widen HTTP keep-alive on the daemon listener
 (#2557)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(daemon): widen HTTP keep-alive so SSE survives idle gaps

The daemon's `/api/runs/:id/events` SSE stream emits an in-band
`: keepalive` comment every 25s (`SSE_KEEPALIVE_INTERVAL_MS`), but
Node's default `server.keepAliveTimeout` is 5_000ms. When a run is
quiet for more than five seconds — e.g. the agent is still composing,
or the user briefly walks away — Node closes the underlying TCP
connection from under the SSE writer, the next 25s ping lands on a
dead socket, and the browser surfaces it as a generic
"network error" mid-stream.

This is most visible behind any keep-alive-aware middlebox (the
nginx running in the desktop bundle, the socat/docker bridges users
set up for remote access, EC2 security-group idle timers): the
default 5s window is shorter than every reasonable in-band keepalive
cadence, so the connection dies before the application gets a chance
to assert it's still alive.

Set the listener to:

- `keepAliveTimeout = 120_000` — 4.8× the in-band keepalive, plenty
  of slack for clock skew and slow flushes.
- `headersTimeout = 125_000` — must exceed `keepAliveTimeout` per the
  Node docs, otherwise a misbehaving client can stall request parsing
  indefinitely.
- `requestTimeout = 0` — disable the per-request timeout entirely;
  an SSE response intentionally runs for as long as the agent runs.

Verified by curling
`/api/runs/<id>/events` from inside the daemon container and
watching the connection stay open through three full 25s keepalive
cycles where it previously RST'd at ~5s.

* fix(daemon): address PR #2557 review — drop requestTimeout, add regression test

Three changes responding to @PerishCode's review (#2557):

1. Drop `server.requestTimeout = 0`. The reviewer is correct: that knob
   bounds how long the server waits to *receive* a complete request
   (headers + body) and is cleared the moment the request is fully
   parsed — it does not gate the duration of an SSE response. Setting
   it to 0 only removes Node 18+'s default 300s slow-loris guard, which
   is a real regression on a daemon that binds to 0.0.0.0 / Tailscale.

2. Rewrite the comment block. The previous comment claimed
   `keepAliveTimeout` "closes any idle SSE connection." Per the Node
   docs, `keepAliveTimeout` arms *after* a response finishes writing —
   it bounds the between-request idle gap on a kept-alive socket, not
   an in-flight streaming response. SSE drops mid-stream are almost
   always middlebox idle timers (nginx, socat/docker, EC2 NAT), not
   Node's own socket timeout, and this listener-side change cannot
   extend a connection past those middleboxes.

   What this PR actually fixes: routine kept-alive sockets used around
   an SSE stream (status polls, run-status fetches, the initial GET
   before the SSE upgrade) surviving normal client pauses. 120s gives
   comfortable headroom over the 25s in-band cadence so chat clients
   stop reconnect-storming between bursts.

3. Add `apps/daemon/tests/server-keepalive.test.ts` so a future
   refactor cannot silently restore the Node defaults. The test uses
   the existing `startServer({ port: 0, returnServer: true })` fixture
   (mirroring version-route.test.ts) and asserts the listener's
   `keepAliveTimeout` and `headersTimeout` invariants.

Verified:
- pnpm --filter @open-design/daemon run typecheck passes
- pnpm vitest run tests/server-keepalive.test.ts → 2 passed
---
 apps/daemon/src/server.ts                  | 26 +++++++++++++++
 apps/daemon/tests/server-keepalive.test.ts | 37 ++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 apps/daemon/tests/server-keepalive.test.ts
diff --git a/apps/daemon/src/server.ts b/apps/daemon/src/server.ts
index c3c2c7bf2..fc82e068c 100644
--- a/apps/daemon/src/server.ts
+++ b/apps/daemon/src/server.ts
@@ -12411,6 +12411,32 @@ export async function startServer({
     let server;
     try {
       server = app.listen(port, host, () => {
+        // Widen the between-request idle window so kept-alive sockets
+        // belonging to chat/SSE clients survive the gaps between bursts.
+        //
+        // Node's `keepAliveTimeout` (default 5s) only arms *after* a
+        // response finishes writing, bounding the idle gap before the next
+        // request on the same socket — it does not fire while an SSE
+        // response is still streaming. A streaming `/api/runs/:id/events`
+        // response stays open until the agent finishes, so middlebox idle
+        // timers (nginx, socat/docker bridges, EC2 SG NAT) are typically
+        // the proximate cause when an SSE stream drops; this listener-
+        // side change cannot extend a connection past those middleboxes.
+        //
+        // What it *does* fix: chat clients that pipeline multiple requests
+        // on the same TCP socket (status polls, run-status fetches, the
+        // initial GET before the SSE upgrade). With the default 5s window
+        // a sluggish client can lose the connection between two normal
+        // calls and reconnect-storm. 120s aligns with the in-band
+        // SSE_KEEPALIVE_INTERVAL_MS (25s) so kept-alive sockets used
+        // around an SSE stream stay warm across reasonable client pauses.
+        //
+        // `headersTimeout` must exceed `keepAliveTimeout` per the Node
+        // docs; otherwise a slow-loris client can stall request parsing.
+        if (server) {
+          server.keepAliveTimeout = 120_000;
+          server.headersTimeout = 125_000;
+        }
         const address = server.address();
         // `address()` can in theory return `string | AddressInfo | null`. For
         // a TCP listener it's always `AddressInfo` with a `.port` — the guard
diff --git a/apps/daemon/tests/server-keepalive.test.ts b/apps/daemon/tests/server-keepalive.test.ts
new file mode 100644
index 000000000..d54dd3bc6
--- /dev/null
+++ b/apps/daemon/tests/server-keepalive.test.ts
@@ -0,0 +1,37 @@
+import type http from 'node:http';
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+import { startServer } from '../src/server.js';
+
+// Regression guard for the HTTP keep-alive widening landed in #2557.
+// AGENTS.md → "Bug follow-up workflow" asks risk/high daemon bugfixes to
+// lead with a falsifiable spec. The bug being fixed is a 5s default
+// `server.keepAliveTimeout` that's tighter than the in-band SSE keepalive
+// cadence (`SSE_KEEPALIVE_INTERVAL_MS = 25_000`) — a future refactor of
+// the `listen` callback could silently restore the Node default and the
+// regression would not surface in any other test.
+describe('startServer HTTP keep-alive tuning', () => {
+  let server: http.Server;
+
+  beforeAll(async () => {
+    const started = await startServer({ port: 0, returnServer: true }) as {
+      url: string;
+      server: http.Server;
+    };
+    server = started.server;
+  });
+
+  afterAll(() => new Promise<void>((resolve) => server.close(() => resolve())));
+
+  it('widens keepAliveTimeout above the in-band SSE keepalive (25s)', () => {
+    // SSE_KEEPALIVE_INTERVAL_MS is 25_000; the listener must exceed it
+    // comfortably so kept-alive sockets used around an SSE stream survive
+    // routine client pauses.
+    expect(server.keepAliveTimeout).toBeGreaterThanOrEqual(60_000);
+  });
+
+  it('keeps headersTimeout above keepAliveTimeout per Node convention', () => {
+    // Node docs: `headersTimeout` must exceed `keepAliveTimeout`, otherwise
+    // a slow-loris client can stall request parsing indefinitely.
+    expect(server.headersTimeout).toBeGreaterThan(server.keepAliveTimeout);
+  });
+});