From 51ad525f5a2c26daddf616b44898eb677acb92cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Thu, 21 May 2026 19:36:19 +0800 Subject: [PATCH 1/7] fix(daemon): give GitHub Copilot a longer chat-run inactivity ceiling (#2467) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot CLI goes silent (no stdout, no streamed events) for long stretches during legitimate deck-generation and large-prompt turns — the model is still working but the CLI does not emit keepalive frames. The 600s chat-run inactivity watchdog used to kill those runs as `stalled` even though the agent was healthy; the user's reported "stuck after 10 mins and a few seconds" plus the need to type `Continue` to get the files lines up exactly with the watchdog tripping mid-turn and the next message reattaching to the still-live session. Surface an optional `inactivityTimeoutMs` field on `RuntimeAgentDef`, merge it under the existing `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` env override, and set Copilot to 30 minutes. The env override still wins so operators can shrink it when diagnosing a runaway session, and the clamp / 24-hour ceiling apply to both the env path and the def path so neither can schedule a negative or signed-32-bit-overflowing setTimeout delay. Closes #2467. --- apps/daemon/src/runtimes/defs/copilot.ts | 10 ++ apps/daemon/src/runtimes/types.ts | 8 ++ apps/daemon/src/server.ts | 32 ++++--- .../tests/chat-run-inactivity-timeout.test.ts | 93 +++++++++++++++++++ 4 files changed, 132 insertions(+), 11 deletions(-) create mode 100644 apps/daemon/tests/chat-run-inactivity-timeout.test.ts diff --git a/apps/daemon/src/runtimes/defs/copilot.ts b/apps/daemon/src/runtimes/defs/copilot.ts index 46a927bfe..0c1827c58 100644 --- a/apps/daemon/src/runtimes/defs/copilot.ts +++ b/apps/daemon/src/runtimes/defs/copilot.ts @@ -67,4 +67,14 @@ export const copilotAgentDef = { }, promptViaStdin: true, streamFormat: 'copilot-stream-json', + // GitHub Copilot's deck-generation and large-prompt turns go silent + // (no stdout, no streamed events) for stretches that exceed the + // 10-minute global default — the model is still working but the + // CLI does not emit keepalive frames. The default watchdog used to + // kill those runs as `stalled` even though the agent was healthy + // (issue #2467: "GitHub Copilot agent getting stuck after 10 mins + // and few seconds"). 30 minutes gives the heavy turns room to land + // while still bounding genuine hangs; operators can override via + // `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` if they want it tighter. + inactivityTimeoutMs: 30 * 60 * 1000, } satisfies RuntimeAgentDef; diff --git a/apps/daemon/src/runtimes/types.ts b/apps/daemon/src/runtimes/types.ts index 48eb927bb..d9ebe2bb1 100644 --- a/apps/daemon/src/runtimes/types.ts +++ b/apps/daemon/src/runtimes/types.ts @@ -162,6 +162,14 @@ export type RuntimeAgentDef = { // present in the daemon's `process.env`; Settings-UI per-agent env // values only reach the spawned child and are NOT consulted here. defaultModelEnvVar?: string; + // Agent-recommended override for the chat-run inactivity watchdog. + // The watchdog observes child stdout/stderr/SSE activity, not real + // CPU progress, so agents whose CLIs go silent for long stretches + // during legitimate work (e.g. Copilot's deck-generation thinking + // phase from #2467) need a longer ceiling than the 10-minute global + // default. Operators can still override per-process via + // `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` — that env wins. + inactivityTimeoutMs?: number; }; export type DetectedAgent = Omit< diff --git a/apps/daemon/src/server.ts b/apps/daemon/src/server.ts index 7a195e137..76fa4757d 100644 --- a/apps/daemon/src/server.ts +++ b/apps/daemon/src/server.ts @@ -3738,16 +3738,26 @@ const MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS = 24 * 60 * 60 * 1000; // always means the agent is winding down or hanging. See #1451. const DEFAULT_CHAT_RUN_ARTIFACT_QUIET_PERIOD_MS = 60 * 1000; -function resolveChatRunInactivityTimeoutMs() { - const raw = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS); - // This watchdog observes child stdout/stderr/SSE activity, not real CPU or - // filesystem progress. Keep the default long enough for agents that spend - // several minutes silently writing large artifacts. - if (!Number.isFinite(raw)) return DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS; - // Node clamps delays larger than a signed 32-bit integer down to 1ms, which - // makes an oversized override fail almost immediately while reporting a huge - // timeout. Keep explicit overrides bounded to a practical, timer-safe value. - return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(raw))); +// Resolve the chat-run inactivity watchdog ceiling. Priority order: +// 1. `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` (operator escape hatch). +// 2. The agent runtime def's `inactivityTimeoutMs` recommendation — +// lets agents whose CLIs go silent for long stretches during +// legitimate work (e.g. Copilot from #2467) raise the ceiling +// without every operator having to set an env var. +// 3. The 10-minute global default. +// Both env and def values pass through the same clamp because Node +// silently downgrades signed-32-bit-overflowing setTimeout delays to +// 1ms — without the clamp an oversized hint would fire the watchdog +// almost immediately while reporting the huge timeout to the user. +export function resolveChatRunInactivityTimeoutMs(agentDefault?: number) { + const env = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS); + if (Number.isFinite(env)) { + return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(env))); + } + if (typeof agentDefault === 'number' && Number.isFinite(agentDefault)) { + return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(agentDefault))); + } + return DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS; } // Resolve the post-artifact quiet-period window. Same clamp as the outer @@ -11516,7 +11526,7 @@ export async function startServer({ // here; on this branch `send` was hoisted into the AMR preflight // earlier, so we keep only the new `runStartTimeMs` declaration. const runStartTimeMs = Date.now(); - const inactivityTimeoutMs = resolveChatRunInactivityTimeoutMs(); + const inactivityTimeoutMs = resolveChatRunInactivityTimeoutMs(def.inactivityTimeoutMs); const artifactQuietPeriodMs = resolveChatRunArtifactQuietPeriodMs(); const inactivityKillGraceMs = 3_000; let inactivityTimer = null; diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts new file mode 100644 index 000000000..edec9d917 --- /dev/null +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -0,0 +1,93 @@ +/** + * Per-agent inactivity-timeout resolution (#2467). + * + * The chat-run inactivity watchdog defaults to 10 minutes. Some agents + * (GitHub Copilot CLI) genuinely stay silent for longer than that on + * heavy deck-generation turns — the model is still working but emits + * no stdout, so the watchdog used to kill the run as `stalled` even + * though the agent was healthy. + * + * Runtime defs can now advertise a recommended `inactivityTimeoutMs`, + * and the resolver merges it under the env override: + * + * OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS highest priority (operator override) + * def.inactivityTimeoutMs next (agent-specific recommendation) + * DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS (10 min) global default + */ + +import { afterEach, describe, expect, it } from 'vitest'; +import { resolveChatRunInactivityTimeoutMs } from '../src/server.js'; +import { copilotAgentDef } from '../src/runtimes/defs/copilot.js'; + +const ENV_KEY = 'OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS'; +const TEN_MINUTES_MS = 10 * 60 * 1000; +const THIRTY_MINUTES_MS = 30 * 60 * 1000; +const TWENTY_FOUR_HOURS_MS = 24 * 60 * 60 * 1000; + +describe('resolveChatRunInactivityTimeoutMs', () => { + const originalEnv = process.env[ENV_KEY]; + + afterEach(() => { + if (originalEnv === undefined) { + delete process.env[ENV_KEY]; + } else { + process.env[ENV_KEY] = originalEnv; + } + }); + + it('returns the 10-minute global default when no def hint and no env override are set', () => { + delete process.env[ENV_KEY]; + expect(resolveChatRunInactivityTimeoutMs()).toBe(TEN_MINUTES_MS); + }); + + it('uses the def-level hint when env is unset', () => { + delete process.env[ENV_KEY]; + expect(resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS)).toBe(THIRTY_MINUTES_MS); + }); + + it('lets the env override take precedence over the def hint (operator escape hatch)', () => { + // Operators must be able to shrink or lengthen the watchdog for any + // agent without editing source — diagnosing flaky CLIs, taming runaway + // sessions, etc. + process.env[ENV_KEY] = '900000'; // 15 min + expect(resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS)).toBe(900_000); + }); + + it('falls back to the def hint when the env value is not a finite number', () => { + process.env[ENV_KEY] = 'not-a-number'; + expect(resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS)).toBe(THIRTY_MINUTES_MS); + }); + + it('still honors env=0 to disable the watchdog entirely', () => { + // Existing behavior the watchdog code already supports — preserve it + // even when an agent def would otherwise contribute a larger value. + process.env[ENV_KEY] = '0'; + expect(resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS)).toBe(0); + }); + + it('clamps an oversized env override to the 24-hour ceiling so Node does not fire the timer immediately', () => { + process.env[ENV_KEY] = String(TWENTY_FOUR_HOURS_MS * 100); + expect(resolveChatRunInactivityTimeoutMs()).toBe(TWENTY_FOUR_HOURS_MS); + }); + + it('clamps an oversized def hint to the 24-hour ceiling for the same reason', () => { + delete process.env[ENV_KEY]; + expect(resolveChatRunInactivityTimeoutMs(TWENTY_FOUR_HOURS_MS * 100)).toBe(TWENTY_FOUR_HOURS_MS); + }); + + it('treats a non-finite def hint as if it were absent (defends against bad runtime configs)', () => { + delete process.env[ENV_KEY]; + expect(resolveChatRunInactivityTimeoutMs(Number.NaN)).toBe(TEN_MINUTES_MS); + }); + + it('floors negative def hints to 0 rather than scheduling a negative-delay timer', () => { + delete process.env[ENV_KEY]; + expect(resolveChatRunInactivityTimeoutMs(-1)).toBe(0); + }); +}); + +describe('copilotAgentDef.inactivityTimeoutMs', () => { + it('ships a 30-minute inactivity hint so Copilot silent-thinking phases do not trip the default watchdog (#2467)', () => { + expect(copilotAgentDef.inactivityTimeoutMs).toBe(THIRTY_MINUTES_MS); + }); +}); From 48fb707b2ba48baea17ef1f5f7af8f5aa7853a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Thu, 21 May 2026 21:24:42 +0800 Subject: [PATCH 2/7] fix(daemon): keep inactivityTimeoutMs spawn-only and fast-fail on bad def hints (#2467 review follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two reviewer points from #2579: 1. RuntimeAgentDef.inactivityTimeoutMs was leaking into the public /api/agents response. DetectedAgent derived its shape via `Omit` and stripFns() spread the def through unchanged, so the new field would have drifted the daemon response away from packages/contracts/src/api/registry.ts#AgentInfo (the shared web/CLI contract). Drop it from the omit list and strip it explicitly in stripFns so the registry payload stays unchanged — agents read the hint by importing the def directly. 2. resolveChatRunInactivityTimeoutMs(agentDefault) silently normalized invalid values, so a runtime def like `inactivityTimeoutMs: -1` or `Number.NaN` would have looked fine in code review while quietly disabling the watchdog for that agent at runtime. The env path stays lenient (operator-supplied, must not crash a chat run on a typo), but the def path now throws RangeError on non-finite, negative, or non-integer values — checked-in source has to surface such typos loudly. --- apps/daemon/src/runtimes/detection.ts | 4 ++ apps/daemon/src/runtimes/types.ts | 7 ++++ apps/daemon/src/server.ts | 23 +++++++++--- .../tests/chat-run-inactivity-timeout.test.ts | 37 +++++++++++++++++-- 4 files changed, 61 insertions(+), 10 deletions(-) diff --git a/apps/daemon/src/runtimes/detection.ts b/apps/daemon/src/runtimes/detection.ts index 7a1779586..74a1b320e 100644 --- a/apps/daemon/src/runtimes/detection.ts +++ b/apps/daemon/src/runtimes/detection.ts @@ -205,6 +205,9 @@ function stripFns( // `fallbackModels` slot here too. `helpArgs` / `capabilityFlags` / // `fallbackBins` / `maxPromptArgBytes` / `env` are probe-or-spawn-only // metadata and shouldn't bleed into the API response either. + // `inactivityTimeoutMs` is a spawn-time hint for the chat-run watchdog + // and is not part of the public AgentInfo contract — strip it here so + // the runtime registry stays the only consumer. const { buildArgs, listModels, @@ -216,6 +219,7 @@ function stripFns( versionProbeTimeoutMs, maxPromptArgBytes, env, + inactivityTimeoutMs, ...rest } = def; return rest; diff --git a/apps/daemon/src/runtimes/types.ts b/apps/daemon/src/runtimes/types.ts index d9ebe2bb1..23c615788 100644 --- a/apps/daemon/src/runtimes/types.ts +++ b/apps/daemon/src/runtimes/types.ts @@ -184,6 +184,13 @@ export type DetectedAgent = Omit< | 'versionProbeTimeoutMs' | 'maxPromptArgBytes' | 'env' + // `inactivityTimeoutMs` is a spawn-time-only hint consumed by the + // chat-run watchdog. It is not part of the public `/api/agents` + // contract (`packages/contracts/src/api/registry.ts#AgentInfo`), so + // omitting it here keeps the daemon response aligned with that + // shared web/CLI shape — agents pick it up by reading the runtime + // def directly, the registry payload stays unchanged. + | 'inactivityTimeoutMs' > & { models: RuntimeModelOption[]; modelsSource: RuntimeModelSource; diff --git a/apps/daemon/src/server.ts b/apps/daemon/src/server.ts index 76fa4757d..ba23dbc2f 100644 --- a/apps/daemon/src/server.ts +++ b/apps/daemon/src/server.ts @@ -3745,17 +3745,28 @@ const DEFAULT_CHAT_RUN_ARTIFACT_QUIET_PERIOD_MS = 60 * 1000; // legitimate work (e.g. Copilot from #2467) raise the ceiling // without every operator having to set an env var. // 3. The 10-minute global default. -// Both env and def values pass through the same clamp because Node -// silently downgrades signed-32-bit-overflowing setTimeout delays to -// 1ms — without the clamp an oversized hint would fire the watchdog -// almost immediately while reporting the huge timeout to the user. +// +// The env path is lenient (silently normalizes / falls back) because it +// comes from a runtime knob an operator can mis-type at any time. The +// def path is strict (throws on non-finite or negative) because the +// value lives in checked-in source — a typo like `inactivityTimeoutMs: -1` +// should crash loudly at chat-run time rather than silently disable the +// watchdog for that agent. Both paths still pass through the 24-hour +// clamp because Node silently downgrades signed-32-bit-overflowing +// setTimeout delays to 1ms. export function resolveChatRunInactivityTimeoutMs(agentDefault?: number) { const env = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS); if (Number.isFinite(env)) { return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(env))); } - if (typeof agentDefault === 'number' && Number.isFinite(agentDefault)) { - return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(agentDefault))); + if (agentDefault !== undefined) { + if (!Number.isFinite(agentDefault) || agentDefault < 0 || !Number.isInteger(agentDefault)) { + throw new RangeError( + `RuntimeAgentDef.inactivityTimeoutMs must be a non-negative integer, got ${String(agentDefault)}. ` + + 'Fix the runtime def — invalid values used to silently disable the watchdog.', + ); + } + return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, agentDefault); } return DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS; } diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts index edec9d917..008de3faf 100644 --- a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -75,14 +75,43 @@ describe('resolveChatRunInactivityTimeoutMs', () => { expect(resolveChatRunInactivityTimeoutMs(TWENTY_FOUR_HOURS_MS * 100)).toBe(TWENTY_FOUR_HOURS_MS); }); - it('treats a non-finite def hint as if it were absent (defends against bad runtime configs)', () => { + it('throws RangeError on a non-finite def hint — runtime defs are checked-in source and a typo must fail loudly, not silently disable the watchdog', () => { + // The previous draft normalized NaN/Infinity to the global default, + // which made a runtime def like `inactivityTimeoutMs: Number.NaN` + // look fine in source review while quietly losing the agent-specific + // ceiling at runtime. The reviewer-correctness fix: fast-fail on the + // checked-in side so the bug surfaces in dev, not in production logs. delete process.env[ENV_KEY]; - expect(resolveChatRunInactivityTimeoutMs(Number.NaN)).toBe(TEN_MINUTES_MS); + expect(() => resolveChatRunInactivityTimeoutMs(Number.NaN)).toThrow( + /RuntimeAgentDef\.inactivityTimeoutMs/, + ); }); - it('floors negative def hints to 0 rather than scheduling a negative-delay timer', () => { + it('throws RangeError on a negative def hint — `-1` would silently disable the agent-specific watchdog otherwise', () => { delete process.env[ENV_KEY]; - expect(resolveChatRunInactivityTimeoutMs(-1)).toBe(0); + expect(() => resolveChatRunInactivityTimeoutMs(-1)).toThrow( + /must be a non-negative integer/, + ); + }); + + it('throws RangeError on a fractional def hint — Math.floor would mask a wrong-units typo (e.g. seconds instead of ms)', () => { + // A def value like `inactivityTimeoutMs: 30` (seconds, written + // forgetting the unit is ms) is a real footgun; refusing + // non-integer floats keeps such typos from getting silently + // floored to a 0ms or 30ms timer. Operators can still pass + // anything via the env override. + delete process.env[ENV_KEY]; + expect(() => resolveChatRunInactivityTimeoutMs(60.5)).toThrow( + /must be a non-negative integer/, + ); + }); + + it('keeps the env override lenient when its value is bogus and no def hint is provided (falls back to the global default)', () => { + // env comes from operator-supplied configuration that can be + // mistyped at any time. Unlike the def path it must not crash the + // chat run — fall back to the 10-minute default instead. + process.env[ENV_KEY] = 'not-a-number'; + expect(resolveChatRunInactivityTimeoutMs()).toBe(TEN_MINUTES_MS); }); }); From 88f92a516fd08e6e133fc00ae5739749b2e9dce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Thu, 21 May 2026 21:40:40 +0800 Subject: [PATCH 3/7] fix(daemon): validate runtime-def inactivityTimeoutMs even when env override is set (#2467 review follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer (#2579 blocking, round 2) noted that the fast-fail check added in the previous follow-up was placed *after* the env override early-return. That ordering meant a finite `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` would short-circuit the resolver before the def was ever validated, so a checked-in typo like `inactivityTimeoutMs: -1` would sit unnoticed in source for as long as anyone had the env override set — exactly the opposite of the fast-fail intent. Validate `agentDefault` unconditionally at the top of the resolver so the typo surfaces on every chat run regardless of which branch wins the priority race. Add a regression test for the `invalid def + finite env override` shape, and another sanity test that env still wins when both values are valid. --- apps/daemon/src/server.ts | 17 ++++++++++++---- .../tests/chat-run-inactivity-timeout.test.ts | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/apps/daemon/src/server.ts b/apps/daemon/src/server.ts index ba23dbc2f..cea22b86c 100644 --- a/apps/daemon/src/server.ts +++ b/apps/daemon/src/server.ts @@ -3754,11 +3754,14 @@ const DEFAULT_CHAT_RUN_ARTIFACT_QUIET_PERIOD_MS = 60 * 1000; // watchdog for that agent. Both paths still pass through the 24-hour // clamp because Node silently downgrades signed-32-bit-overflowing // setTimeout delays to 1ms. +// +// Order matters: validate the def hint *before* checking the env +// override. Otherwise a finite env value would hide a bad checked-in +// value (e.g. `inactivityTimeoutMs: -1`) from ever tripping the +// fast-fail — the typo could sit unnoticed in source until someone +// removed the override. Validation now runs on every call regardless +// of which branch ultimately wins. export function resolveChatRunInactivityTimeoutMs(agentDefault?: number) { - const env = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS); - if (Number.isFinite(env)) { - return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(env))); - } if (agentDefault !== undefined) { if (!Number.isFinite(agentDefault) || agentDefault < 0 || !Number.isInteger(agentDefault)) { throw new RangeError( @@ -3766,6 +3769,12 @@ export function resolveChatRunInactivityTimeoutMs(agentDefault?: number) { 'Fix the runtime def — invalid values used to silently disable the watchdog.', ); } + } + const env = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS); + if (Number.isFinite(env)) { + return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(env))); + } + if (agentDefault !== undefined) { return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, agentDefault); } return DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS; diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts index 008de3faf..8ad501fca 100644 --- a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -113,6 +113,26 @@ describe('resolveChatRunInactivityTimeoutMs', () => { process.env[ENV_KEY] = 'not-a-number'; expect(resolveChatRunInactivityTimeoutMs()).toBe(TEN_MINUTES_MS); }); + + it('still throws on an invalid def hint when a finite env override is also set — the env must not hide a checked-in typo', () => { + // Reviewer-correctness fix: an earlier ordering placed the env + // early-return before the def validation, so a bad runtime def + // could sit unnoticed in source whenever an operator had set + // OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS. The fast-fail now runs first + // and catches the typo regardless of which branch eventually + // wins. + process.env[ENV_KEY] = '15000'; + expect(() => resolveChatRunInactivityTimeoutMs(-1)).toThrow( + /must be a non-negative integer/, + ); + }); + + it('returns the env override when both the env and a valid def hint are provided (env wins as operator escape hatch)', () => { + // Sanity: validation order does not change priority — env still + // wins when both are valid. + process.env[ENV_KEY] = '15000'; + expect(resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS)).toBe(15_000); + }); }); describe('copilotAgentDef.inactivityTimeoutMs', () => { From ccc637a27ea3a815fbaff8ed55e2f659d3443ab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Sat, 23 May 2026 09:46:29 +0800 Subject: [PATCH 4/7] fix(daemon): fast-fail invalid def inactivityTimeoutMs at def-select time, before run setup (#2467 review follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer (#2579 non-blocking, round 3) noted that `resolveChatRunInactivityTimeoutMs(def.inactivityTimeoutMs)` is called deep in the chat-run path, *after* the runtime has already started side-effectful setup: `.mcp.json` write/unlink under the project managed cwd, auto-memory extract, prompt composition, prompt-file write, etc. A bad checked-in `inactivityTimeoutMs` therefore aborted the run only at watchdog-arm time, after that setup had already mutated local state. Extract the validation into a pure `assertValidRuntimeDefInactivityTimeoutMs(agentDefault)` helper and call it immediately after the runtime def is selected — same line as the existing `def.bin` guard. An invalid value now fails the chat run with a clear `AGENT_RUNTIME_DEF_INVALID` envelope before any filesystem or prompt-building work happens, matching the fast-fail goal of the earlier follow-up. Six new regression tests pin the assert helper's contract (undefined / valid integer / NaN / Infinity / negative / fractional / env-cannot-mask). --- apps/daemon/src/server.ts | 44 +++++++++++--- .../tests/chat-run-inactivity-timeout.test.ts | 60 ++++++++++++++++++- 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/apps/daemon/src/server.ts b/apps/daemon/src/server.ts index cea22b86c..a2c9dc47b 100644 --- a/apps/daemon/src/server.ts +++ b/apps/daemon/src/server.ts @@ -3761,15 +3761,27 @@ const DEFAULT_CHAT_RUN_ARTIFACT_QUIET_PERIOD_MS = 60 * 1000; // fast-fail — the typo could sit unnoticed in source until someone // removed the override. Validation now runs on every call regardless // of which branch ultimately wins. -export function resolveChatRunInactivityTimeoutMs(agentDefault?: number) { - if (agentDefault !== undefined) { - if (!Number.isFinite(agentDefault) || agentDefault < 0 || !Number.isInteger(agentDefault)) { - throw new RangeError( - `RuntimeAgentDef.inactivityTimeoutMs must be a non-negative integer, got ${String(agentDefault)}. ` + - 'Fix the runtime def — invalid values used to silently disable the watchdog.', - ); - } +export function assertValidRuntimeDefInactivityTimeoutMs(agentDefault?: number): void { + // Strict checked-in-config guard for `RuntimeAgentDef.inactivityTimeoutMs`. + // Exported (and called by `resolveChatRunInactivityTimeoutMs` below) + // so that chat-run startup can invoke this immediately after the + // runtime def is selected — before any filesystem or + // prompt-building work happens — and abort with a loud RangeError + // when a checked-in def carries an invalid value. That keeps a + // typo from leaving partial setup state (e.g. a `.mcp.json` write + // / unlink, a freshly-composed system prompt) on disk when the + // run then aborts later at watchdog-arm time. + if (agentDefault === undefined) return; + if (!Number.isFinite(agentDefault) || agentDefault < 0 || !Number.isInteger(agentDefault)) { + throw new RangeError( + `RuntimeAgentDef.inactivityTimeoutMs must be a non-negative integer, got ${String(agentDefault)}. ` + + 'Fix the runtime def — invalid values used to silently disable the watchdog.', + ); } +} + +export function resolveChatRunInactivityTimeoutMs(agentDefault?: number) { + assertValidRuntimeDefInactivityTimeoutMs(agentDefault); const env = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS); if (Number.isFinite(env)) { return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(env))); @@ -10775,6 +10787,22 @@ export async function startServer({ ); if (!def.bin) return design.runs.fail(run, 'AGENT_UNAVAILABLE', 'agent has no binary'); + // Validate the checked-in `inactivityTimeoutMs` hint immediately + // after the runtime def is selected and before any side-effectful + // setup (auto-memory extract, `.mcp.json` write/unlink, + // composeSystemPrompt, prompt persistence). A bad def value would + // otherwise abort the run only at watchdog-arm time, after that + // setup has already mutated local state, leaving confusing partial + // residue behind (issue #2467 review on PR #2579). + try { + assertValidRuntimeDefInactivityTimeoutMs(def.inactivityTimeoutMs); + } catch (err) { + return design.runs.fail( + run, + 'AGENT_RUNTIME_DEF_INVALID', + err instanceof Error ? err.message : String(err), + ); + } const safeCommentAttachments = normalizeCommentAttachments(commentAttachments); if ( diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts index 8ad501fca..f843f0b20 100644 --- a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -16,7 +16,10 @@ */ import { afterEach, describe, expect, it } from 'vitest'; -import { resolveChatRunInactivityTimeoutMs } from '../src/server.js'; +import { + assertValidRuntimeDefInactivityTimeoutMs, + resolveChatRunInactivityTimeoutMs, +} from '../src/server.js'; import { copilotAgentDef } from '../src/runtimes/defs/copilot.js'; const ENV_KEY = 'OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS'; @@ -140,3 +143,58 @@ describe('copilotAgentDef.inactivityTimeoutMs', () => { expect(copilotAgentDef.inactivityTimeoutMs).toBe(THIRTY_MINUTES_MS); }); }); + +describe('assertValidRuntimeDefInactivityTimeoutMs (#2579 fast-fail at def-select time)', () => { + // Reviewer-correctness fix (#2579 non-blocking, round 3): the + // strict checked-in-config check must run *immediately* after the + // runtime def is selected, before any side-effectful setup + // (`.mcp.json` write/unlink, prompt composition, file writes) so + // that an invalid checked-in `inactivityTimeoutMs` aborts the run + // before leaving partial state behind. The assert helper is the + // pure entry point chat-run startup can call without also touching + // env-resolution side effects. + + it('returns silently when the def hint is omitted (no-op for adapters without the field)', () => { + expect(() => assertValidRuntimeDefInactivityTimeoutMs()).not.toThrow(); + expect(() => assertValidRuntimeDefInactivityTimeoutMs(undefined)).not.toThrow(); + }); + + it('returns silently for a valid non-negative integer (e.g. Copilot 30 min)', () => { + expect(() => assertValidRuntimeDefInactivityTimeoutMs(THIRTY_MINUTES_MS)).not.toThrow(); + expect(() => assertValidRuntimeDefInactivityTimeoutMs(0)).not.toThrow(); + }); + + it('throws on NaN / Infinity — bad checked-in source must surface, not be silently normalized', () => { + expect(() => assertValidRuntimeDefInactivityTimeoutMs(Number.NaN)).toThrow( + /must be a non-negative integer/, + ); + expect(() => assertValidRuntimeDefInactivityTimeoutMs(Number.POSITIVE_INFINITY)).toThrow( + /must be a non-negative integer/, + ); + }); + + it('throws on a negative def value (typo footgun)', () => { + expect(() => assertValidRuntimeDefInactivityTimeoutMs(-1)).toThrow( + /must be a non-negative integer/, + ); + }); + + it('throws on a fractional def value (wrong-units typo, e.g. seconds instead of ms)', () => { + expect(() => assertValidRuntimeDefInactivityTimeoutMs(60.5)).toThrow( + /must be a non-negative integer/, + ); + }); + + it('runs independently of the env override — an env value cannot mask the typo', () => { + const originalEnv = process.env[ENV_KEY]; + process.env[ENV_KEY] = '15000'; + try { + expect(() => assertValidRuntimeDefInactivityTimeoutMs(-1)).toThrow( + /must be a non-negative integer/, + ); + } finally { + if (originalEnv === undefined) delete process.env[ENV_KEY]; + else process.env[ENV_KEY] = originalEnv; + } + }); +}); From 5bde2a6ab9b92cfeb9b6560d59b1979c4860a0fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Sat, 23 May 2026 14:19:13 +0800 Subject: [PATCH 5/7] fix(contracts): add AGENT_RUNTIME_DEF_INVALID to API_ERROR_CODES (#2467 review follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous follow-up emitted AGENT_RUNTIME_DEF_INVALID through design.runs.fail at run startup when the selected runtime def carries a non-finite / negative / fractional inactivityTimeoutMs, but the code string did not exist in packages/contracts/src/errors.ts#API_ERROR_CODES. design.runs.fail feeds the shared SSE/status error envelopes consumed by web and CLI, so the daemon was quietly widening the wire contract while the PR description claimed no contract change. Land the code in the shared contracts union next to the other AGENT_* codes so the daemon/web/CLI surface stays aligned. Consumers reading ApiErrorCode pick the new member up for free — it is a union extension, not a break. Regression tests assert the literal is in API_ERROR_CODES and is assignable to ApiErrorCode at the type level. --- .../tests/chat-run-inactivity-timeout.test.ts | 23 +++++++++++++++++++ packages/contracts/src/errors.ts | 8 +++++++ 2 files changed, 31 insertions(+) diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts index f843f0b20..77e911775 100644 --- a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -16,6 +16,7 @@ */ import { afterEach, describe, expect, it } from 'vitest'; +import { API_ERROR_CODES, type ApiErrorCode } from '@open-design/contracts'; import { assertValidRuntimeDefInactivityTimeoutMs, resolveChatRunInactivityTimeoutMs, @@ -198,3 +199,25 @@ describe('assertValidRuntimeDefInactivityTimeoutMs (#2579 fast-fail at def-selec } }); }); + +describe('AGENT_RUNTIME_DEF_INVALID contract surface (#2579 review follow-up)', () => { + // The chat-run startup path emits this code through `design.runs.fail`, + // which feeds the shared SSE/status error envelopes — i.e. the + // daemon/web/CLI wire contract. The previous follow-up introduced the + // string at the daemon emit site only, so downstream consumers reading + // `ApiErrorCode` saw a value not in the union. Land it in + // `packages/contracts/src/errors.ts#API_ERROR_CODES` so the contract + // catches up and any future web/CLI consumer can switch on it. + + it('AGENT_RUNTIME_DEF_INVALID is exposed by the shared contracts package so daemon/web/CLI agree on the union', () => { + expect((API_ERROR_CODES as readonly string[])).toContain('AGENT_RUNTIME_DEF_INVALID'); + }); + + it('the literal is assignable to ApiErrorCode (compile-time wire-contract check)', () => { + // If the union ever loses this member again, the assignment below + // becomes a type error; the runtime expect just keeps the spec + // executable so the regression surfaces in CI. + const code: ApiErrorCode = 'AGENT_RUNTIME_DEF_INVALID'; + expect(code).toBe('AGENT_RUNTIME_DEF_INVALID'); + }); +}); diff --git a/packages/contracts/src/errors.ts b/packages/contracts/src/errors.ts index b397b4629..7bb2f337e 100644 --- a/packages/contracts/src/errors.ts +++ b/packages/contracts/src/errors.ts @@ -29,6 +29,14 @@ export const API_ERROR_CODES = [ // `server.ts::abortForRoleMarker` alongside the existing // `fabricated_role_marker` warning event. Retryable. 'ROLE_MARKER_HALLUCINATION', + // The selected runtime agent def (apps/daemon/src/runtimes/defs/*) has + // a checked-in field that fails strict source-config validation — e.g. + // a non-integer, NaN, Infinity, or negative `inactivityTimeoutMs` + // (issue #2467 review on PR #2579). The bug is in the source file; + // the operator cannot recover the run, the daemon must abort it and + // surface the def-correctness error so it shows up in dev rather + // than silently disabling the agent-specific watchdog. + 'AGENT_RUNTIME_DEF_INVALID', 'PROJECT_NOT_FOUND', // Handoff (`POST /api/projects/:id/handoff`): the requested conversation // is not in the project, or has no messages to synthesize a handoff from. From 59246ca5acd62a518b225ffb37dcd11d3e2b28b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Sat, 23 May 2026 16:15:15 +0800 Subject: [PATCH 6/7] fix(daemon): narrow chat-run startup catch to RangeError so non-validation errors are not misreported as bad def (#2467 review follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous follow-up wrapped assertValidRuntimeDefInactivityTimeoutMs in a broad catch (any thrown value rewritten as AGENT_RUNTIME_DEF_INVALID). The helper is the strict fast-fail gate and currently only throws RangeError on invalid checked-in values, but a future bug inside it — or any unrelated exception introduced while touching this path — would be reported back to the caller as 'the runtime def is bad' instead of surfacing as the real failure. Narrow the catch to RangeError so only genuine validation failures map to AGENT_RUNTIME_DEF_INVALID; anything else bubbles up to the outer chat-run starter and surfaces as AGENT_EXECUTION_FAILED. Add a pinning test that asserts the helper throws exactly RangeError for every invalid input, so the helper/wrapper contract cannot drift silently. --- apps/daemon/src/server.ts | 18 +++++++++++++----- .../tests/chat-run-inactivity-timeout.test.ts | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/apps/daemon/src/server.ts b/apps/daemon/src/server.ts index a2c9dc47b..483e7656d 100644 --- a/apps/daemon/src/server.ts +++ b/apps/daemon/src/server.ts @@ -10794,14 +10794,22 @@ export async function startServer({ // otherwise abort the run only at watchdog-arm time, after that // setup has already mutated local state, leaving confusing partial // residue behind (issue #2467 review on PR #2579). + // + // Catch is intentionally narrowed to `RangeError`, the only kind + // `assertValidRuntimeDefInactivityTimeoutMs` is allowed to throw + // for invalid checked-in values. Anything else (a regression that + // makes the helper throw on a valid value, an unrelated bug + // introduced while touching this path) should bubble up to the + // outer chat-run starter — which surfaces it as + // `AGENT_EXECUTION_FAILED` — rather than being misreported as + // "the runtime def is bad" and burying the real failure. try { assertValidRuntimeDefInactivityTimeoutMs(def.inactivityTimeoutMs); } catch (err) { - return design.runs.fail( - run, - 'AGENT_RUNTIME_DEF_INVALID', - err instanceof Error ? err.message : String(err), - ); + if (err instanceof RangeError) { + return design.runs.fail(run, 'AGENT_RUNTIME_DEF_INVALID', err.message); + } + throw err; } const safeCommentAttachments = normalizeCommentAttachments(commentAttachments); diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts index 77e911775..831fd584c 100644 --- a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -186,6 +186,24 @@ describe('assertValidRuntimeDefInactivityTimeoutMs (#2579 fast-fail at def-selec ); }); + it('throws specifically `RangeError` for every invalid def value — the chat-run startup wrapper relies on this to narrow its catch and only flag AGENT_RUNTIME_DEF_INVALID for genuine validation failures (#2579 round-5 review)', () => { + // The wrapper in server.ts now does: + // try { assertValidRuntimeDefInactivityTimeoutMs(...) } + // catch (err) { + // if (err instanceof RangeError) → AGENT_RUNTIME_DEF_INVALID + // else throw err + // } + // If the helper ever starts throwing TypeError / generic Error + // (e.g. an unrelated bug inside it, or a refactor that wraps the + // RangeError), the wrapper would either misroute the failure + // (the old `err instanceof Error` catch did) or skip the + // structured fail entirely. Pin the error class so the contract + // between helper and wrapper cannot drift silently. + for (const bad of [Number.NaN, Number.POSITIVE_INFINITY, Number.NEGATIVE_INFINITY, -1, 60.5]) { + expect(() => assertValidRuntimeDefInactivityTimeoutMs(bad)).toThrow(RangeError); + } + }); + it('runs independently of the env override — an env value cannot mask the typo', () => { const originalEnv = process.env[ENV_KEY]; process.env[ENV_KEY] = '15000'; From 5346fd7484dfe945fafa055a23210da6387e30a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=86=A0=E8=BE=B0?= Date: Sun, 31 May 2026 12:51:22 +0800 Subject: [PATCH 7/7] test(contracts): cover runtime def error code in contracts lane --- .../tests/chat-run-inactivity-timeout.test.ts | 23 ------------------- packages/contracts/tests/error-codes.test.ts | 18 +++++++++++++++ 2 files changed, 18 insertions(+), 23 deletions(-) create mode 100644 packages/contracts/tests/error-codes.test.ts diff --git a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts index 831fd584c..77bdd4b47 100644 --- a/apps/daemon/tests/chat-run-inactivity-timeout.test.ts +++ b/apps/daemon/tests/chat-run-inactivity-timeout.test.ts @@ -16,7 +16,6 @@ */ import { afterEach, describe, expect, it } from 'vitest'; -import { API_ERROR_CODES, type ApiErrorCode } from '@open-design/contracts'; import { assertValidRuntimeDefInactivityTimeoutMs, resolveChatRunInactivityTimeoutMs, @@ -217,25 +216,3 @@ describe('assertValidRuntimeDefInactivityTimeoutMs (#2579 fast-fail at def-selec } }); }); - -describe('AGENT_RUNTIME_DEF_INVALID contract surface (#2579 review follow-up)', () => { - // The chat-run startup path emits this code through `design.runs.fail`, - // which feeds the shared SSE/status error envelopes — i.e. the - // daemon/web/CLI wire contract. The previous follow-up introduced the - // string at the daemon emit site only, so downstream consumers reading - // `ApiErrorCode` saw a value not in the union. Land it in - // `packages/contracts/src/errors.ts#API_ERROR_CODES` so the contract - // catches up and any future web/CLI consumer can switch on it. - - it('AGENT_RUNTIME_DEF_INVALID is exposed by the shared contracts package so daemon/web/CLI agree on the union', () => { - expect((API_ERROR_CODES as readonly string[])).toContain('AGENT_RUNTIME_DEF_INVALID'); - }); - - it('the literal is assignable to ApiErrorCode (compile-time wire-contract check)', () => { - // If the union ever loses this member again, the assignment below - // becomes a type error; the runtime expect just keeps the spec - // executable so the regression surfaces in CI. - const code: ApiErrorCode = 'AGENT_RUNTIME_DEF_INVALID'; - expect(code).toBe('AGENT_RUNTIME_DEF_INVALID'); - }); -}); diff --git a/packages/contracts/tests/error-codes.test.ts b/packages/contracts/tests/error-codes.test.ts new file mode 100644 index 000000000..e67c64858 --- /dev/null +++ b/packages/contracts/tests/error-codes.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, it } from 'vitest'; + +import { API_ERROR_CODES, type ApiErrorCode } from '../src/errors'; + +describe('shared API error codes', () => { + it('exposes AGENT_RUNTIME_DEF_INVALID for runtime-def validation failures', () => { + // Chat-run startup emits this code through the shared SSE/status error + // envelopes when a checked-in runtime def is invalid. Keeping the + // assertion in the contracts package ensures contract-only refactors + // cannot drop the literal without this package's own test lane failing. + expect(API_ERROR_CODES).toContain('AGENT_RUNTIME_DEF_INVALID'); + }); + + it('keeps AGENT_RUNTIME_DEF_INVALID assignable to ApiErrorCode', () => { + const code: ApiErrorCode = 'AGENT_RUNTIME_DEF_INVALID'; + expect(code).toBe('AGENT_RUNTIME_DEF_INVALID'); + }); +});