fix(daemon): give GitHub Copilot a longer chat-run inactivity ceiling (#2467)

Copilot CLI goes silent (no stdout, no streamed events) for long
stretches during legitimate deck-generation and large-prompt turns —
the model is still working but the CLI does not emit keepalive frames.
The 600s chat-run inactivity watchdog used to kill those runs as
`stalled` even though the agent was healthy. The user's report of being
"stuck after 10 mins and a few seconds", plus the need to type
`Continue` to get the files, lines up exactly with the watchdog
tripping mid-turn and the next message reattaching to the still-live
session.

Surface an optional `inactivityTimeoutMs` field on `RuntimeAgentDef`,
merge it under the existing `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` env
override, and set Copilot to 30 minutes. The env override still wins
so operators can shrink it when diagnosing a runaway session, and the
clamp / 24-hour ceiling apply to both the env path and the def path
so neither can schedule a negative or signed-32-bit-overflowing
setTimeout delay.

Closes #2467.
This commit is contained in:
李冠辰 2026-05-21 19:36:19 +08:00
parent d66a463d62
commit 51ad525f5a
4 changed files with 132 additions and 11 deletions

View file

@ -67,4 +67,14 @@ export const copilotAgentDef = {
},
promptViaStdin: true,
streamFormat: 'copilot-stream-json',
// GitHub Copilot's deck-generation and large-prompt turns go silent
// (no stdout, no streamed events) for stretches that exceed the
// 10-minute global default — the model is still working but the
// CLI does not emit keepalive frames. The default watchdog used to
// kill those runs as `stalled` even though the agent was healthy
// (issue #2467: "GitHub Copilot agent getting stuck after 10 mins
// and few seconds"). 30 minutes gives the heavy turns room to land
// while still bounding genuine hangs; operators can override via
// `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` if they want it tighter.
inactivityTimeoutMs: 30 * 60 * 1000,
} satisfies RuntimeAgentDef;

View file

@ -162,6 +162,14 @@ export type RuntimeAgentDef = {
// present in the daemon's `process.env`; Settings-UI per-agent env
// values only reach the spawned child and are NOT consulted here.
defaultModelEnvVar?: string;
// Agent-recommended override for the chat-run inactivity watchdog.
// The watchdog observes child stdout/stderr/SSE activity, not real
// CPU progress, so agents whose CLIs go silent for long stretches
// during legitimate work (e.g. Copilot's deck-generation thinking
// phase from #2467) need a longer ceiling than the 10-minute global
// default. Operators can still override per-process via
// `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` — that env wins.
inactivityTimeoutMs?: number;
};
export type DetectedAgent = Omit<

View file

@ -3738,16 +3738,26 @@ const MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS = 24 * 60 * 60 * 1000;
// always means the agent is winding down or hanging. See #1451.
const DEFAULT_CHAT_RUN_ARTIFACT_QUIET_PERIOD_MS = 60 * 1000;
function resolveChatRunInactivityTimeoutMs() {
const raw = Number(process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS);
// This watchdog observes child stdout/stderr/SSE activity, not real CPU or
// filesystem progress. Keep the default long enough for agents that spend
// several minutes silently writing large artifacts.
if (!Number.isFinite(raw)) return DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS;
// Node clamps delays larger than a signed 32-bit integer down to 1ms, which
// makes an oversized override fail almost immediately while reporting a huge
// timeout. Keep explicit overrides bounded to a practical, timer-safe value.
return Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(raw)));
/**
 * Resolve the chat-run inactivity watchdog ceiling, in milliseconds.
 *
 * Priority order:
 *   1. `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS` (operator escape hatch).
 *   2. `agentDefault` — the agent runtime def's `inactivityTimeoutMs`
 *      recommendation, which lets agents whose CLIs go silent for long
 *      stretches during legitimate work (e.g. Copilot from #2467) raise
 *      the ceiling without every operator having to set an env var.
 *   3. `DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS`, the global default.
 *
 * Both the env value and the def value pass through the same clamp
 * (floored to an integer, never below 0, never above
 * `MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS`) because Node silently downgrades
 * signed-32-bit-overflowing `setTimeout` delays to 1ms — without the
 * clamp an oversized value would fire the watchdog almost immediately
 * while reporting the huge timeout to the user.
 *
 * A blank (empty or whitespace-only) env value is treated as unset.
 * `Number('')` evaluates to 0, which is finite, so without this guard an
 * accidental `OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS=` entry would resolve to
 * 0 and silently override the agent hint and the default. An explicit
 * `0` is still honored.
 *
 * @param agentDefault Optional per-agent recommendation in milliseconds;
 *   ignored when it is not a finite number.
 * @returns The clamped timeout in milliseconds.
 */
export function resolveChatRunInactivityTimeoutMs(agentDefault?: number): number {
  // Single clamp shared by the env path and the def path so the two can
  // never drift apart.
  const clamp = (ms: number): number =>
    Math.min(MAX_CHAT_RUN_INACTIVITY_TIMEOUT_MS, Math.max(0, Math.floor(ms)));

  const envText = (process.env.OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS ?? '').trim();
  if (envText !== '') {
    const envMs = Number(envText);
    // Unparseable text (NaN) and +/-Infinity fall through to the def hint.
    if (Number.isFinite(envMs)) {
      return clamp(envMs);
    }
  }

  if (typeof agentDefault === 'number' && Number.isFinite(agentDefault)) {
    return clamp(agentDefault);
  }

  return DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS;
}
// Resolve the post-artifact quiet-period window. Same clamp as the outer
@ -11516,7 +11526,7 @@ export async function startServer({
// here; on this branch `send` was hoisted into the AMR preflight
// earlier, so we keep only the new `runStartTimeMs` declaration.
const runStartTimeMs = Date.now();
const inactivityTimeoutMs = resolveChatRunInactivityTimeoutMs();
const inactivityTimeoutMs = resolveChatRunInactivityTimeoutMs(def.inactivityTimeoutMs);
const artifactQuietPeriodMs = resolveChatRunArtifactQuietPeriodMs();
const inactivityKillGraceMs = 3_000;
let inactivityTimer = null;

View file

@ -0,0 +1,93 @@
/**
* Per-agent inactivity-timeout resolution (#2467).
*
* The chat-run inactivity watchdog defaults to 10 minutes. Some agents
* (GitHub Copilot CLI) genuinely stay silent for longer than that on
* heavy deck-generation turns — the model is still working but emits
* no stdout, so the watchdog used to kill the run as `stalled` even
* though the agent was healthy.
*
* Runtime defs can now advertise a recommended `inactivityTimeoutMs`,
* and the resolver merges it under the env override:
*
* 1. OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS — highest priority (operator override)
* 2. def.inactivityTimeoutMs — next (agent-specific recommendation)
* 3. DEFAULT_CHAT_RUN_INACTIVITY_TIMEOUT_MS (10 min) — global default
*/
import { afterEach, describe, expect, it } from 'vitest';
import { resolveChatRunInactivityTimeoutMs } from '../src/server.js';
import { copilotAgentDef } from '../src/runtimes/defs/copilot.js';
// Environment variable the resolver reads for the operator override.
const ENV_KEY = 'OD_CHAT_RUN_INACTIVITY_TIMEOUT_MS';
// All durations below are derived from one shared unit so they read as
// plain minute/hour counts.
const ONE_MINUTE_MS = 60 * 1000;
// Value the suite expects when neither an env override nor a def hint is set.
const TEN_MINUTES_MS = 10 * ONE_MINUTE_MS;
// Value the suite expects the Copilot runtime def to advertise.
const THIRTY_MINUTES_MS = 30 * ONE_MINUTE_MS;
// Ceiling the suite expects oversized values to be clamped to.
const TWENTY_FOUR_HOURS_MS = 24 * 60 * ONE_MINUTE_MS;
describe('resolveChatRunInactivityTimeoutMs', () => {
  // Captured once, when the suite body runs, so every test can restore the
  // variable to whatever the surrounding process had.
  const savedEnvValue = process.env[ENV_KEY];

  // Remove the operator override so only the def hint / default apply.
  const clearEnvOverride = (): void => {
    delete process.env[ENV_KEY];
  };

  // Install an operator override with the given raw string value.
  const setEnvOverride = (value: string): void => {
    process.env[ENV_KEY] = value;
  };

  afterEach(() => {
    if (savedEnvValue !== undefined) {
      process.env[ENV_KEY] = savedEnvValue;
      return;
    }
    delete process.env[ENV_KEY];
  });

  it('returns the 10-minute global default when no def hint and no env override are set', () => {
    clearEnvOverride();
    const resolved = resolveChatRunInactivityTimeoutMs();
    expect(resolved).toBe(TEN_MINUTES_MS);
  });

  it('uses the def-level hint when env is unset', () => {
    clearEnvOverride();
    const resolved = resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS);
    expect(resolved).toBe(THIRTY_MINUTES_MS);
  });

  it('lets the env override take precedence over the def hint (operator escape hatch)', () => {
    // The env var is the knob operators use to shrink or lengthen the
    // watchdog for any agent without touching source (flaky CLIs, runaway
    // sessions, and so on), so it has to beat the def hint.
    setEnvOverride('900000'); // 15 min
    const resolved = resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS);
    expect(resolved).toBe(900_000);
  });

  it('falls back to the def hint when the env value is not a finite number', () => {
    setEnvOverride('not-a-number');
    const resolved = resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS);
    expect(resolved).toBe(THIRTY_MINUTES_MS);
  });

  it('still honors env=0 to disable the watchdog entirely', () => {
    // Pre-existing behavior of the watchdog: an explicit 0 must survive
    // even when the agent def would otherwise supply a larger value.
    setEnvOverride('0');
    const resolved = resolveChatRunInactivityTimeoutMs(THIRTY_MINUTES_MS);
    expect(resolved).toBe(0);
  });

  it('clamps an oversized env override to the 24-hour ceiling so Node does not fire the timer immediately', () => {
    const oversized = TWENTY_FOUR_HOURS_MS * 100;
    setEnvOverride(String(oversized));
    const resolved = resolveChatRunInactivityTimeoutMs();
    expect(resolved).toBe(TWENTY_FOUR_HOURS_MS);
  });

  it('clamps an oversized def hint to the 24-hour ceiling for the same reason', () => {
    clearEnvOverride();
    const oversized = TWENTY_FOUR_HOURS_MS * 100;
    const resolved = resolveChatRunInactivityTimeoutMs(oversized);
    expect(resolved).toBe(TWENTY_FOUR_HOURS_MS);
  });

  it('treats a non-finite def hint as if it were absent (defends against bad runtime configs)', () => {
    clearEnvOverride();
    const resolved = resolveChatRunInactivityTimeoutMs(Number.NaN);
    expect(resolved).toBe(TEN_MINUTES_MS);
  });

  it('floors negative def hints to 0 rather than scheduling a negative-delay timer', () => {
    clearEnvOverride();
    const resolved = resolveChatRunInactivityTimeoutMs(-1);
    expect(resolved).toBe(0);
  });
});
// Pins the value shipped on the Copilot runtime def so an accidental edit
// to the hint is caught here rather than in production runs.
describe('copilotAgentDef.inactivityTimeoutMs', () => {
  it('ships a 30-minute inactivity hint so Copilot silent-thinking phases do not trip the default watchdog (#2467)', () => {
    const { inactivityTimeoutMs } = copilotAgentDef;
    expect(inactivityTimeoutMs).toBe(THIRTY_MINUTES_MS);
  });
});