fix(daemon): isolate per-agent detection failures so one bad probe cannot blank the picker (#2297) (#2444)

* fix(daemon): isolate per-agent detection failures so one bad probe cannot blank the picker

`detectAgents` ran every adapter probe in a bare `Promise.all`, so a
synchronous throw from any single probe (e.g. a filesystem error
during PATH walking on a packaged Windows daemon, or an unhandled
async rejection from one of the post-launch probes) rejected the
whole batch. The `/api/agents` route's `catch(() => [])` then handed
the UI an empty list and the model picker collapsed to BYOK / Cloud
only, losing every installed CLI option — which matches what users
in issue #2297 observed after one or two app restarts on Windows.

Wrap each probe in `safeProbe` so a single failure degrades just that
adapter to `unavailableAgent(def)` while the rest of the registry
keeps its real availability. The new
`apps/daemon/tests/runtimes/detection-resilience.test.ts` pins both
synchronous failure sites that previously sat outside the existing
inner try/catch blocks (`resolveAgentLaunch` and
`applyAgentLaunchEnv`) so a future code change cannot regress the
isolation contract.

This is a defensive guard rather than a Windows-only diagnosis: it
fixes any scenario where a single probe blows up, including ones we
have not reproduced yet. If a user still hits #2297 after this lands,
the daemon log will identify which adapter failed instead of silently
returning an empty list.

Fixes #2297

* ci: re-run checks (unrelated e2e baseline flake on previous run)
This commit is contained in:
YOMXXX 2026-05-23 11:47:32 +08:00 committed by GitHub
parent a66b34d60b
commit cb98721b91
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 110 additions and 1 deletions

View file

@ -220,11 +220,29 @@ function stripFns(
return rest;
}
async function safeProbe(
def: RuntimeAgentDef,
configuredEnv: Record<string, string> = {},
): Promise<DetectedAgent> {
try {
return await probe(def, configuredEnv);
} catch {
// Fault isolation (issue #2297): one adapter's probe blowing up
// — e.g. a synchronous filesystem throw during PATH walking on a
// packaged Windows daemon, or an async rejection from one of the
// post-launch probes — must not collapse the whole agent picker.
// Without this guard the bare `Promise.all` rejected and the
// `/api/agents` catch arm returned `[]`, so the UI silently lost
// every CLI option and fell back to BYOK / Cloud only.
return unavailableAgent(def);
}
}
export async function detectAgents(
configuredEnvByAgent: Record<string, Record<string, string>> = {},
) {
const results = await Promise.all(
AGENT_DEFS.map((def) => probe(def, configuredEnvByAgent?.[def.id] ?? {})),
AGENT_DEFS.map((def) => safeProbe(def, configuredEnvByAgent?.[def.id] ?? {})),
);
// Refresh the validation cache from whatever we just surfaced to the UI
// so /api/chat can accept any model the user could have just picked,

View file

@ -0,0 +1,91 @@
// Regression coverage for issue #2297: when a single agent's launch
// resolution throws (e.g. a transient filesystem error during PATH
// walking on Windows packaged builds), `detectAgents()` used to reject
// the whole `Promise.all` and the `/api/agents` route caught it back to
// an empty array. The UI then showed only the cloud/BYOK fallback even
// though every other CLI was healthy. These tests pin the per-probe
// isolation invariant: one broken adapter must not blank the picker.
import { afterEach, expect, test, vi } from 'vitest';
vi.mock('../../src/runtimes/launch.js', async (importOriginal) => {
const actual = await importOriginal<typeof import('../../src/runtimes/launch.js')>();
return {
...actual,
resolveAgentLaunch: vi.fn(actual.resolveAgentLaunch),
applyAgentLaunchEnv: vi.fn(actual.applyAgentLaunchEnv),
};
});
import * as launchModule from '../../src/runtimes/launch.js';
import { detectAgents } from '../../src/runtimes/detection.js';
import { AGENT_DEFS } from '../../src/runtimes/registry.js';
const mockedResolveAgentLaunch = vi.mocked(launchModule.resolveAgentLaunch);
const mockedApplyAgentLaunchEnv = vi.mocked(launchModule.applyAgentLaunchEnv);
const originalResolveImpl = mockedResolveAgentLaunch.getMockImplementation()!;
const originalApplyImpl = mockedApplyAgentLaunchEnv.getMockImplementation()!;
afterEach(() => {
mockedResolveAgentLaunch.mockImplementation(originalResolveImpl);
mockedApplyAgentLaunchEnv.mockImplementation(originalApplyImpl);
});
test('detectAgents isolates a single agent probe throw so the picker still lists every adapter', async () => {
// Simulate the failure shape that issue #2297 attributes to packaged
// Windows daemons: one adapter's launch resolution throws because the
// FS walk hits a permission error, a malformed PATH entry, or a
// broken symlink (`accessSync`/`readdirSync` can surface either as a
// throw). Before the per-probe guard the whole `Promise.all`
// rejected; the `/api/agents` route's `catch(() => [])` then handed
// the UI an empty list and the model picker collapsed to Cloud only.
mockedResolveAgentLaunch.mockImplementation((def, env) => {
if (def.id === 'claude') {
throw new Error('synthetic FS throw during PATH walk');
}
return originalResolveImpl(def, env);
});
const agents = await detectAgents();
// Every adapter from the registry must still appear, including the
// one whose probe blew up — it just gets surfaced as unavailable so
// the rest of the picker (and the BYOK fallback row) stays intact.
expect(agents.length).toBe(AGENT_DEFS.length);
const claude = agents.find((a) => a.id === 'claude');
expect(claude, 'broken adapter must still appear in the detection result').toBeDefined();
expect(claude?.available).toBe(false);
// And the other adapters must keep whatever availability the real
// probe would have returned — none of them get blanket-marked
// unavailable just because claude's slot threw.
expect(agents.filter((a) => a.id !== 'claude').length).toBe(AGENT_DEFS.length - 1);
});
test('detectAgents isolates a probe throw from applyAgentLaunchEnv just like resolveAgentLaunch', async () => {
// The same per-probe guard has to cover both synchronous sites in
// `probe()` that sit outside the existing inner try/catch blocks:
// `resolveAgentLaunch` (above) and `applyAgentLaunchEnv` (here). A
// single test for each anchors the invariant against future code
// that adds a third pre-try synchronous call.
let thrown = false;
mockedApplyAgentLaunchEnv.mockImplementation((env, launch, nodeBinDir) => {
// We cannot tell which agent this env belongs to from arguments
// alone, so throw on the first call and pass through afterwards.
// detectAgents runs probes in parallel via Promise.all so the
// "first" call is non-deterministic but for fault-isolation that
// does not matter — any single throw blanking the whole list is
// the bug.
if (!thrown) {
thrown = true;
throw new Error('synthetic env construction error');
}
return originalApplyImpl(env, launch, nodeBinDir);
});
const agents = await detectAgents();
expect(agents.length).toBe(AGENT_DEFS.length);
// At least one adapter is marked unavailable because of the throw;
// every other adapter keeps its real availability.
const unavailableFromThrow = agents.filter((a) => a.available === false);
expect(unavailableFromThrow.length).toBeGreaterThanOrEqual(1);
});