fix(daemon): isolate per-agent detection failures so one bad probe cannot blank the picker (#2297) (#2444)

* fix(daemon): isolate per-agent detection failures so one bad probe cannot blank the picker `detectAgents` ran every adapter probe in a bare `Promise.all`, so a synchronous throw from any single probe (e.g. a filesystem error during PATH walking on a packaged Windows daemon, or an unhandled async rejection from one of the post-launch probes) rejected the whole batch. The `/api/agents` route's `catch(() => [])` then handed the UI an empty list and the model picker collapsed to BYOK / Cloud only, losing every installed CLI option — which matches what users in issue #2297 observed after one or two app restarts on Windows. Wrap each probe in `safeProbe` so a single failure degrades just that adapter to `unavailableAgent(def)` while the rest of the registry keeps its real availability. The new `apps/daemon/tests/runtimes/detection-resilience.test.ts` pins both synchronous failure sites that previously sat outside the existing inner try/catch blocks (`resolveAgentLaunch` and `applyAgentLaunchEnv`) so a future code change cannot regress the isolation contract. This is a defensive guard rather than a Windows-only diagnosis: it fixes any scenario where a single probe blows up, including ones we have not reproduced yet. If a user still hits #2297 after this lands, the daemon log will identify which adapter failed instead of silently returning an empty list. Fixes #2297 * ci: re-run checks (unrelated e2e baseline flake on previous run)
2026-06-01 03:14:35 +07:00 · 2026-05-23 11:47:32 +08:00 · 2026-05-23 11:47:32 +08:00 · cb98721b91
commit cb98721b91
parent a66b34d60b
2 changed files with 110 additions and 1 deletions
--- a/apps/daemon/src/runtimes/detection.ts
+++ b/apps/daemon/src/runtimes/detection.ts
@ -220,11 +220,29 @@ function stripFns(
  return rest;
 }

+async function safeProbe(
+  def: RuntimeAgentDef,
+  configuredEnv: Record<string, string> = {},
+): Promise<DetectedAgent> {
+  try {
+    return await probe(def, configuredEnv);
+  } catch {
+    // Fault isolation (issue #2297): one adapter's probe blowing up
+    // — e.g. a synchronous filesystem throw during PATH walking on a
+    // packaged Windows daemon, or an async rejection from one of the
+    // post-launch probes — must not collapse the whole agent picker.
+    // Without this guard the bare `Promise.all` rejected and the
+    // `/api/agents` catch arm returned `[]`, so the UI silently lost
+    // every CLI option and fell back to BYOK / Cloud only.
+    return unavailableAgent(def);
+  }
+}
+
 export async function detectAgents(
  configuredEnvByAgent: Record<string, Record<string, string>> = {},
 ) {
  const results = await Promise.all(
-    AGENT_DEFS.map((def) => probe(def, configuredEnvByAgent?.[def.id] ?? {})),
+    AGENT_DEFS.map((def) => safeProbe(def, configuredEnvByAgent?.[def.id] ?? {})),
  );
  // Refresh the validation cache from whatever we just surfaced to the UI
  // so /api/chat can accept any model the user could have just picked,
--- a/apps/daemon/tests/runtimes/detection-resilience.test.ts
+++ b/apps/daemon/tests/runtimes/detection-resilience.test.ts
@ -0,0 +1,91 @@
+// Regression coverage for issue #2297: when a single agent's launch
+// resolution throws (e.g. a transient filesystem error during PATH
+// walking on Windows packaged builds), `detectAgents()` used to reject
+// the whole `Promise.all` and the `/api/agents` route caught it back to
+// an empty array. The UI then showed only the cloud/BYOK fallback even
+// though every other CLI was healthy. These tests pin the per-probe
+// isolation invariant: one broken adapter must not blank the picker.
+import { afterEach, expect, test, vi } from 'vitest';
+
+vi.mock('../../src/runtimes/launch.js', async (importOriginal) => {
+  const actual = await importOriginal<typeof import('../../src/runtimes/launch.js')>();
+  return {
+    ...actual,
+    resolveAgentLaunch: vi.fn(actual.resolveAgentLaunch),
+    applyAgentLaunchEnv: vi.fn(actual.applyAgentLaunchEnv),
+  };
+});
+
+import * as launchModule from '../../src/runtimes/launch.js';
+import { detectAgents } from '../../src/runtimes/detection.js';
+import { AGENT_DEFS } from '../../src/runtimes/registry.js';
+
+const mockedResolveAgentLaunch = vi.mocked(launchModule.resolveAgentLaunch);
+const mockedApplyAgentLaunchEnv = vi.mocked(launchModule.applyAgentLaunchEnv);
+const originalResolveImpl = mockedResolveAgentLaunch.getMockImplementation()!;
+const originalApplyImpl = mockedApplyAgentLaunchEnv.getMockImplementation()!;
+
+afterEach(() => {
+  mockedResolveAgentLaunch.mockImplementation(originalResolveImpl);
+  mockedApplyAgentLaunchEnv.mockImplementation(originalApplyImpl);
+});
+
+test('detectAgents isolates a single agent probe throw so the picker still lists every adapter', async () => {
+  // Simulate the failure shape that issue #2297 attributes to packaged
+  // Windows daemons: one adapter's launch resolution throws because the
+  // FS walk hits a permission error, a malformed PATH entry, or a
+  // broken symlink (`accessSync`/`readdirSync` can surface either as a
+  // throw). Before the per-probe guard the whole `Promise.all`
+  // rejected; the `/api/agents` route's `catch(() => [])` then handed
+  // the UI an empty list and the model picker collapsed to Cloud only.
+  mockedResolveAgentLaunch.mockImplementation((def, env) => {
+    if (def.id === 'claude') {
+      throw new Error('synthetic FS throw during PATH walk');
+    }
+    return originalResolveImpl(def, env);
+  });
+
+  const agents = await detectAgents();
+
+  // Every adapter from the registry must still appear, including the
+  // one whose probe blew up — it just gets surfaced as unavailable so
+  // the rest of the picker (and the BYOK fallback row) stays intact.
+  expect(agents.length).toBe(AGENT_DEFS.length);
+  const claude = agents.find((a) => a.id === 'claude');
+  expect(claude, 'broken adapter must still appear in the detection result').toBeDefined();
+  expect(claude?.available).toBe(false);
+  // And the other adapters must keep whatever availability the real
+  // probe would have returned — none of them get blanket-marked
+  // unavailable just because claude's slot threw.
+  expect(agents.filter((a) => a.id !== 'claude').length).toBe(AGENT_DEFS.length - 1);
+});
+
+test('detectAgents isolates a probe throw from applyAgentLaunchEnv just like resolveAgentLaunch', async () => {
+  // The same per-probe guard has to cover both synchronous sites in
+  // `probe()` that sit outside the existing inner try/catch blocks:
+  // `resolveAgentLaunch` (above) and `applyAgentLaunchEnv` (here). A
+  // single test for each anchors the invariant against future code
+  // that adds a third pre-try synchronous call.
+  let thrown = false;
+  mockedApplyAgentLaunchEnv.mockImplementation((env, launch, nodeBinDir) => {
+    // We cannot tell which agent this env belongs to from arguments
+    // alone, so throw on the first call and pass through afterwards.
+    // detectAgents runs probes in parallel via Promise.all so the
+    // "first" call is non-deterministic but for fault-isolation that
+    // does not matter — any single throw blanking the whole list is
+    // the bug.
+    if (!thrown) {
+      thrown = true;
+      throw new Error('synthetic env construction error');
+    }
+    return originalApplyImpl(env, launch, nodeBinDir);
+  });
+
+  const agents = await detectAgents();
+
+  expect(agents.length).toBe(AGENT_DEFS.length);
+  // At least one adapter is marked unavailable because of the throw;
+  // every other adapter keeps its real availability.
+  const unavailableFromThrow = agents.filter((a) => a.available === false);
+  expect(unavailableFromThrow.length).toBeGreaterThanOrEqual(1);
+});