ci: clean agent report (write-to-file) + slim artifacts/uploads (#3116)

* ci: clean agent report (write-to-file) + slim artifacts/uploads Four related cleanups to the agent PR exploration output: 1. Clean report. The PR comment / report.md was assembled by dumping the entire verbose expect.log (ACP init logs, "Git failed" warnings, the ~24KB echoed prompt, ANSI codes, progress checklist) under the trace header -- ~28KB of noise. Instead, instruct the agent to write its final Markdown report to a file via its file-write tool, and have the runner read that file directly. Verified: Codex writes a clean report to the given absolute path. Falls back to an inconclusive note if the agent did not finish. 2. Drop duplicate trace/video. The script copied playwright-smoke-trace.zip -> playwright-trace.zip (a ~28MB legacy duplicate) and the webm likewise, and uploaded both to R2. Keep only the canonical smoke-named artifacts. 3. Slim the GitHub artifact. The trace zips and videos are already on R2; exclude *.zip / *.webm from the uploaded artifact so it drops from ~56MB to <1MB (report + logs only). 4. Persist report on the runner. Copy the report / agent-report / expect.log / trace URL to a stable host dir ($HOME/.cache/agent-pr-explore/reports/pr-<n>) so dry runs (skip_comment) can be inspected without downloading the artifact. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * ci: address review — keep advisory reports + recursive artifact excludes Review findings on the report/artifact cleanup: 1. Regression fix: the non-app-surface and deterministic-verifier branches write their pre-baked advisory report (Inconclusive / Pass / Fail) and never run the agent, so they don't produce agent-report.md. After switching write_agent_report_artifact to read only agent-report.md they fell through to the "agent did not write a final report" fallback, dropping the real advisory (and mis-reporting on .github-only PRs like this one). Fix: those branches now write their advisory directly to $agent_report_file — single source of truth for the report body. 2. Recursive artifact excludes: the source Playwright recording lives at artifacts/playwright-video/<uuid>.webm; non-recursive !*.webm / !*.zip didn't match the subdirectory. Use **/*.zip and **/*.webm so the slim actually holds. 3. Drop the now-dangling summary.legacyTrace field (the legacy trace copy is no longer produced), matching the legacyVideo removal. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 19:04:39 +07:00 · 2026-05-27 20:30:05 +08:00 · 2026-05-27 20:30:05 +08:00 · bf61a39cb5
commit bf61a39cb5
parent ee1eab77c6
2 changed files with 29 additions and 21 deletions
--- a/.github/scripts/agent-pr-explore-sandbox.sh
+++ b/.github/scripts/agent-pr-explore-sandbox.sh
@ -52,6 +52,7 @@ context_file="$artifacts/pr-context.md"
 trimmed_context_file="$artifacts/pr-context-trimmed.md"
 changed_files_file="$artifacts/changed-files.txt"
 fixture_instructions_file="$artifacts/fixture-instructions.md"
+agent_report_file="$artifacts/agent-report.md"
 playwright_video_dir="$artifacts/playwright-video"
 rm -rf "$root"
 mkdir -p "$artifacts" "$pnpm_store" "$playwright_video_dir"
@ -674,7 +675,6 @@ function writeTraceViewerFiles(viewerUrl) {
    ok: false,
    video: null,
    trace: "playwright-smoke-trace.zip",
-    legacyTrace: "playwright-trace.zip",
    traceViewerUrl: viewerUrl || null,
  };

@ -686,19 +686,11 @@ function writeTraceViewerFiles(viewerUrl) {
    await context.close();
    await browser.close();
  }
-  const smokeTrace = path.join(artifacts, "playwright-smoke-trace.zip");
-  if (fs.existsSync(smokeTrace)) {
-    fs.copyFileSync(smokeTrace, path.join(artifacts, "playwright-trace.zip"));
-  }
-
  const videos = fs.readdirSync(videoDir).filter((name) => name.endsWith(".webm"));
  if (videos.length > 0) {
    const source = path.join(videoDir, videos[0]);
-    const stable = path.join(artifacts, "playwright-smoke-session.webm");
-    fs.copyFileSync(source, stable);
-    fs.copyFileSync(source, path.join(artifacts, "playwright-session.webm"));
+    fs.copyFileSync(source, path.join(artifacts, "playwright-smoke-session.webm"));
    summary.video = "playwright-smoke-session.webm";
-    summary.legacyVideo = "playwright-session.webm";
  }
  writeTraceViewerFiles(viewerUrl);
  fs.writeFileSync(path.join(artifacts, "playwright-recording-summary.json"), JSON.stringify(summary, null, 2));
@ -847,9 +839,7 @@ async function putObject(filePath, key, contentType, cacheControl) {
  requireConfig();
  const files = [
    ["playwright-smoke-trace.zip", "application/zip", "public, max-age=604800"],
-    ["playwright-trace.zip", "application/zip", "public, max-age=604800"],
    ["playwright-smoke-session.webm", "video/webm", "public, max-age=604800"],
-    ["playwright-session.webm", "video/webm", "public, max-age=604800"],
    ["playwright-initial.png", "image/png", "public, max-age=604800"],
    ["playwright-final.png", "image/png", "public, max-age=604800"],
    ["expect.log", "text/plain; charset=utf-8", "public, max-age=604800"],
@ -913,12 +903,14 @@ write_agent_report_artifact() {
      echo "Trace artifact was not generated for this run."
    fi
    echo
-    if [ -f "$artifacts/expect.log" ]; then
-      cat "$artifacts/expect.log"
+    if [ -s "$agent_report_file" ]; then
+      # The agent wrote its clean Markdown report to this file directly.
+      cat "$agent_report_file"
    else
      echo "### ⚠️ Verdict: Inconclusive"
      echo
-      echo "The runner did not produce an exploration report."
+      echo "The agent did not write a final report (it may have hit the run"
+      echo "timeout before finishing). See the run log artifact / \`expect.log\` for details."
    fi
  } > "$artifacts/agent-pr-exploration-report.md"
 }
@ -1181,7 +1173,7 @@ seed_agent_fixture "$agent_fixture"
 if [ "$deterministic_verifier" = "web-static-export" ] && [ "$browser_exploration_needed" != "true" ]; then
  verifier_status="$(cat "$artifacts/deterministic-verifier-exit-code.txt" 2>/dev/null || echo 1)"
  if [ "$verifier_status" = "0" ]; then
-    cat > "$artifacts/expect.log" <<REPORT
+    cat > "$agent_report_file" <<REPORT
 ### ✅ Verdict: Pass

 This PR changes the web deployment/static-export path rather than an interactive user flow. The agent therefore used the deterministic Docker verifier instead of inventing browser interaction cases that would not exercise the changed behavior.
@ -1215,7 +1207,7 @@ Observed result:
 - A dedicated CI smoke for the Vercel/static-export command would make this regression class easier to catch without requiring agent exploration.
 REPORT
  else
-    cat > "$artifacts/expect.log" <<REPORT
+    cat > "$agent_report_file" <<REPORT
 ### ❌ Verdict: Fail

 The deterministic static-export verifier failed. Because this PR changes build/deploy output rather than an interactive browser flow, browser exploration would not be a useful substitute for the failing build-level signal.
@ -1244,7 +1236,7 @@ REPORT
 fi

 if [ "$app_surface_touched" != "true" ]; then
-  cat > "$artifacts/expect.log" <<REPORT
+  cat > "$agent_report_file" <<REPORT
 ### ⚪ Verdict: Inconclusive

 This PR does not touch a path that the browser explorer can map to app UI/runtime behavior, so the run avoided inventing a broad app audit.
@ -1263,7 +1255,7 @@ This PR does not touch a path that the browser explorer can map to app UI/runtim

 - None from this PR diff. Add deterministic checks when a future PR changes app/runtime behavior.
 REPORT
-  echo "No app/runtime surface touched; wrote inconclusive advisory report to $artifacts/expect.log"
+  echo "No app/runtime surface touched; wrote inconclusive advisory report to $agent_report_file"
  record_playwright_artifacts || true
  publish_trace_artifacts_to_r2 || true
  write_agent_report_artifact
@ -1295,7 +1287,7 @@ Keep this as a fast exploratory pass:

 CRITICAL -- finish and submit promptly: the runner aborts this turn with NO report if you produce no output for about 3 minutes. Do not plan or attempt more steps than you will actually complete. As soon as you have verified 2-3 cases (or hit a blocker), stop exploring and emit the COMPLETE Markdown report below as your final message in a single turn. Never leave planned steps pending, retry silently, or run "just one more" check once you have enough to write the verdict.

-Return a reviewer-ready Markdown report fragment. Do not include the top-level title or trace section; the runner prepends the real trace link after artifacts are published.
+Write your final report as a reviewer-ready Markdown fragment to the file ${agent_report_file} using your file-write tool, as your final action. Do not print the report to stdout -- only write the file, then stop. Do not include the top-level title or trace section; the runner prepends the real trace link after artifacts are published.

 Use this structure and keep the writing concrete:

@ -1369,3 +1361,14 @@ publish_trace_artifacts_to_r2 || true
 write_agent_report_artifact

 docker logs "$container_name" > "$artifacts/docker.log" 2>&1 || true
+
+# Persist the report + trace pointer to a stable host dir so dry/validation runs
+# (skip_comment) can be inspected without downloading the slow, large workflow
+# artifact. Overwrites per PR; the big trace zip stays on R2 only.
+report_persist_dir="${OD_SANDBOX_REPORT_DIR:-$HOME/.cache/agent-pr-explore/reports}/pr-${PR_NUMBER}"
+mkdir -p "$report_persist_dir" 2>/dev/null || true
+cp -f "$artifacts/agent-pr-exploration-report.md" "$report_persist_dir/report.md" 2>/dev/null || true
+cp -f "$artifacts/agent-report.md" "$report_persist_dir/agent-report.md" 2>/dev/null || true
+cp -f "$artifacts/expect.log" "$report_persist_dir/expect.log" 2>/dev/null || true
+cp -f "$artifacts/playwright-trace-viewer.txt" "$report_persist_dir/trace-url.txt" 2>/dev/null || true
+echo "Report persisted on runner: $report_persist_dir"
--- a/.github/workflows/agent-pr-explore-sandbox.yml
+++ b/.github/workflows/agent-pr-explore-sandbox.yml
@ -127,7 +127,12 @@ jobs:
        uses: actions/upload-artifact@v7
        with:
          name: agent-pr-explore-sandbox-${{ steps.pr.outputs.number }}-${{ steps.pr.outputs.head_sha }}
-          path: ${{ runner.temp }}/agent-pr-explore-sandbox/artifacts/
+          # The trace zips (~28MB) and videos are already published to R2; keep
+          # them out of the GitHub artifact so it stays small (report + logs).
+          path: |
+            ${{ runner.temp }}/agent-pr-explore-sandbox/artifacts/
+            !${{ runner.temp }}/agent-pr-explore-sandbox/artifacts/**/*.zip
+            !${{ runner.temp }}/agent-pr-explore-sandbox/artifacts/**/*.webm
          if-no-files-found: warn
          retention-days: 7