mirror of
https://github.com/nexu-io/open-design.git
synced 2026-06-01 03:14:35 +07:00
Add sandboxed agent PR exploration (#2604)
This commit is contained in:
parent
ceb636aa1b
commit
b5bf28060b
5 changed files with 1960 additions and 0 deletions
8
.github/actionlint.yaml
vendored
Normal file
8
.github/actionlint.yaml
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
---
# actionlint configuration.

# Per-path overrides: the keys under `paths` are globs, and each `ignore`
# entry is a regular expression matched against actionlint error messages.
paths:
  # Suppress the two shellcheck findings below for *.lock.yml workflows only.
  .github/workflows/*.lock.yml:
    ignore:
      - 'shellcheck reported issue in this script: SC2016:.+'
      - 'shellcheck reported issue in this script: SC2086:.+'

# Custom self-hosted runner labels, declared so `runs-on` entries that use
# them are not reported as unknown labels.
self-hosted-runner:
  labels:
    - agent-pr-explore
|
||||
105
.github/scripts/agent-pr-explore-local.sh
vendored
Executable file
105
.github/scripts/agent-pr-explore-local.sh
vendored
Executable file
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# usage: print the command-line help for this script on stdout.
# Takes no arguments. Callers redirect the output to stderr themselves when
# reporting an invalid invocation.
usage() {
  # One argument per output line; empty arguments produce the blank lines.
  printf '%s\n' \
    'Usage:' \
    '.github/scripts/agent-pr-explore-local.sh <pr-number>' \
    '' \
    'Runs the Docker-isolated PR exploration path from a local or self-hosted' \
    'machine, without relying on a GitHub Actions workflow being present on main.' \
    '' \
    'Required on the host:' \
    'docker, gh, jq, node/npm, expect-cli@0.1.3' \
    '' \
    'Optional environment:' \
    'BASE_REPO=nexu-io/open-design' \
    'RUNNER_TEMP=/tmp/od-agent-pr-explore-local' \
    'OD_EXPECT_TIMEOUT_SECONDS=1200' \
    'OD_SANDBOX_CPUS=4' \
    'OD_SANDBOX_MEMORY=8g' \
    'OD_ALLOW_NPX_EXPECT_CLI=1' \
    'OD_TRACE_R2_UPLOAD=1' \
    'R2_ACCOUNT_ID=...' \
    'R2_ACCESS_KEY_ID=...' \
    'R2_SECRET_ACCESS_KEY=...' \
    'R2_BUCKET=...' \
    'R2_PUBLIC_ORIGIN=https://...'
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main flow
#
#   1. Parse arguments (-h/--help, or a numeric PR number; $PR_NUMBER is the
#      fallback when no argument is given).
#   2. Check host tooling and obtain a GitHub token.
#   3. Read the PR metadata once with `gh pr view` and validate it.
#   4. Hand the validated metadata to agent-pr-explore-sandbox.sh through
#      environment variables.
#
# Exit codes: 0 on success or --help, 2 on a usage error, 1 on any failed
# precondition; a sandbox failure aborts the script through `set -e`.
# ---------------------------------------------------------------------------

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  usage
  exit 0
fi

pr_number="${1:-${PR_NUMBER:-}}"
if ! [[ "$pr_number" =~ ^[0-9]+$ ]]; then
  usage >&2
  exit 2
fi

base_repo="${BASE_REPO:-nexu-io/open-design}"
runner_temp="${RUNNER_TEMP:-/tmp/od-agent-pr-explore-local}"

# Locate the sandbox runner next to this file instead of relative to the
# current working directory, so the script also works when it is invoked from
# outside the repository root.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
sandbox_script="$script_dir/agent-pr-explore-sandbox.sh"
if [ ! -f "$sandbox_script" ]; then
  echo "::error::Sandbox runner not found: $sandbox_script" >&2
  exit 1
fi

for command_name in docker gh jq node; do
  if ! command -v "$command_name" >/dev/null 2>&1; then
    echo "::error::$command_name is required on the mini/local runner" >&2
    exit 1
  fi
done

# Prefer an explicitly provided GH_TOKEN; otherwise reuse the gh CLI login.
if [ -z "${GH_TOKEN:-}" ]; then
  if ! GH_TOKEN="$(gh auth token 2>/dev/null)"; then
    echo "::error::GH_TOKEN is not set and gh auth token failed. Run gh auth login or export GH_TOKEN." >&2
    exit 1
  fi
  export GH_TOKEN
fi

# expect-cli must be installed unless the operator opted into the npx fallback.
if ! command -v expect-cli >/dev/null 2>&1 && [ "${OD_ALLOW_NPX_EXPECT_CLI:-0}" != "1" ]; then
  echo "::error::expect-cli is not installed. Install it on the mini with: npm install -g expect-cli@${OD_EXPECT_CLI_VERSION:-0.1.3}" >&2
  echo " For a one-off smoke run, set OD_ALLOW_NPX_EXPECT_CLI=1 to use the pinned npx fallback." >&2
  exit 1
fi

mkdir -p "$runner_temp"

# A single API call, so every field below describes the same PR snapshot.
pr_json="$(gh pr view "$pr_number" --repo "$base_repo" --json state,isDraft,headRefOid,baseRefOid,headRepositoryOwner,headRepository)"
state="$(jq -r '.state' <<<"$pr_json")"
draft="$(jq -r '.isDraft' <<<"$pr_json")"
head_sha="$(jq -r '.headRefOid' <<<"$pr_json")"
base_sha="$(jq -r '.baseRefOid' <<<"$pr_json")"
head_repo="$(jq -r '.headRepositoryOwner.login + "/" + .headRepository.name' <<<"$pr_json")"

if [ "$state" != "OPEN" ]; then
  echo "::error::Refusing to explore PR $pr_number because state is $state." >&2
  exit 1
fi
if [ "$draft" != "false" ]; then
  echo "::error::Refusing to explore draft PR $pr_number." >&2
  exit 1
fi
if ! [[ "$head_sha" =~ ^[0-9a-f]{40}$ && "$base_sha" =~ ^[0-9a-f]{40}$ ]]; then
  echo "::error::Invalid PR SHA metadata for PR $pr_number." >&2
  exit 1
fi
# When the head repository is gone (for example a deleted fork) the owner and
# name are null and the jq concatenation above collapses to "/". Refuse to pass
# such a value to the sandbox.
if ! [[ "$head_repo" =~ ^[^/]+/[^/]+$ ]]; then
  echo "::error::Could not resolve the head repository for PR $pr_number (got '$head_repo')." >&2
  exit 1
fi

echo "Running agent PR exploration locally"
echo " PR: $base_repo#$pr_number"
echo " Head: $head_repo@$head_sha"
echo " Base SHA: $base_sha"
echo " Temp root: $runner_temp"

# The sandbox runner takes its inputs from the environment.
PR_NUMBER="$pr_number" \
HEAD_SHA="$head_sha" \
HEAD_REPO="$head_repo" \
BASE_REPO="$base_repo" \
BASE_SHA="$base_sha" \
RUNNER_TEMP="$runner_temp" \
GH_TOKEN="$GH_TOKEN" \
  "$sandbox_script"

echo
echo "Artifacts:"
echo " $runner_temp/agent-pr-explore-sandbox/artifacts"
|
||||
1274
.github/scripts/agent-pr-explore-sandbox.sh
vendored
Executable file
1274
.github/scripts/agent-pr-explore-sandbox.sh
vendored
Executable file
File diff suppressed because it is too large
Load diff
235
.github/workflows/agent-pr-explore-sandbox.yml
vendored
Normal file
235
.github/workflows/agent-pr-explore-sandbox.yml
vendored
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
name: agent-pr-explore-sandbox
|
||||
|
||||
# Trusted-orchestrator workflow for PR exploration. It intentionally uses
|
||||
# pull_request_target so the workflow file and runner script come from the
|
||||
# protected base branch. The PR head is never checked out on the host runner;
|
||||
# the sandbox script fetches and executes it inside Docker.
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- "apps/web/**"
|
||||
- "package.json"
|
||||
- "pnpm-lock.yaml"
|
||||
- "pnpm-workspace.yaml"
|
||||
- ".github/workflows/agent-pr-explore-sandbox.yml"
|
||||
- ".github/scripts/agent-pr-explore-sandbox.sh"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
pr_number:
|
||||
description: Pull request number to explore.
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
pull-requests: write
|
||||
|
||||
concurrency:
|
||||
group: agent-pr-explore-sandbox-${{ github.event.pull_request.number || inputs.pr_number }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
sandbox:
|
||||
name: Sandbox PR runtime
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || !github.event.pull_request.draft }}
|
||||
runs-on: [self-hosted, agent-pr-explore]
|
||||
environment: agent-pr-explore
|
||||
timeout-minutes: 45
|
||||
|
||||
steps:
|
||||
- name: Checkout trusted base scripts
|
||||
uses: actions/checkout@v6.0.2
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.base.sha || github.sha }}
|
||||
persist-credentials: false
|
||||
|
||||
# Resolves and validates the PR that this run explores, then publishes
# number, author, head_sha, head_repo and base_sha as step outputs.
# The step fails when the PR number is not numeric, the PR is not open, the
# PR is a draft, a SHA is not a 40-character hex string, or the head
# repository cannot be resolved.
- name: Resolve PR metadata
  id: pr
  shell: bash
  env:
    GH_TOKEN: ${{ github.token }}
    EVENT_PR_NUMBER: ${{ github.event.pull_request.number || inputs.pr_number }}
  run: |
    set -euo pipefail
    if ! [[ "$EVENT_PR_NUMBER" =~ ^[0-9]+$ ]]; then
      echo "::error::Invalid PR number: $EVENT_PR_NUMBER"
      exit 1
    fi

    # Fetch every field in ONE request. Separate requests could straddle a
    # push or a state change and combine values from different PR revisions.
    # jq is a documented host prerequisite of the runner.
    pr_json="$(gh pr view "$EVENT_PR_NUMBER" --repo "$GITHUB_REPOSITORY" --json state,isDraft,author,headRefOid,baseRefOid,headRepositoryOwner,headRepository)"
    state="$(jq -r '.state' <<<"$pr_json")"
    draft="$(jq -r '.isDraft' <<<"$pr_json")"
    # A missing author becomes an empty string rather than the text "null".
    author="$(jq -r '.author.login // ""' <<<"$pr_json")"
    head_sha="$(jq -r '.headRefOid' <<<"$pr_json")"
    head_repo="$(jq -r '.headRepositoryOwner.login + "/" + .headRepository.name' <<<"$pr_json")"
    base_sha="$(jq -r '.baseRefOid' <<<"$pr_json")"

    if [ "$state" != "OPEN" ]; then
      echo "::error::Refusing to explore PR $EVENT_PR_NUMBER because state is $state."
      exit 1
    fi
    if [ "$draft" != "false" ]; then
      echo "::error::Refusing to explore draft PR $EVENT_PR_NUMBER."
      exit 1
    fi
    # The base repository is always $GITHUB_REPOSITORY here because the PR is
    # looked up with --repo "$GITHUB_REPOSITORY", so no separate check is made.
    if ! [[ "$head_sha" =~ ^[0-9a-f]{40}$ && "$base_sha" =~ ^[0-9a-f]{40}$ ]]; then
      echo "::error::Invalid PR SHA metadata."
      exit 1
    fi
    # A deleted head repository yields null owner/name, which the jq
    # concatenation above turns into "/". Reject anything but owner/name.
    if ! [[ "$head_repo" =~ ^[^/]+/[^/]+$ ]]; then
      echo "::error::Could not resolve the head repository for PR $EVENT_PR_NUMBER (got '$head_repo')."
      exit 1
    fi

    {
      echo "number=$EVENT_PR_NUMBER"
      echo "author=$author"
      echo "head_sha=$head_sha"
      echo "head_repo=$head_repo"
      echo "base_sha=$base_sha"
    } >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Run expect against Docker-isolated PR app
|
||||
shell: bash
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
PR_NUMBER: ${{ steps.pr.outputs.number }}
|
||||
HEAD_SHA: ${{ steps.pr.outputs.head_sha }}
|
||||
HEAD_REPO: ${{ steps.pr.outputs.head_repo }}
|
||||
BASE_REPO: ${{ github.repository }}
|
||||
BASE_SHA: ${{ steps.pr.outputs.base_sha }}
|
||||
OD_SANDBOX_CPUS: "4"
|
||||
OD_SANDBOX_MEMORY: "8g"
|
||||
OD_SANDBOX_READY_TIMEOUT_SECONDS: "900"
|
||||
OD_EXPECT_TIMEOUT_SECONDS: "1200"
|
||||
OD_EXPECT_CONTEXT_MAX_BYTES: "120000"
|
||||
OD_TRACE_R2_UPLOAD: "1"
|
||||
R2_ACCOUNT_ID: ${{ secrets.R2_ACCOUNT_ID }}
|
||||
R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
|
||||
R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
|
||||
R2_BUCKET: ${{ secrets.R2_BUCKET }}
|
||||
R2_PUBLIC_ORIGIN: ${{ vars.R2_PUBLIC_ORIGIN }}
|
||||
run: .github/scripts/agent-pr-explore-sandbox.sh
|
||||
|
||||
- name: Upload sandbox artifacts
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v7
|
||||
with:
|
||||
name: agent-pr-explore-sandbox-${{ steps.pr.outputs.number }}-${{ steps.pr.outputs.head_sha }}
|
||||
path: ${{ runner.temp }}/agent-pr-explore-sandbox/artifacts/
|
||||
if-no-files-found: warn
|
||||
retention-days: 7
|
||||
|
||||
- name: Comment exploration report
|
||||
if: always()
|
||||
shell: bash
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
PR_NUMBER: ${{ steps.pr.outputs.number }}
|
||||
PR_AUTHOR: ${{ steps.pr.outputs.author }}
|
||||
HEAD_SHA: ${{ steps.pr.outputs.head_sha }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
report="$RUNNER_TEMP/agent-pr-explore-sandbox/artifacts/agent-pr-exploration-report.md"
|
||||
if [ ! -s "$report" ]; then
|
||||
echo "No agent exploration report was produced; skipping PR comment."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
marker="<!-- agent-pr-explore-sandbox:${PR_NUMBER} -->"
|
||||
body_file="$(mktemp)"
|
||||
{
|
||||
cat "$report"
|
||||
echo
|
||||
echo "$marker"
|
||||
} > "$body_file"
|
||||
|
||||
comment_id="$(
|
||||
gh api "repos/$GITHUB_REPOSITORY/issues/$PR_NUMBER/comments" --paginate \
|
||||
--jq ".[] | select(.user.type == \"Bot\" and (.body | contains(\"$marker\"))) | .id" \
|
||||
| tail -n 1
|
||||
)"
|
||||
body="$(cat "$body_file")"
|
||||
if [ -n "$comment_id" ]; then
|
||||
gh api "repos/$GITHUB_REPOSITORY/issues/comments/$comment_id" -X PATCH -f body="$body" --silent
|
||||
else
|
||||
gh api "repos/$GITHUB_REPOSITORY/issues/$PR_NUMBER/comments" -f body="$body" --silent
|
||||
fi
|
||||
|
||||
case "${PR_AUTHOR,,}" in
|
||||
nettee|mrcfps|alchemistklk|siri-ray)
|
||||
;;
|
||||
*)
|
||||
echo "PR author $PR_AUTHOR is not configured for inline agent reports; skipping inline review comment."
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
|
||||
files_json="$(mktemp)"
|
||||
gh api --paginate --slurp "repos/$GITHUB_REPOSITORY/pulls/$PR_NUMBER/files" > "$files_json"
|
||||
inline_target="$(
|
||||
FILES_JSON="$files_json" node <<'NODE'
|
||||
const fs = require("node:fs");
|
||||
const pages = JSON.parse(fs.readFileSync(process.env.FILES_JSON, "utf8"));
|
||||
const files = pages.flat();
|
||||
|
||||
/**
 * Find the first added line in a pull-request file entry.
 *
 * @param {{patch?: string}} file - File entry whose `patch` is unified-diff
 *   hunk text.
 * @returns {number|null} 1-based line number, in the new version of the file,
 *   of the first "+" line; null when `patch` is not a string or contains no
 *   added line.
 */
function firstAddedLine(file) {
  if (typeof file.patch !== "string") return null;
  // Line number in the new file that the next context/added line will occupy.
  let newLine = null;
  for (const line of file.patch.split("\n")) {
    const hunk = /^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/.exec(line);
    if (hunk) {
      // Each hunk header restarts the counter at that hunk's new-file start.
      newLine = Number(hunk[1]);
      continue;
    }
    // Ignore anything that appears before the first hunk header.
    if (newLine == null) continue;
    // "\ No newline at end of file" is diff metadata, not file content.
    // Counting it as a context line would shift every later line number by
    // one and anchor the inline comment on a line outside the diff.
    if (line.startsWith("\\")) continue;
    if (line.startsWith("+") && !line.startsWith("+++")) return newLine;
    // Removed lines exist only in the old file; everything else advances.
    if (!line.startsWith("-")) newLine += 1;
  }
  return null;
}
|
||||
|
||||
for (const file of files) {
|
||||
if (!file || file.status === "removed") continue;
|
||||
const line = firstAddedLine(file);
|
||||
if (line != null) {
|
||||
process.stdout.write(JSON.stringify({ path: file.filename, line }));
|
||||
process.exit(0);
|
||||
}
|
||||
}
|
||||
NODE
|
||||
)"
|
||||
if [ -z "$inline_target" ]; then
|
||||
echo "No added diff line was available for an inline agent report; skipping inline review comment."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
inline_path="$(node -e 'const target = JSON.parse(process.argv[1]); process.stdout.write(target.path)' "$inline_target")"
|
||||
inline_line="$(node -e 'const target = JSON.parse(process.argv[1]); process.stdout.write(String(target.line))' "$inline_target")"
|
||||
inline_marker="<!-- agent-pr-explore-inline:${PR_NUMBER} -->"
|
||||
inline_body_file="$(mktemp)"
|
||||
{
|
||||
cat "$report"
|
||||
echo
|
||||
echo "$inline_marker"
|
||||
} > "$inline_body_file"
|
||||
inline_body="$(cat "$inline_body_file")"
|
||||
|
||||
inline_comment_id="$(
|
||||
gh api "repos/$GITHUB_REPOSITORY/pulls/$PR_NUMBER/comments" --paginate \
|
||||
--jq ".[] | select(.user.type == \"Bot\" and (.body | contains(\"$inline_marker\"))) | .id" \
|
||||
| tail -n 1
|
||||
)"
|
||||
if [ -n "$inline_comment_id" ]; then
|
||||
gh api "repos/$GITHUB_REPOSITORY/pulls/comments/$inline_comment_id" -X PATCH -f body="$inline_body" --silent
|
||||
else
|
||||
gh api "repos/$GITHUB_REPOSITORY/pulls/$PR_NUMBER/comments" \
|
||||
-f body="$inline_body" \
|
||||
-f commit_id="$HEAD_SHA" \
|
||||
-f path="$inline_path" \
|
||||
-F line="$inline_line" \
|
||||
-f side=RIGHT \
|
||||
--silent
|
||||
fi
|
||||
338
specs/change/20260522-pr-explore-agent/spec.md
Normal file
338
specs/change/20260522-pr-explore-agent/spec.md
Normal file
|
|
@ -0,0 +1,338 @@
|
|||
---
|
||||
id: 20260522-pr-explore-agent
|
||||
name: PR Explore Agent - Advisory Web E2E
|
||||
status: designed
|
||||
created: '2026-05-22'
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
### Problem Statement
|
||||
|
||||
PR throughput is outpacing the maintainer pool's bandwidth for one half
of review: checking whether the browser behavior a PR claims actually
lands.
|
||||
|
||||
Existing deterministic E2E/visual checks cover predefined scenarios.
|
||||
They do not read a PR body, infer the riskiest changed behavior, and
|
||||
probe that behavior in a running app. This proposal adds an advisory
|
||||
agent lane for that manual reviewer task.
|
||||
|
||||
### Goal
|
||||
|
||||
Add a per-PR advisory, manually-approved agent that:
|
||||
|
||||
- reads the PR body and diff;
|
||||
- boots the PR's `apps/web` runtime inside Docker;
|
||||
- uses host-side `expect-cli` to explore a small number of
|
||||
diff-implied browser cases;
|
||||
- captures Playwright trace/video/screenshot artifacts;
|
||||
- posts a reviewer-ready PR report with trace, verdict, concrete
|
||||
evidence, and E2E coverage suggestions.
|
||||
|
||||
The agent does not gate merge, replace deterministic E2E, or replace
|
||||
the visual-regression workflows.
|
||||
|
||||
## Scope
|
||||
|
||||
P1 is intentionally web-only.
|
||||
|
||||
In scope:
|
||||
|
||||
- PRs touching `apps/web/**`.
|
||||
- Root workspace inputs that can affect the web runtime:
|
||||
`package.json`, `pnpm-lock.yaml`, `pnpm-workspace.yaml`.
|
||||
- The sandbox workflow/script themselves:
|
||||
`.github/workflows/agent-pr-explore-sandbox.yml` and
|
||||
`.github/scripts/agent-pr-explore-sandbox.sh`.
|
||||
- Manual workflow dispatch for a specific PR number.
|
||||
- Advisory output only.
|
||||
|
||||
Out of scope for P1:
|
||||
|
||||
- `apps/landing-page/**` and its content sources. The landing page is
|
||||
a separate Astro runtime and must not be reported as verified by the
|
||||
`apps/web` sandbox. A follow-up should add a separate landing-page
|
||||
boot path or a two-pass surface router.
|
||||
- `apps/daemon/src/**`, `packages/contracts/**`, and `od` CLI
|
||||
verification. The browser explorer cannot prove CLI/API contract
|
||||
behavior.
|
||||
- The older `gh-aw` workflow path and STEP marker extractor. This PR
|
||||
deploys the self-hosted Docker sandbox path only.
|
||||
- Auto-fix or patch-suggesting behavior.
|
||||
- Merge-blocking enforcement.
|
||||
|
||||
## Security Model
|
||||
|
||||
The workflow uses `pull_request_target` only as a trusted orchestrator.
|
||||
The workflow file and shell runner come from the protected base branch;
|
||||
the PR head is fetched by exact commit SHA inside Docker.
|
||||
|
||||
The Docker sandbox receives:
|
||||
|
||||
- no repo/org secrets;
|
||||
- no model credentials;
|
||||
- no host `$HOME`;
|
||||
- no `.ssh`, `.config`, `.codex`, or Claude/Codex credential files;
|
||||
- no Docker socket.
|
||||
|
||||
Host-side `expect-cli` may use the operator's model/OAuth credentials,
|
||||
but it receives only PR metadata, diff/body context, and the sandboxed
|
||||
localhost URL. It must not run arbitrary host shell commands or expose
|
||||
host files to untrusted PR code.
|
||||
|
||||
The workflow is environment-gated through GitHub's native
|
||||
`agent-pr-explore` environment. Every run waits for a maintainer to
|
||||
click Approve in the PR checks UI before the self-hosted runner starts
|
||||
the expensive/sensitive portion.
|
||||
|
||||
## Runtime
|
||||
|
||||
### Workflow
|
||||
|
||||
`.github/workflows/agent-pr-explore-sandbox.yml` is the trusted
|
||||
orchestrator.
|
||||
|
||||
It:
|
||||
|
||||
1. Triggers on matching `pull_request_target` events or manual
|
||||
`workflow_dispatch`.
|
||||
2. Checks out trusted base scripts only.
|
||||
3. Resolves PR number, author, head SHA, head repo, and base SHA.
|
||||
4. Runs `.github/scripts/agent-pr-explore-sandbox.sh` on a
|
||||
self-hosted runner labeled `agent-pr-explore`.
|
||||
5. Uploads artifacts.
|
||||
6. Posts or updates one sticky PR comment.
|
||||
7. For Looper-authored/managed users (`nettee`, `mrcfps`,
|
||||
`alchemistklk`, `Siri-Ray`), also posts or updates one inline review
|
||||
comment anchored to the first added line in the current diff.
|
||||
|
||||
### Sandbox Runner
|
||||
|
||||
`.github/scripts/agent-pr-explore-sandbox.sh`:
|
||||
|
||||
1. Validates PR metadata and required host tools.
|
||||
2. Builds PR context from GitHub API/diff data.
|
||||
3. Selects a small fixture when the diff maps to known web UI state.
|
||||
4. Starts a Docker container from `node:24-bookworm`.
|
||||
5. Fetches the PR head SHA inside Docker.
|
||||
6. Installs dependencies using a host-mounted pnpm store cache.
|
||||
7. Builds daemon/tools-dev, then boots:
|
||||
|
||||
```bash
|
||||
pnpm tools-dev run web \
|
||||
--namespace "agent-pr-<number>-<sha8>" \
|
||||
--daemon-port 17456 \
|
||||
--web-port 17573
|
||||
```
|
||||
|
||||
8. Publishes the container web proxy to a host localhost port.
|
||||
9. Runs host-side `expect-cli` against that URL.
|
||||
10. Records smoke Playwright artifacts and writes the final report.
|
||||
|
||||
The workflow concurrency key is PR-number scoped with
|
||||
`cancel-in-progress: true`, so a new push cancels older pending/running
|
||||
exploration for the same PR.
|
||||
|
||||
## Deterministic Verifiers
|
||||
|
||||
Some changes are build/deploy behavior rather than browser interaction.
|
||||
P1 includes a deterministic verifier for Vercel/static-export changes:
|
||||
|
||||
- `vercel.json`
|
||||
- `apps/web/next.config.ts`
|
||||
- `apps/web/tests/runtime/app-route-export.test.ts`
|
||||
|
||||
For those PRs, when no browser-observable files are touched, the runner
|
||||
skips browser exploration and executes this inside the Docker checkout:
|
||||
|
||||
```bash
|
||||
rm -rf apps/web/out apps/web/.next
|
||||
OD_WEB_OUTPUT_MODE=server sh -c 'OD_WEB_OUTPUT_MODE= pnpm --filter @open-design/web build && test -d apps/web/out'
|
||||
test -f apps/web/out/index.html
|
||||
```
|
||||
|
||||
The deterministic result becomes the advisory verdict. The runner still
|
||||
boots the app and captures smoke artifacts for reviewer debugging.
|
||||
|
||||
## Report Contract
|
||||
|
||||
The canonical reviewer-facing artifact is
|
||||
`agent-pr-exploration-report.md`.
|
||||
|
||||
It always has this shape:
|
||||
|
||||
```markdown
|
||||
## 🤖 Agent PR Exploration Report
|
||||
|
||||
### 🎬 Trace
|
||||
|
||||
[Open Playwright trace](...)
|
||||
|
||||
### ✅ Verdict: Pass
|
||||
|
||||
...
|
||||
|
||||
### 🧪 What Was Verified / Cases Tested
|
||||
|
||||
...
|
||||
|
||||
### 🔍 Concrete Evidence
|
||||
|
||||
...
|
||||
|
||||
### 🧱 E2E Coverage to Sediment
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
The trace section is first. The report must describe what actually ran;
|
||||
it must not use "dry run" wording for real workflow output.
|
||||
|
||||
For normal browser exploration, the prompt asks the agent to return only
|
||||
the markdown fragment below the trace section. The runner prepends the
|
||||
real trace link after artifact upload. For deterministic verifier paths,
|
||||
the shell script writes the same report structure directly.
|
||||
|
||||
The "E2E Coverage to Sediment" section is required. It should state
|
||||
which deterministic test, fixture, mock, or CI smoke should be added if
|
||||
the exploratory run found a repeatable behavior worth preserving.
|
||||
|
||||
## Artifacts
|
||||
|
||||
Artifacts are written under
|
||||
`$RUNNER_TEMP/agent-pr-explore-sandbox/artifacts/`.
|
||||
|
||||
Important files:
|
||||
|
||||
- `agent-pr-exploration-report.md`
|
||||
- `expect.log`
|
||||
- `expect-exit-code.txt`
|
||||
- `deterministic-verifier.log` when selected
|
||||
- `playwright-smoke-trace.zip`
|
||||
- `playwright-smoke-session.webm`
|
||||
- `playwright-initial.png`
|
||||
- `playwright-final.png`
|
||||
- `playwright-trace-viewer.md`
|
||||
- `playwright-trace-viewer.txt`
|
||||
- `playwright-recording-summary.json`
|
||||
- `docker.log`
|
||||
- `sandbox.log`
|
||||
|
||||
The Playwright recording is a post-run smoke/debug artifact, not the
|
||||
source of truth for the verdict. It overlays a reviewer HUD identifying
|
||||
the fixture/verifier being replayed.
|
||||
|
||||
The recorder waits for a visible document plus a short UI-settle delay.
|
||||
It intentionally does not use Playwright `networkidle`, because Open
|
||||
Design keeps background connections active and `networkidle` creates
|
||||
misleading trace timeout steps.
|
||||
|
||||
## R2 Trace Upload
|
||||
|
||||
When `OD_TRACE_R2_UPLOAD=1`, the host runner uploads selected artifacts
|
||||
to Cloudflare R2 using these environment values:
|
||||
|
||||
- `R2_ACCOUNT_ID`
|
||||
- `R2_ACCESS_KEY_ID`
|
||||
- `R2_SECRET_ACCESS_KEY`
|
||||
- `R2_BUCKET`
|
||||
- `R2_PUBLIC_ORIGIN`
|
||||
|
||||
R2 credentials stay on the host and are never passed into Docker.
|
||||
|
||||
The default object prefix is:
|
||||
|
||||
```text
|
||||
agent-pr-explore/pr-<number>/<head-sha>/
|
||||
```
|
||||
|
||||
The public R2 origin must allow browser fetches from
|
||||
`https://trace.playwright.dev` or `*` via CORS, otherwise the trace zip
|
||||
may be public but the hosted trace viewer cannot load it.
|
||||
|
||||
## Operator Runbook
|
||||
|
||||
### GitHub Setup
|
||||
|
||||
Required repository/environment setup:
|
||||
|
||||
- `agent-pr-explore` environment exists.
|
||||
- Environment required reviewers are configured.
|
||||
- A self-hosted runner is available with labels:
|
||||
- `self-hosted`
|
||||
- `agent-pr-explore`
|
||||
- Environment secrets:
|
||||
- `R2_ACCOUNT_ID`
|
||||
- `R2_ACCESS_KEY_ID`
|
||||
- `R2_SECRET_ACCESS_KEY`
|
||||
- `R2_BUCKET`
|
||||
- Environment variable:
|
||||
- `R2_PUBLIC_ORIGIN`
|
||||
|
||||
### Mini / Runner Host
|
||||
|
||||
Install host prerequisites:
|
||||
|
||||
```bash
|
||||
gh auth login
|
||||
npm install -g expect-cli@0.1.3
|
||||
command -v jq
|
||||
docker info
|
||||
```
|
||||
|
||||
The runner does not use mutable `expect-cli@latest` by default. If
|
||||
`expect-cli` is not preinstalled, operators must either install the
|
||||
pinned version above or explicitly set `OD_ALLOW_NPX_EXPECT_CLI=1` for
|
||||
a one-off smoke run using the pinned npx fallback.
|
||||
|
||||
### Manual Local Smoke
|
||||
|
||||
Before or after merging, the Mac mini can run the same sandbox path
|
||||
manually:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:nexu-io/open-design.git
|
||||
cd open-design
|
||||
git fetch origin pull/2604/head:agent-pr-explore-sandbox
|
||||
git checkout agent-pr-explore-sandbox
|
||||
RUNNER_TEMP=/tmp/od-agent-pr-explore-local \
|
||||
OD_EXPECT_TIMEOUT_SECONDS=1200 \
|
||||
.github/scripts/agent-pr-explore-local.sh <open-pr-number>
|
||||
```
|
||||
|
||||
## Rollout
|
||||
|
||||
P1 is a controlled rollout:
|
||||
|
||||
1. Merge the sandbox workflow and scripts.
|
||||
2. Register the mini as the dedicated `agent-pr-explore` runner.
|
||||
3. Trigger `agent-pr-explore-sandbox` manually for 3-5 open PRs.
|
||||
4. Verify runner stability, R2 trace links, sticky comments, and Looper
|
||||
inline comment behavior.
|
||||
5. Expand reviewer approval volume only after the reports are useful and
|
||||
no isolation incidents occur.
|
||||
|
||||
Landing-page exploration, CLI/API exploration, deeper gh-aw integration,
|
||||
and multi-surface routing are separate follow-up work.
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- Maintainer can approve and run an exploration job manually.
|
||||
- The PR code runs only inside Docker.
|
||||
- No host credential is mounted or forwarded into Docker.
|
||||
- The final report starts with a clickable Playwright trace link when
|
||||
R2 upload is configured.
|
||||
- The report includes an explicit E2E coverage-sedimentation section.
|
||||
- Looper authors receive an inline review comment thread in addition to
|
||||
the sticky top-level report.
|
||||
- Non-Looper authors receive only the sticky top-level report.
|
||||
- `expect-cli` non-zero exits preserve artifacts and are advisory, while
|
||||
sandbox/bootstrap failures fail the job.
|
||||
|
||||
## References
|
||||
|
||||
- GitHub Actions secrets and fork PRs:
|
||||
https://docs.github.com/en/actions/how-tos/security-for-github-actions/security-guides/using-secrets-in-github-actions
|
||||
- Playwright Trace Viewer:
|
||||
https://trace.playwright.dev/
|
||||
Loading…
Reference in a new issue