#!/usr/bin/env bash
# Fetch the recording corpus referenced by mocks/manifest.json from
# Cloudflare R2 into mocks/recordings/. Skips files already on disk
# whose sha256 matches the manifest. Verifies every download.
#
# Usage:
# bash mocks/scripts/fetch-recordings.sh # fetch all
# bash mocks/scripts/fetch-recordings.sh --agent claude # fetch claude only
# bash mocks/scripts/fetch-recordings.sh --outcome failed # fetch failed only
# bash mocks/scripts/fetch-recordings.sh --skill agent-browser
# bash mocks/scripts/fetch-recordings.sh --concurrency 16
# bash mocks/scripts/fetch-recordings.sh --force # re-download all
# bash mocks/scripts/fetch-recordings.sh --cache-dir DIR
# override cache location
#
# Default cache: mocks/recordings/. Override with OD_MOCKS_CACHE_DIR env
# or --cache-dir flag — useful for sharing across multiple OD checkouts.
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Symlink-resolved directory of this script, its parent directory, and the
# manifest that lives in that parent.
HERE="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P
)"
MOCKS_DIR="$(
  cd "$HERE/.." && pwd -P
)"
MANIFEST="${MOCKS_DIR}/manifest.json"

# Defaults; the flag parsing that follows may override any of these.
CACHE_DIR="${OD_MOCKS_CACHE_DIR:-${MOCKS_DIR}/recordings}"
CONCURRENCY=8
FORCE=0
FILTER_AGENT=
FILTER_OUTCOME=
FILTER_SKILL=
#######################################
# Parse command-line flags into the script's option globals.
# Globals (written): FILTER_AGENT, FILTER_OUTCOME, FILTER_SKILL,
#                    CONCURRENCY, CACHE_DIR, FORCE
# Arguments: the script's own arguments ("$@")
# Exits:
#   0 after printing the usage header for -h/--help
#   2 on an unknown flag, a flag that is missing its value, or a
#     --concurrency value that is not a non-negative integer
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --agent | --outcome | --skill | --concurrency | --cache-dir)
        # Check for the value explicitly: reading a missing "$2" under
        # `set -u` would abort with an opaque "unbound variable" error.
        if [[ $# -lt 2 ]]; then
          echo "missing value for $1" >&2
          exit 2
        fi
        case "$1" in
          --agent) FILTER_AGENT="$2" ;;
          --outcome) FILTER_OUTCOME="$2" ;;
          --skill) FILTER_SKILL="$2" ;;
          --cache-dir) CACHE_DIR="$2" ;;
          --concurrency)
            # The value is handed to `xargs -P` later; reject garbage here
            # with a clear message instead of failing mid-run.
            if ! [[ "$2" =~ ^[0-9]+$ ]]; then
              echo "--concurrency expects a non-negative integer, got: $2" >&2
              exit 2
            fi
            CONCURRENCY="$2"
            ;;
        esac
        shift 2
        ;;
      --force)
        FORCE=1
        shift
        ;;
      -h | --help)
        # The usage text is the comment header of this file (lines 2-17)
        # with the leading "# " stripped.
        sed -n '2,17p' "$0" | sed 's/^# //; s/^#//'
        exit 0
        ;;
      *)
        echo "unknown flag: $1" >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# Nothing can be fetched without the manifest, so stop right away if it is
# not a regular file.
[[ -f "$MANIFEST" ]] || {
  echo "✗ manifest not found at $MANIFEST" >&2
  exit 1
}

# Make sure the download destination exists before anything is written to it.
mkdir -p "$CACHE_DIR"
# The manifest is read with node rather than shell text tools. The download
# prefix is storage.public_url_base, a "/", then storage.object_prefix.
PUBLIC_URL=$(node -e '
  const { readFileSync } = require("fs");
  const manifest = JSON.parse(readFileSync(process.argv[1], "utf-8"));
  const base = manifest.storage.public_url_base;
  const prefix = manifest.storage.object_prefix;
  process.stdout.write(base + "/" + prefix);
' "$MANIFEST")
# Pick the manifest entries that pass every active filter (an empty filter
# matches everything) and emit one tab-separated row per entry:
#   trace_id \t sha256 \t bytes
ENTRIES_TSV=$(node -e '
  const [manifestPath, wantAgent, wantOutcome, wantSkill] = process.argv.slice(1);
  const manifest = JSON.parse(require("fs").readFileSync(manifestPath, "utf-8"));
  const keep = (entry) =>
    !(wantAgent && entry.agent !== wantAgent) &&
    !(wantOutcome && entry.outcome !== wantOutcome) &&
    !(wantSkill && !(entry.skills || []).includes(wantSkill));
  for (const entry of manifest.entries) {
    if (keep(entry)) {
      process.stdout.write(`${entry.trace_id}\t${entry.sha256}\t${entry.bytes}\n`);
    }
  }
' "$MANIFEST" "$FILTER_AGENT" "$FILTER_OUTCOME" "$FILTER_SKILL")
# Test for "nothing selected" on the string itself, before counting lines:
# printing an empty string followed by a newline yields one (empty) line, so a
# line count would report 1 and a mistyped filter such as `--agent xyz` would
# look like a run with one entry.
if [[ -z "$ENTRIES_TSV" ]]; then
  echo "no entries matched filter" >&2
  exit 0
fi

# One row per selected entry, so the row count is the number of candidates.
TOTAL=$(printf '%s\n' "$ENTRIES_TSV" | awk 'END { print NR }')

# Run banner: destination, inputs, and whichever options are active.
echo "Fetching up to $TOTAL recordings → $CACHE_DIR"
echo " manifest: $MANIFEST"
echo " R2 prefix: $PUBLIC_URL"
if [[ -n "$FILTER_AGENT" ]]; then
  echo " filter: agent=$FILTER_AGENT"
fi
if [[ -n "$FILTER_OUTCOME" ]]; then
  echo " filter: outcome=$FILTER_OUTCOME"
fi
if [[ -n "$FILTER_SKILL" ]]; then
  echo " filter: skill=$FILTER_SKILL"
fi
if [ "$FORCE" -eq 1 ]; then
  echo " --force: re-downloading all matched"
fi
echo
# Function called by xargs — must be exported. Writes one of:
# ✓ (newly fetched)
# • (skipped — sha256 already matches)
# ✗ (failed — sha256 mismatch or download error)
fetch_one() {
local id="$1" sha="$2" bytes="$3"
local dest="$CACHE_DIR/$id.jsonl"
if [ "$FORCE" -ne 1 ] && [ -f "$dest" ]; then
local existing
existing=$(shasum -a 256 "$dest" 2>/dev/null | awk '{print $1}')
if [ "$existing" = "$sha" ]; then
echo "• $id"
return 0
fi
fi
local url="${PUBLIC_URL}${id}.jsonl"
if ! curl -sf -o "$dest.tmp" "$url"; then
echo "✗ $id (download failed)"
rm -f "$dest.tmp"
return 1
fi
local got
got=$(shasum -a 256 "$dest.tmp" | awk '{print $1}')
if [ "$got" != "$sha" ]; then
echo "✗ $id (sha256 mismatch: got $got expected $sha)"
rm -f "$dest.tmp"
return 1
fi
mv "$dest.tmp" "$dest"
echo "✓ $id"
}
# fetch_one runs in child `bash -c` processes started by xargs, so both the
# function and the variables it reads have to be exported.
export PUBLIC_URL CACHE_DIR FORCE
export -f fetch_one

# Per-run log created with mktemp: a fixed name under /tmp is predictable
# (symlink attacks) and gets clobbered when two runs overlap.
PROGRESS_LOG=$(mktemp "${TMPDIR:-/tmp}/od-mocks-fetch-progress.XXXXXX")

# One row per invocation (-L 1); xargs splits the row on the tabs into the
# three positional arguments. xargs exits non-zero as soon as any fetch_one
# fails, which under `set -e` + pipefail would abort the script right here,
# before the summary and the failure list are printed. Capture the status
# instead and decide below.
xargs_status=0
printf '%s\n' "$ENTRIES_TSV" \
  | xargs -P "$CONCURRENCY" -L 1 bash -c 'fetch_one "$1" "$2" "$3"' _ \
    > "$PROGRESS_LOG" 2>&1 || xargs_status=$?

# `grep -c` prints 0 but exits 1 when nothing matches; `|| true` keeps that
# from tripping `set -e`.
new=$(grep -c "^✓" "$PROGRESS_LOG" || true)
skip=$(grep -c "^•" "$PROGRESS_LOG" || true)
fail=$(grep -c "^✗" "$PROGRESS_LOG" || true)
echo " ✓ fetched: $new"
echo " • cached: $skip"

# A non-zero xargs status with no ✗ line means the fan-out itself broke
# (for example xargs rejected an option); that must not pass as success.
if [ "$fail" -gt 0 ] || [ "$xargs_status" -ne 0 ]; then
  echo " ✗ failed: $fail"
  echo
  if [ "$fail" -gt 0 ]; then
    # `head` may close the pipe early; do not let pipefail abort the report.
    grep "^✗" "$PROGRESS_LOG" | head -5 || true
  else
    echo " xargs exited with status $xargs_status"
    tail -n 5 "$PROGRESS_LOG" || true
  fi
  echo " …(full log $PROGRESS_LOG)"
  exit 1
fi

# Nothing failed: the log has no further use.
rm -f "$PROGRESS_LOG"
# When the cache lives outside mocks/recordings/, symlink every cached
# recording into mocks/recordings/ so the mock-agent recording-picker keeps
# working without env overrides. Names that already exist there are left
# untouched.
if [ "$CACHE_DIR" != "$MOCKS_DIR/recordings" ]; then
  mkdir -p "$MOCKS_DIR/recordings"

  # A relative symlink target is resolved against the directory holding the
  # link, not against the current working directory, so a relative cache dir
  # would produce dangling links. Turn it into an absolute path first;
  # absolute paths are used exactly as given.
  case "$CACHE_DIR" in
    /*) cache_abs="$CACHE_DIR" ;;
    *) cache_abs="$(CDPATH= cd -- "$CACHE_DIR" && pwd)" ;;
  esac

  for f in "$cache_abs"/*.jsonl; do
    # An unmatched glob stays literal; skip it.
    [ -e "$f" ] || continue
    bn=$(basename "$f")
    if [ ! -e "$MOCKS_DIR/recordings/$bn" ]; then
      ln -sf "$f" "$MOCKS_DIR/recordings/$bn"
    fi
  done

  # Also link the manifest so picker/index-aware tooling sees it. Best
  # effort: a failure here is deliberately ignored.
  ln -sf "$MANIFEST" "$MOCKS_DIR/recordings/index.json" 2>/dev/null || true
fi
echo
echo "✅ ready: $MOCKS_DIR/recordings/"