gh-workflow unit evals (#41637)

Closes #ISSUE

Release Notes:

- N/A *or* Added/Fixed/Improved ...
This commit is contained in:
Ben Kunkle 2025-11-01 19:45:44 -07:00 committed by GitHub
parent df15d2d2fe
commit 2408f767f4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 246 additions and 157 deletions

View file

@ -1,71 +0,0 @@
name: Run Agent Eval
on:
schedule:
- cron: "0 0 * * *"
pull_request:
branches:
- "**"
types: [synchronize, reopened, labeled]
workflow_dispatch:
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
CARGO_INCREMENTAL: 0
RUST_BACKTRACE: 1
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
ZED_EVAL_TELEMETRY: 1
jobs:
run_eval:
timeout-minutes: 60
name: Run Agent Eval
if: >
github.repository_owner == 'zed-industries' &&
(github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
runs-on:
- namespace-profile-16x32-ubuntu-2204
steps:
- name: Add Rust to the PATH
run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
- name: Checkout repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
with:
clean: false
- name: Cache dependencies
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
with:
save-if: ${{ github.ref == 'refs/heads/main' }}
# cache-provider: "buildjet"
- name: Install Linux dependencies
run: ./script/linux
- name: Configure CI
run: |
mkdir -p ./../.cargo
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
- name: Compile eval
run: cargo build --package=eval
- name: Run eval
run: cargo run --package=eval -- --repetitions=8 --concurrency=1
# Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
# But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
# to clean up the config file, Ive included the cleanup code here as a precaution.
# While its not strictly necessary at this moment, I believe its better to err on the side of caution.
- name: Clean CI config file
if: always()
run: rm -rf ./../.cargo

62
.github/workflows/run_agent_evals.yml vendored Normal file
View file

@ -0,0 +1,62 @@
# Generated from xtask::workflows::run_agent_evals
# Rebuild with `cargo xtask workflows`.
name: run_agent_evals
env:
CARGO_TERM_COLOR: always
CARGO_INCREMENTAL: '0'
RUST_BACKTRACE: '1'
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
ZED_EVAL_TELEMETRY: '1'
on:
pull_request:
types:
- synchronize
- reopened
- labeled
branches:
- '**'
schedule:
- cron: 0 0 * * *
workflow_dispatch: {}
jobs:
agent_evals:
if: |
github.repository_owner == 'zed-industries' &&
(github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
runs-on: namespace-profile-16x32-ubuntu-2204
steps:
- name: steps::checkout_repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
clean: false
- name: steps::cache_rust_dependencies
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
with:
save-if: ${{ github.ref == 'refs/heads/main' }}
- name: steps::setup_linux
run: ./script/linux
shell: bash -euxo pipefail {0}
- name: steps::install_mold
run: ./script/install-mold
shell: bash -euxo pipefail {0}
- name: steps::setup_cargo_config
run: |
mkdir -p ./../.cargo
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
shell: bash -euxo pipefail {0}
- name: cargo build --package=eval
run: cargo build --package=eval
shell: bash -euxo pipefail {0}
- name: run_agent_evals::agent_evals::run_eval
run: cargo run --package=eval -- --repetitions=8 --concurrency=1
shell: bash -euxo pipefail {0}
- name: steps::cleanup_cargo_config
if: always()
run: |
rm -rf ./../.cargo
shell: bash -euxo pipefail {0}
timeout-minutes: 60
concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true

63
.github/workflows/run_unit_evals.yml vendored Normal file
View file

@ -0,0 +1,63 @@
# Generated from xtask::workflows::run_agent_evals
# Rebuild with `cargo xtask workflows`.
name: run_agent_evals
env:
CARGO_TERM_COLOR: always
CARGO_INCREMENTAL: '0'
RUST_BACKTRACE: '1'
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
on:
schedule:
- cron: 47 1 * * 2
workflow_dispatch: {}
jobs:
unit_evals:
runs-on: namespace-profile-16x32-ubuntu-2204
steps:
- name: steps::checkout_repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
clean: false
- name: steps::setup_cargo_config
run: |
mkdir -p ./../.cargo
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
shell: bash -euxo pipefail {0}
- name: steps::cache_rust_dependencies
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
with:
save-if: ${{ github.ref == 'refs/heads/main' }}
- name: steps::setup_linux
run: ./script/linux
shell: bash -euxo pipefail {0}
- name: steps::install_mold
run: ./script/install-mold
shell: bash -euxo pipefail {0}
- name: steps::cargo_install_nextest
run: cargo install cargo-nextest --locked
shell: bash -euxo pipefail {0}
- name: steps::clear_target_dir_if_large
run: ./script/clear-target-dir-if-larger-than 100
shell: bash -euxo pipefail {0}
- name: ./script/run-unit-evals
run: ./script/run-unit-evals
shell: bash -euxo pipefail {0}
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
- name: run_agent_evals::unit_evals::send_failure_to_slack
if: ${{ failure() }}
uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
with:
method: chat.postMessage
token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
payload: |
channel: C04UDRNNJFQ
text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
- name: steps::cleanup_cargo_config
if: always()
run: |
rm -rf ./../.cargo
shell: bash -euxo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true

View file

@ -1,86 +0,0 @@
name: Run Unit Evals
on:
schedule:
# GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
- cron: "47 1 * * 2"
workflow_dispatch:
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
CARGO_INCREMENTAL: 0
RUST_BACKTRACE: 1
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
jobs:
unit_evals:
if: github.repository_owner == 'zed-industries'
timeout-minutes: 60
name: Run unit evals
runs-on:
- namespace-profile-16x32-ubuntu-2204
steps:
- name: Add Rust to the PATH
run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
- name: Checkout repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
with:
clean: false
- name: Cache dependencies
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
with:
save-if: ${{ github.ref == 'refs/heads/main' }}
# cache-provider: "buildjet"
- name: Install Linux dependencies
run: ./script/linux
- name: Configure CI
run: |
mkdir -p ./../.cargo
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
- name: Install Rust
shell: bash -euxo pipefail {0}
run: |
cargo install cargo-nextest --locked
- name: Install Node
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
node-version: "18"
- name: Limit target directory size
shell: bash -euxo pipefail {0}
run: script/clear-target-dir-if-larger-than 100
- name: Run unit evals
shell: bash -euxo pipefail {0}
run: cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
- name: Send failure message to Slack channel if needed
if: ${{ failure() }}
uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
with:
method: chat.postMessage
token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
payload: |
channel: C04UDRNNJFQ
text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
# Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
# But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
# to clean up the config file, Ive included the cleanup code here as a precaution.
# While its not strictly necessary at this moment, I believe its better to err on the side of caution.
- name: Clean CI config file
if: always()
run: rm -rf ./../.cargo

5
script/run-unit-evals Executable file
View file

@ -0,0 +1,5 @@
#!/usr/bin/env bash
set -euxo pipefail
cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'

View file

@ -10,6 +10,7 @@ mod release_nightly;
mod run_bundling;
mod release;
mod run_agent_evals;
mod run_tests;
mod runners;
mod steps;
@ -28,6 +29,8 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> {
("run_tests.yml", run_tests::run_tests()),
("release.yml", release::release()),
("compare_perf.yml", compare_perf::compare_perf()),
("run_unit_evals.yml", run_agent_evals::run_unit_evals()),
("run_agent_evals.yml", run_agent_evals::run_agent_evals()),
];
fs::create_dir_all(dir)
.with_context(|| format!("Failed to create directory: {}", dir.display()))?;

View file

@ -0,0 +1,113 @@
use gh_workflow::{
Event, Expression, Job, PullRequest, PullRequestType, Run, Schedule, Step, Use, Workflow,
WorkflowDispatch,
};
use crate::tasks::workflows::{
runners::{self, Platform},
steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config},
vars,
};
pub(crate) fn run_agent_evals() -> Workflow {
let agent_evals = agent_evals();
named::workflow()
.on(Event::default()
.schedule([Schedule::default().cron("0 0 * * *")])
.pull_request(PullRequest::default().add_branch("**").types([
PullRequestType::Synchronize,
PullRequestType::Reopened,
PullRequestType::Labeled,
]))
.workflow_dispatch(WorkflowDispatch::default()))
.concurrency(vars::one_workflow_per_non_main_branch())
.add_env(("CARGO_TERM_COLOR", "always"))
.add_env(("CARGO_INCREMENTAL", 0))
.add_env(("RUST_BACKTRACE", 1))
.add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}"))
.add_env((
"ZED_CLIENT_CHECKSUM_SEED",
"${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
))
.add_env(("ZED_EVAL_TELEMETRY", 1))
.add_job(agent_evals.name, agent_evals.job)
}
fn agent_evals() -> NamedJob {
fn run_eval() -> Step<Run> {
named::bash("cargo run --package=eval -- --repetitions=8 --concurrency=1")
}
named::job(
Job::default()
.cond(Expression::new(indoc::indoc!{r#"
github.repository_owner == 'zed-industries' &&
(github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
"#}))
.runs_on(runners::LINUX_DEFAULT)
.timeout_minutes(60_u32)
.add_step(steps::checkout_repo())
.add_step(steps::cache_rust_dependencies())
.map(steps::install_linux_dependencies)
.add_step(setup_cargo_config(Platform::Linux))
.add_step(steps::script("cargo build --package=eval"))
.add_step(run_eval())
.add_step(steps::cleanup_cargo_config(Platform::Linux))
)
}
pub(crate) fn run_unit_evals() -> Workflow {
let unit_evals = unit_evals();
named::workflow()
.on(Event::default()
.schedule([
// GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
Schedule::default().cron("47 1 * * 2"),
])
.workflow_dispatch(WorkflowDispatch::default()))
.concurrency(vars::one_workflow_per_non_main_branch())
.add_env(("CARGO_TERM_COLOR", "always"))
.add_env(("CARGO_INCREMENTAL", 0))
.add_env(("RUST_BACKTRACE", 1))
.add_env((
"ZED_CLIENT_CHECKSUM_SEED",
"${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
))
.add_job(unit_evals.name, unit_evals.job)
}
fn unit_evals() -> NamedJob {
fn send_failure_to_slack() -> Step<Use> {
named::uses(
"slackapi",
"slack-github-action",
"b0fa283ad8fea605de13dc3f449259339835fc52",
)
.if_condition(Expression::new("${{ failure() }}"))
.add_with(("method", "chat.postMessage"))
.add_with(("token", "${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}"))
.add_with(("payload", indoc::indoc!{r#"
channel: C04UDRNNJFQ
text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
"#}))
}
named::job(
Job::default()
.runs_on(runners::LINUX_DEFAULT)
.add_step(steps::checkout_repo())
.add_step(steps::setup_cargo_config(Platform::Linux))
.add_step(steps::cache_rust_dependencies())
.map(steps::install_linux_dependencies)
.add_step(steps::cargo_install_nextest(Platform::Linux))
.add_step(steps::clear_target_dir_if_large(Platform::Linux))
.add_step(
steps::script("./script/run-unit-evals")
.add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}")),
)
.add_step(send_failure_to_slack())
.add_step(steps::cleanup_cargo_config(Platform::Linux)),
)
}