mirror of
https://github.com/zed-industries/zed.git
synced 2026-06-01 03:14:56 +07:00
gh-workflow unit evals (#41637)
Closes #ISSUE Release Notes: - N/A *or* Added/Fixed/Improved ...
This commit is contained in:
parent
df15d2d2fe
commit
2408f767f4
7 changed files with 246 additions and 157 deletions
71
.github/workflows/eval.yml
vendored
71
.github/workflows/eval.yml
vendored
|
|
@ -1,71 +0,0 @@
|
|||
name: Run Agent Eval
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "0 0 * * *"
|
||||
|
||||
pull_request:
|
||||
branches:
|
||||
- "**"
|
||||
types: [synchronize, reopened, labeled]
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
RUST_BACKTRACE: 1
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
|
||||
ZED_EVAL_TELEMETRY: 1
|
||||
|
||||
jobs:
|
||||
run_eval:
|
||||
timeout-minutes: 60
|
||||
name: Run Agent Eval
|
||||
if: >
|
||||
github.repository_owner == 'zed-industries' &&
|
||||
(github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
|
||||
runs-on:
|
||||
- namespace-profile-16x32-ubuntu-2204
|
||||
steps:
|
||||
- name: Add Rust to the PATH
|
||||
run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
|
||||
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
|
||||
with:
|
||||
clean: false
|
||||
|
||||
- name: Cache dependencies
|
||||
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
|
||||
with:
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
# cache-provider: "buildjet"
|
||||
|
||||
- name: Install Linux dependencies
|
||||
run: ./script/linux
|
||||
|
||||
- name: Configure CI
|
||||
run: |
|
||||
mkdir -p ./../.cargo
|
||||
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
|
||||
|
||||
- name: Compile eval
|
||||
run: cargo build --package=eval
|
||||
|
||||
- name: Run eval
|
||||
run: cargo run --package=eval -- --repetitions=8 --concurrency=1
|
||||
|
||||
# Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
|
||||
# But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
|
||||
# to clean up the config file, I’ve included the cleanup code here as a precaution.
|
||||
# While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution.
|
||||
- name: Clean CI config file
|
||||
if: always()
|
||||
run: rm -rf ./../.cargo
|
||||
62
.github/workflows/run_agent_evals.yml
vendored
Normal file
62
.github/workflows/run_agent_evals.yml
vendored
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# Generated from xtask::workflows::run_agent_evals
|
||||
# Rebuild with `cargo xtask workflows`.
|
||||
name: run_agent_evals
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: '0'
|
||||
RUST_BACKTRACE: '1'
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
|
||||
ZED_EVAL_TELEMETRY: '1'
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- synchronize
|
||||
- reopened
|
||||
- labeled
|
||||
branches:
|
||||
- '**'
|
||||
schedule:
|
||||
- cron: 0 0 * * *
|
||||
workflow_dispatch: {}
|
||||
jobs:
|
||||
agent_evals:
|
||||
if: |
|
||||
github.repository_owner == 'zed-industries' &&
|
||||
(github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
|
||||
runs-on: namespace-profile-16x32-ubuntu-2204
|
||||
steps:
|
||||
- name: steps::checkout_repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
clean: false
|
||||
- name: steps::cache_rust_dependencies
|
||||
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
|
||||
with:
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: steps::setup_linux
|
||||
run: ./script/linux
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::install_mold
|
||||
run: ./script/install-mold
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::setup_cargo_config
|
||||
run: |
|
||||
mkdir -p ./../.cargo
|
||||
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: cargo build --package=eval
|
||||
run: cargo build --package=eval
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: run_agent_evals::agent_evals::run_eval
|
||||
run: cargo run --package=eval -- --repetitions=8 --concurrency=1
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::cleanup_cargo_config
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf ./../.cargo
|
||||
shell: bash -euxo pipefail {0}
|
||||
timeout-minutes: 60
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
63
.github/workflows/run_unit_evals.yml
vendored
Normal file
63
.github/workflows/run_unit_evals.yml
vendored
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# Generated from xtask::workflows::run_agent_evals
|
||||
# Rebuild with `cargo xtask workflows`.
|
||||
name: run_agent_evals
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: '0'
|
||||
RUST_BACKTRACE: '1'
|
||||
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
|
||||
on:
|
||||
schedule:
|
||||
- cron: 47 1 * * 2
|
||||
workflow_dispatch: {}
|
||||
jobs:
|
||||
unit_evals:
|
||||
runs-on: namespace-profile-16x32-ubuntu-2204
|
||||
steps:
|
||||
- name: steps::checkout_repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
with:
|
||||
clean: false
|
||||
- name: steps::setup_cargo_config
|
||||
run: |
|
||||
mkdir -p ./../.cargo
|
||||
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::cache_rust_dependencies
|
||||
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6
|
||||
with:
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
- name: steps::setup_linux
|
||||
run: ./script/linux
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::install_mold
|
||||
run: ./script/install-mold
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::cargo_install_nextest
|
||||
run: cargo install cargo-nextest --locked
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: steps::clear_target_dir_if_large
|
||||
run: ./script/clear-target-dir-if-larger-than 100
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: ./script/run-unit-evals
|
||||
run: ./script/run-unit-evals
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
- name: run_agent_evals::unit_evals::send_failure_to_slack
|
||||
if: ${{ failure() }}
|
||||
uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: C04UDRNNJFQ
|
||||
text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
|
||||
- name: steps::cleanup_cargo_config
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf ./../.cargo
|
||||
shell: bash -euxo pipefail {0}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
86
.github/workflows/unit_evals.yml
vendored
86
.github/workflows/unit_evals.yml
vendored
|
|
@ -1,86 +0,0 @@
|
|||
name: Run Unit Evals
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
|
||||
- cron: "47 1 * * 2"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
CARGO_INCREMENTAL: 0
|
||||
RUST_BACKTRACE: 1
|
||||
ZED_CLIENT_CHECKSUM_SEED: ${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}
|
||||
|
||||
jobs:
|
||||
unit_evals:
|
||||
if: github.repository_owner == 'zed-industries'
|
||||
timeout-minutes: 60
|
||||
name: Run unit evals
|
||||
runs-on:
|
||||
- namespace-profile-16x32-ubuntu-2204
|
||||
steps:
|
||||
- name: Add Rust to the PATH
|
||||
run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
|
||||
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
|
||||
with:
|
||||
clean: false
|
||||
|
||||
- name: Cache dependencies
|
||||
uses: swatinem/rust-cache@9d47c6ad4b02e050fd481d890b2ea34778fd09d6 # v2
|
||||
with:
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
# cache-provider: "buildjet"
|
||||
|
||||
- name: Install Linux dependencies
|
||||
run: ./script/linux
|
||||
|
||||
- name: Configure CI
|
||||
run: |
|
||||
mkdir -p ./../.cargo
|
||||
cp ./.cargo/ci-config.toml ./../.cargo/config.toml
|
||||
|
||||
- name: Install Rust
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
cargo install cargo-nextest --locked
|
||||
|
||||
- name: Install Node
|
||||
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||
with:
|
||||
node-version: "18"
|
||||
|
||||
- name: Limit target directory size
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: script/clear-target-dir-if-larger-than 100
|
||||
|
||||
- name: Run unit evals
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
|
||||
- name: Send failure message to Slack channel if needed
|
||||
if: ${{ failure() }}
|
||||
uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: C04UDRNNJFQ
|
||||
text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
|
||||
|
||||
# Even the Linux runner is not stateful, in theory there is no need to do this cleanup.
|
||||
# But, to avoid potential issues in the future if we choose to use a stateful Linux runner and forget to add code
|
||||
# to clean up the config file, I’ve included the cleanup code here as a precaution.
|
||||
# While it’s not strictly necessary at this moment, I believe it’s better to err on the side of caution.
|
||||
- name: Clean CI config file
|
||||
if: always()
|
||||
run: rm -rf ./../.cargo
|
||||
5
script/run-unit-evals
Executable file
5
script/run-unit-evals
Executable file
|
|
@ -0,0 +1,5 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
cargo nextest run --workspace --no-fail-fast --features unit-eval --no-capture -E 'test(::eval_)'
|
||||
|
|
@ -10,6 +10,7 @@ mod release_nightly;
|
|||
mod run_bundling;
|
||||
|
||||
mod release;
|
||||
mod run_agent_evals;
|
||||
mod run_tests;
|
||||
mod runners;
|
||||
mod steps;
|
||||
|
|
@ -28,6 +29,8 @@ pub fn run_workflows(_: GenerateWorkflowArgs) -> Result<()> {
|
|||
("run_tests.yml", run_tests::run_tests()),
|
||||
("release.yml", release::release()),
|
||||
("compare_perf.yml", compare_perf::compare_perf()),
|
||||
("run_unit_evals.yml", run_agent_evals::run_unit_evals()),
|
||||
("run_agent_evals.yml", run_agent_evals::run_agent_evals()),
|
||||
];
|
||||
fs::create_dir_all(dir)
|
||||
.with_context(|| format!("Failed to create directory: {}", dir.display()))?;
|
||||
|
|
|
|||
113
tooling/xtask/src/tasks/workflows/run_agent_evals.rs
Normal file
113
tooling/xtask/src/tasks/workflows/run_agent_evals.rs
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
use gh_workflow::{
|
||||
Event, Expression, Job, PullRequest, PullRequestType, Run, Schedule, Step, Use, Workflow,
|
||||
WorkflowDispatch,
|
||||
};
|
||||
|
||||
use crate::tasks::workflows::{
|
||||
runners::{self, Platform},
|
||||
steps::{self, FluentBuilder as _, NamedJob, named, setup_cargo_config},
|
||||
vars,
|
||||
};
|
||||
|
||||
pub(crate) fn run_agent_evals() -> Workflow {
|
||||
let agent_evals = agent_evals();
|
||||
|
||||
named::workflow()
|
||||
.on(Event::default()
|
||||
.schedule([Schedule::default().cron("0 0 * * *")])
|
||||
.pull_request(PullRequest::default().add_branch("**").types([
|
||||
PullRequestType::Synchronize,
|
||||
PullRequestType::Reopened,
|
||||
PullRequestType::Labeled,
|
||||
]))
|
||||
.workflow_dispatch(WorkflowDispatch::default()))
|
||||
.concurrency(vars::one_workflow_per_non_main_branch())
|
||||
.add_env(("CARGO_TERM_COLOR", "always"))
|
||||
.add_env(("CARGO_INCREMENTAL", 0))
|
||||
.add_env(("RUST_BACKTRACE", 1))
|
||||
.add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}"))
|
||||
.add_env((
|
||||
"ZED_CLIENT_CHECKSUM_SEED",
|
||||
"${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
|
||||
))
|
||||
.add_env(("ZED_EVAL_TELEMETRY", 1))
|
||||
.add_job(agent_evals.name, agent_evals.job)
|
||||
}
|
||||
|
||||
fn agent_evals() -> NamedJob {
|
||||
fn run_eval() -> Step<Run> {
|
||||
named::bash("cargo run --package=eval -- --repetitions=8 --concurrency=1")
|
||||
}
|
||||
|
||||
named::job(
|
||||
Job::default()
|
||||
.cond(Expression::new(indoc::indoc!{r#"
|
||||
github.repository_owner == 'zed-industries' &&
|
||||
(github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-eval'))
|
||||
"#}))
|
||||
.runs_on(runners::LINUX_DEFAULT)
|
||||
.timeout_minutes(60_u32)
|
||||
.add_step(steps::checkout_repo())
|
||||
.add_step(steps::cache_rust_dependencies())
|
||||
.map(steps::install_linux_dependencies)
|
||||
.add_step(setup_cargo_config(Platform::Linux))
|
||||
.add_step(steps::script("cargo build --package=eval"))
|
||||
.add_step(run_eval())
|
||||
.add_step(steps::cleanup_cargo_config(Platform::Linux))
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn run_unit_evals() -> Workflow {
|
||||
let unit_evals = unit_evals();
|
||||
|
||||
named::workflow()
|
||||
.on(Event::default()
|
||||
.schedule([
|
||||
// GitHub might drop jobs at busy times, so we choose a random time in the middle of the night.
|
||||
Schedule::default().cron("47 1 * * 2"),
|
||||
])
|
||||
.workflow_dispatch(WorkflowDispatch::default()))
|
||||
.concurrency(vars::one_workflow_per_non_main_branch())
|
||||
.add_env(("CARGO_TERM_COLOR", "always"))
|
||||
.add_env(("CARGO_INCREMENTAL", 0))
|
||||
.add_env(("RUST_BACKTRACE", 1))
|
||||
.add_env((
|
||||
"ZED_CLIENT_CHECKSUM_SEED",
|
||||
"${{ secrets.ZED_CLIENT_CHECKSUM_SEED }}",
|
||||
))
|
||||
.add_job(unit_evals.name, unit_evals.job)
|
||||
}
|
||||
|
||||
fn unit_evals() -> NamedJob {
|
||||
fn send_failure_to_slack() -> Step<Use> {
|
||||
named::uses(
|
||||
"slackapi",
|
||||
"slack-github-action",
|
||||
"b0fa283ad8fea605de13dc3f449259339835fc52",
|
||||
)
|
||||
.if_condition(Expression::new("${{ failure() }}"))
|
||||
.add_with(("method", "chat.postMessage"))
|
||||
.add_with(("token", "${{ secrets.SLACK_APP_ZED_UNIT_EVALS_BOT_TOKEN }}"))
|
||||
.add_with(("payload", indoc::indoc!{r#"
|
||||
channel: C04UDRNNJFQ
|
||||
text: "Unit Evals Failed: https://github.com/zed-industries/zed/actions/runs/${{ github.run_id }}"
|
||||
"#}))
|
||||
}
|
||||
|
||||
named::job(
|
||||
Job::default()
|
||||
.runs_on(runners::LINUX_DEFAULT)
|
||||
.add_step(steps::checkout_repo())
|
||||
.add_step(steps::setup_cargo_config(Platform::Linux))
|
||||
.add_step(steps::cache_rust_dependencies())
|
||||
.map(steps::install_linux_dependencies)
|
||||
.add_step(steps::cargo_install_nextest(Platform::Linux))
|
||||
.add_step(steps::clear_target_dir_if_large(Platform::Linux))
|
||||
.add_step(
|
||||
steps::script("./script/run-unit-evals")
|
||||
.add_env(("ANTHROPIC_API_KEY", "${{ secrets.ANTHROPIC_API_KEY }}")),
|
||||
)
|
||||
.add_step(send_failure_to_slack())
|
||||
.add_step(steps::cleanup_cargo_config(Platform::Linux)),
|
||||
)
|
||||
}
|
||||
Loading…
Reference in a new issue