perf: Greatly expand profiler (#38584)

Expands on #38543 (notably allows setting importance categories and weights on tests, and a lot of internal refactoring) because I couldn't help myself. Also allows exporting runs to json and comparing across them. See code for docs. Release Notes: - N/A
2026-06-01 03:14:56 +07:00 · 2025-09-21 13:54:59 +02:00 · 2025-09-21 13:54:59 +02:00 · 11041ef3b0
commit 11041ef3b0
parent 839c216620
9 changed files with 919 additions and 168 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@ -5,6 +5,7 @@ rustflags = ["-C", "symbol-mangling-version=v0", "--cfg", "tokio_unstable"]
 [alias]
 xtask = "run --package xtask --"
 perf-test = ["test", "--profile", "release-fast", "--lib", "--bins", "--tests", "--config", "target.'cfg(true)'.runner='target/release/perf'", "--config", "target.'cfg(true)'.rustflags=[\"--cfg\", \"perf_enabled\"]"]
+perf-compare = ["run", "--release", "-p", "perf", "--", "compare"]

 [target.x86_64-unknown-linux-gnu]
 linker = "clang"
@ -24,3 +25,4 @@ rustflags = [

 [env]
 MACOSX_DEPLOYMENT_TARGET = "10.15.7"
+CARGO_WORKSPACE_DIR = { value = "", relative = true }
--- a/.gitignore
+++ b/.gitignore
@ -20,6 +20,7 @@
 .venv
 .vscode
 .wrangler
+.perf-runs
 /assets/*licenses.*
 /crates/collab/seed.json
 /crates/theme/schemas/theme.json
--- a/Cargo.lock
+++ b/Cargo.lock
@ -12169,6 +12169,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 name = "perf"
 version = "0.1.0"
 dependencies = [
+ "collections",
+ "serde",
+ "serde_json",
 "workspace-hack",
 ]

@ -18586,6 +18589,7 @@ dependencies = [
 name = "util_macros"
 version = "0.1.0"
 dependencies = [
+ "perf",
 "quote",
 "syn 2.0.101",
 "workspace-hack",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -356,6 +356,7 @@ outline = { path = "crates/outline" }
 outline_panel = { path = "crates/outline_panel" }
 panel = { path = "crates/panel" }
 paths = { path = "crates/paths" }
+perf = { path = "tooling/perf" }
 picker = { path = "crates/picker" }
 plugin = { path = "crates/plugin" }
 plugin_macros = { path = "crates/plugin_macros" }
--- a/crates/util_macros/Cargo.toml
+++ b/crates/util_macros/Cargo.toml
@ -16,6 +16,7 @@ doctest = false
 [dependencies]
 quote.workspace = true
 syn.workspace = true
+perf.workspace = true
 workspace-hack.workspace = true

 [features]
--- a/crates/util_macros/src/util_macros.rs
+++ b/crates/util_macros/src/util_macros.rs
@ -1,6 +1,7 @@
 #![cfg_attr(not(target_os = "windows"), allow(unused))]
 #![allow(clippy::test_attr_in_doctest)]

+use perf::*;
 use proc_macro::TokenStream;
 use quote::{ToTokens, quote};
 use syn::{ItemFn, LitStr, parse_macro_input, parse_quote};
@ -90,68 +91,81 @@ pub fn line_endings(input: TokenStream) -> TokenStream {
 }

 /// Inner data for the perf macro.
+#[derive(Default)]
 struct PerfArgs {
-    /// How many times to loop a test before rerunning the test binary.
-    /// If left empty, the test harness will auto-determine this value.
+    /// How many times to loop a test before rerunning the test binary. If left
+    /// empty, the test harness will auto-determine this value.
    iterations: Option<syn::Expr>,
+    /// How much this test's results should be weighed when comparing across runs.
+    /// If unspecified, defaults to `WEIGHT_DEFAULT` (50).
+    weight: Option<syn::Expr>,
+    /// How relevant a benchmark is to overall performance. See docs on the enum
+    /// for details. If unspecified, `Average` is selected.
+    importance: Importance,
 }

-impl syn::parse::Parse for PerfArgs {
-    fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> {
-        if input.is_empty() {
-            return Ok(PerfArgs { iterations: None });
+#[warn(clippy::all, clippy::pedantic)]
+impl PerfArgs {
+    /// Parses attribute arguments into a `PerfArgs`.
+    fn parse_into(&mut self, meta: syn::meta::ParseNestedMeta) -> syn::Result<()> {
+        if meta.path.is_ident("iterations") {
+            self.iterations = Some(meta.value()?.parse()?);
+        } else if meta.path.is_ident("weight") {
+            self.weight = Some(meta.value()?.parse()?);
+        } else if meta.path.is_ident("critical") {
+            self.importance = Importance::Critical;
+        } else if meta.path.is_ident("important") {
+            self.importance = Importance::Important;
+        } else if meta.path.is_ident("average") {
+            // This shouldn't be specified manually, but oh well.
+            self.importance = Importance::Average;
+        } else if meta.path.is_ident("iffy") {
+            self.importance = Importance::Iffy;
+        } else if meta.path.is_ident("fluff") {
+            self.importance = Importance::Fluff;
+        } else {
+            return Err(syn::Error::new_spanned(meta.path, "unexpected identifier"));
        }
-
-        let mut iterations = None;
-        // In principle we only have one possible argument, but leave this as
-        // a loop in case we expand this in the future.
-        for meta in
-            syn::punctuated::Punctuated::<syn::Meta, syn::Token![,]>::parse_terminated(input)?
-        {
-            match &meta {
-                syn::Meta::NameValue(meta_name_value) => {
-                    if meta_name_value.path.is_ident("iterations") {
-                        iterations = Some(meta_name_value.value.clone());
-                    } else {
-                        return Err(syn::Error::new_spanned(
-                            &meta_name_value.path,
-                            "unexpected argument, expected 'iterations'",
-                        ));
-                    }
-                }
-                _ => {
-                    return Err(syn::Error::new_spanned(
-                        meta,
-                        "expected name-value argument like 'iterations = 1'",
-                    ));
-                }
-            }
-        }
-
-        Ok(PerfArgs { iterations })
+        Ok(())
    }
 }

 /// Marks a test as perf-sensitive, to be triaged when checking the performance
 /// of a build. This also automatically applies `#[test]`.
 ///
-/// By default, the number of iterations when profiling this test is auto-determined.
-/// If this needs to be overwritten, pass the desired iteration count to the macro
-/// as a parameter (`#[perf(iterations = n)]`). Note that the actual profiler may still
-/// run the test an arbitrary number times; this flag just sets the number of executions
-/// before the process is restarted and global state is reset.
 ///
-/// # Usage notes
-/// This should probably not be applied to tests that do any significant fs IO, as
-/// locks on files may not be released in time when repeating a test many times. This
-/// might lead to spurious failures.
+/// # Usage
+/// Applying this attribute to a test marks it as average importance by default.
+/// There are 4 levels of importance (`Critical`, `Important`, `Average`, `Fluff`);
+/// see the documentation on `Importance` for details. Add the importance as a
+/// parameter to override the default (e.g. `#[perf(important)]`).
+///
+/// Each test also has a weight factor. This is irrelevant on its own, but is considered
+/// when comparing results across different runs. By default, this is set to 50;
+/// pass `weight = n` as a parameter to override this. Note that this value is only
+/// relevant within its importance category.
+///
+/// By default, the number of iterations when profiling this test is auto-determined.
+/// If this needs to be overwritten, pass the desired iteration count as a parameter
+/// (`#[perf(iterations = n)]`). Note that the actual profiler may still run the test
+/// an arbitrary number times; this flag just sets the number of executions before the
+/// process is restarted and global state is reset.
+///
+/// This attribute should probably not be applied to tests that do any significant
+/// disk IO, as locks on files may not be released in time when repeating a test many
+/// times. This might lead to spurious failures.
 ///
 /// # Examples
 /// ```rust
 /// use util_macros::perf;
 ///
 /// #[perf]
-/// fn expensive_computation_test() {
+/// fn generic_test() {
+///     // Test goes here.
+/// }
+///
+/// #[perf(fluff, weight = 30)]
+/// fn cold_path_test() {
 ///     // Test goes here.
 /// }
 /// ```
@ -161,72 +175,108 @@ impl syn::parse::Parse for PerfArgs {
 /// ```rust,ignore
 /// use util_macros::perf;
 ///
-/// #[perf(iterations = 1)]
+/// #[perf(iterations = 1, critical)]
 /// #[gpui::test]
 /// fn oneshot_test(_cx: &mut gpui::TestAppContext) {
 ///     // Test goes here.
 /// }
 /// ```
 #[proc_macro_attribute]
+#[warn(clippy::all, clippy::pedantic)]
 pub fn perf(our_attr: TokenStream, input: TokenStream) -> TokenStream {
-    // If any of the below constants are changed, make sure to also update the perf
-    // profiler to match!
-
-    /// The suffix on tests marked with `#[perf]`.
-    const SUF_NORMAL: &str = "__ZED_PERF";
-    /// The suffix on tests marked with `#[perf(iterations = n)]`.
-    const SUF_FIXED: &str = "__ZED_PERF_FIXEDITER";
-    /// The env var in which we pass the iteration count to our tests.
-    const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
-
-    let iter_count = parse_macro_input!(our_attr as PerfArgs).iterations;
+    let mut args = PerfArgs::default();
+    let parser = syn::meta::parser(|meta| PerfArgs::parse_into(&mut args, meta));
+    parse_macro_input!(our_attr with parser);

    let ItemFn {
-        mut attrs,
+        attrs: mut attrs_main,
        vis,
-        mut sig,
+        sig: mut sig_main,
        block,
    } = parse_macro_input!(input as ItemFn);
-    attrs.push(parse_quote!(#[test]));
-    attrs.push(parse_quote!(#[allow(non_snake_case)]));
+    attrs_main.push(parse_quote!(#[test]));
+    attrs_main.push(parse_quote!(#[allow(non_snake_case)]));
+
+    let fns = if cfg!(perf_enabled) {
+        #[allow(clippy::wildcard_imports, reason = "We control the other side")]
+        use consts::*;

-    let block: Box<syn::Block> = if cfg!(perf_enabled) {
        // Make the ident obvious when calling, for the test parser.
-        let mut new_ident = sig.ident.to_string();
-        if iter_count.is_some() {
-            new_ident.push_str(SUF_FIXED);
-        } else {
-            new_ident.push_str(SUF_NORMAL);
-        }
+        // Also set up values for the second metadata-returning "test".
+        let mut new_ident_main = sig_main.ident.to_string();
+        let mut new_ident_meta = new_ident_main.clone();
+        new_ident_main.push_str(SUF_NORMAL);
+        new_ident_meta.push_str(SUF_MDATA);

-        let new_ident = syn::Ident::new(&new_ident, sig.ident.span());
-        sig.ident = new_ident;
-        // If we have a preset iteration count, just use that.
-        if let Some(iter_count) = iter_count {
-            parse_quote!({
-               for _ in 0..#iter_count {
-                   #block
-               }
-            })
-        } else {
-            // Otherwise, the perf harness will pass us the value in an env var.
+        let new_ident_main = syn::Ident::new(&new_ident_main, sig_main.ident.span());
+        sig_main.ident = new_ident_main;
+
+        // We don't want any nonsense if the original test had a weird signature.
+        let new_ident_meta = syn::Ident::new(&new_ident_meta, sig_main.ident.span());
+        let sig_meta = parse_quote!(fn #new_ident_meta());
+        let attrs_meta = parse_quote!(#[test] #[allow(non_snake_case)]);
+
+        // Make the test loop as the harness instructs it to.
+        let block_main = {
+            // The perf harness will pass us the value in an env var. Even if we
+            // have a preset value, just do this to keep the code paths unified.
            parse_quote!({
                let iter_count = std::env::var(#ITER_ENV_VAR).unwrap().parse::<usize>().unwrap();
                for _ in 0..iter_count {
                    #block
                }
            })
-        }
+        };
+        let importance = format!("{}", args.importance);
+        let block_meta = {
+            // This function's job is to just print some relevant info to stdout,
+            // based on the params this attr is passed. It's not an actual test.
+            // Since we use a custom attr set on our metadata fn, it shouldn't
+            // cause problems with xfail tests.
+            let q_iter = if let Some(iter) = args.iterations {
+                quote! {
+                    println!("{} {} {}", #MDATA_LINE_PREF, #ITER_COUNT_LINE_NAME, #iter);
+                }
+            } else {
+                quote! {}
+            };
+            let weight = args
+                .weight
+                .unwrap_or_else(|| parse_quote! { #WEIGHT_DEFAULT });
+            parse_quote!({
+                #q_iter
+                println!("{} {} {}", #MDATA_LINE_PREF, #WEIGHT_LINE_NAME, #weight);
+                println!("{} {} {}", #MDATA_LINE_PREF, #IMPORTANCE_LINE_NAME, #importance);
+                println!("{} {} {}", #MDATA_LINE_PREF, #VERSION_LINE_NAME, #MDATA_VER);
+            })
+        };
+
+        vec![
+            // The real test.
+            ItemFn {
+                attrs: attrs_main,
+                vis: vis.clone(),
+                sig: sig_main,
+                block: block_main,
+            },
+            // The fake test.
+            ItemFn {
+                attrs: attrs_meta,
+                vis,
+                sig: sig_meta,
+                block: block_meta,
+            },
+        ]
    } else {
-        block
+        vec![ItemFn {
+            attrs: attrs_main,
+            vis,
+            sig: sig_main,
+            block,
+        }]
    };

-    ItemFn {
-        attrs,
-        vis,
-        sig,
-        block,
-    }
-    .into_token_stream()
-    .into()
+    fns.into_iter()
+        .flat_map(|f| TokenStream::from(f.into_token_stream()))
+        .collect()
 }
--- a/tooling/perf/Cargo.toml
+++ b/tooling/perf/Cargo.toml
@ -1,11 +1,30 @@
 [package]
 name = "perf"
 version = "0.1.0"
+description = "A tool for measuring Zed test performance, with too many Clippy lints"
 publish.workspace = true
 edition.workspace = true

-[lints]
-workspace = true
+[lib]
+
+# Some personal lint preferences :3
+[lints.rust]
+missing_docs = "warn"
+
+[lints.clippy]
+needless_continue = "allow" # For a convenience macro
+all = "warn"
+pedantic = "warn"
+style = "warn"
+missing_docs_in_private_items = "warn"
+as_underscore = "deny"
+allow_attributes_without_reason = "deny"
+let_underscore_must_use = "forbid"
+undocumented_unsafe_blocks = "forbid"
+missing_safety_doc = "forbid"

 [dependencies]
+collections.workspace = true
+serde.workspace = true
+serde_json.workspace = true
 workspace-hack.workspace = true
--- a/tooling/perf/src/lib.rs
+++ b/tooling/perf/src/lib.rs
@ -0,0 +1,413 @@
+//! Some constants and datatypes used in the Zed perf profiler. Should only be
+//! consumed by the crate providing the matching macros.
+
+use collections::HashMap;
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+pub mod consts {
+    //! Preset idenitifiers and constants so that the profiler and proc macro agree
+    //! on their communication protocol.
+
+    /// The suffix on the actual test function.
+    pub const SUF_NORMAL: &str = "__ZED_PERF_FN";
+    /// The suffix on an extra function which prints metadata about a test to stdout.
+    pub const SUF_MDATA: &str = "__ZED_PERF_MDATA";
+    /// The env var in which we pass the iteration count to our tests.
+    pub const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
+    /// The prefix printed on all benchmark test metadata lines, to distinguish it from
+    /// possible output by the test harness itself.
+    pub const MDATA_LINE_PREF: &str = "ZED_MDATA_";
+    /// The version number for the data returned from the test metadata function.
+    /// Increment on non-backwards-compatible changes.
+    pub const MDATA_VER: u32 = 0;
+    /// The default weight, if none is specified.
+    pub const WEIGHT_DEFAULT: u8 = 50;
+    /// How long a test must have run to be assumed to be reliable-ish.
+    pub const NOISE_CUTOFF: std::time::Duration = std::time::Duration::from_millis(250);
+
+    /// Identifier for the iteration count of a test metadata.
+    pub const ITER_COUNT_LINE_NAME: &str = "iter_count";
+    /// Identifier for the weight of a test metadata.
+    pub const WEIGHT_LINE_NAME: &str = "weight";
+    /// Identifier for importance in test metadata.
+    pub const IMPORTANCE_LINE_NAME: &str = "importance";
+    /// Identifier for the test metadata version.
+    pub const VERSION_LINE_NAME: &str = "version";
+
+    /// Where to save json run information.
+    pub const RUNS_DIR: &str = ".perf-runs";
+}
+
+/// How relevant a benchmark is.
+#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
+pub enum Importance {
+    /// Regressions shouldn't be accepted without good reason.
+    Critical = 4,
+    /// Regressions should be paid extra attention.
+    Important = 3,
+    /// No extra attention should be paid to regressions, but they might still
+    /// be indicative of something happening.
+    #[default]
+    Average = 2,
+    /// Unclear if regressions are likely to be meaningful, but still worth keeping
+    /// an eye on. Lowest level that's checked by default by the profiler.
+    Iffy = 1,
+    /// Regressions are likely to be spurious or don't affect core functionality.
+    /// Only relevant if a lot of them happen, or as supplemental evidence for a
+    /// higher-importance benchmark regressing. Not checked by default.
+    Fluff = 0,
+}
+
+impl std::fmt::Display for Importance {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Importance::Critical => f.write_str("critical"),
+            Importance::Important => f.write_str("important"),
+            Importance::Average => f.write_str("average"),
+            Importance::Iffy => f.write_str("iffy"),
+            Importance::Fluff => f.write_str("fluff"),
+        }
+    }
+}
+
+/// Why or when did this test fail?
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum FailKind {
+    /// Failed while triaging it to determine the iteration count.
+    Triage,
+    /// Failed while profiling it.
+    Profile,
+    /// Failed due to an incompatible version for the test.
+    VersionMismatch,
+    /// Skipped due to filters applied on the perf run.
+    Skipped,
+}
+
+impl std::fmt::Display for FailKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FailKind::Triage => f.write_str("failed in triage"),
+            FailKind::Profile => f.write_str("failed while profiling"),
+            FailKind::VersionMismatch => f.write_str("test version mismatch"),
+            FailKind::Skipped => f.write_str("skipped"),
+        }
+    }
+}
+
+/// Information about a given perf test.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct TestMdata {
+    /// A version number for when the test was generated. If this is greater
+    /// than the version this test handler expects, one of the following will
+    /// happen in an unspecified manner:
+    /// - The test is skipped silently.
+    /// - The handler exits with an error message indicating the version mismatch
+    ///   or inability to parse the metadata.
+    ///
+    /// INVARIANT: If `version` <= `MDATA_VER`, this tool *must* be able to
+    /// correctly parse the output of this test.
+    pub version: u32,
+    /// How many iterations to pass this test, if this is preset.
+    pub iterations: Option<usize>,
+    /// The importance of this particular test. See the docs on `Importance` for
+    /// details.
+    pub importance: Importance,
+    /// The weight of this particular test within its importance category. Used
+    /// when comparing across runs.
+    pub weight: u8,
+}
+
+/// The actual timings of a test, as measured by Hyperfine.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Timings {
+    /// Mean runtime for `self.iter_total` runs of this test.
+    pub mean: Duration,
+    /// Standard deviation for the above.
+    pub stddev: Duration,
+}
+
+impl Timings {
+    /// How many iterations does this test seem to do per second?
+    #[expect(
+        clippy::cast_precision_loss,
+        reason = "We only care about a couple sig figs anyways"
+    )]
+    #[must_use]
+    pub fn iters_per_sec(&self, total_iters: usize) -> f64 {
+        (1000. / self.mean.as_millis() as f64) * total_iters as f64
+    }
+}
+
+/// Aggregate output of all tests run by this handler.
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+pub struct Output {
+    /// A list of test outputs. Format is `(test_name, iter_count, timings)`.
+    /// The latter being set indicates the test succeeded.
+    ///
+    /// INVARIANT: If the test succeeded, the second field is `Some(mdata)` and
+    /// `mdata.iterations` is `Some(_)`.
+    tests: Vec<(String, Option<TestMdata>, Result<Timings, FailKind>)>,
+}
+
+impl Output {
+    /// Instantiates an empty "output". Useful for merging.
+    #[must_use]
+    pub fn blank() -> Self {
+        Output { tests: Vec::new() }
+    }
+
+    /// Reports a success and adds it to this run's `Output`.
+    pub fn success(
+        &mut self,
+        name: impl AsRef<str>,
+        mut mdata: TestMdata,
+        iters: usize,
+        timings: Timings,
+    ) {
+        mdata.iterations = Some(iters);
+        self.tests
+            .push((name.as_ref().to_string(), Some(mdata), Ok(timings)));
+    }
+
+    /// Reports a failure and adds it to this run's `Output`. If this test was tried
+    /// with some number of iterations (i.e. this was not a version mismatch or skipped
+    /// test), it should be reported also.
+    ///
+    /// Using the `fail!()` macro is usually more convenient.
+    pub fn failure(
+        &mut self,
+        name: impl AsRef<str>,
+        mut mdata: Option<TestMdata>,
+        attempted_iters: Option<usize>,
+        kind: FailKind,
+    ) {
+        if let Some(ref mut mdata) = mdata {
+            mdata.iterations = attempted_iters;
+        }
+        self.tests
+            .push((name.as_ref().to_string(), mdata, Err(kind)));
+    }
+
+    /// True if no tests executed this run.
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.tests.is_empty()
+    }
+
+    /// Sorts the runs in the output in the order that we want it printed.
+    pub fn sort(&mut self) {
+        self.tests.sort_unstable_by(|a, b| match (a, b) {
+            // Tests where we got no metadata go at the end.
+            ((_, Some(_), _), (_, None, _)) => std::cmp::Ordering::Greater,
+            ((_, None, _), (_, Some(_), _)) => std::cmp::Ordering::Less,
+            // Then sort by importance, then weight.
+            ((_, Some(a_mdata), _), (_, Some(b_mdata), _)) => {
+                let c = a_mdata.importance.cmp(&b_mdata.importance);
+                if matches!(c, std::cmp::Ordering::Equal) {
+                    a_mdata.weight.cmp(&b_mdata.weight)
+                } else {
+                    c
+                }
+            }
+            // Lastly by name.
+            ((a_name, ..), (b_name, ..)) => a_name.cmp(b_name),
+        });
+    }
+
+    /// Merges the output of two runs, appending a prefix to the results of the new run.
+    /// To be used in conjunction with `Output::blank()`, or else only some tests will have
+    /// a prefix set.
+    pub fn merge(&mut self, other: Self, pref_other: impl AsRef<str>) {
+        self.tests = std::mem::take(&mut self.tests)
+            .into_iter()
+            .chain(other.tests.into_iter().map(|(name, md, tm)| {
+                let mut new_name = "crates/".to_string();
+                new_name.push_str(pref_other.as_ref());
+                new_name.push_str("::");
+                new_name.push_str(&name);
+                (new_name, md, tm)
+            }))
+            .collect();
+    }
+
+    /// Evaluates the performance of `self` against `baseline`. The latter is taken
+    /// as the comparison point, i.e. a positive resulting `PerfReport` means that
+    /// `self` performed better.
+    ///
+    /// # Panics
+    /// `self` and `baseline` are assumed to have the iterations field on all
+    /// `TestMdata`s set to `Some(_)` if the `TestMdata` is present itself.
+    #[must_use]
+    pub fn compare_perf(self, baseline: Self) -> PerfReport {
+        let self_categories = self.collapse();
+        let mut other_categories = baseline.collapse();
+
+        let deltas = self_categories
+            .into_iter()
+            .filter_map(|(cat, self_data)| {
+                // Only compare categories where both           meow
+                // runs have data.                              /
+                let mut other_data = other_categories.remove(&cat)?;
+                let mut max = 0.;
+                let mut min = 0.;
+
+                // Running totals for averaging out tests.
+                let mut r_total_numerator = 0.;
+                let mut r_total_denominator = 0;
+                // Yeah this is O(n^2), but realistically it'll hardly be a bottleneck.
+                for (name, (s_timings, s_iters, weight)) in self_data {
+                    // Only use the new weights if they conflict.
+                    let Some((o_timings, o_iters, _)) = other_data.remove(&name) else {
+                        continue;
+                    };
+                    let shift =
+                        (s_timings.iters_per_sec(s_iters) / o_timings.iters_per_sec(o_iters)) - 1.;
+                    if shift > max {
+                        max = shift;
+                    }
+                    if shift < min {
+                        min = shift;
+                    }
+                    r_total_numerator += shift * f64::from(weight);
+                    r_total_denominator += u32::from(weight);
+                }
+                let mean = r_total_numerator / f64::from(r_total_denominator);
+                // TODO: also aggregate standard deviation? that's harder to keep
+                // meaningful, though, since we dk which tests are correlated
+                Some((cat, PerfDelta { max, mean, min }))
+            })
+            .collect();
+
+        PerfReport { deltas }
+    }
+
+    /// Collapses the `PerfReport` into a `HashMap` of `Importance` <-> tests
+    /// each represented as a map of `name, (Timings, iterations, weight)`.
+    fn collapse(self) -> HashMap<Importance, HashMap<String, (Timings, usize, u8)>> {
+        let mut categories = HashMap::<Importance, HashMap<String, _>>::default();
+        for entry in self.tests {
+            if let Some(mdata) = entry.1
+                && let Ok(timings) = entry.2
+            {
+                if let Some(handle) = categories.get_mut(&mdata.importance) {
+                    handle.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
+                } else {
+                    let mut new = HashMap::default();
+                    new.insert(entry.0, (timings, mdata.iterations.unwrap(), mdata.weight));
+                    categories.insert(mdata.importance, new);
+                }
+            }
+        }
+
+        categories
+    }
+}
+
+impl std::fmt::Display for Output {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Don't print the header for an empty run.
+        if self.tests.is_empty() {
+            return Ok(());
+        }
+
+        // We want to print important tests at the top, then alphabetical.
+        let mut sorted = self.clone();
+        sorted.sort();
+        // Markdown header for making a nice little table :>
+        writeln!(
+            f,
+            "| Command | Iter/sec | Mean [ms] | SD [ms] | Iterations | Importance (weight) |",
+        )?;
+        writeln!(f, "|:---|---:|---:|---:|---:|---:|")?;
+        for (name, metadata, timings) in &sorted.tests {
+            match metadata {
+                Some(metadata) => match timings {
+                    // Happy path.
+                    Ok(timings) => {
+                        // If the test succeeded, then metadata.iterations is Some(_).
+                        writeln!(
+                            f,
+                            "| {} | {:.2} | {} | {:.2} | {} | {} ({}) |",
+                            name,
+                            timings.iters_per_sec(metadata.iterations.unwrap()),
+                            {
+                                // Very small mean runtimes will give inaccurate
+                                // results. Should probably also penalise weight.
+                                let mean = timings.mean.as_secs_f64() * 1000.;
+                                if mean < consts::NOISE_CUTOFF.as_secs_f64() * 1000. / 8. {
+                                    format!("{mean:.2} (unreliable)")
+                                } else {
+                                    format!("{mean:.2}")
+                                }
+                            },
+                            timings.stddev.as_secs_f64() * 1000.,
+                            metadata.iterations.unwrap(),
+                            metadata.importance,
+                            metadata.weight,
+                        )?;
+                    }
+                    // We have (some) metadata, but the test errored.
+                    Err(err) => writeln!(
+                        f,
+                        "| ({}) {} | N/A | N/A | N/A | {} | {} ({}) |",
+                        err,
+                        name,
+                        metadata
+                            .iterations
+                            .map_or_else(|| "N/A".to_owned(), |i| format!("{i}")),
+                        metadata.importance,
+                        metadata.weight
+                    )?,
+                },
+                // No metadata, couldn't even parse the test output.
+                None => writeln!(
+                    f,
+                    "| ({}) {} | N/A | N/A | N/A | N/A | N/A |",
+                    timings.as_ref().unwrap_err(),
+                    name
+                )?,
+            }
+        }
+        writeln!(f)?;
+        Ok(())
+    }
+}
+
+/// The difference in performance between two runs within a given importance
+/// category.
+struct PerfDelta {
+    /// The biggest improvement / least bad regression.
+    max: f64,
+    /// The weighted average change in test times.
+    mean: f64,
+    /// The worst regression / smallest improvement.
+    min: f64,
+}
+
+/// Shim type for reporting all performance deltas across importance categories.
+pub struct PerfReport {
+    /// Inner (group, diff) pairing.
+    deltas: HashMap<Importance, PerfDelta>,
+}
+
+impl std::fmt::Display for PerfReport {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.deltas.is_empty() {
+            return write!(f, "(no matching tests)");
+        }
+        let sorted = self.deltas.iter().collect::<Vec<_>>();
+        writeln!(f, "| Category | Max | Mean | Min |")?;
+        // We don't want to print too many newlines at the end, so handle newlines
+        // a little jankily like this.
+        write!(f, "|:---|---:|---:|---:|")?;
+        for (cat, delta) in sorted.into_iter().rev() {
+            write!(
+                f,
+                "\n| {cat} | {:.3} | {:.3} | {:.3} |",
+                delta.max, delta.mean, delta.min
+            )?;
+        }
+        Ok(())
+    }
+}
--- a/tooling/perf/src/main.rs
+++ b/tooling/perf/src/main.rs
@ -1,18 +1,15 @@
-#![warn(clippy::all, clippy::pedantic, clippy::undocumented_unsafe_blocks)]
-#![cfg_attr(release, deny(warnings))]
-
 //! Perf profiler for Zed tests. Outputs timings of tests marked with the `#[perf]`
 //! attribute to stdout in Markdown. See the documentation of `util_macros::perf`
 //! for usage details on the actual attribute.
 //!
 //! # Setup
 //! Make sure `hyperfine` is installed and in the shell path, then run
-//! `cargo build --bin perf --workspace --release` to build the profiler.
+//! `cargo build -p perf --release` to build the profiler.
 //!
 //! # Usage
-//! Calling this tool rebuilds everything with some cfg flags set for the perf
-//! proc macro *and* enables optimisations (`release-fast` profile), so expect it
-//! to take a little while.
+//! Calling this tool rebuilds the targeted crate(s) with some cfg flags set for the
+//! perf proc macro *and* enables optimisations (`release-fast` profile), so expect
+//! it to take a little while.
 //!
 //! To test an individual crate, run:
 //! ```sh
@ -24,37 +21,221 @@
 //! cargo perf-test --workspace
 //! ```
 //!
+//! Some command-line parameters are also recognised by this profiler. To filter
+//! out all tests below a certain importance (e.g. `important`), run:
+//! ```sh
+//! cargo perf-test $WHATEVER -- --important
+//! ```
+//!
+//! Similarly, to skip outputting progress to the command line, pass `-- --quiet`.
+//! These flags can be combined.
+//!
+//! ## Comparing runs
+//! Passing `--json=ident` will save per-crate run files in `.perf-runs`, e.g.
+//! `cargo perf-test -p gpui -- --json=blah` will result in `.perf-runs/blah.gpui.json`
+//! being created (unless no tests were run). These results can be automatically
+//! compared. To do so, run `cargo perf-compare new-ident old-ident`.
+//!
+//! NB: All files matching `.perf-runs/ident.*.json` will be considered when
+//! doing this comparison, so ensure there aren't leftover files in your `.perf-runs`
+//! directory that might match that!
+//!
 //! # Notes
 //! This should probably not be called manually unless you're working on the profiler
 //! itself; use the `cargo perf-test` alias (after building this crate) instead.

+#[allow(clippy::wildcard_imports, reason = "Our crate")]
+use perf::*;
+
 use std::{
+    fs::OpenOptions,
+    io::Write,
+    path::{Path, PathBuf},
    process::{Command, Stdio},
    time::{Duration, Instant},
 };

 /// How many iterations to attempt the first time a test is run.
-const DEFAULT_ITER_COUNT: usize = 12;
+const DEFAULT_ITER_COUNT: usize = 3;
 /// Multiplier for the iteration count when a test doesn't pass the noise cutoff.
 const ITER_COUNT_MUL: usize = 4;
-/// How long a test must have run to be assumed to be reliable-ish.
-const NOISE_CUTOFF: Duration = Duration::from_millis(250);

-// If any of the below constants are changed, make sure to also update the perf
-// proc macro to match!
+/// Report a failure into the output and skip an iteration.
+macro_rules! fail {
+    ($output:ident, $name:expr, $kind:expr) => {{
+        $output.failure($name, None, None, $kind);
+        continue;
+    }};
+    ($output:ident, $name:expr, $mdata:expr, $kind:expr) => {{
+        $output.failure($name, Some($mdata), None, $kind);
+        continue;
+    }};
+    ($output:ident, $name:expr, $mdata:expr, $count:expr, $kind:expr) => {{
+        $output.failure($name, Some($mdata), Some($count), $kind);
+        continue;
+    }};
+}

-/// The suffix on tests marked with `#[perf]`.
-const SUF_NORMAL: &str = "__ZED_PERF";
-/// The suffix on tests marked with `#[perf(iterations = n)]`.
-const SUF_FIXED: &str = "__ZED_PERF_FIXEDITER";
-/// The env var in which we pass the iteration count to our tests.
-const ITER_ENV_VAR: &str = "ZED_PERF_ITER";
+/// How does this perf run return its output?
+enum OutputKind<'a> {
+    /// Print markdown to the terminal.
+    Markdown,
+    /// Save JSON to a file.
+    Json(&'a Path),
+}

-#[allow(clippy::too_many_lines)]
+/// Runs a given metadata-returning function from a test handler, parsing its
+/// output into a `TestMdata`.
+fn parse_mdata(test_bin: &str, mdata_fn: &str) -> Result<TestMdata, FailKind> {
+    let mut cmd = Command::new(test_bin);
+    cmd.args([mdata_fn, "--exact", "--nocapture"]);
+    let out = cmd
+        .output()
+        .expect("FATAL: Could not run test binary {test_bin}");
+    assert!(out.status.success());
+    let stdout = String::from_utf8_lossy(&out.stdout);
+    let mut version = None;
+    let mut iterations = None;
+    let mut importance = Importance::default();
+    let mut weight = consts::WEIGHT_DEFAULT;
+    for line in stdout
+        .lines()
+        .filter_map(|l| l.strip_prefix(consts::MDATA_LINE_PREF))
+    {
+        let mut items = line.split_whitespace();
+        // For v0, we know the ident always comes first, then one field.
+        match items.next().unwrap() {
+            consts::VERSION_LINE_NAME => {
+                let v = items.next().unwrap().parse::<u32>().unwrap();
+                if v > consts::MDATA_VER {
+                    return Err(FailKind::VersionMismatch);
+                }
+                version = Some(v);
+            }
+            consts::ITER_COUNT_LINE_NAME => {
+                iterations = Some(items.next().unwrap().parse::<usize>().unwrap());
+            }
+            consts::IMPORTANCE_LINE_NAME => {
+                importance = match items.next().unwrap() {
+                    "critical" => Importance::Critical,
+                    "important" => Importance::Important,
+                    "average" => Importance::Average,
+                    "iffy" => Importance::Iffy,
+                    "fluff" => Importance::Fluff,
+                    _ => unreachable!(),
+                };
+            }
+            consts::WEIGHT_LINE_NAME => {
+                weight = items.next().unwrap().parse::<u8>().unwrap();
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    Ok(TestMdata {
+        version: version.unwrap(),
+        // Iterations may be determined by us and thus left unspecified.
+        iterations,
+        // In principle this should always be set, but just for the sake of
+        // stability allow the potentially-breaking change of not reporting the
+        // importance without erroring. Maybe we want to change this.
+        importance,
+        // Same with weight.
+        weight,
+    })
+}
+
+/// Compares the perf results of two profiles as per the arguments passed in.
+fn compare_profiles(args: &[String]) {
+    let ident_new = args.first().expect("FATAL: missing identifier for new run");
+    let ident_old = args.get(1).expect("FATAL: missing identifier for old run");
+    // TODO: move this to a constant also tbh
+    let wspace_dir = std::env::var("CARGO_WORKSPACE_DIR").unwrap();
+    let runs_dir = PathBuf::from(&wspace_dir).join(consts::RUNS_DIR);
+
+    // Use the blank outputs initially, so we can merge into these with prefixes.
+    let mut outputs_new = Output::blank();
+    let mut outputs_old = Output::blank();
+
+    for e in runs_dir.read_dir().unwrap() {
+        let Ok(entry) = e else {
+            continue;
+        };
+        let Ok(metadata) = entry.metadata() else {
+            continue;
+        };
+        if metadata.is_file() {
+            let Ok(name) = entry.file_name().into_string() else {
+                continue;
+            };
+
+            // A little helper to avoid code duplication. Reads the `output` from
+            // a json file, then merges it into what we have so far.
+            let read_into = |output: &mut Output| {
+                let mut elems = name.split('.').skip(1);
+                let prefix = elems.next().unwrap();
+                assert_eq!("json", elems.next().unwrap());
+                assert!(elems.next().is_none());
+                let handle = OpenOptions::new().read(true).open(entry.path()).unwrap();
+                let o_other: Output = serde_json::from_reader(handle).unwrap();
+                output.merge(o_other, prefix);
+            };
+
+            if name.starts_with(ident_old) {
+                read_into(&mut outputs_old);
+            } else if name.starts_with(ident_new) {
+                read_into(&mut outputs_new);
+            }
+        }
+    }
+
+    let res = outputs_new.compare_perf(outputs_old);
+    println!("{res}");
+}
+
+#[expect(clippy::too_many_lines, reason = "This will be split up soon!")]
 fn main() {
+    let args = std::env::args().collect::<Vec<_>>();
    // We get passed the test we need to run as the 1st argument after our own name.
-    let test_bin = std::env::args().nth(1).unwrap();
-    let mut cmd = Command::new(&test_bin);
+    let test_bin = args
+        .get(1)
+        .expect("FATAL: No test binary or command; this shouldn't be manually invoked!");
+
+    // We're being asked to compare two results, not run the profiler.
+    if test_bin == "compare" {
+        compare_profiles(&args[2..]);
+        return;
+    }
+
+    // Whether to skip printing some information to stderr.
+    let mut quiet = false;
+    // Minimum test importance we care about this run.
+    let mut thresh = Importance::Iffy;
+    // Where to print the output of this run.
+    let mut out_kind = OutputKind::Markdown;
+
+    for arg in args.iter().skip(2) {
+        match arg.as_str() {
+            "--critical" => thresh = Importance::Critical,
+            "--important" => thresh = Importance::Important,
+            "--average" => thresh = Importance::Average,
+            "--iffy" => thresh = Importance::Iffy,
+            "--fluff" => thresh = Importance::Fluff,
+            "--quiet" => quiet = true,
+            s if s.starts_with("--json") => {
+                out_kind = OutputKind::Json(Path::new(
+                    s.strip_prefix("--json=")
+                        .expect("FATAL: Invalid json parameter; pass --json=filename"),
+                ));
+            }
+            _ => (),
+        }
+    }
+    if !quiet {
+        eprintln!("Starting perf check");
+    }
+
+    let mut cmd = Command::new(test_bin);
    // --format=json is nightly-only :(
    cmd.args(["--list", "--format=terse"]);
    let out = cmd
@ -64,6 +245,9 @@ fn main() {
        out.status.success(),
        "FATAL: Cannot do perf check - test binary {test_bin} returned an error"
    );
+    if !quiet {
+        eprintln!("Test binary ran successfully; starting profile...");
+    }
    // Parse the test harness output to look for tests we care about.
    let stdout = String::from_utf8_lossy(&out.stdout);
    let mut test_list: Vec<_> = stdout
@ -79,37 +263,62 @@ fn main() {
            }
        })
        // Exclude tests that aren't marked for perf triage based on suffix.
-        .filter(|t_name| t_name.ends_with(SUF_NORMAL) || t_name.ends_with(SUF_FIXED))
+        .filter(|t_name| {
+            t_name.ends_with(consts::SUF_NORMAL) || t_name.ends_with(consts::SUF_MDATA)
+        })
        .collect();

    // Pulling itertools just for .dedup() would be quite a big dependency that's
-    // not used elsewhere, so do this on the vec instead.
+    // not used elsewhere, so do this on a vec instead.
    test_list.sort_unstable();
    test_list.dedup();

-    if !test_list.is_empty() {
-        // Print the markdown header which matches hyperfine's result.
-        // TODO: Support exporting JSON also.
-        println!(
-            "| Command | Mean [ms] | Min [ms] | Max [ms] | Iterations | Iter/sec |\n|:---|---:|---:|---:|---:|---:|"
-        );
-    }
+    let len = test_list.len();
+
+    // Tests should come in pairs with their mdata fn!
+    assert!(
+        len.is_multiple_of(2),
+        "Malformed tests in test binary {test_bin}"
+    );
+
+    let mut output = Output::default();

    // Spawn and profile an instance of each perf-sensitive test, via hyperfine.
-    for t_name in test_list {
-        // Pretty-print the stripped name for the test.
-        let t_name_normal = t_name.replace(SUF_FIXED, "").replace(SUF_NORMAL, "");
+    // Each test is a pair of (test, metadata-returning-fn), so grab both. We also
+    // know the list is sorted.
+    for (idx, t_pair) in test_list.chunks_exact(2).enumerate() {
+        if !quiet {
+            eprint!("\rProfiling test {}/{}", idx + 1, len / 2);
+        }
+        // Be resilient against changes to these constants.
+        let (t_name, t_mdata) = if consts::SUF_NORMAL < consts::SUF_MDATA {
+            (t_pair[0], t_pair[1])
+        } else {
+            (t_pair[1], t_pair[0])
+        };
+        // Pretty-printable stripped name for the test.
+        let t_name_pretty = t_name.replace(consts::SUF_NORMAL, "");
+
+        // Get the metadata this test reports for us.
+        let t_mdata = match parse_mdata(test_bin, t_mdata) {
+            Ok(mdata) => mdata,
+            Err(err) => fail!(output, t_name_pretty, err),
+        };
+
+        if t_mdata.importance < thresh {
+            fail!(output, t_name_pretty, t_mdata, FailKind::Skipped);
+        }
+
        // Time test execution to see how many iterations we need to do in order
        // to account for random noise. This is skipped for tests with fixed
        // iteration counts.
-        let final_iter_count = if t_name.ends_with(SUF_FIXED) {
-            None
-        } else {
+        let mut errored = false;
+        let final_iter_count = t_mdata.iterations.unwrap_or_else(|| {
            let mut iter_count = DEFAULT_ITER_COUNT;
            loop {
-                let mut cmd = Command::new(&test_bin);
+                let mut cmd = Command::new(test_bin);
                cmd.args([t_name, "--exact"]);
-                cmd.env(ITER_ENV_VAR, format!("{iter_count}"));
+                cmd.env(consts::ITER_ENV_VAR, format!("{iter_count}"));
                // Don't let the child muck up our stdin/out/err.
                cmd.stdin(Stdio::null());
                cmd.stdout(Stdio::null());
@ -119,28 +328,32 @@ fn main() {
                let out = cmd.spawn().unwrap().wait();
                let post = Instant::now();
                if !out.unwrap().success() {
-                    println!(
-                        "| {t_name_normal} (ERRORED IN TRIAGE) | N/A | N/A | N/A | {iter_count} | N/A |"
-                    );
-                    return;
+                    errored = true;
+                    break iter_count;
                }
-                if post - pre > NOISE_CUTOFF {
-                    break Some(iter_count);
+                if post - pre > consts::NOISE_CUTOFF {
+                    break iter_count;
                } else if let Some(c) = iter_count.checked_mul(ITER_COUNT_MUL) {
                    iter_count = c;
                } else {
                    // This should almost never happen, but maybe..?
                    eprintln!(
-                        "WARNING: Running nearly usize::MAX iterations of test {t_name_normal}"
+                        "WARNING: Running nearly usize::MAX iterations of test {t_name_pretty}"
                    );
-                    break Some(iter_count);
+                    break iter_count;
                }
            }
-        };
+        });
+
+        // Don't profile failing tests.
+        if errored {
+            fail!(output, t_name_pretty, t_mdata, FailKind::Triage);
+        }

        // Now profile!
        let mut perf_cmd = Command::new("hyperfine");
        // Warm up the cache and print markdown output to stdout.
+        // TODO: json
        perf_cmd.args([
            "--style",
            "none",
@ -150,42 +363,89 @@ fn main() {
            "-",
            &format!("{test_bin} {t_name}"),
        ]);
-        if let Some(final_iter_count) = final_iter_count {
-            perf_cmd.env(ITER_ENV_VAR, format!("{final_iter_count}"));
-        }
+        perf_cmd.env(consts::ITER_ENV_VAR, format!("{final_iter_count}"));
        let p_out = perf_cmd.output().unwrap();
-        let fin_iter = match final_iter_count {
-            Some(i) => &format!("{i}"),
-            None => "(preset)",
-        };
        if p_out.status.success() {
-            let output = String::from_utf8_lossy(&p_out.stdout);
-            // Strip the name of the test binary from the table (and the space after it!)
-            // + our extraneous test bits + the "Relative" column (which is always at the end and "1.00").
-            let output = output
-                .replace(&format!("{test_bin} "), "")
-                .replace(SUF_FIXED, "")
-                .replace(SUF_NORMAL, "")
-                .replace(" 1.00 |", "");
+            let cmd_output = String::from_utf8_lossy(&p_out.stdout);
            // Can't use .last() since we have a trailing newline. Sigh.
-            let fin = output.lines().nth(3).unwrap();
+            let results_line = cmd_output.lines().nth(3).unwrap();
+            // Grab the values out of the pretty-print.
+            // TODO: Parse json instead.
+            let mut res_iter = results_line.split_whitespace();
+            // Durations are given in milliseconds, so account for that.
+            let mean =
+                Duration::from_secs_f64(res_iter.nth(4).unwrap().parse::<f64>().unwrap() / 1000.);
+            let stddev =
+                Duration::from_secs_f64(res_iter.nth(1).unwrap().parse::<f64>().unwrap() / 1000.);

-            // Calculate how many iterations this does per second, for easy comparison.
-            let ms = fin
-                .split_whitespace()
-                .nth(3)
-                .unwrap()
-                .parse::<f64>()
-                .unwrap();
-            let mul_fac = 1000.0 / ms;
-            let iter_sec = match final_iter_count {
-                #[allow(clippy::cast_precision_loss)]
-                Some(c) => &format!("{:.1}", mul_fac * c as f64),
-                None => "(unknown)",
-            };
-            println!("{fin} {fin_iter} | {iter_sec} |");
+            output.success(
+                t_name_pretty,
+                t_mdata,
+                final_iter_count,
+                Timings { mean, stddev },
+            );
        } else {
-            println!("{t_name_normal} (ERRORED) | N/A | N/A | N/A | {fin_iter} | N/A |");
+            fail!(
+                output,
+                t_name_pretty,
+                t_mdata,
+                final_iter_count,
+                FailKind::Profile
+            );
+        }
+    }
+    if !quiet {
+        if output.is_empty() {
+            eprintln!("Nothing to do.");
+        } else {
+            // If stdout and stderr are on the same terminal, move us after the
+            // output from above.
+            eprintln!();
+        }
+    }
+
+    // No need making an empty json file on every empty test bin.
+    if output.is_empty() {
+        return;
+    }
+
+    match out_kind {
+        OutputKind::Markdown => print!("{output}"),
+        OutputKind::Json(user_path) => {
+            let wspace_dir = std::env::var("CARGO_WORKSPACE_DIR").unwrap();
+            let runs_dir = PathBuf::from(&wspace_dir).join(consts::RUNS_DIR);
+            std::fs::create_dir_all(&runs_dir).unwrap();
+            assert!(
+                !user_path.to_string_lossy().is_empty(),
+                "FATAL: Empty filename specified!"
+            );
+            // Get the test binary's crate's name; a path like
+            // target/release-fast/deps/gpui-061ff76c9b7af5d7
+            // would be reduced to just "gpui".
+            let test_bin_stripped = Path::new(test_bin)
+                .file_name()
+                .unwrap()
+                .to_str()
+                .unwrap()
+                .rsplit_once('-')
+                .unwrap()
+                .0;
+            let mut file_path = runs_dir.join(user_path);
+            file_path
+                .as_mut_os_string()
+                .push(format!(".{test_bin_stripped}.json"));
+            let mut out_file = OpenOptions::new()
+                .write(true)
+                .create(true)
+                .truncate(true)
+                .open(&file_path)
+                .unwrap();
+            out_file
+                .write_all(&serde_json::to_vec(&output).unwrap())
+                .unwrap();
+            if !quiet {
+                eprintln!("JSON output written to {}", file_path.display());
+            }
        }
    }
 }