Extract language_core and grammars crates from language (#52238)

This extracts a `language_core` crate from the existing `language`
crate, and creates a `grammars` data crate. The goal is to separate
tree-sitter grammar infrastructure, language configuration, and LSP
data types from the heavier buffer/editor integration layer in
`language`.

## Motivation

The `language` crate pulls in `text`, `theme`, `settings`, `rpc`,
`task`, `fs`, `clock`, `sum_tree`, and `fuzzy` — all of which are needed
for buffer integration (`Buffer`, `SyntaxMap`, `Outline`,
`DiagnosticSet`) but not for grammar parsing or language configuration.
Extracting the core types lets downstream consumers depend on
`language_core` without pulling in the full integration stack.

## Dependency graph after extraction

```
language_core   ← gpui, lsp, tree-sitter, util, collections
grammars        ← language_core, rust_embed, tree-sitter-{rust,python,...}
language        ← language_core, text, theme, settings, rpc, task, fs, ...
languages       ← language, grammars
```

## What moved to `language_core`

- `Grammar`, `GrammarId`, and all query config/builder types
- `LanguageConfig`, `LanguageMatcher`, bracket/comment/indent config
types
- `HighlightMap`, `HighlightId` (theme-dependent free functions
`highlight_style` and `highlight_name` stay in `language`)
- `LanguageName`, `LanguageId`
- `LanguageQueries`, `QUERY_FILENAME_PREFIXES`
- `CodeLabel`, `CodeLabelBuilder`, `Symbol`
- `Diagnostic`, `DiagnosticSourceKind`
- `Toolchain`, `ToolchainScope`, `ToolchainList`, `ToolchainMetadata`
- `ManifestName`
- `SoftWrap`
- LSP data types: `BinaryStatus`, `ServerHealth`,
`LanguageServerStatusUpdate`, `PromptResponseContext`, `ToLspPosition`

## What stays in `language`

- `Buffer`, `BufferSnapshot`, `SyntaxMap`, `Outline`, `DiagnosticSet`,
`LanguageScope`
- `LspAdapter`, `CachedLspAdapter`, `LspAdapterDelegate` (reference
`Arc<Language>` and `WorktreeId`)
- `ToolchainLister`, `LanguageToolchainStore` (reference `task` and
`settings` types)
- `ManifestQuery`, `ManifestProvider`, `ManifestDelegate` (reference
`WorktreeId`)
- Parser/query cursor pools, `PLAIN_TEXT`, point conversion functions

## What the `grammars` crate provides

- Embedded `.scm` query files and `config.toml` files for all built-in
languages (via `rust_embed`)
- `load_queries(name)`, `load_config(name)`,
`load_config_for_feature(name, grammars_loaded)`, and `get_file(path)`
functions
- `native_grammars()` for tree-sitter grammar registration (behind
`load-grammars` feature)

## Pre-cleanup (also in this PR)

- Removed unused `Option<&Buffer>` from
`LspAdapter::process_diagnostics`
- Removed unused `&App` from `LspAdapter::retain_old_diagnostic`
- Removed `fs: &dyn Fs` from `ToolchainLister` trait methods
(`PythonToolchainProvider` captures `fs` at construction time instead)
- Moved `Diagnostic`/`DiagnosticSourceKind` out of `buffer.rs` into
their own module

## Backward compatibility

The `language` crate re-exports everything from `language_core`, so
existing `use language::Grammar` (etc.) continues to work unchanged. The
only downstream change required is importing `CodeLabelExt` where
`.fallback_for_completion()` is called on the now-foreign `CodeLabel`
type.

Release Notes:

- N/A

---------

Co-authored-by: Agus Zubiaga <agus@zed.dev>
Co-authored-by: Tom Houlé <tom@tomhoule.com>
This commit is contained in:
Nathan Sobo 2026-03-25 17:41:09 -06:00 committed by GitHub
parent 3684b5a42f
commit 3ce0cd11ec
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
219 changed files with 2569 additions and 2185 deletions

62
Cargo.lock generated
View file

@ -7884,6 +7884,35 @@ dependencies = [
"zed-scap",
]
[[package]]
name = "grammars"
version = "0.1.0"
dependencies = [
"anyhow",
"language_core",
"rust-embed",
"toml 0.8.23",
"tree-sitter",
"tree-sitter-bash",
"tree-sitter-c",
"tree-sitter-cpp",
"tree-sitter-css",
"tree-sitter-diff",
"tree-sitter-gitcommit",
"tree-sitter-go",
"tree-sitter-gomod",
"tree-sitter-gowork",
"tree-sitter-jsdoc",
"tree-sitter-json",
"tree-sitter-md",
"tree-sitter-python",
"tree-sitter-regex",
"tree-sitter-rust",
"tree-sitter-typescript",
"tree-sitter-yaml",
"util",
]
[[package]]
name = "grid"
version = "0.18.0"
@ -9345,6 +9374,7 @@ dependencies = [
"imara-diff",
"indoc",
"itertools 0.14.0",
"language_core",
"log",
"lsp",
"parking_lot",
@ -9353,7 +9383,6 @@ dependencies = [
"rand 0.9.2",
"regex",
"rpc",
"schemars",
"semver",
"serde",
"serde_json",
@ -9388,6 +9417,25 @@ dependencies = [
"ztracing",
]
[[package]]
name = "language_core"
version = "0.1.0"
dependencies = [
"anyhow",
"collections",
"gpui",
"log",
"lsp",
"parking_lot",
"regex",
"schemars",
"serde",
"serde_json",
"toml 0.8.23",
"tree-sitter",
"util",
]
[[package]]
name = "language_extension"
version = "0.1.0"
@ -9580,9 +9628,11 @@ dependencies = [
"async-trait",
"chrono",
"collections",
"fs",
"futures 0.3.31",
"globset",
"gpui",
"grammars",
"http_client",
"itertools 0.14.0",
"json_schema_store",
@ -9602,7 +9652,6 @@ dependencies = [
"project",
"regex",
"rope",
"rust-embed",
"semver",
"serde",
"serde_json",
@ -9614,25 +9663,16 @@ dependencies = [
"task",
"terminal",
"theme",
"toml 0.8.23",
"tree-sitter",
"tree-sitter-bash",
"tree-sitter-c",
"tree-sitter-cpp",
"tree-sitter-css",
"tree-sitter-diff",
"tree-sitter-gitcommit",
"tree-sitter-go",
"tree-sitter-gomod",
"tree-sitter-gowork",
"tree-sitter-jsdoc",
"tree-sitter-json",
"tree-sitter-md",
"tree-sitter-python",
"tree-sitter-regex",
"tree-sitter-rust",
"tree-sitter-typescript",
"tree-sitter-yaml",
"unindent",
"url",
"util",

View file

@ -87,6 +87,7 @@ members = [
"crates/git_ui",
"crates/go_to_line",
"crates/google_ai",
"crates/grammars",
"crates/gpui",
"crates/gpui_linux",
"crates/gpui_macos",
@ -108,6 +109,7 @@ members = [
"crates/json_schema_store",
"crates/keymap_editor",
"crates/language",
"crates/language_core",
"crates/language_extension",
"crates/language_model",
"crates/language_models",
@ -330,6 +332,7 @@ git_hosting_providers = { path = "crates/git_hosting_providers" }
git_ui = { path = "crates/git_ui" }
go_to_line = { path = "crates/go_to_line" }
google_ai = { path = "crates/google_ai" }
grammars = { path = "crates/grammars" }
gpui = { path = "crates/gpui", default-features = false }
gpui_linux = { path = "crates/gpui_linux", default-features = false }
gpui_macos = { path = "crates/gpui_macos", default-features = false }
@ -354,6 +357,7 @@ journal = { path = "crates/journal" }
json_schema_store = { path = "crates/json_schema_store" }
keymap_editor = { path = "crates/keymap_editor" }
language = { path = "crates/language" }
language_core = { path = "crates/language_core" }
language_extension = { path = "crates/language_extension" }
language_model = { path = "crates/language_model" }
language_models = { path = "crates/language_models" }

View file

@ -1826,7 +1826,7 @@ def process_data(untyped_param, typed_param: int, another_typed: str):
}
fn python_lang() -> Language {
let debug_variables_query = include_str!("../../../languages/src/python/debugger.scm");
let debug_variables_query = include_str!("../../../grammars/src/python/debugger.scm");
Language::new(
LanguageConfig {
name: "Python".into(),
@ -1843,7 +1843,7 @@ fn python_lang() -> Language {
}
fn go_lang() -> Arc<Language> {
let debug_variables_query = include_str!("../../../languages/src/go/debugger.scm");
let debug_variables_query = include_str!("../../../grammars/src/go/debugger.scm");
Arc::new(
Language::new(
LanguageConfig {
@ -2262,7 +2262,7 @@ fn main() {
}
fn javascript_lang() -> Arc<Language> {
let debug_variables_query = include_str!("../../../languages/src/javascript/debugger.scm");
let debug_variables_query = include_str!("../../../grammars/src/javascript/debugger.scm");
Arc::new(
Language::new(
LanguageConfig {
@ -2281,7 +2281,7 @@ fn javascript_lang() -> Arc<Language> {
}
fn typescript_lang() -> Arc<Language> {
let debug_variables_query = include_str!("../../../languages/src/typescript/debugger.scm");
let debug_variables_query = include_str!("../../../grammars/src/typescript/debugger.scm");
Arc::new(
Language::new(
LanguageConfig {
@ -2300,7 +2300,7 @@ fn typescript_lang() -> Arc<Language> {
}
fn tsx_lang() -> Arc<Language> {
let debug_variables_query = include_str!("../../../languages/src/tsx/debugger.scm");
let debug_variables_query = include_str!("../../../grammars/src/tsx/debugger.scm");
Arc::new(
Language::new(
LanguageConfig {

View file

@ -13,7 +13,7 @@
//!
//! Language is detected based on file extension of the `cursor_path` field.
//! The extension-to-language mapping is built from the embedded language
//! config files in the `languages` crate.
//! config files in the `grammars` crate.
use anyhow::{Context as _, Result, bail};
use clap::Args;
@ -29,7 +29,7 @@ mod language_configs_embedded {
use rust_embed::RustEmbed;
#[derive(RustEmbed)]
#[folder = "../languages/src/"]
#[folder = "../grammars/src/"]
#[include = "*/config.toml"]
pub struct LanguageConfigs;
}
@ -123,7 +123,7 @@ fn build_extension_to_language_map() -> HashMap<String, String> {
#[cfg(feature = "dynamic_prompts")]
fn build_extension_to_language_map() -> HashMap<String, String> {
const LANGUAGES_SRC_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../languages/src");
const LANGUAGES_SRC_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../grammars/src");
let mut map = HashMap::default();

View file

@ -160,7 +160,7 @@ async fn test_edit_prediction_context(cx: &mut TestAppContext) {
}
#[gpui::test]
fn test_assemble_excerpts(cx: &mut TestAppContext) {
async fn test_assemble_excerpts(cx: &mut TestAppContext) {
let table = [
(
indoc! {r#"
@ -289,6 +289,9 @@ fn test_assemble_excerpts(cx: &mut TestAppContext) {
for (input, expected_output) in table {
let (input, ranges) = marked_text_ranges(&input, false);
let buffer = cx.new(|cx| Buffer::local(input, cx).with_language(rust_lang(), cx));
buffer
.read_with(cx, |buffer, _| buffer.parsing_idle())
.await;
buffer.read_with(cx, |buffer, _cx| {
let ranges: Vec<(Range<Point>, usize)> = ranges
.into_iter()

View file

@ -101,6 +101,7 @@ use language::{
Point, Subscription as BufferSubscription,
language_settings::{AllLanguageSettings, LanguageSettings},
};
use multi_buffer::{
Anchor, AnchorRangeExt, ExcerptId, MultiBuffer, MultiBufferOffset, MultiBufferOffsetUtf16,
MultiBufferPoint, MultiBufferRow, MultiBufferSnapshot, RowInfo, ToOffset, ToPoint,
@ -1905,7 +1906,7 @@ impl DisplaySnapshot {
.flat_map(|chunk| {
let syntax_highlight_style = chunk
.syntax_highlight_id
.and_then(|id| id.style(&editor_style.syntax));
.and_then(|id| editor_style.syntax.get(id).cloned());
let chunk_highlight = chunk.highlight_style.map(|chunk_highlight| {
HighlightStyle {
@ -1999,7 +2000,8 @@ impl DisplaySnapshot {
let syntax_style = chunk
.syntax_highlight_id
.and_then(|id| id.style(syntax_theme));
.and_then(|id| syntax_theme.get(id).cloned());
let overlay_style = chunk.highlight_style;
let combined = match (syntax_style, overlay_style) {
@ -4015,7 +4017,8 @@ pub mod tests {
for chunk in snapshot.chunks(rows, true, HighlightStyles::default()) {
let syntax_color = chunk
.syntax_highlight_id
.and_then(|id| id.style(theme)?.color);
.and_then(|id| theme.get(id)?.color);
let highlight_color = chunk.highlight_style.and_then(|style| style.color);
if let Some((last_chunk, last_syntax_color, last_highlight_color)) = chunks.last_mut()
&& syntax_color == *last_syntax_color

View file

@ -19160,7 +19160,7 @@ impl Editor {
move |cx: &mut BlockContext| {
let mut text_style = cx.editor_style.text.clone();
if let Some(highlight_style) = old_highlight_id
.and_then(|h| h.style(&cx.editor_style.syntax))
.and_then(|h| cx.editor_style.syntax.get(h).cloned())
{
text_style = text_style.highlight(highlight_style);
}
@ -25039,7 +25039,8 @@ impl Editor {
for chunk in chunks {
let highlight = chunk
.syntax_highlight_id
.and_then(|id| id.name(&style.syntax));
.and_then(|id| style.syntax.get_capture_name(id));
let mut chunk_lines = chunk.text.split('\n').peekable();
while let Some(text) = chunk_lines.next() {
let mut merged_with_last_token = false;
@ -28863,7 +28864,7 @@ pub fn styled_runs_for_code_label<'a>(
background_color: Some(local_player.selection),
..Default::default()
}
} else if let Some(style) = highlight_id.style(syntax_theme) {
} else if let Some(style) = syntax_theme.get(*highlight_id).cloned() {
style
} else {
return Default::default();

View file

@ -6,6 +6,7 @@ use gpui::{
TextStyle, Window, combine_highlights,
};
use language::BufferSnapshot;
use markdown::{Markdown, MarkdownElement};
use multi_buffer::{Anchor, MultiBufferOffset, ToOffset};
use settings::Settings;
@ -236,7 +237,7 @@ impl Editor {
.highlight_text(&text, 0..signature.label.len())
.into_iter()
.flat_map(|(range, highlight_id)| {
Some((range, highlight_id.style(cx.theme().syntax())?))
Some((range, *cx.theme().syntax().get(highlight_id)?))
});
signature.highlights =
combine_highlights(signature.highlights.clone(), highlights)

View file

@ -0,0 +1,60 @@
[package]
name = "grammars"
version = "0.1.0"
edition = "2024"
publish = false
[lints]
workspace = true
[lib]
path = "src/grammars.rs"
[dependencies]
language_core.workspace = true
rust-embed.workspace = true
anyhow.workspace = true
toml.workspace = true
util.workspace = true
tree-sitter = { workspace = true, optional = true }
tree-sitter-bash = { workspace = true, optional = true }
tree-sitter-c = { workspace = true, optional = true }
tree-sitter-cpp = { workspace = true, optional = true }
tree-sitter-css = { workspace = true, optional = true }
tree-sitter-diff = { workspace = true, optional = true }
tree-sitter-gitcommit = { workspace = true, optional = true }
tree-sitter-go = { workspace = true, optional = true }
tree-sitter-go-mod = { workspace = true, optional = true }
tree-sitter-gowork = { workspace = true, optional = true }
tree-sitter-jsdoc = { workspace = true, optional = true }
tree-sitter-json = { workspace = true, optional = true }
tree-sitter-md = { workspace = true, optional = true }
tree-sitter-python = { workspace = true, optional = true }
tree-sitter-regex = { workspace = true, optional = true }
tree-sitter-rust = { workspace = true, optional = true }
tree-sitter-typescript = { workspace = true, optional = true }
tree-sitter-yaml = { workspace = true, optional = true }
[features]
load-grammars = [
"tree-sitter",
"tree-sitter-bash",
"tree-sitter-c",
"tree-sitter-cpp",
"tree-sitter-css",
"tree-sitter-diff",
"tree-sitter-gitcommit",
"tree-sitter-go",
"tree-sitter-go-mod",
"tree-sitter-gowork",
"tree-sitter-jsdoc",
"tree-sitter-json",
"tree-sitter-md",
"tree-sitter-python",
"tree-sitter-regex",
"tree-sitter-rust",
"tree-sitter-typescript",
"tree-sitter-yaml",
]
test-support = ["load-grammars"]

1
crates/grammars/LICENSE-GPL Symbolic link
View file

@ -0,0 +1 @@
../../LICENSE-GPL

View file

@ -0,0 +1,108 @@
use anyhow::Context as _;
use language_core::{LanguageConfig, LanguageQueries, QUERY_FILENAME_PREFIXES};
use rust_embed::RustEmbed;
use util::asset_str;
/// Embedded view of the `src/` folder (everything except `*.rs` files), used
/// by the loaders in this file to read `config.toml` and `.scm` query files.
#[derive(RustEmbed)]
#[folder = "src/"]
#[exclude = "*.rs"]
struct GrammarDir;
/// Return all built-in native tree-sitter grammars.
///
/// Each entry is a `(&'static str, tree_sitter::Language)` pair: the grammar
/// name and the compiled grammar. This function does not register anything
/// itself; the caller is responsible for registering the returned grammars.
/// Note that `"json"` and `"jsonc"` share the same grammar, and that Markdown
/// contributes two entries (`"markdown"` and `"markdown-inline"`).
///
/// Only available when the `load-grammars` feature is enabled.
#[cfg(feature = "load-grammars")]
pub fn native_grammars() -> Vec<(&'static str, tree_sitter::Language)> {
vec![
("bash", tree_sitter_bash::LANGUAGE.into()),
("c", tree_sitter_c::LANGUAGE.into()),
("cpp", tree_sitter_cpp::LANGUAGE.into()),
("css", tree_sitter_css::LANGUAGE.into()),
("diff", tree_sitter_diff::LANGUAGE.into()),
("go", tree_sitter_go::LANGUAGE.into()),
("gomod", tree_sitter_go_mod::LANGUAGE.into()),
("gowork", tree_sitter_gowork::LANGUAGE.into()),
("jsdoc", tree_sitter_jsdoc::LANGUAGE.into()),
("json", tree_sitter_json::LANGUAGE.into()),
("jsonc", tree_sitter_json::LANGUAGE.into()),
("markdown", tree_sitter_md::LANGUAGE.into()),
("markdown-inline", tree_sitter_md::INLINE_LANGUAGE.into()),
("python", tree_sitter_python::LANGUAGE.into()),
("regex", tree_sitter_regex::LANGUAGE.into()),
("rust", tree_sitter_rust::LANGUAGE.into()),
("tsx", tree_sitter_typescript::LANGUAGE_TSX.into()),
(
"typescript",
tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
),
("yaml", tree_sitter_yaml::LANGUAGE.into()),
("gitcommit", tree_sitter_gitcommit::LANGUAGE.into()),
]
}
/// Load and parse the `config.toml` for a given language name.
pub fn load_config(name: &str) -> LanguageConfig {
let config_toml = String::from_utf8(
GrammarDir::get(&format!("{}/config.toml", name))
.unwrap_or_else(|| panic!("missing config for language {:?}", name))
.data
.to_vec(),
)
.unwrap();
let config: LanguageConfig = ::toml::from_str(&config_toml)
.with_context(|| format!("failed to load config.toml for language {name:?}"))
.unwrap();
config
}
/// Load a language's `config.toml`, optionally reduced to the fields that do
/// not depend on a loaded grammar.
///
/// When `grammars_loaded` is `true` the parsed config is returned unchanged.
/// Otherwise only `name`, `matcher`, and `jsx_tag_auto_close` are carried
/// over and every other field takes its `Default` value.
pub fn load_config_for_feature(name: &str, grammars_loaded: bool) -> LanguageConfig {
    let full = load_config(name);
    if !grammars_loaded {
        // Without grammars, keep just enough to identify and match the language.
        return LanguageConfig {
            name: full.name,
            matcher: full.matcher,
            jsx_tag_auto_close: full.jsx_tag_auto_close,
            ..Default::default()
        };
    }
    full
}
/// Get a raw embedded file by path (relative to `src/`).
///
/// Returns the matching [`rust_embed::EmbeddedFile`] (its `data` field holds
/// the file's bytes), or `None` if no file is embedded at `path`.
pub fn get_file(path: &str) -> Option<rust_embed::EmbeddedFile> {
GrammarDir::get(path)
}
/// Collect every embedded `.scm` query file for the language `name` into a
/// `LanguageQueries`.
///
/// A file under `<name>/` contributes to each query kind whose filename
/// prefix (from `QUERY_FILENAME_PREFIXES`) it starts with. When several files
/// map to the same kind (e.g. `highlights.scm` and `highlights_extra.scm`),
/// their contents are appended to one another in iteration order.
pub fn load_queries(name: &str) -> LanguageQueries {
    let mut queries = LanguageQueries::default();
    for path in GrammarDir::iter() {
        // Skip anything that is not located under `<name>/`.
        let Some(file_name) = path
            .strip_prefix(name)
            .and_then(|rest| rest.strip_prefix('/'))
        else {
            continue;
        };
        if !file_name.ends_with(".scm") {
            continue;
        }
        for (prefix, slot_for) in QUERY_FILENAME_PREFIXES {
            if !file_name.starts_with(prefix) {
                continue;
            }
            let contents = asset_str::<GrammarDir>(path.as_ref());
            // First file for this kind fills the slot; later ones are appended.
            match slot_for(&mut queries) {
                Some(existing) => existing.to_mut().push_str(contents.as_ref()),
                None => *slot_for(&mut queries) = Some(contents),
            }
        }
    }
    queries
}

Some files were not shown because too many files have changed in this diff Show more