ep: Parse "user accepted prediction" markers in evals (#49598)

Also:
- Add two evals
- Remove duplicated Example 6 from the teacher prompt 

Release Notes:

- N/A
This commit is contained in:
Oleksiy Syvokon 2026-02-19 15:01:07 +02:00 committed by GitHub
parent b6cd147b9f
commit 3129d7e6b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 271 additions and 59 deletions

View file

@ -181,6 +181,7 @@ const EDIT_HISTORY_HEADING: &str = "Edit History";
const CURSOR_POSITION_HEADING: &str = "Cursor Position";
const EXPECTED_PATCH_HEADING: &str = "Expected Patch";
const REJECTED_PATCH_HEADING: &str = "Rejected Patch";
/// Marker line flagging an Edit History code block as an accepted prediction.
///
/// When the trimmed text gathered before a code block in the Edit History
/// section equals this marker, the parser writes the marker back on its own
/// line directly ahead of that block's contents in `edit_history`.
const ACCEPTED_PREDICTION_MARKER: &str = "// User accepted prediction:";
#[derive(Serialize, Deserialize)]
struct FrontMatter<'a> {
@ -352,6 +353,7 @@ impl ExampleSpec {
}
let mut current_section = Section::Start;
let mut next_edit_predicted = false;
for event in parser {
match event {
@ -387,6 +389,12 @@ impl ExampleSpec {
anyhow::bail!("Unexpected heading level: {level}");
}
Event::Start(Tag::CodeBlock(kind)) => {
if current_section == Section::EditHistory
&& text.trim() == ACCEPTED_PREDICTION_MARKER
{
next_edit_predicted = true;
}
text.clear();
match kind {
CodeBlockKind::Fenced(info) => {
block_info = info;
@ -407,6 +415,11 @@ impl ExampleSpec {
spec.uncommitted_diff = mem::take(&mut text);
}
Section::EditHistory => {
if next_edit_predicted {
spec.edit_history
.push_str(&format!("{}\n", ACCEPTED_PREDICTION_MARKER));
next_edit_predicted = false;
}
spec.edit_history.push_str(&mem::take(&mut text));
}
Section::CursorPosition => {
@ -908,4 +921,81 @@ mod tests {
let results = spec.expected_patches_with_cursor_positions();
assert_eq!(results, vec![(clean_patch, None)]);
}
// Checks that a `// User accepted prediction:` line placed before a code block
// in the Edit History section is carried into `spec.edit_history`, attached to
// exactly that block and to no other.
#[test]
fn test_from_markdown_accepted_prediction_marker() {
    let markdown = indoc! {r#"
+++
repository_url = "https://github.com/example/repo"
revision = "abc123"
+++
## Edit History
```diff
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,3 @@
-fn hello() {}
+fn hello_world() {}
```
// User accepted prediction:
```diff
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,3 @@
-fn hello_world() {}
+fn hello_world() { println!("hi"); }
```
```diff
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,3 @@
-fn hello_world() { println!("hi"); }
+fn hello_world() { println!("hello"); }
```
## Cursor Position
```src/main.rs
fn hello_world() { println!("hello"); }
# ^[CURSOR_POSITION]
```
## Expected Patch
```diff
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,3 @@
-fn hello_world() { println!("hello"); }
+fn hello_world() { println!("hello, world!"); }
```
"#};
    let spec = ExampleSpec::from_markdown(markdown).unwrap();
    let history = spec.edit_history.as_str();
    let marker = "// User accepted prediction:";
    let diff_header = "--- a/src/main.rs";

    // The first diff should NOT have the marker
    assert!(history.starts_with(diff_header));

    // The marker must be emitted on its own line, directly ahead of a diff header
    assert!(history.contains("// User accepted prediction:\n--- a/src/main.rs"));

    // Count occurrences of the marker - should be exactly one
    assert_eq!(history.matches(marker).count(), 1);

    // Every diff starts with the same header, so the checks above cannot tell
    // which diff the marker landed on. Locate one removal line that is unique to
    // the second diff and one that is unique to the third, then compare offsets.
    let marker_at = history
        .find(marker)
        .expect("marker should be present in the edit history");
    let second_diff_at = history
        .find("-fn hello_world() {}")
        .expect("second diff should be present in the edit history");
    let third_diff_at = history
        .find("-fn hello_world() { println!(\"hi\"); }")
        .expect("third diff should be present in the edit history");

    // The second diff should be preceded by the accepted prediction marker:
    // exactly one diff header sits between the marker and the second diff's body.
    assert!(marker_at < second_diff_at);
    assert_eq!(
        history[marker_at..second_diff_at]
            .matches(diff_header)
            .count(),
        1
    );

    // The third diff should NOT have the marker
    assert!(second_diff_at < third_diff_at);
    assert!(!history[second_diff_at..].contains(marker));

    // Verify all three diffs are present
    assert_eq!(history.matches(diff_header).count(), 3);
}
}

View file

@ -0,0 +1,98 @@
+++
repository_url = "https://github.com/pallets/flask"
revision = "2fec0b206c6e83ea813ab26597e15c96fab08be7"
+++
## Edit History
```diff
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -356,3 +356,6 @@
cookie = rv.headers["set-cookie"].lower()
assert "samesite=lax" in cookie
+de
+
+
def test_missing_session(app):
```
// User accepted prediction:
```diff
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -358,6 +358,14 @@
-de
+def test_session_cookie_httponly(app, client):
+ app.config["SESSION_COOKIE_HTTPONLY"] = True
+
+ @app.route("/")
+ def index():
+ flask.session["testing"] = 42
+ return "Hello World"
+
+ rv = client.get("/")
+ assert "httponly" in rv.headers["set-cookie"].lower()
def test_missing_session(app):
```
```diff
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -358,14 +358,14 @@
-def test_session_cookie_httponly(app, client):
+def test_session_cookie_secur(app, client):
app.config["SESSION_COOKIE_HTTPONLY"] = True
```
## Cursor Position
```tests/test_basic.py
cookie = rv.headers["set-cookie"].lower()
assert "samesite=lax" in cookie
def test_session_cookie_secur(app, client):
# ^[CURSOR_POSITION]
app.config["SESSION_COOKIE_HTTPONLY"] = True
@app.route("/")
def index():
flask.session["testing"] = 42
return "Hello World"
rv = client.get("/")
assert "httponly" in rv.headers["set-cookie"].lower()
def test_missing_session(app):
```
## Expected Patch
```diff
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -358,14 +358,14 @@
-def test_session_cookie_secur(app, client):
- app.config["SESSION_COOKIE_HTTPONLY"] = True
+def test_session_cookie_secure(app, client):
+ app.config["SESSION_COOKIE_SECURE"] = True
@app.route("/")
def index():
flask.session["testing"] = 42
return "Hello World"
rv = client.get("/")
- assert "httponly" in rv.headers["set-cookie"].lower()
+ assert "secure" in rv.headers["set-cookie"].lower()
```

View file

@ -0,0 +1,81 @@
+++
repository_url = "https://github.com/octocat/hello-world"
revision = "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d"
+++
## Edit History
```diff
--- a/README
+++ b/README
@@ -1,1 +1,6 @@
-Hello World!
+function filterByStatus(items, status) {
+ return items.filter(item => item.status === status);
+}
+
+function groupBy
+
```
// User accepted prediction:
```diff
--- a/README
+++ b/README
@@ -4,3 +4,9 @@
-function groupBy
+function groupByStatus(items) {
+ return items.reduce((groups, item) => {
+ const key = item.status;
+ (groups[key] = groups[key] || []).push(item);
+ return groups;
+ }, {});
+}
```
```diff
--- a/README
+++ b/README
@@ -4,4 +4,4 @@
-function groupByStatus(items) {
+function groupByCat(items) {
return items.reduce((groups, item) => {
```
## Cursor Position
```README
function filterByStatus(items, status) {
return items.filter(item => item.status === status);
}
function groupByCat(items) {
# ^[CURSOR_POSITION]
return items.reduce((groups, item) => {
const key = item.status;
(groups[key] = groups[key] || []).push(item);
return groups;
}, {});
}
```
## Expected Patch
```diff
--- a/README
+++ b/README
@@ -5,7 +5,7 @@
-function groupByCat(items) {
+function groupByCategory(items) {
# ^[CURSOR_POSITION]
return items.reduce((groups, item) => {
- const key = item.status;
+ const key = item.category;
(groups[key] = groups[key] || []).push(item);
return groups;
}, {});
```

View file

@ -238,65 +238,6 @@ The user just fixed a bug in the `add` function, changing subtraction to additio
NO_EDITS
`````
## Example 6
The user accepted a prediction for a function, then started renaming it. The original arguments were auto-generated (marked with `// User accepted prediction:`), so they CAN be updated to match the new function name. This is NOT reverting user input—it's improving auto-generated scaffolding.
### User Edit History
`````
--- a/math_utils.py
+++ b/math_utils.py
@@ -3,3 +3,5 @@
def calculate_rectangle_area(width, height):
return width * height
+de
// User accepted prediction:
--- a/math_utils.py
+++ b/math_utils.py
@@ -3,5 +3,7 @@
def calculate_rectangle_area(width, height):
return width * height
-de
+def calculate_rectangle_perimeter(width, height):
+
--- a/math_utils.py
+++ b/math_utils.py
@@ -5,5 +5,5 @@
return width * height
-def calculate_rectangle_perimeter(width, height):
+def calculate_sq_perimeter(width, height):
`````
### Current File
`````math_utils.py
def calculate_rectangle_area(width, height):
return width * height
<|editable_region_start|>
def calculate_sq<|user_cursor|>_perimeter(width, height):
<|editable_region_end|>
`````
### Output
The user accepted a prediction for `calculate_rectangle_perimeter(width, height)`, then started renaming `rectangle` to `square`. Since squares have equal sides, the arguments should change from `(width, height)` to `(side)`. The arguments were auto-generated (from an accepted prediction), so modifying them is appropriate.
`````
<|editable_region_start|>
def calculate_square_perimeter(side):
<|user_cursor|>
<|editable_region_end|>
`````
## Example 5
The user just deleted code, leaving behind what looks incomplete. You must NOT "complete" it by restoring deleted content—that would undo their edit. Output NO_EDITS. **This is the correct response even though the code appears broken.**

View file

@ -60,6 +60,8 @@ extend-exclude = [
"crates/gpui/src/platform/mac/dispatcher.rs",
# Tests contain partially incomplete words (by design)
"crates/edit_prediction_cli/src/split_commit.rs",
# Eval examples contain intentionally partial words (e.g. "secur" for "secure")
"crates/edit_prediction_cli/evals/",
# Tests contain `baˇr` that cause `"ba" should be "by" or "be".`-like false-positives
"crates/editor/src/document_symbols.rs",
]