mirror of
https://github.com/zed-industries/zed.git
synced 2026-05-31 19:05:00 +07:00
agent: Fix a panic when splitting streamed-in edits inside of a multibyte character (#57100)
This PR fixes a panic that could occur in the `edit_file` tool where streaming in text could split in the middle of a multibyte character. Closes FR-3 and [ZED-7ZX](https://zed-dev.sentry.io/issues/7480598098). Release Notes: - Fixed a panic that could occur when streaming in text with the `edit_file` tool.
This commit is contained in:
parent
8ca194d833
commit
14befe2151
1 changed files with 106 additions and 7 deletions
|
|
@ -113,7 +113,7 @@ impl StreamingParser {
|
|||
{
|
||||
if partial.new_text.is_some() && !state.buffer_new_text_until_old_text_done {
|
||||
// new_text appeared after old_text, so old_text is done — emit everything.
|
||||
let start = state.old_text_emitted_len.min(old_text.len());
|
||||
let start = find_char_boundary(old_text, state.old_text_emitted_len);
|
||||
let chunk = normalize_done_chunk(old_text[start..].to_string());
|
||||
state.old_text_done = true;
|
||||
state.old_text_emitted_len = old_text.len();
|
||||
|
|
@ -124,9 +124,10 @@ impl StreamingParser {
|
|||
});
|
||||
} else {
|
||||
let safe_end = safe_emit_end_for_edit_text(old_text);
|
||||
let safe_start = find_char_boundary(old_text, state.old_text_emitted_len);
|
||||
|
||||
if safe_end > state.old_text_emitted_len {
|
||||
let chunk = old_text[state.old_text_emitted_len..safe_end].to_string();
|
||||
if safe_end > safe_start {
|
||||
let chunk = old_text[safe_start..safe_end].to_string();
|
||||
state.old_text_emitted_len = safe_end;
|
||||
events.push(EditEvent::OldTextChunk {
|
||||
edit_index: index,
|
||||
|
|
@ -143,9 +144,10 @@ impl StreamingParser {
|
|||
&& !state.new_text_done
|
||||
{
|
||||
let safe_end = safe_emit_end_for_edit_text(new_text);
|
||||
let safe_start = find_char_boundary(new_text, state.new_text_emitted_len);
|
||||
|
||||
if safe_end > state.new_text_emitted_len {
|
||||
let chunk = new_text[state.new_text_emitted_len..safe_end].to_string();
|
||||
if safe_end > safe_start {
|
||||
let chunk = new_text[safe_start..safe_end].to_string();
|
||||
state.new_text_emitted_len = safe_end;
|
||||
events.push(EditEvent::NewTextChunk {
|
||||
edit_index: index,
|
||||
|
|
@ -343,8 +345,10 @@ impl StreamingParser {
|
|||
/// held back because it may be an artifact of the partial JSON fixer closing
|
||||
/// an incomplete escape sequence (e.g. turning a half-received `\n` into `\\`).
|
||||
/// The next partial will reveal the correct character.
|
||||
///
|
||||
/// The returned position is always a valid UTF-8 character boundary.
|
||||
fn safe_emit_end(text: &str) -> usize {
|
||||
if text.as_bytes().last() == Some(&b'\\') {
|
||||
if text.ends_with('\\') {
|
||||
text.len() - 1
|
||||
} else {
|
||||
text.len()
|
||||
|
|
@ -353,13 +357,35 @@ fn safe_emit_end(text: &str) -> usize {
|
|||
|
||||
fn safe_emit_end_for_edit_text(text: &str) -> usize {
|
||||
let safe_end = safe_emit_end(text);
|
||||
if safe_end > 0 && text.as_bytes()[safe_end - 1] == b'\n' {
|
||||
// Use string slicing to check the last character, ensuring we respect UTF-8 boundaries.
|
||||
if safe_end > 0 && text[..safe_end].ends_with('\n') {
|
||||
safe_end - 1
|
||||
} else {
|
||||
safe_end
|
||||
}
|
||||
}
|
||||
|
||||
/// Finds a valid UTF-8 character boundary at or before the target position.
|
||||
///
|
||||
/// When streaming partial JSON, the text structure can change between updates
|
||||
/// (e.g., an escape sequence being completed). This means a byte position that
|
||||
/// was valid in one partial may land inside a multi-byte character in the next.
|
||||
/// This function finds the nearest valid boundary at or before the target.
|
||||
fn find_char_boundary(text: &str, target: usize) -> usize {
|
||||
if target >= text.len() {
|
||||
return text.len();
|
||||
}
|
||||
if text.is_char_boundary(target) {
|
||||
return target;
|
||||
}
|
||||
// Walk backwards to find a valid boundary.
|
||||
let mut pos = target;
|
||||
while pos > 0 && !text.is_char_boundary(pos) {
|
||||
pos -= 1;
|
||||
}
|
||||
pos
|
||||
}
|
||||
|
||||
fn normalize_done_chunk(mut chunk: String) -> String {
|
||||
if chunk.ends_with('\n') {
|
||||
chunk.pop();
|
||||
|
|
@ -1146,4 +1172,77 @@ mod tests {
|
|||
}]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multibyte_char_with_trailing_backslash() {
|
||||
// Reproduces a panic where the stored `old_text_emitted_len` from a previous
|
||||
// partial lands inside a multi-byte UTF-8 character in the current partial.
|
||||
//
|
||||
// Scenario: The JSON fixer produces a literal backslash when the stream cuts
|
||||
// mid-escape. If the *next* partial replaces that backslash with a multi-byte
|
||||
// character (e.g., em-dash '—'), the stored byte position is no longer valid.
|
||||
let mut parser = StreamingParser::default();
|
||||
|
||||
// First partial: text ends with backslash (held back by safe_emit_end).
|
||||
// "abc" = 3 bytes, backslash held back, so emitted_len = 3.
|
||||
let events = parser.push_edits(&[PartialEdit {
|
||||
old_text: Some("abc\\".into()),
|
||||
new_text: None,
|
||||
}]);
|
||||
assert_eq!(
|
||||
events.as_slice(),
|
||||
&[EditEvent::OldTextChunk {
|
||||
edit_index: 0,
|
||||
chunk: "abc".into(),
|
||||
done: false,
|
||||
}]
|
||||
);
|
||||
|
||||
// Second partial: the backslash is replaced by em-dash '—' (3 bytes: E2 80 94).
|
||||
// "ab—" = 2 + 3 = 5 bytes total, with em-dash at bytes 2..5.
|
||||
// The stored emitted_len (3) is inside the em-dash!
|
||||
// This should NOT panic.
|
||||
let events = parser.push_edits(&[PartialEdit {
|
||||
old_text: Some("ab—".into()),
|
||||
new_text: None,
|
||||
}]);
|
||||
// The parser should handle this gracefully.
|
||||
let _ = events;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_emitted_len_inside_multibyte_char_boundary() {
|
||||
// More direct reproduction: emitted_len points inside a multi-byte character.
|
||||
//
|
||||
// This can happen when:
|
||||
// 1. First partial has text where byte N is a valid boundary
|
||||
// 2. Second partial has *different* text where byte N is inside a multi-byte char
|
||||
let mut parser = StreamingParser::default();
|
||||
|
||||
// First partial: "ab" (2 bytes), backslash held back.
|
||||
// After processing: emitted_len = 2
|
||||
let events = parser.push_edits(&[PartialEdit {
|
||||
old_text: Some("ab\\".into()),
|
||||
new_text: None,
|
||||
}]);
|
||||
assert_eq!(
|
||||
events.as_slice(),
|
||||
&[EditEvent::OldTextChunk {
|
||||
edit_index: 0,
|
||||
chunk: "ab".into(),
|
||||
done: false,
|
||||
}]
|
||||
);
|
||||
|
||||
// Second partial: "a—" where em-dash starts at byte 1 and spans bytes 1-3.
|
||||
// Stored emitted_len = 2, but byte 2 is inside the em-dash!
|
||||
// This should NOT panic.
|
||||
let events = parser.push_edits(&[PartialEdit {
|
||||
old_text: Some("a—".into()),
|
||||
new_text: None,
|
||||
}]);
|
||||
// The parser should handle this gracefully.
|
||||
// We don't care exactly what it emits, just that it doesn't panic.
|
||||
let _ = events;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue