mirror of
https://github.com/zed-industries/zed.git
synced 2026-05-31 19:05:00 +07:00
ep: Fix bugs in the split-commit command (#57604)
Some checks are pending
Congratsbot / check-author (push) Waiting to run
Congratsbot / congrats (push) Blocked by required conditions
run_tests / orchestrate (push) Waiting to run
run_tests / check_style (push) Waiting to run
run_tests / clippy_windows (push) Blocked by required conditions
run_tests / clippy_linux (push) Blocked by required conditions
run_tests / clippy_mac (push) Blocked by required conditions
run_tests / clippy_mac_x86_64 (push) Blocked by required conditions
run_tests / run_tests_windows (push) Blocked by required conditions
run_tests / run_tests_linux (push) Blocked by required conditions
run_tests / run_tests_mac (push) Blocked by required conditions
run_tests / miri_scheduler (push) Blocked by required conditions
run_tests / doctests (push) Blocked by required conditions
run_tests / check_workspace_binaries (push) Blocked by required conditions
run_tests / build_visual_tests_binary (push) Blocked by required conditions
run_tests / check_wasm (push) Blocked by required conditions
run_tests / check_dependencies (push) Blocked by required conditions
run_tests / check_docs (push) Blocked by required conditions
run_tests / check_licenses (push) Blocked by required conditions
run_tests / check_scripts (push) Blocked by required conditions
run_tests / check_postgres_and_protobuf_migrations (push) Blocked by required conditions
run_tests / extension_tests (push) Blocked by required conditions
run_tests / tests_pass (push) Blocked by required conditions
deploy_nightly_docs / deploy_docs (push) Has been skipped
Some checks are pending
Congratsbot / check-author (push) Waiting to run
Congratsbot / congrats (push) Blocked by required conditions
run_tests / orchestrate (push) Waiting to run
run_tests / check_style (push) Waiting to run
run_tests / clippy_windows (push) Blocked by required conditions
run_tests / clippy_linux (push) Blocked by required conditions
run_tests / clippy_mac (push) Blocked by required conditions
run_tests / clippy_mac_x86_64 (push) Blocked by required conditions
run_tests / run_tests_windows (push) Blocked by required conditions
run_tests / run_tests_linux (push) Blocked by required conditions
run_tests / run_tests_mac (push) Blocked by required conditions
run_tests / miri_scheduler (push) Blocked by required conditions
run_tests / doctests (push) Blocked by required conditions
run_tests / check_workspace_binaries (push) Blocked by required conditions
run_tests / build_visual_tests_binary (push) Blocked by required conditions
run_tests / check_wasm (push) Blocked by required conditions
run_tests / check_dependencies (push) Blocked by required conditions
run_tests / check_docs (push) Blocked by required conditions
run_tests / check_licenses (push) Blocked by required conditions
run_tests / check_scripts (push) Blocked by required conditions
run_tests / check_postgres_and_protobuf_migrations (push) Blocked by required conditions
run_tests / extension_tests (push) Blocked by required conditions
run_tests / tests_pass (push) Blocked by required conditions
deploy_nightly_docs / deploy_docs (push) Has been skipped
Correctness and efficiency fixes to `imitate_human_edits`. Release Notes: - N/A
This commit is contained in:
parent
d066a73609
commit
13e7c11768
2 changed files with 165 additions and 88 deletions
|
|
@ -651,7 +651,7 @@ pub fn parse_order_spec(spec: &str) -> Vec<BTreeSet<usize>> {
|
|||
order
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub struct EditLocation {
|
||||
pub filename: String,
|
||||
pub source_line_number: usize,
|
||||
|
|
@ -667,8 +667,8 @@ pub enum EditType {
|
|||
Insertion,
|
||||
}
|
||||
|
||||
pub fn locate_edited_line(patch: &Patch, mut edit_index: isize) -> Option<EditLocation> {
|
||||
let mut edit_locations = vec![];
|
||||
pub fn edit_locations(patch: &Patch) -> Vec<EditLocation> {
|
||||
let mut edit_locations = Vec::new();
|
||||
|
||||
for (hunk_index, hunk) in patch.hunks.iter().enumerate() {
|
||||
let mut old_line_number = hunk.old_start;
|
||||
|
|
@ -724,6 +724,12 @@ pub fn locate_edited_line(patch: &Patch, mut edit_index: isize) -> Option<EditLo
|
|||
}
|
||||
}
|
||||
|
||||
edit_locations
|
||||
}
|
||||
|
||||
pub fn locate_edited_line(patch: &Patch, mut edit_index: isize) -> Option<EditLocation> {
|
||||
let mut edit_locations = edit_locations(patch);
|
||||
|
||||
if edit_index < 0 {
|
||||
edit_index += edit_locations.len() as isize; // take from end
|
||||
}
|
||||
|
|
@ -845,6 +851,8 @@ mod tests {
|
|||
-zinc
|
||||
"};
|
||||
let patch = Patch::parse_unified_diff(patch_str);
|
||||
let locations = edit_locations(&patch);
|
||||
assert_eq!(locations.len(), 4);
|
||||
|
||||
assert_eq!(
|
||||
locate_edited_line(&patch, 0), // -blue
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
//!
|
||||
//! TODO: Port Python code to generate chronologically-ordered commits
|
||||
use crate::FailedHandling;
|
||||
use crate::reorder_patch::{Patch, PatchLine, extract_edits, locate_edited_line};
|
||||
use crate::reorder_patch::{Patch, PatchLine, edit_locations, extract_edits, locate_edited_line};
|
||||
use crate::word_diff::tokenize;
|
||||
|
||||
/// Find the largest valid UTF-8 char boundary at or before `index` in `s`.
|
||||
|
|
@ -218,6 +218,7 @@ pub fn run_split_commit(
|
|||
|
||||
let split_point = args.split_point.as_deref().and_then(parse_split_point);
|
||||
let mut output_lines = Vec::new();
|
||||
let mut processed_commits = 0usize;
|
||||
|
||||
for input_path in inputs {
|
||||
let input: Box<dyn BufRead> = if input_path.as_os_str() == "-" {
|
||||
|
|
@ -326,9 +327,23 @@ pub fn run_split_commit(
|
|||
|
||||
output_lines.push(json);
|
||||
}
|
||||
|
||||
processed_commits += 1;
|
||||
eprint!(
|
||||
"\rsplit-commit: processed {} commits, generated {} examples",
|
||||
processed_commits,
|
||||
output_lines.len()
|
||||
);
|
||||
io::stderr()
|
||||
.flush()
|
||||
.context("failed to flush progress to stderr")?;
|
||||
}
|
||||
}
|
||||
|
||||
if processed_commits > 0 {
|
||||
eprintln!();
|
||||
}
|
||||
|
||||
let output_content = output_lines.join("\n") + if output_lines.is_empty() { "" } else { "\n" };
|
||||
|
||||
if let Some(path) = output_path {
|
||||
|
|
@ -550,23 +565,20 @@ fn render_reversed_header_lines(mut lines: Vec<&str>) -> String {
|
|||
lines.join("\n") + "\n"
|
||||
}
|
||||
|
||||
/// Calculate the weight for a split position based on the character at that position.
|
||||
/// Calculate the weight for a split byte offset in `text`.
|
||||
///
|
||||
/// Higher weights indicate more natural pause points (e.g., after punctuation,
|
||||
/// at identifier boundaries). Lower weights indicate less natural points
|
||||
/// (e.g., mid-identifier).
|
||||
fn position_weight(text: &str, pos: usize) -> u32 {
|
||||
if pos == 0 || pos > text.len() {
|
||||
fn position_weight(text: &str, byte_offset: usize) -> u32 {
|
||||
if byte_offset == 0 || byte_offset > text.len() || !text.is_char_boundary(byte_offset) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
let chars: Vec<char> = text.chars().collect();
|
||||
if pos > chars.len() {
|
||||
let Some(prev_char) = text[..byte_offset].chars().next_back() else {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Get the character just before this position (what we just "typed")
|
||||
let prev_char = chars[pos - 1];
|
||||
};
|
||||
let next_char = text[byte_offset..].chars().next();
|
||||
|
||||
// High weight: natural pause points (end of statement/argument, opening brackets)
|
||||
if matches!(prev_char, ',' | ';' | ':' | '(' | '[' | '{') {
|
||||
|
|
@ -588,8 +600,7 @@ fn position_weight(text: &str, pos: usize) -> u32 {
|
|||
|
||||
// Check if we're at the end of an identifier (word char followed by non-word char)
|
||||
let is_prev_word_char = prev_char.is_alphanumeric() || prev_char == '_';
|
||||
let is_next_word_char =
|
||||
pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_');
|
||||
let is_next_word_char = next_char.is_some_and(|ch| ch.is_alphanumeric() || ch == '_');
|
||||
|
||||
if is_prev_word_char && !is_next_word_char {
|
||||
// End of identifier - high weight
|
||||
|
|
@ -614,6 +625,7 @@ fn position_weight(text: &str, pos: usize) -> u32 {
|
|||
///
|
||||
/// Returns an index based on the weights, using the provided seed for
|
||||
/// deterministic selection.
|
||||
#[cfg(test)]
|
||||
fn weighted_select(weights: &[u32], seed: u64) -> usize {
|
||||
if weights.is_empty() {
|
||||
return 0;
|
||||
|
|
@ -640,6 +652,74 @@ fn weighted_select(weights: &[u32], seed: u64) -> usize {
|
|||
weights.len() - 1
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct CandidateSplit {
|
||||
edit_byte_offset: usize,
|
||||
weight: u32,
|
||||
}
|
||||
|
||||
fn push_typed_text_candidates(
|
||||
candidates: &mut Vec<CandidateSplit>,
|
||||
edit_start_byte_offset: usize,
|
||||
final_line: &str,
|
||||
final_line_start_byte_offset: usize,
|
||||
typed_text: &str,
|
||||
) {
|
||||
for (byte_offset, character) in typed_text.char_indices() {
|
||||
let next_byte_offset = byte_offset + character.len_utf8();
|
||||
let final_line_candidate_byte_offset = final_line_start_byte_offset + next_byte_offset;
|
||||
if final_line[..final_line_candidate_byte_offset]
|
||||
.trim()
|
||||
.is_empty()
|
||||
{
|
||||
continue;
|
||||
}
|
||||
candidates.push(CandidateSplit {
|
||||
edit_byte_offset: edit_start_byte_offset + next_byte_offset,
|
||||
weight: position_weight(final_line, final_line_candidate_byte_offset),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn push_deleted_text_candidates(
|
||||
candidates: &mut Vec<CandidateSplit>,
|
||||
edit_start_byte_offset: usize,
|
||||
deleted_text: &str,
|
||||
) {
|
||||
for (byte_offset, character) in deleted_text.char_indices() {
|
||||
candidates.push(CandidateSplit {
|
||||
edit_byte_offset: edit_start_byte_offset + byte_offset + character.len_utf8(),
|
||||
weight: 2,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn weighted_select_candidate(candidates: &[CandidateSplit], seed: u64) -> Option<CandidateSplit> {
|
||||
if candidates.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let total_weight: u64 = candidates
|
||||
.iter()
|
||||
.map(|candidate| candidate.weight as u64)
|
||||
.sum();
|
||||
if total_weight == 0 {
|
||||
return Some(candidates[seed as usize % candidates.len()]);
|
||||
}
|
||||
|
||||
let target = seed % total_weight;
|
||||
let mut cumulative: u64 = 0;
|
||||
|
||||
for candidate in candidates {
|
||||
cumulative += candidate.weight as u64;
|
||||
if target < cumulative {
|
||||
return Some(*candidate);
|
||||
}
|
||||
}
|
||||
|
||||
candidates.last().copied()
|
||||
}
|
||||
|
||||
/// Calculate similarity ratio between two strings (0-100).
|
||||
fn fuzzy_ratio(s1: &str, s2: &str) -> u32 {
|
||||
if s1.is_empty() && s2.is_empty() {
|
||||
|
|
@ -700,25 +780,13 @@ pub fn imitate_human_edits(
|
|||
_ => return no_change,
|
||||
};
|
||||
|
||||
// Try to locate the last edit in source
|
||||
let src_edit_loc = locate_edited_line(&src_patch, -1);
|
||||
let source_edit_locations = edit_locations(&src_patch);
|
||||
let src_edit_loc = source_edit_locations.last().cloned();
|
||||
|
||||
// Check if source has ANY edit at the same line as target's first edit
|
||||
// We need to iterate through all edits to check this
|
||||
let src_has_edit_at_target_line = {
|
||||
let mut found = false;
|
||||
let mut idx = 0isize;
|
||||
while let Some(loc) = locate_edited_line(&src_patch, idx) {
|
||||
if loc.filename == tgt_edit_loc.filename
|
||||
&& loc.target_line_number == tgt_edit_loc.target_line_number
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
found
|
||||
};
|
||||
let src_has_edit_at_target_line = source_edit_locations.iter().any(|loc| {
|
||||
loc.filename == tgt_edit_loc.filename
|
||||
&& loc.target_line_number == tgt_edit_loc.target_line_number
|
||||
});
|
||||
|
||||
// Check if this is a replacement (deletion followed by insertion on the same line)
|
||||
// or a pure insertion (no corresponding deletion in source)
|
||||
|
|
@ -756,61 +824,62 @@ pub fn imitate_human_edits(
|
|||
// Use similar to get diff operations
|
||||
let diff = TextDiff::from_slices(&src_tokens, &tgt_tokens);
|
||||
|
||||
// Build weights for each possible split position
|
||||
let mut position_weights: Vec<u32> = Vec::new();
|
||||
let mut candidate_splits = Vec::new();
|
||||
let mut edit_byte_offset = 0usize;
|
||||
let mut final_line_byte_offset = 0usize;
|
||||
|
||||
// Simulate the edit process to collect weights for all possible split positions
|
||||
{
|
||||
let mut current_text = String::new();
|
||||
|
||||
for op in diff.ops() {
|
||||
match op.tag() {
|
||||
DiffTag::Equal => {
|
||||
for i in op.old_range() {
|
||||
current_text.push_str(src_tokens[i]);
|
||||
}
|
||||
}
|
||||
DiffTag::Replace => {
|
||||
let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect();
|
||||
let del: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
|
||||
// For insertion part
|
||||
for ch in ins.chars() {
|
||||
current_text.push(ch);
|
||||
let weight = position_weight(¤t_text, current_text.len());
|
||||
position_weights.push(weight);
|
||||
}
|
||||
|
||||
// For deletion part (we're "untyping" from source)
|
||||
for _ in del.chars() {
|
||||
// Weight deletions lower as they represent removing text
|
||||
position_weights.push(2);
|
||||
}
|
||||
}
|
||||
DiffTag::Insert => {
|
||||
let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect();
|
||||
for ch in ins.chars() {
|
||||
current_text.push(ch);
|
||||
let weight = position_weight(¤t_text, current_text.len());
|
||||
position_weights.push(weight);
|
||||
}
|
||||
}
|
||||
DiffTag::Delete => {
|
||||
let del: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
for _ in del.chars() {
|
||||
// Weight deletions lower
|
||||
position_weights.push(2);
|
||||
}
|
||||
}
|
||||
for op in diff.ops() {
|
||||
match op.tag() {
|
||||
DiffTag::Equal => {
|
||||
let equal_text: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
final_line_byte_offset += equal_text.len();
|
||||
}
|
||||
DiffTag::Replace => {
|
||||
let inserted_text: String = op.new_range().map(|i| tgt_tokens[i]).collect();
|
||||
let deleted_text: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
push_typed_text_candidates(
|
||||
&mut candidate_splits,
|
||||
edit_byte_offset,
|
||||
&tgt_line,
|
||||
final_line_byte_offset,
|
||||
&inserted_text,
|
||||
);
|
||||
push_deleted_text_candidates(
|
||||
&mut candidate_splits,
|
||||
edit_byte_offset + inserted_text.len(),
|
||||
&deleted_text,
|
||||
);
|
||||
edit_byte_offset += inserted_text.len() + deleted_text.len();
|
||||
final_line_byte_offset += inserted_text.len();
|
||||
}
|
||||
DiffTag::Insert => {
|
||||
let inserted_text: String = op.new_range().map(|i| tgt_tokens[i]).collect();
|
||||
push_typed_text_candidates(
|
||||
&mut candidate_splits,
|
||||
edit_byte_offset,
|
||||
&tgt_line,
|
||||
final_line_byte_offset,
|
||||
&inserted_text,
|
||||
);
|
||||
edit_byte_offset += inserted_text.len();
|
||||
final_line_byte_offset += inserted_text.len();
|
||||
}
|
||||
DiffTag::Delete => {
|
||||
let deleted_text: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
push_deleted_text_candidates(
|
||||
&mut candidate_splits,
|
||||
edit_byte_offset,
|
||||
&deleted_text,
|
||||
);
|
||||
edit_byte_offset += deleted_text.len();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Use weighted selection to choose split index
|
||||
if position_weights.is_empty() {
|
||||
let Some(selected_split) = weighted_select_candidate(&candidate_splits, seed) else {
|
||||
return no_change;
|
||||
}
|
||||
let split_index = weighted_select(&position_weights, seed);
|
||||
};
|
||||
let split_byte_offset = selected_split.edit_byte_offset;
|
||||
|
||||
let mut edit_index = 0usize;
|
||||
let mut new_src = String::new();
|
||||
|
|
@ -830,9 +899,9 @@ pub fn imitate_human_edits(
|
|||
let del: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
let ins: String = op.new_range().map(|i| tgt_tokens[i]).collect();
|
||||
let repl_len = del.len() + ins.len();
|
||||
if edit_index + repl_len >= split_index {
|
||||
if edit_index + repl_len >= split_byte_offset {
|
||||
// Split within this replace operation
|
||||
let offset = split_index - edit_index;
|
||||
let offset = split_byte_offset - edit_index;
|
||||
if offset < ins.len() {
|
||||
let safe_offset = floor_char_boundary(&ins, offset);
|
||||
new_src.push_str(&ins[..safe_offset]);
|
||||
|
|
@ -853,8 +922,8 @@ pub fn imitate_human_edits(
|
|||
}
|
||||
DiffTag::Insert => {
|
||||
let repl: String = op.new_range().map(|i| tgt_tokens[i]).collect();
|
||||
if edit_index + repl.len() >= split_index {
|
||||
let offset = split_index - edit_index;
|
||||
if edit_index + repl.len() >= split_byte_offset {
|
||||
let offset = split_byte_offset - edit_index;
|
||||
let safe_offset = floor_char_boundary(&repl, offset);
|
||||
new_src.push_str(&repl[..safe_offset]);
|
||||
split_found = true;
|
||||
|
|
@ -866,8 +935,8 @@ pub fn imitate_human_edits(
|
|||
}
|
||||
DiffTag::Delete => {
|
||||
let repl: String = op.old_range().map(|i| src_tokens[i]).collect();
|
||||
if edit_index + repl.len() >= split_index {
|
||||
let offset = split_index - edit_index;
|
||||
if edit_index + repl.len() >= split_byte_offset {
|
||||
let offset = split_byte_offset - edit_index;
|
||||
let safe_offset = floor_char_boundary(&repl, offset);
|
||||
new_src.push_str(&repl[..safe_offset]);
|
||||
split_found = true;
|
||||
|
|
|
|||
Loading…
Reference in a new issue