From 6523260d21894fb6036cc56b864d516eece2e180 Mon Sep 17 00:00:00 2001 From: Scott Taylor Date: Sun, 24 Nov 2024 15:20:18 -0600 Subject: [PATCH] conflicts: long markers if file contains marker-like lines If a file contains lines which look like conflict markers, then we need to make the real conflict markers longer so that the materialized conflicts can be parsed unambiguously. For instance, if we have a file explaining the differences between Jujutsu's conflict markers and Git's conflict markers, it could produce a conflict with long markers like this: ``` <<<<<<<<<<< Conflict 1 of 1 %%%%%%%%%%% Changes from base to side #1 -Git's conflict markers look like this: +Unlike Jujutsu, Git's conflict markers look like this: +++++++++++ Contents of side #2 Jujutsu uses different conflict markers than Git, which just shows the sides of a conflict without a diff: >>>>>>>>>>> Conflict 1 of 1 ends <<<<<<< left ======= right >>>>>>> ``` We need to allow conflict markers to be longer than strictly necessary, because imagine that we are in the process of resolving the conflict shown above, and we remove the example of Git's conflict markers: ``` <<<<<<<<<<< Conflict 1 of 1 %%%%%%%%%%% Changes from base to side #1 -Git's conflict markers look like this: +Unlike Jujutsu, Git's conflict markers look like this: +++++++++++ Contents of side #2 Jujutsu uses different conflict markers than Git, which just shows the sides of a conflict without a diff: >>>>>>>>>>> Conflict 1 of 1 ends ``` Now, there is no reason for the conflict markers to be longer than 7 characters, since the text which looked like conflict markers has been removed. We still want this file to be parsed correctly as a conflict though, so we need to allow markers which are longer than necessary. --- lib/src/conflicts.rs | 231 +++++++++++++++++++++++++++--------- lib/tests/test_conflicts.rs | 104 +++++++++++----- 2 files changed, 252 insertions(+), 83 deletions(-) diff --git a/lib/src/conflicts.rs b/lib/src/conflicts.rs index 0eb5a5ebd5..4e099751e0 100644 --- a/lib/src/conflicts.rs +++ b/lib/src/conflicts.rs @@ -50,8 +50,14 @@ use crate::merge::MergedTreeValue; use crate::repo_path::RepoPath; use crate::store::Store; -/// Length of conflict markers. -pub const CONFLICT_MARKER_LEN: usize = 7; +/// Minimum length of conflict markers. +pub const MIN_CONFLICT_MARKER_LEN: usize = 7; + +/// If a file already contains lines which look like conflict markers of length +/// N, then the conflict markers we add will be of length (N + increment). This +/// number is chosen to make the conflict markers noticeably longer than the +/// existing markers. +pub const CONFLICT_MARKER_LEN_INCREMENT: usize = 4; fn write_diff_hunks(hunks: &[DiffHunk], file: &mut dyn Write) -> io::Result<()> { for hunk in hunks { @@ -259,16 +265,26 @@ impl ConflictMarkerKind { } } +/// Represents a conflict marker parsed from the file. Conflict markers consist +/// of a single ASCII character repeated for a certain length. +struct ConflictMarker { + kind: ConflictMarkerKind, + len: usize, +} + /// Write a conflict marker to an output file. fn write_conflict_marker( output: &mut dyn Write, kind: ConflictMarkerKind, + len: usize, suffix_text: &str, ) -> io::Result<()> { + debug_assert!(len >= MIN_CONFLICT_MARKER_LEN); + // It should be faster to build a string in memory and write it to the output // all at once than to write each byte to the output one at a time let mut line: BString = BString::default(); - for _ in 0..CONFLICT_MARKER_LEN { + for _ in 0..len { line.push_byte(kind.to_byte()); } @@ -281,17 +297,13 @@ fn write_conflict_marker( output.write_all(&line) } -/// Parse a conflict marker from a line of a file. The conflict marker must have -/// the correct length (CONFLICT_MARKER_LEN). -fn parse_conflict_marker(line: &[u8]) -> Option { +/// Parse a conflict marker from a line of a file. The conflict marker may have +/// any length (even less than MIN_CONFLICT_MARKER_LEN). +fn parse_conflict_marker_any_len(line: &[u8]) -> Option { let first_byte = *line.first()?; let kind = ConflictMarkerKind::parse_byte(first_byte)?; let len = line.iter().take_while(|&&b| b == first_byte).count(); - if len != CONFLICT_MARKER_LEN { - return None; - } - if let Some(next_byte) = line.get(len) { // If there is a character after the marker, it must be ASCII whitespace if !next_byte.is_ascii_whitespace() { @@ -299,19 +311,74 @@ fn parse_conflict_marker(line: &[u8]) -> Option { } } - Some(kind) + Some(ConflictMarker { kind, len }) +} + +/// Parse a conflict marker, expecting it to have a certain length. Any conflict +/// markers which have a shorter length are ignored. +/// +/// We have to accept longer conflict markers in case conflict resolution +/// results in one of the "fake" conflict marker lines being deleted, since this +/// will cause the minimum length to go down, meaning the existing conflict +/// markers will now appear to be longer than necessary. +/// +/// There is another possible issue, which is that the user could make one of +/// the "fake" conflict markers longer while resolving the conflict (but not +/// long enough that they are recognized by Jujutsu as real conflict markers). +/// This would result in the existing materialized conflict markers no longer +/// being recognized as conflict markers the next time the file is modified, +/// so the file would incorrectly be considered resolved. We may want to handle +/// this case better in the future (e.g. by re-materializing the file when the +/// length of conflict markers changes). +fn parse_conflict_marker(line: &[u8], expected_len: usize) -> Option { + debug_assert!(expected_len >= MIN_CONFLICT_MARKER_LEN); + + parse_conflict_marker_any_len(line) + .filter(|marker| marker.len >= expected_len) + .map(|marker| marker.kind) } +/// Choose an appropriate conflict marker length to use for a file. The conflict +/// marker length will be at least MIN_CONFLICT_MARKER_LENGTH, and is guaranteed +/// to be longer than any existing markers in the file. +pub fn choose_conflict_marker_len>(slices: &Merge) -> usize { + let max_existing_marker_len = slices + .iter() + .map(|slice| slice.as_ref()) + .flat_map(|slice| slice.lines_with_terminator()) + .filter_map(parse_conflict_marker_any_len) + .map(|marker| marker.len.saturating_add(1)) + .max() + .unwrap_or_default(); + + if max_existing_marker_len < MIN_CONFLICT_MARKER_LEN { + // Use the normal conflict marker length if it's unambiguous. + MIN_CONFLICT_MARKER_LEN + } else { + // If there are existing markers in the file, we want to make our conflict + // markers longer than them. We add CONFLICT_MARKER_LEN_INCREMENT to ensure + // they're longer by a visually noticeable amount. + max_existing_marker_len.saturating_add(CONFLICT_MARKER_LEN_INCREMENT) + } +} + +// Returns conflict marker length used for materializing conflicts, if there +// were any conflicts to materialize. pub fn materialize_merge_result>( single_hunk: &Merge, conflict_marker_style: ConflictMarkerStyle, output: &mut dyn Write, -) -> io::Result<()> { +) -> io::Result> { let merge_result = files::merge(single_hunk); match &merge_result { - MergeResult::Resolved(content) => output.write_all(content), + MergeResult::Resolved(content) => { + output.write_all(content)?; + Ok(None) + } MergeResult::Conflict(hunks) => { - materialize_conflict_hunks(hunks, conflict_marker_style, output) + let conflict_marker_len = choose_conflict_marker_len(single_hunk); + materialize_conflict_hunks(hunks, conflict_marker_style, conflict_marker_len, output)?; + Ok(Some(conflict_marker_len)) } } } @@ -324,9 +391,15 @@ pub fn materialize_merge_result_to_bytes>( match merge_result { MergeResult::Resolved(content) => content, MergeResult::Conflict(hunks) => { + let conflict_marker_len = choose_conflict_marker_len(single_hunk); let mut output = Vec::new(); - materialize_conflict_hunks(&hunks, conflict_marker_style, &mut output) - .expect("writing to an in-memory buffer should never fail"); + materialize_conflict_hunks( + &hunks, + conflict_marker_style, + conflict_marker_len, + &mut output, + ) + .expect("writing to an in-memory buffer should never fail"); output.into() } } @@ -335,6 +408,7 @@ pub fn materialize_merge_result_to_bytes>( fn materialize_conflict_hunks( hunks: &[Merge], conflict_marker_style: ConflictMarkerStyle, + conflict_marker_len: usize, output: &mut dyn Write, ) -> io::Result<()> { let num_conflicts = hunks @@ -352,13 +426,21 @@ fn materialize_conflict_hunks( match (conflict_marker_style, hunk.as_slice()) { // 2-sided conflicts can use Git-style conflict markers (ConflictMarkerStyle::Git, [left, base, right]) => { - materialize_git_style_conflict(left, base, right, &conflict_info, output)?; + materialize_git_style_conflict( + left, + base, + right, + &conflict_info, + conflict_marker_len, + output, + )?; } _ => { materialize_jj_style_conflict( hunk, &conflict_info, conflict_marker_style, + conflict_marker_len, output, )?; } @@ -373,22 +455,35 @@ fn materialize_git_style_conflict( base: &[u8], right: &[u8], conflict_info: &str, + conflict_marker_len: usize, output: &mut dyn Write, ) -> io::Result<()> { write_conflict_marker( output, ConflictMarkerKind::ConflictStart, + conflict_marker_len, &format!("Side #1 ({conflict_info})"), )?; output.write_all(left)?; - write_conflict_marker(output, ConflictMarkerKind::GitAncestor, "Base")?; + write_conflict_marker( + output, + ConflictMarkerKind::GitAncestor, + conflict_marker_len, + "Base", + )?; output.write_all(base)?; // VS Code doesn't seem to support any trailing text on the separator line - write_conflict_marker(output, ConflictMarkerKind::GitSeparator, "")?; + write_conflict_marker( + output, + ConflictMarkerKind::GitSeparator, + conflict_marker_len, + "", + )?; output.write_all(right)?; write_conflict_marker( output, ConflictMarkerKind::ConflictEnd, + conflict_marker_len, &format!("Side #2 ({conflict_info} ends)"), )?; @@ -399,44 +494,49 @@ fn materialize_jj_style_conflict( hunk: &Merge, conflict_info: &str, conflict_marker_style: ConflictMarkerStyle, + conflict_marker_len: usize, output: &mut dyn Write, ) -> io::Result<()> { // Write a positive snapshot (side) of a conflict - fn write_side(add_index: usize, data: &[u8], output: &mut dyn Write) -> io::Result<()> { + let write_side = |add_index: usize, data: &[u8], output: &mut dyn Write| { write_conflict_marker( output, ConflictMarkerKind::Add, + conflict_marker_len, &format!("Contents of side #{}", add_index + 1), )?; output.write_all(data) - } + }; // Write a negative snapshot (base) of a conflict - fn write_base(base_str: &str, data: &[u8], output: &mut dyn Write) -> io::Result<()> { + let write_base = |base_str: &str, data: &[u8], output: &mut dyn Write| { write_conflict_marker( output, ConflictMarkerKind::Remove, + conflict_marker_len, &format!("Contents of {base_str}"), )?; output.write_all(data) - } + }; // Write a diff from a negative term to a positive term - fn write_diff( - base_str: &str, - add_index: usize, - diff: &[DiffHunk], - output: &mut dyn Write, - ) -> io::Result<()> { - write_conflict_marker( - output, - ConflictMarkerKind::Diff, - &format!("Changes from {base_str} to side #{}", add_index + 1), - )?; - write_diff_hunks(diff, output) - } + let write_diff = + |base_str: &str, add_index: usize, diff: &[DiffHunk], output: &mut dyn Write| { + write_conflict_marker( + output, + ConflictMarkerKind::Diff, + conflict_marker_len, + &format!("Changes from {base_str} to side #{}", add_index + 1), + )?; + write_diff_hunks(diff, output) + }; - write_conflict_marker(output, ConflictMarkerKind::ConflictStart, conflict_info)?; + write_conflict_marker( + output, + ConflictMarkerKind::ConflictStart, + conflict_marker_len, + conflict_info, + )?; let mut add_index = 0; for (base_index, left) in hunk.removes().enumerate() { // The vast majority of conflicts one actually tries to resolve manually have 1 @@ -489,6 +589,7 @@ fn materialize_jj_style_conflict( write_conflict_marker( output, ConflictMarkerKind::ConflictEnd, + conflict_marker_len, &format!("{conflict_info} ends"), )?; Ok(()) @@ -539,7 +640,11 @@ pub fn materialized_diff_stream<'a>( /// they don't have the expected arity. // TODO: "parse" is not usually the opposite of "materialize", so maybe we // should rename them to "serialize" and "deserialize"? -pub fn parse_conflict(input: &[u8], num_sides: usize) -> Option>> { +pub fn parse_conflict( + input: &[u8], + num_sides: usize, + conflict_marker_len: usize, +) -> Option>> { if input.is_empty() { return None; } @@ -549,7 +654,7 @@ pub fn parse_conflict(input: &[u8], num_sides: usize) -> Option { conflict_start = Some(pos); conflict_start_len = line.len(); @@ -557,7 +662,7 @@ pub fn parse_conflict(input: &[u8], num_sides: usize) -> Option { if let Some(conflict_start_index) = conflict_start { let conflict_body = &input[conflict_start_index + conflict_start_len..pos]; - let hunk = parse_conflict_hunk(conflict_body); + let hunk = parse_conflict_hunk(conflict_body, conflict_marker_len); if hunk.num_sides() == num_sides { let resolved_slice = &input[resolved_start..conflict_start_index]; if !resolved_slice.is_empty() { @@ -589,27 +694,29 @@ pub fn parse_conflict(input: &[u8], num_sides: usize) -> Option Merge { +fn parse_conflict_hunk(input: &[u8], conflict_marker_len: usize) -> Merge { // If the hunk starts with a conflict marker, find its first character let initial_conflict_marker = input .lines_with_terminator() .next() - .and_then(parse_conflict_marker); + .and_then(|line| parse_conflict_marker(line, conflict_marker_len)); match initial_conflict_marker { // JJ-style conflicts must start with one of these 3 conflict marker lines Some(ConflictMarkerKind::Add | ConflictMarkerKind::Remove | ConflictMarkerKind::Diff) => { - parse_jj_style_conflict_hunk(input) + parse_jj_style_conflict_hunk(input, conflict_marker_len) } // Git-style conflicts either must not start with a conflict marker line, or must start with // the "|||||||" conflict marker line (if the first side was empty) - None | Some(ConflictMarkerKind::GitAncestor) => parse_git_style_conflict_hunk(input), + None | Some(ConflictMarkerKind::GitAncestor) => { + parse_git_style_conflict_hunk(input, conflict_marker_len) + } // No other conflict markers are allowed at the start of a hunk Some(_) => Merge::resolved(BString::new(vec![])), } } -fn parse_jj_style_conflict_hunk(input: &[u8]) -> Merge { +fn parse_jj_style_conflict_hunk(input: &[u8], conflict_marker_len: usize) -> Merge { enum State { Diff, Minus, @@ -620,7 +727,7 @@ fn parse_jj_style_conflict_hunk(input: &[u8]) -> Merge { let mut removes = vec![]; let mut adds = vec![]; for line in input.lines_with_terminator() { - match parse_conflict_marker(line) { + match parse_conflict_marker(line, conflict_marker_len) { Some(ConflictMarkerKind::Diff) => { state = State::Diff; removes.push(BString::new(vec![])); @@ -680,7 +787,7 @@ fn parse_jj_style_conflict_hunk(input: &[u8]) -> Merge { } } -fn parse_git_style_conflict_hunk(input: &[u8]) -> Merge { +fn parse_git_style_conflict_hunk(input: &[u8], conflict_marker_len: usize) -> Merge { #[derive(PartialEq, Eq)] enum State { Left, @@ -692,7 +799,7 @@ fn parse_git_style_conflict_hunk(input: &[u8]) -> Merge { let mut base = BString::new(vec![]); let mut right = BString::new(vec![]); for line in input.lines_with_terminator() { - match parse_conflict_marker(line) { + match parse_conflict_marker(line, conflict_marker_len) { Some(ConflictMarkerKind::GitAncestor) => { if state == State::Left { state = State::Base; @@ -748,7 +855,8 @@ pub async fn update_from_content( // copy. let mut old_content = Vec::with_capacity(content.len()); let merge_hunk = extract_as_single_hunk(simplified_file_ids, store, path).await?; - materialize_merge_result(&merge_hunk, conflict_marker_style, &mut old_content).unwrap(); + let conflict_marker_len = + materialize_merge_result(&merge_hunk, conflict_marker_style, &mut old_content).unwrap(); if content == old_content { return Ok(file_ids.clone()); } @@ -758,11 +866,28 @@ pub async fn update_from_content( // the arity of the unsimplified conflicts since such a conflict may be // present in the working copy if written by an earlier version of jj. let (used_file_ids, hunks) = 'hunks: { - if let Some(hunks) = parse_conflict(content, simplified_file_ids.num_sides()) { - break 'hunks (simplified_file_ids, hunks); - }; + // If the simplified conflict was already resolved, we won't know the conflict + // marker length to use, since it was never calculated. However in that case, + // the file should already not contain any conflicts since num_sides() is 1. + if let Some(conflict_marker_len) = conflict_marker_len { + if let Some(hunks) = parse_conflict( + content, + simplified_file_ids.num_sides(), + conflict_marker_len, + ) { + break 'hunks (simplified_file_ids, hunks); + }; + } + // This case is only possible if the working copy was materialized in an earlier + // version of jj. All conflict markers were of length 7 in that version, so it's + // safe to use MIN_CONFLICT_MARKER_LEN for parsing if the actual conflict marker + // length is unknown. if simplified_file_ids.num_sides() != file_ids.num_sides() { - if let Some(hunks) = parse_conflict(content, file_ids.num_sides()) { + if let Some(hunks) = parse_conflict( + content, + file_ids.num_sides(), + conflict_marker_len.unwrap_or(MIN_CONFLICT_MARKER_LEN), + ) { break 'hunks (file_ids, hunks); }; }; diff --git a/lib/tests/test_conflicts.rs b/lib/tests/test_conflicts.rs index e3c4a00f6f..4cd436e19b 100644 --- a/lib/tests/test_conflicts.rs +++ b/lib/tests/test_conflicts.rs @@ -19,6 +19,7 @@ use jj_lib::conflicts::materialize_merge_result_to_bytes; use jj_lib::conflicts::parse_conflict; use jj_lib::conflicts::update_from_content; use jj_lib::conflicts::ConflictMarkerStyle; +use jj_lib::conflicts::MIN_CONFLICT_MARKER_LEN; use jj_lib::merge::Merge; use jj_lib::repo::Repo; use jj_lib::repo_path::RepoPath; @@ -505,7 +506,7 @@ fn test_materialize_parse_roundtrip() { // The first add should always be from the left side insta::assert_debug_snapshot!( - parse_conflict(materialized.as_bytes(), conflict.num_sides()), + parse_conflict(materialized.as_bytes(), conflict.num_sides(), MIN_CONFLICT_MARKER_LEN), @r###" Some( [ @@ -628,7 +629,8 @@ fn test_materialize_conflict_no_newlines_at_eof() { // BUG(#3968): These conflict markers cannot be parsed insta::assert_debug_snapshot!(parse_conflict( materialized.as_bytes(), - conflict.num_sides() + conflict.num_sides(), + MIN_CONFLICT_MARKER_LEN ),@"None"); } @@ -795,7 +797,8 @@ line 3 line 4 line 5 "}, - 2 + 2, + 7 ), None ); @@ -817,7 +820,8 @@ fn test_parse_conflict_simple() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), @r###" Some( @@ -853,7 +857,8 @@ fn test_parse_conflict_simple() { >>>>>>> More and more text line 5 "}, - 2 + 2, + 7 ), @r###" Some( @@ -892,7 +897,8 @@ fn test_parse_conflict_simple() { >>>>>>> Random text line 5 "}, - 2 + 2, + 7 ), @r#" Some( @@ -931,7 +937,8 @@ fn test_parse_conflict_simple() { >>>>>>> Random text line 5 "}, - 2 + 2, + 7 ), @r#" Some( @@ -969,7 +976,8 @@ fn test_parse_conflict_simple() { >>>>>>> End line 5 "}, - 2 + 2, + 7 ), @r#" Some( @@ -1005,7 +1013,8 @@ fn test_parse_conflict_simple() { >>>>>>> End line 5 "}, - 2 + 2, + 7 ), @r#" Some( @@ -1027,8 +1036,7 @@ fn test_parse_conflict_simple() { ) "# ); - // The conflict markers are too long and shouldn't parse (though we may - // decide to change this in the future) + // Conflict markers are allowed to be longer than necessary insta::assert_debug_snapshot!( parse_conflict(indoc! {b" line 1 @@ -1043,9 +1051,28 @@ fn test_parse_conflict_simple() { >>>>>>>>>>> line 5 "}, - 2 + 2, + 7 ), - @"None" + @r#" + Some( + [ + Resolved( + "line 1\n", + ), + Conflicted( + [ + "line 2\nleft\nline 4\n", + "line 2\nline 3\nline 4\n", + "right\n", + ], + ), + Resolved( + "line 5\n", + ), + ], + ) + "# ); } @@ -1071,7 +1098,8 @@ fn test_parse_conflict_multi_way() { >>>>>>> line 5 "}, - 3 + 3, + 7 ), @r###" Some( @@ -1114,7 +1142,8 @@ fn test_parse_conflict_multi_way() { >>>>>>> Random text line 5 "}, - 3 + 3, + 7 ), @r###" Some( @@ -1160,7 +1189,8 @@ fn test_parse_conflict_multi_way() { >>>>>>> Random text line 5 "}, - 3 + 3, + 7 ), @r#" Some( @@ -1203,7 +1233,8 @@ fn test_parse_conflict_crlf_markers() { >>>>>>>\r line 5\r "}, - 2 + 2, + 7 ), @r#" Some( @@ -1248,7 +1279,8 @@ fn test_parse_conflict_diff_stripped_whitespace() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), @r#" Some( @@ -1290,7 +1322,8 @@ fn test_parse_conflict_wrong_arity() { >>>>>>> line 5 "}, - 3 + 3, + 7 ), None ); @@ -1311,7 +1344,8 @@ fn test_parse_conflict_malformed_missing_removes() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1334,7 +1368,8 @@ fn test_parse_conflict_malformed_marker() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1358,7 +1393,8 @@ fn test_parse_conflict_malformed_diff() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1380,7 +1416,8 @@ fn test_parse_conflict_snapshot_missing_header() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1400,7 +1437,8 @@ fn test_parse_conflict_wrong_git_style() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1422,7 +1460,8 @@ fn test_parse_conflict_git_reordered_headers() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1448,7 +1487,8 @@ fn test_parse_conflict_git_too_many_sides() { >>>>>>> line 5 "}, - 3 + 3, + 7 ), None ); @@ -1471,7 +1511,8 @@ fn test_parse_conflict_mixed_header_styles() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1489,7 +1530,8 @@ fn test_parse_conflict_mixed_header_styles() { >>>>>>> line 5 "}, - 2 + 2, + 7 ), None ); @@ -1506,7 +1548,8 @@ fn test_parse_conflict_mixed_header_styles() { >>>>>>> Conflict 1 of 1 ends line 5 "}, - 2 + 2, + 7 ), @r#" Some( @@ -1541,7 +1584,8 @@ fn test_parse_conflict_mixed_header_styles() { >>>>>>> Side #2 (Conflict 1 of 1 ends) line 5 "}, - 2 + 2, + 7 ), @r#" Some(