From ffe27c575ee222647cec98ba52f1c03a19055372 Mon Sep 17 00:00:00 2001 From: Wilfred Hughes Date: Sat, 20 Jul 2024 15:54:27 -0700 Subject: [PATCH] Ensure line splitting distinguishes "foo" and "foo\n" We rely on being able to split lines and rejoin them to obtain the original string. `str::lines()` in the Rust stdlib does not have this property. This was causing crashes in word-diffing on textual diffing, where code paths differed on the number of lines they thought a string had. This was broken in 8b842387a16e5ae4a79aaf6befb0bac1894882d0. Fixes #688. --- CHANGELOG.md | 5 ++++ sample_files/big_text_hunk_1.txt | 7 +++++ sample_files/big_text_hunk_2.txt | 49 ++++++++++++++++++++++++++++++++ sample_files/compare.expected | 7 +++-- src/display/inline.rs | 18 ++++++++---- src/display/side_by_side.rs | 39 ++++++++++++++++++------- src/display/style.rs | 3 +- src/lines.rs | 49 ++++++++++++++++++++++++++++++++ src/parse/guess_language.rs | 8 ++++-- src/parse/syntax.rs | 6 ++-- 10 files changed, 166 insertions(+), 25 deletions(-) create mode 100644 sample_files/big_text_hunk_1.txt create mode 100644 sample_files/big_text_hunk_2.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 86a6d69191..6b37d8c604 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ ## 0.59 (unreleased) +### Diffing + +Fixed crash on some textual files where a single change contained more than +1,000 words. + ### Parsing Added support for device tree and F#. diff --git a/sample_files/big_text_hunk_1.txt b/sample_files/big_text_hunk_1.txt new file mode 100644 index 0000000000..9f2293d29f --- /dev/null +++ b/sample_files/big_text_hunk_1.txt @@ -0,0 +1,7 @@ +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks= +github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0= +github.com/naoina/toml v0.1.1 h1:PT/lllxVVN0gzzSqSlHEmP8MJB4MY2U7STGxiouV4X8= +github.com/naoina/toml v0.1.1/go.mod h1:NBIhNtsFMo3G2szEBne+bO4gS192HuIYRqfvOWb4i1E= +github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd h1:625/bJvSNfQrzzK5ttnUqMqnVe8M5MILmf5ZRGgeeDY= +github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd/go.mod h1:1WNBiOZtZQLpVAyu0iTduoJL9hEsMloAK5XWrtW0xdY= diff --git a/sample_files/big_text_hunk_2.txt b/sample_files/big_text_hunk_2.txt new file mode 100644 index 0000000000..dfb2e4a3ee --- /dev/null +++ b/sample_files/big_text_hunk_2.txt @@ -0,0 +1,49 @@ +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/naoina/go-stringutil v0.1.0 h1:rCUeRUHjBjGTSHl0VC00jUPLz8/F9dDzYI70Hzifhks= +github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0= +github.com/naoina/toml v0.1.1 h1:PT/lllxVVN0gzzSqSlHEmP8MJB4MY2U7STGxiouV4X8= +github.com/naoina/toml v0.1.1/go.mod h1:NBIhNtsFMo3G2szEBne+bO4gS192HuIYRqfvOWb4i1E= +github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd h1:625/bJvSNfQrzzK5ttnUqMqnVe8M5MILmf5ZRGgeeDY= +github.com/streadway/amqp v0.0.0-20160311215503-2e25825abdbd/go.mod h1:1WNBiOZtZQLpVAyu0iTduoJL9hEsMloAK5XWrtW0xdY= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20220708220712-1185a9018129/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.7.0 h1:qe6s0zUXlPX80/dITx3440hWZ7GwMwgDDyrSGTPJG/g= +golang.org/x/oauth2 v0.7.0/go.mod h1:hPLQkd9LyjfXTiRohC/41GhcFqxisoUQ99sCUOHO9x4= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= diff --git a/sample_files/compare.expected b/sample_files/compare.expected index 42e6e6b948..d7b192402c 100644 --- a/sample_files/compare.expected +++ b/sample_files/compare.expected @@ -19,6 +19,9 @@ sample_files/b2_math_1.h sample_files/b2_math_2.h sample_files/bad_combine_1.rs sample_files/bad_combine_2.rs f5051bf7d2b8afa3a677388cbd458891 - +sample_files/big_text_hunk_1.txt sample_files/big_text_hunk_2.txt +fd0c8912c094097f82c6b29ae66fb912 - + sample_files/change_outer_1.el sample_files/change_outer_2.el 2b9334a4cc72da63bba28eff958f0038 - @@ -140,7 +143,7 @@ sample_files/makefile_1.mk sample_files/makefile_2.mk d0572210b5121ce68ac0ce45e43b922b - sample_files/many_newlines_1.txt sample_files/many_newlines_2.txt -615de4b145b7b161e4fb285728280ed1 - +52ca05855e520876479e6f608c5e7831 - sample_files/metadata_1.clj sample_files/metadata_2.clj 4b58ce366467c8cca46db53508e81323 - @@ -293,5 +296,5 @@ sample_files/yaml_1.yaml sample_files/yaml_2.yaml f068239fc7bade0e6de96d81136c1ac5 - sample_files/zig_1.zig sample_files/zig_2.zig -4516796003b81f35bfa57d84bb7c0cbe - +e36d1ea126b8b68e3344434bb63f205e - diff --git a/src/display/inline.rs b/src/display/inline.rs index dc34248d5e..3973dba538 100644 --- a/src/display/inline.rs +++ b/src/display/inline.rs @@ -2,10 +2,12 @@ use crate::{ constants::Side, - display::context::{calculate_after_context, calculate_before_context, opposite_positions}, - display::hunks::Hunk, - display::style::{self, apply_colors, apply_line_number_color}, - lines::{format_line_num, MaxLine}, + display::{ + context::{calculate_after_context, calculate_before_context, opposite_positions}, + hunks::Hunk, + style::{self, apply_colors, apply_line_number_color}, + }, + lines::{format_line_num, split_on_newlines, MaxLine}, options::DisplayOptions, parse::syntax::MatchedPos, summary::FileFormat, @@ -43,8 +45,12 @@ pub(crate) fn print( ) } else { ( - lhs_src.lines().map(|s| format!("{}\n", s)).collect(), - rhs_src.lines().map(|s| format!("{}\n", s)).collect(), + split_on_newlines(lhs_src) + .map(|s| format!("{}\n", s)) + .collect(), + split_on_newlines(rhs_src) + .map(|s| format!("{}\n", s)) + .collect(), ) }; diff --git a/src/display/side_by_side.rs b/src/display/side_by_side.rs index 914bae43b4..afec92d466 100644 --- a/src/display/side_by_side.rs +++ b/src/display/side_by_side.rs @@ -11,14 +11,16 @@ use owo_colors::{OwoColorize, Style}; use crate::{ constants::Side, - display::context::all_matched_lines_filled, - display::hunks::{matched_lines_indexes_for_hunk, Hunk}, - display::style::{ - self, apply_colors, apply_line_number_color, color_positions, novel_style, replace_tabs, - split_and_apply, BackgroundColor, + display::{ + context::all_matched_lines_filled, + hunks::{matched_lines_indexes_for_hunk, Hunk}, + style::{ + self, apply_colors, apply_line_number_color, color_positions, novel_style, + replace_tabs, split_and_apply, BackgroundColor, + }, }, hash::DftHashMap, - lines::format_line_num, + lines::{format_line_num, split_on_newlines}, options::{DisplayMode, DisplayOptions}, parse::syntax::{zip_pad_shorter, MatchedPos}, summary::FileFormat, @@ -338,8 +340,12 @@ pub(crate) fn print( ) } else { ( - lhs_src.lines().map(|s| format!("{}\n", s)).collect(), - rhs_src.lines().map(|s| format!("{}\n", s)).collect(), + split_on_newlines(lhs_src) + .map(|s| format!("{}\n", s)) + .collect(), + split_on_newlines(rhs_src) + .map(|s| format!("{}\n", s)) + .collect(), ) }; @@ -401,8 +407,21 @@ pub(crate) fn print( let mut prev_lhs_line_num = None; let mut prev_rhs_line_num = None; - let lhs_lines = lhs_src.lines().collect::>(); - let rhs_lines = rhs_src.lines().collect::>(); + let mut lhs_lines = split_on_newlines(lhs_src).collect::>(); + let mut rhs_lines = split_on_newlines(rhs_src).collect::>(); + + // If "foo" is one line, is "foo\n" two lines? Generally we want + // to care about newlines when deciding whether content differs. + // + // Ending a file with a trailing newline is extremely common + // though. If both files have a trailing newline, consider "foo\n" + // to be "foo" so we don't end up displaying a blank line on both + // sides. + if lhs_lines.last() == Some(&"") && rhs_lines.last() == Some(&"") { + lhs_lines.pop(); + rhs_lines.pop(); + } + let matched_lines = all_matched_lines_filled(lhs_mps, rhs_mps, &lhs_lines, &rhs_lines); let mut matched_lines_to_print = &matched_lines[..]; diff --git a/src/display/style.rs b/src/display/style.rs index ff63c4da2b..d65ef22459 100644 --- a/src/display/style.rs +++ b/src/display/style.rs @@ -7,6 +7,7 @@ use line_numbers::SingleLineSpan; use owo_colors::{OwoColorize, Style}; use unicode_width::{UnicodeWidthChar, UnicodeWidthStr}; +use crate::lines::split_on_newlines; use crate::parse::syntax::StringKind; use crate::{ constants::Side, @@ -401,7 +402,7 @@ pub(crate) fn apply_colors( positions: &[MatchedPos], ) -> Vec { let styles = color_positions(side, background, syntax_highlight, file_format, positions); - let lines = s.lines().collect::>(); + let lines = split_on_newlines(s).collect::>(); style_lines(&lines, &styles) } diff --git a/src/lines.rs b/src/lines.rs index f6b1fb8a72..85c5bb119f 100644 --- a/src/lines.rs +++ b/src/lines.rs @@ -32,6 +32,21 @@ impl> MaxLine for S { } } +/// Split `s` on \n or \r\n. Always returns a non-empty vec. Each line +/// in the vec does not include the trailing newline. +/// +/// This differs from `str::lines`, which considers `""` to be zero +/// lines and `"foo\n"` to be one line. +pub(crate) fn split_on_newlines(s: &str) -> impl Iterator { + s.split('\n').map(|l| { + if let Some(l) = l.strip_suffix('\r') { + l + } else { + l + } + }) +} + pub(crate) fn is_all_whitespace(s: &str) -> bool { s.chars().all(|c| c.is_whitespace()) } @@ -66,6 +81,40 @@ mod tests { assert_eq!(line.max_line().0, 1); } + #[test] + fn test_split_line_empty() { + assert_eq!(split_on_newlines("").collect::>(), vec![""]); + } + + #[test] + fn test_split_line_single() { + assert_eq!(split_on_newlines("foo").collect::>(), vec!["foo"]); + } + + #[test] + fn test_split_line_with_newline() { + assert_eq!( + split_on_newlines("foo\nbar").collect::>(), + vec!["foo", "bar"] + ); + } + + #[test] + fn test_split_line_with_crlf() { + assert_eq!( + split_on_newlines("foo\r\nbar").collect::>(), + vec!["foo", "bar"] + ); + } + + #[test] + fn test_split_line_with_trailing_newline() { + assert_eq!( + split_on_newlines("foo\nbar\n").collect::>(), + vec!["foo", "bar", ""] + ); + } + #[test] fn test_is_all_whitespace() { assert!(is_all_whitespace(" \n\t")); diff --git a/src/parse/guess_language.rs b/src/parse/guess_language.rs index eef2a0df41..5f0fb1d348 100644 --- a/src/parse/guess_language.rs +++ b/src/parse/guess_language.rs @@ -179,6 +179,8 @@ pub(crate) fn language_name(language: Language) -> &'static str { use Language::*; +use crate::lines::split_on_newlines; + /// File globs that identify languages based on the file path. pub(crate) fn language_globs(language: Language) -> Vec { let glob_strs: &'static [&'static str] = match language { @@ -420,7 +422,7 @@ fn looks_like_hacklang(path: &Path, src: &str) -> bool { fn looks_like_objc(path: &Path, src: &str) -> bool { if let Some(extension) = path.extension() { if extension == "h" { - return src.lines().take(100).any(|line| { + return split_on_newlines(src).take(100).any(|line| { ["#import", "@interface", "@protocol"] .iter() .any(|keyword| line.starts_with(keyword)) @@ -497,7 +499,7 @@ fn from_emacs_mode_header(src: &str) -> Option { // Emacs allows the mode header to occur on the second line if the // first line is a shebang. - for line in src.lines().take(2) { + for line in split_on_newlines(src).take(2) { let mode_name: String = match (MODE_RE.captures(line), SHORTHAND_RE.captures(line)) { (Some(cap), _) | (_, Some(cap)) => cap[1].into(), _ => "".into(), @@ -559,7 +561,7 @@ fn from_shebang(src: &str) -> Option { lazy_static! { static ref RE: Regex = Regex::new(r"#! *(?:/usr/bin/env )?([^ ]+)").unwrap(); } - if let Some(first_line) = src.lines().next() { + if let Some(first_line) = split_on_newlines(src).next() { if let Some(cap) = RE.captures(first_line) { let interpreter_path = Path::new(&cap[1]); if let Some(name) = interpreter_path.file_name() { diff --git a/src/parse/syntax.rs b/src/parse/syntax.rs index bdde5934d5..65167fda32 100644 --- a/src/parse/syntax.rs +++ b/src/parse/syntax.rs @@ -9,6 +9,7 @@ use line_numbers::SingleLineSpan; use typed_arena::Arena; use self::Syntax::*; +use crate::lines::split_on_newlines; use crate::words::split_words_and_numbers; use crate::{ diff::changes::ChangeKind, @@ -408,9 +409,8 @@ fn set_content_id(nodes: &[&Syntax], existing: &mut DftHashMap) .. } => { let is_comment = *highlight == AtomKind::Comment; - let clean_content = if is_comment && content.lines().count() > 1 { - content - .lines() + let clean_content = if is_comment && split_on_newlines(content).count() > 1 { + split_on_newlines(content) .map(|l| l.trim_start()) .collect::>() .join("\n")