diff --git a/CHANGELOG.md b/CHANGELOG.md index 3717609143..b994e2a485 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ### Parsing +File detection now supports Windows-1252 encoded text (an extension of +ISO-8859-1), and is stricter about UTF-16 detection. + Updated to the latest tree-sitter parser for Make and YAML. ## 0.62 (released 20th December 2024) diff --git a/Cargo.lock b/Cargo.lock index 46d0e1ffbb..69e34b9552 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,6 +247,7 @@ dependencies = [ "cc", "clap", "crossterm", + "encoding_rs", "glob", "hashbrown", "humansize", @@ -317,6 +318,15 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "env_logger" version = "0.10.2" diff --git a/Cargo.toml b/Cargo.toml index e60411ef54..6955d84c0d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ tree-sitter-lua = "0.2.0" tree-sitter-xml = "0.7.0" tree-sitter-make = "1.1.1" tree-sitter-yaml = "0.7.0" +encoding_rs = "0.8.35" [dev-dependencies] # assert_cmd 2.0.10 requires predicates 3. 
diff --git a/sample_files/compare.expected b/sample_files/compare.expected index 74ceef1ed1..806ec45fcd 100644 --- a/sample_files/compare.expected +++ b/sample_files/compare.expected @@ -298,6 +298,9 @@ ca98b4d14fc21e0f04cf24aeb3d2526c - sample_files/whitespace_1.tsx sample_files/whitespace_2.tsx ac8b1a89ac26333f2d4e9433b2ca3958 - +sample_files/windows_1251_1.txt sample_files/windows_1251_2.txt +d41d8cd98f00b204e9800998ecf8427e - + sample_files/xml_1.xml sample_files/xml_2.xml e629cbd2e721fd249c7ce1626f17e953 - diff --git a/sample_files/windows_1251_1.txt b/sample_files/windows_1251_1.txt new file mode 100644 index 0000000000..4e6ed73f45 --- /dev/null +++ b/sample_files/windows_1251_1.txt @@ -0,0 +1 @@ +Muß können: löst muß daß Heißt löscht führen für muß ähnlich diff --git a/sample_files/windows_1251_2.txt b/sample_files/windows_1251_2.txt new file mode 100644 index 0000000000..41d35bd3d3 --- /dev/null +++ b/sample_files/windows_1251_2.txt @@ -0,0 +1 @@ +Muß können: löst muß daß Heißt löscht führen für muß ähmlich diff --git a/src/files.rs b/src/files.rs index b7702adec7..ec3ac6592b 100644 --- a/src/files.rs +++ b/src/files.rs @@ -215,7 +215,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind { .take(5000) .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0') .count(); - if num_utf16_invalid <= 5 { + if num_utf16_invalid <= 1 { info!( "Input file is mostly valid UTF-16 (invalid characters: {})", num_utf16_invalid @@ -223,6 +223,13 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind { return ProbableFileKind::Text(utf16_string); } + // If the input bytes are valid Windows-1252 (an extension of + // ISO-8859-1 aka Latin 1), treat them as such. + let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes); + if !saw_malformed { + return ProbableFileKind::Text(latin1_str.to_string()); + } + ProbableFileKind::Binary }