mgeisler · phy1729 · Dec 20, 2024
diff --git a/src/core.rs b/src/core.rs
@@ -270,6 +270,16 @@ impl<'a> Word<'a> {
         }
     }
 
+    pub(crate) fn from_unicode(word: &str) -> Word<'_> {
+        let (text, trailing) = word.split_at(word.trim_end().len()); // split off trailing whitespace
+        Word {
+            width: display_width(text), // measured on the trimmed text only
+            word: text,
+            whitespace: trailing, // whatever `trim_end` stripped, e.g. spaces and newlines
+            penalty: "",
+        }
+    }
+
     /// Break this word into smaller words with a width of at most
     /// `line_width`. The whitespace and penalty from this `Word` is
     /// added to the last piece.

diff --git a/src/word_separators.rs b/src/word_separators.rs
@@ -299,14 +299,14 @@ fn find_words_unicode_break_properties<'a>(
     Box::new(std::iter::from_fn(move || {
         for (idx, _) in opportunities.by_ref() {
             if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
-                let word = Word::from(&line[start..orig_idx]);
+                let word = Word::from_unicode(&line[start..orig_idx]);
                 start = orig_idx;
                 return Some(word);
             }
         }
 
         if start < line.len() {
-            let word = Word::from(&line[start..]);
+            let word = Word::from_unicode(&line[start..]);
             start = line.len();
             return Some(word);
         }
@@ -327,18 +327,14 @@ mod tests {
         };
     }
 
-    fn to_words(words: Vec<&str>) -> Vec<Word<'_>> {
-        words.into_iter().map(Word::from).collect()
-    }
-
     macro_rules! test_find_words {
         ($ascii_name:ident,
          $unicode_name:ident,
          $([ $line:expr, $ascii_words:expr, $unicode_words:expr ]),+) => {
             #[test]
             fn $ascii_name() {
                 $(
-                    let expected_words = to_words($ascii_words.to_vec());
+                    let expected_words: Vec<_> = $ascii_words.into_iter().map(Word::from).collect();
                     let actual_words = WordSeparator::AsciiSpace
                         .find_words($line)
                         .collect::<Vec<_>>();
@@ -350,7 +346,7 @@ mod tests {
             #[cfg(feature = "unicode-linebreak")]
             fn $unicode_name() {
                 $(
-                    let expected_words = to_words($unicode_words.to_vec());
+                    let expected_words: Vec<_> = $unicode_words.into_iter().map(Word::from_unicode).collect();
                     let actual_words = WordSeparator::UnicodeBreakProperties
                         .find_words($line)
                         .collect::<Vec<_>>();
@@ -478,4 +474,27 @@ mod tests {
         #[cfg(not(feature = "unicode-linebreak"))]
         assert!(matches!(WordSeparator::new(), AsciiSpace));
     }
+
+    #[test]
+    #[cfg(feature = "unicode-linebreak")]
+    fn unicode_trailing_whitespace_newline() {
+        // Two spaces followed by a newline after "foo": all three characters are
+        // expected in the `whitespace` field, while `width` stays at 3 for "foo".
+        let input = "foo  \nbar";
+        let expected = vec![
+            Word {
+                word: "foo",
+                width: 3,
+                whitespace: "  \n",
+                penalty: "",
+            },
+            Word {
+                word: "bar",
+                width: 3,
+                whitespace: "",
+                penalty: "",
+            },
+        ];
+        assert_iter_eq!(UnicodeBreakProperties.find_words(input), expected);
+    }
 }