Improve time complexity of tokenization regex used in diffSentences (#580)

ExplodingCabbage · web-flow · commit 4c8f44474630 · 2025-02-14T16:39:58.000Z
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform
 
     Returns a list of [change objects](#change-objects).
 
-* `Diff.diffSentences(oldStr, newStr[, options])` - diffs two blocks of text, treating each sentence as a token. The characters `.`, `!`, and `?`, when followed by whitespace, are treated as marking the end of a sentence; nothing else is considered to mark a sentence end.
+* `Diff.diffSentences(oldStr, newStr[, options])` - diffs two blocks of text, treating each sentence, and the whitespace between each pair of sentences, as a token. The characters `.`, `!`, and `?`, when followed by whitespace, are treated as marking the end of a sentence; nothing else besides the end of the string is considered to mark a sentence end.
 
   (For more sophisticated detection of sentence breaks, including support for non-English punctuation, consider instead tokenizing with an [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) with `granularity: 'sentence'` and passing the result to `Diff.diffArrays`.)
 
diff --git a/release-notes.md b/release-notes.md
@@ -1,5 +1,13 @@
 # Release Notes
 
+## 8.0.0
+
+- [#580](https://github.com/kpdecker/jsdiff/pull/580) Multiple tweaks to `diffSentences`:
+  * tokenization no longer takes quadratic time on pathological inputs (reported as a ReDoS vulnerability by Snyk); it is now linear instead
+  * the final sentence in the string is now handled the same by the tokenizer regardless of whether it has a trailing punctuation mark or not. (Previously, "foo. bar." tokenized to `["foo.", " ", "bar."]` but "foo. bar" tokenized to `["foo.", " bar"]` - i.e. whether the space between sentences was treated as a separate token depended upon whether the final sentence had trailing punctuation or not. This was arbitrary and surprising; it is no longer the case.)
+  * in a string that starts with a sentence end, like "! hello.", the "!" is now treated as a separate sentence
+  * the README now correctly documents the tokenization behaviour (it was wrong before)
+
 ## 7.0.0
 
 Just a single (breaking) bugfix, undoing a behaviour change introduced accidentally in 6.0.0:
diff --git a/src/diff/sentence.js b/src/diff/sentence.js
@@ -3,7 +3,7 @@ import Diff from './base';
 
 export const sentenceDiff = new Diff();
 sentenceDiff.tokenize = function(value) {
-  return value.split(/(\S.+?[.!?])(?=\s+|$)/);
+  return value.split(/(?<=[.!?])(\s+|$)/);
 };
 
 export function diffSentences(oldStr, newStr, callback) { return sentenceDiff.diff(oldStr, newStr, callback); }
diff --git a/test/diff/sentence.js b/test/diff/sentence.js
@@ -1,9 +1,43 @@
-import {diffSentences} from '../../lib/diff/sentence';
+import {diffSentences, sentenceDiff} from '../../lib/diff/sentence';
 import {convertChangesToXML} from '../../lib/convert/xml';
 
 import {expect} from 'chai';
 
 describe('diff/sentence', function() {
+  describe('tokenize', function() {
+    it('should split on whitespace after a punctuation mark, and keep the whitespace as a token', function() {
+      expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(''))).to.eql([]);
+
+      expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(
+          'Foo bar baz! Qux wibbly wobbly bla? \n\tYayayaya!Yayayaya!Ya! Yes!!!!! Blub'
+      ))).to.eql([
+        'Foo bar baz!',
+        ' ',
+        'Qux wibbly wobbly bla?',
+        ' \n\t',
+        'Yayayaya!Yayayaya!Ya!',
+        ' ',
+        'Yes!!!!!',
+        ' ',
+        'Blub'
+      ]);
+
+      expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(
+        '! Hello there.'
+      ))).to.eql([
+        '!',
+        ' ',
+        'Hello there.'
+      ]);
+
+      expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(
+        '    foo bar baz.'
+      ))).to.eql([
+        '    foo bar baz.'
+      ]);
+    });
+  });
+
   describe('#diffSentences', function() {
     it('Should diff Sentences', function() {
       const diffResult = diffSentences('New Value.', 'New ValueMoreData.');