Skip to content

Commit 4c8f444

Browse files
Improve time complexity of tokenization regex used in diffSentences (#580)
1 parent 5aa8383 commit 4c8f444

File tree

4 files changed

+45
-3
lines changed

4 files changed

+45
-3
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform
6060

6161
Returns a list of [change objects](#change-objects).
6262

63-
* `Diff.diffSentences(oldStr, newStr[, options])` - diffs two blocks of text, treating each sentence as a token. The characters `.`, `!`, and `?`, when followed by whitespace, are treated as marking the end of a sentence; nothing else is considered to mark a sentence end.
63+
* `Diff.diffSentences(oldStr, newStr[, options])` - diffs two blocks of text, treating each sentence, and the whitespace between each pair of sentences, as a token. The characters `.`, `!`, and `?`, when followed by whitespace, are treated as marking the end of a sentence; nothing else besides the end of the string is considered to mark a sentence end.
6464

6565
(For more sophisticated detection of sentence breaks, including support for non-English punctuation, consider instead tokenizing with an [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) with `granularity: 'sentence'` and passing the result to `Diff.diffArrays`.)
6666

release-notes.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Release Notes
22

3+
## 8.0.0
4+
5+
- [#580](https://github.com/kpdecker/jsdiff/pull/580) Multiple tweaks to `diffSentences`:
6+
* tokenization no longer takes quadratic time on pathological inputs (reported as a ReDOS vulnerability by Snyk); is now linear instead
7+
* the final sentence in the string is now handled the same by the tokenizer regardless of whether it has a trailing punctuation mark or not. (Previously, "foo. bar." tokenized to `["foo.", " ", "bar."]` but "foo. bar" tokenized to `["foo.", " bar"]` - i.e. whether the space between sentences was treated as a separate token depended upon whether the final sentence had trailing punctuation or not. This was arbitrary and surprising; it is no longer the case.)
8+
* in a string that starts with a sentence end, like "! hello.", the "!" is now treated as a separate sentence
9+
* the README now correctly documents the tokenization behaviour (it was wrong before)
10+
311
## 7.0.0
412

513
Just a single (breaking) bugfix, undoing a behaviour change introduced accidentally in 6.0.0:

src/diff/sentence.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import Diff from './base';
33

44
export const sentenceDiff = new Diff();
55
sentenceDiff.tokenize = function(value) {
6-
return value.split(/(\S.+?[.!?])(?=\s+|$)/);
6+
return value.split(/(?<=[.!?])(\s+|$)/);
77
};
88

99
export function diffSentences(oldStr, newStr, callback) { return sentenceDiff.diff(oldStr, newStr, callback); }

test/diff/sentence.js

+35-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,43 @@
1-
import {diffSentences} from '../../lib/diff/sentence';
1+
import {diffSentences, sentenceDiff} from '../../lib/diff/sentence';
22
import {convertChangesToXML} from '../../lib/convert/xml';
33

44
import {expect} from 'chai';
55

66
describe('diff/sentence', function() {
7+
describe('tokenize', function() {
8+
it('should split on whitespace after a punctuation mark, and keep the whitespace as a token', function() {
9+
expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(''))).to.eql([]);
10+
11+
expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(
12+
'Foo bar baz! Qux wibbly wobbly bla? \n\tYayayaya!Yayayaya!Ya! Yes!!!!! Blub'
13+
))).to.eql([
14+
'Foo bar baz!',
15+
' ',
16+
'Qux wibbly wobbly bla?',
17+
' \n\t',
18+
'Yayayaya!Yayayaya!Ya!',
19+
' ',
20+
'Yes!!!!!',
21+
' ',
22+
'Blub'
23+
]);
24+
25+
expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(
26+
'! Hello there.'
27+
))).to.eql([
28+
'!',
29+
' ',
30+
'Hello there.'
31+
]);
32+
33+
expect(sentenceDiff.removeEmpty(sentenceDiff.tokenize(
34+
' foo bar baz.'
35+
))).to.eql([
36+
' foo bar baz.'
37+
]);
38+
});
39+
});
40+
741
describe('#diffSentences', function() {
842
it('Should diff Sentences', function() {
943
const diffResult = diffSentences('New Value.', 'New ValueMoreData.');

0 commit comments

Comments
 (0)