Skip to content

Commit

Permalink
HTMLDiff Performance enhancement (#54)
Browse files Browse the repository at this point in the history
* Trying to squeeze more performance out of the HTMLDiffer

* Initial array_slice_cached. cacheToken key breaks the differ

* Added setters for the oldWords and newWords to make sure the cache is reset

* Documentation

* Expected output

* AbstractTest, for DRY functions

* Fixed scrutinizer complaints
  • Loading branch information
Sven Hagemann authored and jschroed91 committed Jun 6, 2016
1 parent 11d3493 commit 272a3ba
Show file tree
Hide file tree
Showing 11 changed files with 262 additions and 34 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ vendor/
/demo/bower_components
/demo/node_modules
.DS_Store
.idea
30 changes: 26 additions & 4 deletions lib/Caxy/HtmlDiff/AbstractDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ abstract class AbstractDiff
*/
protected $purifierConfig = null;

/**
* @see array_slice_cached();
* @var bool
*/
protected $resetCache = false;

/**
* AbstractDiff constructor.
*
Expand Down Expand Up @@ -113,8 +119,6 @@ abstract public function build();
*/
public function initPurifier($defaultPurifierSerializerCache = null)
{
$HTMLPurifierConfig = null;

if (null !== $this->purifierConfig) {
$HTMLPurifierConfig = $this->purifierConfig;
} else {
Expand Down Expand Up @@ -423,8 +427,26 @@ protected function purifyHtml($html)

protected function splitInputsToWords()
{
$this->oldWords = $this->convertHtmlToListOfWords($this->explode($this->oldText));
$this->newWords = $this->convertHtmlToListOfWords($this->explode($this->newText));
$this->setOldWords($this->convertHtmlToListOfWords($this->explode($this->oldText)));
$this->setNewWords($this->convertHtmlToListOfWords($this->explode($this->newText)));
}

/**
 * Stores the word list of the "old" text and marks the slice cache as stale.
 *
 * The stale flag is read by array_slice_cached(), which drops its memoised
 * slice on its next call when the flag is set.
 *
 * @param array $oldWords Word list to keep in $this->oldWords.
 */
protected function setOldWords(array $oldWords)
{
    $this->oldWords = $oldWords;
    $this->resetCache = true;
}

/**
 * Stores the word list of the "new" text and marks the slice cache as stale.
 *
 * The stale flag is read by array_slice_cached(), which drops its memoised
 * slice on its next call when the flag is set.
 *
 * @param array $newWords Word list to keep in $this->newWords.
 */
protected function setNewWords(array $newWords)
{
    $this->newWords = $newWords;
    $this->resetCache = true;
}

/**
Expand Down
101 changes: 84 additions & 17 deletions lib/Caxy/HtmlDiff/HtmlDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -694,28 +694,32 @@ protected function operations()
$positionInOld = 0;
$positionInNew = 0;
$operations = array();
$matches = $this->matchingBlocks();

$matches = $this->matchingBlocks();
$matches[] = new Match(count($this->oldWords), count($this->newWords), 0);

foreach ($matches as $i => $match) {
$matchStartsAtCurrentPositionInOld = ($positionInOld == $match->startInOld);
$matchStartsAtCurrentPositionInNew = ($positionInNew == $match->startInNew);
$action = 'none';
$matchStartsAtCurrentPositionInOld = ($positionInOld === $match->startInOld);
$matchStartsAtCurrentPositionInNew = ($positionInNew === $match->startInNew);

if ($matchStartsAtCurrentPositionInOld == false && $matchStartsAtCurrentPositionInNew == false) {
if ($matchStartsAtCurrentPositionInOld === false && $matchStartsAtCurrentPositionInNew === false) {
$action = 'replace';
} elseif ($matchStartsAtCurrentPositionInOld == true && $matchStartsAtCurrentPositionInNew == false) {
} elseif ($matchStartsAtCurrentPositionInOld === true && $matchStartsAtCurrentPositionInNew === false) {
$action = 'insert';
} elseif ($matchStartsAtCurrentPositionInOld == false && $matchStartsAtCurrentPositionInNew == true) {
} elseif ($matchStartsAtCurrentPositionInOld === false && $matchStartsAtCurrentPositionInNew === true) {
$action = 'delete';
} else { // This occurs if the first few words are the same in both versions
$action = 'none';
}
if ($action != 'none') {

if ($action !== 'none') {
$operations[] = new Operation($action, $positionInOld, $match->startInOld, $positionInNew, $match->startInNew);
}
if (count($match) != 0) {

if (count($match) !== 0) {
$operations[] = new Operation('equal', $match->startInOld, $match->endInOld(), $match->startInNew, $match->endInNew());
}

$positionInOld = $match->endInOld();
$positionInNew = $match->endInNew();
}
Expand Down Expand Up @@ -744,11 +748,14 @@ protected function matchingBlocks()
protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endInNew, &$matchingBlocks)
{
$match = $this->findMatch($startInOld, $endInOld, $startInNew, $endInNew);

if ($match !== null) {
if ($startInOld < $match->startInOld && $startInNew < $match->startInNew) {
$this->findMatchingBlocks($startInOld, $match->startInOld, $startInNew, $match->startInNew, $matchingBlocks);
}

$matchingBlocks[] = $match;

if ($match->endInOld() < $endInOld && $match->endInNew() < $endInNew) {
$this->findMatchingBlocks($match->endInOld(), $endInOld, $match->endInNew(), $endInNew, $matchingBlocks);
}
Expand All @@ -762,9 +769,13 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI
*/
protected function stripTagAttributes($word)
{
$word = explode(' ', trim($word, '<>'));
$space = strpos($word, ' ', 1);

if ($space) {
return '<' . substr($word, 1, $space) . '>';
}

return '<'.$word[ 0 ].'>';
return trim($word, '<>');
}

/**
Expand All @@ -781,6 +792,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
$bestMatchInNew = $startInNew;
$bestMatchSize = 0;
$matchLengthAt = array();

for ($indexInOld = $startInOld; $indexInOld < $endInOld; ++$indexInOld) {
$newMatchLengthAt = array();
$index = $this->oldWords[ $indexInOld ];
Expand All @@ -798,16 +810,15 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
if ($indexInNew >= $endInNew) {
break;
}

$newMatchLength = (isset($matchLengthAt[ $indexInNew - 1 ]) ? $matchLengthAt[ $indexInNew - 1 ] : 0) + 1;
$newMatchLengthAt[ $indexInNew ] = $newMatchLength;

if ($newMatchLength > $bestMatchSize ||
(
$this->isGroupDiffs() &&
$bestMatchSize > 0 &&
preg_match(
'/^\s+$/',
implode('', array_slice($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
) {
$bestMatchInOld = $indexInOld - $newMatchLength + 1;
Expand All @@ -822,12 +833,68 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
if ($bestMatchSize != 0 &&
(
!$this->isGroupDiffs() ||
!preg_match('/^\s+$/', implode('', array_slice($this->oldWords, $bestMatchInOld, $bestMatchSize)))
!$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
) {
return new Match($bestMatchInOld, $bestMatchInNew, $bestMatchSize);
}

return;
return null;
}

/**
 * Tells whether a string is non-empty yet becomes empty once trimmed.
 *
 * trim() is called with its default character list, so "whitespace" here
 * means space, tab, newline, carriage return, NUL byte and vertical tab.
 *
 * @param string $str
 *
 * @return bool False for the empty string; true when trim() leaves nothing behind.
 */
protected function isOnlyWhitespace($str)
{
    // An empty string does not count as whitespace.
    if ($str === '') {
        return false;
    }

    // Slightly faster than using preg_match.
    return trim($str) === '';
}

/**
 * Offset of the slice currently memoised by array_slice_cached().
 *
 * @see array_slice_cached()
 * @var int|null
 */
protected $sliceCacheOffset = null;

/**
 * Length of the slice currently memoised by array_slice_cached().
 *
 * @see array_slice_cached()
 * @var int|null
 */
protected $sliceCacheLength = null;

/**
 * Imploded slice memoised by array_slice_cached(), or null when nothing is cached.
 *
 * @see array_slice_cached()
 * @var string|null
 */
protected $sliceCache = null;

/**
 * Special array_slice function that caches its last request.
 *
 * The diff algorithm seems to request the same information many times in a row.
 * By returning the previous answer the algorithm performs way faster.
 *
 * The result is a string instead of an array; this way we save on the number of
 * memory-intensive implode() calls.
 *
 * The memo is kept in instance properties rather than function-level static
 * variables: statics inside a method are shared by all objects, whereas the
 * invalidation flag ($this->resetCache) is per object, so two differ instances
 * used alternately could otherwise be served each other's slice.
 *
 * Note that the memo is keyed on $offset and $length only, not on the array
 * itself; the setters for oldWords/newWords raise $this->resetCache so that a
 * changed word list invalidates it.
 *
 * @param array        &$array  Word list to slice (taken by reference; it is not modified here).
 * @param integer      $offset  Start index, as for array_slice().
 * @param integer|null $length  Number of elements, as for array_slice(); null means "to the end".
 *
 * @return string The selected elements concatenated without a separator.
 */
protected function array_slice_cached(&$array, $offset, $length = null)
{
    // PHP has no support for by-reference comparing.
    // To prevent false positive hits, reset the cache when the oldWords or newWords is changed.
    if ($this->resetCache === true) {
        $this->sliceCache = null;

        $this->resetCache = false;
    }

    if (
        $this->sliceCache !== null &&
        $this->sliceCacheLength === $length &&
        $this->sliceCacheOffset === $offset
    ) { // Hit
        return $this->sliceCache;
    } // Miss

    $this->sliceCacheOffset = $offset;
    $this->sliceCacheLength = $length;

    $this->sliceCache = implode('', array_slice($array, $offset, $length));

    return $this->sliceCache;
}
}
6 changes: 6 additions & 0 deletions phpunit.xml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@
bootstrap="./tests/Caxy/Tests/TestInit.php"
>

<groups>
<exclude>
<group>performance</group>
</exclude>
</groups>

<testsuites>
<testsuite name="php-htmldiff Test Suite">
<directory>./tests/Caxy/Tests/HtmlDiff</directory>
Expand Down
17 changes: 17 additions & 0 deletions tests/Caxy/Tests/AbstractTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?php

namespace Caxy\Tests;

/**
 * Base class for the test cases; holds helpers they share.
 */
abstract class AbstractTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Normalises markup so that two HTML strings can be compared without
     * regard to line breaks or runs of whitespace.
     *
     * Steps, in order: remove every CR/LF, collapse each whitespace run to a
     * single space, remove whitespace that sits between a ">" and the next
     * "<", then trim both ends.
     *
     * @param string $text
     *
     * @return string
     */
    protected function stripExtraWhitespaceAndNewLines($text)
    {
        $withoutLineBreaks = preg_replace("/[\n\r]/", '', $text);
        $singleSpaced = preg_replace('/\s+/S', " ", $withoutLineBreaks);
        $tagsJoined = preg_replace('/>\s+</', '><', $singleSpaced);

        return trim($tagsJoined);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

use Caxy\HtmlDiff\HtmlDiff;
use Caxy\HtmlDiff\HtmlDiffConfig;
use Caxy\Tests\AbstractTest;

class HTMLPurifierConfigTest extends \PHPUnit_Framework_TestCase
class HTMLPurifierConfigTest extends AbstractTest
{
/**
* @var \HTMLPurifier_Config
Expand Down
14 changes: 2 additions & 12 deletions tests/Caxy/Tests/HtmlDiff/Functional/HtmlDiffFunctionalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
namespace Caxy\Tests\HtmlDiff\Functional;

use Caxy\HtmlDiff\HtmlDiff;
use Caxy\Tests\AbstractTest;
use Caxy\Tests\HtmlDiff\HtmlFileIterator;

class HtmlDiffFunctionalTest extends \PHPUnit_Framework_TestCase
class HtmlDiffFunctionalTest extends AbstractTest
{
/**
* @dataProvider diffContentProvider
Expand All @@ -26,15 +27,4 @@ public function diffContentProvider()
{
return new HtmlFileIterator(__DIR__.'/../../../../fixtures/HtmlDiff');
}

protected function stripExtraWhitespaceAndNewLines($text)
{
return trim(
preg_replace(
'/>\s+</',
'><',
preg_replace('/\s+/S', " ", preg_replace("/[\n\r]/", '', $text))
)
);
}
}
29 changes: 29 additions & 0 deletions tests/Caxy/Tests/HtmlDiff/Performance/PerformanceTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?php

namespace Caxy\Tests\HtmlDiff\Performance;

use Caxy\HtmlDiff\HtmlDiff;
use Caxy\Tests\AbstractTest;

/**
 * Runs the differ over the paragraph fixtures and checks the produced markup.
 */
class PerformanceTest extends AbstractTest
{
    /**
     * Diffs paragraphs.html against paragraphs_changed.html and compares the
     * result with paragraphs_expected.html, ignoring whitespace differences.
     *
     * The test belongs to the "performance" group.
     *
     * @group performance
     */
    public function testParagraphPerformance()
    {
        $fixturesPath = __DIR__ . '/../../../../fixtures/Performance/';

        $expected = file_get_contents($fixturesPath . 'paragraphs_expected.html');

        $diff = new HtmlDiff(
            file_get_contents($fixturesPath . 'paragraphs.html'),
            file_get_contents($fixturesPath . 'paragraphs_changed.html'),
            'UTF-8',
            array()
        );

        $output = $diff->build();

        // assertSame() takes the expected value first and the actual value second;
        // passing them in that order keeps the failure report the right way round.
        $this->assertSame(
            $this->stripExtraWhitespaceAndNewLines($expected),
            $this->stripExtraWhitespaceAndNewLines($output)
        );
    }
}
Loading

0 comments on commit 272a3ba

Please sign in to comment.