Skip to content

Commit

Permalink
Merge pull request #18 from nadar/encoding
Browse files Browse the repository at this point in the history
add encoding tests
  • Loading branch information
nadar authored Apr 15, 2021
2 parents 8be5e08 + 07710df commit 3e6c579
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/).
In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).

## 1.6.2 (16. April 2021)

+ [#18](https://github.com/nadar/crawler/pull/18) Fixed issue where pages with UTF-8 characters inside the title tag were not parsed correctly.

## 1.6.1 (16. April 2021)

+ [#17](https://github.com/nadar/crawler/pull/17) Fixed issue where crawler group is not generated correctly.
Expand Down
3 changes: 2 additions & 1 deletion src/Parsers/HtmlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@ public function validateRequestResponse(RequestResponse $requestResponse): bool
/**
 * Build a DOMDocument from a raw HTML string.
 *
 * Characters outside the ASCII range are rewritten as numeric HTML entities
 * before parsing, so they are preserved even when the markup declares its
 * charset after such characters already appeared (for example inside the
 * title tag) or does not declare a charset at all.
 *
 * @param string $content The HTML markup to parse; it is treated as UTF-8.
 * @return DOMDocument The parsed document.
 */
public function generateDomDocument($content)
{
    $dom = new DOMDocument();
    $dom->encoding = 'utf-8';

    // Convert every code point from 0x80 upwards into a numeric entity. This
    // replaces mb_convert_encoding($content, 'HTML-ENTITIES', 'UTF-8'), whose
    // 'HTML-ENTITIES' pseudo encoding is deprecated as of PHP 8.2.
    $encodedContent = mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

    // Parse the HTML exactly once. The @ is used to suppress any parsing
    // warnings that are raised if $content isn't well formed markup.
    @$dom->loadHTML($encodedContent);

    return $dom;
}
Expand Down
49 changes: 49 additions & 0 deletions tests/Parsers/HtmlParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,55 @@ public function testDomDocumentRemoveScriptInformations()
);
}

/**
 * A document that declares UTF-8 through a meta charset tag and has no
 * special characters in its head must keep the umlaut in the body.
 */
public function testValidUtf8HeaderWithoutHeaderSpecialChar()
{
    $html = '
<!DOCTYPE html><html lang="de">
<head>
<meta charset="UTF-8" />
<meta name="robots" content="index, follow" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
</head>
<body>Zurück</body></html>';

    $htmlParser = new HtmlParser();
    $document = $htmlParser->generateDomDocument($html);
    $bodyContent = $htmlParser->getDomBodyContent($document);

    $this->assertSame('<body>Zurück</body>', $bodyContent);
}

/**
 * A title containing an umlaut that appears before the meta charset
 * declaration must not break the umlaut in the body content.
 */
public function testUtf8CharsInsideTitle()
{
    $html = '
<!DOCTYPE html><html lang="de">
<head>
<title>Home | Geschäftsbericht</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta http-equiv="x-ua-compatible" content="ie=edge">
</head>
<body>Rückblick</body></html>';

    $htmlParser = new HtmlParser();
    $document = $htmlParser->generateDomDocument($html);
    $bodyContent = $htmlParser->getDomBodyContent($document);

    $this->assertSame('<body>Rückblick</body>', $bodyContent);
}

/**
 * A document without any charset declaration must still return the
 * umlaut in the body unchanged.
 */
public function testMissingUtf8Information()
{
    $html = '
<!DOCTYPE html><html lang="de">
<head>
<meta name="robots" content="index, follow" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
</head>
<body>Zurück</body></html>';

    $htmlParser = new HtmlParser();
    $document = $htmlParser->generateDomDocument($html);
    $bodyContent = $htmlParser->getDomBodyContent($document);

    $this->assertSame('<body>Zurück</body>', $bodyContent);
}

public function testFullIgnoreTag()
{
$parser = new HtmlParser;
Expand Down

0 comments on commit 3e6c579

Please sign in to comment.