diff --git a/CHANGELOG.md b/CHANGELOG.md index f1e35f2..e9b822f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md). +## 1.6.1 (16. April 2021) + ++ [#17](https://github.com/nadar/crawler/pull/17) Fixed issue where crawler group is not generated correctly. + ## 1.6.0 (16. March 2021) + [#15](https://github.com/nadar/crawler/issues/15) Do not follow links which have `rel="nofollow"` by default. This can be configured in the `HtmlParser::$ignoreRels` property. diff --git a/src/Parsers/HtmlParser.php b/src/Parsers/HtmlParser.php index 83b6806..5cea7f9 100644 --- a/src/Parsers/HtmlParser.php +++ b/src/Parsers/HtmlParser.php @@ -48,10 +48,10 @@ public function run(Job $job, RequestResponse $requestResponse) : ParserResult $links = $this->getDomLinks($dom, $this->ignoreRels); // body content - $content = $this->getDomBodyContent($dom); + $body = $this->getDomBodyContent($dom); - $content = $this->stripCrawlIgnore($content); + $content = $this->stripCrawlIgnore($body); $content = $this->stripTags ? strip_tags($content) : $content; $jobResult = new ParserResult(); @@ -61,9 +61,9 @@ public function run(Job $job, RequestResponse $requestResponse) : ParserResult $jobResult->language = $this->getDomLanguage($dom); $jobResult->keywords = $this->getDomKeywords($dom); $jobResult->description = $this->getDomDescription($dom); - $jobResult->group = $this->getCrawlGroup($content); + $jobResult->group = $this->getCrawlGroup($body); - unset($dom, $links, $content); + unset($dom, $links, $content, $body); return $jobResult; } diff --git a/tests/Parsers/HtmlParserExampleTest.php b/tests/Parsers/HtmlParserExampleTest.php index 27b2553..7f82213 100644 --- a/tests/Parsers/HtmlParserExampleTest.php +++ b/tests/Parsers/HtmlParserExampleTest.php @@ -10,6 +10,20 @@ class HtmlParserExampleTest extends CrawlerTestCase { + public function testGroupContent() + { + $job = new Job(new Url('https://example.com/'), new Url('https://example.com/')); + + $html = '