From fb29876efcbdeeabcb6ce849810fcaf78050ef28 Mon Sep 17 00:00:00 2001 From: Basil Suter Date: Thu, 15 Apr 2021 10:13:42 +0000 Subject: [PATCH 1/2] crawler group --- CHANGELOG.md | 4 ++++ src/Parsers/HtmlParser.php | 8 ++++---- tests/Parsers/HtmlParserExampleTest.php | 14 ++++++++++++++ tests/Parsers/HtmlParserTest.php | 1 + 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1e35f2..5d98ec1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md). +## 1.6.1 (16. April 2021) + ++ []() Fixed issue where crawler group is not generated correctly. + ## 1.6.0 (16. March 2021) + [#15](https://github.com/nadar/crawler/issues/15) Do not follow links which have `rel="nofollow"` by default. This can be configured in the `HtmlParser::$ignoreRels` property. diff --git a/src/Parsers/HtmlParser.php b/src/Parsers/HtmlParser.php index 83b6806..5cea7f9 100644 --- a/src/Parsers/HtmlParser.php +++ b/src/Parsers/HtmlParser.php @@ -48,10 +48,10 @@ public function run(Job $job, RequestResponse $requestResponse) : ParserResult $links = $this->getDomLinks($dom, $this->ignoreRels); // body content - $content = $this->getDomBodyContent($dom); + $body = $this->getDomBodyContent($dom); - $content = $this->stripCrawlIgnore($content); + $content = $this->stripCrawlIgnore($body); $content = $this->stripTags ? strip_tags($content) : $content; $jobResult = new ParserResult(); @@ -61,9 +61,9 @@ public function run(Job $job, RequestResponse $requestResponse) : ParserResult $jobResult->language = $this->getDomLanguage($dom); $jobResult->keywords = $this->getDomKeywords($dom); $jobResult->description = $this->getDomDescription($dom); - $jobResult->group = $this->getCrawlGroup($content); + $jobResult->group = $this->getCrawlGroup($body); - unset($dom, $links, $content); + unset($dom, $links, $content, $body); return $jobResult; } diff --git a/tests/Parsers/HtmlParserExampleTest.php b/tests/Parsers/HtmlParserExampleTest.php index 27b2553..7f82213 100644 --- a/tests/Parsers/HtmlParserExampleTest.php +++ b/tests/Parsers/HtmlParserExampleTest.php @@ -10,6 +10,20 @@ class HtmlParserExampleTest extends CrawlerTestCase { + public function testGroupContent() + { + $job = new Job(new Url('https://example.com/'), new Url('https://example.com/')); + + $html = '