Skip to content

Commit

Permalink
Merge pull request #17 from nadar/group
Browse files Browse the repository at this point in the history
crawler group
  • Loading branch information
nadar authored Apr 15, 2021
2 parents 1cc0c0d + 72e8918 commit 8be5e08
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/).
In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).

## 1.6.1 (16. April 2021)

+ [#17](https://github.com/nadar/crawler/pull/17) Fixed an issue where the crawl group was not detected correctly because it was read from the already stripped content instead of the raw body content.

## 1.6.0 (16. March 2021)

+ [#15](https://github.com/nadar/crawler/issues/15) Do not follow links which have `rel="nofollow"` by default. This can be configured in the `HtmlParser::$ignoreRels` property.
Expand Down
8 changes: 4 additions & 4 deletions src/Parsers/HtmlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ public function run(Job $job, RequestResponse $requestResponse) : ParserResult
$links = $this->getDomLinks($dom, $this->ignoreRels);

// body content
$content = $this->getDomBodyContent($dom);
$body = $this->getDomBodyContent($dom);


$content = $this->stripCrawlIgnore($content);
$content = $this->stripCrawlIgnore($body);
$content = $this->stripTags ? strip_tags($content) : $content;

$jobResult = new ParserResult();
Expand All @@ -61,9 +61,9 @@ public function run(Job $job, RequestResponse $requestResponse) : ParserResult
$jobResult->language = $this->getDomLanguage($dom);
$jobResult->keywords = $this->getDomKeywords($dom);
$jobResult->description = $this->getDomDescription($dom);
$jobResult->group = $this->getCrawlGroup($content);
$jobResult->group = $this->getCrawlGroup($body);

unset($dom, $links, $content);
unset($dom, $links, $content, $body);

return $jobResult;
}
Expand Down
14 changes: 14 additions & 0 deletions tests/Parsers/HtmlParserExampleTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@

class HtmlParserExampleTest extends CrawlerTestCase
{
/**
 * Regression test for the crawl group lookup of a full parser run.
 *
 * The fixture places the [CRAWL_GROUP] marker inside an HTML comment, directly
 * followed by a [CRAWL_IGNORE] marker that is never closed in the (truncated)
 * markup. The parser result must still report "2020" as its group.
 */
public function testGroupContent()
{
    // Deliberately truncated real-world markup; must stay byte-identical.
    $markup = '<body><!-- [CRAWL_GROUP]2020[/CRAWL_GROUP] --><!-- [CRAWL_IGNORE] --><input type="checkbox" id="mainnav__toggler" name="mainnav__toggler" /><div class="container"><div class="mainnav"><label class="mainnav__close" for="mainnav__toggler"><span class="pictogram pictogram-schliessen"></span></label><div class="mainnav__inner"><ul class="mainnav__list"><li class="mainnav__item"><a class="mainnav__link" href="/">Home</a></li><li class="mainnav__item"><a class="mainnav__link" href="/de/2020/rueckblick">Rückblick</a></li><li class="mainnav__item"><a class="mainnav__link" href="/de/2020/wichtigste-zahlen-2020">Wichtigste Zahlen 2020</a></li><li class="mainnav__item"><a class="mainnav__link" href="/de/2020/lagebericht">Lagebericht</a></li><li class="mainnav__item"><a class="mainnav__link" href="/de/2020/jahresrechnung">Jahresrechnung</a></li><li class="mainnav__item mainnav__item--dropdown"><span class="mainnav__icon-dropdown pictogram pictogram-vor"></span><a class="mainnav__link" href="/de/2020/anhang-zur-jahresrechnung/anhangstabellen">Anhang zur Jahresrechnung</a><ul class="mainnav__sub"><li class="mainnav__subitem"><a href="/de/2020/anhang-zur-jahresrechnung/bilanzierungs-und-bewertungsgrundsaetze" class="mainnav__sublink">Bilanzierungs- und Bewertungsgrundsätze</a></li><li class="mainnav__subitem"><a href="/de/2020/anhang-zur-jahresrechnung/erlauterun</body>';

    // Simulate a successful HTML response carrying the fixture.
    $response = new RequestResponse($markup, 'text/html', 200);

    // Both job URLs point at the same page.
    $crawlJob = new Job(new Url('https://example.com/'), new Url('https://example.com/'));

    $parserResult = (new HtmlParser())->run($crawlJob, $response);

    $this->assertSame('2020', $parserResult->group);
}

public function testFullIgnoreResult()
{
$job = new Job(new Url('https://example.com/'), new Url('https://example.com/'));
Expand Down
1 change: 1 addition & 0 deletions tests/Parsers/HtmlParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ public function testCrawlerTags()
$parser = new HtmlParser;
$this->assertSame('hello are you?', $parser->stripCrawlIgnore('hello [CRAWL_IGNORE]how[/CRAWL_IGNORE] are you?'));
$this->assertSame('the', $parser->getCrawlGroup('this is [CRAWL_GROUP]the[/CRAWL_GROUP] group'));
$this->assertSame('2020', $parser->getCrawlGroup('this is [CRAWL_GROUP]2020[/CRAWL_GROUP] group'));
}

public function testGetContent()
Expand Down

0 comments on commit 8be5e08

Please sign in to comment.