diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d2a133..69deb00 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,9 +3,13 @@
 All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).
 
+## 1.2.0 (14. November 2020)
+
++ [#7](https://github.com/nadar/crawler/pull/7/files) By default, response content bigger than 5MB won't be passed to the parsers. To turn this behavior off, use `'maxSize' => false`; to raise the limit, use for example `'maxSize' => 15000000` (15MB). The value must be provided in bytes. The main goal is to ensure that the PDF parser won't run into very large memory consumption. This restriction won't stop the crawler from downloading the URL (whether it is larger than the maxSize definition or not), but it prevents excessive memory usage once the parsers start to interact with the response content.
+
 ## 1.1.2 (12. November 2020)
 
-+ Decrease the CURL Request Timeout. A Site will now timeout after 5 seconds when before trying to crawl.
++ Decrease the CURL Request Timeout. A CURL request for a given URL will now time out after 5 seconds.
 
 ## 1.1.1 (21. October 2020)
 
diff --git a/src/Crawler.php b/src/Crawler.php
index c24ece5..190067b 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -17,7 +17,19 @@
  */
 class Crawler
 {
-    public $concurrentJobs = 30;
+    /**
+     * @var integer The number of concurrent curl download requests; a higher value can strongly increase memory usage.
+     */
+    public $concurrentJobs = 15;
+
+    /**
+     * @var integer The processing limit in bytes. Every response larger than this value will be skipped and not passed to the parsers (5000000 bytes = 5MB).
+     * This helps to ensure that the parsers won't run into large memory consumption. If false is provided, the limit is disabled. To be clear, this value won't stop the crawler
+     * from downloading any given URL, it just won't pass the content to the parsers; especially the PDF parser requires a lot of memory for large files, which is why
+     * this property has been introduced.
+     * @since 1.2.0
+     */
+    public $maxSize = 5000000;
 
     /**
      * @var Url Contains the URL object with the base URL. Urls which are not matching the base url will not be crawled or added to the results page.
@@ -219,6 +231,12 @@ public function run()
 
         // get content and remove handles
         foreach ($curlRequests as $queueKey => $ch) {
+            if ($this->maxSize && curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > $this->maxSize) {
+                curl_multi_remove_handle($multiCurl, $ch);
+                unset($ch);
+                continue;
+            }
+
             $requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE));
 
             $checksum = $requestResponse->getChecksum();
diff --git a/src/Job.php b/src/Job.php
index e5ea3a8..e5822a2 100644
--- a/src/Job.php
+++ b/src/Job.php
@@ -63,7 +63,7 @@ public function generateCurl()
         curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
         curl_setopt($curl, CURLOPT_URL, $this->url->getNormalized());
         curl_setopt($curl, CURLOPT_HTTPGET, true);
-        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0); 
+        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0);
         curl_setopt($curl, CURLOPT_TIMEOUT, 5); // timeout after 5 seconds
         return $curl;
     }
@@ -95,6 +95,7 @@ public function run(RequestResponse $requestResponse, Crawler $crawler)
 
             if ($parserResult->ignore) {
                 // for whatever reason the parser ignores this url
+                unset($parserResult);
                 continue;
             }
 
diff --git a/src/Parsers/PdfParser.php b/src/Parsers/PdfParser.php
index 0c708e5..4db6f8a 100644
--- a/src/Parsers/PdfParser.php
+++ b/src/Parsers/PdfParser.php
@@ -13,8 +13,8 @@
 
 /**
  * PDF Parser
- *
- * > Attention: Keep in mind that wen you enable the PDF Parser and have multiple concurrent requests this can drastically increases memory
+ *
+ * > Attention: Keep in mind that when you enable the PDF Parser and have multiple concurrent requests this can drastically increase memory
  * > usage (Especially if there are large PDFs)! Therefore it's recommend to lower the concurrent value when enabling PDF Parser!
  *
  * @author Basil Suter
diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php
index 565ec3f..6ea5dea 100644
--- a/tests/CrawlerTest.php
+++ b/tests/CrawlerTest.php
@@ -23,6 +23,34 @@ public function testRunCrawler()
         $this->assertNotEmpty($debug->elapsedTime());
     }
 
+    public function testRunCrawlerWithGivenFilter()
+    {
+        $debug = new DebugHandler();
+
+        $crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);
+        $crawler->urlFilterRules = [
+            '#/news#i'
+        ];
+        $crawler->addParser(new HtmlParser);
+        $crawler->addHandler($debug);
+        $this->assertEmpty($crawler->setup());
+        $this->assertEmpty($crawler->run());
+        $this->assertNotEmpty($debug->elapsedTime());
+    }
+
+    public function testRunCrawlerButSkipResponseDueToVerySmallLimit()
+    {
+        $debug = new DebugHandler();
+
+        $crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);
+        $crawler->maxSize = 1; // a limit of 1 byte, so every response exceeds it and is skipped
+        $crawler->addParser(new HtmlParser);
+        $crawler->addHandler($debug);
+        $this->assertEmpty($crawler->setup());
+        $this->assertEmpty($crawler->run());
+        $this->assertNotEmpty($debug->elapsedTime());
+    }
+
     public function testCrawlWithFilteredMainDomain()
     {
         $debug = new DebugHandler();
diff --git a/tests/Parsers/HtmlParserTest.php b/tests/Parsers/HtmlParserTest.php
index e7d3ac1..0976172 100644
--- a/tests/Parsers/HtmlParserTest.php
+++ b/tests/Parsers/HtmlParserTest.php
@@ -30,8 +30,9 @@ public function testDomDocumentRemoveScriptInformations()
         $this->assertSame('between
 
 the lines
 
 ', str_replace(['\n', '\r', PHP_EOL], '', $parser->getDomBodyContent($dom)));
-        $this->assertNull($parser->getDomBodyContent(new DOMDocument())
-        );
+        $this->assertNull(
+            $parser->getDomBodyContent(new DOMDocument())
+        );
     }
 
     public function testFullIgnoreTag()
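
For readers of this diff, here is a minimal usage sketch of the new `maxSize` limit. It mirrors the setup used in `tests/CrawlerTest.php` above; the `Crawler`, `ArrayStorage`, `LoopRunner`, `HtmlParser`, and `DebugHandler` class names are taken from that test code, while the `use` imports are omitted because the exact namespaces are not part of this diff.

```php
<?php
// Sketch only: imports omitted, class names as referenced in tests/CrawlerTest.php.

$crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);

// Skip any response larger than 15 MB (value in bytes); set to false to disable the limit.
// The URL is still downloaded, its content is just never handed to the parsers.
$crawler->maxSize = 15000000;

$crawler->addParser(new HtmlParser);
$crawler->addHandler(new DebugHandler);

$crawler->setup();
$crawler->run();
```

Leaving `maxSize` at its default of 5000000 bytes keeps the PDF parser from loading very large documents into memory, which is the motivation stated in the 1.2.0 changelog entry above.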