

Merge pull request #7 from nadar/limit-size
Limit size
nadar authored Nov 14, 2020
2 parents 81d4d5a + e0f6e96 commit 76e089e
Showing 6 changed files with 59 additions and 7 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -3,9 +3,13 @@
All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/).
In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).

## 1.2.0 (14. November 2020)

+ [#7](https://github.com/nadar/crawler/pull/7/files) By default, response content bigger than 5 MB won't be passed to the Parsers. To turn off this behavior use `'maxSize' => false`, or increase the limit with e.g. `'maxSize' => 15000000` (15 MB). The value must be provided in bytes. The main goal is to ensure that the PDF Parser won't run into very large memory consumption. This restriction won't stop the Crawler from downloading the URL (whether it is larger than the maxSize definition or not); it only prevents excessive memory usage when the Parsers start to interact with the response content.

## 1.1.2 (12. November 2020)

+ Decrease the CURL Request Timeout. A Site will now timeout after 5 seconds when before trying to crawl.
+ Decrease the CURL Request Timeout. A CURL request for a given URL will now timeout after 5 seconds.

## 1.1.1 (21. October 2020)

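For context on the new `maxSize` option described in the 1.2.0 entry above, here is a minimal usage sketch. The namespaces and the storage/runner classes are assumptions based on the package layout and the tests further down; treat it as an illustration, not the package's documented API.

```php
<?php
// Sketch only: namespaces are assumptions based on the nadar/crawler package layout.
use Nadar\Crawler\Crawler;
use Nadar\Crawler\Parsers\HtmlParser;
use Nadar\Crawler\Storage\ArrayStorage;
use Nadar\Crawler\Runners\LoopRunner;

$crawler = new Crawler('https://example.com', new ArrayStorage, new LoopRunner);

$crawler->maxSize = 15000000; // raise the parser limit to 15 MB (value in bytes) ...
// $crawler->maxSize = false; // ... or disable the size check entirely

$crawler->addParser(new HtmlParser);
$crawler->setup();
$crawler->run();
```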
20 changes: 19 additions & 1 deletion src/Crawler.php
@@ -17,7 +17,19 @@
*/
class Crawler
{
public $concurrentJobs = 30;
/**
 * @var integer The number of concurrent curl download requests. A higher value can strongly increase memory usage.
*/
public $concurrentJobs = 15;

/**
 * @var integer The response size limit in bytes. Every response which is larger than this value will be skipped and not passed to the parsers (5000000 bytes = 5 MB).
 * This helps to ensure that the parsers won't run into large memory consumption. If the value false is provided, the limit is disabled. To be clear, this value won't stop the crawler
 * from downloading any given URL, it just won't pass the content to the parsers. Especially the PDF parser requires a lot of memory for large files, which is why
 * this property has been introduced.
* @since 1.2.0
*/
public $maxSize = 5000000;

/**
 * @var Url Contains the URL object with the base URL. URLs which do not match the base URL will not be crawled or added to the results.
@@ -219,6 +231,12 @@ public function run()

// get content and remove handles
foreach ($curlRequests as $queueKey => $ch) {
if ($this->maxSize && curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > $this->maxSize) {
curl_multi_remove_handle($multiCurl, $ch);
unset($ch);
continue;
}

$requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE));

$checksum = $requestResponse->getChecksum();
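The skip logic above relies on curl's reported download size. A standalone sketch of the same idea outside the curl_multi loop, with an illustrative URL and limit:

```php
<?php
// Illustrative only: check the downloaded size before the body reaches any parser.
$maxSize = 5000000; // 5 MB in bytes

$ch = curl_init('https://example.com/large-report.pdf');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$body = curl_exec($ch);

if ($maxSize && curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > $maxSize) {
    // The download itself already happened; we only refuse to parse the content.
    $body = null;
}

curl_close($ch);

if ($body !== null) {
    // ... hand $body over to the parsers
}
```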
3 changes: 2 additions & 1 deletion src/Job.php
@@ -63,7 +63,7 @@ public function generateCurl()
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_URL, $this->url->getNormalized());
curl_setopt($curl, CURLOPT_HTTPGET, true);
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0);
curl_setopt($curl, CURLOPT_TIMEOUT, 5); // timeout after 5 seconds
return $curl;
}
@@ -95,6 +95,7 @@ public function run(RequestResponse $requestResponse, Crawler $crawler)

if ($parserResult->ignore) {
// for whatever reason the parser ignores this url
unset($parserResult);
continue;
}

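The two timeout options shown in `generateCurl()` behave differently: `CURLOPT_CONNECTTIMEOUT` set to 0 leaves the connect timeout at libcurl's built-in default, while `CURLOPT_TIMEOUT` caps the whole request at 5 seconds. A minimal sketch with an illustrative URL:

```php
<?php
// Sketch of the timeout behaviour configured in Job::generateCurl().
$curl = curl_init();
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_URL, 'https://example.com/slow-page');
curl_setopt($curl, CURLOPT_HTTPGET, true);
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0); // 0 = fall back to libcurl's default connect timeout
curl_setopt($curl, CURLOPT_TIMEOUT, 5);        // abort the whole request after 5 seconds

$content = curl_exec($curl); // false when the 5 second limit is hit
curl_close($curl);
```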
4 changes: 2 additions & 2 deletions src/Parsers/PdfParser.php
@@ -13,8 +13,8 @@

/**
* PDF Parser
 * > Attention: Keep in mind that when you enable the PDF Parser and have multiple concurrent requests, this can drastically increase memory
 * > usage (especially if there are large PDFs)! Therefore it's recommended to lower the concurrent value when enabling the PDF Parser!
*
* @author Basil Suter <[email protected]>
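Following that warning, a hedged configuration sketch: lower `concurrentJobs` when the PDF parser is enabled, and keep the `maxSize` limit in place. Namespaces are again assumptions based on the package layout.

```php
<?php
// Sketch only: namespaces are assumptions based on the nadar/crawler package layout.
use Nadar\Crawler\Crawler;
use Nadar\Crawler\Parsers\HtmlParser;
use Nadar\Crawler\Parsers\PdfParser;
use Nadar\Crawler\Storage\ArrayStorage;
use Nadar\Crawler\Runners\LoopRunner;

$crawler = new Crawler('https://example.com', new ArrayStorage, new LoopRunner);
$crawler->concurrentJobs = 5; // below the default of 15, since large PDFs are memory hungry
$crawler->maxSize = 5000000;  // keep the 5 MB limit so huge responses never reach the parsers
$crawler->addParser(new HtmlParser);
$crawler->addParser(new PdfParser);
$crawler->setup();
$crawler->run();
```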
28 changes: 28 additions & 0 deletions tests/CrawlerTest.php
@@ -23,6 +23,34 @@ public function testRunCrawler()
$this->assertNotEmpty($debug->elapsedTime());
}

public function testRunCrawlerWithGivenFilter()
{
$debug = new DebugHandler();

$crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);
$crawler->urlFilterRules = [
'#/news#i'
];
$crawler->addParser(new HtmlParser);
$crawler->addHandler($debug);
$this->assertEmpty($crawler->setup());
$this->assertEmpty($crawler->run());
$this->assertNotEmpty($debug->elapsedTime());
}

public function testRunCrawlerButSkipResponseDueToVerySmallLimit()
{
$debug = new DebugHandler();

$crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);
$crawler->maxSize = 1; // a 1 byte limit means every response exceeds it and is skipped
$crawler->addParser(new HtmlParser);
$crawler->addHandler($debug);
$this->assertEmpty($crawler->setup());
$this->assertEmpty($crawler->run());
$this->assertNotEmpty($debug->elapsedTime());
}

public function testCrawlWithFilteredMainDomain()
{
$debug = new DebugHandler();
5 changes: 3 additions & 2 deletions tests/Parsers/HtmlParserTest.php
@@ -30,8 +30,9 @@ public function testDomDocumentRemoveScriptInformations()

$this->assertSame('<body>between<p>the lines</p></body>', str_replace(['\n', '\r', PHP_EOL], '', $parser->getDomBodyContent($dom)));

$this->assertNull($parser->getDomBodyContent(new DOMDocument())
);
$this->assertNull(
$parser->getDomBodyContent(new DOMDocument())
);
}

public function testFullIgnoreTag()
