

Merge pull request #7 from nadar/limit-size
Limit size
nadar authored Nov 14, 2020
2 parents 81d4d5a + e0f6e96 commit 76e089e
Showing 6 changed files with 59 additions and 7 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -3,9 +3,13 @@
All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/).
In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).

## 1.2.0 (14. November 2020)

+ [#7](https://github.com/nadar/crawler/pull/7/files) By default, response content bigger than 5 MB won't be passed to the Parsers. To turn off this behavior use `'maxSize' => false`, or increase the limit with e.g. `'maxSize' => 15000000` (15 MB). The value must be provided in bytes. The main goal is to ensure that the PDF Parser won't run into very large memory consumption. This restriction won't stop the Crawler from downloading the URL (whether it is larger than the maxSize definition or not); it only prevents excessive memory usage when the Parsers start to interact with the response content.

## 1.1.2 (12. November 2020)

+ Decrease the CURL Request Timeout. A Site will now timeout after 5 seconds when before trying to crawl.
+ Decrease the CURL Request Timeout. A CURL request for a given URL will now timeout after 5 seconds.

## 1.1.1 (21. October 2020)

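For context on the new `maxSize` option described in the 1.2.0 entry above, here is a minimal usage sketch. The namespaces and the storage/runner classes are assumptions based on the package layout and the tests further down; treat it as an illustration, not the package's documented API.

```php
<?php
// Sketch only: namespaces are assumptions based on the nadar/crawler package layout.
use Nadar\Crawler\Crawler;
use Nadar\Crawler\Parsers\HtmlParser;
use Nadar\Crawler\Storage\ArrayStorage;
use Nadar\Crawler\Runners\LoopRunner;

$crawler = new Crawler('https://example.com', new ArrayStorage, new LoopRunner);

$crawler->maxSize = 15000000; // raise the parser limit to 15 MB (value in bytes) ...
// $crawler->maxSize = false; // ... or disable the size check entirely

$crawler->addParser(new HtmlParser);
$crawler->setup();
$crawler->run();
```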
20 changes: 19 additions & 1 deletion src/Crawler.php
@@ -17,7 +17,19 @@
*/
class Crawler
{
public $concurrentJobs = 30;
/**
 * @var integer The number of concurrent curl download requests. A higher value can strongly increase memory usage.
*/
public $concurrentJobs = 15;

/**
 * @var integer The response size limit in bytes. Every response which is larger than this value will be skipped and not passed to the parsers (5000000 bytes = 5 MB).
 * This helps to ensure that the parsers won't run into large memory consumption. If the value false is provided, the limit is disabled. To be clear, this value won't stop the crawler
 * from downloading any given URL, it just won't pass the content to the parsers. Especially the PDF parser requires a lot of memory for large files, which is why
 * this property has been introduced.
* @since 1.2.0
*/
public $maxSize = 5000000;

/**
 * @var Url Contains the URL object with the base URL. URLs which do not match the base URL will not be crawled or added to the results.
@@ -219,6 +231,12 @@ public function run()

// get content and remove handles
foreach ($curlRequests as $queueKey => $ch) {
if ($this->maxSize && curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > $this->maxSize) {
curl_multi_remove_handle($multiCurl, $ch);
unset($ch);
continue;
}

$requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE));

$checksum = $requestResponse->getChecksum();
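The skip logic above relies on curl's reported download size. A standalone sketch of the same idea outside the curl_multi loop, with an illustrative URL and limit:

```php
<?php
// Illustrative only: check the downloaded size before the body reaches any parser.
$maxSize = 5000000; // 5 MB in bytes

$ch = curl_init('https://example.com/large-report.pdf');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$body = curl_exec($ch);

if ($maxSize && curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > $maxSize) {
    // The download itself already happened; we only refuse to parse the content.
    $body = null;
}

curl_close($ch);

if ($body !== null) {
    // ... hand $body over to the parsers
}
```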
3 changes: 2 additions & 1 deletion src/Job.php
@@ -63,7 +63,7 @@ public function generateCurl()
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_URL, $this->url->getNormalized());
curl_setopt($curl, CURLOPT_HTTPGET, true);
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0);
curl_setopt($curl, CURLOPT_TIMEOUT, 5); // timeout after 5 seconds
return $curl;
}
@@ -95,6 +95,7 @@ public function run(RequestResponse $requestResponse, Crawler $crawler)

if ($parserResult->ignore) {
// for whatever reason the parser ignores this url
unset($parserResult);
continue;
}

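The two timeout options shown in `generateCurl()` behave differently: `CURLOPT_CONNECTTIMEOUT` set to 0 leaves the connect timeout at libcurl's built-in default, while `CURLOPT_TIMEOUT` caps the whole request at 5 seconds. A minimal sketch with an illustrative URL:

```php
<?php
// Sketch of the timeout behaviour configured in Job::generateCurl().
$curl = curl_init();
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_URL, 'https://example.com/slow-page');
curl_setopt($curl, CURLOPT_HTTPGET, true);
curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 0); // 0 = fall back to libcurl's default connect timeout
curl_setopt($curl, CURLOPT_TIMEOUT, 5);        // abort the whole request after 5 seconds

$content = curl_exec($curl); // false when the 5 second limit is hit
curl_close($curl);
```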
4 changes: 2 additions & 2 deletions src/Parsers/PdfParser.php
@@ -13,8 +13,8 @@

/**
* PDF Parser
 * > Attention: Keep in mind that when you enable the PDF Parser and have multiple concurrent requests, this can drastically increase memory
 * > usage (especially if there are large PDFs)! Therefore it's recommended to lower the concurrent value when enabling the PDF Parser!
*
* @author Basil Suter <[email protected]>
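Following that warning, a hedged configuration sketch: lower `concurrentJobs` when the PDF parser is enabled, and keep the `maxSize` limit in place. Namespaces are again assumptions based on the package layout.

```php
<?php
// Sketch only: namespaces are assumptions based on the nadar/crawler package layout.
use Nadar\Crawler\Crawler;
use Nadar\Crawler\Parsers\HtmlParser;
use Nadar\Crawler\Parsers\PdfParser;
use Nadar\Crawler\Storage\ArrayStorage;
use Nadar\Crawler\Runners\LoopRunner;

$crawler = new Crawler('https://example.com', new ArrayStorage, new LoopRunner);
$crawler->concurrentJobs = 5; // below the default of 15, since large PDFs are memory hungry
$crawler->maxSize = 5000000;  // keep the 5 MB limit so huge responses never reach the parsers
$crawler->addParser(new HtmlParser);
$crawler->addParser(new PdfParser);
$crawler->setup();
$crawler->run();
```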
28 changes: 28 additions & 0 deletions tests/CrawlerTest.php
@@ -23,6 +23,34 @@ public function testRunCrawler()
$this->assertNotEmpty($debug->elapsedTime());
}

public function testRunCrawlerWithGivenFilter()
{
$debug = new DebugHandler();

$crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);
$crawler->urlFilterRules = [
'#/news#i'
];
$crawler->addParser(new HtmlParser);
$crawler->addHandler($debug);
$this->assertEmpty($crawler->setup());
$this->assertEmpty($crawler->run());
$this->assertNotEmpty($debug->elapsedTime());
}

public function testRunCrawlerButSkipResponseDueToVerySmallLimit()
{
$debug = new DebugHandler();

$crawler = new Crawler('https://luya.io', new ArrayStorage, new LoopRunner);
$crawler->maxSize = 1; // a 1 byte limit means every response exceeds it and is skipped
$crawler->addParser(new HtmlParser);
$crawler->addHandler($debug);
$this->assertEmpty($crawler->setup());
$this->assertEmpty($crawler->run());
$this->assertNotEmpty($debug->elapsedTime());
}

public function testCrawlWithFilteredMainDomain()
{
$debug = new DebugHandler();
5 changes: 3 additions & 2 deletions tests/Parsers/HtmlParserTest.php
@@ -30,8 +30,9 @@ public function testDomDocumentRemoveScriptInformations()

$this->assertSame('<body>between<p>the lines</p></body>', str_replace(['\n', '\r', PHP_EOL], '', $parser->getDomBodyContent($dom)));

$this->assertNull($parser->getDomBodyContent(new DOMDocument())
);
$this->assertNull(
$parser->getDomBodyContent(new DOMDocument())
);
}

public function testFullIgnoreTag()
