From 5f364c086def055a2bad69b9ec8a9987187457d2 Mon Sep 17 00:00:00 2001 From: Basil Suter Date: Wed, 13 Jan 2021 10:15:52 +0000 Subject: [PATCH 1/2] add status code --- CHANGELOG.md | 4 ++++ src/Crawler.php | 2 +- src/Parsers/HtmlParser.php | 2 +- src/Parsers/PdfParser.php | 2 +- src/RequestResponse.php | 20 +++++++++++++++++++- tests/Parsers/HtmlParserExampleTest.php | 2 +- tests/Parsers/HtmlParserTest.php | 4 ++-- tests/Parsers/PdfParserTest.php | 2 +- tests/RequestResponseTest.php | 2 +- 9 files changed, 31 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ba97f0..31720fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md). +## 1.5.0 (13. January 2021) + ++ [#14](https://github.com/nadar/crawler/pull/14) Pass the StatusCode of the response into the parsers and process only HTML and PDFs with code 200 (OK). + ## 1.4.0 (13. January 2020) + [#13](https://github.com/nadar/crawler/pull/13) New Crawler method `getCycles()` returns the number of times the `run()` method was called. 
diff --git a/src/Crawler.php b/src/Crawler.php index 54b8abe..5f7d56b 100644 --- a/src/Crawler.php +++ b/src/Crawler.php @@ -255,7 +255,7 @@ public function run() continue; } - $requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE)); + $requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE), curl_getinfo($ch, CURLINFO_HTTP_CODE)); $checksum = $requestResponse->getChecksum(); if (!$this->storage->isChecksumDone($checksum)) { diff --git a/src/Parsers/HtmlParser.php b/src/Parsers/HtmlParser.php index e0a37d8..6c286c7 100644 --- a/src/Parsers/HtmlParser.php +++ b/src/Parsers/HtmlParser.php @@ -76,7 +76,7 @@ public function validateUrl(Url $url) : bool */ public function validateRequestResponse(RequestResponse $requestResponse): bool { - return in_array($requestResponse->getContentType(), ['text/html']); + return $requestResponse->getStatusCode() == 200 && in_array($requestResponse->getContentType(), ['text/html']); } /** diff --git a/src/Parsers/PdfParser.php b/src/Parsers/PdfParser.php index 4db6f8a..11ab906 100644 --- a/src/Parsers/PdfParser.php +++ b/src/Parsers/PdfParser.php @@ -76,6 +76,6 @@ public function validateUrl(Url $url) : bool */ public function validateRequestResponse(RequestResponse $requestResponse): bool { - return in_array($requestResponse->getContentType(), ['application/pdf']); + return $requestResponse->getStatusCode() == 200 && in_array($requestResponse->getContentType(), ['application/pdf']); } } diff --git a/src/RequestResponse.php b/src/RequestResponse.php index 09ba33b..14fbb54 100644 --- a/src/RequestResponse.php +++ b/src/RequestResponse.php @@ -20,16 +20,24 @@ class RequestResponse */ protected $contentType; + /** + * @var integer Contains the status code of the current request response. 
+ * @since 1.5.0 + */ + protected $statusCode; + /** * Constructor * * @param string $content * @param string $contentType + * @param integer $statusCode {@since 1.5.0} */ - public function __construct($content, $contentType) + public function __construct($content, $contentType, $statusCode) { $this->content = $content; $this->contentType = trim($contentType); + $this->statusCode = (int) $statusCode; } /** @@ -44,6 +52,16 @@ public function getContent() return $this->content; } + /** + * Returns the request response status code. + * + * @return integer Example status code would be 200 + */ + public function getStatusCode() + { + return $this->statusCode; + } + private $_checksum; /** diff --git a/tests/Parsers/HtmlParserExampleTest.php b/tests/Parsers/HtmlParserExampleTest.php index 4f23f47..8362311 100644 --- a/tests/Parsers/HtmlParserExampleTest.php +++ b/tests/Parsers/HtmlParserExampleTest.php @@ -14,7 +14,7 @@ public function testFullIgnoreResult() { $job = new Job(new Url('https://example.com/'), new Url('https://example.com/')); - $requestResponse = new RequestResponse($this->html, 'text/html'); + $requestResponse = new RequestResponse($this->html, 'text/html', 200); $parser = new HtmlParser(); $result = $parser->run($job, $requestResponse); diff --git a/tests/Parsers/HtmlParserTest.php b/tests/Parsers/HtmlParserTest.php index bcd89bd..7e3a227 100644 --- a/tests/Parsers/HtmlParserTest.php +++ b/tests/Parsers/HtmlParserTest.php @@ -43,7 +43,7 @@ public function testFullIgnoreTag() test [CRAWL_FULL_IGNORE] -', 'text/html'); +', 'text/html', 200); $crawler = new Crawler('https://example.com/', new ArrayStorage, new LoopRunner); $debug = new DebugHandler; @@ -137,7 +137,7 @@ public function testGetContent() - ', 'text/html'); + ', 'text/html', 200); $parser = new HtmlParser(); $parser->stripTags = false; diff --git a/tests/Parsers/PdfParserTest.php b/tests/Parsers/PdfParserTest.php index 9319386..e9ed571 100644 --- a/tests/Parsers/PdfParserTest.php +++ 
b/tests/Parsers/PdfParserTest.php @@ -18,7 +18,7 @@ public function testPdfUtf8Issue() $pdf = 'https://www.rehab.ch/files/7_Das_REHAB_im_Dialog/Anreisekarte_DE_190923.pdf'; - $requestResponse = new RequestResponse(file_get_contents($pdf), 'application/pdf'); + $requestResponse = new RequestResponse(file_get_contents($pdf), 'application/pdf', 200); $parser = new PdfParser(); $result = $parser->run($job, $requestResponse); diff --git a/tests/RequestResponseTest.php b/tests/RequestResponseTest.php index 352270e..4b459f3 100644 --- a/tests/RequestResponseTest.php +++ b/tests/RequestResponseTest.php @@ -8,7 +8,7 @@ class RequestResponseTest extends CrawlerTestCase { public function testRequestResponseMethods() { - $r = new RequestResponse('foobar', 'text/html; charset=UTF-8'); + $r = new RequestResponse('foobar', 'text/html; charset=UTF-8', 200); $this->assertSame('foobar', $r->getContent()); $this->assertSame('3858f62230ac3c915f300c664312c63f', $r->getChecksum()); From 5a06344c292885b57860e5c42fb8079a4372f631 Mon Sep 17 00:00:00 2001 From: Basil Suter Date: Wed, 13 Jan 2021 10:17:37 +0000 Subject: [PATCH 2/2] add since --- src/RequestResponse.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/RequestResponse.php b/src/RequestResponse.php index 14fbb54..0880c67 100644 --- a/src/RequestResponse.php +++ b/src/RequestResponse.php @@ -56,6 +56,7 @@ public function getContent() * Returns the request response status code. * * @return integer Example status code would be 200 + * @since 1.5.0 */ public function getStatusCode() {