Skip to content

Commit

Permalink
Merge pull request #14 from nadar/status-code-check
Browse files Browse the repository at this point in the history
add status code
  • Loading branch information
nadar authored Jan 13, 2021
2 parents 0c0dc45 + 5a06344 commit 31b08f7
Show file tree
Hide file tree
Showing 9 changed files with 32 additions and 9 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/).
In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).

## 1.5.0 (13. January 2021)

+ [#14](https://github.com/nadar/crawler/pull/14) Pass the StatusCode of the response into the parsers and process only HTML and PDFs with code 200 (OK).

## 1.4.0 (13. January 2020)

+ [#13](https://github.com/nadar/crawler/pull/13) New Crawler method `getCycles()` returns the number of times the `run()` method was called.
Expand Down
2 changes: 1 addition & 1 deletion src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ public function run()
continue;
}

$requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE));
$requestResponse = new RequestResponse(curl_multi_getcontent($ch), curl_getinfo($ch, CURLINFO_CONTENT_TYPE), curl_getinfo($ch, CURLINFO_HTTP_CODE));

$checksum = $requestResponse->getChecksum();
if (!$this->storage->isChecksumDone($checksum)) {
Expand Down
2 changes: 1 addition & 1 deletion src/Parsers/HtmlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public function validateUrl(Url $url) : bool
*/
public function validateRequestResponse(RequestResponse $requestResponse): bool
{
// Diff view: the following line is the pre-change check, which looked at the content type only.
return in_array($requestResponse->getContentType(), ['text/html']);
// Replacement line: returns true only when the status code equals 200 and the content type is text/html.
// NOTE(review): `==` and the non-strict in_array() compare loosely. The status code is cast to int in the
// RequestResponse constructor, so `=== 200` looks safe; in_array(..., true) presumably is too, provided
// getContentType() always returns a string - confirm before tightening.
return $requestResponse->getStatusCode() == 200 && in_array($requestResponse->getContentType(), ['text/html']);
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/Parsers/PdfParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,6 @@ public function validateUrl(Url $url) : bool
*/
public function validateRequestResponse(RequestResponse $requestResponse): bool
{
// Diff view: the following line is the pre-change check, which looked at the content type only.
return in_array($requestResponse->getContentType(), ['application/pdf']);
// Replacement line: returns true only when the status code equals 200 and the content type is application/pdf.
// NOTE(review): `==` and the non-strict in_array() compare loosely. The status code is cast to int in the
// RequestResponse constructor, so `=== 200` looks safe; in_array(..., true) presumably is too, provided
// getContentType() always returns a string - confirm before tightening.
return $requestResponse->getStatusCode() == 200 && in_array($requestResponse->getContentType(), ['application/pdf']);
}
}
21 changes: 20 additions & 1 deletion src/RequestResponse.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,24 @@ class RequestResponse
*/
protected $contentType;

/**
* @var integer Contains the status code of the current request response.
* @since 1.5.0
*/
protected $statusCode;

/**
* Creates a request response object from the raw data of a finished request.
*
* Diff view note: of the two signature lines below, the two-argument one is the
* pre-change signature and the three-argument one is its replacement.
*
* @param string $content The response body; stored unmodified.
* @param string $contentType The content type of the response; surrounding whitespace is trimmed before it is stored.
* @param integer $statusCode The status code of the response; cast to int before it is stored. {@since 1.5.0}
*/
public function __construct($content, $contentType)
public function __construct($content, $contentType, $statusCode)
{
$this->content = $content;
// NOTE(review): the crawler passes curl_getinfo($ch, CURLINFO_CONTENT_TYPE) here, which can be null when the
// server sent no valid Content-Type header; trim(null) is deprecated as of PHP 8.1 - consider a (string) cast.
$this->contentType = trim($contentType);
$this->statusCode = (int) $statusCode;
}

/**
Expand All @@ -44,6 +52,17 @@ public function getContent()
return $this->content;
}

/**
* Returns the status code of the request response.
*
* The value is the one handed to the constructor, which stores it cast to int.
*
* @return integer The status code, for example 200.
* @since 1.5.0
*/
public function getStatusCode()
{
return $this->statusCode;
}

private $_checksum;

/**
Expand Down
2 changes: 1 addition & 1 deletion tests/Parsers/HtmlParserExampleTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public function testFullIgnoreResult()
{
$job = new Job(new Url('https://example.com/'), new Url('https://example.com/'));

$requestResponse = new RequestResponse($this->html, 'text/html');
$requestResponse = new RequestResponse($this->html, 'text/html', 200);

$parser = new HtmlParser();
$result = $parser->run($job, $requestResponse);
Expand Down
4 changes: 2 additions & 2 deletions tests/Parsers/HtmlParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public function testFullIgnoreTag()
<head>
</head>
<body>test [CRAWL_FULL_IGNORE]</body>
</html>', 'text/html');
</html>', 'text/html', 200);

$crawler = new Crawler('https://example.com/', new ArrayStorage, new LoopRunner);
$debug = new DebugHandler;
Expand Down Expand Up @@ -137,7 +137,7 @@ public function testGetContent()
</div>
</body>
</html>
', 'text/html');
', 'text/html', 200);

$parser = new HtmlParser();
$parser->stripTags = false;
Expand Down
2 changes: 1 addition & 1 deletion tests/Parsers/PdfParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public function testPdfUtf8Issue()

$pdf = 'https://www.rehab.ch/files/7_Das_REHAB_im_Dialog/Anreisekarte_DE_190923.pdf';

$requestResponse = new RequestResponse(file_get_contents($pdf), 'application/pdf');
$requestResponse = new RequestResponse(file_get_contents($pdf), 'application/pdf', 200);

$parser = new PdfParser();
$result = $parser->run($job, $requestResponse);
Expand Down
2 changes: 1 addition & 1 deletion tests/RequestResponseTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class RequestResponseTest extends CrawlerTestCase
{
public function testRequestResponseMethods()
{
$r = new RequestResponse('foobar', 'text/html; charset=UTF-8');
$r = new RequestResponse('foobar', 'text/html; charset=UTF-8', 200);

$this->assertSame('foobar', $r->getContent());
$this->assertSame('3858f62230ac3c915f300c664312c63f', $r->getChecksum());
Expand Down

0 comments on commit 31b08f7

Please sign in to comment.