Skip to content

Commit

Permalink
Merge pull request #12 from nadar/issue-8
Browse files Browse the repository at this point in the history
merge path option
  • Loading branch information
nadar authored Dec 20, 2020
2 parents 3efd2b5 + 566e104 commit ab8787e
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 8 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/).
In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md).

## 1.3.0
## 1.3.0 (20. December 2020)

+ [#10](https://github.com/nadar/crawler/issues/10) Add relative url check to `Url` class.
+ [#8](https://github.com/nadar/crawler/issues/8) Merge the path of an url when only a query param is provided.

## 1.2.1 (17. December 2020)

Expand Down
7 changes: 4 additions & 3 deletions src/Job.php
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@ public function run(RequestResponse $requestResponse, Crawler $crawler)
foreach ($crawler->getParsers() as $parser) {
if ($parser->validateRequestResponse($requestResponse)) {
$parserResult = $parser->run($this, $requestResponse);

foreach ($parserResult->links as $url => $linkTitle) {
// create new url object for all found urls
$url = new Url($url);
$url->merge($crawler->baseUrl);

// merge the current url (which equals the referrer in this case)
// in order to ensure correct relative paths
$url->merge($this->url);
if ($url->isValid() && $crawler->baseUrl->sameHost($url)) {
$job = new Job($url, $this->url);
$crawler->push($job);
Expand Down
22 changes: 18 additions & 4 deletions src/Url.php
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,16 @@ public function isRelative()
}

/**
* If the current URL is missing informations, it cain obtain informations from the to merge url
*
* Will only merge the host and scheme of the current object with the provided url. Only if those informations are missing.
*
* If the current URL is missing information, it can obtain that information from the merge url.
*
* > By `current URL` it means the value from $this->url.
*
* The following parts will be merged:
*
* + host: If missing in current URL
* + scheme: If missing in current URL
* + path: If the current URL is a query parameter only, the path can be merged
*
* @param Url $url
* @return static
*/
Expand All @@ -193,6 +199,14 @@ public function merge(Url $url)
$this->parsed['scheme'] = $url->getScheme();
}

// if the url is relative and contains only a query param, the path should be merged
// from the url as well. This ensures urls like `?foo=bar` will be converted to the
// full url including its path, which is in most cases also the relative url.
// @see https://github.com/nadar/crawler/issues/8
if (empty($this->getPath()) && $this->isRelative() && !empty($this->getQuery())) {
$this->parsed['path'] = $url->getPath();
}

return $this;
}

Expand Down
9 changes: 9 additions & 0 deletions tests/UrlTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,13 @@ public function testIsRelative()
$this->assertTrue((new Url('/path-without-host'))->isRelative());
$this->assertTrue((new Url('path-without-host/base-path-info-required'))->isRelative());
}

public function testQueryParamOnly()
{
    // A URL made up of nothing but a query string is expected to be reported as relative.
    $queryOnly = new Url('?foo=bar');
    $this->assertTrue($queryOnly->isRelative());

    // Merging with an absolute URL should supply scheme, host and path,
    // while the original query string is kept.
    $currentPage = new Url('https://luya.io/current/path');
    $queryOnly->merge($currentPage);

    $this->assertSame('https://luya.io/current/path?foo=bar', $queryOnly->getNormalized());
}
}

0 comments on commit ab8787e

Please sign in to comment.