From 76a58c3c92516b61b864a74531ce7d5743c5e2a2 Mon Sep 17 00:00:00 2001 From: Basil Suter Date: Sun, 20 Dec 2020 07:49:13 +0000 Subject: [PATCH 1/2] merge path option --- CHANGELOG.md | 3 ++- src/Url.php | 22 ++++++++++++++++++---- tests/UrlTest.php | 9 +++++++++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5799dc8..d32065c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,10 @@ All notable changes to this project will be documented in this file. This project adheres to [Semantic Versioning](http://semver.org/). In order to read more about upgrading and BC breaks have a look at the [UPGRADE Document](UPGRADE.md). -## 1.3.0 +## 1.3.0 (20. December 2020) + [#10](https://github.com/nadar/crawler/issues/10) Add relative url check to `Url` class. ++ [#8](https://github.com/nadar/crawler/issues/8) Merge the path of an url when only a query param is provided. ## 1.2.1 (17. December 2020) diff --git a/src/Url.php b/src/Url.php index fa78f5e..32ce53b 100644 --- a/src/Url.php +++ b/src/Url.php @@ -176,10 +176,16 @@ public function isRelative() } /** - * If the current URL is missing informations, it cain obtain informations from the to merge url - * - * Will only merge the host and scheme of the current object with the provided url. Only if those informations are missing. - * + * If the current URL is missing information, it can obtain information from the merge url. + * + * > By `current URL` it means the value from $this->url. + * + * The following parts will be merged: + * + * + host: If missing in current URL + * + scheme: If missing in current URL + * + path: If the current URL is a query parameter only, the path can be merged + * * @param Url $url * @return static */ @@ -193,6 +199,14 @@ public function merge(Url $url) $this->parsed['scheme'] = $url->getScheme(); } + // if the url is relative and contains only a query param, the path should be merged + // from the url as well. 
This ensures urls like `?foo=bar` will be converted to the + // full url including its path, which is in most cases also the relative url. + // @see https://github.com/nadar/crawler/issues/8 + if (empty($this->getPath()) && $this->isRelative() && !empty($this->getQuery())) { + $this->parsed['path'] = $url->getPath(); + } + return $this; } diff --git a/tests/UrlTest.php b/tests/UrlTest.php index b38749d..401d1cd 100644 --- a/tests/UrlTest.php +++ b/tests/UrlTest.php @@ -83,4 +83,13 @@ public function testIsRelative() $this->assertTrue((new Url('/path-without-host'))->isRelative()); $this->assertTrue((new Url('path-without-host/base-path-info-required'))->isRelative()); } + + public function testQueryParamOnly() + { + $url = new Url('?foo=bar'); + $this->assertTrue($url->isRelative()); + + $url->merge(new Url('https://luya.io/current/path')); + $this->assertSame('https://luya.io/current/path?foo=bar', $url->getNormalized()); + } } From 566e10497b92edee0a0725dcc1c3e58abeb2000f Mon Sep 17 00:00:00 2001 From: Basil Suter Date: Sun, 20 Dec 2020 07:52:51 +0000 Subject: [PATCH 2/2] ensure correct relative paths --- src/Job.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Job.php b/src/Job.php index e5822a2..5a99111 100644 --- a/src/Job.php +++ b/src/Job.php @@ -79,11 +79,12 @@ public function run(RequestResponse $requestResponse, Crawler $crawler) foreach ($crawler->getParsers() as $parser) { if ($parser->validateRequestResponse($requestResponse)) { $parserResult = $parser->run($this, $requestResponse); - foreach ($parserResult->links as $url => $linkTitle) { + // create new url object for all found urls $url = new Url($url); - $url->merge($crawler->baseUrl); - + // merge the current url (which equals the referrer in this case) + // in order to ensure correct relative paths + $url->merge($this->url); if ($url->isValid() && $crawler->baseUrl->sameHost($url)) { $job = new Job($url, $this->url); $crawler->push($job);