From 87335bb674a5531c4540f8709557122f419a4623 Mon Sep 17 00:00:00 2001 From: sengoku-f <31064651+sengoku-f@users.noreply.github.com> Date: Mon, 17 Mar 2025 11:26:08 +0800 Subject: [PATCH] url supports relative paths --- scraper/src/config/config_loader.py | 2 ++ scraper/src/strategies/default_strategy.py | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/scraper/src/config/config_loader.py b/scraper/src/config/config_loader.py index 6f81490..89d91ea 100644 --- a/scraper/src/config/config_loader.py +++ b/scraper/src/config/config_loader.py @@ -63,6 +63,8 @@ class ConfigLoader: nb_hits_max = 6000000 + relative_url = False + def __init__(self, config): data = self._load_config(config) diff --git a/scraper/src/strategies/default_strategy.py b/scraper/src/strategies/default_strategy.py index b5fb3f7..d645f9a 100644 --- a/scraper/src/strategies/default_strategy.py +++ b/scraper/src/strategies/default_strategy.py @@ -66,6 +66,19 @@ def get_records_from_dom(self, current_page_url=None): if self.dom is None: sys.exit('DefaultStrategy.dom is not defined') + # Convert absolute URL to relative path + original_url = current_page_url + if self.config.relative_url and original_url: + from urllib.parse import urlparse + parsed = urlparse(original_url) + # Construct a relative path (including path, parameters, and query parameters) + relative_url = parsed.path + if parsed.params: # Handle URL parameters (less common) + relative_url += ';' + parsed.params + if parsed.query: # Handle query parameters + relative_url += '?' + parsed.query + current_page_url = relative_url + # Reset it to be able to have a clean instance when testing self.global_content = {}