diff --git a/scraper/src/config/config_loader.py b/scraper/src/config/config_loader.py index 6f81490..89d91ea 100644 --- a/scraper/src/config/config_loader.py +++ b/scraper/src/config/config_loader.py @@ -63,6 +63,8 @@ class ConfigLoader: nb_hits_max = 6000000 + relative_url = False + def __init__(self, config): data = self._load_config(config) diff --git a/scraper/src/strategies/default_strategy.py b/scraper/src/strategies/default_strategy.py index b5fb3f7..d645f9a 100644 --- a/scraper/src/strategies/default_strategy.py +++ b/scraper/src/strategies/default_strategy.py @@ -66,6 +66,19 @@ def get_records_from_dom(self, current_page_url=None): if self.dom is None: sys.exit('DefaultStrategy.dom is not defined') + # Convert absolute URL to relative path + original_url = current_page_url + if self.config.relative_url and original_url: + from urllib.parse import urlparse + parsed = urlparse(original_url) + # Construct a relative path (including path, parameters, and query parameters) + relative_url = parsed.path + if parsed.params: # Handle URL parameters (less common) + relative_url += ';' + parsed.params + if parsed.query: # Handle query parameters + relative_url += '?' + parsed.query + current_page_url = relative_url + # Reset it to be able to have a clean instance when testing self.global_content = {}