From 8a85d8288760a4d8d05d807cf2b19218a3bd7ceb Mon Sep 17 00:00:00 2001 From: Colin Emonds Date: Wed, 13 Sep 2023 10:01:09 +0200 Subject: [PATCH] feature #472: filter based on GMaps distance This PR involves some refactoring. The problem is that previously, filters ran before making any calls to external APIs. Therefore, just adding another filter for the distance doesn't actually work: the distance information is not yet available when we apply the filters. We can't just run the filters later, because then we would run the Google Maps API calls before we filtered out any properties, meaning that we would incur Google Maps API calls for all properties that we find in our search, including those that we later filter out anyway based on price, size, etc. - and we actually have to pay for those requests! My solution is to group the filters in two chains, and then run one chain before and one after external API calls have been made. This way, we can run the distance filter after the API calls are made, but keep everything else the same. --- flathunter/config.py | 28 +++++--- flathunter/dataclasses.py | 63 ++++++++++++++++++ flathunter/filter.py | 88 +++++++++++++++++++------- flathunter/gmaps_duration_processor.py | 44 ++++--------- flathunter/hunter.py | 24 +++++-- flathunter/web/views.py | 8 ++- flathunter/web_hunter.py | 23 ++++--- test/test_config.py | 14 ++-- test/test_gmaps_duration_processor.py | 41 ++++++++++-- test/test_googlecloud_idmaintainer.py | 8 ++- test/test_id_maintainer.py | 8 ++- 11 files changed, 249 insertions(+), 100 deletions(-) create mode 100644 flathunter/dataclasses.py diff --git a/flathunter/config.py b/flathunter/config.py index caab9e6a..bfe12c5a 100644 --- a/flathunter/config.py +++ b/flathunter/config.py @@ -1,6 +1,6 @@ """Wrap configuration options as an object""" import os -from typing import Optional, Dict, Any +from typing import List, Optional, Dict, Any import json import yaml @@ -17,7 +17,8 @@ from flathunter.crawler.meinestadt import MeineStadt from flathunter.crawler.wggesucht import WgGesucht from flathunter.crawler.subito import Subito -from flathunter.filter import Filter +from flathunter.dataclasses import DistanceConfig +from flathunter.gmaps_duration_processor import TransportationModes from flathunter.logging import logger from flathunter.exceptions import ConfigException @@ -170,12 +171,6 @@ def searchers(self): """Get the list of search plugins""" return self.__searchers__ - def get_filter(self): - """Read the configured filter""" - builder = Filter.builder() - builder.read_config(self) - return builder.build() - def captcha_enabled(self): """Check if captcha is configured""" return self._get_captcha_solver() is not None @@ -352,6 +347,23 @@ def max_price_per_square(self): """Return the configured maximum price per square meter""" return self._get_filter_config("max_price_per_square") + def max_distance(self) -> List[DistanceConfig] | None: + """Return the configured maximum distance to locations.""" + config = self._get_filter_config("max_distance") + if config is None: + return None + out = [] + for distance_filter_item in config: + out.append( + DistanceConfig( + location_name=distance_filter_item['location_name'], + transport_mode=TransportationModes(distance_filter_item['transportation_mode']), + max_distance_meters=distance_filter_item.get('max_distance_meters'), + max_duration_seconds=distance_filter_item.get('max_duration_seconds') + ) + ) + return out + def __repr__(self): return json.dumps({ "captcha_enabled": self.captcha_enabled(), diff --git a/flathunter/dataclasses.py b/flathunter/dataclasses.py new file mode 100644 index 00000000..34978189 --- /dev/null +++ b/flathunter/dataclasses.py @@ -0,0 +1,63 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Optional + + +class TransportationModes(Enum): + """The transportation mode for Google Maps distance calculation.""" + TRANSIT = 'transit' + BICYCLING = 'bicycling' + DRIVING = 'driving' + WALKING = 'walking' + + +@dataclass +class DistanceValueTuple: + """We want to keep both the numeric value of a distance, and its string representation.""" + meters: float + text: str + + +@dataclass +class DurationValueTuple: + """We want to keep both the numeric value of a duration, and its string representation.""" + seconds: float + text: str + + +@dataclass +class DistanceElement: + """Represents the distance from a property to some location.""" + duration: DurationValueTuple + distance: DistanceValueTuple + mode: TransportationModes + + +@dataclass +class DistanceConfig: + """Represents distance filter information in the configuration file. + + location_name must refer to the location name used to identify the location + in the durations section of the config file, and the transport_mode must be + configured in the durations section for that location name, lest no information + is available to actually filter on.""" + location_name: str + transport_mode: TransportationModes + max_distance_meters: Optional[float] + max_duration_seconds: Optional[float] + + +class FilterChainName(Enum): + """Identifies the filter chain that a filter acts on + + Preprocess filters will be run before the expose is processed by any further actions. + Use this chain to filter exposes that can be excluded based on information scraped + from the expose website alone (such as based on price or size). + Postprocess filters will be run after other actions have completed. Use this if you + require additional information from other steps, such as information from the Google + Maps API, to make a decision on this expose. + + We separate the filter chains to avoid making expensive (literally!) calls to the + Google Maps API for exposes that we already know we aren't interested in anyway.""" + preprocess = 'PREPROCESS' + postprocess = 'POSTPROCESS' diff --git a/flathunter/filter.py b/flathunter/filter.py index bc1b4979..2fb442d9 100644 --- a/flathunter/filter.py +++ b/flathunter/filter.py @@ -1,8 +1,12 @@ """Module with implementations of standard expose filters""" -from functools import reduce import re from abc import ABC, ABCMeta -from typing import List, Any +from typing import List, Any, Dict + +from flathunter.config import DistanceConfig +from flathunter.dataclasses import FilterChainName +from flathunter.gmaps_duration_processor import DistanceElement +from flathunter.logging import logger class AbstractFilter(ABC): @@ -172,30 +176,65 @@ def is_interesting(self, expose): return pps <= self.max_pps -class FilterBuilder: +class DistanceFilter(AbstractFilter): + """Exclude properties based on distance or duration to a location + + This must be in the post-processing filter chain, as it requires data + from the Google Maps API, which is not available right after scraping.""" + + distance_config: DistanceConfig + + def __init__(self, distance_config: DistanceConfig): + self.distance_config = distance_config + + def is_interesting(self, expose): + durations: Dict[str, DistanceElement] = expose.get('durations_unformatted', None) + if durations is None or self.distance_config.location_name not in durations: + logger.info('DurationFilter is enabled, but no GMaps data found. Skipping filter.') + return True + distance = durations[self.distance_config.location_name].distance.meters + duration = durations[self.distance_config.location_name].duration.seconds + out = True + if self.distance_config.max_distance_meters: + out &= distance < self.distance_config.max_distance_meters + if self.distance_config.max_duration_seconds: + out &= duration < self.distance_config.max_duration_seconds + return out + + +class FilterChainBuilder: """Construct a filter chain""" filters: List[AbstractFilter] def __init__(self): self.filters = [] - def _append_filter_if_not_empty(self, filter_class: ABCMeta, filter_config: Any): + def _append_filter_if_not_empty( + self, + filter_class: ABCMeta, + filter_config: Any): """Appends a filter to the list if its configuration is set""" if not filter_config: return self.filters.append(filter_class(filter_config)) - def read_config(self, config): + def read_config(self, config, filter_chain: FilterChainName): """Adds filters from a config dictionary""" - self._append_filter_if_not_empty(TitleFilter, config.excluded_titles()) - self._append_filter_if_not_empty(MinPriceFilter, config.min_price()) - self._append_filter_if_not_empty(MaxPriceFilter, config.max_price()) - self._append_filter_if_not_empty(MinSizeFilter, config.min_size()) - self._append_filter_if_not_empty(MaxSizeFilter, config.max_size()) - self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms()) - self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms()) - self._append_filter_if_not_empty( - PPSFilter, config.max_price_per_square()) + if filter_chain == FilterChainName.preprocess: + self._append_filter_if_not_empty(TitleFilter, config.excluded_titles()) + self._append_filter_if_not_empty(MinPriceFilter, config.min_price()) + self._append_filter_if_not_empty(MaxPriceFilter, config.max_price()) + self._append_filter_if_not_empty(MinSizeFilter, config.min_size()) + self._append_filter_if_not_empty(MaxSizeFilter, config.max_size()) + self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms()) + self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms()) + self._append_filter_if_not_empty( + PPSFilter, config.max_price_per_square()) + elif filter_chain == FilterChainName.postprocess: + for df in config.max_distance(): + self._append_filter_if_not_empty(DistanceFilter, df) + else: + raise NotImplementedError() return self def filter_already_seen(self, id_watch): @@ -204,12 +243,12 @@ def filter_already_seen(self, id_watch): return self def build(self): - """Return the compiled filter""" - return Filter(self.filters) + """Return the compiled filter chain""" + return FilterChain(self.filters) -class Filter: - """Abstract filter object""" +class FilterChain: + """Collection of expose filters in use by a hunter instance""" filters: List[AbstractFilter] @@ -218,14 +257,17 @@ def __init__(self, filters: List[AbstractFilter]): def is_interesting_expose(self, expose): """Apply all filters to this expose""" - return reduce((lambda x, y: x and y), - map((lambda x: x.is_interesting(expose)), self.filters), True) - + for filter_ in self.filters: + if not filter_.is_interesting(expose): + return False + return True + def filter(self, exposes): """Apply all filters to every expose in the list""" return filter(self.is_interesting_expose, exposes) @staticmethod def builder(): - """Return a new filter builder""" - return FilterBuilder() + """Return a new filter chain builder""" + return FilterChainBuilder() + diff --git a/flathunter/gmaps_duration_processor.py b/flathunter/gmaps_duration_processor.py index 0037c072..405f1973 100644 --- a/flathunter/gmaps_duration_processor.py +++ b/flathunter/gmaps_duration_processor.py @@ -3,35 +3,16 @@ import time from urllib.parse import quote_plus from typing import Dict -from dataclasses import dataclass import requests +from flathunter.dataclasses import DistanceElement, DistanceValueTuple, DurationValueTuple, TransportationModes from flathunter.logging import logger from flathunter.abstract_processor import Processor -@dataclass -class TextValueTuple: - """We want to keep both what we parsed, and its numeric value.""" - value: float - text: str - - -@dataclass -class DistanceElement: - """Represents the distance from a property to some location.""" - duration: TextValueTuple - distance: TextValueTuple - mode: str - - class GMapsDurationProcessor(Processor): """Implementation of Processor class to calculate travel durations""" - GM_MODE_TRANSIT = 'transit' - GM_MODE_BICYCLE = 'bicycling' - GM_MODE_DRIVING = 'driving' - def __init__(self, config): self.config = config @@ -54,7 +35,7 @@ def get_distances_and_durations(self, address) -> Dict[str, DistanceElement]: for mode in duration.get('modes', []): if 'gm_id' in mode and 'title' in mode \ and 'key' in self.config.get('google_maps_api', {}): - duration = self.get_gmaps_distance(address, dest, mode['gm_id']) + duration = self._get_gmaps_distance(address, dest, mode['gm_id']) out[name] = duration return out @@ -62,14 +43,14 @@ def get_formatted_durations(self, address): """Return a formatted list of GoogleMaps durations""" durations = self.get_distances_and_durations(address) return self._format_durations(durations) - + def _format_durations(self, durations: Dict[str, DistanceElement]): out = "" for location_name, val in durations.items(): - out += f"> {location_name} ({val.mode}): {val.duration.text} ({val.distance.text})\n" + out += f"> {location_name} ({val.mode.value}): {val.duration.text} ({val.distance.text})\n" return out.strip() - def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: + def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement | None: """Get the distance""" # get timestamp for next monday at 9:00:00 o'clock now = datetime.datetime.today().replace(hour=9, minute=0, second=0) @@ -85,11 +66,10 @@ def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: base_url = self.config.get('google_maps_api', {}).get('url') gm_key = self.config.get('google_maps_api', {}).get('key') - if not gm_key and mode != self.GM_MODE_DRIVING: + if not gm_key and mode != TransportationModes.DRIVING: logger.warning("No Google Maps API key configured and without using a mode " - "different from 'driving' is not allowed. " - "Downgrading to mode 'drinving' thus. ") - mode = 'driving' + "different from 'driving' is not allowed. Thus downgrading to mode 'driving'.") + mode = TransportationModes.DRIVING base_url = base_url.replace('&key={key}', '') # retrieve the result @@ -114,13 +94,13 @@ def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: element['duration']['text'], element['duration']['value']) distance_element = DistanceElement( - duration=TextValueTuple( + duration=DurationValueTuple( float(element['duration']['value']), element['duration']['text']), - distance=TextValueTuple( + distance=DistanceValueTuple( float(element['distance']['value']), element['distance']['text']), - mode=mode + mode=TransportationModes(mode) ) - distances[distance_element.distance.value] = distance_element + distances[distance_element.distance.meters] = distance_element return distances[min(distances.keys())] if distances else None diff --git a/flathunter/hunter.py b/flathunter/hunter.py index 692b9996..998c122d 100644 --- a/flathunter/hunter.py +++ b/flathunter/hunter.py @@ -5,10 +5,11 @@ from flathunter.logging import logger from flathunter.config import YamlConfig -from flathunter.filter import Filter +from flathunter.filter import FilterChain from flathunter.processor import ProcessorChain from flathunter.captcha.captcha_solver import CaptchaUnsolvableError from flathunter.exceptions import ConfigException +from flathunter.dataclasses import FilterChainName class Hunter: """Basic methods for crawling and processing / filtering exposes""" @@ -38,16 +39,14 @@ def try_crawl(searcher, url, max_pages): def hunt_flats(self, max_pages=None): """Crawl, process and filter exposes""" - filter_set = Filter.builder() \ - .read_config(self.config) \ - .filter_already_seen(self.id_watch) \ - .build() - + preprocess_filter_chain = self._build_preprocess_filter_chain(self.config) + postprocess_filter_chain = self._build_postprocess_filter_chain(self.config) processor_chain = ProcessorChain.builder(self.config) \ .save_all_exposes(self.id_watch) \ - .apply_filter(filter_set) \ + .apply_filter(preprocess_filter_chain) \ .resolve_addresses() \ .calculate_durations() \ + .apply_filter(postprocess_filter_chain) \ .send_messages() \ .build() @@ -58,3 +57,14 @@ def hunt_flats(self, max_pages=None): result.append(expose) return result + + def _build_preprocess_filter_chain(self, config) -> FilterChain: + return FilterChain.builder() \ + .read_config(config, FilterChainName.preprocess) \ + .filter_already_seen(self.id_watch) \ + .build() + + def _build_postprocess_filter_chain(self, config) -> FilterChain: + return FilterChain.builder() \ + .read_config(config, FilterChainName.postprocess) \ + .build() diff --git a/flathunter/web/views.py b/flathunter/web/views.py index 5a5926a0..6b1732e7 100644 --- a/flathunter/web/views.py +++ b/flathunter/web/views.py @@ -6,10 +6,11 @@ from flask import render_template, jsonify, request, session, redirect from flask_api import status +from flathunter.dataclasses import FilterChainName from flathunter.web import app, log from flathunter.web.util import sanitize_float -from flathunter.filter import FilterBuilder +from flathunter.filter import FilterChainBuilder from flathunter.config import YamlConfig class AuthenticationError(Exception): @@ -72,7 +73,10 @@ def filter_for_user(): """Load the filter for the current user""" if filter_values_for_user() is None: return None - return FilterBuilder().read_config(YamlConfig({'filters': filter_values_for_user()})).build() + return (FilterChainBuilder() + .read_config(YamlConfig({'filters': filter_values_for_user()}), FilterChainName.preprocess) + .read_config(YamlConfig({'filters': filter_values_for_user()}), FilterChainName.postprocess) + .build()) def form_filter_values(): """Extract the filter settings from the submitted form""" diff --git a/flathunter/web_hunter.py b/flathunter/web_hunter.py index 41bab501..77d30812 100644 --- a/flathunter/web_hunter.py +++ b/flathunter/web_hunter.py @@ -2,7 +2,6 @@ from flathunter.config import YamlConfig from flathunter.logging import logger from flathunter.hunter import Hunter -from flathunter.filter import Filter from flathunter.processor import ProcessorChain from flathunter.exceptions import BotBlockedException, UserDeactivatedException @@ -13,17 +12,19 @@ class WebHunter(Hunter): def hunt_flats(self, max_pages=1): """Crawl all URLs, and send notifications to users of new flats""" - filter_set = Filter.builder() \ - .read_config(self.config) \ - .filter_already_seen(self.id_watch) \ - .build() - + preprocess_filter_chain = self._build_preprocess_filter_chain(self.config) + postprocess_filter_chain = self._build_postprocess_filter_chain(self.config) + # note: we have to save all exposes *after* applying the processors because + # the exposes later get loaded from disk to then be filtered again, so we need + # the additional information from the processor lest the postprocess chain breaks + # due to missing data processor_chain = ProcessorChain.builder(self.config) \ - .apply_filter(filter_set) \ + .apply_filter(preprocess_filter_chain) \ .crawl_expose_details() \ - .save_all_exposes(self.id_watch) \ .resolve_addresses() \ .calculate_durations() \ + .save_all_exposes(self.id_watch) \ + .apply_filter(postprocess_filter_chain) \ .send_messages() \ .build() @@ -34,10 +35,12 @@ def hunt_flats(self, max_pages=1): for (user_id, settings) in self.id_watch.get_user_settings(): if 'mute_notifications' in settings: continue - filter_set = Filter.builder().read_config(YamlConfig(settings)).build() + preprocess_filter_chain = self._build_preprocess_filter_chain(YamlConfig(settings)) + postprocess_filter_chain = self._build_postprocess_filter_chain(YamlConfig(settings)) try: processor_chain = ProcessorChain.builder(self.config) \ - .apply_filter(filter_set) \ + .apply_filter(preprocess_filter_chain) \ + .apply_filter(postprocess_filter_chain) \ .send_messages([user_id]) \ .build() for message in processor_chain.process(new_exposes): diff --git a/test/test_config.py b/test/test_config.py index 6cf821fa..e02d464b 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -71,22 +71,20 @@ def test_loads_config_at_file(self): def test_loads_config_from_string(self): config = StringConfig(string=self.EMPTY_FILTERS_CONFIG) - self.assertIsNotNone(config) - my_filter = config.get_filter() - self.assertIsNotNone(my_filter) + target_urls = config.target_urls() + self.assertIsNotNone(target_urls) def test_loads_legacy_config_from_string(self): config = StringConfig(string=self.LEGACY_FILTERS_CONFIG) - self.assertIsNotNone(config) - my_filter = config.get_filter() + my_filter = config.excluded_titles() self.assertIsNotNone(my_filter) - self.assertTrue(len(my_filter.filters) > 0) + self.assertTrue(len(my_filter) > 0) def test_loads_filters_config_from_string(self): config = StringConfig(string=self.FILTERS_CONFIG) - self.assertIsNotNone(config) - my_filter = config.get_filter() + my_filter = config.excluded_titles() self.assertIsNotNone(my_filter) + self.assertTrue(len(my_filter) > 0) def test_defaults_fields(self): config = StringConfig(string=self.FILTERS_CONFIG) diff --git a/test/test_gmaps_duration_processor.py b/test/test_gmaps_duration_processor.py index a35b1cdf..15a780ea 100644 --- a/test/test_gmaps_duration_processor.py +++ b/test/test_gmaps_duration_processor.py @@ -1,6 +1,7 @@ +from typing import Dict import unittest -import yaml import re +from flathunter.gmaps_duration_processor import DistanceElement import requests_mock from flathunter.hunter import Hunter from flathunter.idmaintainer import IdMaintainer @@ -33,6 +34,20 @@ class GMapsDurationProcessorTest(unittest.TestCase): - gm_id: driving title: Car """ + DUMMY_RESPONSE = """ + { + "status": "OK", + "rows": [ + { + "elements": [ + { + "distance": { "text": "228 mi", "value": 367654 }, + "duration": { "text": "3 hours 55 mins", "value": 14078 } + } + ] + } + ] + }""" @requests_mock.Mocker() def test_resolve_durations(self, m): @@ -40,11 +55,25 @@ def test_resolve_durations(self, m): config.set_searchers([DummyCrawler()]) hunter = Hunter(config, IdMaintainer(":memory:")) matcher = re.compile('maps.googleapis.com/maps/api/distancematrix/json') - m.get(matcher, text='{"status": "OK", "rows": [ { "elements": [ { "distance": { "text": "far", "value": 123 }, "duration": { "text": "days", "value": 123 } } ] } ]}') + m.get(matcher, text=self.DUMMY_RESPONSE) exposes = hunter.hunt_flats() self.assertTrue(count(exposes) > 4, "Expected to find exposes") without_durations = list(filter(lambda expose: 'durations' not in expose, exposes)) - if len(without_durations) > 0: - for expose in without_durations: - print("Got expose: ", expose) - self.assertTrue(len(without_durations) == 0, "Expected durations to be calculated") \ No newline at end of file + for expose in without_durations: + print("Got expose: ", expose) + self.assertTrue(len(without_durations) == 0, "Expected durations to be calculated") + + @requests_mock.Mocker() + def test_durations_valid(self, m): + config = StringConfig(string=self.DUMMY_CONFIG) + config.set_searchers([DummyCrawler()]) + hunter = Hunter(config, IdMaintainer(":memory:")) + matcher = re.compile('maps.googleapis.com/maps/api/distancematrix/json') + m.get(matcher, text=self.DUMMY_RESPONSE) + exposes = hunter.hunt_flats() + for expose in exposes: + durations: Dict[str, DistanceElement] = expose['durations_unformatted'] + for duration in durations.values(): + self.assertAlmostEqual(duration.distance.meters, 367654) + self.assertAlmostEqual(duration.duration.seconds, 14078) + self.assertEqual(duration.duration.text, '3 hours 55 mins') diff --git a/test/test_googlecloud_idmaintainer.py b/test/test_googlecloud_idmaintainer.py index b3788e9a..d430d6a9 100644 --- a/test/test_googlecloud_idmaintainer.py +++ b/test/test_googlecloud_idmaintainer.py @@ -7,7 +7,8 @@ from flathunter.googlecloud_idmaintainer import GoogleCloudIdMaintainer from flathunter.hunter import Hunter from flathunter.web_hunter import WebHunter -from flathunter.filter import Filter +from flathunter.filter import FilterChain +from flathunter.dataclasses import FilterChainName from test.dummy_crawler import DummyCrawler from test.test_util import count from test.utils.config import StringConfig @@ -94,7 +95,10 @@ def test_exposes_are_returned_filtered(id_watch): hunter = Hunter(config, id_watch) hunter.hunt_flats() hunter.hunt_flats() - filter = Filter.builder().read_config(StringConfig('{"filters":{"max_size":70}}')).build() + filter = (FilterChain + .builder() + .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.preprocess) + .build()) saved = id_watch.get_recent_exposes(10, filter_set=filter) assert len(saved) == 10 for expose in saved: diff --git a/test/test_id_maintainer.py b/test/test_id_maintainer.py index 2d4cdc75..9a84bbf3 100644 --- a/test/test_id_maintainer.py +++ b/test/test_id_maintainer.py @@ -6,7 +6,8 @@ from flathunter.idmaintainer import IdMaintainer from flathunter.hunter import Hunter from flathunter.web_hunter import WebHunter -from flathunter.filter import Filter +from flathunter.filter import FilterChain +from flathunter.dataclasses import FilterChainName from test.dummy_crawler import DummyCrawler from test.test_util import count from test.utils.config import StringConfig @@ -111,7 +112,10 @@ def test_exposes_are_returned_filtered(): hunter = Hunter(config, id_watch) hunter.hunt_flats() hunter.hunt_flats() - filter = Filter.builder().read_config(StringConfig('{"filters":{"max_size":70}}')).build() + filter = (FilterChain + .builder() + .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.preprocess) + .build()) saved = id_watch.get_recent_exposes(10, filter_set=filter) assert len(saved) == 10 for expose in saved: