Skip to content

Commit

Permalink
feature flathunters#472: filter based on GMaps distance
Browse files Browse the repository at this point in the history
This PR involves some refactoring. The problem is that previously,
filters ran before making any calls to external APIs. Therefore, just
adding another filter for the distance doesn't actually work: the
distance information is not yet available when we apply the filters. We
can't just run the filters later, because then we would run the Google
Maps API calls before we filtered out any properties, meaning that we
would incur Google Maps API calls for all properties that we find in our
search, including those that we later filter out anyway based on price,
size, etc. - and we actually have to pay for those requests! My solution
is to group the filters in two chains, and then run one chain before and
one after external API calls have been made. This way, we can run the
distance filter after the API calls are made, but keep everything else
the same.
  • Loading branch information
colinemondswieprecht committed Sep 13, 2023
1 parent 1c5f36b commit 8a85d82
Show file tree
Hide file tree
Showing 11 changed files with 249 additions and 100 deletions.
28 changes: 20 additions & 8 deletions flathunter/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Wrap configuration options as an object"""
import os
from typing import Optional, Dict, Any
from typing import List, Optional, Dict, Any

import json
import yaml
Expand All @@ -17,7 +17,8 @@
from flathunter.crawler.meinestadt import MeineStadt
from flathunter.crawler.wggesucht import WgGesucht
from flathunter.crawler.subito import Subito
from flathunter.filter import Filter
from flathunter.dataclasses import DistanceConfig
from flathunter.gmaps_duration_processor import TransportationModes
from flathunter.logging import logger
from flathunter.exceptions import ConfigException

Expand Down Expand Up @@ -170,12 +171,6 @@ def searchers(self):
"""Get the list of search plugins"""
return self.__searchers__

def get_filter(self):
"""Read the configured filter"""
builder = Filter.builder()
builder.read_config(self)
return builder.build()

def captcha_enabled(self):
    """Tell whether a captcha solver is available in this configuration"""
    solver = self._get_captcha_solver()
    return solver is not None
Expand Down Expand Up @@ -352,6 +347,23 @@ def max_price_per_square(self):
"""Return the configured maximum price per square meter"""
return self._get_filter_config("max_price_per_square")

def max_distance(self) -> Optional[List[DistanceConfig]]:
    """Return the configured maximum distances to locations.

    Reads the "max_distance" filter setting and converts every item into a
    DistanceConfig. Each item must provide 'location_name' and
    'transportation_mode'; 'max_distance_meters' and 'max_duration_seconds'
    are optional and become None when absent.

    Returns:
        A list of DistanceConfig objects, or None if "max_distance" is not
        configured.

    Raises:
        KeyError: if an item lacks 'location_name' or 'transportation_mode'.
        ValueError: if 'transportation_mode' is not a valid
            TransportationModes value.
    """
    # Optional[...] instead of "X | None": the union operator on typing
    # generics is only available from Python 3.10 and is evaluated when the
    # method is defined.
    config = self._get_filter_config("max_distance")
    if config is None:
        return None
    return [
        DistanceConfig(
            location_name=item['location_name'],
            transport_mode=TransportationModes(item['transportation_mode']),
            max_distance_meters=item.get('max_distance_meters'),
            max_duration_seconds=item.get('max_duration_seconds'),
        )
        for item in config
    ]

def __repr__(self):
return json.dumps({
"captcha_enabled": self.captcha_enabled(),
Expand Down
63 changes: 63 additions & 0 deletions flathunter/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class TransportationModes(Enum):
    """Means of transport that can be requested for a Google Maps distance lookup."""

    TRANSIT = 'transit'
    BICYCLING = 'bicycling'
    DRIVING = 'driving'
    WALKING = 'walking'


@dataclass
class DistanceValueTuple:
    """A distance, stored both as a number of meters and as display text."""

    meters: float
    text: str


@dataclass
class DurationValueTuple:
    """A duration, stored both as a number of seconds and as display text."""

    seconds: float
    text: str


@dataclass
class DistanceElement:
    """How far a property is from one location, for one transportation mode."""

    # Travel time to the location
    duration: DurationValueTuple
    # Travel distance to the location
    distance: DistanceValueTuple
    # Transportation mode the two values above apply to
    mode: TransportationModes


@dataclass
class DistanceConfig:
    """Represents one distance filter entry of the configuration file.

    location_name must refer to the location name used to identify the
    location in the durations section of the config file, and the
    transport_mode must be configured in the durations section for that
    location name; otherwise no information is available to filter on.

    Both limits are optional and default to None, so an entry may restrict
    only the distance, only the duration, or both.
    """

    location_name: str
    transport_mode: TransportationModes
    # Upper bound for the travel distance in meters; None means "no limit"
    max_distance_meters: Optional[float] = None
    # Upper bound for the travel time in seconds; None means "no limit"
    max_duration_seconds: Optional[float] = None


class FilterChainName(Enum):
    """Names the filter chain that a filter belongs to.

    preprocess: filters run before the expose is handed to any further
    processing step. Put filters here that only need information scraped
    from the expose website itself (for example price or size).

    postprocess: filters run after the other processing steps have finished.
    Put filters here that depend on data added by those steps, such as the
    results of the Google Maps API.

    The chains are kept apart so that no expensive (literally!) Google Maps
    API calls are made for exposes that are already known to be
    uninteresting.
    """

    preprocess = 'PREPROCESS'
    postprocess = 'POSTPROCESS'
88 changes: 65 additions & 23 deletions flathunter/filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""Module with implementations of standard expose filters"""
from functools import reduce
import re
from abc import ABC, ABCMeta
from typing import List, Any
from typing import List, Any, Dict

from flathunter.config import DistanceConfig
from flathunter.dataclasses import FilterChainName
from flathunter.gmaps_duration_processor import DistanceElement
from flathunter.logging import logger


class AbstractFilter(ABC):
Expand Down Expand Up @@ -172,30 +176,65 @@ def is_interesting(self, expose):
return pps <= self.max_pps


class FilterBuilder:
class DistanceFilter(AbstractFilter):
    """Exclude properties based on distance or duration to a location.

    This must be in the post-processing filter chain, as it requires data
    from the Google Maps API, which is not available right after scraping.
    """

    distance_config: DistanceConfig

    def __init__(self, distance_config: DistanceConfig):
        self.distance_config = distance_config

    def is_interesting(self, expose):
        """Check the expose against the configured distance and duration limits.

        Returns True when the expose is below every configured limit, and
        also when no distance data is available for the configured location
        (the filter is then skipped rather than rejecting the expose).
        Returns False as soon as one configured limit is not met.
        """
        durations: Dict[str, DistanceElement] = expose.get('durations_unformatted')
        location_name = self.distance_config.location_name
        # An entry can be present but None: the GMaps lookup is typed as
        # returning "DistanceElement | None", so treat None like missing data
        # instead of failing on attribute access.
        element = durations.get(location_name) if durations else None
        if element is None:
            logger.info('DistanceFilter is enabled, but no GMaps data found. Skipping filter.')
            return True

        # A falsy limit (None or 0) means the limit is not configured.
        max_distance = self.distance_config.max_distance_meters
        if max_distance and not element.distance.meters < max_distance:
            return False
        max_duration = self.distance_config.max_duration_seconds
        if max_duration and not element.duration.seconds < max_duration:
            return False
        return True


class FilterChainBuilder:
"""Construct a filter chain"""
filters: List[AbstractFilter]

def __init__(self):
    """Start out with no filters registered"""
    self.filters = []

def _append_filter_if_not_empty(self, filter_class: ABCMeta, filter_config: Any):
def _append_filter_if_not_empty(
self,
filter_class: ABCMeta,
filter_config: Any):
"""Appends a filter to the list if its configuration is set"""
if not filter_config:
return
self.filters.append(filter_class(filter_config))

def read_config(self, config, filter_chain: FilterChainName):
    """Add the filters that the config enables for the given filter chain.

    For FilterChainName.preprocess, the title, price, size, room and
    price-per-square-meter filters are added, each only if its setting is
    configured. For FilterChainName.postprocess, one DistanceFilter is added
    per configured max_distance entry.

    Returns:
        This builder, so that calls can be chained.

    Raises:
        NotImplementedError: if filter_chain is neither of the two chains
            above.
    """
    if filter_chain == FilterChainName.preprocess:
        self._append_filter_if_not_empty(TitleFilter, config.excluded_titles())
        self._append_filter_if_not_empty(MinPriceFilter, config.min_price())
        self._append_filter_if_not_empty(MaxPriceFilter, config.max_price())
        self._append_filter_if_not_empty(MinSizeFilter, config.min_size())
        self._append_filter_if_not_empty(MaxSizeFilter, config.max_size())
        self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms())
        self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms())
        self._append_filter_if_not_empty(
            PPSFilter, config.max_price_per_square())
    elif filter_chain == FilterChainName.postprocess:
        # max_distance() returns None when no distance filter is configured;
        # iterating None would raise TypeError, so fall back to no entries.
        for distance_config in config.max_distance() or []:
            self._append_filter_if_not_empty(DistanceFilter, distance_config)
    else:
        raise NotImplementedError(f"Unknown filter chain: {filter_chain!r}")
    return self

def filter_already_seen(self, id_watch):
Expand All @@ -204,12 +243,12 @@ def filter_already_seen(self, id_watch):
return self

def build(self):
    """Wrap the filters collected so far in a FilterChain"""
    chain = FilterChain(self.filters)
    return chain


class Filter:
"""Abstract filter object"""
class FilterChain:
"""Collection of expose filters in use by a hunter instance"""

filters: List[AbstractFilter]

Expand All @@ -218,14 +257,17 @@ def __init__(self, filters: List[AbstractFilter]):

def is_interesting_expose(self, expose):
    """Check the expose against every filter, stopping at the first rejection"""
    return all(candidate.is_interesting(expose) for candidate in self.filters)

def filter(self, exposes):
    """Lazily keep only those exposes that pass every filter of this chain"""
    keep = self.is_interesting_expose
    return filter(keep, exposes)

@staticmethod
def builder():
    """Create an empty FilterChainBuilder to assemble a filter chain with"""
    return FilterChainBuilder()

44 changes: 12 additions & 32 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,16 @@
import time
from urllib.parse import quote_plus
from typing import Dict
from dataclasses import dataclass
import requests
from flathunter.dataclasses import DistanceElement, DistanceValueTuple, DurationValueTuple, TransportationModes

from flathunter.logging import logger
from flathunter.abstract_processor import Processor


@dataclass
class TextValueTuple:
"""We want to keep both what we parsed, and its numeric value."""
value: float
text: str


@dataclass
class DistanceElement:
"""Represents the distance from a property to some location."""
duration: TextValueTuple
distance: TextValueTuple
mode: str


class GMapsDurationProcessor(Processor):
"""Implementation of Processor class to calculate travel durations"""

GM_MODE_TRANSIT = 'transit'
GM_MODE_BICYCLE = 'bicycling'
GM_MODE_DRIVING = 'driving'

def __init__(self, config):
self.config = config

Expand All @@ -54,22 +35,22 @@ def get_distances_and_durations(self, address) -> Dict[str, DistanceElement]:
for mode in duration.get('modes', []):
if 'gm_id' in mode and 'title' in mode \
and 'key' in self.config.get('google_maps_api', {}):
duration = self.get_gmaps_distance(address, dest, mode['gm_id'])
duration = self._get_gmaps_distance(address, dest, mode['gm_id'])
out[name] = duration
return out

def get_formatted_durations(self, address):
    """Look up the GoogleMaps durations for the address and format them as text"""
    return self._format_durations(self.get_distances_and_durations(address))

def _format_durations(self, durations: Dict[str, DistanceElement]):
    """Render one '> name (mode): duration (distance)' line per location"""
    lines = [
        f"> {location_name} ({val.mode.value}): {val.duration.text} ({val.distance.text})"
        for location_name, val in durations.items()
    ]
    return "\n".join(lines).strip()

def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement:
def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement | None:
"""Get the distance"""
# get timestamp for next monday at 9:00:00 o'clock
now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
Expand All @@ -85,11 +66,10 @@ def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement:
base_url = self.config.get('google_maps_api', {}).get('url')
gm_key = self.config.get('google_maps_api', {}).get('key')

if not gm_key and mode != self.GM_MODE_DRIVING:
if not gm_key and mode != TransportationModes.DRIVING:
logger.warning("No Google Maps API key configured and without using a mode "
"different from 'driving' is not allowed. "
"Downgrading to mode 'drinving' thus. ")
mode = 'driving'
"different from 'driving' is not allowed. Thus downgrading to mode 'driving'.")
mode = TransportationModes.DRIVING
base_url = base_url.replace('&key={key}', '')

# retrieve the result
Expand All @@ -114,13 +94,13 @@ def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement:
element['duration']['text'],
element['duration']['value'])
distance_element = DistanceElement(
duration=TextValueTuple(
duration=DurationValueTuple(
float(element['duration']['value']),
element['duration']['text']),
distance=TextValueTuple(
distance=DistanceValueTuple(
float(element['distance']['value']),
element['distance']['text']),
mode=mode
mode=TransportationModes(mode)
)
distances[distance_element.distance.value] = distance_element
distances[distance_element.distance.meters] = distance_element
return distances[min(distances.keys())] if distances else None
24 changes: 17 additions & 7 deletions flathunter/hunter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@

from flathunter.logging import logger
from flathunter.config import YamlConfig
from flathunter.filter import Filter
from flathunter.filter import FilterChain
from flathunter.processor import ProcessorChain
from flathunter.captcha.captcha_solver import CaptchaUnsolvableError
from flathunter.exceptions import ConfigException
from flathunter.dataclasses import FilterChainName

class Hunter:
"""Basic methods for crawling and processing / filtering exposes"""
Expand Down Expand Up @@ -38,16 +39,14 @@ def try_crawl(searcher, url, max_pages):

def hunt_flats(self, max_pages=None):
"""Crawl, process and filter exposes"""
filter_set = Filter.builder() \
.read_config(self.config) \
.filter_already_seen(self.id_watch) \
.build()

preprocess_filter_chain = self._build_preprocess_filter_chain(self.config)
postprocess_filter_chain = self._build_postprocess_filter_chain(self.config)
processor_chain = ProcessorChain.builder(self.config) \
.save_all_exposes(self.id_watch) \
.apply_filter(filter_set) \
.apply_filter(preprocess_filter_chain) \
.resolve_addresses() \
.calculate_durations() \
.apply_filter(postprocess_filter_chain) \
.send_messages() \
.build()

Expand All @@ -58,3 +57,14 @@ def hunt_flats(self, max_pages=None):
result.append(expose)

return result

def _build_preprocess_filter_chain(self, config) -> FilterChain:
    """Assemble the preprocess filters plus the already-seen filter"""
    chain_builder = FilterChain.builder()
    chain_builder = chain_builder.read_config(config, FilterChainName.preprocess)
    chain_builder = chain_builder.filter_already_seen(self.id_watch)
    return chain_builder.build()

def _build_postprocess_filter_chain(self, config) -> FilterChain:
    """Assemble the filter chain built from the postprocess filters of the config"""
    chain_builder = FilterChain.builder()
    chain_builder = chain_builder.read_config(config, FilterChainName.postprocess)
    return chain_builder.build()
Loading

0 comments on commit 8a85d82

Please sign in to comment.