From 1dad3d431a68bea9038ea90d226aac3e3f377334 Mon Sep 17 00:00:00 2001 From: Colin Emonds Date: Thu, 7 Sep 2023 14:43:19 +0200 Subject: [PATCH 1/6] refactor: separate GMaps process from format Make GMapsDurationProcessor export unformatted data as well, to enable later steps in the pipeline (such as filters) to access and process this data. --- flathunter/gmaps_duration_processor.py | 70 ++++++++++++++++++++------ 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/flathunter/gmaps_duration_processor.py b/flathunter/gmaps_duration_processor.py index 843f2975..0037c072 100644 --- a/flathunter/gmaps_duration_processor.py +++ b/flathunter/gmaps_duration_processor.py @@ -2,11 +2,29 @@ import datetime import time from urllib.parse import quote_plus +from typing import Dict +from dataclasses import dataclass import requests from flathunter.logging import logger from flathunter.abstract_processor import Processor + +@dataclass +class TextValueTuple: + """We want to keep both what we parsed, and its numeric value.""" + value: float + text: str + + +@dataclass +class DistanceElement: + """Represents the distance from a property to some location.""" + duration: TextValueTuple + distance: TextValueTuple + mode: str + + class GMapsDurationProcessor(Processor): """Implementation of Processor class to calculate travel durations""" @@ -19,26 +37,39 @@ def __init__(self, config): def process_expose(self, expose): """Calculate the durations for an expose""" - expose['durations'] = self.get_formatted_durations(expose['address']).strip() + durations = self.get_distances_and_durations(expose['address']) + expose['durations'] = self._format_durations(durations).strip() + expose['durations_unformatted'] = durations return expose + def get_distances_and_durations(self, address) -> Dict[str, DistanceElement]: + """Return a dict mapping location names to distances and durations""" + out = {} + for duration in self.config.get('durations', []): + if 'destination' not in duration or 'name' not in duration or 'modes' not in duration: + logger.warning('illegal duration configuration: %s', duration) + continue + dest = duration.get('destination') + name = duration.get('name') + for mode in duration.get('modes', []): + if 'gm_id' in mode and 'title' in mode \ + and 'key' in self.config.get('google_maps_api', {}): + duration = self.get_gmaps_distance(address, dest, mode['gm_id']) + out[name] = duration + return out + def get_formatted_durations(self, address): """Return a formatted list of GoogleMaps durations""" + durations = self.get_distances_and_durations(address) + return self._format_durations(durations) + + def _format_durations(self, durations: Dict[str, DistanceElement]): out = "" - for duration in self.config.get('durations', []): - if 'destination' in duration and 'name' in duration: - dest = duration.get('destination') - name = duration.get('name') - for mode in duration.get('modes', []): - if 'gm_id' in mode and 'title' in mode \ - and 'key' in self.config.get('google_maps_api', {}): - duration = self.get_gmaps_distance(address, dest, mode['gm_id']) - title = mode['title'] - out += f"> {name} ({title}): {duration}\n" - + for location_name, val in durations.items(): + out += f"> {location_name} ({val.mode}): {val.duration.text} ({val.distance.text})\n" return out.strip() - def get_gmaps_distance(self, address, dest, mode): + def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: """Get the distance""" # get timestamp for next monday at 9:00:00 o'clock now = datetime.datetime.today().replace(hour=9, minute=0, second=0) @@ -82,7 +113,14 @@ def get_gmaps_distance(self, address, dest, mode): element['distance']['text'], element['duration']['text'], element['duration']['value']) - duration_text = element['duration']['text'] - distance_text = element['distance']['text'] - distances[element['duration']['value']] = f"{duration_text} ({distance_text})" + distance_element = DistanceElement( + duration=TextValueTuple( + float(element['duration']['value']), + element['duration']['text']), + distance=TextValueTuple( + float(element['distance']['value']), + element['distance']['text']), + mode=mode + ) + distances[distance_element.distance.value] = distance_element return distances[min(distances.keys())] if distances else None From 1c5f36bd0de8f7c0a0c30aa1688cdf5fb84abed3 Mon Sep 17 00:00:00 2001 From: Colin Emonds Date: Wed, 13 Sep 2023 09:35:55 +0200 Subject: [PATCH 2/6] fix unit test on Windows You can't re-open a NamedTemporaryFile on Windows while it's still open. --- test/test_config.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test/test_config.py b/test/test_config.py index 0804a1e1..6cf821fa 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -58,11 +58,16 @@ def test_loads_config(self): os.remove("config.yaml") def test_loads_config_at_file(self): - with tempfile.NamedTemporaryFile(mode='w+') as temp: - temp.write(self.DUMMY_CONFIG) - temp.flush() - config = Config(temp.name) - self.assertTrue(len(config.get('urls', [])) > 0, "Expected URLs in config file") + tempFileName = None + try: + with tempfile.NamedTemporaryFile(mode='w+', delete=False) as temp: + temp.write(self.DUMMY_CONFIG) + temp.flush() + tempFileName = temp.name + config = Config(tempFileName) + self.assertTrue(len(config.get('urls', [])) > 0, "Expected URLs in config file") + finally: + os.unlink(tempFileName) def test_loads_config_from_string(self): config = StringConfig(string=self.EMPTY_FILTERS_CONFIG) From 8a85d8288760a4d8d05d807cf2b19218a3bd7ceb Mon Sep 17 00:00:00 2001 From: Colin Emonds Date: Wed, 13 Sep 2023 10:01:09 +0200 Subject: [PATCH 3/6] feature #472: filter based on GMaps distance This PR involves some refactoring. The problem is that previously, filters ran before making any calls to external APIs. Therefore, just adding another filter for the distance doesn't actually work: the distance information is not yet available when we apply the filters. We can't just run the filters later, because then we would run the Google Maps API calls before we filtered out any properties, meaning that we would incur Google Maps API calls for all properties that we find in our search, including those that we later filter out anyway based on price, size, etc. - and we actually have to pay for those requests! My solution is to group the filters in two chains, and then run one chain before and one after external API calls have been made. This way, we can run the distance filter after the API calls are made, but keep everything else the same. --- flathunter/config.py | 28 +++++--- flathunter/dataclasses.py | 63 ++++++++++++++++++ flathunter/filter.py | 88 +++++++++++++++++++------- flathunter/gmaps_duration_processor.py | 44 ++++--------- flathunter/hunter.py | 24 +++++-- flathunter/web/views.py | 8 ++- flathunter/web_hunter.py | 23 ++++--- test/test_config.py | 14 ++-- test/test_gmaps_duration_processor.py | 41 ++++++++++-- test/test_googlecloud_idmaintainer.py | 8 ++- test/test_id_maintainer.py | 8 ++- 11 files changed, 249 insertions(+), 100 deletions(-) create mode 100644 flathunter/dataclasses.py diff --git a/flathunter/config.py b/flathunter/config.py index caab9e6a..bfe12c5a 100644 --- a/flathunter/config.py +++ b/flathunter/config.py @@ -1,6 +1,6 @@ """Wrap configuration options as an object""" import os -from typing import Optional, Dict, Any +from typing import List, Optional, Dict, Any import json import yaml @@ -17,7 +17,8 @@ from flathunter.crawler.meinestadt import MeineStadt from flathunter.crawler.wggesucht import WgGesucht from flathunter.crawler.subito import Subito -from flathunter.filter import Filter +from flathunter.dataclasses import DistanceConfig +from flathunter.gmaps_duration_processor import TransportationModes from flathunter.logging import logger from flathunter.exceptions import ConfigException @@ -170,12 +171,6 @@ def searchers(self): """Get the list of search plugins""" return self.__searchers__ - def get_filter(self): - """Read the configured filter""" - builder = Filter.builder() - builder.read_config(self) - return builder.build() - def captcha_enabled(self): """Check if captcha is configured""" return self._get_captcha_solver() is not None @@ -352,6 +347,23 @@ def max_price_per_square(self): """Return the configured maximum price per square meter""" return self._get_filter_config("max_price_per_square") + def max_distance(self) -> List[DistanceConfig] | None: + """Return the configured maximum distance to locations.""" + config = self._get_filter_config("max_distance") + if config is None: + return None + out = [] + for distance_filter_item in config: + out.append( + DistanceConfig( + location_name=distance_filter_item['location_name'], + transport_mode=TransportationModes(distance_filter_item['transportation_mode']), + max_distance_meters=distance_filter_item.get('max_distance_meters'), + max_duration_seconds=distance_filter_item.get('max_duration_seconds') + ) + ) + return out + def __repr__(self): return json.dumps({ "captcha_enabled": self.captcha_enabled(), diff --git a/flathunter/dataclasses.py b/flathunter/dataclasses.py new file mode 100644 index 00000000..34978189 --- /dev/null +++ b/flathunter/dataclasses.py @@ -0,0 +1,63 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Optional + + +class TransportationModes(Enum): + """The transportation mode for Google Maps distance calculation.""" + TRANSIT = 'transit' + BICYCLING = 'bicycling' + DRIVING = 'driving' + WALKING = 'walking' + + +@dataclass +class DistanceValueTuple: + """We want to keep both the numeric value of a distance, and its string representation.""" + meters: float + text: str + + +@dataclass +class DurationValueTuple: + """We want to keep both the numeric value of a duration, and its string representation.""" + seconds: float + text: str + + +@dataclass +class DistanceElement: + """Represents the distance from a property to some location.""" + duration: DurationValueTuple + distance: DistanceValueTuple + mode: TransportationModes + + +@dataclass +class DistanceConfig: + """Represents distance filter information in the configuration file. + + location_name must refer to the location name used to identify the location + in the durations section of the config file, and the transport_mode must be + configured in the durations section for that location name, lest no information + is available to actually filter on.""" + location_name: str + transport_mode: TransportationModes + max_distance_meters: Optional[float] + max_duration_seconds: Optional[float] + + +class FilterChainName(Enum): + """Identifies the filter chain that a filter acts on + + Preprocess filters will be run before the expose is processed by any further actions. + Use this chain to filter exposes that can be excluded based on information scraped + from the expose website alone (such as based on price or size). + Postprocess filters will be run after other actions have completed. Use this if you + require additional information from other steps, such as information from the Google + Maps API, to make a decision on this expose. + + We separate the filter chains to avoid making expensive (literally!) calls to the + Google Maps API for exposes that we already know we aren't interested in anyway.""" + preprocess = 'PREPROCESS' + postprocess = 'POSTPROCESS' diff --git a/flathunter/filter.py b/flathunter/filter.py index bc1b4979..2fb442d9 100644 --- a/flathunter/filter.py +++ b/flathunter/filter.py @@ -1,8 +1,12 @@ """Module with implementations of standard expose filters""" -from functools import reduce import re from abc import ABC, ABCMeta -from typing import List, Any +from typing import List, Any, Dict + +from flathunter.config import DistanceConfig +from flathunter.dataclasses import FilterChainName +from flathunter.gmaps_duration_processor import DistanceElement +from flathunter.logging import logger class AbstractFilter(ABC): @@ -172,30 +176,65 @@ def is_interesting(self, expose): return pps <= self.max_pps -class FilterBuilder: +class DistanceFilter(AbstractFilter): + """Exclude properties based on distance or duration to a location + + This must be in the post-processing filter chain, as it requires data + from the Google Maps API, which is not available right after scraping.""" + + distance_config: DistanceConfig + + def __init__(self, distance_config: DistanceConfig): + self.distance_config = distance_config + + def is_interesting(self, expose): + durations: Dict[str, DistanceElement] = expose.get('durations_unformatted', None) + if durations is None or self.distance_config.location_name not in durations: + logger.info('DurationFilter is enabled, but no GMaps data found. Skipping filter.') + return True + distance = durations[self.distance_config.location_name].distance.meters + duration = durations[self.distance_config.location_name].duration.seconds + out = True + if self.distance_config.max_distance_meters: + out &= distance < self.distance_config.max_distance_meters + if self.distance_config.max_duration_seconds: + out &= duration < self.distance_config.max_duration_seconds + return out + + +class FilterChainBuilder: """Construct a filter chain""" filters: List[AbstractFilter] def __init__(self): self.filters = [] - def _append_filter_if_not_empty(self, filter_class: ABCMeta, filter_config: Any): + def _append_filter_if_not_empty( + self, + filter_class: ABCMeta, + filter_config: Any): """Appends a filter to the list if its configuration is set""" if not filter_config: return self.filters.append(filter_class(filter_config)) - def read_config(self, config): + def read_config(self, config, filter_chain: FilterChainName): """Adds filters from a config dictionary""" - self._append_filter_if_not_empty(TitleFilter, config.excluded_titles()) - self._append_filter_if_not_empty(MinPriceFilter, config.min_price()) - self._append_filter_if_not_empty(MaxPriceFilter, config.max_price()) - self._append_filter_if_not_empty(MinSizeFilter, config.min_size()) - self._append_filter_if_not_empty(MaxSizeFilter, config.max_size()) - self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms()) - self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms()) - self._append_filter_if_not_empty( - PPSFilter, config.max_price_per_square()) + if filter_chain == FilterChainName.preprocess: + self._append_filter_if_not_empty(TitleFilter, config.excluded_titles()) + self._append_filter_if_not_empty(MinPriceFilter, config.min_price()) + self._append_filter_if_not_empty(MaxPriceFilter, config.max_price()) + self._append_filter_if_not_empty(MinSizeFilter, config.min_size()) + self._append_filter_if_not_empty(MaxSizeFilter, config.max_size()) + self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms()) + self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms()) + self._append_filter_if_not_empty( + PPSFilter, config.max_price_per_square()) + elif filter_chain == FilterChainName.postprocess: + for df in config.max_distance(): + self._append_filter_if_not_empty(DistanceFilter, df) + else: + raise NotImplementedError() return self def filter_already_seen(self, id_watch): @@ -204,12 +243,12 @@ def filter_already_seen(self, id_watch): return self def build(self): - """Return the compiled filter""" - return Filter(self.filters) + """Return the compiled filter chain""" + return FilterChain(self.filters) -class Filter: - """Abstract filter object""" +class FilterChain: + """Collection of expose filters in use by a hunter instance""" filters: List[AbstractFilter] @@ -218,14 +257,17 @@ def __init__(self, filters: List[AbstractFilter]): def is_interesting_expose(self, expose): """Apply all filters to this expose""" - return reduce((lambda x, y: x and y), - map((lambda x: x.is_interesting(expose)), self.filters), True) - + for filter_ in self.filters: + if not filter_.is_interesting(expose): + return False + return True + def filter(self, exposes): """Apply all filters to every expose in the list""" return filter(self.is_interesting_expose, exposes) @staticmethod def builder(): - """Return a new filter builder""" - return FilterBuilder() + """Return a new filter chain builder""" + return FilterChainBuilder() + diff --git a/flathunter/gmaps_duration_processor.py b/flathunter/gmaps_duration_processor.py index 0037c072..405f1973 100644 --- a/flathunter/gmaps_duration_processor.py +++ b/flathunter/gmaps_duration_processor.py @@ -3,35 +3,16 @@ import time from urllib.parse import quote_plus from typing import Dict -from dataclasses import dataclass import requests +from flathunter.dataclasses import DistanceElement, DistanceValueTuple, DurationValueTuple, TransportationModes from flathunter.logging import logger from flathunter.abstract_processor import Processor -@dataclass -class TextValueTuple: - """We want to keep both what we parsed, and its numeric value.""" - value: float - text: str - - -@dataclass -class DistanceElement: - """Represents the distance from a property to some location.""" - duration: TextValueTuple - distance: TextValueTuple - mode: str - - class GMapsDurationProcessor(Processor): """Implementation of Processor class to calculate travel durations""" - GM_MODE_TRANSIT = 'transit' - GM_MODE_BICYCLE = 'bicycling' - GM_MODE_DRIVING = 'driving' - def __init__(self, config): self.config = config @@ -54,7 +35,7 @@ def get_distances_and_durations(self, address) -> Dict[str, DistanceElement]: for mode in duration.get('modes', []): if 'gm_id' in mode and 'title' in mode \ and 'key' in self.config.get('google_maps_api', {}): - duration = self.get_gmaps_distance(address, dest, mode['gm_id']) + duration = self._get_gmaps_distance(address, dest, mode['gm_id']) out[name] = duration return out @@ -62,14 +43,14 @@ def get_formatted_durations(self, address): """Return a formatted list of GoogleMaps durations""" durations = self.get_distances_and_durations(address) return self._format_durations(durations) - + def _format_durations(self, durations: Dict[str, DistanceElement]): out = "" for location_name, val in durations.items(): - out += f"> {location_name} ({val.mode}): {val.duration.text} ({val.distance.text})\n" + out += f"> {location_name} ({val.mode.value}): {val.duration.text} ({val.distance.text})\n" return out.strip() - def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: + def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement | None: """Get the distance""" # get timestamp for next monday at 9:00:00 o'clock now = datetime.datetime.today().replace(hour=9, minute=0, second=0) @@ -85,11 +66,10 @@ def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: base_url = self.config.get('google_maps_api', {}).get('url') gm_key = self.config.get('google_maps_api', {}).get('key') - if not gm_key and mode != self.GM_MODE_DRIVING: + if not gm_key and mode != TransportationModes.DRIVING: logger.warning("No Google Maps API key configured and without using a mode " - "different from 'driving' is not allowed. " - "Downgrading to mode 'drinving' thus. ") - mode = 'driving' + "different from 'driving' is not allowed. Thus downgrading to mode 'driving'.") + mode = TransportationModes.DRIVING base_url = base_url.replace('&key={key}', '') # retrieve the result @@ -114,13 +94,13 @@ def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement: element['duration']['text'], element['duration']['value']) distance_element = DistanceElement( - duration=TextValueTuple( + duration=DurationValueTuple( float(element['duration']['value']), element['duration']['text']), - distance=TextValueTuple( + distance=DistanceValueTuple( float(element['distance']['value']), element['distance']['text']), - mode=mode + mode=TransportationModes(mode) ) - distances[distance_element.distance.value] = distance_element + distances[distance_element.distance.meters] = distance_element return distances[min(distances.keys())] if distances else None diff --git a/flathunter/hunter.py b/flathunter/hunter.py index 692b9996..998c122d 100644 --- a/flathunter/hunter.py +++ b/flathunter/hunter.py @@ -5,10 +5,11 @@ from flathunter.logging import logger from flathunter.config import YamlConfig -from flathunter.filter import Filter +from flathunter.filter import FilterChain from flathunter.processor import ProcessorChain from flathunter.captcha.captcha_solver import CaptchaUnsolvableError from flathunter.exceptions import ConfigException +from flathunter.dataclasses import FilterChainName class Hunter: """Basic methods for crawling and processing / filtering exposes""" @@ -38,16 +39,14 @@ def try_crawl(searcher, url, max_pages): def hunt_flats(self, max_pages=None): """Crawl, process and filter exposes""" - filter_set = Filter.builder() \ - .read_config(self.config) \ - .filter_already_seen(self.id_watch) \ - .build() - + preprocess_filter_chain = self._build_preprocess_filter_chain(self.config) + postprocess_filter_chain = self._build_postprocess_filter_chain(self.config) processor_chain = ProcessorChain.builder(self.config) \ .save_all_exposes(self.id_watch) \ - .apply_filter(filter_set) \ + .apply_filter(preprocess_filter_chain) \ .resolve_addresses() \ .calculate_durations() \ + .apply_filter(postprocess_filter_chain) \ .send_messages() \ .build() @@ -58,3 +57,14 @@ def hunt_flats(self, max_pages=None): result.append(expose) return result + + def _build_preprocess_filter_chain(self, config) -> FilterChain: + return FilterChain.builder() \ + .read_config(config, FilterChainName.preprocess) \ + .filter_already_seen(self.id_watch) \ + .build() + + def _build_postprocess_filter_chain(self, config) -> FilterChain: + return FilterChain.builder() \ + .read_config(config, FilterChainName.postprocess) \ + .build() diff --git a/flathunter/web/views.py b/flathunter/web/views.py index 5a5926a0..6b1732e7 100644 --- a/flathunter/web/views.py +++ b/flathunter/web/views.py @@ -6,10 +6,11 @@ from flask import render_template, jsonify, request, session, redirect from flask_api import status +from flathunter.dataclasses import FilterChainName from flathunter.web import app, log from flathunter.web.util import sanitize_float -from flathunter.filter import FilterBuilder +from flathunter.filter import FilterChainBuilder from flathunter.config import YamlConfig class AuthenticationError(Exception): @@ -72,7 +73,10 @@ def filter_for_user(): """Load the filter for the current user""" if filter_values_for_user() is None: return None - return FilterBuilder().read_config(YamlConfig({'filters': filter_values_for_user()})).build() + return (FilterChainBuilder() + .read_config(YamlConfig({'filters': filter_values_for_user()}), FilterChainName.preprocess) + .read_config(YamlConfig({'filters': filter_values_for_user()}), FilterChainName.postprocess) + .build()) def form_filter_values(): """Extract the filter settings from the submitted form""" diff --git a/flathunter/web_hunter.py b/flathunter/web_hunter.py index 41bab501..77d30812 100644 --- a/flathunter/web_hunter.py +++ b/flathunter/web_hunter.py @@ -2,7 +2,6 @@ from flathunter.config import YamlConfig from flathunter.logging import logger from flathunter.hunter import Hunter -from flathunter.filter import Filter from flathunter.processor import ProcessorChain from flathunter.exceptions import BotBlockedException, UserDeactivatedException @@ -13,17 +12,19 @@ class WebHunter(Hunter): def hunt_flats(self, max_pages=1): """Crawl all URLs, and send notifications to users of new flats""" - filter_set = Filter.builder() \ - .read_config(self.config) \ - .filter_already_seen(self.id_watch) \ - .build() - + preprocess_filter_chain = self._build_preprocess_filter_chain(self.config) + postprocess_filter_chain = self._build_postprocess_filter_chain(self.config) + # note: we have to save all exposes *after* applying the processors because + # the exposes later get loaded from disk to then be filtered again, so we need + # the additional information from the processor lest the postprocess chain breaks + # due to missing data processor_chain = ProcessorChain.builder(self.config) \ - .apply_filter(filter_set) \ + .apply_filter(preprocess_filter_chain) \ .crawl_expose_details() \ - .save_all_exposes(self.id_watch) \ .resolve_addresses() \ .calculate_durations() \ + .save_all_exposes(self.id_watch) \ + .apply_filter(postprocess_filter_chain) \ .send_messages() \ .build() @@ -34,10 +35,12 @@ def hunt_flats(self, max_pages=1): for (user_id, settings) in self.id_watch.get_user_settings(): if 'mute_notifications' in settings: continue - filter_set = Filter.builder().read_config(YamlConfig(settings)).build() + preprocess_filter_chain = self._build_preprocess_filter_chain(YamlConfig(settings)) + postprocess_filter_chain = self._build_postprocess_filter_chain(YamlConfig(settings)) try: processor_chain = ProcessorChain.builder(self.config) \ - .apply_filter(filter_set) \ + .apply_filter(preprocess_filter_chain) \ + .apply_filter(postprocess_filter_chain) \ .send_messages([user_id]) \ .build() for message in processor_chain.process(new_exposes): diff --git a/test/test_config.py b/test/test_config.py index 6cf821fa..e02d464b 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -71,22 +71,20 @@ def test_loads_config_at_file(self): def test_loads_config_from_string(self): config = StringConfig(string=self.EMPTY_FILTERS_CONFIG) - self.assertIsNotNone(config) - my_filter = config.get_filter() - self.assertIsNotNone(my_filter) + target_urls = config.target_urls() + self.assertIsNotNone(target_urls) def test_loads_legacy_config_from_string(self): config = StringConfig(string=self.LEGACY_FILTERS_CONFIG) - self.assertIsNotNone(config) - my_filter = config.get_filter() + my_filter = config.excluded_titles() self.assertIsNotNone(my_filter) - self.assertTrue(len(my_filter.filters) > 0) + self.assertTrue(len(my_filter) > 0) def test_loads_filters_config_from_string(self): config = StringConfig(string=self.FILTERS_CONFIG) - self.assertIsNotNone(config) - my_filter = config.get_filter() + my_filter = config.excluded_titles() self.assertIsNotNone(my_filter) + self.assertTrue(len(my_filter) > 0) def test_defaults_fields(self): config = StringConfig(string=self.FILTERS_CONFIG) diff --git a/test/test_gmaps_duration_processor.py b/test/test_gmaps_duration_processor.py index a35b1cdf..15a780ea 100644 --- a/test/test_gmaps_duration_processor.py +++ b/test/test_gmaps_duration_processor.py @@ -1,6 +1,7 @@ +from typing import Dict import unittest -import yaml import re +from flathunter.gmaps_duration_processor import DistanceElement import requests_mock from flathunter.hunter import Hunter from flathunter.idmaintainer import IdMaintainer @@ -33,6 +34,20 @@ class GMapsDurationProcessorTest(unittest.TestCase): - gm_id: driving title: Car """ + DUMMY_RESPONSE = """ + { + "status": "OK", + "rows": [ + { + "elements": [ + { + "distance": { "text": "228 mi", "value": 367654 }, + "duration": { "text": "3 hours 55 mins", "value": 14078 } + } + ] + } + ] + }""" @requests_mock.Mocker() def test_resolve_durations(self, m): @@ -40,11 +55,25 @@ def test_resolve_durations(self, m): config.set_searchers([DummyCrawler()]) hunter = Hunter(config, IdMaintainer(":memory:")) matcher = re.compile('maps.googleapis.com/maps/api/distancematrix/json') - m.get(matcher, text='{"status": "OK", "rows": [ { "elements": [ { "distance": { "text": "far", "value": 123 }, "duration": { "text": "days", "value": 123 } } ] } ]}') + m.get(matcher, text=self.DUMMY_RESPONSE) exposes = hunter.hunt_flats() self.assertTrue(count(exposes) > 4, "Expected to find exposes") without_durations = list(filter(lambda expose: 'durations' not in expose, exposes)) - if len(without_durations) > 0: - for expose in without_durations: - print("Got expose: ", expose) - self.assertTrue(len(without_durations) == 0, "Expected durations to be calculated") \ No newline at end of file + for expose in without_durations: + print("Got expose: ", expose) + self.assertTrue(len(without_durations) == 0, "Expected durations to be calculated") + + @requests_mock.Mocker() + def test_durations_valid(self, m): + config = StringConfig(string=self.DUMMY_CONFIG) + config.set_searchers([DummyCrawler()]) + hunter = Hunter(config, IdMaintainer(":memory:")) + matcher = re.compile('maps.googleapis.com/maps/api/distancematrix/json') + m.get(matcher, text=self.DUMMY_RESPONSE) + exposes = hunter.hunt_flats() + for expose in exposes: + durations: Dict[str, DistanceElement] = expose['durations_unformatted'] + for duration in durations.values(): + self.assertAlmostEqual(duration.distance.meters, 367654) + self.assertAlmostEqual(duration.duration.seconds, 14078) + self.assertEqual(duration.duration.text, '3 hours 55 mins') diff --git a/test/test_googlecloud_idmaintainer.py b/test/test_googlecloud_idmaintainer.py index b3788e9a..d430d6a9 100644 --- a/test/test_googlecloud_idmaintainer.py +++ b/test/test_googlecloud_idmaintainer.py @@ -7,7 +7,8 @@ from flathunter.googlecloud_idmaintainer import GoogleCloudIdMaintainer from flathunter.hunter import Hunter from flathunter.web_hunter import WebHunter -from flathunter.filter import Filter +from flathunter.filter import FilterChain +from flathunter.dataclasses import FilterChainName from test.dummy_crawler import DummyCrawler from test.test_util import count from test.utils.config import StringConfig @@ -94,7 +95,10 @@ def test_exposes_are_returned_filtered(id_watch): hunter = Hunter(config, id_watch) hunter.hunt_flats() hunter.hunt_flats() - filter = Filter.builder().read_config(StringConfig('{"filters":{"max_size":70}}')).build() + filter = (FilterChain + .builder() + .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.preprocess) + .build()) saved = id_watch.get_recent_exposes(10, filter_set=filter) assert len(saved) == 10 for expose in saved: diff --git a/test/test_id_maintainer.py b/test/test_id_maintainer.py index 2d4cdc75..9a84bbf3 100644 --- a/test/test_id_maintainer.py +++ b/test/test_id_maintainer.py @@ -6,7 +6,8 @@ from flathunter.idmaintainer import IdMaintainer from flathunter.hunter import Hunter from flathunter.web_hunter import WebHunter -from flathunter.filter import Filter +from flathunter.filter import FilterChain +from flathunter.dataclasses import FilterChainName from test.dummy_crawler import DummyCrawler from test.test_util import count from test.utils.config import StringConfig @@ -111,7 +112,10 @@ def test_exposes_are_returned_filtered(): hunter = Hunter(config, id_watch) hunter.hunt_flats() hunter.hunt_flats() - filter = Filter.builder().read_config(StringConfig('{"filters":{"max_size":70}}')).build() + filter = (FilterChain + .builder() + .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.preprocess) + .build()) saved = id_watch.get_recent_exposes(10, filter_set=filter) assert len(saved) == 10 for expose in saved: From e0f45f39c92075e022b2bbfc4ee11f0aa3abd2f6 Mon Sep 17 00:00:00 2001 From: Arthur Taylor Date: Tue, 19 Sep 2023 16:47:00 +0200 Subject: [PATCH 4/6] Satisfy the linter --- flathunter/crawler/vrmimmo.py | 4 ++-- flathunter/dataclasses.py | 10 +++++++--- flathunter/filter.py | 19 +++++++++---------- flathunter/gmaps_duration_processor.py | 8 ++++++-- flathunter/hunter.py | 8 ++++---- flathunter/web/views.py | 6 ++++-- flathunter/web_hunter.py | 2 +- 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/flathunter/crawler/vrmimmo.py b/flathunter/crawler/vrmimmo.py index 2ff3acec..ee6ee99b 100644 --- a/flathunter/crawler/vrmimmo.py +++ b/flathunter/crawler/vrmimmo.py @@ -2,7 +2,7 @@ import re import hashlib -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup from flathunter.logging import logger from flathunter.abstract_crawler import Crawler @@ -29,7 +29,7 @@ def extract_data(self, soup: BeautifulSoup): link = item.find("a", {"class": "js-item-title-link ci-search-result__link"}) url = link.get("href") title = link.get("title") - logger.debug("Analyze " + url) + logger.debug("Analyze %s", url) try: price = item.find("div", {"class": "item__spec item-spec-price"}).text diff --git a/flathunter/dataclasses.py b/flathunter/dataclasses.py index 34978189..7f61d9eb 100644 --- a/flathunter/dataclasses.py +++ b/flathunter/dataclasses.py @@ -1,3 +1,7 @@ +""" +This module contains dataclasses to help with serialisation and typechecking of data +sent to and received from the Google Maps Distance API +""" from dataclasses import dataclass from enum import Enum from typing import Optional @@ -58,6 +62,6 @@ class FilterChainName(Enum): Maps API, to make a decision on this expose. We separate the filter chains to avoid making expensive (literally!) calls to the - Google Maps API for exposes that we already know we aren't interested in anyway.""" - preprocess = 'PREPROCESS' - postprocess = 'POSTPROCESS' + Google Maps API for exposes that we already know we aren't interested in anyway.""" + PREPROCESS = 'PREPROCESS' + POSTPROCESS = 'POSTPROCESS' diff --git a/flathunter/filter.py b/flathunter/filter.py index 2fb442d9..dc5f4a61 100644 --- a/flathunter/filter.py +++ b/flathunter/filter.py @@ -181,12 +181,12 @@ class DistanceFilter(AbstractFilter): This must be in the post-processing filter chain, as it requires data from the Google Maps API, which is not available right after scraping.""" - + distance_config: DistanceConfig - + def __init__(self, distance_config: DistanceConfig): self.distance_config = distance_config - + def is_interesting(self, expose): durations: Dict[str, DistanceElement] = expose.get('durations_unformatted', None) if durations is None or self.distance_config.location_name not in durations: @@ -211,7 +211,7 @@ def __init__(self): def _append_filter_if_not_empty( self, - filter_class: ABCMeta, + filter_class: ABCMeta, filter_config: Any): """Appends a filter to the list if its configuration is set""" if not filter_config: @@ -220,7 +220,7 @@ def _append_filter_if_not_empty( def read_config(self, config, filter_chain: FilterChainName): """Adds filters from a config dictionary""" - if filter_chain == FilterChainName.preprocess: + if filter_chain == FilterChainName.PREPROCESS: self._append_filter_if_not_empty(TitleFilter, config.excluded_titles()) self._append_filter_if_not_empty(MinPriceFilter, config.min_price()) self._append_filter_if_not_empty(MaxPriceFilter, config.max_price()) @@ -230,9 +230,9 @@ def read_config(self, config, filter_chain: FilterChainName): self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms()) self._append_filter_if_not_empty( PPSFilter, config.max_price_per_square()) - elif filter_chain == FilterChainName.postprocess: - for df in config.max_distance(): - self._append_filter_if_not_empty(DistanceFilter, df) + elif filter_chain == FilterChainName.POSTPROCESS: + for distance_filter in config.max_distance(): + self._append_filter_if_not_empty(DistanceFilter, distance_filter) else: raise NotImplementedError() return self @@ -261,7 +261,7 @@ def is_interesting_expose(self, expose): if not filter_.is_interesting(expose): return False return True - + def filter(self, exposes): """Apply all filters to every expose in the list""" return filter(self.is_interesting_expose, exposes) @@ -270,4 +270,3 @@ def filter(self, exposes): def builder(): """Return a new filter chain builder""" return FilterChainBuilder() - diff --git a/flathunter/gmaps_duration_processor.py b/flathunter/gmaps_duration_processor.py index 405f1973..4da46678 100644 --- a/flathunter/gmaps_duration_processor.py +++ b/flathunter/gmaps_duration_processor.py @@ -4,8 +4,11 @@ from urllib.parse import quote_plus from typing import Dict import requests -from flathunter.dataclasses import DistanceElement, DistanceValueTuple, DurationValueTuple, TransportationModes +from flathunter.dataclasses import (DistanceElement, + DistanceValueTuple, + DurationValueTuple, + TransportationModes) from flathunter.logging import logger from flathunter.abstract_processor import Processor @@ -47,7 +50,8 @@ def get_formatted_durations(self, address): def _format_durations(self, durations: Dict[str, DistanceElement]): out = "" for location_name, val in durations.items(): - out += f"> {location_name} ({val.mode.value}): {val.duration.text} ({val.distance.text})\n" + out += f"> {location_name} ({val.mode.value}): " + \ + f"{val.duration.text} ({val.distance.text})\n" return out.strip() def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement | None: diff --git a/flathunter/hunter.py b/flathunter/hunter.py index 998c122d..0c1c77c1 100644 --- a/flathunter/hunter.py +++ b/flathunter/hunter.py @@ -57,14 +57,14 @@ def hunt_flats(self, max_pages=None): result.append(expose) return result - + def _build_preprocess_filter_chain(self, config) -> FilterChain: return FilterChain.builder() \ - .read_config(config, FilterChainName.preprocess) \ + .read_config(config, FilterChainName.PREPROCESS) \ .filter_already_seen(self.id_watch) \ .build() - + def _build_postprocess_filter_chain(self, config) -> FilterChain: return FilterChain.builder() \ - .read_config(config, FilterChainName.postprocess) \ + .read_config(config, FilterChainName.POSTPROCESS) \ .build() diff --git a/flathunter/web/views.py b/flathunter/web/views.py index 6b1732e7..6559e57c 100644 --- a/flathunter/web/views.py +++ b/flathunter/web/views.py @@ -74,8 +74,10 @@ def filter_for_user(): if filter_values_for_user() is None: return None return (FilterChainBuilder() - .read_config(YamlConfig({'filters': filter_values_for_user()}), FilterChainName.preprocess) - .read_config(YamlConfig({'filters': filter_values_for_user()}), FilterChainName.postprocess) + .read_config( + YamlConfig({'filters': filter_values_for_user()}), FilterChainName.PREPROCESS) + .read_config( + YamlConfig({'filters': filter_values_for_user()}), FilterChainName.POSTPROCESS) .build()) def form_filter_values(): diff --git a/flathunter/web_hunter.py b/flathunter/web_hunter.py index 77d30812..9a1551e0 100644 --- a/flathunter/web_hunter.py +++ b/flathunter/web_hunter.py @@ -14,7 +14,7 @@ def hunt_flats(self, max_pages=1): """Crawl all URLs, and send notifications to users of new flats""" preprocess_filter_chain = self._build_preprocess_filter_chain(self.config) postprocess_filter_chain = self._build_postprocess_filter_chain(self.config) - # note: we have to save all exposes *after* applying the processors because + # note: we have to save all exposes *after* applying the processors because # the exposes later get loaded from disk to then be filtered again, so we need # the additional information from the processor lest the postprocess chain breaks # due to missing data From d5e28411eb77c068f5ddac10ae8b8e8896138ac9 Mon Sep 17 00:00:00 2001 From: Arthur Taylor Date: Tue, 19 Sep 2023 16:53:39 +0200 Subject: [PATCH 5/6] Make pyright happy --- test/test_config.py | 3 ++- test/test_googlecloud_idmaintainer.py | 2 +- test/test_id_maintainer.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_config.py b/test/test_config.py index e02d464b..8d917f34 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -67,7 +67,8 @@ def test_loads_config_at_file(self): config = Config(tempFileName) self.assertTrue(len(config.get('urls', [])) > 0, "Expected URLs in config file") finally: - os.unlink(tempFileName) + if tempFileName is not None: + os.unlink(tempFileName) def test_loads_config_from_string(self): config = StringConfig(string=self.EMPTY_FILTERS_CONFIG) diff --git a/test/test_googlecloud_idmaintainer.py b/test/test_googlecloud_idmaintainer.py index d430d6a9..06cacda1 100644 --- a/test/test_googlecloud_idmaintainer.py +++ b/test/test_googlecloud_idmaintainer.py @@ -97,7 +97,7 @@ def test_exposes_are_returned_filtered(id_watch): hunter.hunt_flats() filter = (FilterChain .builder() - .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.preprocess) + .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.PREPROCESS) .build()) saved = id_watch.get_recent_exposes(10, filter_set=filter) assert len(saved) == 10 diff --git a/test/test_id_maintainer.py b/test/test_id_maintainer.py index 9a84bbf3..796b9a90 100644 --- a/test/test_id_maintainer.py +++ b/test/test_id_maintainer.py @@ -114,7 +114,7 @@ def test_exposes_are_returned_filtered(): hunter.hunt_flats() filter = (FilterChain .builder() - .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.preprocess) + .read_config(StringConfig('{"filters":{"max_size":70}}'), FilterChainName.PREPROCESS) .build()) saved = id_watch.get_recent_exposes(10, filter_set=filter) assert len(saved) == 10 From 727716263801d0ecd41007b0bbb6308370c9b65c Mon Sep 17 00:00:00 2001 From: Arthur Taylor Date: Wed, 20 Sep 2023 10:42:18 +0200 Subject: [PATCH 6/6] Max some of the tests pass --- flathunter/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flathunter/config.py b/flathunter/config.py index ebede602..55b919ed 100644 --- a/flathunter/config.py +++ b/flathunter/config.py @@ -349,11 +349,11 @@ def max_price_per_square(self): """Return the configured maximum price per square meter""" return self._get_filter_config("max_price_per_square") - def max_distance(self) -> List[DistanceConfig] | None: + def max_distance(self) -> List[DistanceConfig]: """Return the configured maximum distance to locations.""" config = self._get_filter_config("max_distance") if config is None: - return None + return [] out = [] for distance_filter_item in config: out.append(