Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Google Maps filter (with passing linter and type checker) #477

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions flathunter/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Wrap configuration options as an object"""
import os
from typing import Optional, Dict, Any
from typing import List, Optional, Dict, Any

import json
import yaml
Expand All @@ -18,7 +18,8 @@
from flathunter.crawler.wggesucht import WgGesucht
from flathunter.crawler.vrmimmo import VrmImmo
from flathunter.crawler.subito import Subito
from flathunter.filter import Filter
from flathunter.dataclasses import DistanceConfig
from flathunter.gmaps_duration_processor import TransportationModes
from flathunter.logging import logger
from flathunter.exceptions import ConfigException

Expand Down Expand Up @@ -172,12 +173,6 @@ def searchers(self):
"""Get the list of search plugins"""
return self.__searchers__

def get_filter(self):
"""Read the configured filter"""
builder = Filter.builder()
builder.read_config(self)
return builder.build()

def captcha_enabled(self):
"""Check if captcha is configured"""
return self._get_captcha_solver() is not None
Expand Down Expand Up @@ -354,6 +349,23 @@ def max_price_per_square(self):
"""Return the configured maximum price per square meter"""
return self._get_filter_config("max_price_per_square")

def max_distance(self) -> List[DistanceConfig]:
    """Build the distance filter settings from the ``max_distance`` filter option.

    Returns an empty list when the option is not configured. Otherwise one
    ``DistanceConfig`` is created per configured item: ``location_name`` and
    ``transportation_mode`` are mandatory keys, while the two limits are
    optional and default to ``None``.
    """
    entries = self._get_filter_config("max_distance")
    if entries is None:
        return []
    return [
        DistanceConfig(
            location_name=entry['location_name'],
            transport_mode=TransportationModes(entry['transportation_mode']),
            max_distance_meters=entry.get('max_distance_meters'),
            max_duration_seconds=entry.get('max_duration_seconds'),
        )
        for entry in entries
    ]

def __repr__(self):
return json.dumps({
"captcha_enabled": self.captcha_enabled(),
Expand Down
4 changes: 2 additions & 2 deletions flathunter/crawler/vrmimmo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
import hashlib

from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup

from flathunter.logging import logger
from flathunter.abstract_crawler import Crawler
Expand All @@ -29,7 +29,7 @@ def extract_data(self, soup: BeautifulSoup):
link = item.find("a", {"class": "js-item-title-link ci-search-result__link"})
url = link.get("href")
title = link.get("title")
logger.debug("Analyze " + url)
logger.debug("Analyze %s", url)

try:
price = item.find("div", {"class": "item__spec item-spec-price"}).text
Expand Down
67 changes: 67 additions & 0 deletions flathunter/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
This module contains dataclasses to help with serialisation and typechecking of data
sent to and received from the Google Maps Distance API
"""
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class TransportationModes(Enum):
    """The transportation mode for Google Maps distance calculation."""
    # The values are the strings accepted in the configuration file
    # (``gm_id`` of a duration mode, ``transportation_mode`` of a distance filter).
    TRANSIT = 'transit'
    BICYCLING = 'bicycling'
    DRIVING = 'driving'
    WALKING = 'walking'


@dataclass
class DistanceValueTuple:
    """We want to keep both the numeric value of a distance, and its string representation."""
    # Numeric distance, used for comparisons against configured limits.
    meters: float
    # Human-readable form of the same distance, used when formatting output.
    text: str


@dataclass
class DurationValueTuple:
    """We want to keep both the numeric value of a duration, and its string representation."""
    # Numeric duration, used for comparisons against configured limits.
    seconds: float
    # Human-readable form of the same duration, used when formatting output.
    text: str


@dataclass
class DistanceElement:
    """Represents the distance from a property to some location."""
    # Travel time to the location (numeric value plus display text).
    duration: DurationValueTuple
    # Travel distance to the location (numeric value plus display text).
    distance: DistanceValueTuple
    # Transportation mode the duration and distance were looked up for.
    mode: TransportationModes


@dataclass
class DistanceConfig:
    """Represents distance filter information in the configuration file.

    location_name must refer to the location name used to identify the location
    in the durations section of the config file, and the transport_mode must be
    configured in the durations section for that location name; otherwise no
    information is available to actually filter on."""
    # Name of the location as used in the durations section of the config.
    location_name: str
    # Transportation mode this filter is configured for.
    transport_mode: TransportationModes
    # Optional upper limit on the distance; None means no distance limit.
    max_distance_meters: Optional[float]
    # Optional upper limit on the travel time; None means no duration limit.
    max_duration_seconds: Optional[float]


class FilterChainName(Enum):
    """Identifies the filter chain that a filter acts on.

    Preprocess filters will be run before the expose is processed by any further actions.
    Use this chain to filter exposes that can be excluded based on information scraped
    from the expose website alone (such as based on price or size).
    Postprocess filters will be run after other actions have completed. Use this if you
    require additional information from other steps, such as information from the Google
    Maps API, to make a decision on this expose.

    We separate the filter chains to avoid making expensive (literally!) calls to the
    Google Maps API for exposes that we already know we aren't interested in anyway."""
    # Chain for filters that only need the scraped expose data.
    PREPROCESS = 'PREPROCESS'
    # Chain for filters that need data added by later processing steps.
    POSTPROCESS = 'POSTPROCESS'
85 changes: 63 additions & 22 deletions flathunter/filter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
"""Module with implementations of standard expose filters"""
from functools import reduce
import re
from abc import ABC, ABCMeta
from typing import List, Any
from typing import List, Any, Dict

from flathunter.config import DistanceConfig
from flathunter.dataclasses import FilterChainName
from flathunter.gmaps_duration_processor import DistanceElement
from flathunter.logging import logger


class AbstractFilter(ABC):
Expand Down Expand Up @@ -172,30 +176,65 @@ def is_interesting(self, expose):
return pps <= self.max_pps


class FilterBuilder:
class DistanceFilter(AbstractFilter):
    """Exclude properties based on distance or duration to a location.

    This must be in the post-processing filter chain, as it requires data
    from the Google Maps API, which is not available right after scraping.

    The filter reads ``expose['durations_unformatted']``, a mapping from
    location name to a ``DistanceElement``. When no usable entry exists for the
    configured location, the expose is let through rather than dropped.
    """

    distance_config: DistanceConfig

    def __init__(self, distance_config: DistanceConfig):
        """Store the limits and location this filter instance checks."""
        self.distance_config = distance_config

    def is_interesting(self, expose):
        """Return False when the expose exceeds a configured limit.

        Returns True when no Google Maps data is available for the configured
        location (missing mapping, missing key, or a ``None`` entry from a
        failed lookup), or when every configured limit is respected. A limit
        that is ``None`` (or otherwise falsy) is not applied.
        """
        durations: Dict[str, DistanceElement] = expose.get('durations_unformatted', None)
        element = None
        if durations is not None:
            element = durations.get(self.distance_config.location_name)
        # A failed lookup may be stored as None; treat it like missing data
        # instead of crashing on attribute access below.
        if element is None:
            logger.info('DistanceFilter is enabled, but no GMaps data found. Skipping filter.')
            return True

        max_distance = self.distance_config.max_distance_meters
        max_duration = self.distance_config.max_duration_seconds
        if max_distance and not element.distance.meters < max_distance:
            return False
        if max_duration and not element.duration.seconds < max_duration:
            return False
        return True


class FilterChainBuilder:
"""Construct a filter chain"""
filters: List[AbstractFilter]

def __init__(self):
self.filters = []

def _append_filter_if_not_empty(self, filter_class: ABCMeta, filter_config: Any):
def _append_filter_if_not_empty(
self,
filter_class: ABCMeta,
filter_config: Any):
"""Appends a filter to the list if its configuration is set"""
if not filter_config:
return
self.filters.append(filter_class(filter_config))

def read_config(self, config):
def read_config(self, config, filter_chain: FilterChainName):
"""Adds filters from a config dictionary"""
self._append_filter_if_not_empty(TitleFilter, config.excluded_titles())
self._append_filter_if_not_empty(MinPriceFilter, config.min_price())
self._append_filter_if_not_empty(MaxPriceFilter, config.max_price())
self._append_filter_if_not_empty(MinSizeFilter, config.min_size())
self._append_filter_if_not_empty(MaxSizeFilter, config.max_size())
self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms())
self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms())
self._append_filter_if_not_empty(
PPSFilter, config.max_price_per_square())
if filter_chain == FilterChainName.PREPROCESS:
self._append_filter_if_not_empty(TitleFilter, config.excluded_titles())
self._append_filter_if_not_empty(MinPriceFilter, config.min_price())
self._append_filter_if_not_empty(MaxPriceFilter, config.max_price())
self._append_filter_if_not_empty(MinSizeFilter, config.min_size())
self._append_filter_if_not_empty(MaxSizeFilter, config.max_size())
self._append_filter_if_not_empty(MinRoomsFilter, config.min_rooms())
self._append_filter_if_not_empty(MaxRoomsFilter, config.max_rooms())
self._append_filter_if_not_empty(
PPSFilter, config.max_price_per_square())
elif filter_chain == FilterChainName.POSTPROCESS:
for distance_filter in config.max_distance():
self._append_filter_if_not_empty(DistanceFilter, distance_filter)
else:
raise NotImplementedError()
return self

def filter_already_seen(self, id_watch):
Expand All @@ -204,12 +243,12 @@ def filter_already_seen(self, id_watch):
return self

def build(self):
"""Return the compiled filter"""
return Filter(self.filters)
"""Return the compiled filter chain"""
return FilterChain(self.filters)


class Filter:
"""Abstract filter object"""
class FilterChain:
"""Collection of expose filters in use by a hunter instance"""

filters: List[AbstractFilter]

Expand All @@ -218,14 +257,16 @@ def __init__(self, filters: List[AbstractFilter]):

def is_interesting_expose(self, expose):
"""Apply all filters to this expose"""
return reduce((lambda x, y: x and y),
map((lambda x: x.is_interesting(expose)), self.filters), True)
for filter_ in self.filters:
if not filter_.is_interesting(expose):
return False
return True

def filter(self, exposes):
"""Apply all filters to every expose in the list"""
return filter(self.is_interesting_expose, exposes)

@staticmethod
def builder():
"""Return a new filter builder"""
return FilterBuilder()
"""Return a new filter chain builder"""
return FilterChainBuilder()
70 changes: 46 additions & 24 deletions flathunter/gmaps_duration_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,59 @@
import datetime
import time
from urllib.parse import quote_plus
from typing import Dict
import requests

from flathunter.dataclasses import (DistanceElement,
DistanceValueTuple,
DurationValueTuple,
TransportationModes)
from flathunter.logging import logger
from flathunter.abstract_processor import Processor


class GMapsDurationProcessor(Processor):
"""Implementation of Processor class to calculate travel durations"""

GM_MODE_TRANSIT = 'transit'
GM_MODE_BICYCLE = 'bicycling'
GM_MODE_DRIVING = 'driving'

def __init__(self, config):
self.config = config

def process_expose(self, expose):
"""Calculate the durations for an expose"""
expose['durations'] = self.get_formatted_durations(expose['address']).strip()
durations = self.get_distances_and_durations(expose['address'])
expose['durations'] = self._format_durations(durations).strip()
expose['durations_unformatted'] = durations
return expose

def get_distances_and_durations(self, address) -> Dict[str, DistanceElement]:
    """Return a dict mapping location names to distances and durations.

    Iterates the ``durations`` entries of the configuration. Entries missing
    ``destination``, ``name`` or ``modes`` are logged and skipped. A mode is
    only looked up when it has both ``gm_id`` and ``title`` and a ``key`` is
    present under ``google_maps_api``.

    Lookups that yield no result (``None``) are left out of the mapping so
    that consumers never receive a ``None`` value. The mapping is keyed by
    location name only, so when several modes are configured for one location
    the last successful lookup wins.
    """
    # Loop-invariant: without an API key no lookup is attempted at all.
    has_api_key = 'key' in self.config.get('google_maps_api', {})
    out = {}
    for entry in self.config.get('durations', []):
        if 'destination' not in entry or 'name' not in entry or 'modes' not in entry:
            logger.warning('illegal duration configuration: %s', entry)
            continue
        dest = entry.get('destination')
        name = entry.get('name')
        for mode in entry.get('modes') or []:
            if not has_api_key or 'gm_id' not in mode or 'title' not in mode:
                continue
            element = self._get_gmaps_distance(address, dest, mode['gm_id'])
            if element is None:
                # Failed lookup: keep any earlier result for this location.
                continue
            out[name] = element
    return out

def get_formatted_durations(self, address):
"""Return a formatted list of GoogleMaps durations"""
out = ""
for duration in self.config.get('durations', []):
if 'destination' in duration and 'name' in duration:
dest = duration.get('destination')
name = duration.get('name')
for mode in duration.get('modes', []):
if 'gm_id' in mode and 'title' in mode \
and 'key' in self.config.get('google_maps_api', {}):
duration = self.get_gmaps_distance(address, dest, mode['gm_id'])
title = mode['title']
out += f"> {name} ({title}): {duration}\n"
durations = self.get_distances_and_durations(address)
return self._format_durations(durations)

def _format_durations(self, durations: Dict[str, DistanceElement]):
    """Render the distance lookups as one ``> name (mode): duration (distance)`` line each.

    Entries whose value is ``None`` (a lookup that produced no result) are
    skipped instead of raising. Returns an empty string when there is nothing
    to show; lines are separated by newlines without a trailing newline.
    """
    lines = [
        f"> {location_name} ({val.mode.value}): {val.duration.text} ({val.distance.text})"
        for location_name, val in durations.items()
        if val is not None
    ]
    return "\n".join(lines)

def get_gmaps_distance(self, address, dest, mode):
def _get_gmaps_distance(self, address, dest, mode) -> DistanceElement | None:
"""Get the distance"""
# get timestamp for next monday at 9:00:00 o'clock
now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
Expand All @@ -54,11 +70,10 @@ def get_gmaps_distance(self, address, dest, mode):
base_url = self.config.get('google_maps_api', {}).get('url')
gm_key = self.config.get('google_maps_api', {}).get('key')

if not gm_key and mode != self.GM_MODE_DRIVING:
if not gm_key and mode != TransportationModes.DRIVING:
logger.warning("No Google Maps API key configured and without using a mode "
"different from 'driving' is not allowed. "
"Downgrading to mode 'drinving' thus. ")
mode = 'driving'
"different from 'driving' is not allowed. Thus downgrading to mode 'driving'.")
mode = TransportationModes.DRIVING
base_url = base_url.replace('&key={key}', '')

# retrieve the result
Expand All @@ -82,7 +97,14 @@ def get_gmaps_distance(self, address, dest, mode):
element['distance']['text'],
element['duration']['text'],
element['duration']['value'])
duration_text = element['duration']['text']
distance_text = element['distance']['text']
distances[element['duration']['value']] = f"{duration_text} ({distance_text})"
distance_element = DistanceElement(
duration=DurationValueTuple(
float(element['duration']['value']),
element['duration']['text']),
distance=DistanceValueTuple(
float(element['distance']['value']),
element['distance']['text']),
mode=TransportationModes(mode)
)
distances[distance_element.distance.meters] = distance_element
return distances[min(distances.keys())] if distances else None
Loading
Loading