Updated crawler for new immowelt website
tofylion committed Sep 8, 2024
1 parent c6c88f0 commit b25c1e1
Showing 1 changed file with 31 additions and 25 deletions.
56 changes: 31 additions & 25 deletions flathunter/crawler/immowelt.py
@@ -49,61 +49,68 @@ def get_expose_details(self, expose):
     def extract_data(self, soup: BeautifulSoup):
         """Extracts all exposes from a provided Soup object"""
         entries = []
-        soup_res = soup.find("main")
+        soup_res = soup
         if not isinstance(soup_res, Tag):
             return []
 
-        title_elements = soup_res.find_all("h2")
-        expose_ids = soup_res.find_all("a", id=True)
+        advertisements = soup_res.find_all("div", attrs={"class": "css-79elbk"})
+        for adv in advertisements:
+            try:
+                title = adv.find("div", {"class": "css-1cbj9xw"}).text
+            except:
+                title = ""
 
-        for idx, title_el in enumerate(title_elements):
             try:
-                price = expose_ids[idx].find(
-                    "div", attrs={"data-test": "price"}).text
-            except IndexError:
+                price = adv.find(
+                    "div", attrs={"data-test": "cardmfe-price-testid"}).text
+            except:
                 price = ""
 
             try:
-                size = expose_ids[idx].find(
-                    "div", attrs={"data-test": "area"}).text
+                descriptions = adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children
+                descriptions = [result.text for result in descriptions]
+            except:
+                descriptions = []
+
+            size = list(filter(lambda x: "m²" in x, descriptions))
+            try:
+                size = size[0]
             except IndexError:
                 size = ""
 
+            rooms = list(filter(lambda x: "Zimmer" in x, descriptions))
             try:
-                rooms = expose_ids[idx].find(
-                    "div", attrs={"data-test": "rooms"}).text.replace(" Zi.", "")
+                rooms = rooms[0]
             except IndexError:
                 rooms = ""
 
+            id_element = adv.find("a")
             try:
-                url = expose_ids[idx].get("href")
+                url = "https://immowelt.de" + id_element.get("href")
             except IndexError:
                 continue
 
-            picture = expose_ids[idx].find("picture")
+            picture = adv.find("img")
             image = None
             if picture:
-                src = picture.find("source")
-                if src:
-                    image = src.get("data-srcset")
+                image = picture.get('src')
 
             try:
-                address = expose_ids[idx].find(
-                    "div", attrs={"class": re.compile("IconFact.*")}
-                )
-                address = address.find("span").text
+                address = adv.find(
+                    "div", attrs={"data-testid": "cardmfe-description-box-address"}
+                ).text
             except (IndexError, AttributeError):
                 address = ""
 
+            ad_id = url.split('/')[-1]
             processed_id = int(
-                hashlib.sha256(expose_ids[idx].get("id").encode('utf-8')).hexdigest(), 16
+                hashlib.sha256(ad_id.encode('utf-8')).hexdigest(), 16
             ) % 10**16
 
             details = {
                 'id': processed_id,
                 'image': image,
                 'url': url,
-                'title': title_el.text.strip(),
+                'title': title.strip(),
                 'rooms': rooms,
                 'price': price,
                 'size': size,
@@ -113,5 +120,4 @@ def extract_data(self, soup: BeautifulSoup):
             entries.append(details)
 
         logger.debug('Number of entries found: %d', len(entries))
-
-        return entries
+        return entries
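
As a rough illustration of the new per-card parsing, the sketch below runs the same selectors against a hand-written listing card. Only the class names and data attributes (css-79elbk, css-1cbj9xw, cardmfe-price-testid, cardmfe-keyfacts-testid, cardmfe-description-box-address) come from the diff above; the sample markup, variable names, and the final print are invented for illustration and are not part of the commit.

# Illustrative only: exercising the selectors from the updated crawler
# against invented sample markup.
import hashlib

from bs4 import BeautifulSoup

SAMPLE_CARD = (
    '<div class="css-79elbk">'
    '<a href="/expose/abc123"></a>'
    '<div class="css-1cbj9xw">Helle 2-Zimmer-Wohnung</div>'
    '<div data-test="cardmfe-price-testid">850 €</div>'
    '<div data-testid="cardmfe-keyfacts-testid">'
    '<span>54 m²</span><span>2 Zimmer</span><span>2. OG</span></div>'
    '<div data-testid="cardmfe-description-box-address">10115 Berlin</div>'
    '</div>'
)

adv = BeautifulSoup(SAMPLE_CARD, "html.parser").find("div", attrs={"class": "css-79elbk"})

# Key facts are read as plain strings and filtered by keyword, mirroring the
# size/rooms extraction in the new code.
descriptions = [child.text for child in
                adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children]
size = next((d for d in descriptions if "m²" in d), "")
rooms = next((d for d in descriptions if "Zimmer" in d), "")

# The expose URL is rebuilt from the card's anchor, and a 16-digit numeric id
# is derived by hashing the last path segment, as in the diff.
url = "https://immowelt.de" + adv.find("a").get("href")
ad_id = url.split("/")[-1]
processed_id = int(hashlib.sha256(ad_id.encode("utf-8")).hexdigest(), 16) % 10**16

print(size, rooms, url, processed_id)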
