Updated crawler for new immowelt website
tofylion committed Sep 8, 2024
1 parent c6c88f0 commit b25c1e1
Showing 1 changed file with 31 additions and 25 deletions.
56 changes: 31 additions & 25 deletions flathunter/crawler/immowelt.py
@@ -49,61 +49,68 @@ def get_expose_details(self, expose):
     def extract_data(self, soup: BeautifulSoup):
         """Extracts all exposes from a provided Soup object"""
         entries = []
-        soup_res = soup.find("main")
+        soup_res = soup
         if not isinstance(soup_res, Tag):
             return []
 
-        title_elements = soup_res.find_all("h2")
-        expose_ids = soup_res.find_all("a", id=True)
+        advertisements = soup_res.find_all("div", attrs={"class": "css-79elbk"})
+        for adv in advertisements:
+            try:
+                title = adv.find("div", {"class": "css-1cbj9xw"}).text
+            except:
+                title = ""
 
-        for idx, title_el in enumerate(title_elements):
             try:
-                price = expose_ids[idx].find(
-                    "div", attrs={"data-test": "price"}).text
-            except IndexError:
+                price = adv.find(
+                    "div", attrs={"data-test": "cardmfe-price-testid"}).text
+            except:
                 price = ""
 
             try:
-                size = expose_ids[idx].find(
-                    "div", attrs={"data-test": "area"}).text
+                descriptions = adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children
+                descriptions = [result.text for result in descriptions]
+            except:
+                descriptions = []
+
+            size = list(filter(lambda x: "m²" in x, descriptions))
+            try:
+                size = size[0]
             except IndexError:
                 size = ""
 
+            rooms = list(filter(lambda x: "Zimmer" in x, descriptions))
             try:
-                rooms = expose_ids[idx].find(
-                    "div", attrs={"data-test": "rooms"}).text.replace(" Zi.", "")
+                rooms = rooms[0]
             except IndexError:
                 rooms = ""
 
+            id_element = adv.find("a")
             try:
-                url = expose_ids[idx].get("href")
+                url = "https://immowelt.de" + id_element.get("href")
             except IndexError:
                 continue
 
-            picture = expose_ids[idx].find("picture")
+            picture = adv.find("img")
             image = None
             if picture:
-                src = picture.find("source")
-                if src:
-                    image = src.get("data-srcset")
+                image = picture.get('src')
 
             try:
-                address = expose_ids[idx].find(
-                    "div", attrs={"class": re.compile("IconFact.*")}
-                )
-                address = address.find("span").text
+                address = adv.find(
+                    "div", attrs={"data-testid": "cardmfe-description-box-address"}
+                ).text
             except (IndexError, AttributeError):
                 address = ""
 
+            ad_id = url.split('/')[-1]
             processed_id = int(
-                hashlib.sha256(expose_ids[idx].get("id").encode('utf-8')).hexdigest(), 16
+                hashlib.sha256(ad_id.encode('utf-8')).hexdigest(), 16
             ) % 10**16
 
             details = {
                 'id': processed_id,
                 'image': image,
                 'url': url,
-                'title': title_el.text.strip(),
+                'title': title.strip(),
                 'rooms': rooms,
                 'price': price,
                 'size': size,
@@ -113,5 +120,4 @@ def extract_data(self, soup: BeautifulSoup):
             entries.append(details)
 
         logger.debug('Number of entries found: %d', len(entries))
-
-        return entries
+        return entries
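
As a rough illustration of the new per-card parsing, the sketch below runs the same selectors against a hand-written listing card. Only the class names and data attributes (css-79elbk, css-1cbj9xw, cardmfe-price-testid, cardmfe-keyfacts-testid, cardmfe-description-box-address) come from the diff above; the sample markup, variable names, and the final print are invented for illustration and are not part of the commit.

# Illustrative only: exercising the selectors from the updated crawler
# against invented sample markup.
import hashlib

from bs4 import BeautifulSoup

SAMPLE_CARD = (
    '<div class="css-79elbk">'
    '<a href="/expose/abc123"></a>'
    '<div class="css-1cbj9xw">Helle 2-Zimmer-Wohnung</div>'
    '<div data-test="cardmfe-price-testid">850 €</div>'
    '<div data-testid="cardmfe-keyfacts-testid">'
    '<span>54 m²</span><span>2 Zimmer</span><span>2. OG</span></div>'
    '<div data-testid="cardmfe-description-box-address">10115 Berlin</div>'
    '</div>'
)

adv = BeautifulSoup(SAMPLE_CARD, "html.parser").find("div", attrs={"class": "css-79elbk"})

# Key facts are read as plain strings and filtered by keyword, mirroring the
# size/rooms extraction in the new code.
descriptions = [child.text for child in
                adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children]
size = next((d for d in descriptions if "m²" in d), "")
rooms = next((d for d in descriptions if "Zimmer" in d), "")

# The expose URL is rebuilt from the card's anchor, and a 16-digit numeric id
# is derived by hashing the last path segment, as in the diff.
url = "https://immowelt.de" + adv.find("a").get("href")
ad_id = url.split("/")[-1]
processed_id = int(hashlib.sha256(ad_id.encode("utf-8")).hexdigest(), 16) % 10**16

print(size, rooms, url, processed_id)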
