From 38201f3709f29b548a3433f4983511ee8ee0ab8a Mon Sep 17 00:00:00 2001 From: chrisr Date: Wed, 21 Aug 2024 17:21:52 +0100 Subject: [PATCH] Update website_crawler.py to keep clean urls --- crawlers/website_crawler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/crawlers/website_crawler.py b/crawlers/website_crawler.py index c7c58f4..845d8ef 100644 --- a/crawlers/website_crawler.py +++ b/crawlers/website_crawler.py @@ -79,7 +79,6 @@ def crawl(self) -> None: pos_regex=self.pos_regex, neg_regex=self.neg_regex, indexer=self.indexer, visited=set(), verbose=self.indexer.verbose) urls = clean_urls(urls_set, keep_query_params) - urls = list(set(urls_set)) else: logging.info(f"Unknown pages_source: {self.cfg.website_crawler.pages_source}") return