From e5b8dcf2b00f4abdc58dce9909c9594e11fd05a3 Mon Sep 17 00:00:00 2001
From: David Kane
Date: Sun, 12 Apr 2020 16:07:24 +0100
Subject: [PATCH] use separate files

---
 findthatcharity_import/spiders/companies.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/findthatcharity_import/spiders/companies.py b/findthatcharity_import/spiders/companies.py
index 301911d..c454c57 100644
--- a/findthatcharity_import/spiders/companies.py
+++ b/findthatcharity_import/spiders/companies.py
@@ -15,7 +15,7 @@ class CompaniesSpider(BaseScraper):
     name = 'companies'
     allowed_domains = ['companieshouse.gov.uk']
     start_urls = ["http://download.companieshouse.gov.uk/en_output.html"]
-    zip_regex = re.compile(r"BasicCompanyDataAsOneFile.*\.zip")
+    zip_regex = re.compile(r"BasicCompanyData-.*\.zip")
     org_id_prefix = "GB-COH"
     clg_types = [
         "PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)",
@@ -84,13 +84,18 @@ def start_requests(self):
         return [scrapy.Request(self.start_urls[0], callback=self.fetch_zip)]
 
     def fetch_zip(self, response):
-        link = response.css("a::attr(href)").re_first(self.zip_regex)
-
-        self.source["distribution"][0]["accessURL"] = self.start_urls[0]
-        self.source["distribution"][0]["downloadURL"] = response.urljoin(link)
         self.source["modified"] = datetime.datetime.now().isoformat()
+        links = []
+        for i, link in enumerate(response.css("a::attr(href)").re(self.zip_regex)):
+
+            self.source["distribution"][i] = {
+                "accessURL": self.start_urls[0],
+                "downloadURL": response.urljoin(link),
+                "title": "Free Company Data Product",
+            }
 
-        return [scrapy.Request(response.urljoin(link), callback=self.process_zip)]
+            links.append(scrapy.Request(response.urljoin(link), callback=self.process_zip))
+        return links
 
     def process_zip(self, response):
         yield Source(**self.source)
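
Editor's note: for reference, a minimal sketch of what the revised fetch_zip does
once the patch is applied. Instead of grabbing the single BasicCompanyDataAsOneFile
zip, it matches every BasicCompanyData- zip on the download page (Companies House
also publishes the same data split across several part files), records one
distribution entry per file, and queues one download request per file. The class
stub below is an assumption for illustration: the real spider extends BaseScraper,
which supplies the full source metadata dict and the real process_zip. The
append-based distribution handling is also a deliberate variation, since the
patch's indexed assignment (self.source["distribution"][i] = {...}) would raise
IndexError if the page lists more zip files than the distribution list already
holds.

    import datetime
    import re

    import scrapy

    class CompaniesSpider(scrapy.Spider):
        # Trimmed stub; the real class extends BaseScraper and carries
        # the rest of the source metadata (publisher, licence, etc.).
        name = "companies"
        start_urls = ["http://download.companieshouse.gov.uk/en_output.html"]
        zip_regex = re.compile(r"BasicCompanyData-.*\.zip")
        source = {"distribution": [], "modified": None}

        def fetch_zip(self, response):
            self.source["modified"] = datetime.datetime.now().isoformat()
            # Rebuild the distribution list from scratch, one entry per zip file,
            # rather than assigning by index into the existing list.
            self.source["distribution"] = []
            links = []
            for link in response.css("a::attr(href)").re(self.zip_regex):
                self.source["distribution"].append({
                    "accessURL": self.start_urls[0],
                    "downloadURL": response.urljoin(link),
                    "title": "Free Company Data Product",
                })
                # One request per matched file, all funnelled to process_zip.
                links.append(scrapy.Request(response.urljoin(link),
                                            callback=self.process_zip))
            return links

        def process_zip(self, response):
            pass  # Stub; the real method yields the Source record and parses the zip.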