This repository was archived by the owner on Sep 21, 2020. It is now read-only.

Commit e5b8dcf: use separate files
drkane committed Apr 12, 2020 (1 parent: f26bb88)

Showing 1 changed file with 11 additions and 6 deletions.

findthatcharity_import/spiders/companies.py
@@ -15,7 +15,7 @@ class CompaniesSpider(BaseScraper):
     name = 'companies'
     allowed_domains = ['companieshouse.gov.uk']
     start_urls = ["http://download.companieshouse.gov.uk/en_output.html"]
-    zip_regex = re.compile(r"BasicCompanyDataAsOneFile.*\.zip")
+    zip_regex = re.compile(r"BasicCompanyData-.*\.zip")
     org_id_prefix = "GB-COH"
     clg_types = [
         "PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)",
@@ -84,13 +84,18 @@ def start_requests(self):
         return [scrapy.Request(self.start_urls[0], callback=self.fetch_zip)]

     def fetch_zip(self, response):
-        link = response.css("a::attr(href)").re_first(self.zip_regex)
-        self.source["distribution"][0]["accessURL"] = self.start_urls[0]
-        self.source["distribution"][0]["downloadURL"] = response.urljoin(link)
-        self.source["modified"] = datetime.datetime.now().isoformat()
-        return [scrapy.Request(response.urljoin(link), callback=self.process_zip)]
+        links = []
+        for i, link in enumerate(response.css("a::attr(href)").re(self.zip_regex)):
+            self.source["distribution"][i] = {
+                "accessURL": self.start_urls[0],
+                "downloadURL": response.urljoin(link),
+                "title": "Free Company Data Product",
+            }
+
+            links.append(scrapy.Request(response.urljoin(link), callback=self.process_zip))
+        return links

     def process_zip(self, response):
         yield Source(**self.source)
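A note on what the change does: the download page at download.companieshouse.gov.uk offers the Free Company Data Product both as a single large zip and split into several smaller "part" zips. The new regex selects the split files rather than the one-file archive, and the reworked fetch_zip records a distribution entry and schedules a download for every match instead of only the first. A quick sanity check of the two patterns (the filenames are illustrative, following the Companies House naming convention; they are not taken from the commit):

    import re

    old_regex = re.compile(r"BasicCompanyDataAsOneFile.*\.zip")  # before this commit
    new_regex = re.compile(r"BasicCompanyData-.*\.zip")          # after this commit

    # Illustrative hrefs in the style of the Companies House download page
    hrefs = [
        "BasicCompanyDataAsOneFile-2020-04-01.zip",
        "BasicCompanyData-2020-04-01-part1_6.zip",
        "BasicCompanyData-2020-04-01-part2_6.zip",
    ]

    print([h for h in hrefs if old_regex.search(h)])
    # ['BasicCompanyDataAsOneFile-2020-04-01.zip']
    print([h for h in hrefs if new_regex.search(h)])
    # ['BasicCompanyData-2020-04-01-part1_6.zip', 'BasicCompanyData-2020-04-01-part2_6.zip']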

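The fan-out itself is a standard Scrapy pattern: yielding (or returning) one Request per matched link makes Scrapy download each zip separately and invoke the callback once per file. A minimal standalone sketch of that pattern (the spider below is illustrative only; its name, parse method, and logging are not part of this repository):

    import re

    import scrapy


    class MultiZipSpider(scrapy.Spider):
        # Hypothetical spider showing the one-request-per-zip pattern
        name = "multizip_sketch"
        start_urls = ["http://download.companieshouse.gov.uk/en_output.html"]
        zip_regex = re.compile(r"BasicCompanyData-.*\.zip")

        def parse(self, response):
            # .re() returns every href matching the pattern, where
            # .re_first() would stop at the first match
            for link in response.css("a::attr(href)").re(self.zip_regex):
                yield scrapy.Request(response.urljoin(link), callback=self.process_zip)

        def process_zip(self, response):
            self.logger.info("downloaded %s (%d bytes)", response.url, len(response.body))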