Skip to content
This repository has been archived by the owner on Sep 21, 2020. It is now read-only.

Commit

Permalink
add redis to make scraping more memory efficient
Browse files Browse the repository at this point in the history
  • Loading branch information
drkane committed Jan 24, 2020
1 parent 9d9cfa0 commit aae8d57
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 7 deletions.
46 changes: 39 additions & 7 deletions findthatcharity_import/spiders/ccew.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@
import bcp
import tempfile
import os
import pickle

import scrapy
import tqdm
import redis

from .base_scraper import BaseScraper
from ..items import Organisation, Source, AREA_TYPES

class CCEWSpider(BaseScraper):
name = 'ccew'
custom_settings = {
'DOWNLOAD_TIMEOUT': 180 * 3
'DOWNLOAD_TIMEOUT': 180 * 3,
'REDIS_URL': os.environ.get('REDIS_URL'),
}
allowed_domains = ['charitycommission.gov.uk']
start_urls = [
Expand Down Expand Up @@ -111,6 +114,10 @@ class CCEWSpider(BaseScraper):
}

def start_requests(self):
    """Set up the optional Redis client and queue the first download.

    ``self.redis`` becomes a ``StrictRedis`` client built from the
    ``REDIS_URL`` setting, or ``None`` when that setting is empty or unset.

    Returns:
        A one-item list holding the request for ``start_urls[1]``, handled
        by ``download_aoo_ref``.
    """
    redis_url = self.settings.get('REDIS_URL')
    # A falsy URL means "no Redis"; the charity accessors check self.redis
    # to decide which storage to use.
    self.redis = redis.StrictRedis.from_url(redis_url) if redis_url else None

    first_request = scrapy.Request(
        self.start_urls[1],
        callback=self.download_aoo_ref,
    )
    return [first_request]
Expand All @@ -135,7 +142,7 @@ def fetch_zip(self, response):

def process_zip(self, response):
self.logger.info("File size: {}".format(len(response.body)))
self.charities = {}
self.initialise_charities()

with tempfile.TemporaryDirectory() as tmpdirname:
cczip_name = os.path.join(tmpdirname, 'ccew.zip')
Expand Down Expand Up @@ -175,20 +182,45 @@ def process_bcp(self, bcpfile, filename):
row = self.clean_fields(row)
if not row.get("regno"):
continue
if row["regno"] not in self.charities:
self.charities[row["regno"]] = {
charity = self.get_charity(row['regno'])
if not charity:
charity = {
f: [] for f in self.ccew_files.keys()
}
if (filename in ["extract_main_charity", "extract_charity"] and row.get("subno", '0') == '0'):
for field in row:
self.charities[row["regno"]][field] = row[field]
charity[field] = row[field]
else:
self.charities[row["regno"]][filename].append(row)
charity[filename].append(row)
self.set_charity(row['regno'], charity)

def initialise_charities(self):
    """Reset the charity store ahead of a fresh import.

    Without Redis, ``self.charities`` is replaced by an empty dict and
    ``None`` is returned. With Redis, the ``'charities'`` key is deleted
    and the result of that ``delete`` call is returned.
    """
    if not self.redis:
        self.charities = {}
        return None
    return self.redis.delete('charities')

def get_charity(self, regno):
    """Look up the record stored for ``regno``.

    Without Redis this is a plain ``dict.get`` on ``self.charities``
    (``None`` when absent). With Redis the ``regno`` field of the
    ``'charities'`` hash is fetched; a falsy result is returned unchanged,
    anything else is unpickled first.
    """
    if not self.redis:
        return self.charities.get(regno)

    stored = self.redis.hget('charities', regno)
    if not stored:
        return stored
    # NOTE: pickle.loads must only ever see data this spider wrote itself;
    # unpickling untrusted bytes can execute arbitrary code.
    return pickle.loads(stored)

def set_charity(self, regno, charity):
    """Store ``charity`` under ``regno``.

    Without Redis the record is assigned into ``self.charities`` and
    ``None`` is returned. With Redis the record is pickled, written to the
    ``regno`` field of the ``'charities'`` hash, and the result of that
    ``hset`` call is returned.
    """
    if not self.redis:
        self.charities[regno] = charity
        return None

    payload = pickle.dumps(charity)
    return self.redis.hset('charities', regno, payload)

def get_all_charities(self):
    """Yield a ``(regno, charity)`` pair for every stored charity.

    With Redis the ``'charities'`` hash is scanned incrementally and each
    value is unpickled. Without Redis the pairs come from the
    ``self.charities`` dict.

    Yields:
        tuple: ``(regno, charity)``.
    """
    if self.redis:
        for regno, charity in self.redis.hscan_iter('charities'):
            # redis-py returns hash field names as bytes unless the client
            # was created with decode_responses=True; decode them so both
            # storage back ends yield text keys.
            if isinstance(regno, bytes):
                regno = regno.decode()
            # NOTE: pickle.loads must only ever see data this spider wrote
            # itself; unpickling untrusted bytes can execute arbitrary code.
            yield (regno, pickle.loads(charity))
    else:
        # This function is a generator because of the `yield` above, so a
        # plain `return self.charities.items()` would end iteration without
        # producing anything. Delegate with `yield from` instead.
        yield from self.charities.items()

def process_charities(self):
yield Source(**self.source)

for regno, record in self.charities.items():
for regno, record in self.get_all_charities():

# helps with debugging - shouldn't normally be empty
record["regno"] = regno
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ pyexcel-ezodf==0.3.4
pyexcel-io==0.5.20
pyexcel-ods3==0.5.3
pymongo==3.10.1
redis==3.3.11
bcp-reader==0.1.1
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"pyexcel-io==0.5.20",
"pyexcel-ods3==0.5.3",
"pymongo==3.10.1",
"redis==3.3.11",
"bcp-reader==0.1.1",
],
entry_points={
Expand Down

0 comments on commit aae8d57

Please sign in to comment.