Skip to content
This repository has been archived by the owner on Sep 21, 2020. It is now read-only.

Commit

Permalink
add primary organisation type to scrapers
Browse files Browse the repository at this point in the history
  • Loading branch information
drkane committed Apr 19, 2020
1 parent f3aa1d0 commit b1eaae2
Show file tree
Hide file tree
Showing 18 changed files with 36 additions and 9 deletions.
3 changes: 2 additions & 1 deletion findthatcharity_import/spiders/casc.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def parse(self, response):
"alternateName": [],
"email": None,
"description": None,
"organisationType": ["Sports Club", "Community Amateur Sports Club"],
"organisationType": ["Community Amateur Sports Club", "Sports Club"],
"organisationTypePrimary": ["Community Amateur Sports Club"],
"url": None,
"location": [],
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/ccew.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ def process_charities(self):
"email": record.get("email"),
"description": self.get_objects(record),
"organisationType": org_types,
"organisationTypePrimary": 'Registered Charity',
"url": self.parse_url(record.get("web")),
"location": self.get_locations(record),
"latestIncome": int(record["income"]) if record.get("income") else None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/ccni.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def parse_row(self, record):
"email": record.get("Email"),
"description": None,
"organisationType": org_types,
"organisationTypePrimary": 'Registered Charity',
"url": self.parse_url(record.get("Website")),
"location": [],
"latestIncome": int(record["Total income"]) if record.get("Total income") else None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/companies.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def parse_row(self, row):
"email": None,
"description": None,
"organisationType": orgtypes,
"organisationTypePrimary": record.get("CompanyCategory", "Regisered Company"),
"url": None,
"location": [],
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/gor.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def parse_row(self, record):
"email": None,
"description": None,
"organisationType": org_types,
"organisationTypePrimary": org_types[0],
"url": record.get("website"),
"location": [],
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/hesa.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def get_rows(self, response):
"email": None,
"description": None,
"organisationType": self.org_types.get(cells[4].strip(), cells[4].strip()),
"organisationType": self.org_types.get(cells[4].strip(), ["Higher Education"])[0],
"url": None,
"location": [],
"latestIncome": None,
Expand Down
2 changes: 1 addition & 1 deletion findthatcharity_import/spiders/lae.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ def parse_row(self, record):
"alternateName": [],
"email": None,
"description": None,
"organisationTypePrimary": "Local Authority",
"organisationType": org_types,
"organisationTypePrimary": "Local Authority",
"url": None,
"location": locations,
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/lani.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def parse_row(self, record):
"email": None,
"description": None,
"organisationType": org_types,
"organisationTypePrimary": "Local Authority",
"url": record.get("website"),
"location": locations,
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/las.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def parse_row(self, record):
"email": None,
"description": None,
"organisationType": org_types,
"organisationTypePrimary": "Local Authority",
"url": None,
"location": locations,
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/mutuals.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
# "email": None,
# "description": record.get("Registration Act"),
# "organisationType": org_types,
# "organisationTypePrimary": record.get("Registered As"),
# "url": None,
# "location": [],
# "latestIncome": None,
Expand Down
4 changes: 3 additions & 1 deletion findthatcharity_import/spiders/nhsods.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,10 @@ def parse_row(self, record, org_type=None):

record = self.clean_fields(record)

org_types = ["Health"]
org_types = []
if org_type:
org_types.append(org_type)
org_types.append("Health")

address = {
"streetAddress": record.get("Address Line 1"),
Expand Down Expand Up @@ -159,6 +160,7 @@ def parse_row(self, record, org_type=None):
"email": None,
"description": None,
"organisationType": org_types,
"organisationTypePrimary": org_types[0],
"url": None,
"location": [],
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/oscr.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def parse_row(self, record):
"email": None,
"description": record.get("Objectives"),
"organisationType": org_types,
"organisationTypePrimary": "Registered Charity",
"url": self.parse_url(record.get("Website")),
"location": [],
"latestIncome": int(record["Most recent year income"]) if record.get("Most recent year income") else None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/pla.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def parse_row(self, record):
"email": None,
"description": None,
"organisationType": org_types,
"organisationTypePrimary": "Local Authority",
"url": None,
"location": locations,
"latestIncome": None,
Expand Down
2 changes: 1 addition & 1 deletion findthatcharity_import/spiders/rsl.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ def parse_row(self, record):
"alternateName": [],
"email": None,
"description": None,
"organisationTypePrimary": "Registered Provider of Social Housing",
"organisationType": org_types,
"organisationTypePrimary": "Registered Provider of Social Housing",
"url": None,
"location": locations,
"latestIncome": None,
Expand Down
16 changes: 14 additions & 2 deletions findthatcharity_import/spiders/schools_gias.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,17 @@ def find_csv(self, response):
self.source["modified"] = datetime.datetime.now().isoformat()
return [scrapy.Request(response.urljoin(link), callback=self.parse_csv)]

def depluralise(self, s):
    """Return a singular form of the establishment-type label ``s``.

    Non-string values (for example ``None`` when the column is missing)
    are returned unchanged, so the result of ``record.get(...)`` can be
    passed in directly.

    Rules, applied in order:

    * ``"Other types"`` is special-cased to ``"Other school"``.
    * A trailing ``"ies"`` becomes ``"y"`` ("Academies" -> "Academy").
    * A trailing ``"s"`` is dropped ("Free Schools" -> "Free School"),
      unless the label ends in ``"ss"`` or ``"us"``; those endings are
      not regular plurals ("Miscellaneous", "Class") and are kept as-is.
    * Anything else is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    if s == "Other types":
        return "Other school"
    if s.endswith("ies"):
        return s[:-3] + "y"
    # Only strip a genuine plural "s"; words such as "Miscellaneous" would
    # otherwise be truncated to "Miscellaneou".
    if s.endswith("s") and not s.endswith(("ss", "us")):
        return s[:-1]
    return s

def parse_row(self, record):

record = self.clean_fields(record)
Expand All @@ -90,9 +101,10 @@ def parse_row(self, record):
description=None,
organisationType=[
"Education",
record.get("EstablishmentTypeGroup (name)"),
record.get("TypeOfEstablishment (name)"),
self.depluralise(record.get("EstablishmentTypeGroup (name)")),
self.depluralise(record.get("TypeOfEstablishment (name)")),
],
organisationTypePrimary=self.depluralise(record.get("EstablishmentTypeGroup (name)")),
url=self.parse_url(record.get("SchoolWebsite")),
location=self.get_locations(record),
latestIncome=None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/schools_ni.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def parse_row(self, record):
record.get("Management"),
record.get("Type", "") + " School",
],
"organisationTypePrimary": record.get("Type", "") + " School",
"url": None,
"location": [],
"latestIncome": None,
Expand Down
1 change: 1 addition & 0 deletions findthatcharity_import/spiders/schools_scotland.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def parse(self, response):
email=record.get("e_mail"),
description=None,
organisationType=self.get_org_types(record),
organisationTypePrimary=self.get_org_types(record)[1],
url=None,
location=self.get_locations(record),
latestIncome=None,
Expand Down
6 changes: 3 additions & 3 deletions findthatcharity_import/spiders/schools_wales.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def parse_row(self, record):
"email": None,
"description": None,
"organisationType": self.get_org_types(record),
"organisationTypePrimary": self.get_org_types(record)[0],
"url": None,
"location": self.get_locations(record),
"latestIncome": None,
Expand All @@ -125,15 +126,14 @@ def parse_row(self, record):
})

def get_org_types(self, record):
    """Build the list of organisation types for one school record.

    Each populated type field contributes one entry, in the order the
    fields are listed below: the value ``"PRU"`` is expanded to
    ``"Pupil Referral Unit"``, and any other value has ``" School"``
    appended. The generic ``"Education"`` category is appended last, so
    the returned list is never empty and its first element is the most
    specific type whenever at least one field is populated.

    ``record`` is read with ``.get``, so missing or empty fields are
    simply skipped.
    """
    type_fields = (
        "Sector",
        "Governance - see notes",
        "Welsh Medium Type - see notes",
        "School Type",
        "type",
    )
    org_types = []
    for field in type_fields:
        value = record.get(field)
        if not value:
            continue
        if value == "PRU":
            org_types.append("Pupil Referral Unit")
        else:
            org_types.append(value + " School")
    # Generic category goes last so index 0 stays the most specific type.
    org_types.append("Education")
    return org_types

def get_locations(self, record):
Expand Down

0 comments on commit b1eaae2

Please sign in to comment.