This repository has been archived by the owner on Dec 5, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
149 lines (127 loc) · 6.08 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import gzip
import orjson
from pathlib import Path
from pprint import pprint
from typing import Any, Dict
from zavod import Zavod, init_context
# Maps an identifier's ``schemeName`` to the entity property its value is
# stored under. A value of ``None`` marks a scheme that is recognised (so no
# "Unknown scheme" warning is logged) but whose identifiers are not stored.
SCHEME_PROPS = {
    "Not a valid Org-Id scheme, provided for backwards compatibility": "registrationNumber",
    "DK Centrale Virksomhedsregister": "registrationNumber",
    "Danish Central Business Register": "registrationNumber",
    "AF EITI 2013-2015 beneficial ownership pilot": "alias",
    "CM EITI 2013-2015 beneficial ownership pilot": "alias",
    "GB EITI 2013-2015 beneficial ownership pilot": "alias",
    "ZM EITI 2013-2015 beneficial ownership pilot": "alias",
    "GH EITI 2013-2015 beneficial ownership pilot": "alias",
    "HN EITI 2013-2015 beneficial ownership pilot": "alias",
    "ID EITI 2013-2015 beneficial ownership pilot": "alias",
    "BF EITI 2013-2015 beneficial ownership pilot": "alias",
    "MR EITI 2013-2015 beneficial ownership pilot": "alias",
    "CD EITI 2013-2015 beneficial ownership pilot": "alias",
    "TT EITI 2013-2015 beneficial ownership pilot": "alias",
    "TG EITI 2013-2015 beneficial ownership pilot": "alias",
    "TZ EITI 2013-2015 beneficial ownership pilot": "alias",
    "LR EITI 2013-2015 beneficial ownership pilot": "alias",
    "SC EITI 2013-2015 beneficial ownership pilot": "alias",
    "NG EITI 2013-2015 beneficial ownership pilot": "alias",
    "NO EITI 2013-2015 beneficial ownership pilot": "alias",
    "MG EITI 2013-2015 beneficial ownership pilot": "alias",
    "MM EITI 2013-2015 beneficial ownership pilot": "alias",
    "ML EITI 2013-2015 beneficial ownership pilot": "alias",
    "KG EITI 2013-2015 beneficial ownership pilot": "alias",
    "EITI Structured Data - Côte d'Ivoire": "alias",
    "UA Edinyy Derzhavnyj Reestr": "registrationNumber",
    "United State Register": "registrationNumber",
    "Ministry of Justice Business Register": "registrationNumber",
    "SK Register Partnerov Verejného Sektora": "registrationNumber",
    "GB Persons Of Significant Control Register": None,
    "GB Persons Of Significant Control Register - Registration numbers": "registrationNumber",
    "OpenOwnership Register": "sourceUrl",
    "OpenCorporates": "opencorporatesUrl",
    "Companies House": "registrationNumber",
}
def parse_statement(context: Zavod, data: Dict[str, Any]) -> None:
    """Convert one ownership statement into an entity proxy and emit it.

    The statement is consumed destructively: every field this parser knows
    about is popped from ``data`` (and from its nested dicts), so whatever is
    left over at the end can be pretty-printed as a hint about unhandled
    fields.

    Args:
        context: Supplies ``make`` to create proxies, ``emit`` to publish
            them, and ``log`` for warnings.
        data: One decoded statement. Must contain ``statementType`` and
            ``statementID``; it is mutated in place.

    Raises:
        KeyError: If a field that is popped without a default (for example
            ``statementType``, ``statementID``, ``personType`` or
            ``entityType``) is missing.
        AssertionError: If a person statement carries a ``personType`` other
            than ``knownPerson``, ``unknownPerson`` or ``anonymousPerson``.
    """
    statement_type = data.pop("statementType")
    statement_id = data.pop("statementID")
    # Countries already recorded for this statement, so that addresses do not
    # add the same country a second time.
    countries = set()

    if statement_type == "personStatement":
        person_type = data.pop("personType")
        if person_type in ("unknownPerson", "anonymousPerson"):
            # Unknown and anonymous persons are dropped without emitting.
            return
        assert person_type == "knownPerson", (person_type, data)
        proxy = context.make("Person")
        proxy.add("birthDate", data.pop("birthDate", None))
        for name in data.pop("names", []):
            proxy.add("name", name.pop("fullName"))
        for nat in data.pop("nationalities", []):
            countries.add(nat.pop("code"))
            proxy.add("nationality", nat.pop("name"))
    elif statement_type == "entityStatement":
        # The entity type is not mapped to a property; it is popped only so
        # that it does not appear in the leftover-fields dump.
        data.pop("entityType")
        proxy = context.make("LegalEntity")
        proxy.add("name", data.pop("name", None))
        proxy.add("incorporationDate", data.pop("foundingDate", None))
        proxy.add("dissolutionDate", data.pop("dissolutionDate", None))
        juris = data.pop("incorporatedInJurisdiction", {})
        # The default argument is evaluated eagerly, so both "code" and
        # "name" are removed from ``juris``; "code" wins when both exist.
        juris_code = juris.pop("code", juris.pop("name", None))
        if juris:
            pprint(juris)
        countries.add(juris_code)
        proxy.add("jurisdiction", juris_code)
    elif statement_type == "ownershipOrControlStatement":
        proxy = context.make("Ownership")
        interested_party = data.pop("interestedParty", {})
        # The owner is referenced by either a person or an entity statement.
        proxy.add("owner", interested_party.pop("describedByPersonStatement", None))
        proxy.add("owner", interested_party.pop("describedByEntityStatement", None))
        subject = data.pop("subject", {})
        proxy.add("asset", subject.pop("describedByEntityStatement", None))
        proxy.add("date", data.pop("statementDate", None))
        source = data.pop("source", {})
        proxy.add("publisher", source.pop("description", None))
        proxy.add("publisherUrl", source.pop("url", None))
        proxy.add("retrievedAt", source.pop("retrievedAt", None))
        for inter in data.pop("interests", []):
            proxy.add("role", inter.pop("type", None))
            proxy.add("summary", inter.pop("details", None))
            proxy.add("startDate", inter.pop("startDate", None))
            proxy.add("endDate", inter.pop("endDate", None))
        if data:
            pprint(data)
    else:
        # No proxy exists for an unrecognised type, so stop here instead of
        # falling through to code that would fail on the unbound ``proxy``.
        # The type is passed as a keyword, like the other warnings below.
        context.log.warn(
            "Unknown statement type",
            statement_type=statement_type,
        )
        return

    proxy.id = statement_id

    for addr in data.pop("addresses", []):
        proxy.add("address", addr.pop("address"))
        country = addr.pop("country", None)
        if country not in countries:
            countries.add(country)
            proxy.add("country", country)

    for ident in data.pop("identifiers", []):
        scheme = ident.pop("schemeName")
        # Eager default: both "uri" and "id" are popped; "uri" is preferred.
        value = ident.pop("uri", ident.pop("id", None))
        if scheme not in SCHEME_PROPS:
            context.log.warn("Unknown scheme", scheme=repr(scheme), value=value)
            continue
        if value is None:
            context.log.warn("Weird identifier", identifier=ident)
            # There is no value to store for this identifier.
            continue
        prop = SCHEME_PROPS[scheme]
        # A ``None`` property marks a recognised scheme that is not stored.
        if prop is not None:
            proxy.add(prop, value)

    # Anything still left in ``data`` is a field this parser does not handle.
    if data:
        pprint({"type": statement_type, "data": data})
    context.emit(proxy)
def parse_file(context: Zavod, file_name: Path):
    """Parse every line of a gzip-compressed JSON-lines file.

    Each line is decoded with ``orjson`` and passed to ``parse_statement``.
    A progress message is logged after every 10,000 statements.
    """
    with gzip.open(file_name) as stream:
        for count, raw_line in enumerate(stream, start=1):
            statement = orjson.loads(raw_line)
            parse_statement(context, statement)
            if count % 10000 == 0:
                context.log.info("Statements: %d..." % count)
if __name__ == "__main__":
    # Script entry point: look up the statements file through the context
    # and parse it.
    with init_context("openownership", "oo") as context:
        statements_path = context.get_resource_path("statements.latest.jsonl.gz")
        parse_file(context, statements_path)