[WIP] wikipedia integration #4

Open · wants to merge 1 commit into master
5 changes: 4 additions & 1 deletion mojp_dbs_pipelines/common/processors/base_processors.py
@@ -20,7 +20,10 @@ def main(cls):
         # can be used like this in datapackage processor files:
         # if __name__ == '__main__':
         #     Processor.main()
-        spew(*cls(*ingest()).spew())
+        parameters, datapackage, resources = ingest()
+        processor = cls(parameters, datapackage, resources)
+        datapackage, resources = processor.spew()
+        spew(datapackage, resources)
 
     def spew(self):
         self._datapackage, self._resources = self._process(self._datapackage, self._resources)
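For reference, the refactored main() preserves the old one-liner's contract while making each step explicit: ingest the inputs, construct the processor, spew the processed outputs. A minimal sketch of a datapackage processor module built on this entry point (the DummyProcessor name and the BaseProcessor import are assumptions for illustration; only the _process hook and the main() call are taken from the diff above):

# hypothetical module layout; the actual base class name in
# base_processors.py may differ
from mojp_dbs_pipelines.common.processors.base_processors import BaseProcessor


class DummyProcessor(BaseProcessor):

    def _process(self, datapackage, resources):
        # a real processor would transform the datapackage and resources here;
        # spew() calls this hook and returns its results (see the diff above)
        return datapackage, resources


if __name__ == '__main__':
    DummyProcessor.main()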
Empty file.
13 changes: 13 additions & 0 deletions mojp_dbs_pipelines/wikipedia/constants.py
@@ -0,0 +1,13 @@
# temporarily - we hard-code the page titles to download
WIKIPEDIA_PAGES_TO_DOWNLOAD = {
    "he": [
        "פייבל_פולקס",
        "דוד_בן-גוריון"
    ],
    "en": [
        "Merneptah_Stele",
        "Thebes,_Egypt"
    ]
}

WIKIPEDIA_PARSE_API_URL_TEMPLATE = "https://{language}.wikipedia.org/w/api.php?action=parse&page={page_title}&format=json"
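For a concrete sense of the constant above, formatting the template for one of the hard-coded English pages yields a standard MediaWiki parse API URL (illustration only):

from mojp_dbs_pipelines.wikipedia.constants import WIKIPEDIA_PARSE_API_URL_TEMPLATE

url = WIKIPEDIA_PARSE_API_URL_TEMPLATE.format(language="en", page_title="Merneptah_Stele")
# url == "https://en.wikipedia.org/w/api.php?action=parse&page=Merneptah_Stele&format=json"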
Empty file.
35 changes: 35 additions & 0 deletions mojp_dbs_pipelines/wikipedia/processors/convert.py
@@ -0,0 +1,35 @@
# from mojp_dbs_pipelines.common.processors.base_processors import FilterResourcesProcessor
# from mojp_dbs_pipelines.common.processors.sync import DBS_DOCS_TABLE_SCHEMA
# import json
# from mojp_dbs_pipelines.clearmash.constants import CLEARMASH_SOURCE_ID
# from copy import deepcopy
#
#
# class ClearmashConvertProcessor(FilterResourcesProcessor):
#
#     def _filter_resource_descriptor(self, descriptor):
#         if descriptor["name"] == "clearmash":
#             # replace clearmash documents resource with the common dbs docs resource
#             descriptor.update({"name": "dbs_docs",
#                                "path": "dbs_docs.csv",
#                                "schema": DBS_DOCS_TABLE_SCHEMA})
#         return descriptor
#
#     def _filter_row(self, row, resource_descriptor):
#         if resource_descriptor["name"] == "dbs_docs":
#             # filter rows of clearmash documents and convert them to dbs docs documents
#             clearmash_id = row["id"]
#             clearmash_doc = json.loads(row["source_doc"])
#             dbs_doc = deepcopy(clearmash_doc)
#             # add some mock data to know that document was modified
#             dbs_doc.update({"implemented": "not yet",
#                             "sorry": True})
#             row = {"source": CLEARMASH_SOURCE_ID,
#                    "id": str(clearmash_id),
#                    "source_doc": json.dumps(dbs_doc),
#                    "version": "5"}
#         return row
#
#
# if __name__ == '__main__':
#     ClearmashConvertProcessor.main()
37 changes: 37 additions & 0 deletions mojp_dbs_pipelines/wikipedia/processors/download.py
@@ -0,0 +1,37 @@
from mojp_dbs_pipelines.common.processors.base_processors import AddResourcesProcessor
import json
from mojp_dbs_pipelines.wikipedia.constants import WIKIPEDIA_PAGES_TO_DOWNLOAD, WIKIPEDIA_PARSE_API_URL_TEMPLATE
import requests


class WikipediaDownloadProcessor(AddResourcesProcessor):

    def _get_resource_descriptors(self):
        return [{"name": "wikipedia",
                 "path": "wikipedia.csv",
                 "schema": {"fields": [
                     {"name": "id", "type": "integer"},
                     {"name": "source_doc", "type": "string"}
                 ]}}]

    def _get_title_json(self, url):
        return requests.get(url).json()

    def _download_titles(self, lang, titles):
        for title in titles:
            yield from self._download_title(lang, title)

    def _download_title(self, lang, title):
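        # the MediaWiki parse API nests the page payload under a top-level "parse" key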
        page = self._get_title_json(WIKIPEDIA_PARSE_API_URL_TEMPLATE.format(language=lang, page_title=title))["parse"]
        yield {"id": page["pageid"], "source_doc": json.dumps(page)}

    def _download(self):
        for lang, titles in WIKIPEDIA_PAGES_TO_DOWNLOAD.items():
            yield from self._download_titles(lang, titles)

    def _get_resources_iterator(self):
        return [self._download()]


if __name__ == '__main__':
    WikipediaDownloadProcessor.main()
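For reference, each yielded row keeps the whole parse payload as a JSON string in source_doc, keyed by the numeric pageid. A hypothetical row (the pageid and content below are placeholders, not real values):

# hypothetical shape of a yielded row:
# {"id": 12345,
#  "source_doc": '{"title": "Merneptah Stele", "pageid": 12345, "text": {"*": "<div>...</div>"}, ...}'}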
19 changes: 19 additions & 0 deletions pipeline-spec.yaml
@@ -15,3 +15,22 @@ clearmash:
      run: dump.to_path
      parameters:
        out-path: data/clearmash

wikipedia:
  title: Download, convert, validate and sync data from Wikipedia to MoJP databases
  stream_reader_limit: 1048576
  pipeline:
    -
      run: add_metadata
      parameters:
        name: wikipedia
    -
      run: mojp_dbs_pipelines.wikipedia.processors.download
    # -
    #   run: mojp_dbs_pipelines.clearmash.processors.convert
    # -
    #   run: mojp_dbs_pipelines.common.processors.sync
    -
      run: dump.to_path
      parameters:
        out-path: data/wikipedia
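Assuming the standard datapackage-pipelines tooling used elsewhere in this repo, the new pipeline should be runnable locally with dpp run ./wikipedia, writing the dumped datapackage under data/wikipedia.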
88 changes: 88 additions & 0 deletions tests/test_wikipedia.py
@@ -0,0 +1,88 @@
from .common import assert_processor
from mojp_dbs_pipelines.wikipedia.processors.download import WikipediaDownloadProcessor
# from mojp_dbs_pipelines.wikipedia.processors.convert import WikipediaConvertProcessor
import json, os
from urllib.parse import urlparse, parse_qs


class MockWikipediaDownloadProcessor(WikipediaDownloadProcessor):

    def _get_title_json(self, url):
        scheme, netloc, path, params, query, fragment = urlparse(url)
        if path == "/w/api.php":
            lang = "he" if netloc == "he.wikipedia.org" else "en"
            qs = parse_qs(query)
            page = qs["page"][0]
            filename = "{}-{}.json".format(lang, page)
            full_filename = os.path.join(os.path.dirname(__file__), "wikipedia-mocks", filename)
            if os.environ.get("WIKIPEDIA_WRITE_MOCKS") == "1":
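                # record mode: fetch the real API response once and save it as a fixture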
                data = super(MockWikipediaDownloadProcessor, self)._get_title_json(url)
                with open(full_filename, "w") as f:
                    json.dump(data, f)
            else:
                with open(full_filename) as f:
                    data = json.load(f)
            return data
        else:
            raise Exception("invalid url: {}".format(url))

def test_download():
    assert_processor(
        MockWikipediaDownloadProcessor,
        parameters={},
        datapackage={"resources": []},
        resources=[],
        expected_datapackage={
            "resources": [{
                "name": "wikipedia",
                "path": "wikipedia.csv",
                "schema": {"fields": [
                    {"name": "id", "type": "integer"},
                    {"name": "source_doc", "type": "string"}
                ]}
            }]
        },
        expected_resources=[[
            {"id": 1, "source_doc": json.dumps({"title": "foobar", "content": "bazbax"})},
            {"id": 2, "source_doc": json.dumps({"title": "222", "content": "2222"})}
        ]]
    )
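Note on the mock mechanism above: running the tests with WIKIPEDIA_WRITE_MOCKS=1 set in the environment makes MockWikipediaDownloadProcessor hit the real Wikipedia API and rewrite the fixtures under tests/wikipedia-mocks/; otherwise the recorded JSON responses are replayed, so the test runs offline.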


# def test_convert_to_dbs_documents():
#     assert_processor(
#         ClearmashConvertProcessor,
#         parameters={},
#         datapackage={
#             "resources": [{
#                 "name": "clearmash",
#                 "path": "clearmash.csv",
#                 "schema": {"fields": [
#                     {"name": "id", "type": "integer"},
#                     {"name": "source_doc", "type": "string"}
#                 ]}
#             }]
#         },
#         resources=[[
#             {"id": 1, "source_doc": json.dumps({"title": "foobar", "content": "bazbax"})},
#             {"id": 2, "source_doc": json.dumps({"title": "222", "content": "2222"})}
#         ]],
#         expected_datapackage={
#             "resources": [{
#                 "name": "dbs_docs",
#                 "path": "dbs_docs.csv",
#                 "schema": {"fields": [
#                     {"name": "source", "type": "string"},
#                     {"name": "id", "type": "string"},
#                     {"name": "version", "type": "string", "description": "source dependent field, used by the sync process to detect document updates"},
#                     {"name": "source_doc", "type": "string"}
#                 ]}
#             }]
#         },
#         expected_resources=[[
#             {"source": "clearmash", "id": "1", "version": "5",
#              "source_doc": '{"title": "foobar", "content": "bazbax", "implemented": "not yet", "sorry": true}'},
#             {"source": "clearmash", "id": "2", "version": "5",
#              "source_doc": '{"title": "222", "content": "2222", "implemented": "not yet", "sorry": true}'}
#         ]]
#     )
1 change: 1 addition & 0 deletions tests/wikipedia-mocks/en-Merneptah_Stele.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/wikipedia-mocks/en-Thebes,_Egypt.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/wikipedia-mocks/he-דוד_בן-גוריון.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/wikipedia-mocks/he-פייבל_פולקס.json

Large diffs are not rendered by default.