Skip to content

Commit

Permalink
feat: also allow deep nested single document JSON files
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Nov 21, 2024
1 parent f7ad58c commit 2fde2a8
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 16 deletions.
22 changes: 14 additions & 8 deletions ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,18 +493,24 @@ def _apply(self, metadata, *nargs, **kwargs):


class JSON(Extractor):
''' An extractor to extract data from JSON
'''An extractor to extract data from JSON
This extractor assumes that each source is a flat dictionary
Parameters:
key: the key with which to retrieve a field from the source
keys (Iterable[str]): the keys with which to retrieve a field value from the source
'''
def __init__(self, key, *args, **kwargs):
self.key = key
super().__init__(*args, **kwargs)

def _apply(self, data, *args, **kwargs):
return data.get(self.key)
def __init__(self, *keys, **kwargs):
    '''Remember the key path and hand the remaining options to the base class.

    Parameters:
        *keys: the keys to follow, outermost first, to reach the field value
        **kwargs: keyword options passed on unchanged to the parent initialiser
    '''
    # keep the path as a list so it can be indexed during extraction
    self.keys = [*keys]
    super().__init__(**kwargs)

def _apply(
    self, data: Union[str, dict], key_index: int = 0, **kwargs
) -> Union[str, int, float, bool, list, dict, None]:
    '''Follow `self.keys` into `data`, descending one nesting level per key.

    Parameters:
        data: the parsed JSON document (or a nested part of it) to read from
        key_index: position in `self.keys` at which to start descending;
            the default of 0 walks the complete key path
        **kwargs: accepted for compatibility with the caller; not used here

    Returns:
        the value found at the end of the key path, or None when a key
        along the path is absent or holds a JSON null
    '''
    for key in self.keys[key_index:]:
        if data is None:
            # A key higher up the path was missing or null, so there is
            # nothing left to descend into. Report "no value", as a lookup
            # of a missing key in a flat document does, rather than
            # failing with AttributeError on `None.get`.
            return None
        data = data.get(key)
    return data


class RDF(Extractor):
Expand Down
3 changes: 2 additions & 1 deletion ianalyzer_readers/readers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
'records'
)
else:
documents = list(json_data)
documents = [json_data]

self._reject_extractors(extract.XML, extract.CSV, extract.RDF)

for doc in documents:
Expand Down
41 changes: 37 additions & 4 deletions tests/json/json_reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from glob import glob
import json
import os

from ianalyzer_readers.extract import JSON
Expand All @@ -12,20 +13,52 @@ def merge_lines(lines: list | str) -> str:
return lines


class JSONTestReader(JSONReader):
class JSONDocumentReader(JSONReader):
"""
Example JSON reader for testing, using JSON data from https://github.com/tux255/analyzing-shakespeare
Example reader that would operate on corpora with one json file per document
"""

data_directory = os.path.join(os.path.dirname(__file__), "data")
record_path = ["SCENE", "SPEECH"]
meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]]

def sources(self, **kwargs):
    """Yield a single in-memory source: one JSON document as UTF-8 bytes.

    The document is a nested dictionary, so the fields of this reader
    have to follow several keys to reach their values.
    """
    document = {
        "TITLE": "ACT I",
        "SCENE": {
            "TITLE": "SCENE I. A desert place.",
            "STAGEDIR": [
                "Thunder and lightning. Enter three Witches",
                "Exeunt",
            ],
            "SPEECH": {
                "SPEAKER": "First Witch",
            },
        },
    }
    # exactly one source is produced, so no loop is needed
    yield json.dumps(document).encode('utf-8')

act = Field("act", JSON("TITLE"))
character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER"))
scene = Field("scene", JSON("SCENE", "TITLE"))

fields = [act, character, scene]


class JSONMultipleDocumentReader(JSONDocumentReader):
"""
Example JSON reader for testing parsing arrays in JSON, using JSON data from https://github.com/tux255/analyzing-shakespeare
"""

def sources(self, **kwargs):
    """Yield the path of each file matching `*.json` in `self.data_directory`."""
    # glob returns every match already prefixed with the directory from the
    # pattern; joining the directory on again would duplicate it whenever
    # `data_directory` is a relative path.
    yield from glob(f"{self.data_directory}/*.json")

record_path = ["SCENE", "SPEECH"]
meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]]

act = Field("act", JSON("TITLE"))
scene = Field("scene", JSON("SPEECH.TITLE"))
character = Field("character", JSON("SPEAKER"))
Expand Down
16 changes: 13 additions & 3 deletions tests/test_json_reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from tests.json.json_reader import JSONTestReader
from tests.json.json_reader import JSONDocumentReader, JSONMultipleDocumentReader

expected = [
{
Expand All @@ -19,8 +19,18 @@
]


def test_json_read_file():
reader = JSONTestReader()
def test_json_parse_single_document():
    '''test that JSON reader can parse a source holding one nested document'''
    documents = list(JSONDocumentReader().documents())
    assert len(documents) == 1

    document = documents[0]
    expected_values = {
        'act': 'ACT I',
        'character': 'First Witch',
        'scene': 'SCENE I. A desert place.',
    }
    # checked in the same order as the fields are listed above
    for field_name, value in expected_values.items():
        assert document.get(field_name) == value


def test_json_parse_multiple_documents():
'''test that JSON reader can parse multiple documents from an array in a single file'''
reader = JSONMultipleDocumentReader()
docs = list(reader.documents())
assert len(docs) == len(expected)
_assert_matches(expected[0], docs[0])
Expand Down

0 comments on commit 2fde2a8

Please sign in to comment.