feat: also allow deeply nested single-document JSON files

UUDigitalHumanitieslab · Nov 21, 2024 · 2fde2a8 · 2fde2a8
1 parent f7ad58c
commit 2fde2a8
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 16 deletions.
diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py
@@ -493,18 +493,24 @@ def _apply(self, metadata, *nargs, **kwargs):
 
 
 class JSON(Extractor):
-    ''' An extractor to extract data from JSON
+    '''An extractor to extract data from JSON
     This extractor assumes that each source is a flat dictionary
-    
+
     Parameters:
-        key: the key with which to retrieve a field from the source
+        keys (Iterable[str]): the keys, ordered from outermost to innermost, with which to retrieve a (possibly nested) field value from the source
     '''
-    def __init__(self, key, *args, **kwargs):
-        self.key = key
-        super().__init__(*args, **kwargs)
 
-    def _apply(self, data, *args, **kwargs):
-        return data.get(self.key)
+    def __init__(self, *keys, **kwargs):
+        self.keys = list(keys)
+        super().__init__(**kwargs)
+
+    def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs) -> str:
+        key = self.keys[key_index]
+        data = data.get(key)
+        if len(self.keys) > key_index + 1:
+            key_index += 1
+            return self._apply(data, key_index)
+        return data
 
 
 class RDF(Extractor):

diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py
@@ -47,7 +47,8 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
                 'records'
             )
         else:
-            documents = list(json_data)
+            documents = [json_data]
+
         self._reject_extractors(extract.XML, extract.CSV, extract.RDF)
 
         for doc in documents:

diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py
@@ -1,4 +1,5 @@
 from glob import glob
+import json
 import os
 
 from ianalyzer_readers.extract import JSON
@@ -12,20 +13,52 @@ def merge_lines(lines: list | str) -> str:
     return lines
 
 
-class JSONTestReader(JSONReader):
+class JSONDocumentReader(JSONReader):
     """
-    Example JSON reader for testing, using JSON data from https://github.com/tux255/analyzing-shakespeare
+    Example reader that would operate on corpora with one json file per document
     """
 
     data_directory = os.path.join(os.path.dirname(__file__), "data")
-    record_path = ["SCENE", "SPEECH"]
-    meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]]
+
+    def sources(self, **kwargs):
+        for i in range(1):
+            data = json.dumps(
+                {
+                    "TITLE": "ACT I",
+                    "SCENE": {
+                        "TITLE": "SCENE I.  A desert place.",
+                        "STAGEDIR": [
+                            "Thunder and lightning. Enter three Witches",
+                            "Exeunt",
+                        ],
+                        "SPEECH": {
+                            "SPEAKER": "First Witch",
+                        },
+                    },
+                }
+            )
+            yield data.encode('utf-8')
+
+    act = Field("act", JSON("TITLE"))
+    character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER"))
+    scene = Field("scene", JSON("SCENE", "TITLE"))
+
+    fields = [act, character, scene]
+
+
+class JSONMultipleDocumentReader(JSONDocumentReader):
+    """
+    Example JSON reader for testing parsing arrays in JSON, using JSON data from https://github.com/tux255/analyzing-shakespeare
+    """
 
     def sources(self, **kwargs):
         for filename in glob(f"{self.data_directory}/*.json"):
             full_path = os.path.join(self.data_directory, filename)
             yield full_path
 
+    record_path = ["SCENE", "SPEECH"]
+    meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]]
+
     act = Field("act", JSON("TITLE"))
     scene = Field("scene", JSON("SPEECH.TITLE"))
     character = Field("character", JSON("SPEAKER"))

diff --git a/tests/test_json_reader.py b/tests/test_json_reader.py
@@ -1,4 +1,4 @@
-from tests.json.json_reader import JSONTestReader
+from tests.json.json_reader import JSONDocumentReader, JSONMultipleDocumentReader
 
 expected = [
     {
@@ -19,8 +19,18 @@
 ]
 
 
-def test_json_read_file():
-    reader = JSONTestReader()
+def test_json_parse_single_document():
+    reader = JSONDocumentReader()
+    docs = list(reader.documents())
+    assert len(docs) == 1
+    assert docs[0].get('act') == 'ACT I'
+    assert docs[0].get('character') == 'First Witch'
+    assert docs[0].get('scene') == 'SCENE I.  A desert place.'
+
+
+def test_json_parse_multiple_documents():
+    '''test that JSON reader can parse multiple documents from an array in a single file'''
+    reader = JSONMultipleDocumentReader()
     docs = list(reader.documents())
     assert len(docs) == len(expected)
     _assert_matches(expected[0], docs[0])