Standardize formatting and remove original downloaded data files

scribe-org committed on Feb 23, 2024 · 4840ff3
1 parent c00966f
commit 4840ff3
Show file tree

Hide file tree

Showing 58 changed files with 338 additions and 2,948,002 deletions.
diff --git a/src/scribe_data/extract_transform/languages/Basque/nouns/nouns_queried.json b/src/scribe_data/extract_transform/languages/Basque/nouns/nouns_queried.json
diff --git a/src/scribe_data/extract_transform/languages/Bengali/nouns/nouns_queried.json b/src/scribe_data/extract_transform/languages/Bengali/nouns/nouns_queried.json
diff --git a/src/scribe_data/extract_transform/languages/Bokmål/nouns/nouns_queried.json b/src/scribe_data/extract_transform/languages/Bokmål/nouns/nouns_queried.json
diff --git a/src/scribe_data/extract_transform/languages/Czech/nouns/nouns_queried.json b/src/scribe_data/extract_transform/languages/Czech/nouns/nouns_queried.json
diff --git a/src/scribe_data/extract_transform/languages/Danish/nouns/nouns_queried.json b/src/scribe_data/extract_transform/languages/Danish/nouns/nouns_queried.json
diff --git a/src/scribe_data/extract_transform/languages/English/nouns/format_nouns.py b/src/scribe_data/extract_transform/languages/English/nouns/format_nouns.py
@@ -11,6 +11,8 @@
 import sys
 
 LANGUAGE = "English"
+QUERIED_DATA_TYPE = "nouns"
+QUERIED_DATA_FILE = f"{QUERIED_DATA_TYPE}_queried.json"
 PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
 LANGUAGES_DIR_PATH = (
     f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages"
@@ -19,16 +21,16 @@
 file_path = sys.argv[0]
 
 update_data_in_use = False  # check if update_data.py is being used
-if f"languages/{LANGUAGE}/nouns/" not in file_path:
-    with open("nouns_queried.json", encoding="utf-8") as f:
-        nouns_list = json.load(f)
+if f"languages/{LANGUAGE}/{QUERIED_DATA_TYPE}/" not in file_path:
+    data_path = QUERIED_DATA_FILE
 else:
     update_data_in_use = True
-    with open(
-        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/nouns/nouns_queried.json",
-        encoding="utf-8",
-    ) as f:
-        nouns_list = json.load(f)
+    data_path = (
+        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/{QUERIED_DATA_TYPE}/{QUERIED_DATA_FILE}"
+    )
+
+with open(data_path, encoding="utf-8") as f:
+    nouns_list = json.load(f)
 
 nouns_formatted = {}
 
@@ -92,13 +94,11 @@
 
 nouns_formatted = collections.OrderedDict(sorted(nouns_formatted.items()))
 
-export_dir = "../formatted_data/"
-export_path = os.path.join(export_dir, "nouns.json")
+export_path = f"../formatted_data/{QUERIED_DATA_TYPE}.json"
 if update_data_in_use:
-    export_path = f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/nouns.json"
-
-if not os.path.exists(export_dir):
-    os.makedirs(export_dir)
+    export_path = (
+        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/{QUERIED_DATA_TYPE}.json"
+    )
 
 with open(
     export_path,
@@ -107,4 +107,8 @@
 ) as file:
     json.dump(nouns_formatted, file, ensure_ascii=False, indent=0)
 
-print(f"Wrote file nouns.json with {len(nouns_formatted):,} nouns.")
+print(
+    f"Wrote file {QUERIED_DATA_TYPE}.json with {len(nouns_formatted):,} {QUERIED_DATA_TYPE}."
+)
+
+os.remove(data_path)
diff --git a/src/scribe_data/extract_transform/languages/English/nouns/nouns_queried.json b/src/scribe_data/extract_transform/languages/English/nouns/nouns_queried.json
diff --git a/src/scribe_data/extract_transform/languages/English/verbs/format_verbs.py b/src/scribe_data/extract_transform/languages/English/verbs/format_verbs.py
@@ -11,6 +11,8 @@
 import sys
 
 LANGUAGE = "English"
+QUERIED_DATA_TYPE = "verbs"
+QUERIED_DATA_FILE = f"{QUERIED_DATA_TYPE}_queried.json"
 PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
 LANGUAGES_DIR_PATH = (
     f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages"
@@ -19,16 +21,16 @@
 file_path = sys.argv[0]
 
 update_data_in_use = False  # check if update_data.py is being used
-if f"languages/{LANGUAGE}/verbs/" not in file_path:
-    with open("verbs_queried.json", encoding="utf-8") as f:
-        verbs_list = json.load(f)
+if f"languages/{LANGUAGE}/{QUERIED_DATA_TYPE}/" not in file_path:
+    data_path = QUERIED_DATA_FILE
 else:
     update_data_in_use = True
-    with open(
-        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/verbs/verbs_queried.json",
-        encoding="utf-8",
-    ) as f:
-        verbs_list = json.load(f)
+    data_path = (
+        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/{QUERIED_DATA_TYPE}/{QUERIED_DATA_FILE}"
+    )
+
+with open(data_path, encoding="utf-8") as f:
+    verbs_list = json.load(f)
 
 verbs_formatted = {}
 
@@ -79,13 +81,11 @@
 
 verbs_formatted = collections.OrderedDict(sorted(verbs_formatted.items()))
 
-export_dir = "../formatted_data/"
-export_path = os.path.join(export_dir, "verbs.json")
+export_path = f"../formatted_data/{QUERIED_DATA_TYPE}.json"
 if update_data_in_use:
-    export_path = f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/verbs.json"
-
-if not os.path.exists(export_dir):
-    os.makedirs(export_dir)
+    export_path = (
+        f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/{QUERIED_DATA_TYPE}.json"
+    )
 
 with open(
     export_path,
@@ -94,4 +94,8 @@
 ) as file:
     json.dump(verbs_formatted, file, ensure_ascii=False, indent=0)
 
-print(f"Wrote file verbs.json with {len(verbs_formatted):,} verbs.")
+print(
+    f"Wrote file {QUERIED_DATA_TYPE}.json with {len(verbs_formatted):,} {QUERIED_DATA_TYPE}."
+)
+
+os.remove(data_path)