Skip to content

Commit

Permalink
Standardize formatting and remove original downloaded data files
Browse files: browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Feb 23, 2024
1 parent c00966f commit 4840ff3
Show file tree
Hide file tree
Showing 58 changed files with 338 additions and 2,948,002 deletions.
14,496 changes: 0 additions & 14,496 deletions src/scribe_data/extract_transform/languages/Basque/nouns/nouns_queried.json

This file was deleted.

7,426 changes: 0 additions & 7,426 deletions src/scribe_data/extract_transform/languages/Bengali/nouns/nouns_queried.json

This file was deleted.

66,658 changes: 0 additions & 66,658 deletions src/scribe_data/extract_transform/languages/Bokmål/nouns/nouns_queried.json

This file was deleted.

10,759 changes: 0 additions & 10,759 deletions src/scribe_data/extract_transform/languages/Czech/nouns/nouns_queried.json

This file was deleted.

76,022 changes: 0 additions & 76,022 deletions src/scribe_data/extract_transform/languages/Danish/nouns/nouns_queried.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import sys

LANGUAGE = "English"
QUERIED_DATA_TYPE = "nouns"
QUERIED_DATA_FILE = f"{QUERIED_DATA_TYPE}_queried.json"
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
LANGUAGES_DIR_PATH = (
f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages"
Expand All @@ -19,16 +21,16 @@
file_path = sys.argv[0]

update_data_in_use = False # check if update_data.py is being used
if f"languages/{LANGUAGE}/nouns/" not in file_path:
with open("nouns_queried.json", encoding="utf-8") as f:
nouns_list = json.load(f)
if f"languages/{LANGUAGE}/{QUERIED_DATA_TYPE}/" not in file_path:
data_path = QUERIED_DATA_FILE
else:
update_data_in_use = True
with open(
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/nouns/nouns_queried.json",
encoding="utf-8",
) as f:
nouns_list = json.load(f)
data_path = (
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/{QUERIED_DATA_TYPE}/{QUERIED_DATA_FILE}"
)

with open(data_path, encoding="utf-8") as f:
nouns_list = json.load(f)

nouns_formatted = {}

Expand Down Expand Up @@ -92,13 +94,11 @@

nouns_formatted = collections.OrderedDict(sorted(nouns_formatted.items()))

export_dir = "../formatted_data/"
export_path = os.path.join(export_dir, "nouns.json")
export_path = f"../formatted_data/{QUERIED_DATA_TYPE}.json"
if update_data_in_use:
export_path = f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/nouns.json"

if not os.path.exists(export_dir):
os.makedirs(export_dir)
export_path = (
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/{QUERIED_DATA_TYPE}.json"
)

with open(
export_path,
Expand All @@ -107,4 +107,8 @@
) as file:
json.dump(nouns_formatted, file, ensure_ascii=False, indent=0)

print(f"Wrote file nouns.json with {len(nouns_formatted):,} nouns.")
print(
f"Wrote file {QUERIED_DATA_TYPE}.json with {len(nouns_formatted):,} {QUERIED_DATA_TYPE}."
)

os.remove(data_path)
31,049 changes: 0 additions & 31,049 deletions src/scribe_data/extract_transform/languages/English/nouns/nouns_queried.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import sys

LANGUAGE = "English"
QUERIED_DATA_TYPE = "verbs"
QUERIED_DATA_FILE = f"{QUERIED_DATA_TYPE}_queried.json"
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
LANGUAGES_DIR_PATH = (
f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages"
Expand All @@ -19,16 +21,16 @@
file_path = sys.argv[0]

update_data_in_use = False # check if update_data.py is being used
if f"languages/{LANGUAGE}/verbs/" not in file_path:
with open("verbs_queried.json", encoding="utf-8") as f:
verbs_list = json.load(f)
if f"languages/{LANGUAGE}/{QUERIED_DATA_TYPE}/" not in file_path:
data_path = QUERIED_DATA_FILE
else:
update_data_in_use = True
with open(
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/verbs/verbs_queried.json",
encoding="utf-8",
) as f:
verbs_list = json.load(f)
data_path = (
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/{QUERIED_DATA_TYPE}/{QUERIED_DATA_FILE}"
)

with open(data_path, encoding="utf-8") as f:
verbs_list = json.load(f)

verbs_formatted = {}

Expand Down Expand Up @@ -79,13 +81,11 @@

verbs_formatted = collections.OrderedDict(sorted(verbs_formatted.items()))

export_dir = "../formatted_data/"
export_path = os.path.join(export_dir, "verbs.json")
export_path = f"../formatted_data/{QUERIED_DATA_TYPE}.json"
if update_data_in_use:
export_path = f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/verbs.json"

if not os.path.exists(export_dir):
os.makedirs(export_dir)
export_path = (
f"{LANGUAGES_DIR_PATH}/{LANGUAGE}/formatted_data/{QUERIED_DATA_TYPE}.json"
)

with open(
export_path,
Expand All @@ -94,4 +94,8 @@
) as file:
json.dump(verbs_formatted, file, ensure_ascii=False, indent=0)

print(f"Wrote file verbs.json with {len(verbs_formatted):,} verbs.")
print(
f"Wrote file {QUERIED_DATA_TYPE}.json with {len(verbs_formatted):,} {QUERIED_DATA_TYPE}."
)

os.remove(data_path)
Loading

0 comments on commit 4840ff3

Please sign in to comment.