From e3854601de38ee339b1cbb7f99ba2a963072bc8c Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Fri, 25 Oct 2024 22:43:08 +0300 Subject: [PATCH 1/5] Convert camelCase to snake_case for JSON keys and CSV/TSV column headers - Added camel_to_snake function to convert camelCase keys and column headers to snake_case for consistency - Updated key and column name handling in convert_to_csv_or_tsv and convert_to_json functions --- src/scribe_data/cli/convert.py | 38 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 6d5f4d38..43a5d8b1 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -20,6 +20,7 @@ --> """ +import re import csv import json import shutil @@ -35,6 +36,15 @@ get_language_iso, ) + +# MARK: convert camelCase to snake_case + + +def camel_to_snake(name: str) -> str: + """Convert camelCase to snake_case.""" + return re.sub(r"(? Date: Fri, 25 Oct 2024 23:23:01 +0300 Subject: [PATCH 2/5] Convert column names to snake_case in create_table function - Updated the create_table function to convert column names from camelCase to snake_case before creating the SQLite table. - Included a new import for the camel_to_snake function to handle the conversion. --- src/scribe_data/load/data_to_sqlite.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 1be35b28..71b46cf2 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -30,6 +30,7 @@ from tqdm.auto import tqdm +from scribe_data.cli.convert import camel_to_snake from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, @@ -108,11 +109,14 @@ def create_table(data_type, cols): Parameters ---------- data_type : str - The name of the table to be created + The name of the table to be created. cols : list of strings - The names of columns for the new table + The names of columns for the new table. """ + # Convert column names to snake_case + cols = [camel_to_snake(col) for col in cols] + cursor.execute( f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, UNIQUE({cols[0]}))" ) From 7861bd59c42ddb4c8bd5d85c561bef27b6de4328 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Fri, 25 Oct 2024 23:51:15 +0300 Subject: [PATCH 3/5] Fix circular import by moving camel_to_snake function to utils.py - Added the camel_to_snake function to the utils.py file to avoid circular import issues. - Updated the import statements in data_to_sqlite.py and convert.py to reference the new location of the function. - Verified that all tests pass successfully after the changes. --- src/scribe_data/cli/convert.py | 10 +--------- src/scribe_data/load/data_to_sqlite.py | 2 +- src/scribe_data/utils.py | 9 +++++++++ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 43a5d8b1..39e052b2 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -20,7 +20,6 @@ --> """ -import re import csv import json import shutil @@ -34,17 +33,10 @@ DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, get_language_iso, + camel_to_snake, ) -# MARK: convert camelCase to snake_case - - -def camel_to_snake(name: str) -> str: - """Convert camelCase to snake_case.""" - return re.sub(r"(? """ +import re import ast import json from importlib import resources @@ -646,3 +647,11 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages): ) return sorted(current_languages, key=lambda x: x["name"]) + + +# MARK: convert camelCase to snake_case + + +def camel_to_snake(name: str) -> str: + """Convert camelCase to snake_case.""" + return re.sub(r"(? Date: Mon, 11 Nov 2024 17:50:01 +0200 Subject: [PATCH 4/5] Add identifier case option to convert command with camelCase default and updated the docs for this new command. --- docs/source/scribe_data/cli.rst | 1 + src/scribe_data/cli/convert.py | 96 ++++++++++++++++++++------ src/scribe_data/cli/get.py | 5 ++ src/scribe_data/cli/main.py | 18 +++++ src/scribe_data/load/data_to_sqlite.py | 8 ++- 5 files changed, 104 insertions(+), 24 deletions(-) diff --git a/docs/source/scribe_data/cli.rst b/docs/source/scribe_data/cli.rst index c99eaed2..1e0fe92d 100644 --- a/docs/source/scribe_data/cli.rst +++ b/docs/source/scribe_data/cli.rst @@ -145,6 +145,7 @@ Options: - ``-o, --overwrite``: Whether to overwrite existing files (default: False). - ``-a, --all ALL``: Get all languages and data types. - ``-i, --interactive``: Run in interactive mode. +- ``-ic, --identifier-case``: The case format for identifiers in the output data (default: camel). Example: diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 39e052b2..673b6f7e 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -47,6 +47,7 @@ def convert_to_json( input_file: str, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "camel", ) -> None: """ Convert a CSV/TSV file to JSON. @@ -71,6 +72,9 @@ def convert_to_json( overwrite : bool Whether to overwrite existing files. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- None @@ -117,12 +121,16 @@ def convert_to_json( if len(keys) == 1: # Handle Case: { key: None }. - data[camel_to_snake(first_row[keys[0]])] = None + data[first_row[keys[0]]] = None elif len(keys) == 2: # Handle Case: { key: value }. for row in rows: - key = camel_to_snake(row[keys[0]]) + key = ( + camel_to_snake(row[keys[0]]) + if identifier_case == "snake" + else row[keys[0]] + ) value = row[keys[1]] data[key] = value @@ -130,7 +138,10 @@ def convert_to_json( if all(col in first_row for col in ["emoji", "is_base", "rank"]): # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] }. for row in rows: - key = camel_to_snake(row.get(reader.fieldnames[0])) + if identifier_case == "snake": + key = camel_to_snake(row.get(reader.fieldnames[0])) + else: + key = row.get(reader.fieldnames[0]) emoji = row.get("emoji", "").strip() is_base = ( row.get("is_base", "false").strip().lower() == "true" @@ -147,8 +158,13 @@ def convert_to_json( else: # Handle Case: { key: { value1: ..., value2: ... } }. for row in rows: - data[camel_to_snake(row[keys[0]])] = { - camel_to_snake(k): row[k] for k in keys[1:] + data[row[keys[0]]] = { + ( + camel_to_snake(k) + if identifier_case == "snake" + else k + ): row[k] + for k in keys[1:] } except (IOError, csv.Error) as e: @@ -187,6 +203,7 @@ def convert_to_csv_or_tsv( input_file: str, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "camel", ) -> None: """ Convert a JSON File to CSV/TSV file. @@ -211,6 +228,9 @@ def convert_to_csv_or_tsv( overwrite : bool Whether to overwrite existing files. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- None @@ -273,10 +293,16 @@ def convert_to_csv_or_tsv( if isinstance(data[first_key], dict): # Handle case: { key: { value1: ..., value2: ... } }. columns = sorted(next(iter(data.values())).keys()) - writer.writerow( - [camel_to_snake(dtype[:-1])] - + [camel_to_snake(col) for col in columns] - ) + header = [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1] + ] + header += [ + camel_to_snake(col) if identifier_case == "snake" else col + for col in columns + ] + writer.writerow(header) for key, value in data.items(): row = [key] + [value.get(col, "") for col in columns] @@ -289,6 +315,8 @@ def convert_to_csv_or_tsv( columns = ["word", "emoji", "is_base", "rank"] writer.writerow( [camel_to_snake(col) for col in columns] + if identifier_case == "snake" + else columns ) for key, value in data.items(): @@ -301,10 +329,13 @@ def convert_to_csv_or_tsv( ] writer.writerow(row) else: - columns = [camel_to_snake(dtype[:-1])] + [ - camel_to_snake(col) - for col in data[first_key][0].keys() - ] + if identifier_case == "snake": + columns = [camel_to_snake(dtype[:-1])] + [ + camel_to_snake(col) + for col in data[first_key][0].keys() + ] + else: + writer.writerow(columns) writer.writerow(columns) for key, value in data.items(): @@ -316,20 +347,30 @@ def convert_to_csv_or_tsv( elif all(isinstance(item, str) for item in data[first_key]): # Handle case: { key: [value1, value2, ...] }. - writer.writerow( - [camel_to_snake(dtype[:-1])] - + [ - f"autosuggestion_{i+1}" - for i in range(len(data[first_key])) - ] - ) + header = [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1] + ] + header += [ + f"autosuggestion_{i+1}" + for i in range(len(data[first_key])) + ] + writer.writerow(header) for key, value in data.items(): row = [key] + value writer.writerow(row) else: # Handle case: { key: value }. - writer.writerow([camel_to_snake(dtype[:-1]), "value"]) + writer.writerow( + [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1], + "value", + ] + ) for key, value in data.items(): writer.writerow([key, value]) @@ -350,6 +391,7 @@ def convert_to_sqlite( input_file: str = None, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "snake", ) -> None: """ Converts a Scribe-Data output file to an SQLite file. @@ -374,6 +416,9 @@ def convert_to_sqlite( overwrite : bool Whether to overwrite existing files. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- A SQLite file saved in the given location. @@ -399,7 +444,7 @@ def convert_to_sqlite( if not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) - data_to_sqlite(languages, specific_tables) + data_to_sqlite(languages, specific_tables, identifier_case) source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite" source_path = input_file.parent / source_file @@ -426,6 +471,7 @@ def convert_wrapper( input_file: str, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "snake", ): """ Convert data to the specified output type: JSON, CSV/TSV, or SQLite. @@ -450,6 +496,9 @@ def convert_wrapper( overwrite : bool, optional Whether to overwrite existing output files. Defaults to False. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- None @@ -466,6 +515,7 @@ def convert_wrapper( input_file=input_file, output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) elif output_type in {"csv", "tsv"}: @@ -476,6 +526,7 @@ def convert_wrapper( input_file=input_file, output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) elif output_type == "sqlite": @@ -486,6 +537,7 @@ def convert_wrapper( input_file=input_file, output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) else: diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 3e4dd277..81a12754 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -45,6 +45,7 @@ def get_data( outputs_per_entry: int = None, all: bool = False, interactive: bool = False, + identifier_case: str = "camel", ) -> None: """ Function for controlling the data get process for the CLI. @@ -75,6 +76,9 @@ def get_data( interactive : bool (default: False) Whether it's running in interactive mode. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- The requested data saved locally given file type and location arguments. @@ -149,6 +153,7 @@ def get_data( input_file=str(json_input_path), output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) os.remove(json_input_path) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 83bd4d81..60ce23cc 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -149,6 +149,14 @@ def main() -> None: get_parser.add_argument( "-i", "--interactive", action="store_true", help="Run in interactive mode" ) + get_parser.add_argument( + "-ic", + "--identifier-case", + type=str, + choices=["camel", "snake"], + default="camel", + help="The case format for identifiers in the output data (default: camel).", + ) # MARK: Total @@ -237,6 +245,14 @@ def main() -> None: default=True, help="Whether to keep the original file to be converted (default: True).", ) + convert_parser.add_argument( + "-ic", + "--identifier-case", + type=str, + choices=["camel", "snake"], + default="camel", + help="The case format for identifiers in the output data (default: camel).", + ) # MARK: Setup CLI @@ -281,6 +297,7 @@ def main() -> None: outputs_per_entry=args.outputs_per_entry, overwrite=args.overwrite, all=args.all, + identifier_case=args.identifier_case, ) elif args.command in ["total", "t"]: @@ -296,6 +313,7 @@ def main() -> None: input_file=args.input_file, output_dir=args.output_dir, overwrite=args.overwrite, + identifier_case=args.identifier_case, ) else: diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 7e0e51f7..8a315b91 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -40,7 +40,9 @@ def data_to_sqlite( - languages: Optional[List[str]] = None, specific_tables: Optional[List[str]] = None + languages: Optional[List[str]] = None, + specific_tables: Optional[List[str]] = None, + identifier_case: str = "camel", ) -> None: PATH_TO_SCRIBE_DATA = Path(__file__).parent.parent @@ -115,7 +117,9 @@ def create_table(data_type, cols): The names of columns for the new table. """ # Convert column names to snake_case - cols = [camel_to_snake(col) for col in cols] + cols = [ + camel_to_snake(col) if identifier_case == "snake" else col for col in cols + ] cursor.execute( f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, UNIQUE({cols[0]}))" From ad11f2e03a45abc6a67cbdaab4081c0397f7a66c Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 12 Nov 2024 00:43:36 +0100 Subject: [PATCH 5/5] Minor fixes of comments and mark --- src/scribe_data/load/data_to_sqlite.py | 4 ++-- src/scribe_data/utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 8a315b91..99ee7ff0 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -33,9 +33,9 @@ from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, + camel_to_snake, get_language_iso, list_all_languages, - camel_to_snake, ) @@ -116,7 +116,7 @@ def create_table(data_type, cols): cols : list of strings The names of columns for the new table. """ - # Convert column names to snake_case + # Convert column names to snake_case if requested. cols = [ camel_to_snake(col) if identifier_case == "snake" else col for col in cols ] diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 5c6bd260..919a5868 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -21,9 +21,9 @@ --> """ -import re import ast import json +import re from importlib import resources from pathlib import Path from typing import Any, Optional @@ -652,7 +652,7 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages): return sorted(current_languages, key=lambda x: x["name"]) -# MARK: convert camelCase to snake_case +# MARK: Case Conversion def camel_to_snake(name: str) -> str: