
Ruff #175

Merged: 7 commits, Feb 7, 2024

1 change: 0 additions & 1 deletion .github/pull_request_template.md
@@ -2,5 +2,4 @@
### Checklist
- [ ] Consider if documentation (like in `docs/`) needs to be updated
- [ ] Consider if tests should be added
- [ ] Run pylint if you're making changes beyond adding studies
- [ ] Update template repo if there are changes to study configuration
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
@@ -47,10 +47,10 @@ jobs:
- name: Run sqlfluff on jinja templates
run: |
sqlfluff lint
- name: Run black
- name: Run ruff
if: success() || failure() # still run black if above checks fails
run: |
black --check --verbose .
ruff
mikix marked this conversation as resolved.
regression:
runs-on: ubuntu-22.04
permissions:
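For context on the bare `ruff` invocation in this step: at the v0.2.x pin used by this PR it ran the linter directly, but later ruff releases deprecate the bare form in favor of explicit subcommands. A hypothetical updated step (not part of this PR) might look like:

```yaml
# Hypothetical ci.yaml step: later ruff releases split linting from
# format checking into explicit subcommands.
- name: Run ruff
  if: success() || failure()  # still run ruff if above checks fail
  run: |
    ruff check .           # lint (what bare `ruff` did at v0.2.x)
    ruff format --check .  # verify formatting without rewriting files
```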
15 changes: 5 additions & 10 deletions .pre-commit-config.yaml
@@ -1,15 +1,10 @@
repos:
- repo: https://github.com/psf/black
#this version is synced with the black mentioned in .github/workflows/ci.yml
rev: 23.10.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
hooks:
- id: black
entry: bash -c 'black "$@"; git add -u' --
# It is recommended to specify the latest version of Python
# supported by your project here, or alternatively use
# pre-commit's default_language_version, see
# https://pre-commit.com/#top_level-default_language_version
language_version: python3.9
- id: ruff
args: [--fix]
Contributor:

Ah, here's the philosophical rub. We've talked about how appropriate it is to automatically fix bad code for the developer in a way that doesn't bring them into the loop so they can learn to do better.

This does bring them into the loop a little bit, by not auto-committing the results at least. So I'd like to keep it that way, if/when we make the auto-formatting auto-committed.

I'd personally prefer to keep such checks out of the commit hooks altogether because of my WIP-commit-heavy workflow. I assume your workflow is different.

Anyway, we don't need to agree on this kind of thing. Just raising the point again.

Contributor Author:

OK, per this and our chat off-PR about this last night, I did some digging and found that pre-commit supports hooks at basically every lifecycle stage, so I've moved this check to pre-push. It doesn't look like that hook type is installed by default, so you have to specify the stage with `pre-commit install --hook-type pre-push`, but it then does the linting only on push to the remote.
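The arrangement described here can be sketched as follows (a hypothetical config fragment adapted from the ruff hooks in this PR, not the PR's actual file; note that older pre-commit versions spell the stage `push` rather than `pre-push`):

```yaml
# Hypothetical .pre-commit-config.yaml fragment: run the ruff hooks only
# at the pre-push stage, so WIP commits are not linted.
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    hooks:
      - id: ruff
        args: [--fix]
        stages: [pre-push]  # spelled `push` on older pre-commit releases
      - id: ruff-format
        stages: [pre-push]
```

The plain `pre-commit install` only installs the commit-stage hook script; the push-stage script needs `pre-commit install --hook-type pre-push`.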

Contributor:

Interesting... OK: we have the pre-commit instructions in our CONTRIBUTING.md docs usually (I think that's what we named them); maybe those need updating.

It's annoying that pre-commit isn't more turnkey, but fine 😄

Contributor Author:

Ah yep, meant to do that and spaced out; bumped the docs.

- id: ruff-format
Contributor:

On the other hand, I find it awkward to push formatting in front of the dev; they didn't really do anything wrong. So this I'd like to see return to auto-commit status at some point, once we trust it (which, off-PR, we talked about as the reason for this change).


- repo: https://github.com/sqlfluff/sqlfluff
rev: 2.3.4
13 changes: 7 additions & 6 deletions cumulus_library/base_table_builder.py
@@ -3,12 +3,11 @@
import pathlib
import re
import sys

from abc import ABC, abstractmethod
from typing import final

from cumulus_library.databases import DatabaseCursor
from cumulus_library import base_utils
from cumulus_library.databases import DatabaseCursor


class BaseTableBuilder(ABC):
@@ -70,8 +69,8 @@ def execute_queries(
)

table_name = table_name[0]
# if it contains a schema, remove it (usually it won't, but some CTAS
# forms may)
# if it contains a schema, remove it (usually it won't, but some
# CTAS forms may)
if "." in table_name:
table_name = table_name.split(".")[1].replace('"', "")
table_names.append(table_name)
@@ -94,7 +93,7 @@ def execute_queries(

self.post_execution(cursor, schema, verbose, drop_table, *args, **kwargs)

def post_execution(
def post_execution( # noqa: B027 - this looks like, but is not, an abstract method
self,
cursor: DatabaseCursor,
schema: str,
Expand Down Expand Up @@ -122,7 +121,9 @@ def comment_queries(self, doc_str=None):
commented_queries.pop()
self.queries = commented_queries

def write_queries(self, path: pathlib.Path = pathlib.Path.cwd() / "output.sql"):
def write_queries(self, path: pathlib.Path | None = None):
if path is None:
path = pathlib.Path.cwd() / "output.sql"
"""writes all queries constructed by prepare_queries to disk"""
dogversioning marked this conversation as resolved.
path.parents[0].mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as file:
14 changes: 6 additions & 8 deletions cumulus_library/base_utils.py
@@ -1,11 +1,9 @@
""" Collection of small commonly used utility functions """

import datetime
import os
import json

import os
from contextlib import contextmanager
from typing import List

from rich import progress

@@ -15,16 +13,16 @@ def filepath(filename: str) -> str:


def load_text(path: str) -> str:
with open(path, "r", encoding="UTF-8") as fp:
with open(path, encoding="UTF-8") as fp:
return fp.read()


def load_json(path: str) -> dict:
with open(path, "r", encoding="UTF-8") as fp:
with open(path, encoding="UTF-8") as fp:
return json.load(fp)


def parse_sql(sql_text: str) -> List[str]:
def parse_sql(sql_text: str) -> list[str]:
commands = []

for statement in sql_text.split(";"):
@@ -36,11 +34,11 @@ def parse_sql(sql_text: str) -> List[str]:
return filter_strip(commands)


def filter_strip(commands) -> List[str]:
def filter_strip(commands) -> list[str]:
return list(filter(None, [c.strip() for c in commands]))


def list_coding(code_display: dict, system=None) -> List[dict]:
def list_coding(code_display: dict, system=None) -> list[dict]:
as_list = []
for code, display in code_display.items():
if system:
25 changes: 11 additions & 14 deletions cumulus_library/cli.py
@@ -7,18 +7,15 @@
import sys
import sysconfig


from typing import Dict, List, Optional

import rich

from cumulus_library import (
__version__,
base_utils,
cli_parser,
databases,
enums,
errors,
base_utils,
protected_table_builder,
study_parser,
upload,
@@ -60,8 +57,8 @@ def update_transactions(self, prefix: str, status: str):

def clean_study(
self,
targets: List[str],
study_dict: Dict,
targets: list[str],
study_dict: dict,
*,
stats_clean: bool,
prefix: bool = False,
@@ -106,7 +103,7 @@ def clean_and_build_study(
target: pathlib.Path,
*,
stats_build: bool,
continue_from: str = None,
continue_from: str | None = None,
) -> None:
"""Recreates study views/tables

@@ -176,7 +173,7 @@ def run_single_table_builder(
parser=self.db.parser(),
)

def clean_and_build_all(self, study_dict: Dict, stats_build: bool) -> None:
def clean_and_build_all(self, study_dict: dict, stats_build: bool) -> None:
"""Builds views for all studies.

NOTE: By design, this method will always exclude the `template` study dir,
Expand Down Expand Up @@ -206,7 +203,7 @@ def export_study(self, target: pathlib.Path, data_path: pathlib.Path) -> None:
studyparser = study_parser.StudyManifestParser(target, data_path)
studyparser.export_study(self.db, data_path)

def export_all(self, study_dict: Dict, data_path: pathlib.Path):
def export_all(self, study_dict: dict, data_path: pathlib.Path):
"""Exports all defined count tables to disk"""
for key in study_dict.keys():
self.export_study(study_dict[key], data_path)
@@ -254,7 +251,7 @@ def create_template(path: str) -> None:
dest_path.write_bytes(source_path.read_bytes())


def get_study_dict(alt_dir_paths: List) -> Optional[Dict[str, pathlib.Path]]:
def get_study_dict(alt_dir_paths: list) -> dict[str, pathlib.Path] | None:
"""Gets valid study targets from ./studies/, and any pip installed studies

:returns: A list of Path objects
@@ -264,11 +261,12 @@ def get_study_dict(alt_dir_paths: List) -> Optional[Dict[str, pathlib.Path]]:

# first, we'll get any installed public studies
with open(
pathlib.Path(cli_path, "./module_allowlist.json"), "r", encoding="utf-8"
pathlib.Path(cli_path, "./module_allowlist.json"), encoding="utf-8"
) as study_allowlist_json:
study_allowlist = json.load(study_allowlist_json)["allowlist"]
site_packages_dir = sysconfig.get_path("purelib")
site_packages_dir = "".join(sysconfig.get_path("purelib"))
dogversioning marked this conversation as resolved.
for study, subdir in study_allowlist.items():
print(site_packages_dir)
dogversioning marked this conversation as resolved.
study_path = pathlib.Path(site_packages_dir, subdir)
if study_path.exists():
manifest_studies[study] = study_path
@@ -295,7 +293,7 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict:
return manifest_paths


def run_cli(args: Dict):
def run_cli(args: dict):
"""Controls which library tasks are run based on CLI arguments"""
if args["action"] == "create":
create_template(args["create_dir"])
@@ -312,7 +310,6 @@ def run_cli(args: Dict):
runner.verbose = True
print("Testing connection to database...")
runner.cursor.execute("SHOW DATABASES")

study_dict = get_study_dict(args["study_dir"])
if "prefix" not in args.keys():
if args["target"]:
18 changes: 11 additions & 7 deletions cumulus_library/cli_parser.py
@@ -29,10 +29,10 @@ def add_study_dir_argument(parser: argparse.ArgumentParser) -> None:
action="append",
help=(
"Optionally add one or more directories to look for study definitions in. "
"Default is in project directory and CUMULUS_LIBRARY_STUDY_DIR, if present, "
"followed by any supplied paths. Target, and all its subdirectories, "
"are checked for manifests. Overriding studies with the same namespace "
"supersede earlier ones."
"Default is in project directory and CUMULUS_LIBRARY_STUDY_DIR, "
"if present, followed by any supplied paths. Target, and all its "
"subdirectories, are checked for manifests. Overriding studies with the"
" same namespace supersede earlier ones."
),
)

@@ -87,10 +87,14 @@ def add_db_config(parser: argparse.ArgumentParser) -> None:
)
group.add_argument(
"--database",
# In Athena, we use this as the schema_name (which is also called a Database in their UX).
# In Athena, we use this as the schema_name (which is also called a Database
# in their UX).
#
# In DuckDB, we use this as the path to the filename to store tables.
# Since we started as an Athena-centric codebase, we mostly keep referring to this as
# name "schema_name". But to the user, both uses are still conceptually a "database".
#
# Since we started as an Athena-centric codebase, we mostly keep referring to
# this as name "schema_name". But to the user, both uses are still conceptually
# a "database".
dest="schema_name",
help="Database name (for Athena) or file (for DuckDB)",
)
41 changes: 24 additions & 17 deletions cumulus_library/databases.py
@@ -14,7 +14,7 @@
import os
import sys
from pathlib import Path
from typing import Optional, Protocol, Union
from typing import Protocol

import cumulus_fhir_support
import duckdb
@@ -31,13 +31,13 @@ class DatabaseCursor(Protocol):
def execute(self, sql: str) -> None:
pass

def fetchone(self) -> Optional[list]:
def fetchone(self) -> list | None:
pass

def fetchmany(self, size: Optional[int]) -> Optional[list[list]]:
def fetchmany(self, size: int | None) -> list[list] | None:
pass

def fetchall(self) -> Optional[list[list]]:
def fetchall(self) -> list[list] | None:
pass


@@ -131,6 +131,7 @@ def execute_as_pandas(self, sql: str) -> pandas.DataFrame:
def parser(self) -> DatabaseParser:
"""Returns parser object for interrogating DB schemas"""

@abc.abstractmethod
def close(self) -> None:
"""Clean up any resources necessary"""

@@ -174,6 +175,9 @@ def execute_as_pandas(self, sql: str) -> pandas.DataFrame:
def parser(self) -> DatabaseParser:
return AthenaParser()

def close(self) -> None:
return self.connection.close()


class AthenaParser(DatabaseParser):
def validate_table_schema(
@@ -190,7 +194,7 @@ def __init__(self, db_file: str):
super().__init__("main")
self.connection = duckdb.connect(db_file)
# Aliasing Athena's as_pandas to duckDB's df cast
setattr(duckdb.DuckDBPyConnection, "as_pandas", duckdb.DuckDBPyConnection.df)
duckdb.DuckDBPyConnection.as_pandas = duckdb.DuckDBPyConnection.df

# Paper over some syntax differences between Athena and DuckDB
self.connection.create_function(
@@ -238,22 +242,24 @@ def __init__(self, db_file: str):
)

def insert_tables(self, tables: dict[str, pyarrow.Table]) -> None:
"""Ingests all ndjson data from a folder tree (often the output folder of Cumulus ETL)"""
"""Ingests all ndjson data from a folder tree.

This is often the output folder of Cumulus ETL"""
for name, table in tables.items():
self.connection.register(name, table)

@staticmethod
def _compat_array_join(
value: Optional[list[Optional[str]]], delimiter: str
) -> Optional[str]:
value: list[str | None] | None, delimiter: str
) -> str | None:
Comment on lines -247 to +254
Contributor:

It can be None alright?! Geeze...

Contributor Author:

This was my least favorite addition. We could disable it?

Contributor:

That wouldn't help here exactly, because these were explicit Optionals, not implicit.

But I can see the logic for disallowing implicit optional: if you change the default value to not be None anymore, it's a little surprising for that to also change the type of the arg itself. But that's the kind of argument for a library with a strict API, not just devs making code.

I'm fine either way. Disabling individual rules is a little annoying, config wise, but maybe it's a win overall.

if value is None:
return None
return delimiter.join(v for v in value if v is not None)

@staticmethod
def _compat_date(
value: Union[str, datetime.datetime, datetime.date, None]
) -> Optional[datetime.date]:
value: str | datetime.datetime | datetime.date | None,
) -> datetime.date | None:
if value is None:
return None
elif isinstance(value, str):
Expand All @@ -266,14 +272,14 @@ def _compat_date(
raise ValueError("Unexpected date() argument:", type(value), value)

@staticmethod
def _compat_to_utf8(value: Optional[str]) -> Optional[datetime.date]:
def _compat_to_utf8(value: str | None) -> datetime.date | None:
"""See the create_function() call for to_utf8 for more background"""
return value

@staticmethod
def _compat_from_iso8601_timestamp(
value: Optional[str],
) -> Optional[datetime.datetime]:
value: str | None,
) -> datetime.datetime | None:
if value is None:
return None

@@ -329,12 +335,13 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]:
"""Loads a directory tree of raw ndjson into schema-ful tables.

:param path: a directory path
:returns: dictionary of table names (like 'documentreference') to table data (with schema)
:returns: dictionary of table names (like 'documentreference') to table
data (with schema)
"""
all_tables = {}

# Manually specify the list of resources because we want to create each table even if the
# folder does not exist.
# Manually specify the list of resources because we want to create each table
# even if the folder does not exist.
resources = [
"AllergyIntolerance",
"Condition",
@@ -364,7 +371,7 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]:
# Read all ndjson directly into memory
rows = []
for filename in filenames:
with open(filename, "r", encoding="utf8") as f:
with open(filename, encoding="utf8") as f:
for line in f:
rows.append(json.loads(line))
