Add an advanced dynamic_study_prefix field to manifests.

This will allow a study to provide a Python file that accepts study options and generates an appropriate study prefix. It's largely targeted at the data_metrics study, but might have application elsewhere.
smart-on-fhir · Oct 25, 2024 · 761773a · 761773a
1 parent fb0823d
commit 761773a
Show file tree

Hide file tree

Showing 16 changed files with 321 additions and 66 deletions.
diff --git a/MAINTAINER.md b/MAINTAINER.md
@@ -1,6 +1,6 @@
 # Maintainer notes
 
-This document is intended for users contributing/manitaining this repository.
+This document is intended for users contributing/maintaining this repository.
 It is not comprehensive, but aims to capture items relevant to architecture
 that are not covered in another document.
 
@@ -10,7 +10,7 @@ Since these terms are used somewhat haphazardly in different database implementa
 we'll quickly define them for the purposes of this document:
 
 - database - a single instance of a database product, local or in the cloud. it can
-contain serveral schemas.
+contain several schemas.
 - schema - a namespaced collection of tables inside a database
 
 Cumulus, as a holistic system, is designed to allow querying against the entire history
@@ -36,4 +36,58 @@ via cross-database joining. Additional tables should not be created by users in
 schemas.
 
 A user could elect to use these vocabulary builders and skip the entire rest of the
-Cumulus ecosystem, if they wanted to. 
+Cumulus ecosystem, if they wanted to. 
+
+## Advanced study features
+
+These features are for very narrow and advanced use cases,
+designed for internal project studies (like `core`, `discovery`, or `data_metrics`).
+
+### Dynamic prefixes
+
+The `data_metrics` study wants to be able to generate an analysis of a single study cohort's data.
+In order to do this, it needs to privately namespace that analysis.
+
+The solution we use for this is to allow a study to dynamically generate the prefix it will use.
+Thus, the `data_metrics` study can use a prefix like `data_metrics_hypertension__` for a
+`hypertension` study and `data_metrics_covid__` for a `covid` study.
+
+#### Config
+Add a field called `dynamic_study_prefix` pointing at a local Python file.
+If this field is present, any `study_prefix` field is ignored.
+```toml
+dynamic_study_prefix = "gen_prefix.py"
+```
+
+#### Generator file
+
+Your generator file will be called as a Python script,
+with the `--option` arguments that Cumulus Library gets
+(but without the `--option` bit).
+You should ignore unknown options, for forward compatibility.
+
+You should print your desired prefix to stdout.
+
+Example:
+```python
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--study")
+args, _rest = parser.parse_known_args()
+
+if args.study:
+  print(f"data_metrics_{args.study}")
+else:
+  print("data_metrics")
+```
+
+#### Usage
+
+Your study still has to be selected using its original name (`--target=data_metrics`),
+but the resulting tables will be prefixed using the generated study name.
+
+This command line would build a `data_metrics_hypertension` study:
+```sh
+cumulus-library build -t data_metrics --option study:hypertension
+```
diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
@@ -48,7 +48,7 @@ def _load_and_execute_builder(
     :keyword filename: filename of a module implementing a TableBuilder
     :keyword db_parser: an object implementing DatabaseParser for the target database
     :keyword write_reference_sql: if true, writes sql to disk inside a study's directory
-    :keyword doc_string: A string to insert between queries written to disk
+    :keyword doc_str: A string to insert between queries written to disk
     """
 
     # Since we have to support arbitrary user-defined python files here, we

diff --git a/cumulus_library/builders/counts.py b/cumulus_library/builders/counts.py
@@ -1,6 +1,5 @@
 """Class for generating counts tables from templates"""
 
-import sys
 from pathlib import Path
 
 from cumulus_library import BaseTableBuilder, errors, study_manifest
@@ -14,37 +13,28 @@ class CountsBuilder(BaseTableBuilder):
     """Extends BaseTableBuilder for counts-related use cases"""
 
     def __init__(self, study_prefix: str | None = None):
-        if study_prefix is None:
-            # This slightly wonky approach will give us the path of the
-            # directory of a class extending the CountsBuilder
-            study_path = Path(sys.modules[self.__module__].__file__).parent
-
-            try:
-                parser = study_manifest.StudyManifest(study_path)
-                self.study_prefix = parser.get_study_prefix()
-            except Exception as e:
-                raise errors.CountsBuilderError(
-                    "CountsBuilder must be either initiated with a study prefix, "
-                    "or be in a directory with a valid manifest.toml"
-                ) from e
-        else:
-            self.study_prefix = study_prefix
         super().__init__()
+        self.study_prefix = study_prefix
 
     def get_table_name(self, table_name: str, duration=None) -> str:
         """Convenience method for constructing table name
 
         :param table_name: table name to add after the study prefix
         :param duration: a time period reflecting the table binning strategy
         """
+        if not self.study_prefix:
+            raise errors.CountsBuilderError(
+                "CountsBuilder must be either initiated with a study prefix, "
+                "or be in a directory with a valid manifest.toml"
+            )
         if duration:
             return f"{self.study_prefix}__{table_name}_{duration}"
         else:
             return f"{self.study_prefix}__{table_name}"
 
     def get_where_clauses(
         self, clause: list | str | None = None, min_subject: int | None = None
-    ) -> str:
+    ) -> list[str]:
         """Convenience method for constructing arbitrary where clauses.
 
         :param clause: either a string or a list of sql where statements
@@ -295,10 +285,13 @@ def write_counts(self, filepath: str):
         self.comment_queries()
         self.write_queries(path=Path(filepath))
 
-    def prepare_queries(self, cursor: object | None = None, schema: str | None = None):
-        """Stub implementing abstract base class
+    def prepare_queries(
+        self, *args, manifest: study_manifest.StudyManifest | None = None, **kwargs
+    ):
+        """Prepare count queries
 
         This should be overridden in any count generator. See studies/core/count_core.py
         for an example
         """
-        pass  # pragma: no cover
+        if manifest and not self.study_prefix:
+            self.study_prefix = manifest.get_study_prefix()
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
@@ -37,11 +37,11 @@ class StudyRunner:
     verbose = False
     schema_name = None
 
-    def __init__(self, config: base_utils.StudyConfig, data_path: str):
+    def __init__(self, config: base_utils.StudyConfig, data_path: str | None):
         self.config = config
-        self.data_path = data_path
+        self.data_path = data_path and pathlib.Path(data_path)
 
-    def get_config(self, manifest=study_manifest.StudyManifest):
+    def get_config(self, manifest: study_manifest.StudyManifest):
         schema = base_utils.get_schema(self.config, manifest)
         if schema == self.config.schema:
             return self.config
@@ -57,6 +57,7 @@ def clean_study(
         targets: list[str],
         study_dict: dict,
         *,
+        options: dict[str, str],
         prefix: bool = False,
     ) -> None:
         """Removes study table/views from Athena.
@@ -65,8 +66,9 @@ def clean_study(
         this can be useful for cleaning up tables if a study prefix is changed
         for some reason.
 
-        :param target: The study prefix to use for IDing tables to remove
+        :param targets: The study prefixes to use for IDing tables to remove
         :param study_dict: The dictionary of available study targets
+        :param options: The dictionary of study-specific options
         :keyword prefix: If True, does a search by string prefix in place of study name
         """
         if targets is None:
@@ -77,26 +79,28 @@ def clean_study(
 
         for target in targets:
             if prefix:
-                manifest = study_manifest.StudyManifest()
+                manifest = study_manifest.StudyManifest(options=options)
                 cleaner.clean_study(
                     config=self.get_config(manifest), manifest=manifest, prefix=target
                 )
             else:
-                manifest = study_manifest.StudyManifest(study_dict[target])
+                manifest = study_manifest.StudyManifest(study_dict[target], options=options)
                 cleaner.clean_study(config=self.get_config(manifest), manifest=manifest)
 
     def clean_and_build_study(
         self,
         target: pathlib.Path,
         *,
+        options: dict[str, str],
         continue_from: str | None = None,
     ) -> None:
         """Recreates study views/tables
 
         :param target: A path to the study directory
+        :param options: The dictionary of study-specific options
         :keyword continue_from: Restart a run from a specific sql file (for dev only)
         """
-        manifest = study_manifest.StudyManifest(target, self.data_path)
+        manifest = study_manifest.StudyManifest(target, self.data_path, options=options)
         try:
             builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest)
             if not continue_from:
@@ -150,28 +154,39 @@ def run_matching_table_builder(
         self,
         target: pathlib.Path,
         table_builder_name: str,
+        *,
+        options: dict[str, str],
     ) -> None:
         """Runs a single table builder
 
         :param target: A path to the study directory
         :param table_builder_name: a builder file referenced in the study's manifest
+        :param options: The dictionary of study-specific options
         """
-        manifest = study_manifest.StudyManifest(target)
+        manifest = study_manifest.StudyManifest(target, options=options)
         builder.run_matching_table_builder(
             config=self.get_config(manifest),
             manifest=manifest,
             builder=table_builder_name,
         )
 
     ### Data exporters
-    def export_study(self, target: pathlib.Path, data_path: pathlib.Path, archive: bool) -> None:
+    def export_study(
+        self,
+        target: pathlib.Path,
+        data_path: pathlib.Path,
+        archive: bool,
+        *,
+        options: dict[str, str],
+    ) -> None:
         """Exports aggregates defined in a manifest
 
         :param target: A path to the study directory
-        :param datapath: The location to export data to
+        :param data_path: The location to export data to
         :param archive: If true, will export all tables, otherwise uses manifest list
+        :param options: The dictionary of study-specific options
         """
-        manifest = study_manifest.StudyManifest(target, data_path)
+        manifest = study_manifest.StudyManifest(target, data_path, options=options)
         exporter.export_study(
             config=self.get_config(manifest),
             manifest=manifest,
@@ -183,14 +198,14 @@ def generate_study_sql(
         self,
         target: pathlib.Path,
         *,
-        builder: str | None = None,
+        options: dict[str, str],
     ) -> None:
         """Materializes study sql from templates
 
         :param target: A path to the study directory
-        :keyword builder: Specify a single builder to generate sql from
+        :param options: The dictionary of study-specific options
         """
-        manifest = study_manifest.StudyManifest(target)
+        manifest = study_manifest.StudyManifest(target, options=options)
         file_generator.run_generate_sql(config=self.get_config(manifest), manifest=manifest)
 
     def generate_study_markdown(
@@ -252,7 +267,8 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]:
             manifest_paths.update(get_studies_by_manifest_path(child_path))
         elif child_path.name == "manifest.toml":
             try:
-                manifest = study_manifest.StudyManifest(path)
+                # Pass empty options, since we are not in a study-specific context
+                manifest = study_manifest.StudyManifest(path, options={})
                 manifest_paths[manifest.get_study_prefix()] = path
             except errors.StudyManifestParsingError as exc:
                 rich.print(f"[bold red] Ignoring study in '{path}': {exc}")
@@ -309,15 +325,19 @@ def run_cli(args: dict):
                     targets=args["target"],
                     study_dict=study_dict,
                     prefix=args["prefix"],
+                    options=args["options"],
                 )
             elif args["action"] == "build":
                 for target in args["target"]:
                     if args["builder"]:
-                        runner.run_matching_table_builder(study_dict[target], args["builder"])
+                        runner.run_matching_table_builder(
+                            study_dict[target], args["builder"], options=args["options"]
+                        )
                     else:
                         runner.clean_and_build_study(
                             study_dict[target],
                             continue_from=args["continue_from"],
+                            options=args["options"],
                         )
 
             elif args["action"] == "export":
@@ -337,7 +357,12 @@ def run_cli(args: dict):
                     if response.lower() != "y":
                         sys.exit()
                 for target in args["target"]:
-                    runner.export_study(study_dict[target], args["data_path"], args["archive"])
+                    runner.export_study(
+                        study_dict[target],
+                        args["data_path"],
+                        args["archive"],
+                        options=args["options"],
+                    )
 
             elif args["action"] == "import":
                 for archive in args["archive_path"]:
@@ -346,7 +371,7 @@ def run_cli(args: dict):
 
             elif args["action"] == "generate-sql":
                 for target in args["target"]:
-                    runner.generate_study_sql(study_dict[target], builder=args["builder"])
+                    runner.generate_study_sql(study_dict[target], options=args["options"])
 
             elif args["action"] == "generate-md":
                 for target in args["target"]:
@@ -429,7 +454,7 @@ def main(cli_args=None):
     options = {}
     cli_options = args.get("options") or []
     for c_arg in cli_options:
-        c_arg = c_arg.split(":", 2)
+        c_arg = c_arg.split(":", 1)
         if len(c_arg) == 1:
             sys.exit(
                 f"Custom argument '{c_arg}' is not validly formatted.\n"

diff --git a/cumulus_library/studies/core/count_core.py b/cumulus_library/studies/core/count_core.py
@@ -115,6 +115,7 @@ def count_core_patient(self):
         return self.count_patient(table_name, from_table, cols)
 
     def prepare_queries(self, *args, **kwargs):
+        super().prepare_queries(*args, **kwargs)
         self.queries = [
             self.count_core_allergyintolerance(duration="month"),
             self.count_core_condition(duration="month"),