Process new ETL gpt tables into symptom lists

smart-on-fhir · Aug 22, 2024 · ac0633f · ac0633f
1 parent dc237d7
commit ac0633f
Show file tree

Hide file tree

Showing 10 changed files with 280 additions and 5 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -12,6 +12,14 @@ jobs:
         with:
           python-version: "3.11"
 
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[tests]
+
+      - name: Test with pytest
+        run: python -m pytest
+
   lint:
     runs-on: ubuntu-latest
     steps:

diff --git a/RUNNING.md b/RUNNING.md
@@ -43,7 +43,9 @@ You should now have all the interesting results sitting in Athena.
 
 In Athena's web console, run these commands and download the CSV results,
 using the given filenames (we will refer back to these filenames later):
-- **ctakes.csv**: `select encounter_ref, symptom_display from covid_symptom__symptom_ctakes_negation`
+- **ctakes.csv** (if you ran cTAKES): `select encounter_ref, symptom_display from covid_symptom__symptom_ctakes_negation`
+- **gpt35.csv** (if you ran ChatGPT 3.5): `select encounter_ref, symptom_display from covid_symptom__symptom_gpt35`
+- **gpt4.csv** (if you ran ChatGPT 4): `select encounter_ref, symptom_display from covid_symptom__symptom_gpt4`
 - **docrefs.csv**: `select distinct docref_id from covid_symptom__symptom_ctakes_negation`
 - **icd10.csv**: `select encounter_ref, substring(icd10_display, 7) as symptom_display from covid_symptom__symptom_icd10`
 
@@ -115,7 +117,7 @@ Save this file as `labelstudio-export.json` in a new folder.
 ## 8. Set up `chart-review`
 
 - Run `pip install chart-review`
-- Copy `ctakes.csv` and `icd10.csv` from step 3 above into the same folder
+- Copy the `.csv` files from step 3 above into the same folder
   you used for `labelstudio-export.json` above (the "chart review folder").
 - Add a new `config.yaml` file in that folder:
 ```yaml
@@ -137,6 +139,10 @@ annotators:
   human2: 2
   ctakes:
     filename: ctakes.csv
+  gpt35:
+    filename: gpt35.csv
+  gpt4:
+    filename: gpt4.csv
   icd10:
     filename: icd10.csv
 ```

diff --git a/cumulus_library_covid/__init__.py b/cumulus_library_covid/__init__.py
@@ -1,3 +1,3 @@
 """SQL generation for cumulus covid symptom analysis"""
 
-__version__ = "2.0.0"
+__version__ = "2.1.0"
diff --git a/cumulus_library_covid/builder_gpt.jinja b/cumulus_library_covid/builder_gpt.jinja
@@ -0,0 +1,39 @@
+-- Map boolean column names to symptom labels that Chart Review will use
+{% set cols = {
+  'congestion_or_runny_nose': 'Congestion or runny nose',
+  'cough': 'Cough',
+  'diarrhea': 'Diarrhea',
+  'dyspnea': 'Dyspnea',
+  'fatigue': 'Fatigue',
+  'fever_or_chills': 'Fever or chills',
+  'headache': 'Headache',
+  'loss_of_taste_or_smell': 'Loss of taste or smell',
+  'muscle_or_body_aches': 'Muscle or body aches',
+  'nausea_or_vomiting': 'Nausea or vomiting',
+  'sore_throat': 'Sore throat',
+} -%}
+
+CREATE TABLE covid_symptom__symptom_{{ model }} AS
+
+-- Iterate table once for each symptom, noting the cases where it was present.
+-- The encounter column is aliased explicitly as encounter_ref in every branch,
+-- because downstream queries select it by that name
+{% for col_name, symptom_label in cols.items() %}
+SELECT
+    CONCAT('Encounter/', nr.encounter_id) AS encounter_ref,
+    '{{ symptom_label }}' AS symptom_display
+FROM covid_symptom__nlp_results_{{ model }} AS nr
+WHERE nr.symptoms.{{ col_name }}
+UNION
+{% endfor %}
+
+-- Also capture encounters with no symptoms (as a single empty symptom label).
+-- Chart review will recognize this as "reviewed, but did not find anything".
+SELECT
+    CONCAT('Encounter/', nr.encounter_id) AS encounter_ref,
+    '' AS symptom_display
+FROM covid_symptom__nlp_results_{{ model }} AS nr
+WHERE
+{% for col_name in cols %}
+NOT nr.symptoms.{{ col_name }}
+{%- if not loop.last %} AND{%- endif %}
+{% endfor %}
+;
diff --git a/cumulus_library_covid/builder_gpt.py b/cumulus_library_covid/builder_gpt.py
@@ -0,0 +1,24 @@
+"""Builder for the ChatGPT symptoms tables."""
+
+import os
+
+import cumulus_library
+import jinja2
+
+
+class GptBuilder(cumulus_library.BaseTableBuilder):
+    display_text = "Creating ChatGPT symptom tables..."
+
+    def prepare_queries(self, *args, **kwargs):
+        self.queries += [
+            self.render_sql("builder_gpt", model="gpt35"),
+            self.render_sql("builder_gpt", model="gpt4"),
+        ]
+
+    @staticmethod
+    def render_sql(template: str, **kwargs) -> str:
+        path = os.path.dirname(__file__)
+        with open(f"{path}/{template}.jinja") as file:
+            loader = jinja2.FileSystemLoader(path)
+            env = jinja2.Environment(loader=loader, autoescape=True).from_string(file.read())
+            return env.render(**kwargs)
diff --git a/cumulus_library_covid/counts.py b/cumulus_library_covid/counts.py
@@ -1,9 +1,9 @@
 from pathlib import Path
 
-from cumulus_library.statistics.counts import CountsBuilder
+import cumulus_library
 
 
-class CovidCountsBuilder(CountsBuilder):
+class CovidCountsBuilder(cumulus_library.CountsBuilder):
     display_text = "Creating covid counts..."
 
     def count_dx(self, duration="week"):

diff --git a/cumulus_library_covid/manifest.toml b/cumulus_library_covid/manifest.toml
@@ -1,5 +1,10 @@
 study_prefix = "covid_symptom"
 
+[table_builder_config]
+file_names = [
+    "builder_gpt.py",
+]
+
 [sql_config]
 file_names = [
 #    "define_age_pediatric.sql",

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,10 @@ dev = [
     # if you update the ruff version, also update .pre-commit-config.yaml
     "ruff < 0.6",
 ]
+tests = [
+    "duckdb",
+    "pandas",
+]
 
 [tool.ruff]
 line-length = 100
@@ -46,3 +50,5 @@ select = [
     "S",  # bandit security warnings
     "UP",  # alert you when better syntax is available in your python version
 ]
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["S"]  # tests do suspicious stuff that's fine, actually
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_gpt.py b/tests/test_gpt.py
@@ -0,0 +1,187 @@
+"""Gpt unit tests"""
+
+import datetime
+import os
+import tempfile
+import unittest
+
+import duckdb
+import pandas
+from cumulus_library import cli
+
+
+class GptTestCase(unittest.TestCase):
+    """Test case for the gpt symptom tables."""
+
+    def setUp(self):
+        super().setUp()
+        self.maxDiff = None
+
+    @staticmethod
+    def register(db: duckdb.DuckDBPyConnection, name: str, table: pandas.DataFrame) -> None:
+        db.register(f"{name}_df", table)
+        db.sql(f"CREATE TABLE {name} AS SELECT * FROM {name}_df")
+
+    def make_core_tables(self, db: duckdb.DuckDBPyConnection) -> None:
+        """Make a single core encounter, patient, and docref"""
+        encounter = pandas.DataFrame(
+            {
+                "period_start_day": [datetime.date(2016, 10, 10)],
+                "period_start_week": [datetime.date(2016, 10, 10)],
+                "period_start_month": [datetime.date(2016, 10, 1)],
+                "period_end_day": [datetime.date(2016, 10, 11)],
+                "age_at_visit": [12],
+                "status": ["finished"],
+                "class_code": [None],
+                "class_display": [None],
+                "encounter_ref": ["Encounter/E1"],
+                "subject_ref": ["Patient/P1"],
+            }
+        )
+        self.register(db, "core__encounter", encounter)
+
+        patient = pandas.DataFrame(
+            {
+                "gender": ["unknown"],
+                "race_display": [None],
+                "ethnicity_display": [None],
+                "subject_ref": ["Patient/P1"],
+            }
+        )
+        self.register(db, "core__patient", patient)
+
+        condition = pandas.DataFrame(
+            {
+                "recordeddate_day": [datetime.date(2016, 10, 10)],
+                "recordeddate_week": [datetime.date(2016, 10, 10)],
+                "recordeddate_month": [datetime.date(2016, 10, 1)],
+                "recordeddate_year": [datetime.date(2016, 1, 1)],
+                "code": ["U07.1"],
+                "encounter_ref": ["Encounter/E1"],
+                "subject_ref": ["Patient/P1"],
+            }
+        )
+        self.register(db, "core__condition", condition)
+
+        docref = pandas.DataFrame(
+            {
+                "author_day": [datetime.date(2016, 10, 10)],
+                "author_week": [datetime.date(2016, 10, 10)],
+                "author_month": [datetime.date(2016, 10, 1)],
+                "author_year": [datetime.date(2016, 1, 1)],
+                "type_code": ["34878-9"],
+                "type_display": ["Emergency medicine Note"],
+                "documentreference_ref": ["DocumentReference/D1"],
+                "encounter_ref": ["Encounter/E1"],
+                "subject_ref": ["Patient/P1"],
+            }
+        )
+        self.register(db, "core__documentreference", docref)
+
+        lab = pandas.DataFrame(
+            {
+                "observation_code": ["94309-2"],
+                "effectivedatetime_day": [datetime.date(2016, 10, 10)],
+                "effectivedatetime_week": [datetime.date(2016, 10, 10)],
+                "effectivedatetime_month": [datetime.date(2016, 10, 1)],
+                "valuecodeableconcept_code": ["10828004"],
+                "observation_ref": ["Observation/O1"],
+                "encounter_ref": ["Encounter/E1"],
+                "subject_ref": ["Patient/P1"],
+            }
+        )
+        self.register(db, "core__observation_lab", lab)
+
+        ed_note = pandas.DataFrame(
+            {
+                "code": ["34878-9"],
+                "from_code": ["149798455"],
+            }
+        )
+        self.register(db, "core__ed_note", ed_note)
+
+        nlp = pandas.DataFrame(
+            {
+                "docref_id": ["D1"],
+                "encounter_id": ["E1"],
+                "subject_id": ["P1"],
+                "match": [
+                    {
+                        "conceptattributes": [
+                            {"cui": "C0027424"},
+                        ],
+                        "text": "Congestion",
+                    }
+                ],
+            }
+        )
+        self.register(db, "covid_symptom__nlp_results", nlp)
+
+    def make_gpt_table(self, db: duckdb.DuckDBPyConnection, name: str, **kwargs) -> None:
+        symptoms = {
+            "congestion_or_runny_nose": False,
+            "cough": False,
+            "diarrhea": False,
+            "dyspnea": False,
+            "fatigue": False,
+            "fever_or_chills": False,
+            "headache": False,
+            "loss_of_taste_or_smell": False,
+            "muscle_or_body_aches": False,
+            "nausea_or_vomiting": False,
+            "sore_throat": False,
+        }
+        symptoms.update(kwargs)
+        table = pandas.DataFrame(
+            {
+                "encounter_id": ["E1"],
+                "symptoms": [symptoms],
+            }
+        )
+        self.register(db, f"covid_symptom__nlp_results_{name}", table)
+
+    def test_happy_path(self) -> None:
+        """Runs the study on some input data and spot-checks the gpt results"""
+        test_dir = os.path.dirname(__file__)
+        root_dir = os.path.dirname(test_dir)
+        study_dir = f"{root_dir}/cumulus_library_covid"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            db = duckdb.connect(f"{tmpdir}/duck.db")
+            self.make_core_tables(db)
+            self.make_gpt_table(db, "gpt35", cough=True, fever_or_chills=True)
+            self.make_gpt_table(db, "gpt4")  # test that we mark no-symptom-found docrefs
+            db.close()
+
+            cli.main(
+                [
+                    "build",
+                    # "--verbose",
+                    "--target=covid_symptom",
+                    f"--study-dir={study_dir}",
+                    "--db-type=duckdb",
+                    f"--database={tmpdir}/duck.db",
+                ]
+            )
+            db = duckdb.connect(f"{tmpdir}/duck.db")
+
+            # Confirm we flag the right symptoms when present
+            rel = db.sql("SELECT * FROM covid_symptom__symptom_gpt35")
+            rows = rel.order("symptom_display").fetchall()
+            self.assertEqual(
+                [
+                    ("Encounter/E1", "Cough"),
+                    ("Encounter/E1", "Fever or chills"),
+                ],
+                rows,
+            )
+
+            # Confirm we flag a no-results docref too
+            rel = db.sql("SELECT * FROM covid_symptom__symptom_gpt4")
+            rows = rel.order("symptom_display").fetchall()
+            self.assertEqual(
+                [
+                    ("Encounter/E1", ""),
+                ],
+                rows,
+            )