Merge pull request #310 from NCI-CGR/302-skip-merging-graf-king-and-p…

…link-results-to-make-the-qc-pipeline-more-efficient 302 skip merging graf king and plink results to make the qc pipeline more efficient
NCI-CGR · Aug 6, 2024 · 3d31024 · 3d31024
2 parents 4015e2c + bd12378
commit 3d31024
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 144 deletions.
diff --git a/docs/static/Thumbs.db b/docs/static/Thumbs.db
diff --git a/src/cgr_gwas_qc/workflow/scripts/qc_report_table.py b/src/cgr_gwas_qc/workflow/scripts/qc_report_table.py
@@ -132,11 +132,6 @@ def _sample_qc(sample_sheet_csv: PathLike, sample_qc_csv: PathLike) -> pd.DataFr
     "PLINK_concordance",
     "PLINK_is_ge_pi_hat",
     "PLINK_is_ge_concordance",
-    "GRAF_HGMR",
-    "GRAF_AGMR",
-    "GRAF_relationship",
-    "KING_Kinship",
-    "KING_relationship",
 ]
 
 
@@ -158,10 +153,8 @@ def _sample_concordance(sample_qc_csv: PathLike, sample_concordance_csv: PathLik
     return (
         sample_concordance.read(sample_concordance_csv)
         .pipe(
-            lambda x: x[
-                x.PLINK_PI_HAT.notna() | (x.GRAF_relationship.notna() & x.KING_relationship.notna())
-            ]
-        )  # Keep if we have results from plink or both GRAF and KING. This will limit the number of rows.
+            lambda x: x[x.PLINK_PI_HAT.notna()]
+        )  # Keep if we have results from plink. This will limit the number of rows.
         .merge(ancestry.rename_axis("Sample_ID1").rename("Ancestry1"), on="Sample_ID1", how="left")
         .merge(ancestry.rename_axis("Sample_ID2").rename("Ancestry2"), on="Sample_ID2", how="left")
         .merge(

diff --git a/src/cgr_gwas_qc/workflow/scripts/sample_concordance.py b/src/cgr_gwas_qc/workflow/scripts/sample_concordance.py
@@ -23,26 +23,18 @@
     PLINK_concordance, float, Proportion IBS2 ``IBS2 / (IBS0 + IBS1 + IBS2)``
     PLINK_is_ge_pi_hat, boolean, True if PI_HAT was greater than ``software_params.pi_hat_cutoff``
     PLINK_is_ge_concordance, boolean, True if concordance was greater than ``software_params.dup_concordance_cutoff``
-    GRAF_HGMR, float, Homozygous Genotype Mismatch Rate (%)
-    GRAF_AGMR, float, All Genotype Mismatch Rate (%)
-    GRAF_relationship, string, relationship determined by sample genotypes.
-    KING_Kinship, float, Estimated kinship coefficient from the SNP data
-    KING_relationship, string, The assigned relationship based on Kinship
 
 References:
     - :mod:`cgr_gwas_qc.workflow.scripts.concordance_table`
-    - :mod:`cgr_gwas_qc.parsers.king`
-    - :mod:`cgr_gwas_qc.parsers.graf`
 """
-import os
 from itertools import combinations
 from pathlib import Path
 from typing import List, Tuple
 
 import pandas as pd
 import typer
 
-from cgr_gwas_qc.parsers import graf, king, sample_sheet
+from cgr_gwas_qc.parsers import sample_sheet
 from cgr_gwas_qc.typing import PathLike
 from cgr_gwas_qc.workflow.scripts import concordance_table
 
@@ -64,11 +56,6 @@
     "PLINK_concordance": "float",
     "PLINK_is_ge_pi_hat": "boolean",
     "PLINK_is_ge_concordance": "boolean",
-    "GRAF_HGMR": "float",
-    "GRAF_AGMR": "float",
-    "GRAF_relationship": "string",
-    "KING_Kinship": "float",
-    "KING_relationship": "string",
 }
 
 
@@ -92,11 +79,6 @@ def read(filename: PathLike):
         - PLINK_concordance
         - PLINK_is_ge_pi_hat
         - PLINK_is_ge_concordance
-        - GRAF_HGMR
-        - GRAF_AGMR
-        - GRAF_relationship
-        - KING_Kinship
-        - KING_relationship
     """
     return pd.read_csv(filename, dtype=DTYPES)
 
@@ -107,12 +89,8 @@ def main(
 ):
     ss = sample_sheet.read(sample_sheet_csv)
 
-    # Check if graf or king are empty (no related). If so add expected final headers to file
-    check_empty_add_headers(graf_file, ["GRAF_HGMR", "GRAF_AGMR", "GRAF_relationship"])
-    check_empty_add_headers(king_file, ["KING_Kinship", "KING_relationship"])
-
     concordance = (
-        build(plink_file, graf_file, king_file)
+        build(plink_file)
         .pipe(_add_expected_replicates, ss)
         .pipe(_add_discordant_replicates)
         .pipe(_add_unexpected_replicates)
@@ -125,21 +103,9 @@ def main(
     )
 
 
-def check_empty_add_headers(file_path: Path, headers: list):
-    """Check if the file is empty"""
-    if os.path.getsize(file_path) == 0:
-        with open(file_path, "w") as f:
-            f.write("\t".join(headers) + "\n")
-
-
-def build(plink_file: PathLike, graf_file: PathLike, king_file: PathLike):
+def build(plink_file: PathLike):
     """Build the main concordance table."""
-    return (
-        _plink(plink_file)
-        .join(_graf(graf_file), how="outer")
-        .join(_king(king_file), how="outer")
-        .rename_axis(["Sample_ID1", "Sample_ID2"])
-    )
+    return _plink(plink_file).rename_axis(["Sample_ID1", "Sample_ID2"])
 
 
 def _add_subject(df: pd.DataFrame, ss: pd.DataFrame) -> pd.DataFrame:
@@ -188,34 +154,18 @@ def _add_expected_replicates(df: pd.DataFrame, ss: pd.DataFrame) -> pd.DataFrame
 
 def _discordant_logic(sr: pd.Series) -> bool:
     if not sr.is_expected_replicate:
-        # not a replicate
+        # not an expected replicate
         return False
 
-    if all(
-        [
-            pd.isna(sr.PLINK_is_ge_concordance),
-            pd.isna(sr.GRAF_relationship),
-            pd.isna(sr.KING_relationship),
-        ]
-    ):
-        # No metrics so ignore
-        # Issue 210: If someone is an expected replicates but has no PLINK, GRAF, or KING then these should be flagged as an discordant replicate (True). Before was returning False
+    if pd.isna(sr.PLINK_is_ge_concordance):
+        # Issue 210: If someone is an expected replicates but has no PLINK then these should be flagged as an discordant replicate (True). Before was returning False
         return True
 
     if pd.notna(sr.PLINK_is_ge_concordance):
-        # plink call: True if < concordance threshold
+        # plink call: Mark as discordant (True) if expected replicate and < concordance threshold
         return not sr.PLINK_is_ge_concordance
 
-    if all([pd.notna(sr.GRAF_relationship), pd.notna(sr.KING_relationship)]):
-        # We have both graf and king calls: True if both are discordant
-        return (sr.GRAF_relationship != "ID") & (sr.KING_relationship != "ID")
-
-    if pd.notna(sr.GRAF_relationship):
-        # We only have graf call
-        return sr.GRAF_relationship != "ID"
-
-    # We only have king call
-    return sr.KING_relationship != "ID"
+    return False
 
 
 def _add_discordant_replicates(df: pd.DataFrame) -> pd.DataFrame:
@@ -231,8 +181,6 @@ def _add_unexpected_replicates(df: pd.DataFrame) -> pd.DataFrame:
     """
     df["is_unexpected_replicate"] = ~df.is_expected_replicate & (
         (df.PLINK_is_ge_concordance.notna() & df.PLINK_is_ge_concordance)
-        | (df.GRAF_relationship.notna() & (df.GRAF_relationship == "ID"))
-        | (df.KING_relationship.notna() & (df.KING_relationship == "ID"))
     )
     return df
 
@@ -253,26 +201,6 @@ def _plink(filename: PathLike):
     )
 
 
-def _graf(filename: PathLike):
-    return (
-        graf.read_relatedness(filename)
-        .set_index(["ID1", "ID2"])
-        .reindex(["HGMR", "AGMR", "relationship"], axis=1)
-        .rename(
-            {"HGMR": "GRAF_HGMR", "AGMR": "GRAF_AGMR", "relationship": "GRAF_relationship"}, axis=1,
-        )
-    )
-
-
-def _king(filename: PathLike):
-    return (
-        king.read_related(filename)
-        .set_index(["ID1", "ID2"])
-        .reindex(["Kinship", "relationship"], axis=1)
-        .rename({"Kinship": "KING_Kinship", "relationship": "KING_relationship"}, axis=1)
-    )
-
-
 def _get_known_replicates(ss: pd.DataFrame) -> List[Tuple[str, str]]:
     """Create list of pairwise combinations of know replicates.
 

diff --git a/tests/workflow/scripts/test_sample_concordance.py b/tests/workflow/scripts/test_sample_concordance.py
@@ -8,17 +8,11 @@
 @pytest.fixture
 def concordance():
     data_cache = FakeData()
-    return sample_concordance.build(
-        data_cache / "cgr/concordance.csv",
-        data_cache / "graf/relatedness.tsv",
-        data_cache / "king/king.kin0",
-    )
+    return sample_concordance.build(data_cache / "cgr/concordance.csv",)
 
 
 def test_build(concordance):
     assert 4 == concordance.filter(regex="^PLINK").shape[1]
-    assert 3 == concordance.filter(regex="^GRAF").shape[1]
-    assert 2 == concordance.filter(regex="^KING").shape[1]
     assert ["Sample_ID1", "Sample_ID2"] == concordance.index.names
 
 
@@ -91,38 +85,17 @@ def test_add_discordant_replicate_missing_pair(fake_cfg, concordance):
 
 
 @pytest.mark.parametrize(
-    "expected_result,is_rep,plink,graf,king",
+    "expected_result,is_rep,plink",
     [
-        (False, False, True, "ID", "ID"),  # Not an expected replicate
+        (False, False, True),  # Not an expected replicate
         # PLINK Concordant
-        (False, True, True, "ID", "ID"),  # all concordant
-        (False, True, True, "UN", "UN"),  # GRAF and KING don't matter
-        (False, True, True, pd.NA, pd.NA),  # GRAF and KING don't matter
+        (False, True, True),  # all concordant
         # PLINK Discordant
-        (True, True, False, "ID", "UN"),  # GRAF and KING don't matter
-        (True, True, False, "UN", "ID"),  # GRAF and KING don't matter
-        (True, True, False, pd.NA, pd.NA),  # GRAF and KING don't matter
-        (True, True, False, "UN", "UN"),  # GRAF and KING don't matter
-        # PLINK Missing
-        (False, True, pd.NA, "ID", "ID"),  # GRAF and KING concordant
-        (False, True, pd.NA, "ID", "UN"),  # GRAF and KING disagree
-        (False, True, pd.NA, "UN", "ID"),  # GRAF and KING disagree
-        (False, True, pd.NA, "ID", pd.NA),  # GRAF concordant
-        (False, True, pd.NA, pd.NA, "ID"),  # KING concordant
-        (True, True, pd.NA, "UN", pd.NA),  # GRAF discordant
-        (True, True, pd.NA, pd.NA, "UN"),  # KING discordant
-        (True, True, pd.NA, pd.NA, pd.NA),  # All missing: flag as discordant replicate
+        (True, True, False),  # GRAF and KING don't matter
     ],
 )
-def test_add_discordant_logic(expected_result, is_rep, plink, graf, king):
-    df = pd.DataFrame(
-        {
-            "is_expected_replicate": [is_rep],
-            "PLINK_is_ge_concordance": [plink],
-            "GRAF_relationship": [graf],
-            "KING_relationship": [king],
-        }
-    )
+def test_add_discordant_logic(expected_result, is_rep, plink):
+    df = pd.DataFrame({"is_expected_replicate": [is_rep], "PLINK_is_ge_concordance": [plink],})
 
     result = sample_concordance._add_discordant_replicates(df).is_discordant_replicate.squeeze()
     assert expected_result == result
@@ -149,29 +122,17 @@ def test_add_unexpected_replicate_one(fake_cfg, concordance):
 
 
 @pytest.mark.parametrize(
-    "expected_result,is_rep,plink,graf,king",
+    "expected_result,is_rep,plink",
     [
-        (False, True, True, "ID", "ID"),  # expected replicate
-        (True, False, True, "ID", "ID"),  # All concordant
-        (True, False, True, "UN", "UN"),  # plink concordant
-        (True, False, False, "ID", "UN"),  # graf concordant
-        (True, False, False, "UN", "ID"),  # king concordant
-        (True, False, True, pd.NA, pd.NA),  # plink concordant with missing
-        (True, False, pd.NA, "ID", pd.NA),  # graf concordant with missing
-        (True, False, pd.NA, pd.NA, "ID"),  # king concordant with missing
-        (False, False, False, "UN", "UN"),  # all discordant
-        (False, False, pd.NA, pd.NA, pd.NA),  # all missing
+        (False, True, True),  # expected replicate
+        (True, False, True),  # All concordant
+        (True, False, True),  # plink concordant
+        (False, False, False),  # all discordant
+        (False, False, pd.NA),  # all missing
     ],
 )
-def test_add_unexpected_logic(expected_result, is_rep, plink, graf, king):
-    df = pd.DataFrame(
-        {
-            "is_expected_replicate": [is_rep],
-            "PLINK_is_ge_concordance": [plink],
-            "GRAF_relationship": [graf],
-            "KING_relationship": [king],
-        }
-    )
+def test_add_unexpected_logic(expected_result, is_rep, plink):
+    df = pd.DataFrame({"is_expected_replicate": [is_rep], "PLINK_is_ge_concordance": [plink],})
 
     result = sample_concordance._add_unexpected_replicates(df).is_unexpected_replicate.squeeze()
     assert expected_result == result