Commit 910ae33

Merge pull request #331 from NCI-CGR/issue_326
feat: Improve handling of unexpected replicates (issue 269)
carynwillis authored Sep 26, 2024
2 parents 24c0fb2 + eec673b commit 910ae33
Showing 1 changed file with 28 additions and 18 deletions.
46 changes: 28 additions & 18 deletions src/cgr_gwas_qc/workflow/scripts/sample_qc_table.py
@@ -192,7 +192,9 @@ def main(
     )
 
     add_qc_columns(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
 
     sample_qc = sample_qc.rename(
@@ -328,19 +330,20 @@ def _read_GRAF(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
     .. _manuscript: https://pubmed.ncbi.nlm.nih.gov/31151998/
     """
-    return (
-        pd.read_csv(file_name, sep="\t")
-        .assign(
-            Sample_ID=lambda x: x["Subject"].astype(str)
-        )  # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character
-        .assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
-        .assign(AFR=lambda x: x["P_f (%)"] / 100)
-        .assign(EUR=lambda x: x["P_e (%)"] / 100)
-        .assign(ASN=lambda x: x["P_a (%)"] / 100)
-        .set_index("Sample_ID")
-        .loc[:, ("AFR", "EUR", "ASN", "Ancestry")]
-        .reindex(Sample_IDs)
-    )
+    graf = pd.read_csv(file_name, sep="\t")
+    graf = graf.assign(
+        Sample_ID=lambda x: x["Subject"].astype(str)
+    )  # Issue 216: When subject IDs are numeric reindex fails. This makes sure index Sample_ID will always be as a character
+    graf = graf.assign(Ancestry=lambda x: x["Computed population"].str.replace(" ", "_"))
+    graf = graf.assign(AFR=lambda x: x["P_f (%)"] / 100)
+    graf = graf.assign(EUR=lambda x: x["P_e (%)"] / 100)
+    graf = graf.assign(ASN=lambda x: x["P_a (%)"] / 100)
+    graf = graf.set_index("Sample_ID")
+    graf = graf.loc[:, ("AFR", "EUR", "ASN", "Ancestry")]
+    graf["Ancestry"] = graf["Ancestry"].fillna("Other")
+    graf = graf.reindex(Sample_IDs)
+    return graf
 
 
 def _read_SNPweights(file_name: Path, Sample_IDs: pd.Index) -> pd.DataFrame:
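
Note on the substantive change above: the new graf["Ancestry"] = graf["Ancestry"].fillna("Other") line labels samples whose GRAF ancestry call comes back empty as "Other" instead of leaving a missing value, and it runs before the reindex, so samples absent from the GRAF output entirely still come back as NaN. A minimal standalone sketch of that behavior (sample IDs and values below are hypothetical, not taken from the PR):

import pandas as pd

# Hypothetical GRAF output after set_index/.loc, before reindexing.
graf = pd.DataFrame(
    {
        "AFR": [0.05, 0.10],
        "EUR": [0.90, 0.20],
        "ASN": [0.05, 0.70],
        "Ancestry": ["European", None],  # GRAF left the second call blank
    },
    index=pd.Index(["S001", "S002"], name="Sample_ID"),
)

graf["Ancestry"] = graf["Ancestry"].fillna("Other")  # the line added in this commit
graf = graf.reindex(pd.Index(["S001", "S002", "S003"], name="Sample_ID"))

# S001 keeps "European", S002 becomes "Other", and S003 (not in the GRAF
# output at all) is all-NaN because fillna runs before the reindex.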
@@ -411,7 +414,8 @@ def _read_contam(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.DataFrame:
 
     if file_name is None:
         return pd.DataFrame(
-            index=Sample_IDs, columns=["Contamination_Rate", "is_contaminated"],
+            index=Sample_IDs,
+            columns=["Contamination_Rate", "is_contaminated"],
         ).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})
 
     return (
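
Aside (not part of this commit's change, which is formatting-only in this hunk): the surrounding context builds an all-missing placeholder frame when no contamination file is supplied, and the nullable "boolean" dtype keeps the missing flags as <NA> rather than coercing them to False. A small sketch of that pattern, with hypothetical sample IDs:

import pandas as pd

sample_ids = pd.Index(["S001", "S002"], name="Sample_ID")  # hypothetical IDs

# All-NaN placeholder with explicit dtypes, mirroring the fallback above.
placeholder = pd.DataFrame(
    index=sample_ids,
    columns=["Contamination_Rate", "is_contaminated"],
).astype({"Contamination_Rate": "float", "is_contaminated": "boolean"})

# Contamination_Rate is float64 (all NaN); is_contaminated uses pandas'
# nullable boolean dtype, so its missing values stay <NA>, not False.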
@@ -454,12 +458,16 @@ def _read_intensity(file_name: Optional[Path], Sample_IDs: pd.Index) -> pd.Series:
 
 
 def add_qc_columns(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     add_call_rate_flags(sample_qc)
     _add_identifiler(sample_qc)
     _add_analytic_exclusion(
-        sample_qc, remove_contam, remove_rep_discordant,
+        sample_qc,
+        remove_contam,
+        remove_rep_discordant,
     )
     _add_subject_representative(sample_qc)
     _add_subject_dropped_from_study(sample_qc)
@@ -505,7 +513,9 @@ def reason_string(row: pd.Series) -> str:
 
 
 def _add_analytic_exclusion(
-    sample_qc: pd.DataFrame, remove_contam: bool, remove_rep_discordant: bool,
+    sample_qc: pd.DataFrame,
+    remove_contam: bool,
+    remove_rep_discordant: bool,
 ) -> pd.DataFrame:
     """Adds a flag to remove samples based on provided conditions.
