Skip to content

Commit

Permalink
Merge pull request #310 from NCI-CGR/302-skip-merging-graf-king-and-p…
Browse files Browse the repository at this point in the history
…link-results-to-make-the-qc-pipeline-more-efficient

302 skip merging graf king and plink results to make the qc pipeline more efficient
  • Loading branch information
kliao12 authored Aug 6, 2024
2 parents 4015e2c + bd12378 commit 3d31024
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 144 deletions.
Binary file added docs/static/Thumbs.db
Binary file not shown.
11 changes: 2 additions & 9 deletions src/cgr_gwas_qc/workflow/scripts/qc_report_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,6 @@ def _sample_qc(sample_sheet_csv: PathLike, sample_qc_csv: PathLike) -> pd.DataFr
"PLINK_concordance",
"PLINK_is_ge_pi_hat",
"PLINK_is_ge_concordance",
"GRAF_HGMR",
"GRAF_AGMR",
"GRAF_relationship",
"KING_Kinship",
"KING_relationship",
]


Expand All @@ -158,10 +153,8 @@ def _sample_concordance(sample_qc_csv: PathLike, sample_concordance_csv: PathLik
return (
sample_concordance.read(sample_concordance_csv)
.pipe(
lambda x: x[
x.PLINK_PI_HAT.notna() | (x.GRAF_relationship.notna() & x.KING_relationship.notna())
]
) # Keep if we have results from plink or both GRAF and KING. This will limit the number of rows.
lambda x: x[x.PLINK_PI_HAT.notna()]
) # Keep if we have results from plink. This will limit the number of rows.
.merge(ancestry.rename_axis("Sample_ID1").rename("Ancestry1"), on="Sample_ID1", how="left")
.merge(ancestry.rename_axis("Sample_ID2").rename("Ancestry2"), on="Sample_ID2", how="left")
.merge(
Expand Down
90 changes: 9 additions & 81 deletions src/cgr_gwas_qc/workflow/scripts/sample_concordance.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,18 @@
PLINK_concordance, float, Proportion IBS2 ``IBS2 / (IBS0 + IBS1 + IBS2)``
PLINK_is_ge_pi_hat, boolean, True if PI_HAT was greater than ``software_params.pi_hat_cutoff``
PLINK_is_ge_concordance, boolean, True if concordance was greater than ``software_params.dup_concordance_cutoff``
GRAF_HGMR, float, Homozygous Genotype Mismatch Rate (%)
GRAF_AGMR, float, All Genotype Mismatch Rate (%)
GRAF_relationship, string, relationship determined by sample genotypes.
KING_Kinship, float, Estimated kinship coefficient from the SNP data
KING_relationship, string, The assigned relationship based on Kinship
References:
- :mod:`cgr_gwas_qc.workflow.scripts.concordance_table`
- :mod:`cgr_gwas_qc.parsers.king`
- :mod:`cgr_gwas_qc.parsers.graf`
"""
import os
from itertools import combinations
from pathlib import Path
from typing import List, Tuple

import pandas as pd
import typer

from cgr_gwas_qc.parsers import graf, king, sample_sheet
from cgr_gwas_qc.parsers import sample_sheet
from cgr_gwas_qc.typing import PathLike
from cgr_gwas_qc.workflow.scripts import concordance_table

Expand All @@ -64,11 +56,6 @@
"PLINK_concordance": "float",
"PLINK_is_ge_pi_hat": "boolean",
"PLINK_is_ge_concordance": "boolean",
"GRAF_HGMR": "float",
"GRAF_AGMR": "float",
"GRAF_relationship": "string",
"KING_Kinship": "float",
"KING_relationship": "string",
}


Expand All @@ -92,11 +79,6 @@ def read(filename: PathLike):
- PLINK_concordance
- PLINK_is_ge_pi_hat
- PLINK_is_ge_concordance
- GRAF_HGMR
- GRAF_AGMR
- GRAF_relationship
- KING_Kinship
- KING_relationship
"""
return pd.read_csv(filename, dtype=DTYPES)

Expand All @@ -107,12 +89,8 @@ def main(
):
ss = sample_sheet.read(sample_sheet_csv)

# Check if graf or king are empty (no related). If so add expected final headers to file
check_empty_add_headers(graf_file, ["GRAF_HGMR", "GRAF_AGMR", "GRAF_relationship"])
check_empty_add_headers(king_file, ["KING_Kinship", "KING_relationship"])

concordance = (
build(plink_file, graf_file, king_file)
build(plink_file)
.pipe(_add_expected_replicates, ss)
.pipe(_add_discordant_replicates)
.pipe(_add_unexpected_replicates)
Expand All @@ -125,21 +103,9 @@ def main(
)


def check_empty_add_headers(file_path: Path, headers: list):
"""Check if the file is empty"""
if os.path.getsize(file_path) == 0:
with open(file_path, "w") as f:
f.write("\t".join(headers) + "\n")


def build(plink_file: PathLike, graf_file: PathLike, king_file: PathLike):
def build(plink_file: PathLike):
"""Build the main concordance table."""
return (
_plink(plink_file)
.join(_graf(graf_file), how="outer")
.join(_king(king_file), how="outer")
.rename_axis(["Sample_ID1", "Sample_ID2"])
)
return _plink(plink_file).rename_axis(["Sample_ID1", "Sample_ID2"])


def _add_subject(df: pd.DataFrame, ss: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -188,34 +154,18 @@ def _add_expected_replicates(df: pd.DataFrame, ss: pd.DataFrame) -> pd.DataFrame

def _discordant_logic(sr: pd.Series) -> bool:
if not sr.is_expected_replicate:
# not a replicate
# not an expected replicate
return False

if all(
[
pd.isna(sr.PLINK_is_ge_concordance),
pd.isna(sr.GRAF_relationship),
pd.isna(sr.KING_relationship),
]
):
# No metrics so ignore
# Issue 210: If someone is an expected replicates but has no PLINK, GRAF, or KING then these should be flagged as an discordant replicate (True). Before was returning False
if pd.isna(sr.PLINK_is_ge_concordance):
# Issue 210: If someone is an expected replicates but has no PLINK then these should be flagged as an discordant replicate (True). Before was returning False
return True

if pd.notna(sr.PLINK_is_ge_concordance):
# plink call: True if < concordance threshold
# plink call: Mark as discordant (True) if expected replicate and < concordance threshold
return not sr.PLINK_is_ge_concordance

if all([pd.notna(sr.GRAF_relationship), pd.notna(sr.KING_relationship)]):
# We have both graf and king calls: True if both are discordant
return (sr.GRAF_relationship != "ID") & (sr.KING_relationship != "ID")

if pd.notna(sr.GRAF_relationship):
# We only have graf call
return sr.GRAF_relationship != "ID"

# We only have king call
return sr.KING_relationship != "ID"
return False


def _add_discordant_replicates(df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -231,8 +181,6 @@ def _add_unexpected_replicates(df: pd.DataFrame) -> pd.DataFrame:
"""
df["is_unexpected_replicate"] = ~df.is_expected_replicate & (
(df.PLINK_is_ge_concordance.notna() & df.PLINK_is_ge_concordance)
| (df.GRAF_relationship.notna() & (df.GRAF_relationship == "ID"))
| (df.KING_relationship.notna() & (df.KING_relationship == "ID"))
)
return df

Expand All @@ -253,26 +201,6 @@ def _plink(filename: PathLike):
)


def _graf(filename: PathLike):
return (
graf.read_relatedness(filename)
.set_index(["ID1", "ID2"])
.reindex(["HGMR", "AGMR", "relationship"], axis=1)
.rename(
{"HGMR": "GRAF_HGMR", "AGMR": "GRAF_AGMR", "relationship": "GRAF_relationship"}, axis=1,
)
)


def _king(filename: PathLike):
return (
king.read_related(filename)
.set_index(["ID1", "ID2"])
.reindex(["Kinship", "relationship"], axis=1)
.rename({"Kinship": "KING_Kinship", "relationship": "KING_relationship"}, axis=1)
)


def _get_known_replicates(ss: pd.DataFrame) -> List[Tuple[str, str]]:
"""Create list of pairwise combinations of know replicates.
Expand Down
69 changes: 15 additions & 54 deletions tests/workflow/scripts/test_sample_concordance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,11 @@
@pytest.fixture
def concordance():
data_cache = FakeData()
return sample_concordance.build(
data_cache / "cgr/concordance.csv",
data_cache / "graf/relatedness.tsv",
data_cache / "king/king.kin0",
)
return sample_concordance.build(data_cache / "cgr/concordance.csv",)


def test_build(concordance):
assert 4 == concordance.filter(regex="^PLINK").shape[1]
assert 3 == concordance.filter(regex="^GRAF").shape[1]
assert 2 == concordance.filter(regex="^KING").shape[1]
assert ["Sample_ID1", "Sample_ID2"] == concordance.index.names


Expand Down Expand Up @@ -91,38 +85,17 @@ def test_add_discordant_replicate_missing_pair(fake_cfg, concordance):


@pytest.mark.parametrize(
"expected_result,is_rep,plink,graf,king",
"expected_result,is_rep,plink",
[
(False, False, True, "ID", "ID"), # Not an expected replicate
(False, False, True), # Not an expected replicate
# PLINK Concordant
(False, True, True, "ID", "ID"), # all concordant
(False, True, True, "UN", "UN"), # GRAF and KING don't matter
(False, True, True, pd.NA, pd.NA), # GRAF and KING don't matter
(False, True, True), # all concordant
# PLINK Discordant
(True, True, False, "ID", "UN"), # GRAF and KING don't matter
(True, True, False, "UN", "ID"), # GRAF and KING don't matter
(True, True, False, pd.NA, pd.NA), # GRAF and KING don't matter
(True, True, False, "UN", "UN"), # GRAF and KING don't matter
# PLINK Missing
(False, True, pd.NA, "ID", "ID"), # GRAF and KING concordant
(False, True, pd.NA, "ID", "UN"), # GRAF and KING disagree
(False, True, pd.NA, "UN", "ID"), # GRAF and KING disagree
(False, True, pd.NA, "ID", pd.NA), # GRAF concordant
(False, True, pd.NA, pd.NA, "ID"), # KING concordant
(True, True, pd.NA, "UN", pd.NA), # GRAF discordant
(True, True, pd.NA, pd.NA, "UN"), # KING discordant
(True, True, pd.NA, pd.NA, pd.NA), # All missing: flag as discordant replicate
(True, True, False), # GRAF and KING don't matter
],
)
def test_add_discordant_logic(expected_result, is_rep, plink, graf, king):
df = pd.DataFrame(
{
"is_expected_replicate": [is_rep],
"PLINK_is_ge_concordance": [plink],
"GRAF_relationship": [graf],
"KING_relationship": [king],
}
)
def test_add_discordant_logic(expected_result, is_rep, plink):
df = pd.DataFrame({"is_expected_replicate": [is_rep], "PLINK_is_ge_concordance": [plink],})

result = sample_concordance._add_discordant_replicates(df).is_discordant_replicate.squeeze()
assert expected_result == result
Expand All @@ -149,29 +122,17 @@ def test_add_unexpected_replicate_one(fake_cfg, concordance):


@pytest.mark.parametrize(
"expected_result,is_rep,plink,graf,king",
"expected_result,is_rep,plink",
[
(False, True, True, "ID", "ID"), # expected replicate
(True, False, True, "ID", "ID"), # All concordant
(True, False, True, "UN", "UN"), # plink concordant
(True, False, False, "ID", "UN"), # graf concordant
(True, False, False, "UN", "ID"), # king concordant
(True, False, True, pd.NA, pd.NA), # plink concordant with missing
(True, False, pd.NA, "ID", pd.NA), # graf concordant with missing
(True, False, pd.NA, pd.NA, "ID"), # king concordant with missing
(False, False, False, "UN", "UN"), # all discordant
(False, False, pd.NA, pd.NA, pd.NA), # all missing
(False, True, True), # expected replicate
(True, False, True), # All concordant
(True, False, True), # plink concordant
(False, False, False), # all discordant
(False, False, pd.NA), # all missing
],
)
def test_add_unexpected_logic(expected_result, is_rep, plink, graf, king):
df = pd.DataFrame(
{
"is_expected_replicate": [is_rep],
"PLINK_is_ge_concordance": [plink],
"GRAF_relationship": [graf],
"KING_relationship": [king],
}
)
def test_add_unexpected_logic(expected_result, is_rep, plink):
df = pd.DataFrame({"is_expected_replicate": [is_rep], "PLINK_is_ge_concordance": [plink],})

result = sample_concordance._add_unexpected_replicates(df).is_unexpected_replicate.squeeze()
assert expected_result == result

0 comments on commit 3d31024

Please sign in to comment.