Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parallel guided-merge & better report #31

Merged
merged 6 commits into from
Oct 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions .github/workflows/CI.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,14 @@ jobs:
- name: Extract Version from __init__.py
id: extract_version
run: |
VERSION=$(python -c "import re;
import pathlib;
init_file = pathlib.Path('src/snipe/__init__.py');
content = init_file.read_text();
match = re.search(r'__version__\s*=\s*[\'\"]([^\'\"]+)[\'\"]', content);
VERSION=$(python -c "import re; \
import pathlib; \
init_file = pathlib.Path('src/snipe/__init__.py'); \
content = init_file.read_text(); \
match = re.search(r'__version__\s*=\s*[\'\"]([^\'\"]+)[\'\"]', content); \
print(match.group(1) if match else '0.0.0')")
echo "VERSION=$VERSION" >> $GITHUB_ENV

- name: Build package
run: |
hatch build
Expand All @@ -130,8 +131,5 @@ jobs:
with:
tag: "v${{ env.VERSION }}"
name: "v${{ env.VERSION }}"
body: |
Release version ${{ env.VERSION }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

generateReleaseNotes: true
token: ${{ secrets.GITHUB_TOKEN }}
63 changes: 40 additions & 23 deletions src/snipe/api/multisig_reference_QC.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,13 +407,6 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis
# Compute intersection of sample and reference genome
self.logger.debug("Type of sample_sig: %s | Type of reference_sig: %s", sample_sig.sigtype, self.reference_sig.sigtype)
sample_genome = sample_sig & self.reference_sig
# Get stats (call get_sample_stats only once)

# Log hashes and abundances for both sample and reference
# self.logger.debug("Sample hashes: %s", self.sample_sig.hashes)
# self.logger.debug("Sample abundances: %s", self.sample_sig.abundances)
# self.logger.debug("Reference hashes: %s", self.reference_sig.hashes)
# self.logger.debug("Reference abundances: %s", self.reference_sig.abundances)

sample_genome_stats = sample_genome.get_sample_stats

Expand All @@ -422,14 +415,14 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis
"Genomic k-mers total abundance": sample_genome_stats["total_abundance"],
"Genomic k-mers mean abundance": sample_genome_stats["mean_abundance"],
"Genomic k-mers median abundance": sample_genome_stats["median_abundance"],
# Genome coverage index

"Genome coverage index": (
sample_genome_stats["num_hashes"] / len(self.reference_sig)
if len(self.reference_sig) > 0 else 0
if len(self.reference_sig) > 0 and sample_genome_stats["num_hashes"] is not None else 0
),
"Mapping index": (
sample_genome_stats["total_abundance"] / sample_stats["k-mer total abundance"]
if sample_stats["k-mer total abundance"] > 0 else 0
if sample_stats.get("k-mer total abundance", 0) > 0 and sample_genome_stats["total_abundance"] is not None else 0
),
})

Expand Down Expand Up @@ -461,18 +454,21 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis
"Amplicon k-mers median abundance": sample_amplicon_stats["median_abundance"],
"Amplicon coverage index": (
sample_amplicon_stats["num_hashes"] / len(self.amplicon_sig)
if len(self.amplicon_sig) > 0 else 0
if len(self.amplicon_sig) > 0 and sample_amplicon_stats["num_hashes"] is not None else 0
),
})

# ============= RELATIVE STATS =============
amplicon_stats["Relative total abundance"] = (
amplicon_stats["Amplicon k-mers total abundance"] / genome_stats["Genomic k-mers total abundance"]
if genome_stats["Genomic k-mers total abundance"] > 0 else 0
amplicon_stats["Amplicon k-mers total abundance"] / genome_stats["Genomic k-mers total abundance"]
if genome_stats.get("Genomic k-mers total abundance", 0) > 0 and
amplicon_stats.get("Amplicon k-mers total abundance") is not None else 0
)

amplicon_stats["Relative coverage"] = (
amplicon_stats["Amplicon coverage index"] / genome_stats["Genome coverage index"]
if genome_stats["Genome coverage index"] > 0 else 0
if genome_stats.get("Genome coverage index", 0) > 0 and
amplicon_stats.get("Amplicon coverage index") is not None else 0
)

relative_total_abundance = amplicon_stats["Relative total abundance"]
Expand Down Expand Up @@ -500,8 +496,15 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis
sample_nonref_non_singletons = sample_nonref.total_abundance - sample_nonref_singletons
sample_total_abundance = sample_sig.total_abundance

predicted_error_index = sample_nonref_singletons / sample_total_abundance
predicted_contamination_index = sample_nonref_non_singletons / sample_total_abundance
predicted_error_index = (
sample_nonref_singletons / sample_total_abundance
if sample_total_abundance is not None and sample_total_abundance > 0 else 0
)

predicted_contamination_index = (
sample_nonref_non_singletons / sample_total_abundance
if sample_total_abundance is not None and sample_total_abundance > 0 else 0
)

# predict error and contamination index
predicted_error_contamination_index["Predicted contamination index"] = predicted_contamination_index
Expand Down Expand Up @@ -536,8 +539,9 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis
"Median-trimmed Genomic median abundance": median_trimmed_sample_genome_stats["median_abundance"],
"Median-trimmed Genome coverage index": (
median_trimmed_sample_genome_stats["num_hashes"] / len(self.reference_sig)
if len(self.reference_sig) > 0 else 0
if len(self.reference_sig) > 0 and median_trimmed_sample_genome_stats.get("num_hashes") is not None else 0
),

})

if self.amplicon_sig is not None:
Expand All @@ -552,19 +556,24 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis
"Median-trimmed Amplicon median abundance": median_trimmed_sample_amplicon_stats["median_abundance"],
"Median-trimmed Amplicon coverage index": (
median_trimmed_sample_amplicon_stats["num_hashes"] / len(self.amplicon_sig)
if len(self.amplicon_sig) > 0 else 0
if len(self.amplicon_sig) > 0 and median_trimmed_sample_amplicon_stats.get("num_hashes") is not None else 0
),

})
# Additional advanced relative metrics
self.logger.debug("Calculating advanced relative metrics.")
amplicon_stats["Median-trimmed relative coverage"] = (
advanced_stats["Median-trimmed Amplicon coverage index"] / advanced_stats["Median-trimmed Genome coverage index"]
if advanced_stats["Median-trimmed Genome coverage index"] > 0 else 0
if advanced_stats.get("Median-trimmed Genome coverage index", 0) > 0 and
advanced_stats.get("Median-trimmed Amplicon coverage index") is not None else 0
)

amplicon_stats["Median-trimmed relative mean abundance"] = (
advanced_stats["Median-trimmed Amplicon mean abundance"] / advanced_stats["Median-trimmed Genomic mean abundance"]
if advanced_stats["Median-trimmed Genomic mean abundance"] > 0 else 0
if advanced_stats.get("Median-trimmed Genomic mean abundance", 0) > 0 and
advanced_stats.get("Median-trimmed Amplicon mean abundance") is not None else 0
)

# Update amplicon_stats with advanced metrics
amplicon_stats.update({
"Median-trimmed relative coverage": amplicon_stats["Median-trimmed relative coverage"],
Expand Down Expand Up @@ -614,7 +623,8 @@ def sort_chromosomes(chrom_dict):
# calculate the CV for the whole sample
if autosomal_chr_to_mean_abundance:
mean_abundances = np.array(list(autosomal_chr_to_mean_abundance.values()), dtype=np.float64)
cv = np.std(mean_abundances) / np.mean(mean_abundances) if np.mean(mean_abundances) != 0 else 0.0
mean = np.mean(mean_abundances)
cv = np.std(mean_abundances) / mean if mean > 0 and not np.isnan(mean) else 0.0
chrs_stats.update({"Autosomal k-mer mean abundance CV": cv})
self.logger.debug("Calculated Autosomal CV: %f", cv)
else:
Expand Down Expand Up @@ -703,7 +713,10 @@ def sort_chromosomes(chrom_dict):
self.logger.warning("Insufficient k-mers for chrY Coverage score calculation. Setting chrY Coverage score to zero.")
ycoverage = 0.0
else:
ycoverage = (len(ychr_in_sample) / len(ychr_specific_kmers)) / (len(sample_autosomal_sig) / len(autosomals_specific_kmers))
ycoverage = ((len(ychr_in_sample) / len(ychr_specific_kmers)) / (len(sample_autosomal_sig) / len(autosomals_specific_kmers))
if len(ychr_specific_kmers) > 0 and len(autosomals_specific_kmers) > 0 else 0
)


self.logger.debug("Calculated chrY Coverage score: %.4f", ycoverage)
sex_stats.update({"chrY Coverage score": ycoverage})
Expand Down Expand Up @@ -757,7 +770,11 @@ def sort_chromosomes(chrom_dict):
vars_nonref_stats["unexplained variance total abundance"] = sample_nonref.total_abundance
vars_nonref_stats["unexplained variance mean abundance"] = sample_nonref.mean_abundance
vars_nonref_stats["unexplained variance median abundance"] = sample_nonref.median_abundance
vars_nonref_stats["unexplained variance fraction of total abundance"] = sample_nonref.total_abundance / sample_nonref_total_abundance if sample_nonref_total_abundance > 0 else 0.0
vars_nonref_stats["unexplained variance fraction of total abundance"] = (
sample_nonref.total_abundance / sample_nonref_total_abundance
if sample_nonref_total_abundance > 0 and sample_nonref.total_abundance is not None else 0.0
)


self.logger.debug(
"After consuming all vars from the non reference k-mers, the size of the sample signature is: %d hashes, "
Expand Down
Loading