From 728d59a75f44784de8e73ba21bd4a2d812abe5e7 Mon Sep 17 00:00:00 2001 From: akikuno Date: Sun, 24 Dec 2023 13:59:11 +0900 Subject: [PATCH 01/27] bump v0.3.6 --- setup.py | 2 +- src/DAJIN2/main.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index df2400ab..f6360ae6 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setuptools.setup( name="DAJIN2", - version="0.3.5", + version="0.3.6", author="Akihiro Kuno", author_email="akuno@md.tsukuba.ac.jp", description="One-step genotyping tools for targeted long-read sequencing", diff --git a/src/DAJIN2/main.py b/src/DAJIN2/main.py index 57e47cae..b3adba3e 100644 --- a/src/DAJIN2/main.py +++ b/src/DAJIN2/main.py @@ -1,6 +1,7 @@ from __future__ import annotations import os + os.environ["OMP_NUM_THREADS"] = "1" os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" @@ -19,7 +20,7 @@ from DAJIN2.utils import io, config, report_generator, input_validator, multiprocess -DAJIN_VERSION = "0.3.5" +DAJIN_VERSION = "0.3.6" def generate_report(name: str) -> None: From a9399f5d4aaa0f6bcdbea0ead39b2aabf8223f5d Mon Sep 17 00:00:00 2001 From: akikuno Date: Sun, 24 Dec 2023 14:00:27 +0900 Subject: [PATCH 02/27] Added `merge_minor_alleles` to reclassify alleles with less than 10 reads. --- src/DAJIN2/core/classification/classifier.py | 116 ++++++++++++++++--- 1 file changed, 102 insertions(+), 14 deletions(-) diff --git a/src/DAJIN2/core/classification/classifier.py b/src/DAJIN2/core/classification/classifier.py index e4de8603..0fc18d2b 100644 --- a/src/DAJIN2/core/classification/classifier.py +++ b/src/DAJIN2/core/classification/classifier.py @@ -1,32 +1,33 @@ from __future__ import annotations -import midsv from pathlib import Path from itertools import groupby +from collections import defaultdict +from DAJIN2.utils import io -def _calc_match(CSSPLIT: str) -> float: - match_score = CSSPLIT.count("=") - match_score -= CSSPLIT.count("+") # insertion - match_score -= sum(cs.islower() for cs in CSSPLIT) # inversion - cssplit = CSSPLIT.split(",") - return match_score / len(cssplit) +def calc_match(cssplit: str) -> float: + match_score = cssplit.count("=") + match_score -= cssplit.count("+") # insertion + match_score -= sum(cs.islower() for cs in cssplit) # inversion + return match_score / len(cssplit.split(",")) -def _score_allele(TEMPDIR: Path, allele: str, SAMPLE_NAME: str) -> list[dict]: - midsv_sample = midsv.read_jsonl(Path(TEMPDIR, SAMPLE_NAME, "midsv", f"{allele}.json")) - scored_alleles = [] +def score_allele(path_midsv: Path, allele: str) -> list[dict]: + midsv_sample = io.read_jsonl(path_midsv) + + scored_alleles = [] for dict_midsv in midsv_sample: - score = _calc_match(dict_midsv["CSSPLIT"]) + score = calc_match(dict_midsv["CSSPLIT"]) dict_midsv.update({"SCORE": score, "ALLELE": allele}) scored_alleles.append(dict_midsv) return scored_alleles -def _extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[dict]: +def extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[dict]: alleles_with_max_score = [] score_of_each_alleles.sort(key=lambda x: x["QNAME"]) for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): @@ -36,6 +37,90 @@ def _extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[d return alleles_with_max_score +########################################################## +# merge minor alleles +########################################################## + + +def count_alleles(score_of_each_alleles: list[dict]) 
-> dict[str, int]: + score_of_each_alleles.sort(key=lambda x: x["QNAME"]) + allele_counts = defaultdict(int) + for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): + allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] + allele_counts[allele_predicted] += 1 + return allele_counts + + +# マイナーアレル, メジャーアレルの同定 + + +def extract_minor_alleles( + count_allele: dict[str, int], major_alleles: set[str], threshold_readnumber: int = 10 +) -> dict[str, int]: + return { + allele: value + for allele, value in count_allele.items() + if allele not in major_alleles and value < threshold_readnumber + } + + +def update_major_allele( + count_allele: dict[str, int], major_alleles: set[str], threshold_readnumber: int = 10 +) -> set[str]: + new_major_alleles = { + allele + for allele, value in count_allele.items() + if allele not in major_alleles and value >= threshold_readnumber + } + return major_alleles | new_major_alleles + + +# score_of_each_allelesから、マイナーアレルに該当するリードを抽出する + + +def extract_minor_groups(score_of_each_alleles: list[dict], minor_alleles: dict[str, int]) -> list[dict]: + score_on_minor_alleles = [] + score_of_each_alleles.sort(key=lambda x: [x["QNAME"]]) + for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): + group = list(group) + allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] + if allele_predicted in minor_alleles: + score_on_minor_alleles.extend(group) + return score_on_minor_alleles + + +def merge_minor_alleles(score_of_each_alleles, threshold_readnumber: int = 10) -> list[dict]: + score_of_each_alleles.sort(key=lambda x: [x["QNAME"]]) + major_alleles = set() + minor_alleles = dict() + + allele_counts = count_alleles(score_of_each_alleles) + minor_alleles = extract_minor_alleles(allele_counts, major_alleles, threshold_readnumber) + major_alleles = update_major_allele(allele_counts, major_alleles, threshold_readnumber) + + minor_alleles_sorted = sorted(minor_alleles.items(), key=lambda x: x[1]) + + score_on_minor_alleles = extract_minor_groups(score_of_each_alleles, minor_alleles) + + minor_allele_record = set() + for minor_allele, _ in minor_alleles_sorted: + minor_allele_record.add(minor_allele) + score_on_minor_alleles.sort(key=lambda x: x["QNAME"]) + for _, group in groupby(score_on_minor_alleles, key=lambda x: x["QNAME"]): + group = list(group) + allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] + if allele_predicted != minor_allele: + continue + for g in group: + if g["ALLELE"] in minor_allele_record: + g["SCORE"] = float("-inf") + allele_counts = count_alleles(score_on_minor_alleles) + minor_alleles = extract_minor_alleles(allele_counts, major_alleles, threshold_readnumber) + major_alleles = update_major_allele(allele_counts, major_alleles, threshold_readnumber) + score_on_minor_alleles = extract_minor_groups(score_on_minor_alleles, minor_alleles) + return score_of_each_alleles + + ########################################################## # main ########################################################## @@ -44,6 +129,9 @@ def _extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[d def classify_alleles(TEMPDIR: Path, FASTA_ALLELES: dict, SAMPLE_NAME: str) -> list[dict]: score_of_each_alleles = [] for allele in FASTA_ALLELES: - score_of_each_alleles.extend(_score_allele(TEMPDIR, allele, SAMPLE_NAME)) + path_midsv = Path(TEMPDIR, SAMPLE_NAME, "midsv", f"{allele}.json") + score_of_each_alleles.extend(score_allele(path_midsv, allele)) + + score_of_each_alleles_merged = 
merge_minor_alleles(score_of_each_alleles) - return _extract_alleles_with_max_score(score_of_each_alleles) + return extract_alleles_with_max_score(score_of_each_alleles_merged) From 4bd9f7dd806d192475d8d4f20c1e50c37281d64e Mon Sep 17 00:00:00 2001 From: akikuno Date: Sun, 24 Dec 2023 14:05:25 +0900 Subject: [PATCH 03/27] Added the function `merge_minor_cluster` to revert labels clustered with less than 10 reads back to the previous labels. --- docs/ROADMAP.md | 61 ++++++++++++++----- src/DAJIN2/core/clustering/clustering.py | 22 +++++-- src/DAJIN2/core/clustering/label_extractor.py | 29 +++++---- src/DAJIN2/core/clustering/label_merger.py | 45 +++++++++++--- .../core/preprocess/insertions_to_fasta.py | 1 + tests/src/clustering/test_label_merger.py | 22 +++++-- 6 files changed, 132 insertions(+), 48 deletions(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 9f45eb06..e2fb9ad7 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -1,8 +1,51 @@ # Development Logs of DAJIN2 + + +# v0.3.6 (yyyy-mm-dd) + +## 📝 Documentation +## 🚀 Features + +### Classification + +#### Skipped the classification of minor alleles to suppress excessive subdivision of alleles. + ++ Added `merge_minor_alleles` to reclassify alleles with less than 10 reads. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/a9399f5d4aaa0f6bcdbea0ead39b2aabf8223f5d) + +### Clustering + +#### Skipped the clustering of minor alleles to suppress excessive subdivision of alleles. + ++ Added the function `merge_minor_cluster` to revert labels clustered with less than 10 reads back to the previous labels. + [Commit Detail](https://github.com/akikuno/DAJIN2/commit/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx) + + +## 🐛 Bug Fixes +## 🔧 Maintenance +## ⛔️ Deprecated + + +# 💡 Future Tasks + ++ Remove minor alleles with predicted insertion ++ Enhance the Clarity of Insertion Allele Identification. ++ Develop and Integrate Inversion Detection Capability + ------------- -# v0.3.5 (2023-12-23) +# Past Logs + +
+ v0.3.5 (2023-12-23) ## 📝 Documentation @@ -46,18 +89,4 @@ + None -------------- - -# 💡 Future Tasks - -+ Remove minor alleles with predicted insertion -+ Enhance the Clarity of Insertion Allele Identification. -+ Develop and Integrate Inversion Detection Capability - - +
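The `clustering.py` change below stops the cluster search once the labeling has stabilized, judging stability with the adjusted Rand index between the previous and current labels (`metrics.adjusted_rand_score`); per the added docstring, scores of 0.95 or higher are treated as "no meaningful change". A minimal sketch of how that score behaves, using synthetic labels rather than DAJIN2 output:

```python
from sklearn import metrics

labels_previous = [0, 0, 0, 1, 1, 1, 2, 2]
labels_current = [0, 0, 0, 1, 1, 1, 2, 0]  # a single read switches cluster

score = metrics.adjusted_rand_score(labels_previous, labels_current)
print(round(score, 2))  # 0.65: below 0.95, so the labeling is still changing and the search continues
```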
diff --git a/src/DAJIN2/core/clustering/clustering.py b/src/DAJIN2/core/clustering/clustering.py index d0575f57..9fa85c26 100644 --- a/src/DAJIN2/core/clustering/clustering.py +++ b/src/DAJIN2/core/clustering/clustering.py @@ -22,20 +22,30 @@ ############################################################################### -def optimize_labels(X: spmatrix, coverage_sample, coverage_control) -> list[int]: +def count_number_of_clusters(labels_control: list[int], coverage_control: int) -> int: + """Reads < 1% in the control are considered clustering errors and are not counted""" + return sum(1 for reads in Counter(labels_control).values() if reads / coverage_control > 0.01) + + +def optimize_labels(X: spmatrix, coverage_sample: int, coverage_control: int) -> list[int]: labels_previous = list(range(coverage_sample)) for i in range(1, coverage_sample): np.random.seed(seed=1) labels_all = BisectingKMeans(n_clusters=i, random_state=1).fit_predict(X).tolist() + labels_sample = labels_all[:coverage_sample] labels_control = labels_all[coverage_sample:] - labels_current = merge_labels(labels_control, labels_sample) + labels_current = merge_labels(labels_control, labels_sample, labels_previous) # print(i, Counter(labels_sample), Counter(labels_control), Counter(labels_current)) # ! DEBUG - # Reads < 1% in the control are considered clustering errors and are not counted - count_control = Counter(labels_control) - num_labels_control = sum(1 for reads in count_control.values() if reads / coverage_control > 0.01) + + num_labels_control = count_number_of_clusters(labels_control, coverage_control) mutual_info = metrics.adjusted_rand_score(labels_previous, labels_current) - # Report the number of clusters in SAMPLE when the number of clusters in CONTROL is split into more than one. + + """ + Return the number of clusters when: + - the number of clusters in control is split into more than one. + - the mutual information between the current and previous labels is high enough (= similar). 
+ """ if num_labels_control >= 2: return labels_previous if 0.95 <= mutual_info <= 1.0: diff --git a/src/DAJIN2/core/clustering/label_extractor.py b/src/DAJIN2/core/clustering/label_extractor.py index 01539e66..0a00e6fa 100644 --- a/src/DAJIN2/core/clustering/label_extractor.py +++ b/src/DAJIN2/core/clustering/label_extractor.py @@ -17,25 +17,30 @@ def extract_labels(classif_sample, TEMPDIR, SAMPLE_NAME, CONTROL_NAME) -> list[d strand_bias = is_strand_bias(Path(TEMPDIR, CONTROL_NAME, "midsv", "control.json")) classif_sample.sort(key=lambda x: x["ALLELE"]) for allele, group in groupby(classif_sample, key=lambda x: x["ALLELE"]): - path_mutation_loci = Path(TEMPDIR, SAMPLE_NAME, "mutation_loci", f"{allele}.pickle") - mutation_loci: list[set[str]] = io.load_pickle(path_mutation_loci) - if all(m == set() for m in mutation_loci): - max_label += 1 - labels_all.extend([max_label] * len(classif_sample)) - continue - - path_knockin_loci = Path(TEMPDIR, SAMPLE_NAME, "knockin_loci", f"{allele}.pickle") - knockin_loci: set[int] = io.load_pickle(path_knockin_loci) if path_knockin_loci.exists() else set() - + """Cache data to temporary files""" RANDOM_INT = random.randint(0, 10**10) - path_sample = Path(TEMPDIR, SAMPLE_NAME, "clustering", f"{allele}_{RANDOM_INT}.json") if Path(TEMPDIR, CONTROL_NAME, "midsv", f"{allele}.json").exists(): path_control = Path(TEMPDIR, CONTROL_NAME, "midsv", f"{allele}.json") else: path_control = Path(TEMPDIR, CONTROL_NAME, "midsv", f"{allele}_{SAMPLE_NAME}.json") + path_sample = Path(TEMPDIR, SAMPLE_NAME, "clustering", f"{allele}_{RANDOM_INT}.json") io.write_jsonl(data=group, file_path=path_sample) - """Prepare and write clustering data to temporary files.""" + """Load mutation_loci and knockin_loci.""" + path_mutation_loci = Path(TEMPDIR, SAMPLE_NAME, "mutation_loci", f"{allele}.pickle") + mutation_loci: list[set[str]] = io.load_pickle(path_mutation_loci) + path_knockin_loci = Path(TEMPDIR, SAMPLE_NAME, "knockin_loci", f"{allele}.pickle") + knockin_loci: set[int] = io.load_pickle(path_knockin_loci) if path_knockin_loci.exists() else set() + + """Skip clustering when the number of reads is too small or there is no mutation.""" + read_numbers = io.count_newlines(path_sample) + is_no_mutation = all(m == set() for m in mutation_loci) + if read_numbers <= 10 or is_no_mutation: + max_label += 1 + labels_all.extend([max_label] * read_numbers) + continue + + """Calculate scores to temporary files.""" mutation_score: list[dict[str, float]] = make_score(path_sample, path_control, mutation_loci, knockin_loci) scores_sample = annotate_score(path_sample, mutation_score, mutation_loci) diff --git a/src/DAJIN2/core/clustering/label_merger.py b/src/DAJIN2/core/clustering/label_merger.py index 90159cfe..81f7b8ac 100644 --- a/src/DAJIN2/core/clustering/label_merger.py +++ b/src/DAJIN2/core/clustering/label_merger.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections import Counter +import numpy as np def calculate_label_percentages(labels: list[int]) -> dict[int, float]: @@ -24,16 +25,38 @@ def merge_mixed_cluster(labels_control: list[int], labels_sample: list[int], thr return labels_merged -def merge_minor_cluster(labels_sample: list[int], threshold: float = 0.5) -> list[int]: +def map_clusters_to_previous(labels_sample: list[int], labels_previous: list[int]) -> dict[int, int]: + """ + Determine which cluster in labels_previous corresponds to each cluster in labels_sample. 
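+    For example (matching the accompanying test), labels_sample = [1, 1, 1, 0, 2, 0] and labels_previous = [0, 0, 0, 1, 1, 1] give {1: 0, 0: 1, 2: 1}.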
+ """ + correspondence = {} + for current_label in np.unique(labels_sample): + # Get the indices of data points belonging to a specific cluster in labels_sample + indices = np.where(labels_sample == current_label)[0] + # Identify the cluster in labels_previous that these data points belong to + previous_label = np.bincount(np.array(labels_previous)[indices]).argmax() + correspondence[current_label] = previous_label + + return correspondence + + +def merge_minor_cluster( + labels_sample: list[int], + labels_previous: list[int], + threshold_percentage: float = 0.5, + threshold_readnumber: int = 10, +) -> list[int]: """Merge labels in sample if they appear less than 'threshold' percentage.""" - labels_merged = labels_sample.copy() - label_percentages_sample = calculate_label_percentages(labels_sample) - minor_labels = {label for label, percent in label_percentages_sample.items() if percent < threshold} - most_common_label = Counter(labels_sample).most_common(1)[0][0] - for i, label in enumerate(labels_sample): - if label in minor_labels: - labels_merged[i] = most_common_label + # Find minor labels + label_percentages = calculate_label_percentages(labels_sample) + minor_labels_percentage = {label for label, percent in label_percentages.items() if percent < threshold_percentage} + minor_labels_readnumber = {label for label, num in Counter(labels_sample).items() if num <= threshold_readnumber} + minor_labels = minor_labels_percentage | minor_labels_readnumber + + correspondence = map_clusters_to_previous(labels_sample, labels_previous) + + labels_merged = [correspondence[label] if label in minor_labels else label for label in labels_sample] return labels_merged @@ -43,7 +66,9 @@ def merge_minor_cluster(labels_sample: list[int], threshold: float = 0.5) -> lis ############################################################################### -def merge_labels(labels_control: list[int], labels_sample: list[int]) -> list[int]: +def merge_labels(labels_control: list[int], labels_sample: list[int], labels_previous: list[int]) -> list[int]: labels_merged = merge_mixed_cluster(labels_control, labels_sample) - labels_merged = merge_minor_cluster(labels_merged) + labels_merged = merge_minor_cluster( + labels_merged, labels_previous, threshold_percentage=0.5, threshold_readnumber=10 + ) return labels_merged diff --git a/src/DAJIN2/core/preprocess/insertions_to_fasta.py b/src/DAJIN2/core/preprocess/insertions_to_fasta.py index d990be6f..d61eec3b 100644 --- a/src/DAJIN2/core/preprocess/insertions_to_fasta.py +++ b/src/DAJIN2/core/preprocess/insertions_to_fasta.py @@ -173,6 +173,7 @@ def clustering_insertions(scores) -> list[int]: if X.max() - X.min() == 0: return [1] * len(X) X = (X - X.min()) / (X.max() - X.min()) + np.random.seed(seed=1) clustering = MeanShift().fit(X) labels = clustering.labels_ return labels.tolist() diff --git a/tests/src/clustering/test_label_merger.py b/tests/src/clustering/test_label_merger.py index 787ce932..6dbedc0a 100644 --- a/tests/src/clustering/test_label_merger.py +++ b/tests/src/clustering/test_label_merger.py @@ -1,4 +1,5 @@ from DAJIN2.core.clustering.label_merger import merge_mixed_cluster +from DAJIN2.core.clustering.label_merger import map_clusters_to_previous from DAJIN2.core.clustering.label_merger import merge_minor_cluster @@ -10,9 +11,22 @@ def test_merge_mixed_cluster(): assert merge_mixed_cluster(labels_control, labels_sample, threshold=20) == expected_output +def test_map_clusters_to_previous(): + labels_sample = [1, 1, 1, 0, 2, 0] + labels_previous = [0, 0, 0, 1, 
1, 1] + expected_correspondence = {1: 0, 0: 1, 2: 1} + + assert map_clusters_to_previous(labels_sample, labels_previous) == expected_correspondence + + def test_merge_minor_cluster(): + """ + In labels_sample, label 1 occurs 37.5% of the time, label 3 occurs 25%, and labels 2, 4, 5 occur 12.5% each. Therefore, labels 2, 4, 5 should be merged into previous lables + """ labels_sample = [1, 1, 1, 2, 3, 3, 4, 5] - # In labels_sample, label 1 occurs 37.5% of the time, label 3 occurs 25%, and labels 2, 4, 5 occur 12.5% each. - # Therefore, labels 2, 4, 5 should be merged into label 1 - expected_output = [1, 1, 1, 1, 3, 3, 1, 1] - assert merge_minor_cluster(labels_sample, threshold=20) == expected_output + labels_previous = [0, 0, 0, 0, 1, 1, 2, 2] + expected_output = [0, 0, 0, 0, 1, 1, 2, 2] + assert ( + merge_minor_cluster(labels_sample, labels_previous, threshold_percentage=20, threshold_readnumber=100) + == expected_output + ) From cefed0ff4d04282b9915486be07de85b2b77b657 Mon Sep 17 00:00:00 2001 From: akikuno Date: Mon, 25 Dec 2023 11:09:22 +0900 Subject: [PATCH 04/27] Added a short form of installation guide to TROUBLESHOOTING.md --- docs/TROUBLESHOOTING.md | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index eb232ec6..fff79ac1 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -2,13 +2,31 @@ ## Installation +### Long Story Short + +```bash +# Update conda +conda update -n base conda -y + +# Setup of Bioconda +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +conda config --set channel_priority strict + +# Install DAJIN2 to a virtual environment +conda create -n env-dajin2 -y +conda activate env-dajin2 +conda install -c bioconda DAJIN2 -y +``` + ### Prerequisites Before installing `DAJIN2`, please ensure that your system meets the following requirements: - Python >=3.7 - Unix-like environment (Linux, macOS, WSL, etc.) -- [conda](https://docs.conda.io/en/latest/) or [mamba](https://mamba.readthedocs.io/en/latest/) is highly recommended for managing dependencies +- [Conda](https://docs.conda.io/en/latest/) is highly recommended for managing dependencies - If using pip, access to administrative privileges or the ability to install packages globally ### Dependencies @@ -24,15 +42,25 @@ Before installing `DAJIN2`, please ensure that your system meets the following r We strongly recommend using Conda or Mamba for installation, as they efficiently manage the complex dependencies of `pysam` and `htslib`: -1. Install Conda or Mamba if you haven't already. You can download Conda from [here](https://docs.conda.io/en/latest/miniconda.html). +1. Install the latest Conda if you have not already. You can download Conda from [here](https://docs.conda.io/en/latest/miniconda.html). + +2. Setup the [Bioconda](https://bioconda.github.io/) channel: + +```bash +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +conda config --set channel_priority strict +``` + +3. Create a new virtual environment (optional but recommended): -2. Create a new environment (optional but recommended): ```bash -conda create -n DAJIN2-env -conda activate DAJIN2-env +conda create -n env-dajin2 +conda activate env-dajin2 ``` -3. Install `DAJIN2`: +4. 
Install `DAJIN2` in the virtual environment: ```bash conda install -c bioconda DAJIN2 ``` From a6bdf50b111fca6e565117e7d6cb12e199e09fd8 Mon Sep 17 00:00:00 2001 From: akikuno Date: Mon, 25 Dec 2023 14:21:44 +0900 Subject: [PATCH 05/27] specify python=3.10 --- docs/TROUBLESHOOTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index fff79ac1..d5f8aa33 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -15,7 +15,7 @@ conda config --add channels conda-forge conda config --set channel_priority strict # Install DAJIN2 to a virtual environment -conda create -n env-dajin2 -y +conda create -n env-dajin2 python=3.10 -y conda activate env-dajin2 conda install -c bioconda DAJIN2 -y ``` From 93c21110d5d965cb870d6198922b6599c5609242 Mon Sep 17 00:00:00 2001 From: akikuno Date: Tue, 26 Dec 2023 14:09:21 +0900 Subject: [PATCH 06/27] Add line breaks to make the separation of the code more understandable --- src/DAJIN2/core/consensus/clust_formatter.py | 1 + src/DAJIN2/core/consensus/consensus.py | 3 +-- src/DAJIN2/core/core.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/DAJIN2/core/consensus/clust_formatter.py b/src/DAJIN2/core/consensus/clust_formatter.py index c221cab2..2f1ce54f 100644 --- a/src/DAJIN2/core/consensus/clust_formatter.py +++ b/src/DAJIN2/core/consensus/clust_formatter.py @@ -59,6 +59,7 @@ def cache_normalized_indels(ARGS, path_midsv_sample: Path) -> None: def cache_mutation_loci(ARGS, clust_sample: list[dict]) -> None: + # Separate clusters by label and cache them clust_sample.sort(key=lambda x: [x["ALLELE"], x["LABEL"]]) path_consensus = Path(ARGS.tempdir, ARGS.sample_name, "consensus") diff --git a/src/DAJIN2/core/consensus/consensus.py b/src/DAJIN2/core/consensus/consensus.py index 9dc54607..ff5753c0 100644 --- a/src/DAJIN2/core/consensus/consensus.py +++ b/src/DAJIN2/core/consensus/consensus.py @@ -218,8 +218,7 @@ def call_consensus(tempdir: Path, sample_name: str, clust_sample: list[dict]) -> for (allele, label), group in groupby(clust_sample, key=lambda x: [x["ALLELE"], x["LABEL"]]): clust = list(group) - prefix = f"{allele}_{label}" - mutation_loci = io.load_pickle(Path(path_consensus, f"{prefix}_mutation_loci.pickle")) + mutation_loci = io.load_pickle(Path(path_consensus, f"{allele}_{label}_mutation_loci.pickle")) cssplits = [cs["CSSPLIT"].split(",") for cs in clust] cons_percentage = call_percentage(cssplits, mutation_loci) diff --git a/src/DAJIN2/core/core.py b/src/DAJIN2/core/core.py index 9312c608..a50b48cd 100644 --- a/src/DAJIN2/core/core.py +++ b/src/DAJIN2/core/core.py @@ -115,6 +115,7 @@ def format_inputs(arguments: dict) -> FormattedInputs: def execute_control(arguments: dict): + logger.info(f"{arguments['control']} is now processing...") ########################################################### @@ -172,6 +173,7 @@ def execute_control(arguments: dict): def execute_sample(arguments: dict): + logger.info(f"{arguments['sample']} is now processing...") ########################################################### @@ -236,7 +238,9 @@ def execute_sample(arguments: dict): ######################################################################## logger.info(f"Classify {arguments['sample']}...") + classif_sample = classification.classify_alleles(ARGS.tempdir, ARGS.fasta_alleles, ARGS.sample_name) + io.save_pickle(classif_sample, Path(ARGS.tempdir, ARGS.sample_name, "classification", "classif_sample.pickle")) 
######################################################################## From c91baab7128ff06b955a02d0415a777ef173909f Mon Sep 17 00:00:00 2001 From: akikuno Date: Tue, 26 Dec 2023 14:09:27 +0900 Subject: [PATCH 07/27] update ROADMAP --- docs/ROADMAP.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index e2fb9ad7..e8935c99 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -10,9 +10,18 @@ + [ ] XXX [Commit Detail](https://github.com/akikuno/DAJIN2/commit/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx) --> + # v0.3.6 (yyyy-mm-dd) ## 📝 Documentation + ++ Added a quick quide of installation to TROUBLESHOOTING.md [Commit Detail](https://github.com/akikuno/DAJIN2/commit/cefed0ff4d04282b9915486be07de85b2b77b657) + ## 🚀 Features ### Classification @@ -26,13 +35,21 @@ #### Skipped the clustering of minor alleles to suppress excessive subdivision of alleles. + Added the function `merge_minor_cluster` to revert labels clustered with less than 10 reads back to the previous labels. - [Commit Detail](https://github.com/akikuno/DAJIN2/commit/xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx) + [Commit Detail](https://github.com/akikuno/DAJIN2/commit/4bd9f7dd806d192475d8d4f20c1e50c37281d64e) ## 🐛 Bug Fixes + +### Consensus + ++ 大型欠失の内部で欠失が反映されないバグを修正 [Commit Detail](https://github.com/akikuno/DAJIN2/commit/XXX) + ## 🔧 Maintenance + + ## ⛔️ Deprecated +--- # 💡 Future Tasks From 3f66f0b2eda0525e831f72de799c86e314214e1c Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 07:54:30 +0900 Subject: [PATCH 08/27] Remove the `is_insertion` arguments at `preprocess.cache_mutation_loci` because it is no longer being used. --- src/DAJIN2/core/core.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/DAJIN2/core/core.py b/src/DAJIN2/core/core.py index a50b48cd..6e1f77b9 100644 --- a/src/DAJIN2/core/core.py +++ b/src/DAJIN2/core/core.py @@ -115,7 +115,6 @@ def format_inputs(arguments: dict) -> FormattedInputs: def execute_control(arguments: dict): - logger.info(f"{arguments['control']} is now processing...") ########################################################### @@ -157,7 +156,7 @@ def execute_control(arguments: dict): ########################################################### # Prepare data to `extract mutaion loci` ########################################################### - preprocess.cache_mutation_loci(ARGS, is_control=True, is_insertion=False) + preprocess.cache_mutation_loci(ARGS, is_control=True) ########################################################### # Output BAM files @@ -173,7 +172,6 @@ def execute_control(arguments: dict): def execute_sample(arguments: dict): - logger.info(f"{arguments['sample']} is now processing...") ########################################################### @@ -204,7 +202,7 @@ def execute_sample(arguments: dict): # Extract mutation loci # ============================================================ preprocess.extract_knockin_loci(ARGS.tempdir, ARGS.sample_name) - preprocess.cache_mutation_loci(ARGS, is_control=False, is_insertion=False) + preprocess.cache_mutation_loci(ARGS, is_control=False) # ============================================================ # Detect and align insertion alleles @@ -227,9 +225,9 @@ def execute_sample(arguments: dict): preprocess.generate_midsv(ARGS, is_control=True, is_insertion=True) preprocess.generate_midsv(ARGS, is_control=False, is_insertion=True) # Reculculate mutation loci - preprocess.cache_mutation_loci(ARGS, is_control=True, 
is_insertion=True) + preprocess.cache_mutation_loci(ARGS, is_control=True) preprocess.extract_knockin_loci(ARGS.tempdir, ARGS.sample_name) - preprocess.cache_mutation_loci(ARGS, is_control=False, is_insertion=True) + preprocess.cache_mutation_loci(ARGS, is_control=False) io.save_pickle(ARGS.fasta_alleles, Path(ARGS.tempdir, ARGS.sample_name, "fasta", "fasta_alleles.pickle")) @@ -263,11 +261,11 @@ def execute_sample(arguments: dict): logger.info(f"Consensus calling of {arguments['sample']}...") - consensus.cache_mutation_loci(ARGS, clust_sample) - # Downsampling to 1000 reads in each LABEL clust_subset_sample = consensus.subset_clust(clust_sample, 1000) + consensus.cache_mutation_loci(ARGS, clust_subset_sample) + cons_percentage, cons_sequence = consensus.call_consensus(ARGS.tempdir, ARGS.sample_name, clust_subset_sample) allele_names = consensus.call_allele_name(cons_sequence, cons_percentage, ARGS.fasta_alleles) @@ -278,7 +276,7 @@ def execute_sample(arguments: dict): RESULT_SAMPLE.sort(key=lambda x: x["LABEL"]) io.save_pickle(cons_percentage, Path(ARGS.tempdir, ARGS.sample_name, "consensus", "cons_percentage.pickle")) - io.save_pickle(cons_sequence, Path(ARGS.tempdir, ARGS.sample_name, "consensus", "conse_sequence.pickle")) + io.save_pickle(cons_sequence, Path(ARGS.tempdir, ARGS.sample_name, "consensus", "cons_sequence.pickle")) ######################################################################## # Output Report:RESULT/FASTA/HTML/BAM From 6b83646aacaee31bc4f39aeed8f6dbb25689cc28 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 10:37:50 +0900 Subject: [PATCH 09/27] move `consensus.cache_mutation_loci` to `consensus.mutation_extractor` --- src/DAJIN2/core/consensus/__init__.py | 3 +- src/DAJIN2/core/consensus/clust_formatter.py | 75 ------------ .../core/consensus/mutation_extractor.py | 114 ++++++++++++++++++ 3 files changed, 116 insertions(+), 76 deletions(-) create mode 100644 src/DAJIN2/core/consensus/mutation_extractor.py diff --git a/src/DAJIN2/core/consensus/__init__.py b/src/DAJIN2/core/consensus/__init__.py index 154c97e6..a22db208 100644 --- a/src/DAJIN2/core/consensus/__init__.py +++ b/src/DAJIN2/core/consensus/__init__.py @@ -2,4 +2,5 @@ from DAJIN2.core.consensus.name_handler import call_allele_name from DAJIN2.core.consensus.name_handler import update_key_by_allele_name from DAJIN2.core.consensus.name_handler import add_key_by_allele_name -from DAJIN2.core.consensus.clust_formatter import subset_clust, cache_mutation_loci +from DAJIN2.core.consensus.clust_formatter import subset_clust +from DAJIN2.core.consensus.mutation_extractor import cache_mutation_loci diff --git a/src/DAJIN2/core/consensus/clust_formatter.py b/src/DAJIN2/core/consensus/clust_formatter.py index 2f1ce54f..8d2e9cd0 100644 --- a/src/DAJIN2/core/consensus/clust_formatter.py +++ b/src/DAJIN2/core/consensus/clust_formatter.py @@ -1,14 +1,7 @@ from __future__ import annotations -from pathlib import Path from itertools import groupby -from sklearn.cluster import MiniBatchKMeans - -from DAJIN2.utils import io -from DAJIN2.core.preprocess.mutation_extractor import summarize_indels, extract_mutation_loci -from DAJIN2.core.consensus.similarity_searcher import cache_selected_control_by_similarity - def subset_clust(clust_sample: list[dict], num: int = 1000) -> list[dict]: clust_subset_sample = [] @@ -16,71 +9,3 @@ def subset_clust(clust_sample: list[dict], num: int = 1000) -> list[dict]: for _, group in groupby(clust_sample, key=lambda x: x["LABEL"]): clust_subset_sample.extend(list(group)[:num]) 
return clust_subset_sample - - -########################################################### -# cache_mutation_loci -########################################################### - - -def get_thresholds(path_indels_normalized_sample, path_indels_normalized_control) -> dict[str, float]: - indels_normalized_sample = io.load_pickle(path_indels_normalized_sample) - indels_normalized_control = io.load_pickle(path_indels_normalized_control) - thresholds = dict() - for mut in {"+", "-", "*"}: - values_sample = indels_normalized_sample[mut] - values_control = indels_normalized_control[mut] - values_subtract = values_sample - values_control - kmeans = MiniBatchKMeans(n_clusters=2, random_state=0).fit(values_subtract.reshape(-1, 1)) - threshold = kmeans.cluster_centers_.mean() - thresholds[mut] = max(threshold, 0.05) - return thresholds - - -def cache_normalized_indels(ARGS, path_midsv_sample: Path) -> None: - allele, label, *_ = path_midsv_sample.stem.split("_") - sequence = ARGS.fasta_alleles[allele] - - if Path(ARGS.tempdir, ARGS.control_name, "midsv", f"{allele}.json").exists(): - path_midsv_control = Path(ARGS.tempdir, ARGS.control_name, "midsv", f"{allele}.json") - else: - path_midsv_control = Path(ARGS.tempdir, ARGS.control_name, "midsv", f"{allele}_{ARGS.sample_name}.json") - - cache_selected_control_by_similarity(path_midsv_control, path_midsv_sample, path_midsv_sample.parent) - - path_midsv_filtered_control = Path(path_midsv_sample.parent, f"{allele}_{label}_control.jsonl") - - _, indels_normalized_sample = summarize_indels(path_midsv_sample, sequence) - _, indels_normalized_control = summarize_indels(path_midsv_filtered_control, sequence) - - path_consensus = Path(ARGS.tempdir, ARGS.sample_name, "consensus") - io.save_pickle(indels_normalized_sample, Path(path_consensus, f"{allele}_{label}_normalized_sample.pickle")) - io.save_pickle(indels_normalized_control, Path(path_consensus, f"{allele}_{label}_normalized_control.pickle")) - - -def cache_mutation_loci(ARGS, clust_sample: list[dict]) -> None: - - # Separate clusters by label and cache them - clust_sample.sort(key=lambda x: [x["ALLELE"], x["LABEL"]]) - path_consensus = Path(ARGS.tempdir, ARGS.sample_name, "consensus") - for (allele, label), group in groupby(clust_sample, key=lambda x: [x["ALLELE"], x["LABEL"]]): - io.write_jsonl(group, Path(path_consensus, f"{allele}_{label}_sample.jsonl")) - - # Cache normalized indels counts - for path_midsv_sample in path_consensus.glob("*_sample.jsonl"): - cache_normalized_indels(ARGS, path_midsv_sample) - - # Extract and cache mutation loci - for path_indels_normalized_sample in path_consensus.glob("*_normalized_sample.pickle"): - allele, label, *_ = path_indels_normalized_sample.stem.split("_") - path_indels_normalized_control = Path(path_consensus, f"{allele}_{label}_normalized_control.pickle") - sequence = ARGS.fasta_alleles[allele] - path_knockin = Path(ARGS.tempdir, ARGS.sample_name, "knockin_loci", f"{allele}.pickle") - - thresholds = get_thresholds(path_indels_normalized_sample, path_indels_normalized_control) - - mutation_loci = extract_mutation_loci( - sequence, path_indels_normalized_sample, path_indels_normalized_control, path_knockin, thresholds - ) - - io.save_pickle(mutation_loci, Path(path_consensus, f"{allele}_{label}_mutation_loci.pickle")) diff --git a/src/DAJIN2/core/consensus/mutation_extractor.py b/src/DAJIN2/core/consensus/mutation_extractor.py new file mode 100644 index 00000000..6a99c5bd --- /dev/null +++ b/src/DAJIN2/core/consensus/mutation_extractor.py @@ -0,0 +1,114 @@ 
+from __future__ import annotations + +from pathlib import Path +from itertools import groupby + +import numpy as np +from sklearn.cluster import MiniBatchKMeans + +from DAJIN2.utils import io +from DAJIN2.core.preprocess.mutation_extractor import summarize_indels, extract_mutation_loci, minimize_mutation_counts +from DAJIN2.core.consensus.similarity_searcher import cache_selected_control_by_similarity + +""" +Most of the code reuses `preprocess.cache_mutation_loci`. +This code differs from `preprocess.cache_mutation_loci` in that it dynamically extracts control sequences similar to alleles after clustering, and sets thresholds for distinguishing between sequence errors and mutations. +- Extracts reads similar to alleles after clustering to minimize the impact of anomalies in the control sequences. +- Since alleles after clustering essentially assume a single allele, the threshold setting in preprocess is not 10% but is set to a higher value. +""" + + +def get_thresholds(path_indels_normalized_sample, path_indels_normalized_control) -> dict[str, float]: + """ + For the consensus threshold, the minimum threshold for "sample - control" was set to be at least 10%. + """ + indels_normalized_sample = io.load_pickle(path_indels_normalized_sample) + indels_normalized_control = io.load_pickle(path_indels_normalized_control) + indels_normalized_minimize_control = minimize_mutation_counts(indels_normalized_control, indels_normalized_sample) + thresholds = dict() + for mut in {"+", "-", "*"}: + values_sample = indels_normalized_sample[mut] + values_control = indels_normalized_minimize_control[mut] + values_subtract = values_sample - values_control + kmeans = MiniBatchKMeans(n_clusters=2, random_state=0).fit(values_subtract.reshape(-1, 1)) + threshold = kmeans.cluster_centers_.mean() + thresholds[mut] = max(threshold, 10) + return thresholds + + +def extract_path_control(ARGS, allele: str) -> Path: + # Define potential file paths + paths = [ + Path(ARGS.tempdir, ARGS.control_name, "midsv", f"{allele}.json"), + Path(ARGS.tempdir, ARGS.control_name, "midsv", f"{allele}_{ARGS.sample_name}.json"), + ] + # Return the first path that exists + for path in paths: + if path.exists(): + return path + + +def extract_path_n_filtered_control(ARGS, path_midsv_control: Path) -> Path: + """ + Filter out reads with N enriched in the control. 
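+    Reads whose N count exceeds a two-cluster MiniBatchKMeans threshold (the mean of the two cluster centers) are dropped; the remaining reads are written to a cached JSONL file.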
+ """ + path_output = Path(ARGS.tempdir, ARGS.control_name, "consensus", f"{path_midsv_control.stem}_n_filtered.jsonl") + if path_output.exists(): + return path_output + + midsv_control = io.read_jsonl(path_midsv_control) + n_counts = np.array([sum(1 if cs == "N" else 0 for cs in c["CSSPLIT"].split(",")) for c in midsv_control]) + + kmeans = MiniBatchKMeans(n_clusters=2, random_state=0).fit(n_counts.reshape(-1, 1)) + threshold = kmeans.cluster_centers_.mean() + labels = np.where(n_counts <= threshold, True, False) + midsv_filtered_control = (m for m, label in zip(io.read_jsonl(path_midsv_control), labels) if label) + io.write_jsonl(midsv_filtered_control, path_output) + + return path_output + + +def cache_normalized_indels(ARGS, path_midsv_sample: Path) -> None: + allele, label, *_ = path_midsv_sample.stem.split("_") + sequence = ARGS.fasta_alleles[allele] + + path_midsv_control = extract_path_control(ARGS, allele) + path_midsv_n_filtered_control = extract_path_n_filtered_control(ARGS, path_midsv_control) + + cache_selected_control_by_similarity(path_midsv_n_filtered_control, path_midsv_sample, path_midsv_sample.parent) + + path_midsv_similar_control = Path(path_midsv_sample.parent, f"{allele}_{label}_control.jsonl") + + _, indels_normalized_sample = summarize_indels(path_midsv_sample, sequence) + _, indels_normalized_control = summarize_indels(path_midsv_similar_control, sequence) + + path_consensus = Path(ARGS.tempdir, ARGS.sample_name, "consensus") + io.save_pickle(indels_normalized_sample, Path(path_consensus, f"{allele}_{label}_normalized_sample.pickle")) + io.save_pickle(indels_normalized_control, Path(path_consensus, f"{allele}_{label}_normalized_control.pickle")) + + +def cache_mutation_loci(ARGS, clust_sample: list[dict]) -> None: + # Separate clusters by label and cache them + clust_sample.sort(key=lambda x: [x["ALLELE"], x["LABEL"]]) + path_consensus = Path(ARGS.tempdir, ARGS.sample_name, "consensus") + for (allele, label), group in groupby(clust_sample, key=lambda x: [x["ALLELE"], x["LABEL"]]): + io.write_jsonl(group, Path(path_consensus, f"{allele}_{label}_sample.jsonl")) + + # Cache normalized indels counts + for path_midsv_sample in path_consensus.glob("*_sample.jsonl"): + cache_normalized_indels(ARGS, path_midsv_sample) + + # Extract and cache mutation loci + for path_indels_normalized_sample in path_consensus.glob("*_normalized_sample.pickle"): + allele, label, *_ = path_indels_normalized_sample.stem.split("_") + path_indels_normalized_control = Path(path_consensus, f"{allele}_{label}_normalized_control.pickle") + sequence = ARGS.fasta_alleles[allele] + path_knockin = Path(ARGS.tempdir, ARGS.sample_name, "knockin_loci", f"{allele}.pickle") + + thresholds = get_thresholds(path_indels_normalized_sample, path_indels_normalized_control) + + mutation_loci = extract_mutation_loci( + sequence, path_indels_normalized_sample, path_indels_normalized_control, path_knockin, thresholds + ) + + io.save_pickle(mutation_loci, Path(path_consensus, f"{allele}_{label}_mutation_loci.pickle")) From 21c2596805c36074f360285600e60ee76b948908 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 10:39:44 +0900 Subject: [PATCH 10/27] Simplyfy the error detection using cosine similarity --- .../core/preprocess/homopolymer_handler.py | 131 +++--------------- 1 file changed, 17 insertions(+), 114 deletions(-) diff --git a/src/DAJIN2/core/preprocess/homopolymer_handler.py b/src/DAJIN2/core/preprocess/homopolymer_handler.py index 3da77af0..afeca846 100644 --- 
a/src/DAJIN2/core/preprocess/homopolymer_handler.py +++ b/src/DAJIN2/core/preprocess/homopolymer_handler.py @@ -1,11 +1,8 @@ from __future__ import annotations import re -from collections import defaultdict -import scipy import numpy as np -from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess def get_repeat_regions(sequence: str, loci: set[int]) -> list[tuple[int, int]]: @@ -22,110 +19,8 @@ def get_repeat_regions(sequence: str, loci: set[int]) -> list[tuple[int, int]]: return repeat_regions -def get_directions(indels_mut: np.ndarray, repeat_regions: list[tuple]) -> dict[tuple, bool]: - directions = dict() - for start, end in repeat_regions: - error_counts = indels_mut[start:end] - # If there are many errors at the beginning, direction is False. - if error_counts[0] <= error_counts[-1]: - directions[(start, end)] = True - else: - directions[(start, end)] = False - return directions - - -def get_counts_homopolymer( - indels_mut: np.ndarray, directions: dict[tuple, bool] -) -> dict[int, list[float], dict[tuple[int, int], np.ndarray]]: - # Initialize default dictionaries to hold counts - error_counts = defaultdict(list) - error_counts_regions = dict() - # Iterate through each repeat region - for (start, end), direction in directions.items(): - indels_counts = indels_mut[start:end] - # If there are many errors at the beginning, reverse the order - if direction is False: - indels_counts = indels_counts[::-1] - start, end = end, start - # Append the start, end, and total log errors to the region count - error_counts_regions[(start, end)] = indels_counts - # Append the log errors count to each position - for position, value in enumerate(indels_counts): - error_counts[position].append(value) - return dict(error_counts), error_counts_regions - - -def _smooth_data(input_x, input_y, input_xgrid): - # Sample 50 data points from the input x and y - sampled_indices = np.random.choice(len(input_x), 50, replace=True) - sampled_y = input_y[sampled_indices] - sampled_x = input_x[sampled_indices] - # Apply lowess smoothing to the sampled data - smoothed_y = sm_lowess(sampled_y, sampled_x, frac=1.0 / 5.0, it=5, return_sorted=False) - # Interpolate the smoothed data onto the input grid - interpolated_y_grid = scipy.interpolate.interp1d(sampled_x, smoothed_y, fill_value="extrapolate")(input_xgrid) - return interpolated_y_grid - - -def return_thresholds(error_counts: dict[int, list[float]]) -> list[float]: - # Initialize empty lists to hold x and y data - error_positions = [] - error_counts_extended = [] - # Populate the x and y data with the mutation counts and positions - for position, counts in error_counts.items(): - position_values = [position] * len(counts) - error_positions.extend(position_values) - error_counts_extended.extend(counts) - # Convert x and y data to numpy arrays - error_positions = np.array(error_positions) - error_counts_extended = np.array(error_counts_extended) - # Define a grid of x values from the minimum to the maximum position - x_grid = np.linspace(error_positions.min(), error_positions.max(), error_positions.max() + 1) - # Smooth the y data K times and stack the results - num_smoothings = 100 - smoothed_data = np.stack( - [_smooth_data(error_positions, error_counts_extended, x_grid) for _ in range(num_smoothings)] - ).T - # Calculate the mean and standard error of the smoothed data - mean_smoothed_data = np.nanmean(smoothed_data, axis=1) - stderr_smoothed_data = scipy.stats.sem(smoothed_data, axis=1) - stderr_smoothed_data = np.nanstd(smoothed_data, axis=1, 
ddof=0) - # Define the thresholds - thresholds = mean_smoothed_data + 1.90 * stderr_smoothed_data - return thresholds - - -def get_errors_in_homopolyer( - error_counts_regions: dict[tuple[int, int], np.ndarray], thresholds: list[float] -) -> set[int]: - # Initialize a set to hold the locations of errors - sequence_error_loci = set() - # Iterate through each region and its associated error counts - for (start, end), counts in error_counts_regions.items(): - # +-1 because 0-index is not considered as homopolymer - if start > end: - candidate_error_region = set(range(end, start - 1)) - else: - candidate_error_region = set(range(start + 1, end)) - # Initialize a list to hold the indices of mutations - mutation_indices = [] - # Iterate through each mutation count - for index, count in enumerate(counts): - # Skip the first count (since it has no previous count to compare to) - if index == 0: - continue - # If the count exceeds the threshold, record the index - if count > thresholds[index]: - mutation_indices.append(index) - # Add the absolute location of each mutation to the set of mutation locations - mutations = set() - for mutation_index in mutation_indices: - if start > end: - mutations.add(start - 1 - mutation_index) - else: - mutations.add(end + mutation_index) - sequence_error_loci |= candidate_error_region - mutations - return sequence_error_loci +def cosine_simirarity(X, Y) -> float: + return np.dot(X, Y) / (np.linalg.norm(X) * np.linalg.norm(Y)) ########################################################### @@ -133,16 +28,24 @@ def get_errors_in_homopolyer( ########################################################### -def extract_errors(sequence, indels_sample, indels_control, candidate_loci: dict[set]) -> dict[str, set(int)]: +def extract_errors( + sequence: str, + indels_normalized_sample: dict[str, np.array], + indels_normalized_control: dict[str, np.array], + candidate_loci: dict[set], +) -> dict[str, set(int)]: errors_in_homopolymer = dict() for mut in ["+", "-", "*"]: repeat_regions = get_repeat_regions(sequence, candidate_loci[mut]) - if repeat_regions == []: + if len(repeat_regions) == 0: errors_in_homopolymer[mut] = set() continue - directions = get_directions(indels_control[mut], repeat_regions) - error_counts_control, _ = get_counts_homopolymer(indels_control[mut], directions) - _, error_counts_regions_sample = get_counts_homopolymer(indels_sample[mut], directions) - thresholds = return_thresholds(error_counts_control) - errors_in_homopolymer[mut] = get_errors_in_homopolyer(error_counts_regions_sample, thresholds) + errors = set() + for start, end in repeat_regions: + x = indels_normalized_sample[mut][start:end] + y = indels_normalized_control[mut][start:end] + if cosine_simirarity(x, y) > 0.95: + errors.update(range(start, end + 1)) + errors_in_homopolymer[mut] = errors + return errors_in_homopolymer From 94c48da01fc039902c97a23ddea47dd5f2b42ab4 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 10:43:13 +0900 Subject: [PATCH 11/27] Use `LocalOutlierFactor` to filter abnormal control reads --- .../core/consensus/similarity_searcher.py | 69 +++++++------------ 1 file changed, 26 insertions(+), 43 deletions(-) diff --git a/src/DAJIN2/core/consensus/similarity_searcher.py b/src/DAJIN2/core/consensus/similarity_searcher.py index aa86f854..b239341b 100644 --- a/src/DAJIN2/core/consensus/similarity_searcher.py +++ b/src/DAJIN2/core/consensus/similarity_searcher.py @@ -4,7 +4,8 @@ from collections import defaultdict import numpy as np -from sklearn.cluster import 
MiniBatchKMeans + +from sklearn.neighbors import LocalOutlierFactor from DAJIN2.utils import io @@ -24,60 +25,43 @@ def onehot_by_mutations(midsv_sample: list[dict]) -> dict[str, np.ndarray]: return {mut: np.array(value) for mut, value in mut_onehot.items()} -def calc_percentage(mut_onehot: dict[str, np.ndarray], coverage: int) -> dict[str, np.ndarray]: +def calculate_percentage(mut_onehot: dict[str, np.ndarray], coverage: list[int]) -> dict[str, np.ndarray]: mut_percentage = dict() - for mut, onehot in mut_onehot.items(): - mut_percentage[mut] = np.sum(onehot, axis=0) / coverage + for i, (mut, onehot) in enumerate(mut_onehot.items()): + mut_percentage[mut] = np.sum(onehot, axis=0) / coverage[i] return mut_percentage -def get_index_to_mask(mut_percentage_sample: dict[str, np.ndarray], threshold=0.5) -> dict[str, np.ndarray[bool]]: +def get_values_to_mask(mut_percentage_sample: dict[str, np.ndarray], threshold=0.5) -> dict[str, np.ndarray[float]]: mask = dict() for mut, percentage in mut_percentage_sample.items(): mask[mut] = np.where(percentage > threshold, 0, percentage) return mask -def apply_mask(mut_onehot: dict[str, np.ndarray], mask_sample: dict[str, np.ndarray[bool]]): +def apply_mask(mut_onehot: dict[str, np.ndarray], mask_sample: dict[str, np.ndarray[float]]): mut_onehot_masked = dict() for mut, onehot in mut_onehot.items(): mut_onehot_masked[mut] = onehot * mask_sample[mut] return mut_onehot_masked -def calc_similarity(mut_onehot_sample_masked: dict[str, np.ndarray], mut_onehot_control_masked: dict[str, np.ndarray]): - """The label compares the number of mutations in the sample and control. - The number of mutations in the sample is classified into two categories, and a threshold for being considered normal is determined, which is then applied to the control. - - On the other hand, the distance compares the Euclidean distance between the sample (average value of mutations at each index) and the control. - The smaller the Euclidean distance, the more similar they are. 
- """ - similarity = dict() +def identify_normal_reads( + mut_onehot_sample_masked: dict[str, np.ndarray], mut_onehot_control_masked: dict[str, np.ndarray] +) -> list[bool]: + mutation_comparisons = dict() for mut in {"+", "-", "*"}: values_sample = mut_onehot_sample_masked[mut] values_control = mut_onehot_control_masked[mut] - kmeans = MiniBatchKMeans(n_clusters=2, random_state=0).fit(values_sample.sum(axis=1).reshape(-1, 1)) - threshold = kmeans.cluster_centers_.mean() - label = np.where(values_control.sum(axis=1) <= threshold, 1, 0) + values_sum_sample = values_sample.sum(axis=1).reshape(-1, 1) + values_sum_control = values_control.sum(axis=1).reshape(-1, 1) - euclidean_distance = np.linalg.norm(values_control - values_sample.mean(axis=0), axis=1) + clf = LocalOutlierFactor(novelty=True, n_neighbors=len(values_sum_sample)).fit(values_sum_sample) + labels = clf.predict(values_sum_control) + mutation_comparisons[mut] = np.where(labels == 1, True, False) - similarity[mut] = {"label": label, "distance": euclidean_distance} - - return similarity - - -def is_similar(similarity, coverage_sample) -> list[bool]: - x = similarity["+"]["label"] * similarity["-"]["label"] * similarity["*"]["label"] - if np.sum(x) >= coverage_sample: - return np.where(x == 1, True, False).tolist() - else: - # If there are few similar reads, extract reads with a smaller distance - x = similarity["+"]["distance"] + similarity["-"]["distance"] + similarity["*"]["distance"] - n = min(coverage_sample, len(x)) - threshold = np.sort(x)[n - 1] - return np.where(x < threshold, True, False).tolist() + return (mutation_comparisons["+"] * mutation_comparisons["-"] * mutation_comparisons["*"]).tolist() ########################################################### @@ -89,25 +73,24 @@ def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bo """ find similar control reads compared to sample reads """ - coverage_sample = io.count_newlines(path_midsv_sample) + cssplits = (m["CSSPLIT"].split(",") for m in io.read_jsonl(path_midsv_sample)) + coverage_not_N = [sum(1 for item in sublist if item != "N") for sublist in zip(*cssplits)] mut_onehot_sample = onehot_by_mutations(io.read_jsonl(path_midsv_sample)) mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control)) - mut_percentage_sample = calc_percentage(mut_onehot_sample, coverage_sample) - mask_sample = get_index_to_mask(mut_percentage_sample) - - mut_onehot_sample_masked = apply_mask(mut_onehot_sample, mask_sample) - mut_onehot_control_masked = apply_mask(mut_onehot_control, mask_sample) + mut_percentage_sample = calculate_percentage(mut_onehot_sample, coverage_not_N) + values_mask = get_values_to_mask(mut_percentage_sample) - similarity = calc_similarity(mut_onehot_sample_masked, mut_onehot_control_masked) + mut_onehot_sample_masked = apply_mask(mut_onehot_sample, values_mask) + mut_onehot_control_masked = apply_mask(mut_onehot_control, values_mask) - return is_similar(similarity, coverage_sample) + return identify_normal_reads(mut_onehot_sample_masked, mut_onehot_control_masked) def cache_selected_control_by_similarity(path_midsv_control: Path, path_midsv_sample: Path, path_output: Path) -> None: - bools = filter_control(path_midsv_control, path_midsv_sample) + normal_reads_flags = filter_control(path_midsv_control, path_midsv_sample) midsv_control = io.read_jsonl(path_midsv_control) - midsv_filtered = (m for m, b in zip(midsv_control, bools) if b is True) + midsv_filtered = (m for m, flag in zip(midsv_control, normal_reads_flags) if flag is 
True) allele, label, *_ = Path(path_midsv_sample).stem.split("_") io.write_jsonl(midsv_filtered, Path(path_output, f"{allele}_{label}_control.jsonl")) From f30fabfbd2e7f1d5e1e23a495818af25add32c76 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 10:43:52 +0900 Subject: [PATCH 12/27] Add consensus directory --- src/DAJIN2/core/preprocess/directories.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DAJIN2/core/preprocess/directories.py b/src/DAJIN2/core/preprocess/directories.py index 2e7129ae..125d8942 100644 --- a/src/DAJIN2/core/preprocess/directories.py +++ b/src/DAJIN2/core/preprocess/directories.py @@ -6,7 +6,7 @@ def create_temporal(TEMPDIR: Path, NAME: str, is_control=False) -> None: Path(TEMPDIR, "result").mkdir(parents=True, exist_ok=True) if is_control: - SUBDIRS = ["fasta", "sam", "midsv", "mutation_loci", "clustering"] + SUBDIRS = ["fasta", "sam", "midsv", "mutation_loci", "clustering", "consensus"] else: SUBDIRS = [ "fasta", From c9f5aa7b48581e58d99fe8c31275c422756aa9f1 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 10:57:04 +0900 Subject: [PATCH 13/27] Use cosine similarity to filter dissimilar loci --- docs/ROADMAP.md | 10 +++ .../core/preprocess/mutation_extractor.py | 78 +++++++++---------- 2 files changed, 47 insertions(+), 41 deletions(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index e8935c99..c7dec467 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -24,6 +24,12 @@ ## 🚀 Features +### Preprocess + ++ Simplyfy `homopolymer_handler.py` for the error detection using cosine similarity [Commit Detail](https://github.com/akikuno/DAJIN2/commit/21c2596805c36074f360285600e60ee76b948908) + ++ Use cosine similarity to filter dissimilar loci at `mutation_extractor.py` [Commit Detail](https://github.com/akikuno/DAJIN2/commit/21c2596805c36074f360285600e60ee76b948908) + ### Classification #### Skipped the classification of minor alleles to suppress excessive subdivision of alleles. @@ -37,6 +43,10 @@ + Added the function `merge_minor_cluster` to revert labels clustered with less than 10 reads back to the previous labels. 
[Commit Detail](https://github.com/akikuno/DAJIN2/commit/4bd9f7dd806d192475d8d4f20c1e50c37281d64e) +### Consensus + ++ Use `LocalOutlierFactor` to filter abnormal control reads [Commit Detail](https://github.com/akikuno/DAJIN2/commit/4bd9f7dd806d192494c48da01fc039902c97a23ddea47dd5f2b42ab475d8d4f20c1e50c37281d64e) + ## 🐛 Bug Fixes diff --git a/src/DAJIN2/core/preprocess/mutation_extractor.py b/src/DAJIN2/core/preprocess/mutation_extractor.py index f073e228..130c243c 100644 --- a/src/DAJIN2/core/preprocess/mutation_extractor.py +++ b/src/DAJIN2/core/preprocess/mutation_extractor.py @@ -46,7 +46,7 @@ def normalize_indels(count: dict[str, list[int]]) -> dict[str, np.array]: for mut, indel_count in count.items(): numerator = np.array(indel_count) denominator = numerator + match_count - count_normalized[mut] = np.where(denominator == 0, 0, numerator / denominator) + count_normalized[mut] = np.where(denominator == 0, 0, numerator / denominator * 100) return count_normalized @@ -84,7 +84,17 @@ def split_kmer(indels: dict[str, np.array], kmer: int = 11) -> dict[str, np.arra ########################################################### -def detect_anomalies(values_subtract: np.array) -> set[int]: +def cosine_similarity(x, y): + return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + + +def identify_dissimilar_loci(values_sample, values_control, index: int) -> int: + x = values_sample[index - 5 : index + 6] + y = values_control[index - 5 : index + 6] + return cosine_similarity(x, y) < 0.95 + + +def detect_anomalies(values_sample, values_control, threshold: float) -> set[int]: """ Detect anomalies and return indices of outliers. @@ -95,11 +105,15 @@ def detect_anomalies(values_subtract: np.array) -> set[int]: This function returns the indices of the class with the higher mean of values_subtract values, as this class is considered to be the true anomalies. 
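The `mutation_extractor.py` diff above reduces to a two-step filter: a two-cluster `MiniBatchKMeans` fit on the per-base "sample minus control" rates provides a data-driven threshold (the mean of the two cluster centers), and each candidate locus is then kept only if the 11-base window around it is dissimilar to the control (cosine similarity below 0.95). A self-contained toy sketch of that logic with synthetic rates (not DAJIN2 data; the small fixed pre-filter on the subtraction is omitted):

```python
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(1)
values_control = rng.uniform(0, 1, 100)   # background error rate (%) per position
values_sample = values_control.copy()
values_sample[50] += 40                   # a genuine mutation signal at position 50

# Step 1: data-driven threshold from a 2-cluster KMeans on the subtraction
values_subtract = values_sample - values_control
kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract.reshape(-1, 1))
threshold = kmeans.cluster_centers_.mean()
candidate_loci = {i for i, v in enumerate(values_subtract) if v > threshold}


# Step 2: keep only loci whose local profile is dissimilar to the control
def cosine_similarity(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))


dissimilar_loci = {
    i
    for i in candidate_loci
    if cosine_similarity(values_sample[i - 5 : i + 6], values_control[i - 5 : i + 6]) < 0.95
}
print(candidate_loci, dissimilar_loci)    # position 50 passes both filters
```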
""" + values_subtract = values_sample - values_control + + """"When the result of subtraction is threshold (%) or less, ignore it as 0""" + values_subtract = np.where(values_subtract <= threshold, 0, values_subtract) values_subtract_reshaped = values_subtract.reshape(-1, 1) - kmeans = MiniBatchKMeans(n_clusters=2, random_state=0) - _ = kmeans.fit_predict(values_subtract_reshaped) + kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract_reshaped) threshold = kmeans.cluster_centers_.mean() - return {i for i, v in enumerate(values_subtract_reshaped) if v > threshold} + candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold} + return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i)} def extract_anomal_loci( @@ -111,13 +125,7 @@ def extract_anomal_loci( for mut in {"+", "-", "*"}: values_sample = indels_normalized_sample[mut] values_control = indels_normalized_control[mut] - values_subtract = values_sample - values_control - """" - When the result of subtraction is threshold (%) or less, ignore it as 0 - """ - threshold = thresholds[mut] - values_subtract = np.where(values_subtract <= threshold, 0, values_subtract) - idx_outliers = detect_anomalies(values_subtract) + idx_outliers = detect_anomalies(values_sample, values_control, thresholds[mut]) results[mut] = idx_outliers return results @@ -196,7 +204,7 @@ def merge_index_of_consecutive_indel(mutation_loci: dict[str, set[int]]) -> dict ########################################################### -# main +# formatters ########################################################### @@ -237,25 +245,20 @@ def transpose_mutation_loci(mutation_loci: dict[str, set[int]], sequence: str) - ########################################################### -def cache_indels_count(ARGS, is_control: bool = False, is_insertion: bool = False) -> None: - tempdir, sample_name, control_name, fasta_alleles = ( - ARGS.tempdir, - ARGS.sample_name, - ARGS.control_name, - ARGS.fasta_alleles, - ) - - for allele, sequence in fasta_alleles.items(): - if is_control and Path(tempdir, control_name, "midsv", f"{allele}_{sample_name}.json").exists(): - prefix = f"{allele}_{sample_name}" +def cache_indels_count(ARGS, is_control: bool = False) -> None: + for allele, sequence in ARGS.fasta_alleles.items(): + if is_control and Path(ARGS.tempdir, ARGS.control_name, "midsv", f"{allele}_{ARGS.sample_name}.json").exists(): + prefix = f"{allele}_{ARGS.sample_name}" else: prefix = allele - path_mutation_loci = Path(tempdir, control_name if is_control else sample_name, "mutation_loci") + path_mutation_loci = Path(ARGS.tempdir, ARGS.control_name if is_control else ARGS.sample_name, "mutation_loci") if Path(path_mutation_loci, f"{prefix}_count.pickle").exists(): continue - path_midsv = Path(tempdir, control_name if is_control else sample_name, "midsv", f"{prefix}.json") + path_midsv = Path( + ARGS.tempdir, ARGS.control_name if is_control else ARGS.sample_name, "midsv", f"{prefix}.json" + ) indels_count, indels_normalized = summarize_indels(path_midsv, sequence) io.save_pickle(indels_count, Path(path_mutation_loci, f"{prefix}_count.pickle")) io.save_pickle(indels_normalized, Path(path_mutation_loci, f"{prefix}_normalized.pickle")) @@ -266,7 +269,7 @@ def extract_mutation_loci( path_indels_normalized_sample: Path, path_indels_normalized_control: Path, path_knockin: Path, - thresholds: dict[str, float] = {"*": 0.05, "-": 0.05, "+": 0.05}, + thresholds: dict[str, float] = {"*": 0.5, "-": 0.5, "+": 
0.5}, ) -> list[set[str]]: indels_normalized_sample = io.load_pickle(path_indels_normalized_sample) indels_normalized_control = io.load_pickle(path_indels_normalized_control) @@ -291,34 +294,27 @@ def extract_mutation_loci( return mutation_loci_transposed -def cache_mutation_loci(ARGS, is_control: bool = False, is_insertion: bool = False) -> None: - cache_indels_count(ARGS, is_control, is_insertion) +def cache_mutation_loci(ARGS, is_control: bool = False) -> None: + cache_indels_count(ARGS, is_control) if is_control: return - tempdir, sample_name, control_name, fasta_alleles = ( - ARGS.tempdir, - ARGS.sample_name, - ARGS.control_name, - ARGS.fasta_alleles, - ) - - path_mutation_sample = Path(tempdir, sample_name, "mutation_loci") - path_mutation_control = Path(tempdir, control_name, "mutation_loci") + path_mutation_sample = Path(ARGS.tempdir, ARGS.sample_name, "mutation_loci") + path_mutation_control = Path(ARGS.tempdir, ARGS.control_name, "mutation_loci") - for allele, sequence in fasta_alleles.items(): + for allele, sequence in ARGS.fasta_alleles.items(): path_output = Path(path_mutation_sample, f"{allele}.pickle") if path_output.exists(): continue - file_name = f"{allele}_{sample_name}_normalized.pickle" + file_name = f"{allele}_{ARGS.sample_name}_normalized.pickle" if not Path(path_mutation_control, file_name).exists(): file_name = f"{allele}_normalized.pickle" path_indels_normalized_control = Path(path_mutation_control, file_name) path_indels_normalized_sample = Path(path_mutation_sample, f"{allele}_normalized.pickle") - path_knockin = Path(tempdir, sample_name, "knockin_loci", f"{allele}.pickle") + path_knockin = Path(ARGS.tempdir, ARGS.sample_name, "knockin_loci", f"{allele}.pickle") mutation_loci = extract_mutation_loci( sequence, path_indels_normalized_sample, path_indels_normalized_control, path_knockin From 8b552c25c4c290ad7a3c396826f67cbf1dd88d4f Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 10:57:28 +0900 Subject: [PATCH 14/27] Update commit links --- docs/ROADMAP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index c7dec467..3f4df42d 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -28,7 +28,7 @@ + Simplyfy `homopolymer_handler.py` for the error detection using cosine similarity [Commit Detail](https://github.com/akikuno/DAJIN2/commit/21c2596805c36074f360285600e60ee76b948908) -+ Use cosine similarity to filter dissimilar loci at `mutation_extractor.py` [Commit Detail](https://github.com/akikuno/DAJIN2/commit/21c2596805c36074f360285600e60ee76b948908) ++ Use cosine similarity to filter dissimilar loci at `mutation_extractor.py` [Commit Detail](https://github.com/akikuno/DAJIN2/commit/c9f5aa7b48581e58d99fe8c31275c422756aa9f1) ### Classification From 4ad9c9ef8bd963a6e20c1721480aed0fe7922760 Mon Sep 17 00:00:00 2001 From: akikuno Date: Fri, 29 Dec 2023 17:07:31 +0900 Subject: [PATCH 15/27] The UCSC Blat server sometimes returns a 200 HTTP status code even when an error occurs. In such cases, "Very Early Error" is indicated in the Title. Therefore, we have made it so that it returns False in those situations. 
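To make the behaviour concrete, here is a minimal, self-contained sketch of the title-based check described above. It is an illustration only: the names looks_available, error_page and ok_page, and the HTML snippets, are invented for this example, and the exact <TITLE> spelling is assumed; the actual change lands in is_webpage_available in the diff that follows.

def looks_available(html: str) -> bool:
    # An error page may still arrive with HTTP 200 but report the failure in its title.
    if "TITLE" not in html:
        return True
    # Assumes the title line carries a literal <TITLE> tag.
    title = next((line for line in html.split("\n") if "<TITLE>" in line), "")
    return "Error" not in title

# Hypothetical responses; both would be returned with a 200 status code.
error_page = "<HTML>\n<HEAD>\n<TITLE>Very Early Error</TITLE>\n</HEAD>\n</HTML>"
ok_page = "<HTML>\n<HEAD>\n<TITLE>BLAT Search Results</TITLE>\n</HEAD>\n</HTML>"
assert looks_available(error_page) is False
assert looks_available(ok_page) is True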
--- src/DAJIN2/utils/input_validator.py | 43 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/DAJIN2/utils/input_validator.py b/src/DAJIN2/utils/input_validator.py index 89e17366..a10aa5c9 100644 --- a/src/DAJIN2/utils/input_validator.py +++ b/src/DAJIN2/utils/input_validator.py @@ -17,24 +17,25 @@ def update_threads(threads: int) -> int: threads_updated = max(1, min(threads, available_threads)) return threads_updated + ######################################################################## # Check if the sample is in the proper format. ######################################################################## -def _validate_file_existence(input_file: str): +def validate_file_existence(input_file: str): if not Path(input_file).exists(): raise FileNotFoundError(f"{input_file} is not found") -def _validate_fastq_extension(fastq_path: str): +def validate_fastq_extension(fastq_path: str): if not re.search(r".fastq$|.fastq.gz$|.fq$|.fq.gz$", fastq_path): raise ValueError(f"{fastq_path} requires extensions either 'fastq', 'fastq.gz', 'fq' or 'fq.gz'") # Varidate if the file is in the proper format. # See top 100 lines -def _validate_fastq_content(fastq_path: str): +def validate_fastq_content(fastq_path: str): try: names, seqs, quals = zip(*[(n, s, q) for i, (n, s, q) in enumerate(mappy.fastx_read(fastq_path)) if i < 100]) if not (len(names) == len(seqs) == len(quals) > 0): @@ -43,7 +44,7 @@ def _validate_fastq_content(fastq_path: str): raise ValueError(f"{fastq_path} is not a FASTQ format") -def _validate_fasta_content(fasta_path: str): +def validate_fasta_content(fasta_path: str): try: names, seqs = zip(*[(n, s) for n, s, _ in mappy.fastx_read(fasta_path)]) if len(names) != len(seqs) or not names: @@ -59,12 +60,12 @@ def _validate_fasta_content(fasta_path: str): def validate_files(SAMPLE: str, CONTROL: str, ALLELE: str) -> None: - for file in [CONTROL, SAMPLE, ALLELE]: - _validate_file_existence(file) - for file in [CONTROL, SAMPLE]: - _validate_fastq_extension(file) - _validate_fastq_content(file) - _validate_fasta_content(ALLELE) + for path_fastx in [CONTROL, SAMPLE, ALLELE]: + validate_file_existence(path_fastx) + for path_fastx in [CONTROL, SAMPLE]: + validate_fastq_extension(path_fastx) + validate_fastq_content(path_fastx) + validate_fasta_content(ALLELE) ######################################################################## @@ -96,38 +97,42 @@ def exists_cached_genome(genome: str, tempdir: Path, exists_cache_control: bool) ######################################################################## -def _is_webpage_available(url: str) -> bool: +def is_webpage_available(url: str) -> bool: try: with urlopen(url) as response: - return 200 <= response.status < 300 + html = response.read().decode("utf-8") + if "TITLE" not in html: + return True + title = next((h for h in html.split("\n") if "" in h), "") + return "Error" not in title except URLError: return False def get_first_available_url(urls: list[str]) -> str | None: - return next((url for url in urls if _is_webpage_available(url)), None) + return next((url for url in urls if is_webpage_available(url)), None) -def _fetch_xml_data(url: str) -> bytes: +def fetch_xml_data(url: str) -> bytes: """Fetch XML data from a given URL.""" with urlopen(url) as response: return response.read() -def _extract_genome_ids_from_xml(xml_data: bytes) -> set: +def extract_genome_ids_from_xml(xml_data: bytes) -> set: """Extract genome IDs from XML data.""" root = ET.fromstring(xml_data) return {cc.attrib["id"] for 
child in root for cc in child if cc.tag == "SOURCE"} -def _get_genome_ids_in_ucsc(url_das: str) -> set: +def get_genome_ids_in_ucsc(url_das: str) -> set: """Get available genome IDs in UCSC.""" - xml_data = _fetch_xml_data(url_das) - return _extract_genome_ids_from_xml(xml_data) + xml_data = fetch_xml_data(url_das) + return extract_genome_ids_from_xml(xml_data) def is_genome_in_ucsc_ids(genome: str, url_das: str) -> bool: - genome_ids = _get_genome_ids_in_ucsc(url_das) + genome_ids = get_genome_ids_in_ucsc(url_das) return genome in genome_ids From 69c56fa904ef847dc5b0e2dcdb90303409412d0f Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:16:45 +0900 Subject: [PATCH 16/27] Added `preprocess.midsv_caller.convert_consecutive_indels_to_match`: Due to alignment errors, there can be instances where a true match is mistakenly replaced with "insertion following a deletion". For example, although it should be "=C,=T", it gets replaced by "-C,+C|=T". In such cases, a process is performed to revert it back to "=C,=T". --- src/DAJIN2/core/consensus/consensus.py | 76 ---------------------- src/DAJIN2/core/preprocess/midsv_caller.py | 74 +++++++++++++++++++-- tests/src/consensus/test_consensus.py | 60 ----------------- tests/src/preprocess/test_midsv_caller.py | 45 +++++++++---- 4 files changed, 104 insertions(+), 151 deletions(-) diff --git a/src/DAJIN2/core/consensus/consensus.py b/src/DAJIN2/core/consensus/consensus.py index ff5753c0..dc6b04c7 100644 --- a/src/DAJIN2/core/consensus/consensus.py +++ b/src/DAJIN2/core/consensus/consensus.py @@ -62,81 +62,6 @@ def replace_sequence_error(cons_percentage: list[dict[str, float]]) -> list[dict return cons_percentage_replaced -def convert_consecutive_indels_to_match(cons_percentage: list[dict[str, float]]) -> list[dict[str, float]]: - """ - Converts consecutive insertion and deletion operations on the same characters - into a matched operation. This function processes a sequence of operations - represented as a list of dictionaries. It cancels out an insertion ("+") - operation followed downstream by a corresponding deletion ("-") operation on - the same character. The function then consolidates the remaining operations - and returns the updated sequence. 
- """ - cons_percentage_reversed = cons_percentage[::-1].copy() - - i = 0 - while i < len(cons_percentage_reversed): - current_cssplit = cons_percentage_reversed[i] - current_consensus = max(current_cssplit, key=current_cssplit.get) - - if not current_consensus.startswith("+"): - i += 1 - continue - - insertions = [base.lstrip("+") for base in current_consensus.split("|")[:-1]][::-1] - - # Extract deletions - deletions = [] - count_del_min = float("inf") - - for j in range(1, len(insertions) + 1): - if i + j >= len(cons_percentage_reversed): - break - - next_consensus = max(cons_percentage_reversed[i + j], key=cons_percentage_reversed[i + j].get) - - if not next_consensus.startswith("-"): - break - - deletions.append(next_consensus.lstrip("-")) - count_del_min = min(count_del_min, cons_percentage_reversed[i + j][next_consensus]) - - if insertions != deletions: - i += 1 - continue - - # Format deletions - count_ins = current_cssplit[current_consensus] - - for k, insertion in enumerate(insertions, 1): - current_cssplit = cons_percentage_reversed[i + k] - current_count_match = current_cssplit.get(f"={insertion}", 0) - - if count_del_min - count_ins == 0: - current_cssplit[f"={insertion}"] = current_count_match + count_ins - del current_cssplit[f"-{insertion}"] - elif count_del_min - count_ins > 0: - current_cssplit[f"={insertion}"] = current_count_match + count_ins - current_cssplit[f"-{insertion}"] = count_del_min - count_ins - else: - current_cssplit[f"={insertion}"] = current_count_match + count_del_min - current_cssplit[f"-{insertion}"] -= count_del_min - if current_cssplit[f"-{insertion}"] <= 0: - del current_cssplit[f"-{insertion}"] - - # Format insertion - cs_last = current_consensus.split("|")[-1] - if count_del_min - count_ins >= 0: - cons_percentage_reversed[i][cs_last] = count_ins - del cons_percentage_reversed[i][current_consensus] - else: - cons_percentage_reversed[i][cs_last] = count_del_min - cons_percentage_reversed[i][current_consensus] = count_ins - count_del_min - - i += len(insertions) + 1 - - return cons_percentage_reversed[::-1] - - def adjust_to_100_percent(cons_percentage: list[dict[str, float]]) -> list[dict[str, float]]: adjusted_percentages = [] @@ -157,7 +82,6 @@ def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) -> cons_percentage = convert_to_percentage(cssplits, mutation_loci) cons_percentage = remove_all_n(cons_percentage) cons_percentage = replace_sequence_error(cons_percentage) - cons_percentage = convert_consecutive_indels_to_match(cons_percentage) return adjust_to_100_percent(cons_percentage) diff --git a/src/DAJIN2/core/preprocess/midsv_caller.py b/src/DAJIN2/core/preprocess/midsv_caller.py index d1bbbed4..40aefd95 100644 --- a/src/DAJIN2/core/preprocess/midsv_caller.py +++ b/src/DAJIN2/core/preprocess/midsv_caller.py @@ -12,7 +12,7 @@ from DAJIN2.utils import cssplits_handler -def _has_inversion_in_splice(CIGAR: str) -> bool: +def has_inversion_in_splice(CIGAR: str) -> bool: previous_insertion = False for cigar in sam_handler.split_cigar(CIGAR): if cigar.endswith("I"): @@ -44,7 +44,7 @@ def extract_qname_of_map_ont(sam_ont: Generator[list[str]], sam_splice: Generato cigar_splice = alignments_splice[qname_ont][5] # If preset=splice and inversion is present, `midsv.transform`` will not work, so use preset=map-ont. 
- if _has_inversion_in_splice(cigar_splice): + if has_inversion_in_splice(cigar_splice): qname_of_map_ont.add(qname_ont) continue @@ -98,11 +98,18 @@ def replace_internal_n_to_d(midsv_sample: Generator[list[dict]], sequence: str) yield samp +def is_reverse_strand(flag: int) -> bool: + """ + Determines if the read is mapped to the reverse strand. + """ + # Check if the bit 4 (0-based) is set + return bool(flag & 16) + + def convert_flag_to_strand(midsv_sample: Generator[list[dict]]) -> Generator[list[dict]]: """Convert FLAG to STRAND (+ or -)""" - REVERSE_STRAND_FLAG = 16 | 2064 for samp in midsv_sample: - samp["STRAND"] = "-" if samp["FLAG"] & REVERSE_STRAND_FLAG else "+" + samp["STRAND"] = "-" if is_reverse_strand(int(samp["FLAG"])) else "+" del samp["FLAG"] yield samp @@ -117,6 +124,64 @@ def filter_samples_by_n_proportion(midsv_sample: Generator[dict], threshold: int yield samp +########################################################### +# convert_consecutive_indels_to_match +########################################################### + +""" +Due to alignment errors, there can be instances where a true match is mistakenly replaced with "insertion following a deletion". +For example, although it should be "=C,=T", it gets replaced by "-C,+C|=T". In such cases, a process is performed to revert it back to "=C,=T". +""" + + +def convert_consecutive_indels_to_match(cssplit: str) -> str: + i = 0 + cssplit_reversed = cssplit.split(",")[::-1] + while i < len(cssplit_reversed): + current_cs = cssplit_reversed[i] + + if not current_cs.startswith("+"): + i += 1 + continue + + insertions = [base.lstrip("+") for base in current_cs.split("|")[:-1]][::-1] + + # Extract deletions + deletions = [] + + for j in range(1, len(insertions) + 1): + if i + j >= len(cssplit_reversed): + break + + next_cs = cssplit_reversed[i + j] + + if not next_cs.startswith("-"): + break + + deletions.append(next_cs.lstrip("-")) + + if insertions != deletions: + i += 1 + continue + + # Format insertions + cssplit_reversed[i] = current_cs.split("|")[-1] + + # Format deletions + for k, insertion in enumerate(insertions, 1): + cssplit_reversed[i + k] = cssplit_reversed[i + k].replace("-", "=") + + i += len(insertions) + 1 + + return ",".join(cssplit_reversed[::-1]) + + +def convert_consecutive_indels(midsv_sample: Generator) -> Generator[list[dict]]: + for m in midsv_sample: + m["CSSPLIT"] = convert_consecutive_indels_to_match(m["CSSPLIT"]) + yield m + + ########################################################### # main ########################################################### @@ -153,4 +218,5 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) - midsv_sample = replace_internal_n_to_d(midsv_chaind, sequence) midsv_sample = convert_flag_to_strand(midsv_sample) midsv_sample = filter_samples_by_n_proportion(midsv_sample) + midsv_sample = convert_consecutive_indels(midsv_sample) midsv.write_jsonl(midsv_sample, path_output_midsv) diff --git a/tests/src/consensus/test_consensus.py b/tests/src/consensus/test_consensus.py index 4f1ebfdc..1636993e 100644 --- a/tests/src/consensus/test_consensus.py +++ b/tests/src/consensus/test_consensus.py @@ -6,68 +6,8 @@ call_percentage, cstag_to_base, call_sequence, - convert_consecutive_indels_to_match, ) -########################################################### -# convert_consecutive_indels_to_match -########################################################### - - -def test_convert_consecutive_indels_to_match_empty_input(): - assert 
convert_consecutive_indels_to_match([]) == [] - - -@pytest.mark.parametrize( - "cons, expected", - [ - # simple case - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 2}, {"+T|=A": 1}], - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 1, "=T": 1}, {"=A": 1}], - ), - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 2}, {"+G|+T|=A": 1}], - [{"=A": 2}, {"-C": 2}, {"-G": 1, "=G": 1}, {"-T": 1, "=T": 1}, {"=A": 1}], - ), - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 2}, {"+C|+G|+T|=A": 1}], - [{"=A": 2}, {"-C": 1, "=C": 1}, {"-G": 1, "=G": 1}, {"-T": 1, "=T": 1}, {"=A": 1}], - ), - # insertion < deletion - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 2, "=T": 1}, {"+T|=A": 1}], - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 1, "=T": 2}, {"=A": 1}], - ), - # insertion == deletion - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 1, "=T": 1}, {"+T|=A": 1}], - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"=T": 2}, {"=A": 1}], - ), - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 1}, {"+T|=A": 1}], - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"=T": 1}, {"=A": 1}], - ), - # insertion > deletion - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 3, "=T": 1}, {"+T|=A": 10}], - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"=T": 4}, {"+T|=A": 7, "=A": 3}], - ), - # deletion_min < insertion - ( - [{"=A": 2}, {"-C": 20}, {"-G": 2}, {"-T": 3, "=T": 1}, {"+C|+G|+T|=A": 10}], - [{"=A": 2}, {"-C": 18, "=C": 2}, {"=G": 2}, {"-T": 1, "=T": 3}, {"+C|+G|+T|=A": 8, "=A": 2}], - ), - # no change - ( - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 2}, {"=A": 2}], - [{"=A": 2}, {"-C": 2}, {"-G": 2}, {"-T": 2}, {"=A": 2}], - ), - ], -) -def test_convert_consecutive_indels_to_match(cons, expected): - assert convert_consecutive_indels_to_match(cons) == expected - ########################################################### # replace_sequence diff --git a/tests/src/preprocess/test_midsv_caller.py b/tests/src/preprocess/test_midsv_caller.py index 184dd093..270d9626 100644 --- a/tests/src/preprocess/test_midsv_caller.py +++ b/tests/src/preprocess/test_midsv_caller.py @@ -1,28 +1,29 @@ import pytest import midsv -from DAJIN2.core.preprocess.midsv_caller import _has_inversion_in_splice +from DAJIN2.core.preprocess.midsv_caller import has_inversion_in_splice from DAJIN2.core.preprocess.midsv_caller import extract_qname_of_map_ont from DAJIN2.core.preprocess.midsv_caller import replace_internal_n_to_d from DAJIN2.core.preprocess.midsv_caller import convert_flag_to_strand +from DAJIN2.core.preprocess.midsv_caller import convert_consecutive_indels_to_match from pathlib import Path from DAJIN2.core.preprocess.mapping import to_sam ########################################################### -# _has_inversion_in_splice +# has_inversion_in_splice ########################################################### def test_has_inversion_in_splice(): # Test cases where there is an inversion in splice (insertion followed by deletion) - assert _has_inversion_in_splice("4M1I4N") - assert _has_inversion_in_splice("10M1I5N") - assert _has_inversion_in_splice("1I1N") + assert has_inversion_in_splice("4M1I4N") + assert has_inversion_in_splice("10M1I5N") + assert has_inversion_in_splice("1I1N") # Test cases where there is no inversion in splice - assert not _has_inversion_in_splice("10M") - assert not _has_inversion_in_splice("5N5M") - assert not _has_inversion_in_splice("1I1M") + assert not has_inversion_in_splice("10M") + assert not has_inversion_in_splice("5N5M") + assert not has_inversion_in_splice("1I1M") def test_has_inversion_in_splice_random_inversion(): @@ -32,7 +33,7 @@ def 
test_has_inversion_in_splice_random_inversion(): test = list(to_sam(str(path_reference), str(path_query), preset=preset)) test = [s.split("\t") for s in test] test_cigar = [s[5] for s in test if not s[0].startswith("@")] - assert _has_inversion_in_splice(test_cigar[0]) + assert has_inversion_in_splice(test_cigar[0]) def test_has_inversion_in_splice_random_deletion(): @@ -42,7 +43,7 @@ def test_has_inversion_in_splice_random_deletion(): test = list(to_sam(str(path_reference), str(path_query), preset=preset)) test = [s.split("\t") for s in test] test_cigar = [s[5] for s in test if not s[0].startswith("@")] - assert not _has_inversion_in_splice(test_cigar[0]) + assert not has_inversion_in_splice(test_cigar[0]) """ @@ -56,7 +57,7 @@ def test_has_inversion_in_splice_random_deletion(): # test = list(to_sam(str(path_reference), str(path_query), preset=preset)) # test = [s.split("\t") for s in test] # test_cigar = [s[5] for s in test if not s[0].startswith("@")] -# assert not _has_inversion_in_splice(test_cigar[0]) +# assert not has_inversion_in_splice(test_cigar[0]) ########################################################### # extract_qname_of_map_ont @@ -181,3 +182,25 @@ def test_replace_internal_n_to_d_large_n(): def test_convert_flag_to_strand(input_sample, expected_output): result = list(convert_flag_to_strand(iter(input_sample))) assert result == expected_output + + +########################################################### +# convert_consecutive_indels_to_match +########################################################### + + +@pytest.mark.parametrize( + "cons, expected", + [ + # simple case + ("=A,-C,-G,-T,+T|=A", "=A,-C,-G,=T,=A"), + ("=A,-C,-G,-T,+G|+T|=A", "=A,-C,=G,=T,=A"), + ("=A,-C,-G,-T,+C|+G|+T|=A", "=A,=C,=G,=T,=A"), + # no change + ("=A,=C,=G,=T,=A", "=A,=C,=G,=T,=A"), + # empty + ("", "") + ], +) +def test_convert_consecutive_indels_to_match(cons, expected): + assert convert_consecutive_indels_to_match(cons) == expected From 217355d46bf69d5a7e03f775077d8124faea1982 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:40:30 +0900 Subject: [PATCH 17/27] remove underscore of the functions --- src/DAJIN2/core/preprocess/genome_fetcher.py | 8 +++---- .../src/classification/test_classification.py | 6 ++--- tests/src/utils/test_validator.py | 22 +++++++++---------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/DAJIN2/core/preprocess/genome_fetcher.py b/src/DAJIN2/core/preprocess/genome_fetcher.py index 15862637..90db0a96 100644 --- a/src/DAJIN2/core/preprocess/genome_fetcher.py +++ b/src/DAJIN2/core/preprocess/genome_fetcher.py @@ -3,12 +3,12 @@ from urllib.request import urlopen -def _fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict: +def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict: url = f"{blat_url}?db={genome}&type=BLAT&userSeq={seq}" response = urlopen(url).read().decode("utf8").split("\n") matches = [x for x in response if "100.0%" in x] if not matches: - raise ValueError(f"{seq} is not found in {genome}") + raise ValueError(f"{seq[:10]}... 
is not found in {genome}") chrom, strand, start, end, _ = matches[0].split()[-5:] return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)} @@ -18,8 +18,8 @@ def fetch_coordinates(genome_coordinates: dict, genome_urls: dict, seq: str) -> blat_url = genome_urls["blat"] seq_start, seq_end = seq[:1000], seq[-1000:] - coordinate_start = _fetch_seq_coordinates(genome, blat_url, seq_start) - coordinate_end = _fetch_seq_coordinates(genome, blat_url, seq_end) + coordinate_start = fetch_seq_coordinates(genome, blat_url, seq_start) + coordinate_end = fetch_seq_coordinates(genome, blat_url, seq_end) chromosome, strand = coordinate_start["chrom"], coordinate_start["strand"] if strand == "+": diff --git a/tests/src/classification/test_classification.py b/tests/src/classification/test_classification.py index 23e937bf..2781d43e 100644 --- a/tests/src/classification/test_classification.py +++ b/tests/src/classification/test_classification.py @@ -13,7 +13,7 @@ ], ) def test_calc_match(input_str, expected_output): - result = classifier._calc_match(input_str) + result = classifier.calc_match(input_str) assert result == expected_output @@ -22,7 +22,7 @@ def test_extract_alleles_with_max_score(): {"QNAME": "read1", "CSSPLIT": "=A,=C,=G,=T", "SCORE": 1.0, "ALLELE": "allele1"}, {"QNAME": "read1", "CSSPLIT": "=A,=C,=G,=T", "SCORE": 0.5, "ALLELE": "allele2"}, ] - result = classifier._extract_alleles_with_max_score(score_of_each_alleles) + result = classifier.extract_alleles_with_max_score(score_of_each_alleles) expected_output = [ {"QNAME": "read1", "CSSPLIT": "=A,=C,=G,=T", "ALLELE": "allele1"}, ] @@ -36,7 +36,7 @@ def test_extract_alleles_with_max_score_multiple_reads(): {"QNAME": "read2", "CSSPLIT": "=A,=C,=G,=T", "SCORE": 0.5, "ALLELE": "allele1"}, {"QNAME": "read2", "CSSPLIT": "=A,=C,=G,=T", "SCORE": 1.0, "ALLELE": "allele2"}, ] - result = classifier._extract_alleles_with_max_score(score_of_each_alleles) + result = classifier.extract_alleles_with_max_score(score_of_each_alleles) expected_output = [ {"QNAME": "read1", "CSSPLIT": "=A,=C,=G,=T", "ALLELE": "allele1"}, {"QNAME": "read2", "CSSPLIT": "=A,=C,=G,=T", "ALLELE": "allele2"}, diff --git a/tests/src/utils/test_validator.py b/tests/src/utils/test_validator.py index ed26ff8a..0f8b7717 100644 --- a/tests/src/utils/test_validator.py +++ b/tests/src/utils/test_validator.py @@ -10,7 +10,7 @@ def test_exists(): with pytest.raises(FileNotFoundError) as e: test = "filenotfound.txt" - input_validator._validate_file_existence(test) + input_validator.validate_file_existence(test) assert str(e.value) == f"{test} is not found" @@ -22,19 +22,19 @@ def test_exists(): def test_fastq_extension(): with pytest.raises(ValueError) as e: test = "test.fqq" - input_validator._validate_fastq_extension("test.fqq") + input_validator.validate_fastq_extension("test.fqq") assert str(e.value) == f"{test} requires extensions either 'fastq', 'fastq.gz', 'fq' or 'fq.gz'" def test_fastq_error_not_fastq_format(): with pytest.raises(ValueError): fastq_path = "tests/data/preprocess/validate_inputs/empty.fq" - _ = input_validator._validate_fastq_content(fastq_path) + _ = input_validator.validate_fastq_content(fastq_path) def test_fastq_without_error(): fasta_path = "tests/data/preprocess/validate_inputs/control.fq.gz" - assert input_validator._validate_fastq_content(fasta_path) is None + assert input_validator.validate_fastq_content(fasta_path) is None ############################################################################### @@ -45,34 +45,34 @@ def 
test_fastq_without_error(): def test_non_proper_fasta_format(): with pytest.raises(ValueError) as e: fasta_path = "tests/data/preprocess/validate_inputs/empty.fa" - _ = input_validator._validate_fasta_content(fasta_path) + _ = input_validator.validate_fasta_content(fasta_path) assert str(e.value) == f"{fasta_path} is not a proper FASTA format" def test_fasta_error_duplicated_identifiers(): with pytest.raises(ValueError) as e: fasta_path = "tests/data/preprocess/validate_inputs/duplicated_name.fa" - _ = input_validator._validate_fasta_content(fasta_path) + _ = input_validator.validate_fasta_content(fasta_path) assert str(e.value) == f"{fasta_path} must include unique identifiers" def test_fasta_error_duplicated_sequences(): with pytest.raises(ValueError) as e: fasta_path = "tests/data/preprocess/validate_inputs/duplicated_seq.fa" - _ = input_validator._validate_fasta_content(fasta_path) + _ = input_validator.validate_fasta_content(fasta_path) assert str(e.value) == f"{fasta_path} must include unique DNA sequences" def test_fasta_error_without_control(): with pytest.raises(ValueError) as e: fasta_path = "tests/data/preprocess/validate_inputs/no_control.fa" - _ = input_validator._validate_fasta_content(fasta_path) + _ = input_validator.validate_fasta_content(fasta_path) assert str(e.value) == f"One of the headers in the {fasta_path} must be '>control'" def test_fasta_without_error(): fasta_path = "tests/data/preprocess/validate_inputs/design_stx2.fa" - assert input_validator._validate_fasta_content(fasta_path) is None + assert input_validator.validate_fasta_content(fasta_path) is None ############################################################################### @@ -82,12 +82,12 @@ def test_fasta_without_error(): @pytest.mark.slow def test_available_url_pass(): - assert input_validator._is_webpage_available("https://example.com") is True + assert input_validator.is_webpage_available("https://example.com") is True @pytest.mark.slow def test_available_url_fail(): - assert input_validator._is_webpage_available("https://example_xxx.com") is False + assert input_validator.is_webpage_available("https://example_xxx.com") is False @pytest.mark.slow From 9eefaaa1a9be3922b60655292c0a310e0f5fc76d Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:44:14 +0900 Subject: [PATCH 18/27] Update `generate_mutation_kmers` to to consider indices not registered in mutation_loci as mutations by replacing them with "@". For example, if there are no mutations in mutation_loci, "=G,=C,-C" and "~G,=G,=C" become "@,@,@" and "@,@,@" respectively, making them the same and ensuring they do not affect clustering. 
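As a standalone illustration of the masking rule described above: cs tags whose operation is not registered in mutation_loci collapse to "@", so reads that differ only at unregistered positions become indistinguishable to the clustering step. The helper mask_unregistered and its inputs are invented for this sketch; the shipped logic (annotate_cs_tag inside generate_mutation_kmers, which additionally compresses insertions and builds 3-mers) is in the diff below.

def mask_unregistered(cssplit: list[str], mutation_loci: list[set[str]]) -> list[str]:
    # Keep a tag only when its operation (+, -, *) is registered at that index;
    # everything else becomes "@" so it cannot separate reads during clustering.
    return [cs if cs[0] in loci else "@" for cs, loci in zip(cssplit, mutation_loci)]

# With no registered mutations, reads that differ only by alignment noise
# collapse to the same masked representation.
loci = [set(), set(), set()]
assert mask_unregistered(["=G", "=C", "-C"], loci) == ["@", "@", "@"]
assert mask_unregistered(["-G", "=G", "=C"], loci) == ["@", "@", "@"]

# A deletion survives only where "-" is registered as a candidate mutation.
loci = [set(), {"-"}, set()]
assert mask_unregistered(["=G", "-C", "=T"], loci) == ["@", "-C", "@"]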
--- src/DAJIN2/core/clustering/kmer_generator.py | 26 ++++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/DAJIN2/core/clustering/kmer_generator.py b/src/DAJIN2/core/clustering/kmer_generator.py index 56c638eb..c9d41476 100644 --- a/src/DAJIN2/core/clustering/kmer_generator.py +++ b/src/DAJIN2/core/clustering/kmer_generator.py @@ -24,23 +24,33 @@ def compress_insertion(cssplit: list[str], index: int, compress_ins: bool) -> st return cssplit +def annotate_cs_tag(cssplit: str, mutation: set[str]) -> str: + cs = cssplit[0] # +, - , *, =, or N + if cs in mutation: + return cssplit + else: + return "@" + + def generate_mutation_kmers( path_sample: Path | str, mutation_loci: list[set[str]], compress_ins: bool = True ) -> Generator[list[str]]: midsv_sample = io.read_jsonl(path_sample) for cssplit in (cs["CSSPLIT"].split(",") for cs in midsv_sample): - mutation_kmers = ["N,N,N"] + mutation_kmers = ["@,@,@"] for i in range(1, len(cssplit) - 1): if mutation_loci[i] == set(): - mutation_kmers.append("N,N,N") + mutation_kmers.append("@,@,@") continue - mutation = cssplit[i][0] # +, - , *, =, N - if mutation == "+": + cs_current = cssplit[i][0] # +, - , *, =, N + if cs_current == "+": cssplit = compress_insertion(cssplit, i, compress_ins) - if mutation in mutation_loci[i]: - kmer = ",".join([cssplit[i - 1], cssplit[i], cssplit[i + 1]]) + if cs_current in mutation_loci[i]: + cs_prev = annotate_cs_tag(cssplit[i - 1], mutation_loci[i - 1]) + cs_next = annotate_cs_tag(cssplit[i + 1], mutation_loci[i + 1]) + kmer = ",".join([cs_prev, cssplit[i], cs_next]) mutation_kmers.append(kmer) else: - mutation_kmers.append("N,N,N") - mutation_kmers.append("N,N,N") + mutation_kmers.append("@,@,@") + mutation_kmers.append("@,@,@") yield mutation_kmers From e50bd57410592f195a7ebf571639c426ec2fba02 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:45:17 +0900 Subject: [PATCH 19/27] Change `coverage_match` from `coverage_not_N` to consider matched sequence as a coverage --- src/DAJIN2/core/consensus/similarity_searcher.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/DAJIN2/core/consensus/similarity_searcher.py b/src/DAJIN2/core/consensus/similarity_searcher.py index b239341b..b6db1a34 100644 --- a/src/DAJIN2/core/consensus/similarity_searcher.py +++ b/src/DAJIN2/core/consensus/similarity_searcher.py @@ -25,10 +25,13 @@ def onehot_by_mutations(midsv_sample: list[dict]) -> dict[str, np.ndarray]: return {mut: np.array(value) for mut, value in mut_onehot.items()} -def calculate_percentage(mut_onehot: dict[str, np.ndarray], coverage: list[int]) -> dict[str, np.ndarray]: +def calculate_percentage( + mut_onehot_sample: dict[str, np.ndarray], coverage_match: np.ndarray[int] +) -> dict[str, np.ndarray]: mut_percentage = dict() - for i, (mut, onehot) in enumerate(mut_onehot.items()): - mut_percentage[mut] = np.sum(onehot, axis=0) / coverage[i] + for mut, onehot in mut_onehot_sample.items(): + x = np.sum(onehot, axis=0) / coverage_match + mut_percentage[mut] = np.where(np.isnan(x), 0, x) return mut_percentage @@ -74,11 +77,11 @@ def filter_control(path_midsv_control: Path, path_midsv_sample: Path) -> list[bo find similar control reads compared to sample reads """ cssplits = (m["CSSPLIT"].split(",") for m in io.read_jsonl(path_midsv_sample)) - coverage_not_N = [sum(1 for item in sublist if item != "N") for sublist in zip(*cssplits)] + coverage_match = np.array([sum(1 for cs in cssplit if cs.startswith("=")) for 
cssplit in zip(*cssplits)]) mut_onehot_sample = onehot_by_mutations(io.read_jsonl(path_midsv_sample)) mut_onehot_control = onehot_by_mutations(io.read_jsonl(path_midsv_control)) - mut_percentage_sample = calculate_percentage(mut_onehot_sample, coverage_not_N) + mut_percentage_sample = calculate_percentage(mut_onehot_sample, coverage_match) values_mask = get_values_to_mask(mut_percentage_sample) mut_onehot_sample_masked = apply_mask(mut_onehot_sample, values_mask) From 0cbec5217fdfba6886979eb86cf970b587e83e5f Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:46:36 +0900 Subject: [PATCH 20/27] Update the `identify_dissimilar_loci` function so that it unconditionally returns True if the 'sample' shows more than 5% variation compared to the 'control'. --- src/DAJIN2/core/preprocess/mutation_extractor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/DAJIN2/core/preprocess/mutation_extractor.py b/src/DAJIN2/core/preprocess/mutation_extractor.py index 130c243c..a52d45b8 100644 --- a/src/DAJIN2/core/preprocess/mutation_extractor.py +++ b/src/DAJIN2/core/preprocess/mutation_extractor.py @@ -89,6 +89,9 @@ def cosine_similarity(x, y): def identify_dissimilar_loci(values_sample, values_control, index: int) -> int: + # If 'sample' has more than 5% variation compared to 'control', unconditionally set it to True + if values_sample[index] - values_control[index] > 5: + return True x = values_sample[index - 5 : index + 6] y = values_control[index - 5 : index + 6] return cosine_similarity(x, y) < 0.95 From 5b16f1a85f28a0a6597c2e7308dd93347a7f9869 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:47:50 +0900 Subject: [PATCH 21/27] Update `generate_mutation_kmers` to to consider indices not registered in mutation_loci as mutations by replacing them with "@". For example, if there are no mutations in mutation_loci, "=G,=C,-C" and "~G,=G,=C" become "@,@,@" and "@,@,@" respectively, making them the same and ensuring they do not affect clustering. --- docs/ROADMAP.md | 18 +++++++++++------- tests/src/clustering/test_kmer_generator.py | 8 ++++---- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 3f4df42d..096bf728 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -26,23 +26,27 @@ ### Preprocess ++ Update `input_validator.py`: The UCSC Blat server sometimes returns a 200 HTTP status code even when an error occurs. In such cases, "Very Early Error" is indicated in the Title. Therefore, we have made it so that it returns False in those situations. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/4ad9c9ef8bd963a6e20c1721480aed0fe7922760) + + Simplyfy `homopolymer_handler.py` for the error detection using cosine similarity [Commit Detail](https://github.com/akikuno/DAJIN2/commit/21c2596805c36074f360285600e60ee76b948908) -+ Use cosine similarity to filter dissimilar loci at `mutation_extractor.py` [Commit Detail](https://github.com/akikuno/DAJIN2/commit/c9f5aa7b48581e58d99fe8c31275c422756aa9f1) ++ Update `mutation_extractor.py` to use cosine similarity to filter dissimilar loci [Commit Detail](https://github.com/akikuno/DAJIN2/commit/c9f5aa7b48581e58d99fe8c31275c422756aa9f1) -### Classification ++ Update the `mutation_extractor.identify_dissimilar_loci` so that it unconditionally returns True if the 'sample' shows more than 5% variation compared to the 'control'. 
[Commit Detail](https://github.com/akikuno/DAJIN2/commit/0cbec5217fdfba6886979eb86cf970b587e83e5f) + ++ Add `preprocess.midsv_caller.convert_consecutive_indels_to_match`: Due to alignment errors, there can be instances where a true match is mistakenly replaced with "insertion following a deletion". For example, although it should be "=C,=T", it gets replaced by "-C,+C|=T". In such cases, a process is performed to revert it back to "=C,=T". [Commit Detail](https://github.com/akikuno/DAJIN2/commit/69c56fa904ef847dc5b0e2dcdb90303409412d0f) -#### Skipped the classification of minor alleles to suppress excessive subdivision of alleles. +### Classification -+ Added `merge_minor_alleles` to reclassify alleles with less than 10 reads. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/a9399f5d4aaa0f6bcdbea0ead39b2aabf8223f5d) ++ Added `merge_minor_alleles` to reclassify alleles with less than 10 reads to suppress excessive subdivision of alleles. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/a9399f5d4aaa0f6bcdbea0ead39b2aabf8223f5d) ### Clustering -#### Skipped the clustering of minor alleles to suppress excessive subdivision of alleles. - -+ Added the function `merge_minor_cluster` to revert labels clustered with less than 10 reads back to the previous labels. ++ Added the function `merge_minor_cluster` to revert labels clustered with less than 10 reads back to the previous labels to suppress excessive subdivision of alleles. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/4bd9f7dd806d192475d8d4f20c1e50c37281d64e) ++ Update `generate_mutation_kmers` to to consider indices not registered in mutation_loci as mutations by replacing them with "@". For example, if there are no mutations in mutation_loci, "=G,=C,-C" and "~G,=G,=C" become "@,@,@" and "@,@,@" respectively, making them the same and ensuring they do not affect clustering. 
[Commit Detail](https://github.com/akikuno/DAJIN2/commit/9eefaaa1a9be3922b60655292c0a310e0f5fc76d) + ### Consensus + Use `LocalOutlierFactor` to filter abnormal control reads [Commit Detail](https://github.com/akikuno/DAJIN2/commit/4bd9f7dd806d192494c48da01fc039902c97a23ddea47dd5f2b42ab475d8d4f20c1e50c37281d64e) diff --git a/tests/src/clustering/test_kmer_generator.py b/tests/src/clustering/test_kmer_generator.py index 42082843..dce464d1 100644 --- a/tests/src/clustering/test_kmer_generator.py +++ b/tests/src/clustering/test_kmer_generator.py @@ -25,8 +25,8 @@ def test_is_generator(mock_io_read_jsonl): def test_with_specific_mutation_loci_and_compress_ins_true(mock_io_read_jsonl): gen = generate_mutation_kmers("some_path", [set(), {"-"}, {"+", "*"}, {"*"}, set()], compress_ins=True) assert list(gen) == [ - ["N,N,N", "=A,-g,+t|t|t|=A", "-g,+I=A,*ac", "+I=A,*ac,N", "N,N,N"], - ["N,N,N", "=A,-g,=T", "N,N,N", "=T,*gt,N", "N,N,N"], + ["@,@,@", "@,-g,+t|t|t|=A", "-g,+I=A,*ac", "+I=A,*ac,@", "@,@,@"], + ["@,@,@", "@,-g,@", "@,@,@", "@,*gt,@", "@,@,@"], ] @@ -34,6 +34,6 @@ def test_with_specific_mutation_loci_and_compress_ins_true(mock_io_read_jsonl): def test_with_specific_mutation_loci_and_compress_ins_false(mock_io_read_jsonl): gen = generate_mutation_kmers("some_path", [set(), {"-"}, {"+", "*"}, {"*"}, set()], compress_ins=False) assert list(gen) == [ - ["N,N,N", "=A,-g,+t|t|t|=A", "-g,+3=A,*ac", "+3=A,*ac,N", "N,N,N"], - ["N,N,N", "=A,-g,=T", "N,N,N", "=T,*gt,N", "N,N,N"], + ["@,@,@", "@,-g,+t|t|t|=A", "-g,+3=A,*ac", "+3=A,*ac,@", "@,@,@"], + ["@,@,@", "@,-g,@", "@,@,@", "@,*gt,@", "@,@,@"], ] From 71d4303743eba9923b44fa8a79ff72d5808eb0d5 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Fri, 5 Jan 2024 12:17:10 +0900 Subject: [PATCH 22/27] rename function of `cssplit_to_base` from `cstag_to_base` --- src/DAJIN2/core/consensus/consensus.py | 4 ++-- tests/src/consensus/test_consensus.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/DAJIN2/core/consensus/consensus.py b/src/DAJIN2/core/consensus/consensus.py index dc6b04c7..90576921 100644 --- a/src/DAJIN2/core/consensus/consensus.py +++ b/src/DAJIN2/core/consensus/consensus.py @@ -90,7 +90,7 @@ def call_percentage(cssplits: list[list[str]], mutation_loci: list[set[str]]) -> ########################################################### -def cstag_to_base(cons: str) -> str: +def cssplit_to_base(cons: str) -> str: if cons.startswith("="): # match return cons.replace("=", "") if cons.startswith("-"): # deletion @@ -115,7 +115,7 @@ def call_sequence(cons_percentage: list[dict[str, float]]) -> str: for i, cons_per in enumerate(cons_percentage): if n_left < i < n_right: cons = max(cons_per, key=cons_per.get) - consensus_sequence.append(cstag_to_base(cons)) + consensus_sequence.append(cssplit_to_base(cons)) else: consensus_sequence.append("N") return "".join(consensus_sequence) diff --git a/tests/src/consensus/test_consensus.py b/tests/src/consensus/test_consensus.py index 1636993e..fc2e154a 100644 --- a/tests/src/consensus/test_consensus.py +++ b/tests/src/consensus/test_consensus.py @@ -4,7 +4,7 @@ replace_sequence_error, adjust_to_100_percent, call_percentage, - cstag_to_base, + cssplit_to_base, call_sequence, ) @@ -60,7 +60,7 @@ def test_call_percentage(): ########################################################### -# Example test cases for cstag_to_base function +# Example test cases for cssplit_to_base function @pytest.mark.parametrize( "cons, expected_output", [ @@ -76,8 +76,8 @@ def 
test_call_percentage(): ("", ""), ], ) -def test_cstag_to_base(cons, expected_output): - assert cstag_to_base(cons) == expected_output +def test_cssplit_to_base(cons, expected_output): + assert cssplit_to_base(cons) == expected_output @pytest.mark.parametrize( From a57862b43c1891109436eeffaae19e7a0db59d7c Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Fri, 5 Jan 2024 12:17:31 +0900 Subject: [PATCH 23/27] fix the function name of `fetch_seq_coordinates` --- tests/src/preprocess/test_genome_fetcher.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/src/preprocess/test_genome_fetcher.py b/tests/src/preprocess/test_genome_fetcher.py index 71dbc5ad..0dae046d 100644 --- a/tests/src/preprocess/test_genome_fetcher.py +++ b/tests/src/preprocess/test_genome_fetcher.py @@ -9,7 +9,7 @@ def test_fetch_seq_coodinates_strand_plus(): genome = "mm39" blat_url = "https://genome.ucsc.edu/cgi-bin/hgBlat" seq = "GTTAGGATTTTCAGGGTGACGACCTCCCAAGTACTCATCTGTGCAAATGT" - test = genome_fetcher._fetch_seq_coordinates(genome, blat_url, seq) + test = genome_fetcher.fetch_seq_coordinates(genome, blat_url, seq) answer = {"chrom": "chr7", "strand": "+", "start": 87141776, "end": 87141825} assert test == answer @@ -19,7 +19,7 @@ def test_fetch_seq_coodinates_strand_minus(): genome = "mm39" blat_url = "https://genome.ucsc.edu/cgi-bin/hgBlat" seq = "ACATTTGCACAGATGAGTACTTGGGAGGTCGTCACCCTGAAAATCCTAAC" - test = genome_fetcher._fetch_seq_coordinates(genome, blat_url, seq) + test = genome_fetcher.fetch_seq_coordinates(genome, blat_url, seq) answer = {"chrom": "chr7", "strand": "-", "start": 87141776, "end": 87141825} assert test == answer @@ -30,8 +30,8 @@ def test_fetch_seq_coodinates_error(): blat_url = "https://genome.ucsc.edu/cgi-bin/hgBlat" seq = "XXXXXXXXXXXXXXXXX" with pytest.raises(ValueError) as e: - genome_fetcher._fetch_seq_coordinates(genome, blat_url, seq) - assert str(e.value) == f"{seq} is not found in {genome}" + genome_fetcher.fetch_seq_coordinates(genome, blat_url, seq) + assert str(e.value) == f"{seq[:10]}... 
is not found in {genome}" @pytest.mark.slow From b0752960def313e237ccf7d44542f9810cad0c00 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Fri, 5 Jan 2024 12:17:59 +0900 Subject: [PATCH 24/27] Add `allele_merger.py` to merge minor alleles --- .../core/classification/allele_merger.py | 97 +++++++++++++++++++ src/DAJIN2/core/classification/classifier.py | 88 +---------------- 2 files changed, 99 insertions(+), 86 deletions(-) create mode 100644 src/DAJIN2/core/classification/allele_merger.py diff --git a/src/DAJIN2/core/classification/allele_merger.py b/src/DAJIN2/core/classification/allele_merger.py new file mode 100644 index 00000000..ca34a592 --- /dev/null +++ b/src/DAJIN2/core/classification/allele_merger.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from itertools import groupby +from collections import defaultdict + + +########################################################## +# merge minor alleles +########################################################## + + +def count_alleles(score_of_each_alleles: list[dict]) -> dict[str, int]: + score_of_each_alleles.sort(key=lambda x: x["QNAME"]) + allele_counts = defaultdict(int) + for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): + allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] + allele_counts[allele_predicted] += 1 + + return allele_counts + + +def extract_minor_alleles(count_allele: dict[str, int], threshold_readnumber: int = 10) -> dict[str, int]: + return {allele: value for allele, value in count_allele.items() if value < threshold_readnumber} + + +def extract_major_alleles(count_allele: dict[str, int], threshold_readnumber: int = 10) -> set[str]: + return {allele for allele, value in count_allele.items() if value >= threshold_readnumber} + + +def sum_dictionaries(dict1, dict2): + """Function to sum the values of two dictionaries based on their keys.""" + return {key: dict1.get(key, 0) + dict2.get(key, 0) for key in set(dict1) | set(dict2)} + + +def split_major_minor_alleles( + score_of_each_alleles: list[dict], minor_alleles: dict[str, int] +) -> tuple[list[dict], list[dict]]: + score_on_minor_alleles = [] + score_on_major_alleles = [] + score_of_each_alleles.sort(key=lambda x: [x["QNAME"]]) + for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): + group = list(group) + allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] + if allele_predicted in minor_alleles: + score_on_minor_alleles.extend(group) + else: + score_on_major_alleles.extend(group) + + return score_on_major_alleles, score_on_minor_alleles + + +def extract_most_major_allele(major_alleles: dict[str, int]) -> str: + return max(major_alleles, key=lambda x: major_alleles[x]) + + +def replace_negative_inf_with_most_major_allele( + score_of_minor_alleles: list[dict], all_allele_counts: dict[str, int] +) -> list[dict]: + most_major_allele = extract_most_major_allele(all_allele_counts) + score_of_minor_alleles = [] + for s in score_of_minor_alleles: + if s["SCORE"] == float("-inf"): + s["ALLELE"] = most_major_allele + score_of_minor_alleles.append(s) + return score_of_minor_alleles + + +def merge_minor_alleles(score_of_each_alleles, threshold_readnumber: int = 10) -> list[dict]: + score_of_each_alleles.sort(key=lambda x: [x["QNAME"]]) + + all_allele_counts = count_alleles(score_of_each_alleles) + minor_allele_counts = extract_minor_alleles(all_allele_counts, threshold_readnumber) + + score_of_alleles_merged, score_of_minor_alleles = split_major_minor_alleles( + 
score_of_each_alleles, minor_allele_counts + ) + + score_of_minor_alleles.sort(key=lambda x: x["QNAME"]) + new_major_allele = set() + for minor_allele in sorted(minor_allele_counts, key=minor_allele_counts.get): + if minor_allele in new_major_allele: + continue + for _, group in groupby(score_of_minor_alleles, key=lambda x: x["QNAME"]): + group = list(group) + allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] + if allele_predicted != minor_allele: + continue + for g in group: + if g["ALLELE"] in minor_allele: + g["SCORE"] = float("-inf") + + allele_counts = count_alleles(score_of_minor_alleles) + new_major_allele |= extract_major_alleles(allele_counts, threshold_readnumber) + + score_of_minor_alleles = replace_negative_inf_with_most_major_allele(score_of_minor_alleles, all_allele_counts) + + return score_of_alleles_merged + score_of_minor_alleles diff --git a/src/DAJIN2/core/classification/classifier.py b/src/DAJIN2/core/classification/classifier.py index 0fc18d2b..52a78331 100644 --- a/src/DAJIN2/core/classification/classifier.py +++ b/src/DAJIN2/core/classification/classifier.py @@ -2,9 +2,9 @@ from pathlib import Path from itertools import groupby -from collections import defaultdict from DAJIN2.utils import io +from DAJIN2.core.classification.allele_merger import merge_minor_alleles def calc_match(cssplit: str) -> float: @@ -17,7 +17,6 @@ def calc_match(cssplit: str) -> float: def score_allele(path_midsv: Path, allele: str) -> list[dict]: midsv_sample = io.read_jsonl(path_midsv) - scored_alleles = [] for dict_midsv in midsv_sample: score = calc_match(dict_midsv["CSSPLIT"]) @@ -34,91 +33,8 @@ def extract_alleles_with_max_score(score_of_each_alleles: list[dict]) -> list[di max_read = max(group, key=lambda x: x["SCORE"]) del max_read["SCORE"] alleles_with_max_score.append(max_read) - return alleles_with_max_score - - -########################################################## -# merge minor alleles -########################################################## - - -def count_alleles(score_of_each_alleles: list[dict]) -> dict[str, int]: - score_of_each_alleles.sort(key=lambda x: x["QNAME"]) - allele_counts = defaultdict(int) - for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): - allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] - allele_counts[allele_predicted] += 1 - return allele_counts - -# マイナーアレル, メジャーアレルの同定 - - -def extract_minor_alleles( - count_allele: dict[str, int], major_alleles: set[str], threshold_readnumber: int = 10 -) -> dict[str, int]: - return { - allele: value - for allele, value in count_allele.items() - if allele not in major_alleles and value < threshold_readnumber - } - - -def update_major_allele( - count_allele: dict[str, int], major_alleles: set[str], threshold_readnumber: int = 10 -) -> set[str]: - new_major_alleles = { - allele - for allele, value in count_allele.items() - if allele not in major_alleles and value >= threshold_readnumber - } - return major_alleles | new_major_alleles - - -# score_of_each_allelesから、マイナーアレルに該当するリードを抽出する - - -def extract_minor_groups(score_of_each_alleles: list[dict], minor_alleles: dict[str, int]) -> list[dict]: - score_on_minor_alleles = [] - score_of_each_alleles.sort(key=lambda x: [x["QNAME"]]) - for _, group in groupby(score_of_each_alleles, key=lambda x: x["QNAME"]): - group = list(group) - allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] - if allele_predicted in minor_alleles: - score_on_minor_alleles.extend(group) - return score_on_minor_alleles - - -def 
merge_minor_alleles(score_of_each_alleles, threshold_readnumber: int = 10) -> list[dict]: - score_of_each_alleles.sort(key=lambda x: [x["QNAME"]]) - major_alleles = set() - minor_alleles = dict() - - allele_counts = count_alleles(score_of_each_alleles) - minor_alleles = extract_minor_alleles(allele_counts, major_alleles, threshold_readnumber) - major_alleles = update_major_allele(allele_counts, major_alleles, threshold_readnumber) - - minor_alleles_sorted = sorted(minor_alleles.items(), key=lambda x: x[1]) - - score_on_minor_alleles = extract_minor_groups(score_of_each_alleles, minor_alleles) - - minor_allele_record = set() - for minor_allele, _ in minor_alleles_sorted: - minor_allele_record.add(minor_allele) - score_on_minor_alleles.sort(key=lambda x: x["QNAME"]) - for _, group in groupby(score_on_minor_alleles, key=lambda x: x["QNAME"]): - group = list(group) - allele_predicted = max(group, key=lambda x: x["SCORE"])["ALLELE"] - if allele_predicted != minor_allele: - continue - for g in group: - if g["ALLELE"] in minor_allele_record: - g["SCORE"] = float("-inf") - allele_counts = count_alleles(score_on_minor_alleles) - minor_alleles = extract_minor_alleles(allele_counts, major_alleles, threshold_readnumber) - major_alleles = update_major_allele(allele_counts, major_alleles, threshold_readnumber) - score_on_minor_alleles = extract_minor_groups(score_on_minor_alleles, minor_alleles) - return score_of_each_alleles + return alleles_with_max_score ########################################################## From a704118d52edc1af527f6e34a5d0230349653b97 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Fri, 5 Jan 2024 12:18:23 +0900 Subject: [PATCH 25/27] update ROADMAP.md --- docs/ROADMAP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 096bf728..66df9133 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -38,7 +38,7 @@ ### Classification -+ Added `merge_minor_alleles` to reclassify alleles with less than 10 reads to suppress excessive subdivision of alleles. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/a9399f5d4aaa0f6bcdbea0ead39b2aabf8223f5d) ++ Added `allele_merger.merge_minor_alleles` to reclassify alleles with less than 10 reads to suppress excessive subdivision of alleles. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/b0752960def313e237ccf7d44542f9810cad0c00) ### Clustering From 7bca4590f97e1858304c3e9fb66c54a279dfcdf0 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Mon, 8 Jan 2024 08:26:24 +0900 Subject: [PATCH 26/27] Add `is_consensus` argument: When it comes to consensus, if the difference between sample and control is more than 20%, it is unconditionally considered a mutation. --- docs/ROADMAP.md | 5 +++ .../core/consensus/mutation_extractor.py | 9 +++++- .../core/preprocess/mutation_extractor.py | 32 +++++++++---------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 66df9133..5a6f803c 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -70,6 +70,7 @@ + Remove minor alleles with predicted insertion + Enhance the Clarity of Insertion Allele Identification. + Develop and Integrate Inversion Detection Capability ++ ReferenceのアレルをFASTA/HTMLディレクトリに保存する ------------- @@ -100,8 +101,12 @@ + [x] Added `similarity_searcher.py` to extract control reads resembling the consensus sequence, thereby enhancing the accuracy of detecting sample-specific mutations. 
[Commit Detail](https://github.com/akikuno/DAJIN2/commit/98a8a45e13835502f7dea2622274da81bbbc3ba3) + [x] Changed the method in `clust_formatter.get_thresholds`` to dynamically define the thresholds for ignoring mutations, instead of using fixed values.[Commit Detail](https://github.com/akikuno/DAJIN2/commit/2249d1601ad619a7db0fcc9ebf79d63f8dcf164b) + + [x] Removed code that was previously commented out [Commit Detail](https://github.com/akikuno/DAJIN2/commit/2249d1601ad619a7db0fcc9ebf79d63f8dcf164b) ++ [x] Add `is_consensus` argument: When it comes to consensus, if the difference between sample and control is more than 20%, it is unconditionally considered a mutation. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/2249d1601ad619a7db0fcc9ebf79d63f8dcf164b) + + ## 🐛 Bug Fixes + None diff --git a/src/DAJIN2/core/consensus/mutation_extractor.py b/src/DAJIN2/core/consensus/mutation_extractor.py index 6a99c5bd..2f1a842e 100644 --- a/src/DAJIN2/core/consensus/mutation_extractor.py +++ b/src/DAJIN2/core/consensus/mutation_extractor.py @@ -107,8 +107,15 @@ def cache_mutation_loci(ARGS, clust_sample: list[dict]) -> None: thresholds = get_thresholds(path_indels_normalized_sample, path_indels_normalized_control) + is_consensus = True + mutation_loci = extract_mutation_loci( - sequence, path_indels_normalized_sample, path_indels_normalized_control, path_knockin, thresholds + sequence, + path_indels_normalized_sample, + path_indels_normalized_control, + path_knockin, + thresholds, + is_consensus, ) io.save_pickle(mutation_loci, Path(path_consensus, f"{allele}_{label}_mutation_loci.pickle")) diff --git a/src/DAJIN2/core/preprocess/mutation_extractor.py b/src/DAJIN2/core/preprocess/mutation_extractor.py index a52d45b8..f898e73f 100644 --- a/src/DAJIN2/core/preprocess/mutation_extractor.py +++ b/src/DAJIN2/core/preprocess/mutation_extractor.py @@ -88,47 +88,44 @@ def cosine_similarity(x, y): return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) -def identify_dissimilar_loci(values_sample, values_control, index: int) -> int: - # If 'sample' has more than 5% variation compared to 'control', unconditionally set it to True - if values_sample[index] - values_control[index] > 5: +def identify_dissimilar_loci(values_sample, values_control, index: int, is_consensus: bool = False) -> int: + # If 'sample' has more than X% variation compared to 'control', unconditionally set it to "dissimilar loci" + threshold = 20 if is_consensus else 5 + if values_sample[index] - values_control[index] > threshold: return True + x = values_sample[index - 5 : index + 6] y = values_control[index - 5 : index + 6] + return cosine_similarity(x, y) < 0.95 -def detect_anomalies(values_sample, values_control, threshold: float) -> set[int]: +def detect_anomalies(values_sample, values_control, threshold: float, is_consensus: bool = False) -> set[int]: """ Detect anomalies and return indices of outliers. - - It classifies data points into two classes: 1 (inliers) and -1 (outliers). - However, depending on how the "normal" class was learned, either of these classes - might represent the true anomalies in the context of this problem. - - This function returns the indices of the class with the higher mean of values_subtract - values, as this class is considered to be the true anomalies. 
""" values_subtract = values_sample - values_control - - """"When the result of subtraction is threshold (%) or less, ignore it as 0""" values_subtract = np.where(values_subtract <= threshold, 0, values_subtract) + values_subtract_reshaped = values_subtract.reshape(-1, 1) kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, n_init="auto").fit(values_subtract_reshaped) threshold = kmeans.cluster_centers_.mean() candidate_loci = {i for i, v in enumerate(values_subtract_reshaped) if v > threshold} - return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i)} + + return {i for i in candidate_loci if identify_dissimilar_loci(values_sample, values_control, i, is_consensus)} def extract_anomal_loci( indels_normalized_sample, indels_normalized_control, thresholds: dict[str, float], + is_consensus: bool = False, ) -> dict[str, set[int]]: results = dict() for mut in {"+", "-", "*"}: values_sample = indels_normalized_sample[mut] values_control = indels_normalized_control[mut] - idx_outliers = detect_anomalies(values_sample, values_control, thresholds[mut]) + idx_outliers = detect_anomalies(values_sample, values_control, thresholds[mut], is_consensus) results[mut] = idx_outliers return results @@ -273,13 +270,16 @@ def extract_mutation_loci( path_indels_normalized_control: Path, path_knockin: Path, thresholds: dict[str, float] = {"*": 0.5, "-": 0.5, "+": 0.5}, + is_consensus: bool = False, ) -> list[set[str]]: indels_normalized_sample = io.load_pickle(path_indels_normalized_sample) indels_normalized_control = io.load_pickle(path_indels_normalized_control) """Extract candidate mutation loci""" indels_normalized_minimize_control = minimize_mutation_counts(indels_normalized_control, indels_normalized_sample) - anomal_loci = extract_anomal_loci(indels_normalized_sample, indels_normalized_minimize_control, thresholds) + anomal_loci = extract_anomal_loci( + indels_normalized_sample, indels_normalized_minimize_control, thresholds, is_consensus + ) """Extract error loci in homopolymer regions""" errors_in_homopolymer = homopolymer_handler.extract_errors( From 40588a1243f3172290d8f40a22ca21426eccaef6 Mon Sep 17 00:00:00 2001 From: akikuno <akikuno@users.noreply.github.com> Date: Mon, 8 Jan 2024 08:26:48 +0900 Subject: [PATCH 27/27] update commit log --- docs/ROADMAP.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 5a6f803c..f3ec894d 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -104,7 +104,7 @@ + [x] Removed code that was previously commented out [Commit Detail](https://github.com/akikuno/DAJIN2/commit/2249d1601ad619a7db0fcc9ebf79d63f8dcf164b) -+ [x] Add `is_consensus` argument: When it comes to consensus, if the difference between sample and control is more than 20%, it is unconditionally considered a mutation. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/2249d1601ad619a7db0fcc9ebf79d63f8dcf164b) ++ [x] Add `is_consensus` argument: When it comes to consensus, if the difference between sample and control is more than 20%, it is unconditionally considered a mutation. [Commit Detail](https://github.com/akikuno/DAJIN2/commit/7bca4590f97e1858304c3e9fb66c54a279dfcdf0) ## 🐛 Bug Fixes