From 6b98645a6380703471e41f6186487c9ff6a0f5d6 Mon Sep 17 00:00:00 2001 From: lenakinzel <93591767+lenakinzel@users.noreply.github.com> Date: Wed, 13 Jul 2022 14:27:08 +0200 Subject: [PATCH] refactor: merge master in branch (#549) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(deps): bump pre-commit/action from 2.0.3 to 3.0.0 (#542) Bumps [pre-commit/action](https://github.com/pre-commit/action) from 2.0.3 to 3.0.0. - [Release notes](https://github.com/pre-commit/action/releases) - [Commits](https://github.com/pre-commit/action/compare/v2.0.3...v3.0.0) --- updated-dependencies: - dependency-name: pre-commit/action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * fix: voc probability (#541) * fix: Replace outdated df append (#544) * replaced all df.append with pd.concat * replaced df.append with pd.concat * replace df.append with pd.concat * replaced df.append with pd.concat * replaced df.append with pd.concat * replaced df.append with pd.concat Co-authored-by: lenakinzel Co-authored-by: Alexander Thomas <77535027+alethomas@users.noreply.github.com> * refactor: fgbio read clipping (#545) * change primer file formatting * change read clipping * fmt * fix primer clipping plot * fix primer clipping plot * fix * fix * typo * test * Update pangolin to 4.1 (#546) Co-authored-by: thomasbtf Co-authored-by: Ann-Kathrin Brüggemann <90249112+AKBrueggemann@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Alexander Thomas <77535027+alethomas@users.noreply.github.com> Co-authored-by: vBassewitz <104062464+vBassewitz@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: thomasbtf Co-authored-by: Ann-Kathrin Brüggemann <90249112+AKBrueggemann@users.noreply.github.com> --- .github/workflows/main.yml | 2 +- resources/lineage-variant-table-formatter.js | 14 ++++- workflow/envs/pangolin.yaml | 2 +- workflow/rules/read_clipping.smk | 51 ++++++----------- .../{bed-to-bedpe.py => bed-to-tsv.py} | 5 +- workflow/scripts/collect_lineage_calls.py | 2 +- .../scripts/evaluate-strain-call-error.py | 2 +- .../extract-strains-from-gisaid-provision.py | 4 +- .../scripts/generate-lineage-variant-table.py | 55 +++++++++++++------ workflow/scripts/generate-overview-table.py | 2 +- workflow/scripts/plot-all-coverage.py | 2 +- workflow/scripts/plot-assembly-comparison.py | 52 +++++++++++------- .../plot-dependency-of-pangolin-call.py | 2 +- workflow/scripts/plot-pangolin-conflict.py | 2 +- workflow/scripts/plot-primer-clipping.py | 12 ++-- 15 files changed, 122 insertions(+), 87 deletions(-) rename workflow/scripts/{bed-to-bedpe.py => bed-to-tsv.py} (85%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 45b1b12ad..0be53d2b3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -54,7 +54,7 @@ jobs: steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v3 - - uses: pre-commit/action@v2.0.3 + - uses: pre-commit/action@v3.0.0 Technology-Tests: runs-on: ubuntu-latest diff --git a/resources/lineage-variant-table-formatter.js b/resources/lineage-variant-table-formatter.js index c980d5682..862fe9d67 100644 --- a/resources/lineage-variant-table-formatter.js +++ b/resources/lineage-variant-table-formatter.js @@ -117,13 +117,23 @@ Probability: function (prob) { prob = parseFloat(prob) if (!isNaN(prob)) { - var lighting = (0.9 - prob * 0.4) * 100; + if (prob < 0.5) { + var lighting = (0.5 + (prob * 0.8)) * 100; + var color = "10, 85%, " + lighting + "%" + + } else if (prob < 0.95 && prob >= 0.5){ + var lighting = ((0.9 * prob) + 0.05) * 100; + var color = "47, 100%, " + lighting + "%" + } else { + var lighting = (8.5 - (prob * 8)) * 100; + var color = "138, 72%, " + lighting + "%" + } if (prob < 0.1) { prob = prob.toExponential(2) } else { prob = prob.toFixed(2) } - return `${prob}`; + return `${prob}`; } else { return " " } diff --git a/workflow/envs/pangolin.yaml b/workflow/envs/pangolin.yaml index a36746fcb..2af1a4e7b 100644 --- a/workflow/envs/pangolin.yaml +++ b/workflow/envs/pangolin.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - pangolin =4.0.6 + - pangolin =4.1 diff --git a/workflow/rules/read_clipping.smk b/workflow/rules/read_clipping.smk index 8abdde7ed..4502dfcbb 100644 --- a/workflow/rules/read_clipping.smk +++ b/workflow/rules/read_clipping.smk @@ -21,63 +21,48 @@ rule samtools_sort: "0.74.0/bio/samtools/sort" -rule bed_to_bedpe: +rule bed_to_tsv: input: check_bed_for_URL(config["preprocessing"]["amplicon-primers"]), output: - "resources/primer.bedpe", + "resources/primer.tsv", log: "logs/bed-to-bedpe.log", conda: "../envs/python.yaml" script: - "../scripts/bed-to-bedpe.py" + "../scripts/bed-to-tsv.py" -rule bamclipper: +rule trim_primers_fgbio: input: bam="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam", bai="results/{date}/read-sorted/{read_type}~position/{sample}.initial.bam.bai", - bedpe="resources/primer.bedpe", - output: - temp( - "results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam" + ref="resources/genomes/{reference}.fasta".format( + reference=config["preprocessing"]["amplicon-reference"] ), - params: - output_dir=get_output_dir, - cwd=lambda w: os.getcwd(), - bed_path=lambda w, input: os.path.join(os.getcwd(), input.bedpe), - bam=lambda w, input: os.path.basename(input.bam), + primers="resources/primer.tsv", + output: + "results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam", log: - "logs/{date}/bamclipper/{read_type}/{sample}.log", + "logs/{date}/fgbio_primer_clipping/{read_type}/{sample}.log", conda: - "../envs/bamclipper.yaml" - threads: 6 + "../envs/fgbio.yaml" shell: - "(cp {input.bam} {params.output_dir} &&" - " cp {input.bai} {params.output_dir} &&" - " cd {params.output_dir} &&" - " bamclipper.sh -b {params.bam} -p {params.bed_path} -n {threads} -u 5 -d 5) " - " > {params.cwd}/{log} 2>&1" + "fgbio TrimPrimers -i {input.bam} -p {input.primers} -o {output} -H true -r {input.ref} > {log} 2>&1" -rule fgbio: +rule filter_bam_fgbio: input: - bam="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam", - bai="results/{date}/read-clipping/softclipped/{read_type}/{sample}/{sample}.initial.primerclipped.bam.bai", - ref="resources/genomes/{reference}.fasta".format( - reference=config["preprocessing"]["amplicon-reference"] - ), + bam="results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam", output: - temp( - "results/{date}/read-clipping/hardclipped/{read_type}/{sample}/{sample}.bam" - ), + "results/{date}/read-clipping/hc_filtered/{read_type}/{sample}/{sample}.bam", log: - "logs/{date}/fgbio/{read_type}/{sample}.log", + "logs/{date}/fgbio_filter_bam/{read_type}/{sample}.log", conda: "../envs/fgbio.yaml" shell: - "fgbio --sam-validation-stringency=LENIENT ClipBam -i {input.bam} -o {output} -H true -r {input.ref} > {log} 2>&1" + "fgbio FilterBam -i {input.bam} -o {output} --min-insert-size 100 --remove-single-end-mappings > {log} 2>&1" rule samtools_fastq_pe: @@ -132,7 +117,7 @@ rule plot_primer_clipping: ), params: samples=lambda wildcards: get_samples_for_date(wildcards.date), - bedpe="resources/primer.bedpe", + bedpe="resources/primer.tsv", log: "logs/{date}/plot-primer-clipping.log", conda: diff --git a/workflow/scripts/bed-to-bedpe.py b/workflow/scripts/bed-to-tsv.py similarity index 85% rename from workflow/scripts/bed-to-bedpe.py rename to workflow/scripts/bed-to-tsv.py index 4a3b343df..098a18cb7 100644 --- a/workflow/scripts/bed-to-bedpe.py +++ b/workflow/scripts/bed-to-tsv.py @@ -23,10 +23,9 @@ df_sense["chrom"], df_sense["start"], df_sense["end"], - df_antisense["chrom"], df_antisense["start"], df_antisense["end"], ] -headers = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"] +headers = ["chrom", "left_start", "left_end", "right_start", "right_end"] df_bedpe = pd.concat(data, axis=1, keys=headers) -df_bedpe.to_csv(snakemake.output[0], header=None, index=None, sep="\t", mode="a") +df_bedpe.to_csv(snakemake.output[0], index=None, sep="\t", mode="a") diff --git a/workflow/scripts/collect_lineage_calls.py b/workflow/scripts/collect_lineage_calls.py index b897b0e78..7dc86ce28 100644 --- a/workflow/scripts/collect_lineage_calls.py +++ b/workflow/scripts/collect_lineage_calls.py @@ -57,7 +57,7 @@ def collect_calls(sm_input, sm_output, states, lineage, number, length): ] # bring them together - call = pangolin_calls.append(call) + call = pd.concat([pangolin_calls, call]) call.to_csv(sm_output, sep="\t", index=False) diff --git a/workflow/scripts/evaluate-strain-call-error.py b/workflow/scripts/evaluate-strain-call-error.py index f91397555..7b202af7a 100644 --- a/workflow/scripts/evaluate-strain-call-error.py +++ b/workflow/scripts/evaluate-strain-call-error.py @@ -62,7 +62,7 @@ def eval_error(paths, sm_output, max_reads, prefix, separator, percentage, load_ df = df.merge(org_mix_df, how="outer").fillna(0) - results_df = results_df.append(df) + results_df = pd.concat([results_df, df]) for sample in results_df["mix"].unique(): sample_rmse = rmse( diff --git a/workflow/scripts/extract-strains-from-gisaid-provision.py b/workflow/scripts/extract-strains-from-gisaid-provision.py index c737fdccd..7ed832482 100644 --- a/workflow/scripts/extract-strains-from-gisaid-provision.py +++ b/workflow/scripts/extract-strains-from-gisaid-provision.py @@ -21,7 +21,9 @@ def extract_strains_from_provision( chunks = pd.read_json(path_to_provision, lines=True, chunksize=9000) for i, chunk in enumerate(chunks): print(f"Parsing chunk {i}", file=sys.stderr) - provision = provision.append(select_oldest_strains(chunk), ignore_index=True) + provision = pd.concat( + [provision, select_oldest_strains(chunk)], ignore_index=True + ) provision = select_oldest_strains(provision) # save strain genomes diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index 6784c48a0..94b5387c1 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -21,11 +21,19 @@ def phred_to_prob(phred): # np.prod returns 1's as values for a pd series with NaN's. A list would return NaN's -def prod_prob_not_present(probs): +# switched for use of minimum +# def prod_prob_not_present(probs): +# if pd.isna(probs).any(): +# return pd.NA +# else: +# return np.prod(probs) + + +def min_prob_not_present(probs): if pd.isna(probs).any(): return pd.NA else: - return np.prod(probs) + return np.min(probs) def has_numbers(inputString): @@ -67,21 +75,36 @@ def rename_enumeration(list_length): lineages = record.info["LINEAGES"] for signature in signatures: # generate df with all signatures + VAF and Prob_not_present from calculation - variants_df = variants_df.append( - { - "Mutations": signature, - "Frequency": vaf, - "ReadDepth": dp, - "Prob_not_present": prob_not_present, - }, + variants_df = pd.concat( + [ + variants_df, + pd.DataFrame( + { + "Frequency": vaf, + "Mutations": signature, + "Prob_not_present": prob_not_present, + "ReadDepth": dp, + }, + index=[0], + ), + ], ignore_index=True, ) - # generate df with lineage matrix for all signatures - lineage_df = lineage_df.append( - { - "Mutations": signature, - **{lineage.replace(".", " "): "x" for lineage in lineages}, - }, + + lineage_df = pd.concat( + [ + lineage_df, + pd.DataFrame( + { + "Mutations": [signature], + **{ + lineage.replace(".", " "): "x" + for lineage in lineages + }, + }, + index=[0], + ), + ], ignore_index=True, ) @@ -89,7 +112,7 @@ def rename_enumeration(list_length): variants_df = variants_df.groupby(["Mutations"]).agg( func={ "Frequency": lambda x: min(sum(x), 1.0), - "Prob_not_present": prod_prob_not_present, + "Prob_not_present": min_prob_not_present, "ReadDepth": np.min, }, axis=1, diff --git a/workflow/scripts/generate-overview-table.py b/workflow/scripts/generate-overview-table.py index 28683493c..448f61f12 100644 --- a/workflow/scripts/generate-overview-table.py +++ b/workflow/scripts/generate-overview-table.py @@ -65,7 +65,7 @@ def is_patient_report(): columns=[eukaryota, bacteria, viruses, sars_cov2, unclassified] ).fillna(0) kraken_results["sample"] = sample - species_columns = species_columns.append(kraken_results, ignore_index=True) + species_columns = pd.concat([species_columns, kraken_results], ignore_index=True) data = data.join(species_columns.set_index("sample")) diff --git a/workflow/scripts/plot-all-coverage.py b/workflow/scripts/plot-all-coverage.py index c7af5699c..065d92667 100644 --- a/workflow/scripts/plot-all-coverage.py +++ b/workflow/scripts/plot-all-coverage.py @@ -23,7 +23,7 @@ def plot_coverage(sm_input, sm_output, min_coverage): sample_df["Sample"] = sample_df["#CHROM"].apply(lambda x: str(x).split(".")[0]) - coverage = coverage.append(sample_df, ignore_index=True) + coverage = pd.concat([coverage, sample_df], ignore_index=True) coverage["# Coverage"] = coverage.Coverage.apply( lambda x: f"< {min_coverage}" diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py index 27ee60060..3754784eb 100644 --- a/workflow/scripts/plot-assembly-comparison.py +++ b/workflow/scripts/plot-assembly-comparison.py @@ -18,29 +18,43 @@ def register_lengths(sample, file_list, state, amplicon_state, data): for file, assembler in zip(file_list, snakemake.params.assembler): if state in ("initial", "scaffolded"): with pysam.FastxFile(file) as infile: - data = data.append( - { - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": max(len(contig.sequence) for contig in infile), - "State": state, - }, + data = pd.concat( + [ + data, + pd.DataFrame( + { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": max( + len(contig.sequence) for contig in infile + ), + "State": state, + }, + index=[0], + ), + ], ignore_index=True, ) else: quastDf = pd.read_csv(file, sep="\t") - data = data.append( - { - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": quastDf.loc[0, "N50"], - "State": "N50", - "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] - if "Genome fraction (%)" in quastDf.columns - else float("nan"), - }, + data = pd.concat( + [ + data, + pd.DataFrame( + { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": quastDf.loc[0, "N50"], + "State": "N50", + "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] + if "Genome fraction (%)" in quastDf.columns + else float("nan"), + }, + index=[0], + ), + ], ignore_index=True, ) return data diff --git a/workflow/scripts/plot-dependency-of-pangolin-call.py b/workflow/scripts/plot-dependency-of-pangolin-call.py index 99117ab89..f8b5f03c0 100644 --- a/workflow/scripts/plot-dependency-of-pangolin-call.py +++ b/workflow/scripts/plot-dependency-of-pangolin-call.py @@ -35,7 +35,7 @@ def plot_dependency_of_pangolin_call(sm_input, sm_output): pangolin_output["mixture_content"] = input.split(MIXTURE_PREFIX, 1)[-1].split( "." )[0] - all_sampes = all_sampes.append(pangolin_output, ignore_index=True) + all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["mixture_content"] = all_sampes["mixture_content"].str.replace("-", ".") diff --git a/workflow/scripts/plot-pangolin-conflict.py b/workflow/scripts/plot-pangolin-conflict.py index e653e0b77..2d7a28f55 100644 --- a/workflow/scripts/plot-pangolin-conflict.py +++ b/workflow/scripts/plot-pangolin-conflict.py @@ -35,7 +35,7 @@ def plot_pangolin_conflict(sm_input, sm_output): pangolin_output = pd.read_csv(input) pangolin_output["true_lineage"] = true_lineage pangolin_output["true_lineage_percent"] = percent - all_sampes = all_sampes.append(pangolin_output, ignore_index=True) + all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["correct_lineage_assignment"] = ( all_sampes["lineage"] == all_sampes["true_lineage"] diff --git a/workflow/scripts/plot-primer-clipping.py b/workflow/scripts/plot-primer-clipping.py index d30f4dea7..a99b55a8f 100644 --- a/workflow/scripts/plot-primer-clipping.py +++ b/workflow/scripts/plot-primer-clipping.py @@ -13,8 +13,10 @@ from intervaltree import IntervalTree # read primer bedpe to df -PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) -PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) +PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=0) +print(PRIMER) +PRIMER.drop(PRIMER.columns[[0]], axis=1, inplace=True) +print(PRIMER) PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] # convert df to interval trees @@ -116,7 +118,7 @@ def count_intervals(file): "uncut primer within", "cut primer exact", "cut primer within", - "no mathing win", + "no matching win", ], } ) @@ -147,13 +149,13 @@ def plot_classes(counters): counts_before = count_intervals(file) counts_before["sample"] = sample counts_before["state"] = "before" - all_df = all_df.append(counts_before, ignore_index=True) + all_df = pd.concat([all_df, counts_before], ignore_index=True) for sample, file in iter_with_samples(snakemake.input.clipped): counts_after = count_intervals(file) counts_after["sample"] = sample counts_after["state"] = "after" - all_df = all_df.append(counts_after, ignore_index=True) + all_df = pd.concat([all_df, counts_after], ignore_index=True) bars, text = plot_classes(all_df)