diff --git a/workflow/Snakefile b/workflow/Snakefile index bf38625b1..0a462c06b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -43,6 +43,8 @@ include: "rules/generate_output.smk" include: "rules/benchmarking.smk" include: "rules/long_read.smk" include: "rules/lineage_variant_calling.smk" +include: "rules/amplicon_stats.smk" +include: "rules/freyja.smk" if config["data-handling"]["use-data-handling"]: diff --git a/workflow/envs/freyja.yaml b/workflow/envs/freyja.yaml new file mode 100644 index 000000000..a1b4f0b1a --- /dev/null +++ b/workflow/envs/freyja.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - freyja =1.4.5 \ No newline at end of file diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml index 2b748a104..516b849a8 100644 --- a/workflow/envs/pysam.yaml +++ b/workflow/envs/pysam.yaml @@ -9,3 +9,4 @@ dependencies: - requests =2.26 - dnachisel =3.2 - gffutils =0.10 + - natsort =5.5.0 diff --git a/workflow/report/lineage-variant-overview.rst b/workflow/report/lineage-variant-overview.rst new file mode 100644 index 000000000..d0094abec --- /dev/null +++ b/workflow/report/lineage-variant-overview.rst @@ -0,0 +1 @@ +Overview report for all given variants from covariants.org \ No newline at end of file diff --git a/workflow/rules/amplicon_stats.smk b/workflow/rules/amplicon_stats.smk new file mode 100644 index 000000000..b84af7a8c --- /dev/null +++ b/workflow/rules/amplicon_stats.smk @@ -0,0 +1,87 @@ +rule get_ampliconstats: + input: + bed="resources/primer-v4.bed", + bam="results/{date}/read-sorted/pe~position/{sample}.hardclipped.bam", + output: + "results/{date}/ampliconstats/{sample}.ampliconstats.txt", + log: + "logs/{date}/samtools/ampliconstats/{sample}.log", + conda: + "../envs/samtools.yaml" + shell: + "samtools ampliconstats {input.bed} {input.bam} > {output} 2> {log}" + + +rule plot_amplicon_coverage: + input: + bedpe="resources/primer.bedpe", + amp_stats=lambda wildcards: expand( + "results/{{date}}/ampliconstats/{sample}.ampliconstats.txt", + sample=get_samples_for_date(wildcards.date), + ), + names="resources/ww-sample-name-dict.csv", + weeks="resources/ww-sample-week-dict.csv", + output: + plot="results/{date}/plots/amplicon-abundance/all.svg", + stats="results/{date}/tables/amplicon-abundance/all.csv", + params: + samples=lambda wildcards: get_samples_for_date(wildcards.date), + log: + "logs/{date}/plot-amplicon-coverage.log", + conda: + "../envs/python.yaml" + script: + "../scripts/plot-amplicon-coverage.py" + + +rule amplicon_profiles: + input: + stats="results/{date}/ampliconstats/{sample}.ampliconstats.txt", + var_df="results/{date}/lineage-variant-report/{sample}.csv", + output: + profiles="results/{date}/amplicon-profiles-sorted/{sample}.txt", + log: + "logs/{date}/amplicon-profiles/{sample}.txt", + threads: 32 + conda: + "../envs/python.yaml" + script: + "../scripts/get-amplicon-profiles.py" + + +rule amplicon_profiles_snv: + input: + csv=lambda wildcards: expand( + "results/{{date}}/lineage-variant-report/{sample}.csv", + sample=get_samples_for_date(wildcards.date), + ), + output: + ampprofile="results/{date}/amplicon-profiles-snv/all.csv", + log: + "logs/{date}/amplicon-profiles-snv/all.log", + params: + sample=lambda wildcards: get_samples_for_date(wildcards.date), + threads: 32 + conda: + "../envs/python.yaml" + script: + "../scripts/get-amplicon-profiles-snv.py" + + +rule get_amplicon_stat_output: + input: + expand( + "results/{date}/amplicon-profiles-snv/all.csv", + date="2023-09-08", + 
sample=get_samples_for_date("2023-09-08"), + ), + expand( + "results/{date}/plots/amplicon-abundance/all.svg", + date="2023-09-08", + sample=get_samples_for_date("2023-09-08"), + ), + expand( + "results/{date}/lineage-abundance/all.demix.csv", + date="2023-09-08", + sample=get_samples_for_date("2023-09-08"), + ), \ No newline at end of file diff --git a/workflow/rules/freyja.smk b/workflow/rules/freyja.smk new file mode 100644 index 000000000..057139fb3 --- /dev/null +++ b/workflow/rules/freyja.smk @@ -0,0 +1,56 @@ +rule freyja_variants: + input: + bam="results/{date}/read-clipping/hardclipped/pe/{sample}/{sample}.bam", + ref="resources/genomes/MN908947.fasta", + output: + var="results/{date}/lineage-abundance/freyja/{sample}.variants.tsv", + depth="results/{date}/lineage-abundance/freyja/{sample}.depths.txt", + log: + "logs/{date}/freyja/variants/{sample}.log" + params: + var=lambda w, output: os.path.splitext(output.var)[0], + conda: + "../envs/freyja.yaml" + shell: + "freyja variants {input.bam} --variants {params.var} --depths {output.depth} --ref {input.ref} > {log} 2>&1" + + +rule freyja_demix: + input: + var="results/{date}/lineage-abundance/freyja/{sample}.variants.tsv", + depth="results/{date}/lineage-abundance/freyja/{sample}.depths.txt", + output: + "results/{date}/lineage-abundance/freyja/{sample}.demix.txt", + log: + "logs/{date}/freyja/demix/{sample}.log" + conda: + "../envs/freyja.yaml" + shell: + "freyja demix {input.var} {input.depth} --output {output} > {log} 2>&1" + + +rule aggregate_freyja: + input: + demix=lambda wildcards: expand( + "results/{{date}}/lineage-abundance/freyja/{sample}.demix.txt", + sample=get_timeseries_samples("sample"), + ), + output: + all="results/{date}/lineage-abundance/freyja/all.demix.csv", + all_count="results/{date}/lineage-abundance/freyja/all.count.csv", + pivot="results/{date}/lineage-abundance/freyja/all.pivot.csv", + log: + "logs/{date}/aggregate_freyja/all.log", + params: + sample=lambda wildcards: get_timeseries_samples("sample"), + location=lambda wildcards: get_timeseries_samples("location"), + timestamp=lambda wildcards: get_timeseries_samples("timestamp"), + conda: + "../envs/python.yaml" + script: + "../scripts/aggregate-freyja.py" + + +rule get_aggregated_freyja: + input: + "results/2022-12-21/lineage-abundance/freyja/all.demix.csv", \ No newline at end of file diff --git a/workflow/rules/lineage_variant_calling.smk b/workflow/rules/lineage_variant_calling.smk index f50053f1f..8ae429fe7 100644 --- a/workflow/rules/lineage_variant_calling.smk +++ b/workflow/rules/lineage_variant_calling.smk @@ -48,13 +48,62 @@ rule generate_lineage_variant_table: "../scripts/generate-lineage-variant-table.py" -rule get_lineage_variant_table: +rule aggregate_lineage_variants: + input: + csv=lambda wildcards: expand( + "results/{{date}}/lineage-variant-report/{sample}.csv", + sample=get_samples_for_date(wildcards.date), + ), + annotation="resources/annotation_known_variants.gff.gz", + output: + "results/{date}/lineage-variant-overview/all.csv", + log: + "logs/{date}/aggregate-lineage-variants/all.log", + params: + sample=lambda wildcards: get_samples_for_date(wildcards.date) + conda: + "../envs/pysam.yaml" + script: + "../scripts/aggreagte-lineage-variants.py" + + +rule get_aggregated_lineage_variant_table: input: expand( - "results/{date}/lineage-variant-report/{sample}.csv", - date="2022-05-16", - sample=get_samples_for_date("2022-05-16"), + "results/{date}/lineage-variant-report/all.csv", + date="2022-12-21", + ), + + +rule 
render_datavzrd_config: + input: + template=workflow.source_path("../../resources/lineage-variant-overview.template.datavzrd.yaml"), + table="results/{date}/lineage-variant-overview/all.csv", + output: + "results/{date}/datavzrd/variant-table-model.yaml", + log: + "logs/{date}/yte/render-datavzrd-config/variant-table-model.log", + template_engine: + "yte" + + +rule render_lineage_variant_table: + input: + config="results/{date}/datavzrd/variant-table-model.yaml", + table="results/{date}/lineage-variant-overview/all.csv", + output: + report( + directory("results/{date}/lineage-variant-report/all"), + htmlindex="index.html", + caption="../report/lineage-variant-overview.rst", + category="2. Variant Call Details", + subcategory=" VOC variant overview", ), + log: + "logs/{date}/lineage-variant-overview/all.log", + wrapper: + "v2.1.0/utils/datavzrd" + use rule overview_table_html as generate_lineage_variant_report with: diff --git a/workflow/scripts/aggreagte-lineage-variants.py b/workflow/scripts/aggreagte-lineage-variants.py new file mode 100644 index 000000000..7413cd68f --- /dev/null +++ b/workflow/scripts/aggreagte-lineage-variants.py @@ -0,0 +1,57 @@ +# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. + +import sys +import pandas as pd +import numpy as np +import gffutils +from natsort import natsort_keygen + +sys.stderr = open(snakemake.log[0], "w") + +data = pd.DataFrame() + +for file, sample in zip(snakemake.input.csv, snakemake.params.sample): + data = pd.concat( + [ + data, + pd.read_csv(file, usecols=["Mutations","Probability","Frequency","ReadDepth"]) + ] + ) + data.rename(columns={"Probability": sample + ": " + "Probability"}, inplace=True) + data.rename(columns={"Frequency": sample + ": " + "VAF"}, inplace=True) + data.rename(columns={"ReadDepth": sample + ": " + "Read Depth"}, inplace=True) + +data = data.groupby(["Mutations"]).agg(func={column: np.max for column in data.columns}) +# add feature column for sorting +data["Features"] = data.index.to_series().str.extract(r"(.+)[:].+|\*") + +# position of variant for sorting and change type +data["Position"] = data.index.to_series().str.extract( + r"(.*:?[A-Z]+|\*$|-)([0-9]+)([A-Z]+$|\*$|-)$" +)[1] +# data = data.astype({"Position": "int64"}) + +# generate sorting list from .gff with correct order of features +gff = gffutils.create_db(snakemake.input.annotation, dbfn=":memory:") +gene_start = {gene["gene_name"][0]: gene.start for gene in gff.features_of_type("gene")} +sorter = [k[0] for k in sorted(gene_start.items(), key=lambda item: item[1])] +sorterIndex = dict(zip(sorter, range(len(sorter)))) +data["Features_Rank"] = data["Features"].map(sorterIndex) + +data["Features_Rank"].replace(np.NaN, 12, inplace=True) + +data.drop(index=["Lineage", "Similarity"], inplace=True) +data.sort_values( + by=["Features_Rank", "Position"], + ascending=[True, True], + na_position="last", + inplace=True, + key=natsort_keygen(), +) + +data.drop(columns=["Mutations", "Features_Rank"], index=[], inplace=True) +print(data) +data.to_csv(snakemake.output[0]) \ No newline at end of file diff --git a/workflow/scripts/aggregate-freyja.py b/workflow/scripts/aggregate-freyja.py new file mode 100644 index 000000000..6e491b630 --- /dev/null +++ b/workflow/scripts/aggregate-freyja.py @@ -0,0 +1,50 @@ +# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes 
Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. + +import pandas as pd +import numpy as np +from natsort import index_natsorted + +data = pd.DataFrame() +data2 = pd.DataFrame() + +for file, sample, location, timestamp in zip(snakemake.input.demix, snakemake.params.sample, snakemake.params.location, snakemake.params.timestamp): + lineages = [] + abundances = [] + # name, num = sample.split("-") + with open(file, "r") as infile: + for line in infile.read().splitlines(): + if line.startswith("lineages"): + lineages = line.split("\t")[1].split(" ") + elif line.startswith("abundances"): + abundances = line.split("\t")[1].split(" ") + rounded_abundances = [ '%.2f' % float(elem) for elem in abundances ] + data.at[timestamp, location] = ", ".join([lin + ":" + ab for lin, ab in zip(lineages, rounded_abundances)]) + data2.at[timestamp, location] = len(lineages) + +data.sort_index(key= lambda x: np.argsort(index_natsorted(data.index)), inplace=True) +data.to_csv(snakemake.output.all) +data2.to_csv(snakemake.output.all_count) + + +data = pd.DataFrame() + +for file, sample in zip(snakemake.input.demix, snakemake.params.sample): + lineages = [] + abundances = [] + with open(file, "r") as infile: + for line in infile.read().splitlines(): + if line.startswith("lineages"): + lineages = line.split("\t")[1].split(" ") + elif line.startswith("abundances"): + abundances = line.split("\t")[1].split(" ") + rounded_abundances = [ '%.2f' % float(elem) for elem in abundances ] + for lin, ab in zip(lineages, rounded_abundances): + data.at[lin, sample] = ab + +data.sort_index(key= lambda x: np.argsort(index_natsorted(data.index)), inplace=True) +# data.sort_index(key= lambda x: np.argsort(data.columns.to_series().str[1:].astype(int)), inplace=True) +data = data[sorted(data.columns, key=lambda x: tuple(map(int,x[-2:])))] +data.to_csv(snakemake.output.pivot) \ No newline at end of file diff --git a/workflow/scripts/generate-amplicon-profiles.py b/workflow/scripts/generate-amplicon-profiles.py new file mode 100644 index 000000000..3af9f0b7e --- /dev/null +++ b/workflow/scripts/generate-amplicon-profiles.py @@ -0,0 +1,298 @@ + # Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. 
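+#
+# What the code below does, as written: it loads the lineage candidate
+# variants from all.sorted.bcf, derives the amplicon intervals from
+# primer.bedpe, assigns each signature mutation to the amplicon(s) covering
+# its position, and collapses the per-lineage "x" markers into per-amplicon
+# mutation profiles written to amp_profiles.csv. The BCF, the ampliconstats
+# file and the bedpe are read from hardcoded local paths rather than from
+# snakemake.input, so this version only runs against that local checkout.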
+ +import pandas as pd +import numpy as np +from pysam import VariantFile +from natsort import index_natsorted, ns + +bcf_in = VariantFile("/local/work/uncovar-wastewater/resources/lineage-candidate-variants/all.sorted.bcf") # auto-detect input format + +mutations = pd.DataFrame() +lineages = pd.DataFrame() + +for rec in bcf_in.fetch(): + lineage_dict = {lineage: "x" for lineage in rec.info["LINEAGES"]} + mutations = pd.concat( + [ + mutations, + pd.DataFrame( + { + "mutation": rec.info["SIGNATURES"], + "position": rec.pos, + }, + + ), + ], + ignore_index=True + ) + for signature in rec.info["SIGNATURES"]: + lineages = pd.concat( + [ + lineages, + pd.DataFrame( + { + "mutation": signature, + **{ + lineage.replace(".", " "): "x" + for lineage in rec.info["LINEAGES"] + }, + }, + index=[0], + ), + ], + ignore_index=True, + ) + + +lineages = lineages.fillna(0) +lineages = lineages.replace({"x": 1}) +lineages = ( + lineages.groupby(["mutation"]) + .agg(func={column: np.max for column in lineages.columns}) + .reset_index(drop=True) +) +lineages = lineages.replace({1: "x", 0: np.NaN}) +lineages["count"] = lineages.loc[:, lineages.columns!='mutation'].count(axis=1) +print(lineages) + + + +mutations = mutations.groupby(["mutation"]).agg(func={column: np.max for column in mutations.columns}).reset_index(drop=True) +mutations.sort_values(by=["position"], inplace=True) +mutations.to_csv("test.csv", index=False) +mutations["amplicon"] = "" +print(mutations) + +mutations.set_index("mutation", inplace=True) +lineages.set_index("mutation", inplace=True) + +mutations = mutations.merge(lineages, left_index=True, right_index=True) + +with open("/local/work/uncovar-wastewater/D01.amplicon_stats.txt", "r") as statfile: + for line in statfile.read().splitlines(): + if line.startswith("FREADS"): + coverage = line.split("\t")[2:] + +print(len(coverage)) + +amplicons = pd.read_csv("/local/work/uncovar-wastewater/resources/primer.bedpe", delimiter="\t", names=["ref_name_1", "p1_start", "p1_end", "ref_name_2", "p2_start", "p2_end"]) +amplicons.drop(columns=["ref_name_1", "ref_name_2"], inplace=True) +amplicons["amp_start"] = amplicons["p1_end"] + 1 +amplicons["amp_end"] = amplicons["p2_end"] - 1 +amplicons["amp_len"] = amplicons["amp_end"] - amplicons["amp_start"] +amplicons["amp_cov"] = coverage + +print(amplicons) + +for index1, row1 in mutations.iterrows(): + for index2, row2 in amplicons.iterrows(): + if row1["position"] in range(int(row2['amp_start']), int(row2['amp_end'])): + if mutations.loc[index1]["amplicon"] == "": + mutations.at[index1, "amplicon"] = str(index2) + else: + mutations.at[index1, "amplicon"] += "," + str(index2) + +all_columns = mutations.columns +first_columns = ["position", "amplicon", "count"] +rest_columns = [item for item in all_columns.sort_values() if item not in first_columns] +mutations = mutations[[*first_columns, *rest_columns]] + + +mutations = (mutations.drop('amplicon', axis=1) + .join + ( + mutations.amplicon + .str + .split(",", expand=True) + .stack() + .reset_index(drop=True, level=1) + .rename('amplicon') + )) + +mutations = mutations[[*first_columns, *rest_columns]] +mutations.sort_values(by=["position", "amplicon"], inplace=True) +mutations.to_csv("amplicons.tsv", index=True, sep="\t") + + + +amplicon_profiles = mutations + + +# amplicon_profiles.set_index("mutation", inplace=True) +for index, row in amplicon_profiles.iterrows(): + for column in rest_columns: + if row[column] == "x": + amplicon_profiles.at[index, column] = index + +amplicon_profiles.reset_index(drop=True, 
inplace=True) +amplicon_profiles.drop(columns=["position", "count"], inplace=True) +amplicon_profiles = amplicon_profiles.replace({np.NaN : ""}) +amplicon_profiles = amplicon_profiles.groupby(["amplicon"]).agg(lambda x: ' '.join(set(x))) +amplicon_profiles.sort_index(key= lambda x: np.argsort(index_natsorted(amplicon_profiles.index)), inplace=True) +print(amplicon_profiles) + +amplicon_profiles.to_csv("amp_profiles.csv", index=True) +# pos = 21764 +# test = amplicons[(pos >= amplicons['amp_start']) & (pos <= amplicons['amp_end'])] +# print(test) + +# print(mutations) + +# import sys + +# sys.stderr = open(snakemake.log[0], "w") + +# import altair as alt +# import pandas as pd +# import pysam +# from intervaltree import IntervalTree + +# # read primer bedpe to df +# PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) +# PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) +# PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] + +# # convert df to interval trees +# primer_intervals = IntervalTree() +# no_primer_intervals = IntervalTree() +# for index, row in PRIMER.iterrows(): +# primer_intervals[row["p1_start"] : row["p2_end"] + 1] = ( +# row["p1_start"], +# row["p2_end"] + 1, +# ) +# no_primer_intervals[row["p1_end"] + 1 : row["p2_start"]] = ( +# row["p1_end"] + 1, +# row["p2_start"], +# ) + + +# def iter_with_samples(inputfiles): +# return zip(snakemake.params.samples, inputfiles) + + +# def count_intervals(file): +# with pysam.AlignmentFile(file, "rb") as bam: +# counter_primer = 0 +# counter_no_primer = 0 +# counter_primer_within = 0 +# counter_no_primer_within = 0 +# counter_nothing = 0 +# mate_pair_intervals = {} +# for read in bam.fetch(): +# if not mate_pair_intervals.get(read.query_name): +# mate_pair_intervals[read.query_name] = [read.reference_start] +# else: +# mate_pair_intervals[read.query_name].append(read.reference_end) +# for pair in mate_pair_intervals: +# if ( +# len(mate_pair_intervals[pair]) > 1 +# and mate_pair_intervals[pair][0] != None +# and mate_pair_intervals[pair][1] != None +# ): +# if primer_intervals.envelop( +# mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1 +# ): +# if ( +# sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] +# and sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].end +# == mate_pair_intervals[pair][1] + 1 +# ): +# counter_primer += 1 +# else: +# counter_primer_within += 1 +# elif no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1] +# ): +# if ( +# sorted( +# no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, +# mate_pair_intervals[pair][1], +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] + 1 +# and sorted( +# no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, +# mate_pair_intervals[pair][1], +# ) +# )[0].end +# == mate_pair_intervals[pair][1] +# ): +# counter_no_primer += 1 +# else: +# counter_no_primer_within += 1 +# else: +# counter_nothing += 1 +# else: +# counter_nothing += 1 +# counters = pd.DataFrame( +# { +# "n_count": [ +# counter_primer, +# counter_primer_within, +# counter_no_primer, +# counter_no_primer_within, +# counter_nothing, +# ], +# "class": [ +# "uncut primer exact", +# "uncut primer within", +# "cut primer exact", +# "cut primer within", +# "no mathing win", +# ], +# } +# ) +# return counters + + +# def 
plot_classes(counters): +# bars = ( +# alt.Chart(counters) +# .mark_bar() +# .encode( +# y="class", +# x="n_count", +# row=alt.Row("sample:N"), +# column=alt.Column("state:N", sort="descending"), +# ) +# ) +# text = bars.mark_text( +# align="left", +# baseline="middle", +# dx=3, # Nudges text to right so it doesn't appear on top of the bar +# ).encode(text="n_count", row=alt.Row("sample:N"), column=alt.Column("state:N")) +# return bars, text + + +# all_df = pd.DataFrame() +# for sample, file in iter_with_samples(snakemake.input.unclipped): +# counts_before = count_intervals(file) +# counts_before["sample"] = sample +# counts_before["state"] = "before" +# all_df = pd.concat([all_df, counts_before], ignore_index=True) + +# for sample, file in iter_with_samples(snakemake.input.clipped): +# counts_after = count_intervals(file) +# counts_after["sample"] = sample +# counts_after["state"] = "after" +# all_df = pd.concat([all_df, counts_after], ignore_index=True) + +# bars, text = plot_classes(all_df) + +# (bars).properties(title="Amplicon matching").save(snakemake.output.plot) diff --git a/workflow/scripts/get-amplicon-profiles-WIP.py b/workflow/scripts/get-amplicon-profiles-WIP.py new file mode 100644 index 000000000..b908697a6 --- /dev/null +++ b/workflow/scripts/get-amplicon-profiles-WIP.py @@ -0,0 +1,417 @@ + # Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. + +import pandas as pd +import numpy as np +from pysam import VariantFile +from natsort import index_natsorted, ns +from more_itertools import powerset + +bcf_in = VariantFile("/local/work/uncovar-wastewater/resources/lineage-candidate-variants/all.sorted.bcf") # auto-detect input format + +mutations = pd.DataFrame() +lineages = pd.DataFrame() + +data = pd.DataFrame() + +for rec in bcf_in.fetch(): + data = pd.concat( + [data, pd.DataFrame + ( + { + "snv-combi": ", ".join(rec.info["SIGNATURES"]), + "snv": rec.info["SIGNATURES"], + "position": rec.pos, + "lineage": ",".join(rec.info["LINEAGES"]) + }, + ), + ],ignore_index=True + ) + +print(data) +# "lineage": ",".join([lin.replace(".", " ") for lin in rec.info["LINEAGES"]]) +data.drop_duplicates(inplace=True) + +amplicons = pd.read_csv("/local/work/uncovar-wastewater/resources/primer.bedpe", delimiter="\t", names=["ref_name_1", "p1_start", "p1_end", "ref_name_2", "p2_start", "p2_end"]) +amplicons.drop(columns=["ref_name_1", "ref_name_2"], inplace=True) +amplicons["amp_start"] = amplicons["p1_end"] + 1 +amplicons["amp_end"] = amplicons["p2_end"] - 1 +amplicons["amp_len"] = amplicons["amp_end"] - amplicons["amp_start"] +amplicons["number"] = amplicons.index.to_series() + 1 +amplicons.index = pd.IntervalIndex.from_arrays(amplicons["amp_start"], amplicons["amp_end"], closed="both") + +print(amplicons) + +voc_names = pd.read_csv("/local/work/uncovar-wastewater/resources/variants-rename.csv", header=0) +rename_dict = voc_names.set_index('Nextstrain Clade').to_dict()['Pango Lineage'] + +data["pango"] = data["lineage"].apply(lambda x: ",".join(rename_dict[y] if y in rename_dict else y for y in x.split(","))) + +data["amplicon"] = data["position"].apply(lambda x: amplicons.loc[amplicons.index.contains(x)]["number"].values) +data = data.explode("amplicon") +data.reset_index(inplace=True, drop=True) + +print(data) + +data.to_csv("test_amp_interval.csv") +# for index1, row1 in mutations.iterrows(): +# 
for index2, row2 in amplicons.iterrows(): +# if row1["position"] in range(int(row2['amp_start']), int(row2['amp_end'])): +# if mutations.loc[index1]["amplicon"] == "": +# mutations.at[index1, "amplicon"] = str(index2) +# else: +# mutations.at[index1, "amplicon"] += "," + str(index2) + + + # for signature in rec.info["SIGNATURES"]: + # lineages = pd.concat( + # [ + # lineages, + # pd.DataFrame( + # { + # "mutation": signature, + # **{ + # lineage.replace(".", " "): "x" + # for lineage in rec.info["LINEAGES"] + # }, + # }, + # index=[0], + # ), + # ], + # ignore_index=True, + # ) + + +# lineages = lineages.fillna(0) +# lineages = lineages.replace({"x": 1}) +# lineages = ( +# lineages.groupby(["mutation"]) +# .agg(func={column: np.max for column in lineages.columns}) +# .reset_index(drop=True) +# ) +# lineages = lineages.replace({1: "x", 0: np.NaN}) +# lineages["count"] = lineages.loc[:, lineages.columns!='mutation'].count(axis=1) + +# mutations = mutations.groupby(["mutation"]).agg(func={column: np.max for column in mutations.columns}).reset_index(drop=True) +# mutations.sort_values(by=["position"], inplace=True) +# mutations.to_csv("test.csv", index=False) +# mutations["amplicon"] = "" + +# # print("here") +# # print(mutations) + +# mutations.set_index("mutation", inplace=True) +# lineages.set_index("mutation", inplace=True) + +# mutations = mutations.merge(lineages, left_index=True, right_index=True) + +# with open(snakemake.input.stats, "r") as statfile: +# for line in statfile.read().splitlines(): +# if line.startswith("FREADS"): +# coverage = line.split("\t")[2:] + +# # print(len(coverage)) + +# amplicons = pd.read_csv("/local/work/uncovar-wastewater/resources/primer.bedpe", delimiter="\t", names=["ref_name_1", "p1_start", "p1_end", "ref_name_2", "p2_start", "p2_end"]) +# amplicons.drop(columns=["ref_name_1", "ref_name_2"], inplace=True) +# amplicons["amp_start"] = amplicons["p1_end"] + 1 +# amplicons["amp_end"] = amplicons["p2_end"] - 1 +# amplicons["amp_len"] = amplicons["amp_end"] - amplicons["amp_start"] +# amplicons["amp_cov"] = coverage + +# # print(amplicons) + +# for index1, row1 in mutations.iterrows(): +# for index2, row2 in amplicons.iterrows(): +# if row1["position"] in range(int(row2['amp_start']), int(row2['amp_end'])): +# if mutations.loc[index1]["amplicon"] == "": +# mutations.at[index1, "amplicon"] = str(index2) +# else: +# mutations.at[index1, "amplicon"] += "," + str(index2) + +# all_columns = mutations.columns +# first_columns = ["position", "amplicon", "count"] +# rest_columns = [item for item in all_columns.sort_values() if item not in first_columns] +# mutations = mutations[[*first_columns, *rest_columns]] + + +# # mutations = (mutations.drop('amplicon', axis=1) +# # .join +# # ( +# # mutations.amplicon +# # .str +# # .split(",", expand=True) +# # .stack() +# # .reset_index(drop=True, level=1) +# # .rename('amplicon') +# # )) + +# mutations = mutations[[*first_columns, *rest_columns]] +# mutations.sort_values(by=["position", "amplicon"], inplace=True) +# mutations.to_csv("amplicons.tsv", index=True, sep="\t") + + +# amplicon_profiles = mutations.copy(deep=True) + +# amplicon_profiles = (amplicon_profiles.drop('amplicon', axis=1) +# .join +# ( +# amplicon_profiles.amplicon +# .str +# .split(",", expand=True) +# .stack() +# .reset_index(drop=True, level=1) +# .rename('amplicon') +# )) + +# # amplicon_profiles.set_index("mutation", inplace=True) +# for index, row in amplicon_profiles.iterrows(): +# for column in rest_columns: +# if row[column] == "x": +# 
amplicon_profiles.at[index, column] = index + +# amplicon_profiles.reset_index(drop=True, inplace=True) + +# # print("here2") +# # print(mutations) +# amplicon_profiles.drop(columns=["position", "count"], inplace=True) +# amplicon_profiles = amplicon_profiles.replace({np.NaN : ""}) +# amplicon_profiles = amplicon_profiles.groupby(["amplicon"]).agg(lambda x: ' '.join(sorted(set(x)))) +# amplicon_profiles.sort_index(key= lambda x: np.argsort(index_natsorted(amplicon_profiles.index)), inplace=True) +# # print(amplicon_profiles) + +# amplicon_profiles.to_csv("amp_profiles.csv", index=True) + +# files = [ +# "/local/work/uncovar-wastewater/results/2022-12-21/lineage-variant-report/D01.csv", +# # "/local/work/uncovar-wastewater/results/2022-12-21/lineage-variant-report/D02.csv", +# # "/local/work/uncovar-wastewater/results/2022-12-21/lineage-variant-report/D06.csv" +# ] + +# samples = ["D01"] + +# data = pd.DataFrame() +# data = pd.read_csv(snakemake.input.var_df, usecols=["Mutations","Probability","Frequency","ReadDepth"]) + +# # for file, sample in zip(files, samples): +# # data = pd.concat( +# # [ +# # data, +# # pd.read_csv(file, usecols=["Mutations","Probability","Frequency","ReadDepth"]) +# # ] +# # ) + +# data.set_index("Mutations", inplace=True) +# data.index.rename("mutation", inplace=True) +# # mutations. +# data.drop(["Lineage", "Similarity"], inplace=True) +# print(data) +# print(mutations.index) +# data = pd.concat([data, mutations], axis=1) + +# data = (data.drop('amplicon', axis=1) +# .join +# ( +# data.amplicon +# .str +# .split(",", expand=True) +# .stack() +# .reset_index(drop=True, level=1) +# .rename('amplicon') +# )) + +# cols = ["Probability", "Frequency", "ReadDepth", "position", "count", "amplicon"] +# data = data[cols] + +# data.sort_values(by=["position", "amplicon"], inplace=True) +# print(data) + +# data = data.replace({np.NaN: 0}) +# # filter_data = data[(data["Probability"] > 0.95 & data["Frequency"] > 0 & data["ReadDepth"] > 0)] +# filter_data = data[(data["Probability"] > 0.9) & (data["Frequency"] * data["ReadDepth"] > 10)].copy(deep=True) + +# filter_data.reset_index(inplace=True, drop=False) +# filter_data = filter_data[["mutation", "amplicon"]] +# filter_data = filter_data.groupby(["amplicon"]).agg(lambda x: ' '.join(sorted(set(x)))) + +# filter_data.sort_index(key= lambda x: np.argsort(index_natsorted(filter_data.index)), inplace=True) +# print(filter_data) + +# amplicon_profiles.replace(r"^\s", "", regex=True, inplace=True) +# # print(amplicon_profiles) +# with open(snakemake.output.profiles, "w") as outfile: +# for index, mutation_profile in filter_data.iterrows(): +# lineage_hit = amplicon_profiles.columns[(amplicon_profiles == mutation_profile["mutation"]).any()].tolist() +# print(index + "\t" + mutation_profile["mutation"], lineage_hit, file=outfile) +# snvs = mutation_profile["mutation"].split(" ") +# if len(snvs) > 1: +# for snv in snvs: +# snv_hit = amplicon_profiles.columns[(amplicon_profiles == snv).any()].tolist() +# print("\t" + snv, snv_hit, file=outfile) +# # print(list(powerset(snvs))) + + + +# pos = 21764 +# test = amplicons[(pos >= amplicons['amp_start']) & (pos <= amplicons['amp_end'])] +# print(test) + +# print(mutations) + +# import sys + +# sys.stderr = open(snakemake.log[0], "w") + +# import altair as alt +# import pandas as pd +# import pysam +# from intervaltree import IntervalTree + +# # read primer bedpe to df +# PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) +# PRIMER.drop(PRIMER.columns[[0, 3]], 
axis=1, inplace=True) +# PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] + +# # convert df to interval trees +# primer_intervals = IntervalTree() +# no_primer_intervals = IntervalTree() +# for index, row in PRIMER.iterrows(): +# primer_intervals[row["p1_start"] : row["p2_end"] + 1] = ( +# row["p1_start"], +# row["p2_end"] + 1, +# ) +# no_primer_intervals[row["p1_end"] + 1 : row["p2_start"]] = ( +# row["p1_end"] + 1, +# row["p2_start"], +# ) + + +# def iter_with_samples(inputfiles): +# return zip(snakemake.params.samples, inputfiles) + + +# def count_intervals(file): +# with pysam.AlignmentFile(file, "rb") as bam: +# counter_primer = 0 +# counter_no_primer = 0 +# counter_primer_within = 0 +# counter_no_primer_within = 0 +# counter_nothing = 0 +# mate_pair_intervals = {} +# for read in bam.fetch(): +# if not mate_pair_intervals.get(read.query_name): +# mate_pair_intervals[read.query_name] = [read.reference_start] +# else: +# mate_pair_intervals[read.query_name].append(read.reference_end) +# for pair in mate_pair_intervals: +# if ( +# len(mate_pair_intervals[pair]) > 1 +# and mate_pair_intervals[pair][0] != None +# and mate_pair_intervals[pair][1] != None +# ): +# if primer_intervals.envelop( +# mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1 +# ): +# if ( +# sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] +# and sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].end +# == mate_pair_intervals[pair][1] + 1 +# ): +# counter_primer += 1 +# else: +# counter_primer_within += 1 +# elif no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1] +# ): +# if ( +# sorted( +# no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, +# mate_pair_intervals[pair][1], +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] + 1 +# and sorted( +# no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, +# mate_pair_intervals[pair][1], +# ) +# )[0].end +# == mate_pair_intervals[pair][1] +# ): +# counter_no_primer += 1 +# else: +# counter_no_primer_within += 1 +# else: +# counter_nothing += 1 +# else: +# counter_nothing += 1 +# counters = pd.DataFrame( +# { +# "n_count": [ +# counter_primer, +# counter_primer_within, +# counter_no_primer, +# counter_no_primer_within, +# counter_nothing, +# ], +# "class": [ +# "uncut primer exact", +# "uncut primer within", +# "cut primer exact", +# "cut primer within", +# "no mathing win", +# ], +# } +# ) +# return counters + + +# def plot_classes(counters): +# bars = ( +# alt.Chart(counters) +# .mark_bar() +# .encode( +# y="class", +# x="n_count", +# row=alt.Row("sample:N"), +# column=alt.Column("state:N", sort="descending"), +# ) +# ) +# text = bars.mark_text( +# align="left", +# baseline="middle", +# dx=3, # Nudges text to right so it doesn't appear on top of the bar +# ).encode(text="n_count", row=alt.Row("sample:N"), column=alt.Column("state:N")) +# return bars, text + + +# all_df = pd.DataFrame() +# for sample, file in iter_with_samples(snakemake.input.unclipped): +# counts_before = count_intervals(file) +# counts_before["sample"] = sample +# counts_before["state"] = "before" +# all_df = pd.concat([all_df, counts_before], ignore_index=True) + +# for sample, file in iter_with_samples(snakemake.input.clipped): +# counts_after = count_intervals(file) +# counts_after["sample"] = sample +# counts_after["state"] = 
"after" +# all_df = pd.concat([all_df, counts_after], ignore_index=True) + +# bars, text = plot_classes(all_df) + +# (bars).properties(title="Amplicon matching").save(snakemake.output.plot) diff --git a/workflow/scripts/get-amplicon-profiles-snv.py b/workflow/scripts/get-amplicon-profiles-snv.py new file mode 100644 index 000000000..7b2096e6a --- /dev/null +++ b/workflow/scripts/get-amplicon-profiles-snv.py @@ -0,0 +1,126 @@ +# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. + +import pandas as pd +import numpy as np +from pysam import VariantFile +from natsort import index_natsorted, ns +from more_itertools import powerset + +bcf_in = VariantFile("/local/work/uncovar-wastewater/resources/lineage-candidate-variants/all.sorted.bcf") # auto-detect input format + +mutations = pd.DataFrame() +lineages = pd.DataFrame() + +for rec in bcf_in.fetch(): + for signature in rec.info["SIGNATURES"]: + for lineage in rec.info["LINEAGES"]: + lineages = pd.concat( + [ + lineages, + pd.DataFrame( + { + "mutation": signature, + "lineage": lineage.replace(".", " "), + "position": rec.pos, + }, + index=[0], + ), + ], + ignore_index=True, + ) + +lineages.drop_duplicates(inplace=True) +lineages.reset_index(inplace=True, drop=True) + +voc_names = pd.read_csv("/local/work/uncovar-wastewater/resources/variants-rename.csv", header=0) +print(voc_names) +newer_omicrons = voc_names[ + (voc_names["WHO Label"] == "Omicron") & + (voc_names["Nextstrain Clade"] != "21K Omicron") & + (voc_names["Nextstrain Clade"] != "21L Omicron") + ]["Nextstrain Clade"].to_list() + +exclude_list = ["20A S 126A", "21J Delta", "21I Delta"] +# lineages["lineage"].isin(newer_omicrons).index.drop() +# lineages.drop(lineages[lineages["lineage"].isin(newer_omicrons)].index, inplace=True) +lineages.drop(lineages[lineages["lineage"].isin(exclude_list)].index, inplace=True) +# lineages[lineages.groupby('mutation').mutation.transform(len) == 1] +print(lineages[lineages.groupby('mutation').mutation.transform(len) == 1]) +print(len(lineages.mutation.unique())) +# lineages.to_csv("snv.csv") + +# files = [ +# "/local/work/uncovar-wastewater/results/2022-06-30/lineage-variant-report/sewerA-04.csv", +# "/local/work/uncovar-wastewater/results/2022-06-30/lineage-variant-report/sewerA-05.csv", +# "/local/work/uncovar-wastewater/results/2022-06-30/lineage-variant-report/sewerA-06.csv", +# "/local/work/uncovar-wastewater/results/2022-06-30/lineage-variant-report/sewerB-04.csv", +# "/local/work/uncovar-wastewater/results/2022-06-30/lineage-variant-report/sewerB-05.csv", +# "/local/work/uncovar-wastewater/results/2022-06-30/lineage-variant-report/sewerB-06.csv", +# ] + +# # # samples = ["D01"] +# samples = [ +# "sewerA-04", +# "sewerA-05", +# "sewerA-06", +# "sewerB-04", +# "sewerB-05", +# "sewerB-06", +# ] + +data = pd.DataFrame() +# data = pd.read_csv(snakemake.input.var_df, usecols=["Mutations","Probability","Frequency","ReadDepth"]) + +vocs_included = {} +# for file, sample in zip(files, samples): +for file, sample in zip(snakemake.input.csv, snakemake.params.sample): + data = pd.concat( + [ + data, + pd.read_csv(file, usecols=["Mutations","Probability","Frequency","ReadDepth"]) + ] + ) + + data.set_index("Mutations", inplace=True) + data.index.rename("mutation", inplace=True) + # mutations. 
+ data.drop(["Lineage", "Similarity"], inplace=True) + data.drop(["ORF1a:L3674-", "ORF1a:S2083-"], inplace=True) + + filter_data = data[(data["Probability"] > 0.95) & (data["Frequency"] * data["ReadDepth"] > 10)].copy(deep=True) + + # print(filter_data) + + rename_dict = voc_names.set_index('Nextstrain Clade').to_dict()['Pango Lineage'] + # print(rename_dict) + # lineages["lineage"].replace(rename_dict, inplace=True) + + out_list = [] + for mutation in filter_data.index: + eindeut_lineage = lineages[lineages["mutation"] == mutation]["lineage"].to_list() + eindeut_lineage = [x for x in eindeut_lineage if x != "nan"] + if len(eindeut_lineage) <= 1: + out_list.append("/".join(eindeut_lineage)) + print(mutation, lineages[lineages["mutation"] == mutation]["lineage"].to_list()) + out_list = list(dict.fromkeys(out_list)) + out_list = [rename_dict[x] if x in rename_dict else x for x in out_list] + out_list = [x for x in out_list if x != ""] + site, week = sample.split("-") + # site = sample[:-2] + # week = sample[-2:] + if week not in vocs_included: + vocs_included[week] = {} + vocs_included[week][site] = out_list + +print(vocs_included) + +vocs = pd.DataFrame() +for week in vocs_included: + for site in vocs_included[week]: + vocs.at[week, site] = " ".join(vocs_included[week][site]) + +print(vocs) +vocs.to_csv(snakemake.output.ampprofile) \ No newline at end of file diff --git a/workflow/scripts/get-amplicon-profiles.py b/workflow/scripts/get-amplicon-profiles.py new file mode 100644 index 000000000..b1018b611 --- /dev/null +++ b/workflow/scripts/get-amplicon-profiles.py @@ -0,0 +1,381 @@ + # Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. 
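+#
+# Outline of the script below: signature mutations and their lineage
+# assignments are read from all.sorted.bcf, amplicon intervals are derived
+# from primer.bedpe and paired with the per-amplicon read counts parsed from
+# the FREADS lines of the samtools ampliconstats output
+# (snakemake.input.stats). The sample's lineage-variant report
+# (snakemake.input.var_df) is filtered (Probability > 0.9 and
+# Frequency * ReadDepth > 10) and the resulting per-amplicon mutation
+# profiles are matched against the candidate lineage profiles; hits are
+# written to snakemake.output.profiles. The BCF and bedpe paths are still
+# hardcoded to a local working copy.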
+ +import pandas as pd +import numpy as np +from pysam import VariantFile +from natsort import index_natsorted, ns +from more_itertools import powerset + +bcf_in = VariantFile("/local/work/uncovar-wastewater/resources/lineage-candidate-variants/all.sorted.bcf") # auto-detect input format + +mutations = pd.DataFrame() +lineages = pd.DataFrame() + +for rec in bcf_in.fetch(): + lineage_dict = {lineage: "x" for lineage in rec.info["LINEAGES"]} + mutations = pd.concat( + [ + mutations, + pd.DataFrame( + { + "mutation": rec.info["SIGNATURES"], + "position": rec.pos, + }, + + ), + ], + ignore_index=True + ) + for signature in rec.info["SIGNATURES"]: + lineages = pd.concat( + [ + lineages, + pd.DataFrame( + { + "mutation": signature, + **{ + lineage.replace(".", " "): "x" + for lineage in rec.info["LINEAGES"] + }, + }, + index=[0], + ), + ], + ignore_index=True, + ) + + +lineages = lineages.fillna(0) +lineages = lineages.replace({"x": 1}) +lineages = ( + lineages.groupby(["mutation"]) + .agg(func={column: np.max for column in lineages.columns}) + .reset_index(drop=True) +) +lineages = lineages.replace({1: "x", 0: np.NaN}) +lineages["count"] = lineages.loc[:, lineages.columns!='mutation'].count(axis=1) + +mutations = mutations.groupby(["mutation"]).agg(func={column: np.max for column in mutations.columns}).reset_index(drop=True) +mutations.sort_values(by=["position"], inplace=True) +mutations.to_csv("test.csv", index=False) +mutations["amplicon"] = "" + +# print("here") +# print(mutations) + +mutations.set_index("mutation", inplace=True) +lineages.set_index("mutation", inplace=True) + +mutations = mutations.merge(lineages, left_index=True, right_index=True) + +with open(snakemake.input.stats, "r") as statfile: + for line in statfile.read().splitlines(): + if line.startswith("FREADS"): + coverage = line.split("\t")[2:] + +# print(len(coverage)) + +amplicons = pd.read_csv("/local/work/uncovar-wastewater/resources/primer.bedpe", delimiter="\t", names=["ref_name_1", "p1_start", "p1_end", "ref_name_2", "p2_start", "p2_end"]) +amplicons.drop(columns=["ref_name_1", "ref_name_2"], inplace=True) +amplicons["amp_start"] = amplicons["p1_end"] + 1 +amplicons["amp_end"] = amplicons["p2_end"] - 1 +amplicons["amp_len"] = amplicons["amp_end"] - amplicons["amp_start"] +amplicons["amp_cov"] = coverage + +# print(amplicons) + +for index1, row1 in mutations.iterrows(): + for index2, row2 in amplicons.iterrows(): + if row1["position"] in range(int(row2['amp_start']), int(row2['amp_end'])): + if mutations.loc[index1]["amplicon"] == "": + mutations.at[index1, "amplicon"] = str(index2) + else: + mutations.at[index1, "amplicon"] += "," + str(index2) + +all_columns = mutations.columns +first_columns = ["position", "amplicon", "count"] +rest_columns = [item for item in all_columns.sort_values() if item not in first_columns] +mutations = mutations[[*first_columns, *rest_columns]] + + +# mutations = (mutations.drop('amplicon', axis=1) +# .join +# ( +# mutations.amplicon +# .str +# .split(",", expand=True) +# .stack() +# .reset_index(drop=True, level=1) +# .rename('amplicon') +# )) + +mutations = mutations[[*first_columns, *rest_columns]] +mutations.sort_values(by=["position", "amplicon"], inplace=True) +mutations.to_csv("amplicons.tsv", index=True, sep="\t") + + +amplicon_profiles = mutations.copy(deep=True) + +amplicon_profiles = (amplicon_profiles.drop('amplicon', axis=1) + .join + ( + amplicon_profiles.amplicon + .str + .split(",", expand=True) + .stack() + .reset_index(drop=True, level=1) + .rename('amplicon') + )) + +# 
amplicon_profiles.set_index("mutation", inplace=True) +for index, row in amplicon_profiles.iterrows(): + for column in rest_columns: + if row[column] == "x": + amplicon_profiles.at[index, column] = index + +amplicon_profiles.reset_index(drop=True, inplace=True) + +# print("here2") +# print(mutations) +amplicon_profiles.drop(columns=["position", "count"], inplace=True) +amplicon_profiles = amplicon_profiles.replace({np.NaN : ""}) +amplicon_profiles = amplicon_profiles.groupby(["amplicon"]).agg(lambda x: ' '.join(sorted(set(x)))) +amplicon_profiles.sort_index(key= lambda x: np.argsort(index_natsorted(amplicon_profiles.index)), inplace=True) +# print(amplicon_profiles) + +amplicon_profiles.to_csv("amp_profiles.csv", index=True) + +files = [ + "/local/work/uncovar-wastewater/results/2022-12-21/lineage-variant-report/D01.csv", + # "/local/work/uncovar-wastewater/results/2022-12-21/lineage-variant-report/D02.csv", + # "/local/work/uncovar-wastewater/results/2022-12-21/lineage-variant-report/D06.csv" + ] + +samples = ["D01"] + +data = pd.DataFrame() +data = pd.read_csv(snakemake.input.var_df, usecols=["Mutations","Probability","Frequency","ReadDepth"]) + +# for file, sample in zip(files, samples): +# data = pd.concat( +# [ +# data, +# pd.read_csv(file, usecols=["Mutations","Probability","Frequency","ReadDepth"]) +# ] +# ) + +data.set_index("Mutations", inplace=True) +data.index.rename("mutation", inplace=True) +# mutations. +data.drop(["Lineage", "Similarity"], inplace=True) +print(data) +print(mutations.index) +data = pd.concat([data, mutations], axis=1) + +data = (data.drop('amplicon', axis=1) + .join + ( + data.amplicon + .str + .split(",", expand=True) + .stack() + .reset_index(drop=True, level=1) + .rename('amplicon') + )) + +cols = ["Probability", "Frequency", "ReadDepth", "position", "count", "amplicon"] +data = data[cols] + +data.sort_values(by=["position", "amplicon"], inplace=True) +print(data) + +data = data.replace({np.NaN: 0}) +# filter_data = data[(data["Probability"] > 0.95 & data["Frequency"] > 0 & data["ReadDepth"] > 0)] +filter_data = data[(data["Probability"] > 0.9) & (data["Frequency"] * data["ReadDepth"] > 10)].copy(deep=True) + +filter_data.reset_index(inplace=True, drop=False) +filter_data = filter_data[["mutation", "amplicon"]] +filter_data = filter_data.groupby(["amplicon"]).agg(lambda x: ' '.join(sorted(set(x)))) + +filter_data.sort_index(key= lambda x: np.argsort(index_natsorted(filter_data.index)), inplace=True) +print(filter_data) + +amplicon_profiles.replace(r"^\s", "", regex=True, inplace=True) +# print(amplicon_profiles) +with open(snakemake.output.profiles, "w") as outfile: + for index, mutation_profile in filter_data.iterrows(): + lineage_hit = amplicon_profiles.columns[(amplicon_profiles == mutation_profile["mutation"]).any()].tolist() + print(index + "\t" + mutation_profile["mutation"], lineage_hit, file=outfile) + snvs = mutation_profile["mutation"].split(" ") + if len(snvs) > 1: + for snv in snvs: + snv_hit = amplicon_profiles.columns[(amplicon_profiles == snv).any()].tolist() + print("\t" + snv, snv_hit, file=outfile) + # print(list(powerset(snvs))) + + + +# pos = 21764 +# test = amplicons[(pos >= amplicons['amp_start']) & (pos <= amplicons['amp_end'])] +# print(test) + +# print(mutations) + +# import sys + +# sys.stderr = open(snakemake.log[0], "w") + +# import altair as alt +# import pandas as pd +# import pysam +# from intervaltree import IntervalTree + +# # read primer bedpe to df +# PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", 
header=None) +# PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) +# PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] + +# # convert df to interval trees +# primer_intervals = IntervalTree() +# no_primer_intervals = IntervalTree() +# for index, row in PRIMER.iterrows(): +# primer_intervals[row["p1_start"] : row["p2_end"] + 1] = ( +# row["p1_start"], +# row["p2_end"] + 1, +# ) +# no_primer_intervals[row["p1_end"] + 1 : row["p2_start"]] = ( +# row["p1_end"] + 1, +# row["p2_start"], +# ) + + +# def iter_with_samples(inputfiles): +# return zip(snakemake.params.samples, inputfiles) + + +# def count_intervals(file): +# with pysam.AlignmentFile(file, "rb") as bam: +# counter_primer = 0 +# counter_no_primer = 0 +# counter_primer_within = 0 +# counter_no_primer_within = 0 +# counter_nothing = 0 +# mate_pair_intervals = {} +# for read in bam.fetch(): +# if not mate_pair_intervals.get(read.query_name): +# mate_pair_intervals[read.query_name] = [read.reference_start] +# else: +# mate_pair_intervals[read.query_name].append(read.reference_end) +# for pair in mate_pair_intervals: +# if ( +# len(mate_pair_intervals[pair]) > 1 +# and mate_pair_intervals[pair][0] != None +# and mate_pair_intervals[pair][1] != None +# ): +# if primer_intervals.envelop( +# mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1 +# ): +# if ( +# sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] +# and sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].end +# == mate_pair_intervals[pair][1] + 1 +# ): +# counter_primer += 1 +# else: +# counter_primer_within += 1 +# elif no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1] +# ): +# if ( +# sorted( +# no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, +# mate_pair_intervals[pair][1], +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] + 1 +# and sorted( +# no_primer_intervals.envelop( +# mate_pair_intervals[pair][0] + 1, +# mate_pair_intervals[pair][1], +# ) +# )[0].end +# == mate_pair_intervals[pair][1] +# ): +# counter_no_primer += 1 +# else: +# counter_no_primer_within += 1 +# else: +# counter_nothing += 1 +# else: +# counter_nothing += 1 +# counters = pd.DataFrame( +# { +# "n_count": [ +# counter_primer, +# counter_primer_within, +# counter_no_primer, +# counter_no_primer_within, +# counter_nothing, +# ], +# "class": [ +# "uncut primer exact", +# "uncut primer within", +# "cut primer exact", +# "cut primer within", +# "no mathing win", +# ], +# } +# ) +# return counters + + +# def plot_classes(counters): +# bars = ( +# alt.Chart(counters) +# .mark_bar() +# .encode( +# y="class", +# x="n_count", +# row=alt.Row("sample:N"), +# column=alt.Column("state:N", sort="descending"), +# ) +# ) +# text = bars.mark_text( +# align="left", +# baseline="middle", +# dx=3, # Nudges text to right so it doesn't appear on top of the bar +# ).encode(text="n_count", row=alt.Row("sample:N"), column=alt.Column("state:N")) +# return bars, text + + +# all_df = pd.DataFrame() +# for sample, file in iter_with_samples(snakemake.input.unclipped): +# counts_before = count_intervals(file) +# counts_before["sample"] = sample +# counts_before["state"] = "before" +# all_df = pd.concat([all_df, counts_before], ignore_index=True) + +# for sample, file in iter_with_samples(snakemake.input.clipped): +# counts_after = count_intervals(file) +# 
counts_after["sample"] = sample +# counts_after["state"] = "after" +# all_df = pd.concat([all_df, counts_after], ignore_index=True) + +# bars, text = plot_classes(all_df) + +# (bars).properties(title="Amplicon matching").save(snakemake.output.plot) diff --git a/workflow/scripts/plot-amplicon-coverage.py b/workflow/scripts/plot-amplicon-coverage.py new file mode 100644 index 000000000..7fcd0a205 --- /dev/null +++ b/workflow/scripts/plot-amplicon-coverage.py @@ -0,0 +1,52 @@ + # Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. + +import pandas as pd +import altair as alt + +primer_bed = pd.read_csv(snakemake.input.bedpe, delimiter="\t", header=None) +primer_count = len(primer_bed.index) + +print(primer_count) + + +amp_num = [*range(1, primer_count + 1)] + +# names = pd.read_csv(snakemake.input.names, header=0, index_col="processing_name") +# weeks = pd.read_csv(snakemake.input.weeks, header=0, index_col="number", ) + +amp_cov = pd.DataFrame() +amp_cov["amplicon"] = amp_num + + +for sample, file in zip(snakemake.params.samples, snakemake.input.amp_stats): + with open(file, "r") as statfile: + # sample_rename = f"{names.loc[sample[0]]['screen_name']} {weeks.loc[int(sample[1:])]['week_number']}" + for line in statfile.read().splitlines(): + if line.startswith("FDEPTH"): + coverage = line.split("\t")[2:] + amp_cov[sample] = [float(x) for x in coverage] + # amp_cov[sample_rename] = [float(x) for x in coverage] + +amp_cov.to_csv(snakemake.output.stats) +amp_cov = amp_cov.melt('amplicon', var_name='sample', value_name='coverage') +print(amp_cov["coverage"].dtypes) +amp_cov = amp_cov.round({"coverage": 0}) + +amp_cov.sort_values(by=["sample", "amplicon"], inplace=True) +print(amp_cov) + +scale = alt.Scale(type="linear", domain=[0,10,11,19,20,5000], range=["red", "red", "yellow", "yellow", "green", "green"]) + +plot = alt.Chart(amp_cov).mark_rect().encode( + alt.X("amplicon:N", axis=alt.Axis(title="Amplicon number (n = 154)")), + alt.Y("sample:O", axis=alt.Axis(title="Sampling location and week")), + color=alt.Color('coverage:Q', scale=scale, legend=None) +).configure_axis( + labelFontSize=20, + titleFontSize=30 +) + +plot.save(snakemake.output.plot) \ No newline at end of file diff --git a/workflow/scripts/plot-amplicon-coverage_WIP.py b/workflow/scripts/plot-amplicon-coverage_WIP.py new file mode 100644 index 000000000..d70fb3d5d --- /dev/null +++ b/workflow/scripts/plot-amplicon-coverage_WIP.py @@ -0,0 +1,399 @@ + # Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster. +# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause) +# This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import pandas as pd +import numpy as np +from pysam import VariantFile +from natsort import index_natsorted, ns +from more_itertools import powerset + +bcf_in = VariantFile("/local/work/uncovar-wastewater/resources/lineage-candidate-variants/all.sorted.bcf") # auto-detect input format + +mutations = pd.DataFrame() +lineages = pd.DataFrame() + +for rec in bcf_in.fetch(): + lineage_dict = {lineage: "x" for lineage in rec.info["LINEAGES"]} + mutations = pd.concat( + [ + mutations, + pd.DataFrame( + { + "mutation": rec.info["SIGNATURES"], + "position": rec.pos, + }, + + ), + ], + ignore_index=True + ) + for signature in rec.info["SIGNATURES"]: + lineages = pd.concat( + [ + lineages, + pd.DataFrame( + { + "mutation": signature, + **{ + lineage.replace(".", " "): "x" + for lineage in rec.info["LINEAGES"] + }, + }, + index=[0], + ), + ], + ignore_index=True, + ) + + +lineages = lineages.fillna(0) +lineages = lineages.replace({"x": 1}) +lineages = ( + lineages.groupby(["mutation"]) + .agg(func={column: np.max for column in lineages.columns}) + .reset_index(drop=True) +) +lineages = lineages.replace({1: "x", 0: np.NaN}) +lineages["count"] = lineages.loc[:, lineages.columns!='mutation'].count(axis=1) +print(lineages) + + + +mutations = mutations.groupby(["mutation"]).agg(func={column: np.max for column in mutations.columns}).reset_index(drop=True) +mutations.sort_values(by=["position"], inplace=True) +mutations.to_csv("test.csv", index=False) +mutations["amplicon"] = "" + +print("here") +print(mutations) + +mutations.set_index("mutation", inplace=True) +lineages.set_index("mutation", inplace=True) + +mutations = mutations.merge(lineages, left_index=True, right_index=True) + +with open("/local/work/uncovar-wastewater/results/2022-12-21/ampliconstats/D01.ampliconstats.txt", "r") as statfile: + for line in statfile.read().splitlines(): + if line.startswith("FREADS"): + coverage = line.split("\t")[2:] + +print(len(coverage)) + +amplicons = pd.read_csv("/local/work/uncovar-wastewater/resources/primer.bedpe", delimiter="\t", names=["ref_name_1", "p1_start", "p1_end", "ref_name_2", "p2_start", "p2_end"]) +amplicons.drop(columns=["ref_name_1", "ref_name_2"], inplace=True) +amplicons["amp_start"] = amplicons["p1_end"] + 1 +amplicons["amp_end"] = amplicons["p2_end"] - 1 +amplicons["amp_len"] = amplicons["amp_end"] - amplicons["amp_start"] +amplicons["amp_cov"] = coverage + +print(amplicons) + +for index1, row1 in mutations.iterrows(): + for index2, row2 in amplicons.iterrows(): + if row1["position"] in range(int(row2['amp_start']), int(row2['amp_end'])): + if mutations.loc[index1]["amplicon"] == "": + mutations.at[index1, "amplicon"] = str(index2) + else: + mutations.at[index1, "amplicon"] += "," + str(index2) + +all_columns = mutations.columns +first_columns = ["position", "amplicon", "count"] +rest_columns = [item for item in all_columns.sort_values() if item not in first_columns] +mutations = mutations[[*first_columns, *rest_columns]] + + +# mutations = (mutations.drop('amplicon', axis=1) +# .join +# ( +# mutations.amplicon +# .str +# .split(",", expand=True) +# .stack() +# .reset_index(drop=True, level=1) +# .rename('amplicon') +# )) + +mutations = mutations[[*first_columns, *rest_columns]] +mutations.sort_values(by=["position", "amplicon"], inplace=True) +mutations.to_csv("amplicons.tsv", index=True, sep="\t") + + +amplicon_profiles = mutations.copy(deep=True) + +amplicon_profiles = (amplicon_profiles.drop('amplicon', axis=1) + .join + ( + amplicon_profiles.amplicon + .str + .split(",", expand=True) + 
+
+amplicon_profiles = mutations.copy(deep=True)
+
+# explode comma-separated amplicon assignments into one row per amplicon
+amplicon_profiles = amplicon_profiles.drop("amplicon", axis=1).join(
+    amplicon_profiles.amplicon.str.split(",", expand=True)
+    .stack()
+    .reset_index(drop=True, level=1)
+    .rename("amplicon")
+)
+
+# amplicon_profiles.set_index("mutation", inplace=True)
+# replace the "x" marks with the mutation name itself
+for index, row in amplicon_profiles.iterrows():
+    for column in rest_columns:
+        if row[column] == "x":
+            amplicon_profiles.at[index, column] = index
+
+amplicon_profiles.reset_index(drop=True, inplace=True)
+
+print("here2")
+print(mutations)
+amplicon_profiles.drop(columns=["position", "count"], inplace=True)
+amplicon_profiles = amplicon_profiles.replace({np.nan: ""})
+# per amplicon and lineage: the space-separated set of expected mutations
+amplicon_profiles = amplicon_profiles.groupby(["amplicon"]).agg(lambda x: " ".join(set(x)))
+amplicon_profiles.sort_index(
+    key=lambda x: np.argsort(index_natsorted(amplicon_profiles.index)), inplace=True
+)
+# print(amplicon_profiles)
+
+amplicon_profiles.replace(r"^\s", "", regex=True, inplace=True)
+amplicon_transpose = amplicon_profiles.copy(deep=True)
+# amplicon_transpose.reset_index(drop=False, inplace=True)
+
+amplicon_transpose.iloc[0, 0] = "no_amplicon"
+
+amplicon_transpose = amplicon_transpose.transpose()
+
+amplicon_transpose.replace({"": np.nan}, inplace=True)
+print("pivot")
+print(amplicon_transpose)
+
+for column in amplicon_transpose.columns:
+    print(f"amplicon {column}\n{amplicon_transpose[column].value_counts(ascending=False, dropna=True).index.to_list()}")
+
+amplicon_profiles.to_csv("amp_profiles.csv", index=True)
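+
+# The remainder compares one sample's observed variant calls against these candidate
+# profiles: load the per-sample lineage-variant report, keep mutations that pass the
+# probability/frequency/read-depth filters, collect the observed mutations per
+# amplicon, and report which candidate lineages expect that amplicon profile.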
+data.drop(["Lineage", "Similarity"], inplace=True) +print(data) +print(mutations.index) +data = pd.concat([data, mutations], axis=1) + +data = (data.drop('amplicon', axis=1) + .join + ( + data.amplicon + .str + .split(",", expand=True) + .stack() + .reset_index(drop=True, level=1) + .rename('amplicon') + )) + +cols = ["Probability", "Frequency", "ReadDepth", "position", "count", "amplicon"] +data = data[cols] + +data.sort_values(by=["position", "amplicon"], inplace=True) +print(data) + +data = data.replace({np.NaN: 0}) +# filter_data = data[(data["Probability"] > 0.95 & data["Frequency"] > 0 & data["ReadDepth"] > 0)] +filter_data = data[(data["Probability"] > 0.9) & (data["Frequency"] > 0.5) & (data["ReadDepth"] > 10)].copy(deep=True) + +filter_data.reset_index(inplace=True, drop=False) +filter_data = filter_data[["mutation", "amplicon"]] +filter_data = filter_data.groupby(["amplicon"]).agg(lambda x: ' '.join(set(x))) + +filter_data.sort_index(key= lambda x: np.argsort(index_natsorted(filter_data.index)), inplace=True) +print(filter_data) + +amplicon_profiles.replace(r"^\s", "", regex=True, inplace=True) +print(amplicon_profiles) +for index, mutation_profile in filter_data.iterrows(): + lineage_hit = amplicon_profiles.columns[(amplicon_profiles == mutation_profile["mutation"]).any()].tolist() + print(index + "\t" + mutation_profile["mutation"], lineage_hit) + snvs = mutation_profile["mutation"].split(" ") + if len(snvs) > 1: + for snv in snvs: + snv_hit = amplicon_profiles.columns[(amplicon_profiles == snv).any()].tolist() + print("\t" + snv, snv_hit) + # print(list(powerset(snvs))) + + + +# pos = 21764 +# test = amplicons[(pos >= amplicons['amp_start']) & (pos <= amplicons['amp_end'])] +# print(test) + +# print(mutations) + +# import sys + +# sys.stderr = open(snakemake.log[0], "w") + +# import altair as alt +# import pandas as pd +# import pysam +# from intervaltree import IntervalTree + +# # read primer bedpe to df +# PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None) +# PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True) +# PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"] + +# # convert df to interval trees +# primer_intervals = IntervalTree() +# no_primer_intervals = IntervalTree() +# for index, row in PRIMER.iterrows(): +# primer_intervals[row["p1_start"] : row["p2_end"] + 1] = ( +# row["p1_start"], +# row["p2_end"] + 1, +# ) +# no_primer_intervals[row["p1_end"] + 1 : row["p2_start"]] = ( +# row["p1_end"] + 1, +# row["p2_start"], +# ) + + +# def iter_with_samples(inputfiles): +# return zip(snakemake.params.samples, inputfiles) + + +# def count_intervals(file): +# with pysam.AlignmentFile(file, "rb") as bam: +# counter_primer = 0 +# counter_no_primer = 0 +# counter_primer_within = 0 +# counter_no_primer_within = 0 +# counter_nothing = 0 +# mate_pair_intervals = {} +# for read in bam.fetch(): +# if not mate_pair_intervals.get(read.query_name): +# mate_pair_intervals[read.query_name] = [read.reference_start] +# else: +# mate_pair_intervals[read.query_name].append(read.reference_end) +# for pair in mate_pair_intervals: +# if ( +# len(mate_pair_intervals[pair]) > 1 +# and mate_pair_intervals[pair][0] != None +# and mate_pair_intervals[pair][1] != None +# ): +# if primer_intervals.envelop( +# mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1 +# ): +# if ( +# sorted( +# primer_intervals.envelop( +# mate_pair_intervals[pair][0], +# mate_pair_intervals[pair][1] + 1, +# ) +# )[0].begin +# == mate_pair_intervals[pair][0] +# and 
+
+# pos = 21764
+# test = amplicons[(pos >= amplicons['amp_start']) & (pos <= amplicons['amp_end'])]
+# print(test)
+
+# print(mutations)
+
+# import sys
+
+# sys.stderr = open(snakemake.log[0], "w")
+
+# import altair as alt
+# import pandas as pd
+# import pysam
+# from intervaltree import IntervalTree
+
+# # read primer bedpe to df
+# PRIMER = pd.read_csv(snakemake.params.get("bedpe", ""), delimiter="\t", header=None)
+# PRIMER.drop(PRIMER.columns[[0, 3]], axis=1, inplace=True)
+# PRIMER.columns = ["p1_start", "p1_end", "p2_start", "p2_end"]
+
+# # convert df to interval trees
+# primer_intervals = IntervalTree()
+# no_primer_intervals = IntervalTree()
+# for index, row in PRIMER.iterrows():
+#     primer_intervals[row["p1_start"] : row["p2_end"] + 1] = (
+#         row["p1_start"],
+#         row["p2_end"] + 1,
+#     )
+#     no_primer_intervals[row["p1_end"] + 1 : row["p2_start"]] = (
+#         row["p1_end"] + 1,
+#         row["p2_start"],
+#     )
+
+
+# def iter_with_samples(inputfiles):
+#     return zip(snakemake.params.samples, inputfiles)
+
+
+# def count_intervals(file):
+#     with pysam.AlignmentFile(file, "rb") as bam:
+#         counter_primer = 0
+#         counter_no_primer = 0
+#         counter_primer_within = 0
+#         counter_no_primer_within = 0
+#         counter_nothing = 0
+#         mate_pair_intervals = {}
+#         for read in bam.fetch():
+#             if not mate_pair_intervals.get(read.query_name):
+#                 mate_pair_intervals[read.query_name] = [read.reference_start]
+#             else:
+#                 mate_pair_intervals[read.query_name].append(read.reference_end)
+#         for pair in mate_pair_intervals:
+#             if (
+#                 len(mate_pair_intervals[pair]) > 1
+#                 and mate_pair_intervals[pair][0] != None
+#                 and mate_pair_intervals[pair][1] != None
+#             ):
+#                 if primer_intervals.envelop(
+#                     mate_pair_intervals[pair][0], mate_pair_intervals[pair][1] + 1
+#                 ):
+#                     if (
+#                         sorted(
+#                             primer_intervals.envelop(
+#                                 mate_pair_intervals[pair][0],
+#                                 mate_pair_intervals[pair][1] + 1,
+#                             )
+#                         )[0].begin
+#                         == mate_pair_intervals[pair][0]
+#                         and sorted(
+#                             primer_intervals.envelop(
+#                                 mate_pair_intervals[pair][0],
+#                                 mate_pair_intervals[pair][1] + 1,
+#                             )
+#                         )[0].end
+#                         == mate_pair_intervals[pair][1] + 1
+#                     ):
+#                         counter_primer += 1
+#                     else:
+#                         counter_primer_within += 1
+#                 elif no_primer_intervals.envelop(
+#                     mate_pair_intervals[pair][0] + 1, mate_pair_intervals[pair][1]
+#                 ):
+#                     if (
+#                         sorted(
+#                             no_primer_intervals.envelop(
+#                                 mate_pair_intervals[pair][0] + 1,
+#                                 mate_pair_intervals[pair][1],
+#                             )
+#                         )[0].begin
+#                         == mate_pair_intervals[pair][0] + 1
+#                         and sorted(
+#                             no_primer_intervals.envelop(
+#                                 mate_pair_intervals[pair][0] + 1,
+#                                 mate_pair_intervals[pair][1],
+#                             )
+#                         )[0].end
+#                         == mate_pair_intervals[pair][1]
+#                     ):
+#                         counter_no_primer += 1
+#                     else:
+#                         counter_no_primer_within += 1
+#                 else:
+#                     counter_nothing += 1
+#             else:
+#                 counter_nothing += 1
+#         counters = pd.DataFrame(
+#             {
+#                 "n_count": [
+#                     counter_primer,
+#                     counter_primer_within,
+#                     counter_no_primer,
+#                     counter_no_primer_within,
+#                     counter_nothing,
+#                 ],
+#                 "class": [
+#                     "uncut primer exact",
+#                     "uncut primer within",
+#                     "cut primer exact",
+#                     "cut primer within",
+#                     "no matching win",
+#                 ],
+#             }
+#         )
+#         return counters
+
+
+# def plot_classes(counters):
+#     bars = (
+#         alt.Chart(counters)
+#         .mark_bar()
+#         .encode(
+#             y="class",
+#             x="n_count",
+#             row=alt.Row("sample:N"),
+#             column=alt.Column("state:N", sort="descending"),
+#         )
+#     )
+#     text = bars.mark_text(
+#         align="left",
+#         baseline="middle",
+#         dx=3,  # Nudges text to right so it doesn't appear on top of the bar
+#     ).encode(text="n_count", row=alt.Row("sample:N"), column=alt.Column("state:N"))
+#     return bars, text
+
+
+# all_df = pd.DataFrame()
+# for sample, file in iter_with_samples(snakemake.input.unclipped):
+#     counts_before = count_intervals(file)
+#     counts_before["sample"] = sample
+#     counts_before["state"] = "before"
+#     all_df = pd.concat([all_df, counts_before], ignore_index=True)
+
+# for sample, file in iter_with_samples(snakemake.input.clipped):
+#     counts_after = count_intervals(file)
+#     counts_after["sample"] = sample
+#     counts_after["state"] = "after"
+#     all_df = pd.concat([all_df, counts_after], ignore_index=True)
+
+# bars, text = plot_classes(all_df)
+
+# (bars).properties(title="Amplicon matching").save(snakemake.output.plot)