From 71297ecd06bacd8385037201a565095111bf8c52 Mon Sep 17 00:00:00 2001 From: lenakinzel Date: Thu, 28 Apr 2022 07:15:43 +0000 Subject: [PATCH 1/6] replaced all df.append with pd.concat --- workflow/scripts/collect_lineage_calls.py | 3 +- .../scripts/evaluate-strain-call-error.py | 3 +- .../extract-strains-from-gisaid-provision.py | 5 +- .../scripts/generate-lineage-variant-table.py | 44 +++++++++---- workflow/scripts/generate-overview-table.py | 3 +- workflow/scripts/plot-all-coverage.py | 3 +- workflow/scripts/plot-assembly-comparison.py | 64 ++++++++++++------- .../plot-dependency-of-pangolin-call.py | 3 +- workflow/scripts/plot-pangolin-conflict.py | 3 +- workflow/scripts/plot-primer-clipping.py | 6 +- 10 files changed, 92 insertions(+), 45 deletions(-) diff --git a/workflow/scripts/collect_lineage_calls.py b/workflow/scripts/collect_lineage_calls.py index b897b0e78..8ed731eb8 100644 --- a/workflow/scripts/collect_lineage_calls.py +++ b/workflow/scripts/collect_lineage_calls.py @@ -57,7 +57,8 @@ def collect_calls(sm_input, sm_output, states, lineage, number, length): ] # bring them together - call = pangolin_calls.append(call) + # call = pangolin_calls.append(call) + call = pd.concat([pangolin_calls, call]) call.to_csv(sm_output, sep="\t", index=False) diff --git a/workflow/scripts/evaluate-strain-call-error.py b/workflow/scripts/evaluate-strain-call-error.py index cca966c95..e1ffeb6bf 100644 --- a/workflow/scripts/evaluate-strain-call-error.py +++ b/workflow/scripts/evaluate-strain-call-error.py @@ -64,7 +64,8 @@ def eval_error(paths, sm_output, max_reads, prefix, separator, percentage, load_ df = df.merge(org_mix_df, how="outer").fillna(0) - results_df = results_df.append(df) + # results_df = results_df.append(df) + results_df = pd.concat([results_df, df]) for sample in results_df["mix"].unique(): sample_rmse = rmse( diff --git a/workflow/scripts/extract-strains-from-gisaid-provision.py b/workflow/scripts/extract-strains-from-gisaid-provision.py index c737fdccd..7f8b5214e 100644 --- a/workflow/scripts/extract-strains-from-gisaid-provision.py +++ b/workflow/scripts/extract-strains-from-gisaid-provision.py @@ -21,7 +21,10 @@ def extract_strains_from_provision( chunks = pd.read_json(path_to_provision, lines=True, chunksize=9000) for i, chunk in enumerate(chunks): print(f"Parsing chunk {i}", file=sys.stderr) - provision = provision.append(select_oldest_strains(chunk), ignore_index=True) + # provision = provision.append(select_oldest_strains(chunk), ignore_index=True) + provision = pd.concat( + [provision, select_oldest_strains(chunk)], ignore_index=True + ) provision = select_oldest_strains(provision) # save strain genomes diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index 6784c48a0..a72f68723 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -67,22 +67,38 @@ def rename_enumeration(list_length): lineages = record.info["LINEAGES"] for signature in signatures: # generate df with all signatures + VAF and Prob_not_present from calculation - variants_df = variants_df.append( - { - "Mutations": signature, - "Frequency": vaf, - "ReadDepth": dp, - "Prob_not_present": prob_not_present, - }, - ignore_index=True, + # variants_df = variants_df.append( + # { + # "Mutations": signature, + # "Frequency": vaf, + # "ReadDepth": dp, + # "Prob_not_present": prob_not_present, + # }, + # ignore_index=True, + # ) + variants_df_append = { + "Mutations": signature, + "Frequency": vaf, + "ReadDepth": dp, + "Prob_not_present": prob_not_present, + } + variants_df = pd.concat( + [variants_df, variants_df_append], ignore_index=True ) # generate df with lineage matrix for all signatures - lineage_df = lineage_df.append( - { - "Mutations": signature, - **{lineage.replace(".", " "): "x" for lineage in lineages}, - }, - ignore_index=True, + # lineage_df = lineage_df.append( + # { + # "Mutations": signature, + # **{lineage.replace(".", " "): "x" for lineage in lineages}, + # }, + # ignore_index=True, + # ) + lineage_df_append = { + "Mutations": signature, + **{lineage.replace(".", " "): "x" for lineage in lineages}, + } + lineage_df = pd.concat( + [lineage_df, lineage_df_append], ignore_index=True ) # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present diff --git a/workflow/scripts/generate-overview-table.py b/workflow/scripts/generate-overview-table.py index 28683493c..b93daafc3 100644 --- a/workflow/scripts/generate-overview-table.py +++ b/workflow/scripts/generate-overview-table.py @@ -65,7 +65,8 @@ def is_patient_report(): columns=[eukaryota, bacteria, viruses, sars_cov2, unclassified] ).fillna(0) kraken_results["sample"] = sample - species_columns = species_columns.append(kraken_results, ignore_index=True) + # species_columns = species_columns.append(kraken_results, ignore_index=True) + species_columns = pd.concat([species_columns, kraken_results], ignore_index=True) data = data.join(species_columns.set_index("sample")) diff --git a/workflow/scripts/plot-all-coverage.py b/workflow/scripts/plot-all-coverage.py index c7af5699c..f2b381aa1 100644 --- a/workflow/scripts/plot-all-coverage.py +++ b/workflow/scripts/plot-all-coverage.py @@ -23,7 +23,8 @@ def plot_coverage(sm_input, sm_output, min_coverage): sample_df["Sample"] = sample_df["#CHROM"].apply(lambda x: str(x).split(".")[0]) - coverage = coverage.append(sample_df, ignore_index=True) + # coverage = coverage.append(sample_df, ignore_index=True) + coverage = pd.concat([coverage, sample_df], ignore_index=True) coverage["# Coverage"] = coverage.Coverage.apply( lambda x: f"< {min_coverage}" diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py index 27ee60060..60a48caa8 100644 --- a/workflow/scripts/plot-assembly-comparison.py +++ b/workflow/scripts/plot-assembly-comparison.py @@ -18,31 +18,51 @@ def register_lengths(sample, file_list, state, amplicon_state, data): for file, assembler in zip(file_list, snakemake.params.assembler): if state in ("initial", "scaffolded"): with pysam.FastxFile(file) as infile: - data = data.append( - { - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": max(len(contig.sequence) for contig in infile), - "State": state, - }, - ignore_index=True, - ) - else: - quastDf = pd.read_csv(file, sep="\t") - data = data.append( - { + # data = data.append( + # { + # "Sample": sample, + # "Assembler": assembler, + # "Amplicon": amplicon_state, + # "length (bp)": max(len(contig.sequence) for contig in infile), + # "State": state, + # }, + # ignore_index=True, + # ) + + data_append = { "Sample": sample, "Assembler": assembler, "Amplicon": amplicon_state, - "length (bp)": quastDf.loc[0, "N50"], - "State": "N50", - "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] - if "Genome fraction (%)" in quastDf.columns - else float("nan"), - }, - ignore_index=True, - ) + "length (bp)": max(len(contig.sequence) for contig in infile), + "State": state, + } + data = pd.concat([data, data_append], ignore_index=True) + else: + quastDf = pd.read_csv(file, sep="\t") + # data = data.append( + # { + # "Sample": sample, + # "Assembler": assembler, + # "Amplicon": amplicon_state, + # "length (bp)": quastDf.loc[0, "N50"], + # "State": "N50", + # "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] + # if "Genome fraction (%)" in quastDf.columns + # else float("nan"), + # }, + # ignore_index=True, + # ) + data_append = { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": quastDf.loc[0, "N50"], + "State": "N50", + "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] + if "Genome fraction (%)" in quastDf.columns + else float("nan"), + } + data = pd.concat([data, data_append], ignore_index=True) return data diff --git a/workflow/scripts/plot-dependency-of-pangolin-call.py b/workflow/scripts/plot-dependency-of-pangolin-call.py index 99117ab89..dbc68f8ca 100644 --- a/workflow/scripts/plot-dependency-of-pangolin-call.py +++ b/workflow/scripts/plot-dependency-of-pangolin-call.py @@ -35,7 +35,8 @@ def plot_dependency_of_pangolin_call(sm_input, sm_output): pangolin_output["mixture_content"] = input.split(MIXTURE_PREFIX, 1)[-1].split( "." )[0] - all_sampes = all_sampes.append(pangolin_output, ignore_index=True) + # all_sampes = all_sampes.append(pangolin_output, ignore_index=True) + all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["mixture_content"] = all_sampes["mixture_content"].str.replace("-", ".") diff --git a/workflow/scripts/plot-pangolin-conflict.py b/workflow/scripts/plot-pangolin-conflict.py index e653e0b77..5acd20b60 100644 --- a/workflow/scripts/plot-pangolin-conflict.py +++ b/workflow/scripts/plot-pangolin-conflict.py @@ -35,7 +35,8 @@ def plot_pangolin_conflict(sm_input, sm_output): pangolin_output = pd.read_csv(input) pangolin_output["true_lineage"] = true_lineage pangolin_output["true_lineage_percent"] = percent - all_sampes = all_sampes.append(pangolin_output, ignore_index=True) + # all_sampes = all_sampes.append(pangolin_output, ignore_index=True) + all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["correct_lineage_assignment"] = ( all_sampes["lineage"] == all_sampes["true_lineage"] diff --git a/workflow/scripts/plot-primer-clipping.py b/workflow/scripts/plot-primer-clipping.py index d30f4dea7..c537a87f1 100644 --- a/workflow/scripts/plot-primer-clipping.py +++ b/workflow/scripts/plot-primer-clipping.py @@ -147,13 +147,15 @@ def plot_classes(counters): counts_before = count_intervals(file) counts_before["sample"] = sample counts_before["state"] = "before" - all_df = all_df.append(counts_before, ignore_index=True) + # all_df = all_df.append(counts_before, ignore_index=True) + all_df = pd.concat([all_df, counts_before], ignore_index=True) for sample, file in iter_with_samples(snakemake.input.clipped): counts_after = count_intervals(file) counts_after["sample"] = sample counts_after["state"] = "after" - all_df = all_df.append(counts_after, ignore_index=True) + # all_df = all_df.append(counts_after, ignore_index=True) + all_df = pd.concat([all_df, counts_after], ignore_index=True) bars, text = plot_classes(all_df) From 760d26cb46b66413cdacc4bdea804753c77b7e68 Mon Sep 17 00:00:00 2001 From: vBassewitz Date: Tue, 28 Jun 2022 09:57:44 +0000 Subject: [PATCH 2/6] replaced df.append with pd.concat --- config/pep/samples.csv | 4 +--- .../scripts/generate-lineage-variant-table.py | 23 +++++++++++-------- workflow/scripts/plot-assembly-comparison.py | 8 +++---- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/config/pep/samples.csv b/config/pep/samples.csv index 4173923ea..ee6ac62cd 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,4 +1,2 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary -SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform -SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform -SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform +SAMPLE_NAME_1,/local/work/uncovar_dfappend/incoming/B.1.1.7.reads.1.fastq.gz,/local/work/uncovar_dfappend/incoming/B.1.1.7.reads.2.fastq.gz,SEQUENCING_DATE,0,illumina,1 \ No newline at end of file diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index a72f68723..0b2d45e3e 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -75,17 +75,18 @@ def rename_enumeration(list_length): # "Prob_not_present": prob_not_present, # }, # ignore_index=True, - # ) - variants_df_append = { - "Mutations": signature, + #) + variants_df_append = pd.DataFrame({ "Frequency": vaf, - "ReadDepth": dp, + "Mutations": signature, "Prob_not_present": prob_not_present, - } + "ReadDepth": dp, + }, index=[0]) + variants_df = pd.concat( [variants_df, variants_df_append], ignore_index=True ) - # generate df with lineage matrix for all signatures + #generate df with lineage matrix for all signatures # lineage_df = lineage_df.append( # { # "Mutations": signature, @@ -93,13 +94,15 @@ def rename_enumeration(list_length): # }, # ignore_index=True, # ) - lineage_df_append = { - "Mutations": signature, + + lineage_df_append = pd.DataFrame({ + "Mutations": [signature], **{lineage.replace(".", " "): "x" for lineage in lineages}, - } + }, index=[0]) + lineage_df = pd.concat( [lineage_df, lineage_df_append], ignore_index=True - ) + ) # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present variants_df = variants_df.groupby(["Mutations"]).agg( diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py index 60a48caa8..debe7c966 100644 --- a/workflow/scripts/plot-assembly-comparison.py +++ b/workflow/scripts/plot-assembly-comparison.py @@ -29,13 +29,13 @@ def register_lengths(sample, file_list, state, amplicon_state, data): # ignore_index=True, # ) - data_append = { + data_append = pd.DataFrame({ "Sample": sample, "Assembler": assembler, "Amplicon": amplicon_state, "length (bp)": max(len(contig.sequence) for contig in infile), "State": state, - } + }, index=[0]) data = pd.concat([data, data_append], ignore_index=True) else: quastDf = pd.read_csv(file, sep="\t") @@ -52,7 +52,7 @@ def register_lengths(sample, file_list, state, amplicon_state, data): # }, # ignore_index=True, # ) - data_append = { + data_append = pd.DataFrame({ "Sample": sample, "Assembler": assembler, "Amplicon": amplicon_state, @@ -61,7 +61,7 @@ def register_lengths(sample, file_list, state, amplicon_state, data): "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] if "Genome fraction (%)" in quastDf.columns else float("nan"), - } + }, index=[0]) data = pd.concat([data, data_append], ignore_index=True) return data From 2f129052e1685d89ad41424185bac3d6541b3bd3 Mon Sep 17 00:00:00 2001 From: vBassewitz Date: Tue, 28 Jun 2022 10:15:08 +0000 Subject: [PATCH 3/6] replace df.append with pd.concat --- config/pep/samples.csv | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/pep/samples.csv b/config/pep/samples.csv index ee6ac62cd..4173923ea 100644 --- a/config/pep/samples.csv +++ b/config/pep/samples.csv @@ -1,2 +1,4 @@ sample_name,fq1,fq2,date,is_amplicon_data,technology,include_in_high_genome_summary -SAMPLE_NAME_1,/local/work/uncovar_dfappend/incoming/B.1.1.7.reads.1.fastq.gz,/local/work/uncovar_dfappend/incoming/B.1.1.7.reads.2.fastq.gz,SEQUENCING_DATE,0,illumina,1 \ No newline at end of file +SAMPLE_NAME_1,PATH/TO/fq1,PATH/TO/fq2,SEQUENCING_DATE,0,illumina,1 # Required information for a sample sequencing on the Illumina platform +SAMPLE_NAME_2,PATH/TO/fq,,SEQUENCING_DATE,1,ont,0 # Required information for a sample sequencing on the Oxford Nanopore platform +SAMPLE_NAME_3,PATH/TO/fq,,SEQUENCING_DATE,1,ion,0 # Required information for a sample sequencing on the Ion Torrent platform From eeae706b9024fd3882f1a80fd0a671635f0d6e4d Mon Sep 17 00:00:00 2001 From: vBassewitz Date: Tue, 28 Jun 2022 11:11:23 +0000 Subject: [PATCH 4/6] replaced df.append with pd.concat --- .../scripts/generate-lineage-variant-table.py | 34 +++++++++------- workflow/scripts/plot-assembly-comparison.py | 40 +++++++++++-------- 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index 0b2d45e3e..a2258dc3c 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -75,18 +75,21 @@ def rename_enumeration(list_length): # "Prob_not_present": prob_not_present, # }, # ignore_index=True, - #) - variants_df_append = pd.DataFrame({ - "Frequency": vaf, - "Mutations": signature, - "Prob_not_present": prob_not_present, - "ReadDepth": dp, - }, index=[0]) + # ) + variants_df_append = pd.DataFrame( + { + "Frequency": vaf, + "Mutations": signature, + "Prob_not_present": prob_not_present, + "ReadDepth": dp, + }, + index=[0], + ) variants_df = pd.concat( [variants_df, variants_df_append], ignore_index=True ) - #generate df with lineage matrix for all signatures + # generate df with lineage matrix for all signatures # lineage_df = lineage_df.append( # { # "Mutations": signature, @@ -94,15 +97,18 @@ def rename_enumeration(list_length): # }, # ignore_index=True, # ) - - lineage_df_append = pd.DataFrame({ - "Mutations": [signature], - **{lineage.replace(".", " "): "x" for lineage in lineages}, - }, index=[0]) + + lineage_df_append = pd.DataFrame( + { + "Mutations": [signature], + **{lineage.replace(".", " "): "x" for lineage in lineages}, + }, + index=[0], + ) lineage_df = pd.concat( [lineage_df, lineage_df_append], ignore_index=True - ) + ) # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present variants_df = variants_df.groupby(["Mutations"]).agg( diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py index debe7c966..c13409b69 100644 --- a/workflow/scripts/plot-assembly-comparison.py +++ b/workflow/scripts/plot-assembly-comparison.py @@ -29,13 +29,16 @@ def register_lengths(sample, file_list, state, amplicon_state, data): # ignore_index=True, # ) - data_append = pd.DataFrame({ - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": max(len(contig.sequence) for contig in infile), - "State": state, - }, index=[0]) + data_append = pd.DataFrame( + { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": max(len(contig.sequence) for contig in infile), + "State": state, + }, + index=[0], + ) data = pd.concat([data, data_append], ignore_index=True) else: quastDf = pd.read_csv(file, sep="\t") @@ -52,16 +55,19 @@ def register_lengths(sample, file_list, state, amplicon_state, data): # }, # ignore_index=True, # ) - data_append = pd.DataFrame({ - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": quastDf.loc[0, "N50"], - "State": "N50", - "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] - if "Genome fraction (%)" in quastDf.columns - else float("nan"), - }, index=[0]) + data_append = pd.DataFrame( + { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": quastDf.loc[0, "N50"], + "State": "N50", + "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] + if "Genome fraction (%)" in quastDf.columns + else float("nan"), + }, + index=[0], + ) data = pd.concat([data, data_append], ignore_index=True) return data From 2d1cabcefd3498a7a510a96d428d624688d82990 Mon Sep 17 00:00:00 2001 From: vBassewitz Date: Tue, 28 Jun 2022 12:30:40 +0000 Subject: [PATCH 5/6] replaced df.append with pd.concat --- workflow/scripts/collect_lineage_calls.py | 1 - .../scripts/evaluate-strain-call-error.py | 1 - .../extract-strains-from-gisaid-provision.py | 1 - .../scripts/generate-lineage-variant-table.py | 33 ++++-------------- workflow/scripts/generate-overview-table.py | 1 - workflow/scripts/plot-all-coverage.py | 1 - workflow/scripts/plot-assembly-comparison.py | 34 +++---------------- .../plot-dependency-of-pangolin-call.py | 1 - workflow/scripts/plot-pangolin-conflict.py | 1 - workflow/scripts/plot-primer-clipping.py | 2 -- 10 files changed, 10 insertions(+), 66 deletions(-) diff --git a/workflow/scripts/collect_lineage_calls.py b/workflow/scripts/collect_lineage_calls.py index 8ed731eb8..7dc86ce28 100644 --- a/workflow/scripts/collect_lineage_calls.py +++ b/workflow/scripts/collect_lineage_calls.py @@ -57,7 +57,6 @@ def collect_calls(sm_input, sm_output, states, lineage, number, length): ] # bring them together - # call = pangolin_calls.append(call) call = pd.concat([pangolin_calls, call]) call.to_csv(sm_output, sep="\t", index=False) diff --git a/workflow/scripts/evaluate-strain-call-error.py b/workflow/scripts/evaluate-strain-call-error.py index 8f87f2073..7b202af7a 100644 --- a/workflow/scripts/evaluate-strain-call-error.py +++ b/workflow/scripts/evaluate-strain-call-error.py @@ -62,7 +62,6 @@ def eval_error(paths, sm_output, max_reads, prefix, separator, percentage, load_ df = df.merge(org_mix_df, how="outer").fillna(0) - # results_df = results_df.append(df) results_df = pd.concat([results_df, df]) for sample in results_df["mix"].unique(): diff --git a/workflow/scripts/extract-strains-from-gisaid-provision.py b/workflow/scripts/extract-strains-from-gisaid-provision.py index 7f8b5214e..7ed832482 100644 --- a/workflow/scripts/extract-strains-from-gisaid-provision.py +++ b/workflow/scripts/extract-strains-from-gisaid-provision.py @@ -21,7 +21,6 @@ def extract_strains_from_provision( chunks = pd.read_json(path_to_provision, lines=True, chunksize=9000) for i, chunk in enumerate(chunks): print(f"Parsing chunk {i}", file=sys.stderr) - # provision = provision.append(select_oldest_strains(chunk), ignore_index=True) provision = pd.concat( [provision, select_oldest_strains(chunk)], ignore_index=True ) diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index 84459d00b..d8d69e326 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -75,16 +75,8 @@ def rename_enumeration(list_length): lineages = record.info["LINEAGES"] for signature in signatures: # generate df with all signatures + VAF and Prob_not_present from calculation - # variants_df = variants_df.append( - # { - # "Mutations": signature, - # "Frequency": vaf, - # "ReadDepth": dp, - # "Prob_not_present": prob_not_present, - # }, - # ignore_index=True, - # ) - variants_df_append = pd.DataFrame( + variants_df = pd.concat( + [variants_df, pd.DataFrame( { "Frequency": vaf, "Mutations": signature, @@ -92,30 +84,17 @@ def rename_enumeration(list_length): "ReadDepth": dp, }, index=[0], + )], ignore_index=True ) - variants_df = pd.concat( - [variants_df, variants_df_append], ignore_index=True - ) - # generate df with lineage matrix for all signatures - # lineage_df = lineage_df.append( - # { - # "Mutations": signature, - # **{lineage.replace(".", " "): "x" for lineage in lineages}, - # }, - # ignore_index=True, - # ) - - lineage_df_append = pd.DataFrame( + lineage_df = pd.concat( + [lineage_df, pd.DataFrame( { "Mutations": [signature], **{lineage.replace(".", " "): "x" for lineage in lineages}, }, index=[0], - ) - - lineage_df = pd.concat( - [lineage_df, lineage_df_append], ignore_index=True + )], ignore_index=True ) # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present diff --git a/workflow/scripts/generate-overview-table.py b/workflow/scripts/generate-overview-table.py index b93daafc3..448f61f12 100644 --- a/workflow/scripts/generate-overview-table.py +++ b/workflow/scripts/generate-overview-table.py @@ -65,7 +65,6 @@ def is_patient_report(): columns=[eukaryota, bacteria, viruses, sars_cov2, unclassified] ).fillna(0) kraken_results["sample"] = sample - # species_columns = species_columns.append(kraken_results, ignore_index=True) species_columns = pd.concat([species_columns, kraken_results], ignore_index=True) data = data.join(species_columns.set_index("sample")) diff --git a/workflow/scripts/plot-all-coverage.py b/workflow/scripts/plot-all-coverage.py index f2b381aa1..065d92667 100644 --- a/workflow/scripts/plot-all-coverage.py +++ b/workflow/scripts/plot-all-coverage.py @@ -23,7 +23,6 @@ def plot_coverage(sm_input, sm_output, min_coverage): sample_df["Sample"] = sample_df["#CHROM"].apply(lambda x: str(x).split(".")[0]) - # coverage = coverage.append(sample_df, ignore_index=True) coverage = pd.concat([coverage, sample_df], ignore_index=True) coverage["# Coverage"] = coverage.Coverage.apply( diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py index c13409b69..8aaab089f 100644 --- a/workflow/scripts/plot-assembly-comparison.py +++ b/workflow/scripts/plot-assembly-comparison.py @@ -18,18 +18,7 @@ def register_lengths(sample, file_list, state, amplicon_state, data): for file, assembler in zip(file_list, snakemake.params.assembler): if state in ("initial", "scaffolded"): with pysam.FastxFile(file) as infile: - # data = data.append( - # { - # "Sample": sample, - # "Assembler": assembler, - # "Amplicon": amplicon_state, - # "length (bp)": max(len(contig.sequence) for contig in infile), - # "State": state, - # }, - # ignore_index=True, - # ) - - data_append = pd.DataFrame( + data = pd.concat([data, pd.DataFrame( { "Sample": sample, "Assembler": assembler, @@ -38,24 +27,10 @@ def register_lengths(sample, file_list, state, amplicon_state, data): "State": state, }, index=[0], - ) - data = pd.concat([data, data_append], ignore_index=True) + )], ignore_index=True) else: quastDf = pd.read_csv(file, sep="\t") - # data = data.append( - # { - # "Sample": sample, - # "Assembler": assembler, - # "Amplicon": amplicon_state, - # "length (bp)": quastDf.loc[0, "N50"], - # "State": "N50", - # "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] - # if "Genome fraction (%)" in quastDf.columns - # else float("nan"), - # }, - # ignore_index=True, - # ) - data_append = pd.DataFrame( + data = pd.concat([data, pd.DataFrame( { "Sample": sample, "Assembler": assembler, @@ -67,8 +42,7 @@ def register_lengths(sample, file_list, state, amplicon_state, data): else float("nan"), }, index=[0], - ) - data = pd.concat([data, data_append], ignore_index=True) + )], ignore_index=True) return data diff --git a/workflow/scripts/plot-dependency-of-pangolin-call.py b/workflow/scripts/plot-dependency-of-pangolin-call.py index dbc68f8ca..f8b5f03c0 100644 --- a/workflow/scripts/plot-dependency-of-pangolin-call.py +++ b/workflow/scripts/plot-dependency-of-pangolin-call.py @@ -35,7 +35,6 @@ def plot_dependency_of_pangolin_call(sm_input, sm_output): pangolin_output["mixture_content"] = input.split(MIXTURE_PREFIX, 1)[-1].split( "." )[0] - # all_sampes = all_sampes.append(pangolin_output, ignore_index=True) all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["mixture_content"] = all_sampes["mixture_content"].str.replace("-", ".") diff --git a/workflow/scripts/plot-pangolin-conflict.py b/workflow/scripts/plot-pangolin-conflict.py index 5acd20b60..2d7a28f55 100644 --- a/workflow/scripts/plot-pangolin-conflict.py +++ b/workflow/scripts/plot-pangolin-conflict.py @@ -35,7 +35,6 @@ def plot_pangolin_conflict(sm_input, sm_output): pangolin_output = pd.read_csv(input) pangolin_output["true_lineage"] = true_lineage pangolin_output["true_lineage_percent"] = percent - # all_sampes = all_sampes.append(pangolin_output, ignore_index=True) all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True) all_sampes["correct_lineage_assignment"] = ( diff --git a/workflow/scripts/plot-primer-clipping.py b/workflow/scripts/plot-primer-clipping.py index c537a87f1..c6dabb8e3 100644 --- a/workflow/scripts/plot-primer-clipping.py +++ b/workflow/scripts/plot-primer-clipping.py @@ -147,14 +147,12 @@ def plot_classes(counters): counts_before = count_intervals(file) counts_before["sample"] = sample counts_before["state"] = "before" - # all_df = all_df.append(counts_before, ignore_index=True) all_df = pd.concat([all_df, counts_before], ignore_index=True) for sample, file in iter_with_samples(snakemake.input.clipped): counts_after = count_intervals(file) counts_after["sample"] = sample counts_after["state"] = "after" - # all_df = all_df.append(counts_after, ignore_index=True) all_df = pd.concat([all_df, counts_after], ignore_index=True) bars, text = plot_classes(all_df) From bc1af003e57db4c3bee45acec439306fb610e8f1 Mon Sep 17 00:00:00 2001 From: vBassewitz Date: Tue, 28 Jun 2022 12:38:50 +0000 Subject: [PATCH 6/6] replaced df.append with pd.concat --- .../scripts/generate-lineage-variant-table.py | 43 ++++++++----- workflow/scripts/plot-assembly-comparison.py | 60 ++++++++++++------- 2 files changed, 64 insertions(+), 39 deletions(-) diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py index d8d69e326..94b5387c1 100644 --- a/workflow/scripts/generate-lineage-variant-table.py +++ b/workflow/scripts/generate-lineage-variant-table.py @@ -76,25 +76,36 @@ def rename_enumeration(list_length): for signature in signatures: # generate df with all signatures + VAF and Prob_not_present from calculation variants_df = pd.concat( - [variants_df, pd.DataFrame( - { - "Frequency": vaf, - "Mutations": signature, - "Prob_not_present": prob_not_present, - "ReadDepth": dp, - }, - index=[0], - )], ignore_index=True + [ + variants_df, + pd.DataFrame( + { + "Frequency": vaf, + "Mutations": signature, + "Prob_not_present": prob_not_present, + "ReadDepth": dp, + }, + index=[0], + ), + ], + ignore_index=True, ) lineage_df = pd.concat( - [lineage_df, pd.DataFrame( - { - "Mutations": [signature], - **{lineage.replace(".", " "): "x" for lineage in lineages}, - }, - index=[0], - )], ignore_index=True + [ + lineage_df, + pd.DataFrame( + { + "Mutations": [signature], + **{ + lineage.replace(".", " "): "x" + for lineage in lineages + }, + }, + index=[0], + ), + ], + ignore_index=True, ) # aggregate both dataframes by summing up repeating rows for VAR (maximum=1) and multiply Prob_not_present diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py index 8aaab089f..3754784eb 100644 --- a/workflow/scripts/plot-assembly-comparison.py +++ b/workflow/scripts/plot-assembly-comparison.py @@ -18,31 +18,45 @@ def register_lengths(sample, file_list, state, amplicon_state, data): for file, assembler in zip(file_list, snakemake.params.assembler): if state in ("initial", "scaffolded"): with pysam.FastxFile(file) as infile: - data = pd.concat([data, pd.DataFrame( - { - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": max(len(contig.sequence) for contig in infile), - "State": state, - }, - index=[0], - )], ignore_index=True) + data = pd.concat( + [ + data, + pd.DataFrame( + { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": max( + len(contig.sequence) for contig in infile + ), + "State": state, + }, + index=[0], + ), + ], + ignore_index=True, + ) else: quastDf = pd.read_csv(file, sep="\t") - data = pd.concat([data, pd.DataFrame( - { - "Sample": sample, - "Assembler": assembler, - "Amplicon": amplicon_state, - "length (bp)": quastDf.loc[0, "N50"], - "State": "N50", - "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] - if "Genome fraction (%)" in quastDf.columns - else float("nan"), - }, - index=[0], - )], ignore_index=True) + data = pd.concat( + [ + data, + pd.DataFrame( + { + "Sample": sample, + "Assembler": assembler, + "Amplicon": amplicon_state, + "length (bp)": quastDf.loc[0, "N50"], + "State": "N50", + "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"] + if "Genome fraction (%)" in quastDf.columns + else float("nan"), + }, + index=[0], + ), + ], + ignore_index=True, + ) return data