diff --git a/workflow/scripts/collect_lineage_calls.py b/workflow/scripts/collect_lineage_calls.py
index b897b0e7..7dc86ce2 100644
--- a/workflow/scripts/collect_lineage_calls.py
+++ b/workflow/scripts/collect_lineage_calls.py
@@ -57,7 +57,7 @@ def collect_calls(sm_input, sm_output, states, lineage, number, length):
     ]
 
     # bring them together
-    call = pangolin_calls.append(call)
+    call = pd.concat([pangolin_calls, call])
 
     call.to_csv(sm_output, sep="\t", index=False)
 
diff --git a/workflow/scripts/evaluate-strain-call-error.py b/workflow/scripts/evaluate-strain-call-error.py
index f9139755..7b202af7 100644
--- a/workflow/scripts/evaluate-strain-call-error.py
+++ b/workflow/scripts/evaluate-strain-call-error.py
@@ -62,7 +62,7 @@ def eval_error(paths, sm_output, max_reads, prefix, separator, percentage, load_
 
         df = df.merge(org_mix_df, how="outer").fillna(0)
 
-        results_df = results_df.append(df)
+        results_df = pd.concat([results_df, df])
 
     for sample in results_df["mix"].unique():
         sample_rmse = rmse(
diff --git a/workflow/scripts/extract-strains-from-gisaid-provision.py b/workflow/scripts/extract-strains-from-gisaid-provision.py
index c737fdcc..7ed83248 100644
--- a/workflow/scripts/extract-strains-from-gisaid-provision.py
+++ b/workflow/scripts/extract-strains-from-gisaid-provision.py
@@ -21,7 +21,9 @@ def extract_strains_from_provision(
     chunks = pd.read_json(path_to_provision, lines=True, chunksize=9000)
     for i, chunk in enumerate(chunks):
         print(f"Parsing chunk {i}", file=sys.stderr)
-        provision = provision.append(select_oldest_strains(chunk), ignore_index=True)
+        provision = pd.concat(
+            [provision, select_oldest_strains(chunk)], ignore_index=True
+        )
     provision = select_oldest_strains(provision)
 
     # save strain genomes
diff --git a/workflow/scripts/generate-lineage-variant-table.py b/workflow/scripts/generate-lineage-variant-table.py
index 6a7491fe..94b5387c 100644
--- a/workflow/scripts/generate-lineage-variant-table.py
+++ b/workflow/scripts/generate-lineage-variant-table.py
@@ -75,21 +75,36 @@ def rename_enumeration(list_length):
         lineages = record.info["LINEAGES"]
         for signature in signatures:
             # generate df with all signatures + VAF and Prob_not_present from calculation
-            variants_df = variants_df.append(
-                {
-                    "Mutations": signature,
-                    "Frequency": vaf,
-                    "ReadDepth": dp,
-                    "Prob_not_present": prob_not_present,
-                },
+            variants_df = pd.concat(
+                [
+                    variants_df,
+                    pd.DataFrame(
+                        {
+                            "Frequency": vaf,
+                            "Mutations": signature,
+                            "Prob_not_present": prob_not_present,
+                            "ReadDepth": dp,
+                        },
+                        index=[0],
+                    ),
+                ],
                 ignore_index=True,
             )
-            # generate df with lineage matrix for all signatures
-            lineage_df = lineage_df.append(
-                {
-                    "Mutations": signature,
-                    **{lineage.replace(".", " "): "x" for lineage in lineages},
-                },
+
+            lineage_df = pd.concat(
+                [
+                    lineage_df,
+                    pd.DataFrame(
+                        {
+                            "Mutations": [signature],
+                            **{
+                                lineage.replace(".", " "): "x"
+                                for lineage in lineages
+                            },
+                        },
+                        index=[0],
+                    ),
+                ],
                 ignore_index=True,
             )
 
diff --git a/workflow/scripts/generate-overview-table.py b/workflow/scripts/generate-overview-table.py
index 28683493..448f61f1 100644
--- a/workflow/scripts/generate-overview-table.py
+++ b/workflow/scripts/generate-overview-table.py
@@ -65,7 +65,7 @@ def is_patient_report():
         columns=[eukaryota, bacteria, viruses, sars_cov2, unclassified]
     ).fillna(0)
     kraken_results["sample"] = sample
-    species_columns = species_columns.append(kraken_results, ignore_index=True)
+    species_columns = pd.concat([species_columns, kraken_results], ignore_index=True)
 
 data = data.join(species_columns.set_index("sample"))
 
diff --git a/workflow/scripts/plot-all-coverage.py b/workflow/scripts/plot-all-coverage.py
index c7af5699..065d9266 100644
--- a/workflow/scripts/plot-all-coverage.py
+++ b/workflow/scripts/plot-all-coverage.py
@@ -23,7 +23,7 @@ def plot_coverage(sm_input, sm_output, min_coverage):
 
         sample_df["Sample"] = sample_df["#CHROM"].apply(lambda x: str(x).split(".")[0])
 
-        coverage = coverage.append(sample_df, ignore_index=True)
+        coverage = pd.concat([coverage, sample_df], ignore_index=True)
 
     coverage["# Coverage"] = coverage.Coverage.apply(
         lambda x: f"< {min_coverage}"
diff --git a/workflow/scripts/plot-assembly-comparison.py b/workflow/scripts/plot-assembly-comparison.py
index 27ee6006..3754784e 100644
--- a/workflow/scripts/plot-assembly-comparison.py
+++ b/workflow/scripts/plot-assembly-comparison.py
@@ -18,29 +18,43 @@ def register_lengths(sample, file_list, state, amplicon_state, data):
     for file, assembler in zip(file_list, snakemake.params.assembler):
         if state in ("initial", "scaffolded"):
             with pysam.FastxFile(file) as infile:
-                data = data.append(
-                    {
-                        "Sample": sample,
-                        "Assembler": assembler,
-                        "Amplicon": amplicon_state,
-                        "length (bp)": max(len(contig.sequence) for contig in infile),
-                        "State": state,
-                    },
+                data = pd.concat(
+                    [
+                        data,
+                        pd.DataFrame(
+                            {
+                                "Sample": sample,
+                                "Assembler": assembler,
+                                "Amplicon": amplicon_state,
+                                "length (bp)": max(
+                                    len(contig.sequence) for contig in infile
+                                ),
+                                "State": state,
+                            },
+                            index=[0],
+                        ),
+                    ],
                     ignore_index=True,
                 )
         else:
             quastDf = pd.read_csv(file, sep="\t")
-            data = data.append(
-                {
-                    "Sample": sample,
-                    "Assembler": assembler,
-                    "Amplicon": amplicon_state,
-                    "length (bp)": quastDf.loc[0, "N50"],
-                    "State": "N50",
-                    "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"]
-                    if "Genome fraction (%)" in quastDf.columns
-                    else float("nan"),
-                },
+            data = pd.concat(
+                [
+                    data,
+                    pd.DataFrame(
+                        {
+                            "Sample": sample,
+                            "Assembler": assembler,
+                            "Amplicon": amplicon_state,
+                            "length (bp)": quastDf.loc[0, "N50"],
+                            "State": "N50",
+                            "Genome fraction (%)": quastDf.loc[0, "Genome fraction (%)"]
+                            if "Genome fraction (%)" in quastDf.columns
+                            else float("nan"),
+                        },
+                        index=[0],
+                    ),
+                ],
                 ignore_index=True,
             )
     return data
diff --git a/workflow/scripts/plot-dependency-of-pangolin-call.py b/workflow/scripts/plot-dependency-of-pangolin-call.py
index 99117ab8..f8b5f03c 100644
--- a/workflow/scripts/plot-dependency-of-pangolin-call.py
+++ b/workflow/scripts/plot-dependency-of-pangolin-call.py
@@ -35,7 +35,7 @@ def plot_dependency_of_pangolin_call(sm_input, sm_output):
         pangolin_output["mixture_content"] = input.split(MIXTURE_PREFIX, 1)[-1].split(
             "."
         )[0]
-        all_sampes = all_sampes.append(pangolin_output, ignore_index=True)
+        all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True)
 
     all_sampes["mixture_content"] = all_sampes["mixture_content"].str.replace("-", ".")
 
diff --git a/workflow/scripts/plot-pangolin-conflict.py b/workflow/scripts/plot-pangolin-conflict.py
index e653e0b7..2d7a28f5 100644
--- a/workflow/scripts/plot-pangolin-conflict.py
+++ b/workflow/scripts/plot-pangolin-conflict.py
@@ -35,7 +35,7 @@ def plot_pangolin_conflict(sm_input, sm_output):
         pangolin_output = pd.read_csv(input)
         pangolin_output["true_lineage"] = true_lineage
        pangolin_output["true_lineage_percent"] = percent
-        all_sampes = all_sampes.append(pangolin_output, ignore_index=True)
+        all_sampes = pd.concat([all_sampes, pangolin_output], ignore_index=True)
 
     all_sampes["correct_lineage_assignment"] = (
         all_sampes["lineage"] == all_sampes["true_lineage"]
diff --git a/workflow/scripts/plot-primer-clipping.py b/workflow/scripts/plot-primer-clipping.py
index d30f4dea..c6dabb8e 100644
--- a/workflow/scripts/plot-primer-clipping.py
+++ b/workflow/scripts/plot-primer-clipping.py
@@ -147,13 +147,13 @@ def plot_classes(counters):
     counts_before = count_intervals(file)
     counts_before["sample"] = sample
     counts_before["state"] = "before"
-    all_df = all_df.append(counts_before, ignore_index=True)
+    all_df = pd.concat([all_df, counts_before], ignore_index=True)
 
 for sample, file in iter_with_samples(snakemake.input.clipped):
     counts_after = count_intervals(file)
     counts_after["sample"] = sample
     counts_after["state"] = "after"
-    all_df = all_df.append(counts_after, ignore_index=True)
+    all_df = pd.concat([all_df, counts_after], ignore_index=True)
 
 bars, text = plot_classes(all_df)
 