Merge pull request #173 from nextstrain/add-derived-haplotypes-for-all-sequences
Summarize haplotype coverage by titer references using frequencies per haplotype from all available data
Showing 8 changed files with 393 additions and 15 deletions.
@@ -3,6 +3,7 @@
 data/
 builds/
 results/
+tables/
 auspice/
 auspice-who/
 auspice_renamed/
@@ -0,0 +1,80 @@
""" | ||
Annotate derived haplotypes per node from annotated clades and store as node data JSON. | ||
""" | ||
import argparse
import pandas as pd


def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False):
    """Create a haplotype string for the given record based on the values in its
    clade and mutations column. If a list of genes is given, filter mutations to
    only those in the requested genes.
    """
    clade = record[clade_column]
    mutations = record[mutations_column].split(",")

    # Filter mutations to requested genes.
    if genes is not None:
        mutations = [
            mutation
            for mutation in mutations
            if mutation.split(":")[0] in genes
        ]

    mutations = "-".join(mutations).replace(":", "-")

    if mutations:
        if strip_genes and genes is not None:
            for gene in genes:
                mutations = mutations.replace(f"{gene}-", "")

        return f"{clade}:{mutations}"
    else:
        return clade


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Annotate derived haplotypes per record in Nextclade annotations",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--nextclade", required=True, help="TSV file of Nextclade annotations with columns for clade and AA mutations derived from clade")
    parser.add_argument("--clade-column", help="name of the branch attribute for clade labels in the given Nextclade annotations", default="subclade")
    parser.add_argument("--mutations-column", help="name of the attribute for mutations relative to clades in the given Nextclade annotations", default="founderMuts['subclade'].aaSubstitutions")
    parser.add_argument("--genes", nargs="+", help="list of genes to filter mutations to. If not provided, all mutations will be used.")
    parser.add_argument("--strip-genes", action="store_true", help="strip gene names from coordinates in output haplotypes")
    parser.add_argument("--attribute-name", default="haplotype", help="name of attribute to store the derived haplotype in the output file")
    parser.add_argument("--output", help="TSV file of Nextclade annotations with derived haplotype column added", required=True)
    args = parser.parse_args()

    # Load Nextclade annotations.
    df = pd.read_csv(
        args.nextclade,
        sep="\t",
        dtype={
            args.clade_column: "str",
            args.mutations_column: "str",
        },
        na_filter=False,
    )

    # Annotate derived haplotypes.
    df[args.attribute_name] = df.apply(
        lambda record: create_haplotype_for_record(
            record,
            args.clade_column,
            args.mutations_column,
            args.genes,
            args.strip_genes,
        ),
        axis=1
    )

    # Save updated Nextclade annotations.
    df.to_csv(
        args.output,
        sep="\t",
        index=False,
    )
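As a quick illustration of how create_haplotype_for_record composes its label (the record values below are invented for the example, not taken from this repository's data), filtering to HA1 and stripping gene names yields a compact clade-plus-mutations string:

# Hypothetical Nextclade row; the column names match the script's defaults, the values are made up.
record = {
    "subclade": "J.2",
    "founderMuts['subclade'].aaSubstitutions": "HA1:E50K,HA1:I140K,PB2:V649I",
}

haplotype = create_haplotype_for_record(
    record,
    clade_column="subclade",
    mutations_column="founderMuts['subclade'].aaSubstitutions",
    genes=["HA1"],
    strip_genes=True,
)
print(haplotype)  # "J.2:E50K-I140K": the PB2 mutation is dropped and the HA1 prefix stripped.

A record with no remaining mutations falls through to the bare clade name, so every strain still gets a comparable haplotype label.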
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
import argparse
import numpy as np

from augur.dates import get_numerical_dates, numeric_date_type
from augur.frequencies import format_frequencies
from augur.frequency_estimators import get_pivots, KdeFrequencies
from augur.io import read_metadata
from augur.utils import write_json


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Estimate sequence frequencies from metadata with collection dates",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--metadata", required=True, help="TSV file of metadata with at least 'strain' and 'date' columns")
    parser.add_argument("--narrow-bandwidth", required=True, type=float, help="narrow bandwidth for KDE frequencies")
    parser.add_argument("--proportion-wide", type=float, default=0.0, help="proportion of wide bandwidth to use for KDE frequencies")
    parser.add_argument("--pivot-interval", type=int, default=4, help="interval between pivots in weeks")
    parser.add_argument("--min-date", type=numeric_date_type, help="minimum date to estimate frequencies for")
    parser.add_argument("--max-date", type=numeric_date_type, help="maximum date to estimate frequencies for")
    parser.add_argument("--output", required=True, help="JSON file in tip-frequencies format")
    args = parser.parse_args()

    # Load strain names and collection dates from metadata and convert dates to numeric values.
    columns_to_load = ["strain", "date"]
    metadata = read_metadata(
        args.metadata,
        columns=columns_to_load,
        dtype="string",
    )
    dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')

    # Keep strains with resolvable dates, using the mean of each strain's numeric date values as its observation.
    strains = []
    observations = []
    for strain in metadata.index.values:
        if dates.get(strain):
            strains.append(strain)
            observations.append(np.mean(dates[strain]))

    # Build pivots at the requested weekly interval across the observed or requested date range.
    pivots = get_pivots(
        observations,
        args.pivot_interval,
        args.min_date,
        args.max_date,
        "weeks",
    )

    # Estimate per-strain KDE frequencies at each pivot and drop strains with zero total frequency.
    frequencies = KdeFrequencies(
        sigma_narrow=args.narrow_bandwidth,
        proportion_wide=args.proportion_wide,
        pivot_frequency=args.pivot_interval,
        start_date=args.min_date,
        end_date=args.max_date,
    )
    frequency_matrix = frequencies.estimate_frequencies(
        observations,
        pivots,
    )
    tip_frequencies = {
        strain: frequency_matrix[index]
        for index, strain in enumerate(strains)
        if frequency_matrix[index].sum() > 0
    }

    # Assemble the tip-frequencies data structure and write it out as JSON.
    frequency_dict = {"pivots": list(pivots)}
    for node_name in tip_frequencies:
        frequency_dict[node_name] = {
            "frequencies": format_frequencies(tip_frequencies[node_name])
        }

    write_json(frequency_dict, args.output)
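The JSON written by write_json follows the tip-frequencies layout the script's --output help describes: a top-level "pivots" list of numeric dates plus one entry per retained strain with its per-pivot frequencies. A minimal sketch of that shape (the strain name and all numbers below are illustrative, not real output):

# Illustrative structure of the tip-frequencies JSON produced by this script; values are invented.
frequency_dict = {
    "pivots": [2023.0, 2023.0769, 2023.1538],
    "A/Example/1/2023": {"frequencies": [0.01, 0.02, 0.03]},
}

Strains whose estimated frequencies sum to zero across all pivots are omitted from the output entirely.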