Merge pull request #173 from nextstrain/add-derived-haplotypes-for-all-sequences
Summarize haplotype coverage by titer references using frequencies per haplotype from all available data
Showing 8 changed files with 393 additions and 15 deletions.
@@ -3,6 +3,7 @@
 data/
 builds/
 results/
+tables/
 auspice/
 auspice-who/
 auspice_renamed/
@@ -0,0 +1,80 @@
""" | ||
Annotate derived haplotypes per node from annotated clades and store as node data JSON. | ||
""" | ||
import argparse
import pandas as pd


def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False):
    """Create a haplotype string for the given record based on the values in its
    clade and mutations column. If a list of genes is given, filter mutations to
    only those in the requested genes.
    """
    clade = record[clade_column]
    mutations = record[mutations_column].split(",")

    # Filter mutations to requested genes.
    if genes is not None:
        mutations = [
            mutation
            for mutation in mutations
            if mutation.split(":")[0] in genes
        ]

    mutations = "-".join(mutations).replace(":", "-")

    if mutations:
        if strip_genes and genes is not None:
            for gene in genes:
                mutations = mutations.replace(f"{gene}-", "")

        return f"{clade}:{mutations}"
    else:
        return clade


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Annotate derived haplotypes per record in Nextclade annotations",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--nextclade", required=True, help="TSV file of Nextclade annotations with columns for clade and AA mutations derived from clade")
    parser.add_argument("--clade-column", help="name of the branch attribute for clade labels in the given Nextclade annotations", default="subclade")
    parser.add_argument("--mutations-column", help="name of the attribute for mutations relative to clades in the given Nextclade annotations", default="founderMuts['subclade'].aaSubstitutions")
    parser.add_argument("--genes", nargs="+", help="list of genes to filter mutations to. If not provided, all mutations will be used.")
    parser.add_argument("--strip-genes", action="store_true", help="strip gene names from coordinates in output haplotypes")
    parser.add_argument("--attribute-name", default="haplotype", help="name of attribute to store the derived haplotype in the output file")
    parser.add_argument("--output", help="TSV file of Nextclade annotations with derived haplotype column added", required=True)
    args = parser.parse_args()

    # Load Nextclade annotations.
    df = pd.read_csv(
        args.nextclade,
        sep="\t",
        dtype={
            args.clade_column: "str",
            args.mutations_column: "str",
        },
        na_filter=False,
    )

    # Annotate derived haplotypes.
    df[args.attribute_name] = df.apply(
        lambda record: create_haplotype_for_record(
            record,
            args.clade_column,
            args.mutations_column,
            args.genes,
            args.strip_genes,
        ),
        axis=1
    )

    # Save updated Nextclade annotations.
    df.to_csv(
        args.output,
        sep="\t",
        index=False,
    )
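As a quick illustration of how create_haplotype_for_record composes its label (the record values below are invented for the example, not taken from this repository's data), filtering to HA1 and stripping gene names yields a compact clade-plus-mutations string:

# Hypothetical Nextclade row; the column names match the script's defaults, the values are made up.
record = {
    "subclade": "J.2",
    "founderMuts['subclade'].aaSubstitutions": "HA1:E50K,HA1:I140K,PB2:V649I",
}

haplotype = create_haplotype_for_record(
    record,
    clade_column="subclade",
    mutations_column="founderMuts['subclade'].aaSubstitutions",
    genes=["HA1"],
    strip_genes=True,
)
print(haplotype)  # "J.2:E50K-I140K": the PB2 mutation is dropped and the HA1 prefix stripped.

A record with no remaining mutations falls through to the bare clade name, so every strain still gets a comparable haplotype label.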
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
import argparse
import numpy as np

from augur.dates import get_numerical_dates, numeric_date_type
from augur.frequencies import format_frequencies
from augur.frequency_estimators import get_pivots, KdeFrequencies
from augur.io import read_metadata
from augur.utils import write_json


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Estimate sequence frequencies from metadata with collection dates",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("--metadata", required=True, help="TSV file of metadata with at least 'strain' and 'date' columns")
    parser.add_argument("--narrow-bandwidth", required=True, type=float, help="narrow bandwidth for KDE frequencies")
    parser.add_argument("--proportion-wide", type=float, default=0.0, help="proportion of wide bandwidth to use for KDE frequencies")
    parser.add_argument("--pivot-interval", type=int, default=4, help="interval between pivots in weeks")
    parser.add_argument("--min-date", type=numeric_date_type, help="minimum date to estimate frequencies for")
    parser.add_argument("--max-date", type=numeric_date_type, help="maximum date to estimate frequencies for")
    parser.add_argument("--output", required=True, help="JSON file in tip-frequencies format")
    args = parser.parse_args()

    # Load strain names and collection dates from metadata and convert dates to numeric values.
    columns_to_load = ["strain", "date"]
    metadata = read_metadata(
        args.metadata,
        columns=columns_to_load,
        dtype="string",
    )
    dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')

    # Keep strains with resolvable dates, using the mean of each strain's numeric date values as its observation.
    strains = []
    observations = []
    for strain in metadata.index.values:
        if dates.get(strain):
            strains.append(strain)
            observations.append(np.mean(dates[strain]))

    # Build pivots at the requested weekly interval across the observed or requested date range.
    pivots = get_pivots(
        observations,
        args.pivot_interval,
        args.min_date,
        args.max_date,
        "weeks",
    )

    # Estimate per-strain KDE frequencies at each pivot and drop strains with zero total frequency.
    frequencies = KdeFrequencies(
        sigma_narrow=args.narrow_bandwidth,
        proportion_wide=args.proportion_wide,
        pivot_frequency=args.pivot_interval,
        start_date=args.min_date,
        end_date=args.max_date,
    )
    frequency_matrix = frequencies.estimate_frequencies(
        observations,
        pivots,
    )
    tip_frequencies = {
        strain: frequency_matrix[index]
        for index, strain in enumerate(strains)
        if frequency_matrix[index].sum() > 0
    }

    # Assemble the tip-frequencies data structure and write it out as JSON.
    frequency_dict = {"pivots": list(pivots)}
    for node_name in tip_frequencies:
        frequency_dict[node_name] = {
            "frequencies": format_frequencies(tip_frequencies[node_name])
        }

    write_json(frequency_dict, args.output)
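The JSON written by write_json follows the tip-frequencies layout the script's --output help describes: a top-level "pivots" list of numeric dates plus one entry per retained strain with its per-pivot frequencies. A minimal sketch of that shape (the strain name and all numbers below are illustrative, not real output):

# Illustrative structure of the tip-frequencies JSON produced by this script; values are invented.
frequency_dict = {
    "pivots": [2023.0, 2023.0769, 2023.1538],
    "A/Example/1/2023": {"frequencies": [0.01, 0.02, 0.03]},
}

Strains whose estimated frequencies sum to zero across all pivots are omitted from the output entirely.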