-
Notifications
You must be signed in to change notification settings - Fork 403
/
parameters.yaml
346 lines (305 loc) · 11.2 KB
/
parameters.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# This file contains defaults for the "config" object used in the Snakefile.
# To temporarily override or provide a value, you can use snakemake's --config
# or --configfile options.
---
# Default conda environment file for the workflow.
# The path must be relative to the top-level Snakefile directory (e.g., `ncov/`).
conda_environment: "workflow/envs/nextstrain.yaml"
# Strain-name prefixes to strip.
# NOTE(review): presumably removed from the start of strain names; confirm against the workflow rules.
strip_strain_prefixes:
  - hCoV-19/
  - SARS-CoV-2/
# Settings for sanitizing input metadata.
sanitize_metadata:
  # Candidate columns for the strain identifier.
  # NOTE(review): presumably checked in the order listed; confirm against the sanitize script.
  metadata_id_columns:
    - strain
    - name
    - Virus name
  # Candidate columns for database accession identifiers.
  database_id_columns:
    - "Accession ID"
    - gisaid_epi_isl
    - genbank_accession
  error_on_duplicate_strains: false
  # Name of the metadata column holding a combined location value to parse.
  parse_location_field: Location
  # Column renames, each written as "<original name>=<new name>".
  # Several source columns (e.g., "Pango lineage", "pangolin_lineage", "Lineage")
  # map onto the same target name.
  rename_fields:
    - "Virus name=strain"
    - "Type=type"
    - "Accession ID=gisaid_epi_isl"
    - "Collection date=date"
    - "Additional location information=additional_location_information"
    - "Sequence length=length"
    - "Host=host"
    - "Patient age=patient_age"
    - "Gender=sex"
    - "Clade=GISAID_clade"
    - "Pango lineage=pango_lineage"
    - "pangolin_lineage=pango_lineage"
    - "Lineage=pango_lineage"
    - "Pangolin version=pangolin_version"
    - "Variant=variant"
    - "AA Substitutions=aaSubstitutions"
    - "Submission date=date_submitted"
    - "Is reference?=is_reference"
    - "Is complete?=is_complete"
    - "Is high coverage?=is_high_coverage"
    - "Is low coverage?=is_low_coverage"
    - "N-Content=n_content"
    - "GC-Content=gc_content"
# Strain name of the reference node.
# NOTE(review): presumably must match a strain present in the analysis; confirm against the workflow.
reference_node_name: "USA/WA1/2020"
# Name of the Nextclade dataset to use.
nextclade_dataset: "sars-cov-2"
# Define files used for external configuration. Common examples include a
# list of strains to include and exclude from analyses, a reference sequence to
# align sequences to, colors for strain attributes in auspice, and auspice
# configuration files.
files:
  include: "defaults/include.txt"
  exclude: "defaults/exclude.txt"
  reference: "defaults/reference_seq.gb"
  alignment_reference: "defaults/reference_seq.fasta"
  annotation: "defaults/annotation.gff"
  outgroup: "defaults/outgroup.fasta"  # is this arg still used? file doesn't exist in gh repo
  ordering: "defaults/color_ordering.tsv"
  color_schemes: "defaults/color_schemes.tsv"
  auspice_config: "defaults/auspice_config.json"
  lat_longs: "defaults/lat_longs.tsv"
  description: "defaults/description.md"
  clades: "defaults/clades.tsv"
  clade_name_mapping: "defaults/clade_display_names.yml"
  emerging_lineages: "defaults/emerging_lineages.tsv"
  mutational_fitness_distance_map: "defaults/mutational_fitness_distance_map.json"
  sites_to_mask: "defaults/sites_ignored_for_tree_topology.txt"
# Genes that nextclade translates during alignment.
# Every name stays quoted: a bare `N` would be read as a boolean by YAML 1.1 parsers.
genes:
  - "ORF1a"
  - "ORF1b"
  - "S"
  - "ORF3a"
  - "E"
  - "M"
  - "ORF6"
  - "ORF7a"
  - "ORF7b"
  - "ORF8"
  - "N"
  - "ORF9b"
# Filter settings
filter:
  # Require nearly full-length genomes.
  min_length: 27000
  # Omit sequences with incomplete date annotations, and USA seqs without a state
  # (i.e., those whose division is recorded as 'USA').
  exclude_where: "division='USA'"
  exclude_ambiguous_dates_by: "any"
  # Exclude sequences which are from before Dec 2019 (likely date mix-ups).
  # Quoted so the value stays a string instead of being parsed as a date.
  min_date: "2019-12-01"
# When choosing contextual samples for a focal set, applying a crowding penalty
# helps reduce the number of genetically identical strains that get chosen,
# and allows for more diversity to be represented on the tree.
priorities:
  crowding_penalty: 0.1
# Alignment settings
# Alignments are partitioned into smaller groups to speed up the overall alignment process.
# The number of sequences per group determines the run time of a single alignment job.
partition_sequences:
  sequences_per_group: 150
# Mask settings determine how the multiple sequence alignment is masked prior to phylogenetic inference.
mask:
  # Number of bases to mask from the beginning and end of the alignment. These regions of the genome
  # are difficult to sequence accurately.
  mask_from_beginning: 100
  mask_from_end: 200
  # Specific sites to mask in the reference genome's coordinates.
  # These are 1-indexed coordinates of sites that have been identified as prone to sequencing errors.
  #   21987 (S:142) is a problematic ARTIC V3 primer site. Impacts Delta & some 21L (Omicron) descendants.
  #   21846 (S:T95I) was/is often artifactually reverted in Delta.
  # The value is a single space-separated string, not a YAML list.
  mask_sites: "21987 21846"
# Depending on how you define focal and contextual inputs, the same strains may
# appear in both inputs. This option tells the workflow not to throw an error
# when duplicates occur but to accept the first instance of a given strain's
# sequences (e.g., the focal set).
combine_sequences_for_subsampling:
  warn_about_duplicates: true
# Tree-building settings
tree:
  # Extra arguments for the tree builder. The inner single quotes are part of
  # the value itself.
  # NOTE(review): presumably forwarded as one shell-quoted argument; confirm against the tree rule.
  tree-builder-args: "'-ninit 10 -n 4'"
# TreeTime settings
refine:
  # Strain used to root the tree.
  root: "Wuhan/Hu-1/2019"
  # Clock rate and its standard deviation.
  # NOTE(review): units are not stated in this file (presumably substitutions per site per year); confirm.
  clock_rate: 0.0008
  clock_std_dev: 0.0004
  coalescent: "opt"
  date_inference: "marginal"
  divergence_unit: "mutations"
  clock_filter_iqd: 8
# Ancestral sequence reconstruction settings
ancestral:
  inference: "joint"
# Color settings
colors:
  default:
    # Amount of time back to color clades; if "all", then all clades are colored.
    # Can be specified per build in builds.yaml.
    clade_recency: "all"
# Frequencies settings
frequencies:
  # Default settings that can be overridden for specific builds.
  default:
    # min_date is set by default to 1 year before present.
    min_date: "1Y"
    # max_date is set by default to present date - recent_days_to_censor.

    # KDE bandwidths in proportion of a year to use per strain.
    # Using a 1-month bandwidth by default (1/12 of a year ≈ 0.0833).
    narrow_bandwidth: 0.0833

  # Settings that can be overridden across all builds, but not for specific builds.
  recent_days_to_censor: 0
  # Number of weeks between pivots
  pivot_interval: 1
  # Measure pivots in weeks
  pivot_interval_units: "weeks"
  # Weight of KDE that uses wide bandwidth
  proportion_wide: 0.0

  # Diffusion frequency settings
  minimal_frequency: 0.01
  stiffness: 20
  inertia: 0.2
# Logistic growth settings
logistic_growth:
  # Calculate logistic growth over the last N pivots, which corresponds to N
  # times the amount of time represented by the `pivot_interval_units` in the
  # frequencies configuration.
  delta_pivots: 6
  # Set the minimum number of tips a clade must have before its logistic growth
  # is calculated.
  min_tips: 50
  # Set the minimum and maximum current frequency a clade must have for its
  # logistic growth to be calculated.
  min_frequency: 0.000001
  max_frequency: 0.95
# Cluster-specific settings
cluster:
  # Require a minimum number of tips to be annotated as a cluster.
  min_tips: 3
  # Find clusters with the same values in the given column (e.g., division, country, etc.).
  group_by: division
#
# Region-specific settings
#
traits:
  # Default trait-inference settings.
  # NOTE(review): presumably overridable per build, as with other `default` entries in this file; confirm.
  default:
    sampling_bias_correction: 2.5
    # Metadata columns to infer traits for.
    columns: ["country"]
# Default subsampling schemes designed for different geographic scales. With the
# exception of the "global" region, most subsampling schemes define a large
# focal set of sequences related to a specific geographic entity (e.g., Africa,
# Switzerland, etc.) and a smaller set of contextual sequences from the rest of
# the world.
#
# Users can control the specificity of subsampling by defining as many
# subsampling sets as they require. For example, one may wish to include all
# sequences from California, many contextual sequences from the USA, and a few
# contextual sequences from the rest of the world.
#
# Each build defines its subsampling scheme by specifying a `subsampling_scheme`
# attribute whose value is the name of an entry in the subsampling configuration
# below or defined elsewhere in another configuration file.
#
# Structure: subsampling -> <scheme name> -> <sample name> -> settings.
# Placeholders such as {region}, {country}, {division}, and {location} appear in
# the `exclude` strings.
# NOTE(review): presumably filled in from each build's attributes; confirm against the workflow.
subsampling:
  # Default subsampling logic to select all strains from all inputs (i.e., no subsampling).
  all:
    all:
      no_subsampling: true

  # Default subsampling logic for regions
  region:
    # Focal samples for region
    region:
      group_by: "division year month"
      seq_per_group: 48
      exclude: "--exclude-where 'region!={region}'"
    # Contextual samples for region from the rest of the world
    global:
      group_by: "country year month"
      seq_per_group: 4
      exclude: "--exclude-where 'region={region}'"
      priorities:
        type: "proximity"
        focus: "region"

  # Custom subsampling logic for global region.
  region_global:
    global:
      group_by: "country year month"
      seq_per_group: 20

  # Custom subsampling logic for regions like Europe where grouping by country
  # is the smallest resolution required.
  region_grouped_by_country:
    # Focal samples for region
    region:
      group_by: "country year month"
      seq_per_group: 64
      exclude: "--exclude-where 'region!={region}'"
    # Contextual samples for region from the rest of the world
    global:
      group_by: "country year month"
      seq_per_group: 6
      exclude: "--exclude-where 'region={region}'"
      priorities:
        type: "proximity"
        focus: "region"

  # Default subsampling logic for countries
  country:
    # Focal samples for country
    country:
      group_by: "division year month"
      seq_per_group: 200
      exclude: "--exclude-where 'country!={country}'"
    # Contextual samples from country's region
    region:
      group_by: "country year month"
      seq_per_group: 20
      exclude: "--exclude-where 'country={country}' 'region!={region}'"
      priorities:
        type: "proximity"
        focus: "country"
    # Contextual samples from the rest of the world,
    # excluding the current region to avoid resampling.
    global:
      group_by: "country year month"
      seq_per_group: 10
      exclude: "--exclude-where 'region={region}'"
      priorities:
        type: "proximity"
        focus: "country"

  # Default subsampling logic for divisions
  division:
    # Focal samples for division
    division:
      group_by: "year month"
      seq_per_group: 300
      exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}'"
    # Contextual samples from division's country
    country:
      group_by: "division year month"
      seq_per_group: 20
      exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division={division}'"
      priorities:
        type: "proximity"
        focus: "division"
    # Contextual samples from division's region
    region:
      group_by: "country year month"
      seq_per_group: 10
      exclude: "--exclude-where 'region!={region}' 'country={country}'"
      priorities:
        type: "proximity"
        focus: "division"
    # Contextual samples from the rest of the world, excluding the current
    # region to avoid resampling.
    global:
      group_by: "country year month"
      seq_per_group: 5
      exclude: "--exclude-where 'region={region}'"
      priorities:
        type: "proximity"
        focus: "division"

  # Default subsampling logic for locations
  location:
    # Focal samples for location
    focal:
      group_by: "year month"
      seq_per_group: 300
      exclude: "--exclude-where 'region!={region}' 'country!={country}' 'division!={division}' 'location!={location}'"
    # Samples that are genetically related to the focal samples
    related:
      group_by: "country year month"
      seq_per_group: 20
      exclude: "--exclude-where 'location={location}'"
      priorities:
        type: "proximity"
        focus: "focal"
    # Contextual samples from the rest of the world
    contextual:
      group_by: "country year month"
      seq_per_group: 10
      exclude: "--exclude-where 'location={location}'"
# Prefix used when naming Auspice JSON outputs.
auspice_json_prefix: "ncov"