-
Notifications
You must be signed in to change notification settings - Fork 8
/
config.yaml
84 lines (74 loc) · 2.86 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#-- email notifications of pipeline success/failure (use "Skip" to deactivate) --#
email: None
#-- I/O --#
# file listing samples and associated data
samples_file: data/GTDBr95_n10/GTDBr95_n10.tsv
## column names in samples table
samples_col: 'ncbi_organism_name'
accession_col: 'accession'
fasta_file_path_col: 'fasta_file_path'
taxID_col: 'gtdb_taxid' # or 'ncbi_species_taxid'
taxonomy_col: 'gtdb_taxonomy' # or 'ncbi_taxonomy'
# output location
output_dir: tests/output/GTDBr95_n10/
# temporary file directory (your username will be added automatically)
tmp_dir: /ebio/abt3_scratch/
#-- databases to create --#
# Replace "Create" with "Skip" to skip creation of any of these
# Note that braken relies on the kraken2 database
databases:
kraken2: Create
bracken: Create
genes: Create
humann3_bowtie2: Create
humann3_diamond: Create
# Name of UniRef clustering (uniref90 or uniref50)
## "uniref90" highly recommended
uniref_name: uniref90
# Name of the humann3 diamond database to be created
## This must match the naming allowed by humann3 (eg., "uniref90_201901.dmnd")
dmnd_name: uniref90_201901.dmnd
# Index mapping UniRef90 clusters to UniRef50 (saves time vs re-annotating)
## This is skipped if annotating with UniRef50 instead of UniRef90
cluster_idx: data/uniref50-90.pkl
#-- if custom NCBI/GTDB taxdump files, "Skip" if standard NCBI taxdump --#
# Used for kraken taxonomy & metaphlan
names_dmp: data/taxdump/names.dmp
nodes_dmp: data/taxdump/nodes.dmp
#-- keep intermediate files required for re-creating DBs (eg., w/ more genomes) --#
# If "True", the intermediate files are saved to `output_dir`
# Else, the intermediate files are temporarily stored in `temp_folder`
keep_intermediate: True
#-- software parameters --#
# `vsearch_per_genome` = per-genome gene clustering
# use "Skip" at the start of any param to skip (if possible to skip)
# for humann3, use either mmseqs or diamond (mmseqs gets priority if neither skipped)
# for humann3::mmseqs_search::run, --num-iterations must be >=2
params:
ionice: -c 3
bracken:
build_kmer: 35
build_read_lens:
- 100
- 150
genes:
prodigal: ""
vsearch_per_genome: --id 0.97 --strand both --qmask none --fasta_width 0
mmseqs_cluster: --min-seq-id 0.9 -c 0.8
mmseqs_cluster_method: linclust # or "cluster", which is slower
humann3:
batches: 2
mmseqs_search:
db: data/UniRef90/uniref90
index: -s 6
run: -e 1e-3 --max-accept 1 --max-seqs 100 --num-iterations 2 --start-sens 1 --sens-steps 3 -s 6
diamond:
db: Skip #data/uniref90_ec-filtered/uniref90_ec_filt_201901.dmnd
run: --evalue 1e-3 --query-cover 80 --id 90 --max-target-seqs 1 --block-size 4 --index-chunks 2
propagate_annotations: --min-cov 80 --min-pident 90
#-- snakemake pipeline --#
pipeline:
snakemake_folder: ./
script_folder: ./bin/scripts/
name: Struo2_db-create
config: create