From 6ebe5a7bca40a8ee2f069153ea2a636189344dbc Mon Sep 17 00:00:00 2001
From: "Dr. K.D. Murray"
Date: Mon, 22 Jan 2024 09:14:59 +0100
Subject: [PATCH] Various config file documentation updates

---
 acanthophis/template/config.yml | 25 +++++++++++---
 documentation.md                | 59 ++++++++++++++++++++++++++++-----
 2 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/acanthophis/template/config.yml b/acanthophis/template/config.yml
index 884dfce..b859660 100644
--- a/acanthophis/template/config.yml
+++ b/acanthophis/template/config.yml
@@ -16,18 +16,33 @@ data_paths:
   lambda:
     fasta: "rawdata/reference/genome.fa"
 
-  # Taxon profiling databases
+  # Taxon profiling databases. These can either be downloaded as a
+  # pre-compiled database, or built from a source database by the
+  # corresponding tool. Custom databases you will of course need to create
+  # yourself; see each tool's documentation on how to do that. Also note that
+  # paths in these sections need not live within the Acanthophis directory, so
+  # if you have e.g. a directory of databases shared between many
+  # users/projects, you can provide an absolute path to it here.
   kraken:
+    # Kraken databases can be downloaded from
+    # https://benlangmead.github.io/aws-indexes/k2 or built from local
+    # databases using `kraken2-build`.
     Viral:
       dir: "rawdata/kraken/Viral"
       # If the database contains Bracken dbs, please uncomment the below and
       # specify the bracken db sequence/kmer length to use
       #bracken: 150
   kaiju:
+    # Kaiju databases can be downloaded from
+    # https://bioinformatics-centre.github.io/kaiju/downloads.html or built
+    # from local databases using `kaiju-makedb`.
     Viral:
       nodes: "rawdata/kaiju/Viral/nodes.dmp"
       fmi: "rawdata/kaiju/Viral/kaiju_db_viruses.fmi"
   centrifuge:
+    # Centrifuge databases can be downloaded from
+    # https://ccb.jhu.edu/software/centrifuge/ or `centrifuge-download`, and
+    # built from a local database using `centrifuge-build`.
     lambda: "rawdata/centrifuge/lambda/lambda.1.cf"
   # This directory should contain nodes.dmp and names.dmp from NCBI's taxdump.tar.gz
   ncbi_taxonomy: "rawdata/ncbitax/"
@@ -45,7 +60,6 @@ data_paths:
 # setfile_glob above).
 
 samplesets:
-
   # `all_samples` is a inbuilt sample set, corresponding to all samples in the
   # metadata file. If you only have one logical set of samples, you can use
   # this as the sampleset name. If you have multiple sample sets, please
@@ -161,9 +175,6 @@ samplesets:
   # had better luck with bcftools csq, which will be supported here soon.
   # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
   snpeff: false
-
-    # Not for Deepvariant
-    max_depth_per_sample: 1000
 
 tool_settings:
@@ -202,7 +213,11 @@ tool_settings:
     sketch_size: 100000
 
   varcall:
+    # Inference model for DeepVariant: WGS for whole genome shotgun short
+    # reads, or any of [WGS,WES,PACBIO,ONT_R104,HYBRID_PACBIO_ILLUMINA].
+    # See the deepvariant documentation for more information.
     deepvariant_model: "WGS"
+
     # Per-aligner minimum MAPQ thresholds for using a read.
     minmapq:
       # bwa scores approximately follow a PHRED scale (-10*log10(p))
diff --git a/documentation.md b/documentation.md
index 527724b..db486de 100644
--- a/documentation.md
+++ b/documentation.md
@@ -26,6 +26,9 @@ acanthophis-init /path/to/someproject/
 # issue on github.
 vim config.yml
 
+# You will of course need to copy/link in your input files and metadata;
+# please see the documentation below.
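+# For example, something like the following (paths purely illustrative):
+#   ln -s /data/seq/myproject/fastqs rawdata/reads
+#   cp /data/seq/myproject/sample-metadata.tsv rawdata/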
+
 # Run snakemake
 snakemake -j 16 -p --use-conda --conda-frontend mamba --ri
@@ -111,15 +114,24 @@ Now, we have a complete workflow (minus software and input data of course), and
 
 # Acanthophis setup
 
-Acanthophis is distributed as a python package. I recommend installing it using pip:
+Acanthophis is distributed as a python package, and can be installed with either conda or pip. Note that Acanthophis assumes one is using Snakemake version 8, which requires at least Python 3.11. For this reason, I recommend using conda/mamba to install it.
+
+```
+mamba create -n someproject python snakemake=8 pip natsort
+mamba activate someproject
+python3 -m pip install acanthophis
+```
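+
+To check that the environment works, one can ask Snakemake for its version
+(the output shown in the comment is illustrative):
+
+```
+snakemake --version  # prints e.g. 8.4.2
+```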
+
+Alternatively, with pip:
 
 ```bash
-python3 -m pip install acanthophis 'snakemake[all]' natsort
+python3 -m pip install acanthophis 'snakemake==8.*' natsort
 ```
+
 This should have also installed Snakemake along with the many dependencies of Snakemake.
 
-We should now have an `acanthophis-init` command:
+We now have an `acanthophis-init` command:
 
 ```
 acanthophis-init --help
 ```
@@ -206,18 +218,33 @@ data_paths:
   lambda:
     fasta: "rawdata/reference/genome.fa"
 
-  # Taxon profiling databases
+  # Taxon profiling databases. These can either be downloaded as a
+  # pre-compiled database, or built from a source database by the
+  # corresponding tool. Custom databases you will of course need to create
+  # yourself; see each tool's documentation on how to do that. Also note that
+  # paths in these sections need not live within the Acanthophis directory, so
+  # if you have e.g. a directory of databases shared between many
+  # users/projects, you can provide an absolute path to it here.
   kraken:
+    # Kraken databases can be downloaded from
+    # https://benlangmead.github.io/aws-indexes/k2 or built from local
+    # databases using `kraken2-build`.
     Viral:
       dir: "rawdata/kraken/Viral"
       # If the database contains Bracken dbs, please uncomment the below and
       # specify the bracken db sequence/kmer length to use
       #bracken: 150
   kaiju:
+    # Kaiju databases can be downloaded from
+    # https://bioinformatics-centre.github.io/kaiju/downloads.html or built
+    # from local databases using `kaiju-makedb`.
     Viral:
       nodes: "rawdata/kaiju/Viral/nodes.dmp"
       fmi: "rawdata/kaiju/Viral/kaiju_db_viruses.fmi"
   centrifuge:
+    # Centrifuge databases can be downloaded from
+    # https://ccb.jhu.edu/software/centrifuge/ or `centrifuge-download`, and
+    # built from a local database using `centrifuge-build`.
     lambda: "rawdata/centrifuge/lambda/lambda.1.cf"
   # This directory should contain nodes.dmp and names.dmp from NCBI's taxdump.tar.gz
   ncbi_taxonomy: "rawdata/ncbitax/"
@@ -231,13 +258,12 @@ data_paths:
 #######################################################################
 #
 # This section is where we tell snakemake which files to generate for each set
-# of samples. Samplesets are configured as files of sample names (see 
+# of samples. Samplesets are configured as files of sample names (see
 # setfile_glob above).
 
 samplesets:
-
   # `all_samples` is a inbuilt sample set, corresponding to all samples in the
-  # runlib2samp_file from above. If you only have one logical set of samples, you can use
+  # metadata file. If you only have one logical set of samples, you can use
   # this as the sampleset name. If you have multiple sample sets, please
   # duplicate this entire section for each sample set, and modify the settings
   # accordingly.
@@ -304,6 +330,8 @@ samplesets:
     callers:
      - mpileup
       - freebayes
+      - deepvariant
+
     # Which short read aligners to use for variant calling? (can be
     # more/less/different to the align section above)
     aligners:
@@ -314,7 +342,9 @@
     refs:
       - lambda
 
-    # Which set of filter expressions to use? (see tool_settings section below)
+    # Which set of filter expressions to use? (see tool_settings section
+    # below). CRITICAL NOTE: deepvariant calls will not be subject to these
+    # filters.
     filters:
       - default
 
@@ -322,24 +352,30 @@
     # this conservatively high, as the behaviour is different between variant
     # callers: Freebayes skips sites with more than this many reads per
     # sample on average, whereas mpileup subsamples to this many reads.
+    # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
     max_depth_per_sample: 400
 
     # Only genotype the N best alleles. A considerable performance tunable,
     # especially for freebayes.
+    # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
     best_n_alleles: 4
 
     # At least 4 reads in at least one sample to call it a variant
+    # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
     min_alt_count: 4
 
     # Organism's ploidy?
+    # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
     ploidy: 2
 
     # Prior on the proportion of variable sites (Θ)
+    # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
     theta_prior: 0.01
 
     # Use SNPEff to annotate variant effects? Requires a rather specific set
     # of precomputed references, refer to the SNPeff docs. Personally I've
     # had better luck with bcftools csq, which will be supported here soon.
+    # CRITICAL NOTE: deepvariant calls will not be subject to these filters.
     snpeff: false
 
@@ -379,6 +415,11 @@ tool_settings:
     sketch_size: 100000
 
   varcall:
+    # Inference model for DeepVariant: WGS for whole genome shotgun short
+    # reads, or any of [WGS,WES,PACBIO,ONT_R104,HYBRID_PACBIO_ILLUMINA].
+    # See the deepvariant documentation for more information.
+    deepvariant_model: "WGS"
+
     # Per-aligner minimum MAPQ thresholds for using a read.
     minmapq:
       # bwa scores approximately follow a PHRED scale (-10*log10(p))
@@ -495,6 +536,8 @@ snakemake \
     --conda-frontend mamba
 ```
 
+
+
 One can adjust the number of local cores or cluster jobs used with the `-j` flag.
 
 ## Resource usage
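+
+Per-rule resource requests (memory, runtime, and so on) can also be
+overridden on the command line via Snakemake's `--set-resources` flag, for
+example (a sketch only; the rule names and values here are illustrative):
+
+```
+snakemake -j 16 --use-conda \
+    --set-resources bwamem_align:mem_mb=16000 varcall_mpileup:runtime=720
+```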