From fcd86bd13fbd5087a269de56877592a47a3a789f Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Tue, 26 Apr 2022 21:39:02 -0400 Subject: [PATCH 01/24] Create Dockerfile --- EToKi/1.2/Dockerfile | 109 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 EToKi/1.2/Dockerfile diff --git a/EToKi/1.2/Dockerfile b/EToKi/1.2/Dockerfile new file mode 100644 index 000000000..b61e165e6 --- /dev/null +++ b/EToKi/1.2/Dockerfile @@ -0,0 +1,109 @@ +# FROM defines the base docker image. This command has to come first in the file +# The 'as' keyword lets you name the folowing stage. We use `app` for the production image +#FROM ubuntu:focal as app +# Copying the Freyja container a bit +FROM mambaorg/micromamba:0.22.0 as app + +# ARG sets environment variables during the build stage +ARG SOFTWARENAME_VER="1.2" + +# build and run as root users since micromamba image has 'mambauser' set as the $USER +USER root +# set workdir to default for building; set to /data at the end +WORKDIR / + +# LABEL instructions tag the image with metadata that might be important to the user +# Optional, but highly recommended +LABEL base.image="ubuntu:focal" +LABEL dockerfile.version="1" +LABEL software="EToKi" +LABEL software.version=$SOFTWARENAME_VER +LABEL description="All methods related to Enterobase data analysis pipelines" +LABEL website="https://github.com/zheminzhou/EToKi" +LABEL license="https://github.com/zheminzhou/EToKi/blob/master/LICENSE" +LABEL maintainer="Lee Katz" +LABEL maintainer.email="gzu2@cdc.gov" + +# https://askubuntu.com/a/1013396 +# avoid asking about timezone during apt-get +ARG DEBIAN_FRONTEND=noninteractive + +# RUN executes code during the build +# Install dependencies via apt-get or yum if using a centos or fedora base +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3-pip \ + python3-dev \ + libgconf-2-4 \ + curl \ + unzip \ + libcurl4-openssl-dev \ + build-essential \ + git \ + pigz \ + 
libcurl4-openssl-dev \ + libcurl4 \ + ant \ + libssl-dev \ + python3-venv \ + wget && \ + apt-get autoclean + + #openjdk-8-jdk \ + +# Gimme python3 instead of python2 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 + +# Double check the python version +RUN python --version + +# set the environment, put new conda env in PATH by default +ENV PATH="/opt/conda/envs/etoki/bin:/opt/conda/envs/env/bin:${PATH}" \ + LC_ALL=C.UTF-8 + + +RUN micromamba create -n etoki python=3.8 -c conda-forge -c bioconda -c defaults ete3 numba numpy pandas scikit-learn psutil click scipy && \ + micromamba clean -a -y +# might also have to install sklearn?? + +# Now get us into that yummy yummy EToKi env +ENV ENV_NAME="etoki" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +## EToKi itself ## + +# ENV instructions set environment variables that persist from the build into the resulting image +# Use for e.g. $PATH and locale settings for compatibility with Singularity +ENV PATH="/usr/local/bin/EToKi:$PATH" \ + LC_ALL=C + +RUN cd /usr/local/bin && git clone https://github.com/zheminzhou/EToKi.git -b ${SOFTWARENAME_VER} + +# Installs all 3rd party software except the kraken database and usearch +RUN cd /usr/local/bin/EToKi && python EToKi.py configure --install + +RUN find /usr/local/bin/EToKi -name '*.py' -exec sed -i.bak -e '1 i #!/usr/bin/env python\n# ^^^ inserted corrected shebang for this container' {} \; + +# WORKDIR sets working directory +WORKDIR /data + +# A second FROM insruction creates a new stage +# We use `test` for the test image +#FROM app as test + +# Demonstrate that the program is successfully installed + +# Option 1: run the program's internal tests, for example with SPAdes: +RUN cd /usr/local/bin/EToKi && EToKi.py --help +#RUN cd /usr/local/bin/EToKi && $(which python) $(which EToKi.py) --help +RUN cd /usr/local/bin/EToKi && bash example.bash + +# Option 2: write your own tests in a bash script in the same directory as your Dockerfile: +#COPY my_tests.sh . 
+#RUN bash my_tests.sh + +# Option 3: write python unit tests in a tests/ directory in the same directory as your Dockerfile: +#RUN apt-get install -y python3 +#RUN mkdir tests/ +#COPY tests/ tests/ +#RUN python3 -m unittest discover -s tests +# From fdfd2dcbecc3d04ed37df5e21bdcbd37b5f8962d Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Tue, 26 Apr 2022 21:42:07 -0400 Subject: [PATCH 02/24] Rename EToKi/1.2/Dockerfile to etoki/1.2/Dockerfile --- {EToKi => etoki}/1.2/Dockerfile | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {EToKi => etoki}/1.2/Dockerfile (100%) diff --git a/EToKi/1.2/Dockerfile b/etoki/1.2/Dockerfile similarity index 100% rename from EToKi/1.2/Dockerfile rename to etoki/1.2/Dockerfile From b928781964b43fb058819d9403e87a645ded595a Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 27 Apr 2022 12:58:01 -0400 Subject: [PATCH 03/24] README.md, EToKi --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 42d0ad721..2fb2155bc 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ To learn more about the docker pull rate limits and the open source software pro | [datasets-sars-cov-2](https://github.com/CDCgov/datasets-sars-cov-2)
[![docker pulls](https://badgen.net/docker/pulls/staphb/datasets-sars-cov-2)](https://hub.docker.com/r/staphb/datasets-sars-cov-2) |
  • 0.6.2
  • 0.6.3
| https://github.com/CDCgov/datasets-sars-cov-2 | | [DSK](https://hub.docker.com/r/staphb/dsk)
[![docker pulls](https://badgen.net/docker/pulls/staphb/dsk)](https://hub.docker.com/r/staphb/dsk) |
  • 0.0.100
| https://gatb.inria.fr/software/dsk/ | | [emm-typing-tool](https://hub.docker.com/r/staphb/emm-typing-tool)
[![docker pulls](https://badgen.net/docker/pulls/staphb/emm-typing-tool)](https://hub.docker.com/r/staphb/emm-typing-tool) |
  • 0.0.1 (no version)
| https://github.com/phe-bioinformatics/emm-typing-tool | +| [EToKi](https://hub.docker.com/r/staphb/etoki)
[![docker pulls](https://badgen.net/docker/pulls/staphb/etoki)](https://hub.docker.com/r/staphb/etoki) |
  • 1.2
| https://github.com/zheminzhou/EToKi | | [FastANI](https://hub.docker.com/r/staphb/fastani)
[![docker pulls](https://badgen.net/docker/pulls/staphb/fastani)](https://hub.docker.com/r/staphb/fastani) |
  • 1.1
  • 1.32
  • 1.33
| https://github.com/ParBLiSS/FastANI | | [FastTree](https://hub.docker.com/r/staphb/fasttree)
[![docker pulls](https://badgen.net/docker/pulls/staphb/fasttree)](https://hub.docker.com/r/staphb/fasttree) |
  • 2.1.11
| http://www.microbesonline.org/fasttree/ | | [FastQC](https://hub.docker.com/r/staphb/fastqc)
[![docker pulls](https://badgen.net/docker/pulls/staphb/fastqc)](https://hub.docker.com/r/staphb/fastqc) |
  • 0.11.8
  • 0.11.9
| https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
https://github.com/s-andrews/FastQC | From 8dc4adf0bd42a515477e5c0b46d43adbc235f946 Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 27 Apr 2022 12:59:41 -0400 Subject: [PATCH 04/24] EToKi program license --- Program_Licenses.md | 1 + 1 file changed, 1 insertion(+) diff --git a/Program_Licenses.md b/Program_Licenses.md index 5e3f119dc..62137efec 100644 --- a/Program_Licenses.md +++ b/Program_Licenses.md @@ -25,6 +25,7 @@ The licenses of the open-source software that is contained in these Docker image | colorid | MIT | https://github.com/hcdenbakker/colorid/blob/master/LICENSE | | DSK | GNU Affero GPLv3 | https://github.com/GATB/dsk/blob/master/LICENSE | | emm-typing-tool | GNU GPLv3 | https://github.com/phe-bioinformatics/emm-typing-tool/blob/master/LICENCE | +| EToKi | GNU GPLv3 | https://github.com/zheminzhou/EToKi/blob/master/LICENSE | | FastANI | Apache v2.0 | https://github.com/ParBLiSS/FastANI/blob/master/LICENSE | | FastTree | GNU GPLv2 | http://www.microbesonline.org/fasttree/ | | FastQC | GNU GPLv3 | https://github.com/s-andrews/FastQC/blob/master/LICENSE.txt | From 76dad2e037b1b687b3205487439b78ac27fac42c Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 27 Apr 2022 13:03:15 -0400 Subject: [PATCH 05/24] Create README.md --- etoki/1.2/README.md | 518 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 518 insertions(+) create mode 100644 etoki/1.2/README.md diff --git a/etoki/1.2/README.md b/etoki/1.2/README.md new file mode 100644 index 000000000..4deb8327a --- /dev/null +++ b/etoki/1.2/README.md @@ -0,0 +1,518 @@ +# Quick Start (with examples) + +_note_ this text was lifted from the original repo README + +### Trim genomic reads +~~~~~~~~~~~ +python EToKi.py prepare --pe examples/S_R1.fastq.gz,examples/S_R2.fastq.gz -p examples/prep_out +~~~~~~~~~~~ +### Merge and trim metagenomic reads +~~~~~~~~~~~ +python EToKi.py prepare --pe examples/S_R1.fastq.gz,examples/S_R2.fastq.gz -p examples/meta_out --noRename --merge +~~~~~~~~~~~ +### Assemble 
genomic reads using SPAdes +~~~~~~~~~~~ +python EToKi.py assemble --pe examples/prep_out_L1_R1.fastq.gz,examples/prep_out_L1_R2.fastq.gz --se examples/prep_out_L1_SE.fastq.gz -p examples/asm_out +~~~~~~~~~~~ +### Assemble genomic reads using MEGAHIT +~~~~~~~~~~~ +python EToKi.py assemble --se examples/meta_out_L1_MP.fastq.gz \ +--pe examples/meta_out_L1_R1.fastq.gz,examples/meta_out_L1_R2.fastq.gz --se examples/meta_out_L1_SE.fastq.gz \ +-p examples/asm_out2 --assembler megahit +~~~~~~~~~~~ +### Map reads onto reference, with pre-filtering with ingroups and outgroups +~~~~~~~~~~~ +python EToKi.py assemble --se examples/meta_out_L1_MP.fastq.gz --metagenome \ +--pe examples/meta_out_L1_R1.fastq.gz,examples/meta_out_L1_R2.fastq.gz --se examples/meta_out_L1_SE.fastq.gz \ +-p examples/map_out -r examples/GCF_000010485.1_ASM1048v1_genomic.fna.gz \ +-i examples/GCF_000214765.2_ASM21476v3_genomic.fna.gz -o examples/GCF_000005845.2_ASM584v2_genomic.fna.gz +~~~~~~~~~~~ +### Prepare reference alleles and a local database for 7 Gene MLST scheme +~~~~~~~~~~~ +python EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab +~~~~~~~~~~~ +### Calculate 7 Gene MLST genotype for a queried genome +~~~~~~~~~~~ +gzip -cd examples/GCF_001566635.1_ASM156663v1_genomic.fna.gz > examples/GCF_001566635.1_ASM156663v1_genomic.fna && \ +python EToKi.py MLSType -i examples/GCF_001566635.1_ASM156663v1_genomic.fna -r examples/Escherichia.Achtman.references.fasta -k G749 -o stdout -d examples/Escherichia.Achtman.convert.tab +~~~~~~~~~~~ +### Run EBEis (EnteroBase Escherichia in silico serotyping) +~~~~~~~~~~~ +python EToKi.py EBEis -t Escherichia -q examples/GCF_000010485.1_ASM1048v1_genomic.fna -p SE15 +~~~~~~~~~~~ +### Cluster sequences into similarity-based groups +~~~~~~~~~~~ +python EToKi.py clust -p examples/Escherichia.Achtman.alleles_clust -i examples/Escherichia.Achtman.alleles.fasta -d 0.95 
-c 0.95 +~~~~~~~~~~~ +### Do a joint BLASTn-like search using BLASTn, uSearch (uBLASTp), Minimap and mmseqs +~~~~~~~~~~~ +python EToKi.py uberBlast -q examples/Escherichia.Achtman.alleles.fasta -r examples/GCF_001566635.1_ASM156663v1_genomic.fna -o examples/G749_7Gene.bsn --blastn --ublast --minimap --mmseq -s 2 -f +~~~~~~~~~~~ +### align multiple genomes onto one reference +~~~~~~~~~~~ +python EToKi.py align -r GCF_000010485:examples/GCF_000010485.1_ASM1048v1_genomic.fna.gz -p examples/phylo_out \ +GCF_000005845:examples/GCF_000005845.2_ASM584v2_genomic.fna.gz \ +GCF_000214765:examples/GCF_000214765.2_ASM21476v3_genomic.fna.gz \ +GCF_001566635:examples/GCF_001566635.1_ASM156663v1_genomic.fna.gz +~~~~~~~~~~~ +### Build ML tree using RAxML and place all SNPs onto branches in the tree +~~~~~~~~~~~ +cd examples && python ../EToKi.py phylo -t snp2mut -p phylo_out -s phylo_out.matrix.gz --ng && cd .. +~~~~~~~~~~~ + +# USAGE: +The first argument passed into EToKi specifies the command to be called and the rest are the parameters for that command. To see all the commands available in EToKi, use +> python EToKi.py -h + +And to see the parameters for an individual command, use: +> EToKi.py \<command\> -h + +## configure - install and/or configure 3rd party programs +See the INSTALL section or the help page below. +~~~~~~~~~~~~~~ +usage: EToKi.py configure [-h] [--install] [--usearch USEARCH] + [--download_krakenDB] + [--link_krakenDB KRAKEN_DATABASE] [--path PATH] + +Install or modify the 3rd party programs. + +optional arguments: + -h, --help show this help message and exit + --install install 3rd party programs + --usearch USEARCH usearch is required for ortho and MLSType. A 32-bit + version of usearch can be downloaded from + https://www.drive5.com/usearch/ + --download_krakenDB When specified, miniKraken2 (8GB) will be downloaded + into the EToKi folder. You can also use + --link_krakenDB to use a pre-installed kraken2 + database. 
+ --link_krakenDB KRAKEN_DATABASE + Kraken is optional in the assemble module. You can + specify your own database here + --path PATH, -p PATH Specify path to the 3rd party programs manually. + Format: <program>=<path>. This parameter can be + specified multiple times +~~~~~~~~~~~~~~~~~ + +## prepare - trim, collapse, downsize and rename the short reads +~~~~~~~~~~~~~ +usage: EToKi.py prepare [-h] [--pe PE] [--se SE] [-p PREFIX] [-q READ_QUAL] + [-b MAX_BASE] [-m MEMORY] [--noTrim] [--merge] + [--noRename] + +EToKi.py prepare +(1) Concatenates reads of the same library together. +(2) Merge pair-end sequences for metagenomic reads (bbmap). +(3) Trims sequences based on base-qualities (bbduk). +(4) Removes potential adapters and barcodes (bbduk). +(5) Limits total amount of reads to be used. +(6) Renames reads using sequential numbers. + +optional arguments: + -h, --help show this help message and exit + --pe PE comma delimited files of PE reads from the same library. + e.g. --pe a_R1.fq.gz,a_R2.fq.gz,b_R1.fq.gz,b_R2.fq.gz + This can be specified multiple times for different libraries. + --se SE comma delimited files of SE reads from the same library. + e.g. --se c_SE.fq.gz,d_SE.fq.gz + This can be specified multiple times for different libraries. + -p PREFIX, --prefix PREFIX + prefix for the outputs. Default: EToKi_prepare + -q READ_QUAL, --read_qual READ_QUAL + Minimum quality to be kept in bbduk. Default: 6 + -b MAX_BASE, --max_base MAX_BASE + Total amount of bases (in BPs) to be kept. + Default as -1 for no restriction. + Suggest to use ~100X coverage for de novo assembly. + -m MEMORY, --memory MEMORY + maximum amount of memory to be used in bbduk. 
Default: 30g + --noTrim Do not do quality trim using bbduk + --merge Try to merge PE reads by their overlaps using bbmap + --noRename Do not rename reads +~~~~~~~~~~~~~~~~ + +## assemble - *de novo* or reference-guided assembly for genomic or metagenomic reads +**EToKi assemble** is a joint method for both *de novo* assembly and reference-guided assembly. +* *de novo* assembly approach calls either SPAdes (default) or MEGAHIT (default for metagenomic data) on short reads that have been cleaned up using **EToKi prepare**, and uses Pilon to polish the assembled scaffolds and evaluate the reliability of consensus bases of the scaffolds. + +* Reference-guided assembly is also called "reference mapping". Short reads are aligned to a user-specified reference genome using minimap2. Nucleotide bases of the reference genome are updated using Pilon, according to the consensus base calls of the covered reads. Non-specific metagenomic reads of closely related species can sometimes also align to the reference genome and confuse consensus calling. Two arguments, **--outgroup** and **--ingroup**, are given to pre-filter these non-specific reads and obtain clean SNP calls. +~~~~~~~~~~~~~~~~~ +usage: EToKi.py assemble [-h] [--pe PE] [--se SE] [--pacbio PACBIO] [--ont ONT] [-p PREFIX] [-a ASSEMBLER] [-r REFERENCE] [-k KMERS] [-m MAPPER] [-d MAX_DIFF] [-i INGROUP] [-o OUTGROUP] [-S SNP] [-c CONT_DEPTH] + [--excluded EXCLUDED] [--metagenome] [--numPolish NUMPOLISH] [--reassemble] [--onlySNP] [--noQuality] [--onlyEval] [--kraken] + +EToKi.py assemble +(1.1) Assembles short reads into assemblies, or +(1.2) Maps them onto a reference. +And +(2) Polishes consensus using polish, +(3) Removes low level contaminations. +(4) Estimates the base quality of the consensus. +(5) Predicts taxonomy using Kraken. + +optional arguments: + -h, --help show this help message and exit + --pe PE comma delimited two files of PE reads. + --se SE one file of SE read. + --pacbio PACBIO one file of pacbio read. 
+ --ont ONT one file of nanopore read. + -p PREFIX, --prefix PREFIX + prefix for the outputs. Default: EToKi_assemble + -a ASSEMBLER, --assembler ASSEMBLER + Assembler used for de novo assembly. + Disabled if you specify a reference. + Default: spades for single colony isolates, megahit for metagenome. + Long reads will always be assembled with Flye + -r REFERENCE, --reference REFERENCE + Reference for read mapping. Specify this for reference mapping module. + -k KMERS, --kmers KMERS + relative lengths of kmers used in SPAdes. Default: 30,50,70,90 + -m MAPPER, --mapper MAPPER + aligner used for read mapping. + options are: miminap (default), bwa or bowtie2 + -d MAX_DIFF, --max_diff MAX_DIFF + Maximum proportion of variations allowed for a aligned reads. + Default: 0.1 for single isolates, 0.05 for metagenome + -i INGROUP, --ingroup INGROUP + Additional references presenting intra-population genetic diversities. + -o OUTGROUP, --outgroup OUTGROUP + Additional references presenting genetic diversities outside of the studied population. + Reads that are more similar to outgroups will be excluded from analysis. + -S SNP, --SNP SNP Exclusive set of SNPs. This will overwrite the polish process. + Required format: + + ... + -c CONT_DEPTH, --cont_depth CONT_DEPTH + Allowed range of read depth variations relative to average value. + Default: 0.2,2.5 + Contigs with read depths outside of this range will be removed from the final assembly. + --excluded EXCLUDED A name of the file that contains reads to be excluded from the analysis. + --metagenome Reads are from metagenomic samples + --numPolish NUMPOLISH + Number of Pilon polish iterations. Default: 1 + --reassemble Do local re-assembly in PILON. Suggest to use this flag with long reads. + --onlySNP Only modify substitutions during the PILON polish. + --noQuality Do not estimate base qualities. + --onlyEval Do not run assembly/mapping. Only evaluate assembly status. + --kraken Run kmer based species predicton on contigs. 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +## ortho - pan-genome (and wgMLST scheme) prediction +**EToKi ortho** has now been migrated to a [separate repository](https://github.com/zheminzhou/PEPPA) and renamed as **PEPPA**. + +## MLSTdb - Set up exemplar alleles and database for MLST schemes +**EToKi MLSTdb** converts existing allelic sequences into two files: (1) a multi-fasta file of exemplar allelic sequences and (2) a lookup table for the **EToKi MLSType** method. +* The exemplar alleles are defined as: + 1. Over 40% identity to the allelic sequences of a reference genome specified by **--refstrain** + 2. Less than 90% identity between different exemplar sequences of the same locus + 3. Identity to sequences of any different locus that is at least 10% less than the similarity to sequences of the same locus. +~~~~~~~~~~~ +usage: EToKi.py MLSTdb [-h] -i ALLELEFASTA [-r REFSET] [-d DATABASE] + [-s REFSTRAIN] [-x MAX_IDEN] [-m MIN_IDEN] [-p PARALOG] + [-c COVERAGE] [-e] + +MLSTdb. Create reference sets of alleles for nomenclature. + +optional arguments: + -h, --help show this help message and exit + -i ALLELEFASTA, --input ALLELEFASTA + [REQUIRED] A single file contains all known alleles in + a MLST scheme. + -r REFSET, --refset REFSET + [DEFAULT: No ref allele] Output - Reference alleles + used for MLSType. + -d DATABASE, --database DATABASE + [DEFAULT: No allele DB] Output - A lookup table of all + alleles. + -s REFSTRAIN, --refstrain REFSTRAIN + [DEFAULT: None] A single file contains alleles from + the reference genome. + -x MAX_IDEN, --max_iden MAX_IDEN + [DEFAULT: 0.9 ] Maximum identities between resulting + refAlleles. + -m MIN_IDEN, --min_iden MIN_IDEN + [DEFAULT: 0.4 ] Minimum identities between refstrain + and resulting refAlleles. + -p PARALOG, --paralog PARALOG + [DEFAULT: 0.1 ] Minimum differences between difference + loci. + -c COVERAGE, --coverage COVERAGE + [DEFAULT: 0.7 ] Proportion of aligned regions between + alleles. 
+ -e, --relaxEnd [DEFAULT: False ] Allow changed ends (for pubmlst). +~~~~~~~~~~~ + +## MLSType - MLST nomenclature using a local set of references +**EToKi MLSType** identifies allelic sequences in a queried genome, by comparing it with the exemplar alleles generated by **MLSTdb**. + ~~~~~~~~~~ +usage: EToKi.py MLSType [-h] -i GENOME -r REFALLELE -k UNIQUE_KEY + [-d DATABASE] [-o OUTPUT] [-q] [-f] [-m MIN_IDEN] + [-p MIN_FRAG_PROP] [-l MIN_FRAG_LEN] [-x INTERGENIC] + [--overlap_prop OVERLAP_PROP] + [--overlap_iden OVERLAP_IDEN] [--max_dist MAX_DIST] + [--diag_diff DIAG_DIFF] [--max_diff MAX_DIFF] + +MLSType. Find and designate MLST alleles from a queried assembly. + +optional arguments: + -h, --help show this help message and exit + -i GENOME, --genome GENOME + [REQUIRED] Input - filename for genomic assembly. + -r REFALLELE, --refAllele REFALLELE + [REQUIRED] Input - fasta file for reference alleles. + -k UNIQUE_KEY, --unique_key UNIQUE_KEY + [REQUIRED] An unique identifier for the assembly. + -d DATABASE, --database DATABASE + [OPTIONAL] Input - lookup table of existing alleles. + -o OUTPUT, --output OUTPUT + [DEFAULT: No output] Output - filename for the + generated alleles. Specify to STDOUT for screen + output. + -q, --query_only [DEFAULT: False] Do not submit new allele, only query. + -f, --force [DEFAULT: False] Force to accept low quality alleles. + -m MIN_IDEN, --min_iden MIN_IDEN + [DEFAULT: 0.65 ] Minimum identities between refAllele + and genome. + -p MIN_FRAG_PROP, --min_frag_prop MIN_FRAG_PROP + [DEFAULT: 0.6 ] Minimum covereage of a fragment. + -l MIN_FRAG_LEN, --min_frag_len MIN_FRAG_LEN + [DEFAULT: 50 ] Minimum length of a fragment. + -x INTERGENIC, --intergenic INTERGENIC + [DEFAULT: -1,-1 ] Call alleles in intergenic region if + the distance between two closely located loci fall + within the range defined by the two numbers. Suggest + to use 50,500. This is diabled by default with minus + numbers. 
+ --overlap_prop OVERLAP_PROP + [DEFAULT: 0.5 ] Given two hits, if <overlap_prop> of + their regions overlap, and the sequence identities of + one hits is <overlap_iden> lower than the other. The + hit with lower identities will be removed. + --overlap_iden OVERLAP_IDEN + [DEFAULT: 0.05 ] Given two hits, if <overlap_prop> of + their regions overlap, and the sequence identities of + one hits is <overlap_iden> lower than the other. The + hit with lower identities will be removed. + --max_dist MAX_DIST [DEFAULT: 300 ] Consider two closely located hits as a + synteny block if their coordinates in both queried + genomes and reference gene are seperated by no more + than <max_dist> bps. + --diag_diff DIAG_DIFF + [DEFAULT: 1.2 ] Consider two closely located hits as a + synteny block if, after merged, its covered region in + the queried genome is no more than <diag_diff> folds + of the region in the reference gene. + --max_diff MAX_DIFF [DEFAULT: 200 ] Consider two closely located hits as a + synteny block if, after merged, the lengths of its + covered regions in the queried genome and the + reference gene are differed by no more than + <max_diff> bps. + ~~~~~~~~~~ + +## align - align multiple queried genomes to a single reference +~~~~~~~~~~~ +usage: EToKi.py align [-h] -r REFERENCE [-p PREFIX] [-a] [-m] [-l] [-c CORE] + [-n N_PROC] + queries [queries ...] + +Align multiple genomes onto a single reference. + +positional arguments: + queries queried genomes. Use <tag>:<filename> format to feed + in a tag for each genome. Otherwise filenames will be + used as tags for genomes. + +optional arguments: + -h, --help show this help message and exit + -r REFERENCE, --reference REFERENCE + [REQUIRED; INPUT] reference genomes to be aligned + against. Use <tag>:<filename> format to assign a tag + to the reference. + -p PREFIX, --prefix PREFIX + [OUTPUT] prefix for all outputs. + -a, --alignment [OUTPUT] Generate core genomic alignments in FASTA + format + -m, --matrix [OUTPUT] Do not generate core SNP matrix + -l, --last Activate to use LAST as aligner. 
[DEFAULT: minimap2] + -c CORE, --core CORE [PARAM] percentage of presences for core genome. + [DEFAULT: 0.95] + -n N_PROC, --n_proc N_PROC + [PARAM] number of processes to use. [DEFAULT: 5] +~~~~~~~~~~~ + +## phylo - infer phylogeny and ancestral states from genomic alignments +~~~~~~~~~~~ +usage: EToKi.py phylo [-h] [--tasks TASKS] --prefix PREFIX + [--alignment ALIGNMENT] [--snp SNP] [--tree TREE] + [--ancestral ANCESTRAL] [--core CORE] [--n_proc N_PROC] + +EToKi phylo runs to: +(1) Generate SNP matrix from alignment (-t matrix) +(2) Calculate ML phylogeny from SNP matrix using RAxML (-t phylogeny) +(3) Workout the nucleotide sequences of internal nodes in the tree using ML estimation (-t ancestral or -t ancestral_proportion for ratio frequencies) +(4) Place mutations onto branches of the tree (-t mutation) + +optional arguments: + -h, --help show this help message and exit + --tasks TASKS, -t TASKS + Tasks to call. Allowed tasks are: + matrix: generate SNP matrix from alignment. + phylogeny: generate phylogeny from SNP matrix. + ancestral: generate AS (ancestral state) matrix from SNP matrix and phylogeny + ancestral_proportion: generate possibilities of AS for each site + mutation: assign SNPs into branches from AS matrix + + You can run multiple tasks by sending a comma delimited task list. + There are also some pre-defined task combo: + all: matrix,phylogeny,ancestral,mutation + aln2phy: matrix,phylogeny [default] + snp2anc: phylogeny,ancestral + mat2mut: ancestral,mutation + --prefix PREFIX, -p PREFIX + prefix for all outputs. + --alignment ALIGNMENT, -m ALIGNMENT + aligned sequences in either fasta format or Xmfa format. Required for "matrix" task. + --snp SNP, -s SNP SNP matrix in specified format. Required for "phylogeny" and "ancestral" if alignment is not given + --tree TREE, -z TREE phylogenetic tree. Required for "ancestral" task + --ancestral ANCESTRAL, -a ANCESTRAL + Inferred ancestral states in a specified format. 
Required for "mutation" task + --core CORE, -c CORE Core genome proportion. Default: 0.95 + --n_proc N_PROC, -n N_PROC + Number of processes. Default: 7. +~~~~~~~~~~~ + + +## EB*Eis* - *in silico* serotype prediction for *Escherichia coli* & *Shigella spp.* +**EB*Eis*** is a BLASTn based prediction tool for the O and H antigens of *Escherichia coli* and *Shigella*. It uses essential genes (*wzx, wzy, wzt & wzm* for O; *fliC* for H) as markers. **EB*Eis*** uses a database built from two sources: +1. [SeroTypeFinder](https://bitbucket.org/genomicepidemiology/serotypefinder_db/src) +2. O-antigen gene sequences reported in [DebRoy et al., PLoS ONE, 2016](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0147434#pone.0147434.ref011) +~~~~~~~~~~~ +usage: EToKi.py EBEis [-h] -q QUERY [-t TAXON] [-p PREFIX] + +EnteroBase Escherichia in silico serotyping + +optional arguments: + -h, --help show this help message and exit + -q QUERY, --query QUERY + file name for the queried assembly in multi-FASTA format. + -t TAXON, --taxon TAXON + Taxon database to compare with. + Only support Escherichia (default) for the moment. + -p PREFIX, --prefix PREFIX + prefix for intermediate files. Default: EBEis +~~~~~~~~~~~ + +## isCRISPOL - *in silico* prediction of CRISPOL array for *Salmonella enterica* serovar Typhimurium +CRISPOL is an oligo based Typhimurium sub-typing method described in ([Fabre et al., PLoS ONE, 2012](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0036995)). We use the direct repeats (DRs) and spacers in the Typhimurium CRISPR array to predict CRISPOL types from genomic assemblies. +~~~~~~~~~~~ +usage: EToKi.py isCRISPOL [-h] [N [N ...]] + +in silico Typhimurium subtyping using CRISPOL scheme (Fabre et al., PLoS ONE, 2012) + +positional arguments: + N FASTA files containing assemblies of S. enterica Typhimurium. 
+ +optional arguments: + -h, --help show this help message and exit +~~~~~~~~~~~ + +## uberBlast - Use BLASTn, uBLASTp, minimap2 and/or mmseqs to identify similar sequences +**EToKi uberBlast** is also internally called by **EToKi ortho** to align exemplar genes to queried genomes, using both BLASTn and uSearch-uBLASTp. Amino acid alignments are converted back to nucleotide sequences, meaning that genome coordinates remain consistent across different methods. + +* minimap2 --- Fastest alignment in nucleotide level. High accuracy in identities >= 90%, but lose sensitivity quickly for lower identities. +* blastn --- Fast alignment in nucleotide level. Lose sensitivity for identities < 80% +* mmseqs --- Amino acid based alignment for identities >= 70% (open source) +* uBLASTp --- Amino acid based alignment for identities < 50% (commercial software) +~~~~~~~~~~~ +usage: EToKi.py uberBlast [-h] -r REFERENCE -q QUERY [-o OUTPUT] [--blastn] + [--ublast] [--ublastSELF] [--minimap] [--minimapASM] + [--mmseq] [--min_id MIN_ID] [--min_cov MIN_COV] + [--min_ratio MIN_RATIO] [-s RE_SCORE] [-f] + [--filter_cov FILTER_COV] + [--filter_score FILTER_SCORE] [-m] + [--merge_gap MERGE_GAP] [--merge_diff MERGE_DIFF] + [-O] [--overlap_length OVERLAP_LENGTH] + [--overlap_proportion OVERLAP_PROPORTION] + [-e FIX_END] [-t N_THREAD] [-p] + +Five different alignment methods. + +optional arguments: + -h, --help show this help message and exit + -r REFERENCE, --reference REFERENCE + [INPUT; REQUIRED] filename for the reference. This is + normally a genomic assembly. + -q QUERY, --query QUERY + [INPUT; REQUIRED] filename for the query. This can be + short-reads or genes or genomic assemblies. + -o OUTPUT, --output OUTPUT + [OUTPUT; Default: None] save result to a file or to + screen (stdout). Default do nothing. + --blastn Run BLASTn. Slowest. Good for identities between [80, + 100] + --ublast Run uBLAST in tBLASTn mode. Fast. 
Good for identities + between [30-100] + --ublastSELF Run uBLAST in tBLASTn mode. Fast. Good for identities + between [30-100] + --minimap Run minimap. Fast. Good for identities between + [90-100] + --minimapASM Run minimap on assemblies. Fast. Good for identities + between [90-100] + --mmseq Run mmseq2 in tBLASTn mode. Fast. Good for identities + between [70-100] + --min_id MIN_ID [DEFAULT: 0.3] Minimum identity before reScore for an + alignment to be kept + --min_cov MIN_COV [DEFAULT: 40] Minimum length for an alignment to be + kept + --min_ratio MIN_RATIO + [DEFAULT: 0.05] Minimum length for an alignment to be + kept, proportional to the length of the query + -s RE_SCORE, --re_score RE_SCORE + [DEFAULT: 0] Re-interpret alignment scores and + identities. 0: No rescore; 1: Rescore with + nucleotides; 2: Rescore with amino acid; 3: Rescore + with codons + -f, --filter [DEFAULT: False] Remove secondary alignments if they + overlap with any other regions + --filter_cov FILTER_COV + [DEFAULT: 0.9] + --filter_score FILTER_SCORE + [DEFAULT: 0] + -m, --linear_merge [DEFAULT: False] Merge consective alignments + --merge_gap MERGE_GAP + [DEFAULT: 300] + --merge_diff MERGE_DIFF + [DEFAULT: 1.2] + -O, --return_overlap [DEFAULT: False] Report overlapped alignments + --overlap_length OVERLAP_LENGTH + [DEFAULT: 300] Minimum overlap to report + --overlap_proportion OVERLAP_PROPORTION + [DEFAULT: 0.6] Minimum overlap proportion to report + -e FIX_END, --fix_end FIX_END + [FORMAT: L,R; DEFAULT: 0,0] Extend alignment to the + edges if the un-aligned regions are <= [L,R] + basepairs. + -t N_THREAD, --n_thread N_THREAD + [DEFAULT: 8] Number of threads to use. + -p, --process [DEFAULT: False] Use processes instead of threads. +~~~~~~~~~~~ + +## clust - linear-time clustering of short sequences using mmseqs linclust +**EToKi clust** is called internally by **EToKi ortho** to cluster seed genes into gene clusters. 
Given its linear-time complexity, it can cluster millions of gene sequences in minutes. +~~~~~~~~~~~ +usage: EToKi.py clust [-h] -i INPUT -p PREFIX [-d IDENTITY] [-c COVERAGE] + [-t N_THREAD] + +Get clusters and exemplars of clusters from gene sequences using mmseqs linclust. + +optional arguments: + -h, --help show this help message and exit + -i INPUT, --input INPUT + [INPUT; REQUIRED] name of the file containing gene sequneces in FASTA format. + -p PREFIX, --prefix PREFIX + [OUTPUT; REQUIRED] prefix of the outputs. + -d IDENTITY, --identity IDENTITY + [PARAM; DEFAULT: 0.9] minimum intra-cluster identity. + -c COVERAGE, --coverage COVERAGE + [PARAM; DEFAULT: 0.9] minimum intra-cluster coverage. + -t N_THREAD, --n_thread N_THREAD + [PARAM; DEFAULT: 8] number of threads to use. +~~~~~~~~~~~ From 89dd197ace80de527f2f994af263e4d46a873105 Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 27 Apr 2022 13:05:55 -0400 Subject: [PATCH 06/24] Create test-etoki.yml --- .github/workflows/test-etoki.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/test-etoki.yml diff --git a/.github/workflows/test-etoki.yml b/.github/workflows/test-etoki.yml new file mode 100644 index 000000000..32738591a --- /dev/null +++ b/.github/workflows/test-etoki.yml @@ -0,0 +1,23 @@ +# This caller workflow builds an image to the "test" stage. +# Instructions: replace all the stubs in this template with values for your image. +# Some explanations come from: https://github.com/actions/starter-workflows/blob/main/automation/manual.yml + +name: Test etoki image + +# Controls when the action will run. Workflow runs when manually triggered using the UI or when you submit your pull request +on: + workflow_dispatch: + pull_request: + paths: + - "etoki/1.2/Dockerfile" # Dockerfile path, e.g. 
'htslib/1.14/Dockerfile' so that only your image is tested + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + + # This job calls a workflow to build the image to the 'test' stage + build-to-test: + uses: ./.github/workflows/build-to-test.yml + with: + path_to_context: "./etoki/1.2" # Path to directory with Dockerfile and context, e.g. "./spades/3.12.0" + dockerfile_name: "Dockerfile" + cache: "etoki" # Use the program name as a nickname for a GitHub cache of your image's layers, e.g. "spades". The cache will speed up re-running the workflow. From e5b18949afdd32548f143849222d0cddbc662fbc Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 27 Apr 2022 13:15:53 -0400 Subject: [PATCH 07/24] added back in FROM app as test --- etoki/1.2/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index b61e165e6..59824378b 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -88,7 +88,7 @@ WORKDIR /data # A second FROM insruction creates a new stage # We use `test` for the test image -#FROM app as test +FROM app as test # Demonstrate that the program is successfully installed From 74dd0de3996c7d81decd2199c476bb888bc8dbcd Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Thu, 28 Apr 2022 11:13:08 -0400 Subject: [PATCH 08/24] moving to multistage build --- etoki/1.2/Dockerfile | 108 ++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 32 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index 59824378b..886297c41 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -1,14 +1,12 @@ # FROM defines the base docker image. This command has to come first in the file # The 'as' keyword lets you name the folowing stage. 
We use `app` for the production image -#FROM ubuntu:focal as app -# Copying the Freyja container a bit -FROM mambaorg/micromamba:0.22.0 as app +FROM ubuntu:focal as app # ARG sets environment variables during the build stage -ARG SOFTWARENAME_VER="1.2" +ARG ETOKI_VER="1.2" +# Persistence with an env +ENV ETOKI_VER_ENV=$ETOKI_VER -# build and run as root users since micromamba image has 'mambauser' set as the $USER -USER root # set workdir to default for building; set to /data at the end WORKDIR / @@ -17,13 +15,23 @@ WORKDIR / LABEL base.image="ubuntu:focal" LABEL dockerfile.version="1" LABEL software="EToKi" -LABEL software.version=$SOFTWARENAME_VER +LABEL software.version=$ETOKI_VER LABEL description="All methods related to Enterobase data analysis pipelines" LABEL website="https://github.com/zheminzhou/EToKi" LABEL license="https://github.com/zheminzhou/EToKi/blob/master/LICENSE" LABEL maintainer="Lee Katz" LABEL maintainer.email="gzu2@cdc.gov" +# Multistage build +FROM staphb/shovill:1.1.0 AS shovill +FROM staphb/kraken2:2.1.2-no-db AS kraken2 +FROM staphb/bowtie2:2.4.4 AS bowtie2 +FROM staphb/lyveset:1.1.4f AS lyveset +FROM torognes/vsearch:2.21.1 as vsearch + +# Back to the base app so that we have things like ENV variables +FROM app + # https://askubuntu.com/a/1013396 # avoid asking about timezone during apt-get ARG DEBIAN_FRONTEND=noninteractive @@ -31,43 +39,68 @@ ARG DEBIAN_FRONTEND=noninteractive # RUN executes code during the build # Install dependencies via apt-get or yum if using a centos or fedora base RUN apt-get update && apt-get install -y --no-install-recommends \ + libncurses5-dev \ + libbz2-dev \ + liblzma-dev \ + perl \ + libcurl4-gnutls-dev \ + gcc \ + g++ \ + python-setuptools \ + zlib1g-dev \ python3-pip \ python3-dev \ libgconf-2-4 \ curl \ unzip \ - libcurl4-openssl-dev \ build-essential \ git \ pigz \ - libcurl4-openssl-dev \ libcurl4 \ ant \ libssl-dev \ python3-venv \ wget && \ - apt-get autoclean + apt-get autoclean && \ + rm -rf 
/var/lib/apt/lists/* #openjdk-8-jdk \ + #libcurl4-openssl-dev \ # Gimme python3 instead of python2 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 -# Double check the python version -RUN python --version - -# set the environment, put new conda env in PATH by default -ENV PATH="/opt/conda/envs/etoki/bin:/opt/conda/envs/env/bin:${PATH}" \ - LC_ALL=C.UTF-8 - - -RUN micromamba create -n etoki python=3.8 -c conda-forge -c bioconda -c defaults ete3 numba numpy pandas scikit-learn psutil click scipy && \ - micromamba clean -a -y -# might also have to install sklearn?? - -# Now get us into that yummy yummy EToKi env -ENV ENV_NAME="etoki" -ARG MAMBA_DOCKERFILE_ACTIVATE=1 +COPY --from=shovill /skesa/skesa /usr/local/bin/ +COPY --from=shovill /megahit /megahit +COPY --from=shovill /pilon /pilon +COPY --from=shovill /SPAdes-*-Linux /spades +COPY --from=kraken2 /kraken2-2.1.2 /kraken2 +COPY --from=kraken2 /kraken2-db /kraken2-db +COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 + +# Copied from flye container +ENV FLYE_VER="2.9" +RUN wget https://github.com/fenderglass/Flye/archive/${FLYE_VER}.tar.gz && \ + tar -xvf ${FLYE_VER}.tar.gz && \ + rm ${FLYE_VER}.tar.gz && \ + cd Flye-${FLYE_VER} && \ + python setup.py build && \ + python setup.py install + +# Copied from samtools container +ENV SAMTOOLSVER="1.15" +RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLSVER}/samtools-${SAMTOOLSVER}.tar.bz2 && \ + tar -xjf samtools-${SAMTOOLSVER}.tar.bz2 && \ + rm samtools-${SAMTOOLSVER}.tar.bz2 && \ + cd samtools-${SAMTOOLSVER} && \ + ./configure && \ + make && \ + make install + +# vsearch aims to be a drop in replacement for usearch and so let's see if that's true +# ie, set `usearch` as path to vsearch +COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch +COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch ## EToKi itself ## @@ -76,11 +109,21 @@ ARG MAMBA_DOCKERFILE_ACTIVATE=1 ENV 
PATH="/usr/local/bin/EToKi:$PATH" \ LC_ALL=C -RUN cd /usr/local/bin && git clone https://github.com/zheminzhou/EToKi.git -b ${SOFTWARENAME_VER} +WORKDIR /usr/local/bin +RUN git clone https://github.com/zheminzhou/EToKi.git -b ${ETOKI_VER_ENV} + +RUN pip3 install ete3 numba numpy==1.21 pandas scikit-learn psutil click scipy -# Installs all 3rd party software except the kraken database and usearch -RUN cd /usr/local/bin/EToKi && python EToKi.py configure --install +# Install all 3rd party software except the kraken database +WORKDIR /usr/local/bin/EToKi +# Install 3rd party tools +# Samtools has an issue for some reason and so explicitly give that path +# Also give the path to usearch since it is normally proprietary (although we have vsearch) +RUN python EToKi.py configure --install --usearch $(which usearch) +RUN python EToKi.py configure --path samtools=$(which samtools) +RUN python EToKi.py configure --path blast=$(which blastn) +# Fix the shebang line for all EToKi scripts to /usr/bin/env python RUN find /usr/local/bin/EToKi -name '*.py' -exec sed -i.bak -e '1 i #!/usr/bin/env python\n# ^^^ inserted corrected shebang for this container' {} \; # WORKDIR sets working directory @@ -88,14 +131,15 @@ WORKDIR /data # A second FROM insruction creates a new stage # We use `test` for the test image -FROM app as test +#FROM app as test # Demonstrate that the program is successfully installed # Option 1: run the program's internal tests, for example with SPAdes: -RUN cd /usr/local/bin/EToKi && EToKi.py --help -#RUN cd /usr/local/bin/EToKi && $(which python) $(which EToKi.py) --help -RUN cd /usr/local/bin/EToKi && bash example.bash +WORKDIR /usr/local/bin/EToKi +RUN usearch && which usearch +RUN EToKi.py --help +RUN bash -e example.bash # Option 2: write your own tests in a bash script in the same directory as your Dockerfile: #COPY my_tests.sh . 
From 0f5fc247d1cdb679f6936f78a93dfe792ade63f3 Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Thu, 28 Apr 2022 16:00:20 -0400 Subject: [PATCH 09/24] Update Dockerfile --- etoki/1.2/Dockerfile | 68 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index 886297c41..eecb1a788 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -27,7 +27,11 @@ FROM staphb/shovill:1.1.0 AS shovill FROM staphb/kraken2:2.1.2-no-db AS kraken2 FROM staphb/bowtie2:2.4.4 AS bowtie2 FROM staphb/lyveset:1.1.4f AS lyveset -FROM torognes/vsearch:2.21.1 as vsearch +FROM torognes/vsearch:2.21.1 AS vsearch +FROM staphb/bbtools:38.96 AS bbtools +FROM staphb/mlst:2.19.0 AS mlst +FROM staphb/orthofinder:2.17 AS orthofinder +FROM staphb/cfsan-snp-pipeline:2.0.2 AS cfsan # Back to the base app so that we have things like ENV variables FROM app @@ -74,9 +78,26 @@ COPY --from=shovill /skesa/skesa /usr/local/bin/ COPY --from=shovill /megahit /megahit COPY --from=shovill /pilon /pilon COPY --from=shovill /SPAdes-*-Linux /spades +COPY --from=shovill /megahit /megahit COPY --from=kraken2 /kraken2-2.1.2 /kraken2 COPY --from=kraken2 /kraken2-db /kraken2-db COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 +COPY --from=bbtools /opt/bbmap /opt/bbmap +COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ +COPY --from=lyveset /lyve-SET /lyve-SET +COPY --from=orthofinder /mmseqs /mmseqs +COPY --from=cfsan /gatk /gatk +# vsearch aims to be a drop in replacement for usearch and so let's see if that's true +# ie, set `usearch` as path to vsearch +#COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch +#COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch + +WORKDIR / +RUN wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-x86_64.tar.gz && \ + tar zxvf vsearch-2.21.1*.tar.gz && \ + cp -v vsearch-2.21.1-linux-x86_64/bin/vsearch /usr/local/bin/ && \ + 
ln -sv /usr/local/bin/vsearch /usr/local/bin/usearch && \ + rm -rfv ./vsearch # Copied from flye container ENV FLYE_VER="2.9" @@ -97,16 +118,17 @@ RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLSVER}/s make && \ make install -# vsearch aims to be a drop in replacement for usearch and so let's see if that's true -# ie, set `usearch` as path to vsearch -COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch -COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch +# Diamond +ENV DIAMOND_VER="v2.0.15" +RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/diamond-linux64.tar.gz && \ + tar zxvf diamond-linux64.tar.gz && \ + mv -v diamond /usr/local/bin/diamond && \ + rm diamond-linux64.tar.gz ## EToKi itself ## - # ENV instructions set environment variables that persist from the build into the resulting image # Use for e.g. $PATH and locale settings for compatibility with Singularity -ENV PATH="/usr/local/bin/EToKi:$PATH" \ +ENV PATH="/usr/local/bin:/usr/local/bin/EToKi:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ LC_ALL=C WORKDIR /usr/local/bin @@ -119,13 +141,34 @@ WORKDIR /usr/local/bin/EToKi # Install 3rd party tools # Samtools has an issue for some reason and so explicitly give that path # Also give the path to usearch since it is normally proprietary (although we have vsearch) -RUN python EToKi.py configure --install --usearch $(which usearch) -RUN python EToKi.py configure --path samtools=$(which samtools) -RUN python EToKi.py configure --path blast=$(which blastn) +RUN python EToKi.py configure --path bbduk=$(which bbduk.sh) || true +RUN python EToKi.py configure --path bbmerge=$(which bbmerge.sh) || true +RUN python EToKi.py configure --path repair=$(which repair.sh) || true +RUN python EToKi.py configure --path pilon=$(find /pilon -name 
'pilon-*.jar' | head -n 1) || true +RUN python EToKi.py configure --path flye=$(which flye) || true +RUN python EToKi.py configure --path kraken2=$(find /kraken2 -type f -name kraken2 | head -n 1) || true +RUN python EToKi.py configure --path bowtie2=$(which bowtie2) || true +RUN python EToKi.py configure --path bowtie2build=$(which bowtie2-build) || true +RUN python EToKi.py configure --path raxml=$(which raxml) || true +RUN python EToKi.py configure --path raxml_ng=$(which raxml_ng) || true +RUN python EToKi.py configure --path samtools=$(which samtools) || true +RUN python EToKi.py configure --path blastn=$(which blastn) || true +RUN python EToKi.py configure --path makeblastdb=$(which makeblastdb) || true +RUN python EToKi.py configure --path diamond=$(which diamond) || true +RUN python EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar || true +#RUN python EToKi.py configure --path lastal= +#RUN python EToKi.py configure --path lastdb= +RUN python EToKi.py configure --path mmseqs=$(which mmseqs) || true +RUN python EToKi.py configure --path megahit=$(which megahit) || true +RUN python EToKi.py configure --path spades=$(which spades.py) || true +#RUN python EToKi.py configure --path kraken_db= +RUN python EToKi.py configure --usearch /usr/local/bin/usearch +RUN python EToKi.py configure # Fix the shebang line for all EToKi scripts to /usr/bin/env python RUN find /usr/local/bin/EToKi -name '*.py' -exec sed -i.bak -e '1 i #!/usr/bin/env python\n# ^^^ inserted corrected shebang for this container' {} \; + # WORKDIR sets working directory WORKDIR /data @@ -137,9 +180,9 @@ WORKDIR /data # Option 1: run the program's internal tests, for example with SPAdes: WORKDIR /usr/local/bin/EToKi -RUN usearch && which usearch RUN EToKi.py --help -RUN bash -e example.bash +#RUN bash -e example.bash +RUN python EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d 
examples/Escherichia.Achtman.convert.tab # Option 2: write your own tests in a bash script in the same directory as your Dockerfile: #COPY my_tests.sh . @@ -150,4 +193,3 @@ RUN bash -e example.bash #RUN mkdir tests/ #COPY tests/ tests/ #RUN python3 -m unittest discover -s tests -# From 5b31182b66ce5fac11d5281b36dd2711cc3ff476 Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Thu, 28 Apr 2022 16:05:39 -0400 Subject: [PATCH 10/24] added back in test --- etoki/1.2/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index eecb1a788..2f69498a6 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -174,7 +174,7 @@ WORKDIR /data # A second FROM insruction creates a new stage # We use `test` for the test image -#FROM app as test +FROM app as test # Demonstrate that the program is successfully installed From b5184737f9f34540d9303a48104f5021f3c97847 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Mon, 2 May 2022 16:28:41 -0400 Subject: [PATCH 11/24] updated multistage etoki --- etoki/1.2/Dockerfile | 232 ++++++++++++++++++------------------------- 1 file changed, 95 insertions(+), 137 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index 2f69498a6..5ff34146f 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -1,17 +1,37 @@ -# FROM defines the base docker image. This command has to come first in the file -# The 'as' keyword lets you name the folowing stage. 
We use `app` for the production image -FROM ubuntu:focal as app +# FYI this is a multistage build, which can get very complicated -# ARG sets environment variables during the build stage ARG ETOKI_VER="1.2" -# Persistence with an env -ENV ETOKI_VER_ENV=$ETOKI_VER +ARG SKESA_VER=2.4.0 +ARG SPADES_VER=3.15.4 +ARG SHOVILL_VER=1.1.0 +ARG KRAKEN2_VER=2.1.2-no-db +ARG BOWTIE_VER=2.4.4 +ARG LYVESET_VER=1.1.4f +ARG VSEARCH_VER=2.21.1 +ARG BBTOOLS_VER=38.96 +ARG MLST_VER=2.19.0 +ARG ORTHOFINDER_VER=2.17 +ARG CFSAN_VER=2.0.2 +ARG FLYE_VER=2.9 +ARG SAMTOOLS_VER=1.15 +ARG DIAMOND_VER="v2.0.15" +ARG RAXML_VER="8.2.12" + +FROM staphb/shovill:${SHOVILL_VER} AS shovill +FROM staphb/kraken2:${KRAKEN2_VER} AS kraken2 +FROM staphb/bowtie2:${BOWTIE_VER} AS bowtie2 +FROM staphb/lyveset:${LYVESET_VER} AS lyveset +FROM torognes/vsearch:${VSEARCH_VER} AS vsearch +FROM staphb/bbtools:${BBTOOLS_VER} AS bbtools +FROM staphb/mlst:${MLST_VER} AS mlst +FROM staphb/orthofinder:${ORTHOFINDER_VER} AS orthofinder +FROM staphb/cfsan-snp-pipeline:${CFSAN_VER} AS cfsan +FROM staphb/flye:${FLYE_VER} AS flye +FROM staphb/samtools:${SAMTOOLS_VER} AS samtools +FROM staphb/raxml:${RAXML_VER} AS raxml + +FROM ubuntu:jammy as app -# set workdir to default for building; set to /data at the end -WORKDIR / - -# LABEL instructions tag the image with metadata that might be important to the user -# Optional, but highly recommended LABEL base.image="ubuntu:focal" LABEL dockerfile.version="1" LABEL software="EToKi" @@ -22,26 +42,8 @@ LABEL license="https://github.com/zheminzhou/EToKi/blob/master/LICENSE" LABEL maintainer="Lee Katz" LABEL maintainer.email="gzu2@cdc.gov" -# Multistage build -FROM staphb/shovill:1.1.0 AS shovill -FROM staphb/kraken2:2.1.2-no-db AS kraken2 -FROM staphb/bowtie2:2.4.4 AS bowtie2 -FROM staphb/lyveset:1.1.4f AS lyveset -FROM torognes/vsearch:2.21.1 AS vsearch -FROM staphb/bbtools:38.96 AS bbtools -FROM staphb/mlst:2.19.0 AS mlst -FROM staphb/orthofinder:2.17 AS orthofinder -FROM 
staphb/cfsan-snp-pipeline:2.0.2 AS cfsan - -# Back to the base app so that we have things like ENV variables -FROM app - -# https://askubuntu.com/a/1013396 -# avoid asking about timezone during apt-get ARG DEBIAN_FRONTEND=noninteractive -# RUN executes code during the build -# Install dependencies via apt-get or yum if using a centos or fedora base RUN apt-get update && apt-get install -y --no-install-recommends \ libncurses5-dev \ libbz2-dev \ @@ -52,6 +54,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ g++ \ python-setuptools \ zlib1g-dev \ + python-is-python3 \ python3-pip \ python3-dev \ libgconf-2-4 \ @@ -68,128 +71,83 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get autoclean && \ rm -rf /var/lib/apt/lists/* - #openjdk-8-jdk \ - #libcurl4-openssl-dev \ - -# Gimme python3 instead of python2 -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 - -COPY --from=shovill /skesa/skesa /usr/local/bin/ -COPY --from=shovill /megahit /megahit -COPY --from=shovill /pilon /pilon -COPY --from=shovill /SPAdes-*-Linux /spades -COPY --from=shovill /megahit /megahit -COPY --from=kraken2 /kraken2-2.1.2 /kraken2 -COPY --from=kraken2 /kraken2-db /kraken2-db -COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 -COPY --from=bbtools /opt/bbmap /opt/bbmap -COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ -COPY --from=lyveset /lyve-SET /lyve-SET -COPY --from=orthofinder /mmseqs /mmseqs -COPY --from=cfsan /gatk /gatk -# vsearch aims to be a drop in replacement for usearch and so let's see if that's true -# ie, set `usearch` as path to vsearch -#COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch -#COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch - -WORKDIR / -RUN wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-x86_64.tar.gz && \ - tar zxvf vsearch-2.21.1*.tar.gz && \ - cp -v vsearch-2.21.1-linux-x86_64/bin/vsearch /usr/local/bin/ && \ - ln 
-sv /usr/local/bin/vsearch /usr/local/bin/usearch && \ - rm -rfv ./vsearch - -# Copied from flye container -ENV FLYE_VER="2.9" -RUN wget https://github.com/fenderglass/Flye/archive/${FLYE_VER}.tar.gz && \ - tar -xvf ${FLYE_VER}.tar.gz && \ - rm ${FLYE_VER}.tar.gz && \ - cd Flye-${FLYE_VER} && \ - python setup.py build && \ - python setup.py install - -# Copied from samtools container -ENV SAMTOOLSVER="1.15" -RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLSVER}/samtools-${SAMTOOLSVER}.tar.bz2 && \ - tar -xjf samtools-${SAMTOOLSVER}.tar.bz2 && \ - rm samtools-${SAMTOOLSVER}.tar.bz2 && \ - cd samtools-${SAMTOOLSVER} && \ - ./configure && \ - make && \ - make install +RUN pip3 install ete3 numba pandas scikit-learn psutil click scipy +RUN pip3 install numpy==1.21.6 + +COPY --from=shovill /skesa/skesa /usr/local/bin/ +COPY --from=shovill /megahit /megahit +COPY --from=shovill /pilon /pilon +COPY --from=shovill /SPAdes-*-Linux /spades +COPY --from=kraken2 /kraken2-2* /kraken2 +COPY --from=kraken2 /kraken2-db /kraken2-db +COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 +COPY --from=bbtools /opt/bbmap /opt/bbmap +COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ +COPY --from=lyveset /lyve-SET /lyve-SET +COPY --from=orthofinder /mmseqs /mmseqs +COPY --from=cfsan /gatk /gatk +COPY --from=flye /Flye-* /flye +COPY --from=samtools /samtools-* /samtools +COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch +COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch +COPY --from=raxml /standard-RAxML-8.2.12 /standard-RAxML +COPY --from=raxml /raxml_ng /raxml_ng + +ARG DIAMOND_VER +ARG ETOKI_VER # Diamond -ENV DIAMOND_VER="v2.0.15" RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/diamond-linux64.tar.gz && \ tar zxvf diamond-linux64.tar.gz && \ mv -v diamond /usr/local/bin/diamond && \ + chmod +x /usr/local/bin/diamond && \ rm diamond-linux64.tar.gz ## EToKi itself ## -# ENV instructions set 
environment variables that persist from the build into the resulting image -# Use for e.g. $PATH and locale settings for compatibility with Singularity -ENV PATH="/usr/local/bin:/usr/local/bin/EToKi:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ - LC_ALL=C - -WORKDIR /usr/local/bin -RUN git clone https://github.com/zheminzhou/EToKi.git -b ${ETOKI_VER_ENV} - -RUN pip3 install ete3 numba numpy==1.21 pandas scikit-learn psutil click scipy +RUN wget https://github.com/zheminzhou/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ + ls / && \ + tar zxvf ${ETOKI_VER}.tar.gz && \ + rm ${ETOKI_VER}.tar.gz +#RUN git clone https://github.com/zheminzhou/EToKi.git -b ${ETOKI_VER} /EToKi-${ETOKI_VER} + +WORKDIR /EToKi-${ETOKI_VER} + +ENV PATH="/usr/local/bin:/EToKi-${ETOKI_VER}:/flye/bin:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/raxml_ng:/standard-RAxML:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ + LC_ALL=C + +RUN EToKi.py configure --help + +RUN EToKi.py configure --path bbduk=/opt/bbmap/bbduk.sh +RUN EToKi.py configure --path bbmerge=$(which bbmerge.sh) +RUN EToKi.py configure --path bbduk=/opt/bbmap/bbduk.sh +RUN EToKi.py configure --path bbmerge=$(which bbmerge.sh) +RUN EToKi.py configure --path repair=$(which repair.sh) +RUN EToKi.py configure --path pilon=$(find /pilon -name 'pilon-*.jar' | head -n 1) +RUN EToKi.py configure --path flye=$(which flye) +RUN EToKi.py configure --path kraken2=$(find /kraken2 -type f -name kraken2 | head -n 1) +RUN EToKi.py configure --path bowtie2=$(which bowtie2) +RUN EToKi.py configure --path bowtie2build=$(which bowtie2-build) +RUN EToKi.py configure --path raxml=$(which raxmlHPC) +RUN EToKi.py configure --path raxml_ng=$(which raxml-ng) +RUN EToKi.py configure --path samtools=$(which 
samtools) +RUN EToKi.py configure --path blastn=$(which blastn) +RUN EToKi.py configure --path makeblastdb=$(which makeblastdb) +RUN EToKi.py configure --path diamond=$(which diamond) +RUN EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar +RUN EToKi.py configure --path mmseqs=$(which mmseqs) +RUN EToKi.py configure --path megahit=$(which megahit) +RUN EToKi.py configure --path spades=$(which spades.py) +RUN EToKi.py configure --usearch /usr/local/bin/usearch -# Install all 3rd party software except the kraken database -WORKDIR /usr/local/bin/EToKi -# Install 3rd party tools -# Samtools has an issue for some reason and so explicitly give that path -# Also give the path to usearch since it is normally proprietary (although we have vsearch) -RUN python EToKi.py configure --path bbduk=$(which bbduk.sh) || true -RUN python EToKi.py configure --path bbmerge=$(which bbmerge.sh) || true -RUN python EToKi.py configure --path repair=$(which repair.sh) || true -RUN python EToKi.py configure --path pilon=$(find /pilon -name 'pilon-*.jar' | head -n 1) || true -RUN python EToKi.py configure --path flye=$(which flye) || true -RUN python EToKi.py configure --path kraken2=$(find /kraken2 -type f -name kraken2 | head -n 1) || true -RUN python EToKi.py configure --path bowtie2=$(which bowtie2) || true -RUN python EToKi.py configure --path bowtie2build=$(which bowtie2-build) || true -RUN python EToKi.py configure --path raxml=$(which raxml) || true -RUN python EToKi.py configure --path raxml_ng=$(which raxml_ng) || true -RUN python EToKi.py configure --path samtools=$(which samtools) || true -RUN python EToKi.py configure --path blastn=$(which blastn) || true -RUN python EToKi.py configure --path makeblastdb=$(which makeblastdb) || true -RUN python EToKi.py configure --path diamond=$(which diamond) || true -RUN python EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar || true -#RUN python EToKi.py 
configure --path lastal= -#RUN python EToKi.py configure --path lastdb= -RUN python EToKi.py configure --path mmseqs=$(which mmseqs) || true -RUN python EToKi.py configure --path megahit=$(which megahit) || true -RUN python EToKi.py configure --path spades=$(which spades.py) || true -#RUN python EToKi.py configure --path kraken_db= -RUN python EToKi.py configure --usearch /usr/local/bin/usearch -RUN python EToKi.py configure - -# Fix the shebang line for all EToKi scripts to /usr/bin/env python -RUN find /usr/local/bin/EToKi -name '*.py' -exec sed -i.bak -e '1 i #!/usr/bin/env python\n# ^^^ inserted corrected shebang for this container' {} \; - - -# WORKDIR sets working directory WORKDIR /data -# A second FROM insruction creates a new stage -# We use `test` for the test image FROM app as test -# Demonstrate that the program is successfully installed - -# Option 1: run the program's internal tests, for example with SPAdes: WORKDIR /usr/local/bin/EToKi +# Get the help menu up RUN EToKi.py --help +# Show the configuration works +RUN EToKi.py configure #RUN bash -e example.bash -RUN python EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab - -# Option 2: write your own tests in a bash script in the same directory as your Dockerfile: -#COPY my_tests.sh . 
-#RUN bash my_tests.sh +RUN EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab -# Option 3: write python unit tests in a tests/ directory in the same directory as your Dockerfile: -#RUN apt-get install -y python3 -#RUN mkdir tests/ -#COPY tests/ tests/ -#RUN python3 -m unittest discover -s tests From f582fe83faecc569f452784d4af9396708dd116b Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Mon, 2 May 2022 16:31:00 -0400 Subject: [PATCH 12/24] with ideas from Erin --- etoki/1.2/Dockerfile | 232 ++++++++++++++++++------------------------- 1 file changed, 95 insertions(+), 137 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index 2f69498a6..5ff34146f 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -1,17 +1,37 @@ -# FROM defines the base docker image. This command has to come first in the file -# The 'as' keyword lets you name the folowing stage. We use `app` for the production image -FROM ubuntu:focal as app +# FYI this is a multistage build, which can get very complicated -# ARG sets environment variables during the build stage ARG ETOKI_VER="1.2" -# Persistence with an env -ENV ETOKI_VER_ENV=$ETOKI_VER +ARG SKESA_VER=2.4.0 +ARG SPADES_VER=3.15.4 +ARG SHOVILL_VER=1.1.0 +ARG KRAKEN2_VER=2.1.2-no-db +ARG BOWTIE_VER=2.4.4 +ARG LYVESET_VER=1.1.4f +ARG VSEARCH_VER=2.21.1 +ARG BBTOOLS_VER=38.96 +ARG MLST_VER=2.19.0 +ARG ORTHOFINDER_VER=2.17 +ARG CFSAN_VER=2.0.2 +ARG FLYE_VER=2.9 +ARG SAMTOOLS_VER=1.15 +ARG DIAMOND_VER="v2.0.15" +ARG RAXML_VER="8.2.12" + +FROM staphb/shovill:${SHOVILL_VER} AS shovill +FROM staphb/kraken2:${KRAKEN2_VER} AS kraken2 +FROM staphb/bowtie2:${BOWTIE_VER} AS bowtie2 +FROM staphb/lyveset:${LYVESET_VER} AS lyveset +FROM torognes/vsearch:${VSEARCH_VER} AS vsearch +FROM staphb/bbtools:${BBTOOLS_VER} AS bbtools +FROM staphb/mlst:${MLST_VER} AS mlst +FROM staphb/orthofinder:${ORTHOFINDER_VER} AS orthofinder +FROM 
staphb/cfsan-snp-pipeline:${CFSAN_VER} AS cfsan +FROM staphb/flye:${FLYE_VER} AS flye +FROM staphb/samtools:${SAMTOOLS_VER} AS samtools +FROM staphb/raxml:${RAXML_VER} AS raxml + +FROM ubuntu:jammy as app -# set workdir to default for building; set to /data at the end -WORKDIR / - -# LABEL instructions tag the image with metadata that might be important to the user -# Optional, but highly recommended LABEL base.image="ubuntu:focal" LABEL dockerfile.version="1" LABEL software="EToKi" @@ -22,26 +42,8 @@ LABEL license="https://github.com/zheminzhou/EToKi/blob/master/LICENSE" LABEL maintainer="Lee Katz" LABEL maintainer.email="gzu2@cdc.gov" -# Multistage build -FROM staphb/shovill:1.1.0 AS shovill -FROM staphb/kraken2:2.1.2-no-db AS kraken2 -FROM staphb/bowtie2:2.4.4 AS bowtie2 -FROM staphb/lyveset:1.1.4f AS lyveset -FROM torognes/vsearch:2.21.1 AS vsearch -FROM staphb/bbtools:38.96 AS bbtools -FROM staphb/mlst:2.19.0 AS mlst -FROM staphb/orthofinder:2.17 AS orthofinder -FROM staphb/cfsan-snp-pipeline:2.0.2 AS cfsan - -# Back to the base app so that we have things like ENV variables -FROM app - -# https://askubuntu.com/a/1013396 -# avoid asking about timezone during apt-get ARG DEBIAN_FRONTEND=noninteractive -# RUN executes code during the build -# Install dependencies via apt-get or yum if using a centos or fedora base RUN apt-get update && apt-get install -y --no-install-recommends \ libncurses5-dev \ libbz2-dev \ @@ -52,6 +54,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ g++ \ python-setuptools \ zlib1g-dev \ + python-is-python3 \ python3-pip \ python3-dev \ libgconf-2-4 \ @@ -68,128 +71,83 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get autoclean && \ rm -rf /var/lib/apt/lists/* - #openjdk-8-jdk \ - #libcurl4-openssl-dev \ - -# Gimme python3 instead of python2 -RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 - -COPY --from=shovill /skesa/skesa /usr/local/bin/ -COPY --from=shovill 
/megahit /megahit -COPY --from=shovill /pilon /pilon -COPY --from=shovill /SPAdes-*-Linux /spades -COPY --from=shovill /megahit /megahit -COPY --from=kraken2 /kraken2-2.1.2 /kraken2 -COPY --from=kraken2 /kraken2-db /kraken2-db -COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 -COPY --from=bbtools /opt/bbmap /opt/bbmap -COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ -COPY --from=lyveset /lyve-SET /lyve-SET -COPY --from=orthofinder /mmseqs /mmseqs -COPY --from=cfsan /gatk /gatk -# vsearch aims to be a drop in replacement for usearch and so let's see if that's true -# ie, set `usearch` as path to vsearch -#COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch -#COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch - -WORKDIR / -RUN wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-x86_64.tar.gz && \ - tar zxvf vsearch-2.21.1*.tar.gz && \ - cp -v vsearch-2.21.1-linux-x86_64/bin/vsearch /usr/local/bin/ && \ - ln -sv /usr/local/bin/vsearch /usr/local/bin/usearch && \ - rm -rfv ./vsearch - -# Copied from flye container -ENV FLYE_VER="2.9" -RUN wget https://github.com/fenderglass/Flye/archive/${FLYE_VER}.tar.gz && \ - tar -xvf ${FLYE_VER}.tar.gz && \ - rm ${FLYE_VER}.tar.gz && \ - cd Flye-${FLYE_VER} && \ - python setup.py build && \ - python setup.py install - -# Copied from samtools container -ENV SAMTOOLSVER="1.15" -RUN wget https://github.com/samtools/samtools/releases/download/${SAMTOOLSVER}/samtools-${SAMTOOLSVER}.tar.bz2 && \ - tar -xjf samtools-${SAMTOOLSVER}.tar.bz2 && \ - rm samtools-${SAMTOOLSVER}.tar.bz2 && \ - cd samtools-${SAMTOOLSVER} && \ - ./configure && \ - make && \ - make install +RUN pip3 install ete3 numba pandas scikit-learn psutil click scipy +RUN pip3 install numpy==1.21.6 + +COPY --from=shovill /skesa/skesa /usr/local/bin/ +COPY --from=shovill /megahit /megahit +COPY --from=shovill /pilon /pilon +COPY --from=shovill /SPAdes-*-Linux /spades +COPY --from=kraken2 /kraken2-2* 
/kraken2 +COPY --from=kraken2 /kraken2-db /kraken2-db +COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 +COPY --from=bbtools /opt/bbmap /opt/bbmap +COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ +COPY --from=lyveset /lyve-SET /lyve-SET +COPY --from=orthofinder /mmseqs /mmseqs +COPY --from=cfsan /gatk /gatk +COPY --from=flye /Flye-* /flye +COPY --from=samtools /samtools-* /samtools +COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch +COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch +COPY --from=raxml /standard-RAxML-8.2.12 /standard-RAxML +COPY --from=raxml /raxml_ng /raxml_ng + +ARG DIAMOND_VER +ARG ETOKI_VER # Diamond -ENV DIAMOND_VER="v2.0.15" RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/diamond-linux64.tar.gz && \ tar zxvf diamond-linux64.tar.gz && \ mv -v diamond /usr/local/bin/diamond && \ + chmod +x /usr/local/bin/diamond && \ rm diamond-linux64.tar.gz ## EToKi itself ## -# ENV instructions set environment variables that persist from the build into the resulting image -# Use for e.g. 
$PATH and locale settings for compatibility with Singularity -ENV PATH="/usr/local/bin:/usr/local/bin/EToKi:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ - LC_ALL=C - -WORKDIR /usr/local/bin -RUN git clone https://github.com/zheminzhou/EToKi.git -b ${ETOKI_VER_ENV} - -RUN pip3 install ete3 numba numpy==1.21 pandas scikit-learn psutil click scipy +RUN wget https://github.com/zheminzhou/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ + ls / && \ + tar zxvf ${ETOKI_VER}.tar.gz && \ + rm ${ETOKI_VER}.tar.gz +#RUN git clone https://github.com/zheminzhou/EToKi.git -b ${ETOKI_VER} /EToKi-${ETOKI_VER} + +WORKDIR /EToKi-${ETOKI_VER} + +ENV PATH="/usr/local/bin:/EToKi-${ETOKI_VER}:/flye/bin:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/raxml_ng:/standard-RAxML:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ + LC_ALL=C + +RUN EToKi.py configure --help + +RUN EToKi.py configure --path bbduk=/opt/bbmap/bbduk.sh +RUN EToKi.py configure --path bbmerge=$(which bbmerge.sh) +RUN EToKi.py configure --path bbduk=/opt/bbmap/bbduk.sh +RUN EToKi.py configure --path bbmerge=$(which bbmerge.sh) +RUN EToKi.py configure --path repair=$(which repair.sh) +RUN EToKi.py configure --path pilon=$(find /pilon -name 'pilon-*.jar' | head -n 1) +RUN EToKi.py configure --path flye=$(which flye) +RUN EToKi.py configure --path kraken2=$(find /kraken2 -type f -name kraken2 | head -n 1) +RUN EToKi.py configure --path bowtie2=$(which bowtie2) +RUN EToKi.py configure --path bowtie2build=$(which bowtie2-build) +RUN EToKi.py configure --path raxml=$(which raxmlHPC) +RUN EToKi.py configure --path raxml_ng=$(which raxml-ng) +RUN EToKi.py configure --path samtools=$(which samtools) +RUN EToKi.py configure --path blastn=$(which blastn) +RUN EToKi.py configure --path 
makeblastdb=$(which makeblastdb) +RUN EToKi.py configure --path diamond=$(which diamond) +RUN EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar +RUN EToKi.py configure --path mmseqs=$(which mmseqs) +RUN EToKi.py configure --path megahit=$(which megahit) +RUN EToKi.py configure --path spades=$(which spades.py) +RUN EToKi.py configure --usearch /usr/local/bin/usearch -# Install all 3rd party software except the kraken database -WORKDIR /usr/local/bin/EToKi -# Install 3rd party tools -# Samtools has an issue for some reason and so explicitly give that path -# Also give the path to usearch since it is normally proprietary (although we have vsearch) -RUN python EToKi.py configure --path bbduk=$(which bbduk.sh) || true -RUN python EToKi.py configure --path bbmerge=$(which bbmerge.sh) || true -RUN python EToKi.py configure --path repair=$(which repair.sh) || true -RUN python EToKi.py configure --path pilon=$(find /pilon -name 'pilon-*.jar' | head -n 1) || true -RUN python EToKi.py configure --path flye=$(which flye) || true -RUN python EToKi.py configure --path kraken2=$(find /kraken2 -type f -name kraken2 | head -n 1) || true -RUN python EToKi.py configure --path bowtie2=$(which bowtie2) || true -RUN python EToKi.py configure --path bowtie2build=$(which bowtie2-build) || true -RUN python EToKi.py configure --path raxml=$(which raxml) || true -RUN python EToKi.py configure --path raxml_ng=$(which raxml_ng) || true -RUN python EToKi.py configure --path samtools=$(which samtools) || true -RUN python EToKi.py configure --path blastn=$(which blastn) || true -RUN python EToKi.py configure --path makeblastdb=$(which makeblastdb) || true -RUN python EToKi.py configure --path diamond=$(which diamond) || true -RUN python EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar || true -#RUN python EToKi.py configure --path lastal= -#RUN python EToKi.py configure --path lastdb= -RUN python EToKi.py 
configure --path mmseqs=$(which mmseqs) || true -RUN python EToKi.py configure --path megahit=$(which megahit) || true -RUN python EToKi.py configure --path spades=$(which spades.py) || true -#RUN python EToKi.py configure --path kraken_db= -RUN python EToKi.py configure --usearch /usr/local/bin/usearch -RUN python EToKi.py configure - -# Fix the shebang line for all EToKi scripts to /usr/bin/env python -RUN find /usr/local/bin/EToKi -name '*.py' -exec sed -i.bak -e '1 i #!/usr/bin/env python\n# ^^^ inserted corrected shebang for this container' {} \; - - -# WORKDIR sets working directory WORKDIR /data -# A second FROM insruction creates a new stage -# We use `test` for the test image FROM app as test -# Demonstrate that the program is successfully installed - -# Option 1: run the program's internal tests, for example with SPAdes: WORKDIR /usr/local/bin/EToKi +# Get the help menu up RUN EToKi.py --help +# Show the configuration works +RUN EToKi.py configure #RUN bash -e example.bash -RUN python EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab - -# Option 2: write your own tests in a bash script in the same directory as your Dockerfile: -#COPY my_tests.sh . 
-#RUN bash my_tests.sh +RUN EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab -# Option 3: write python unit tests in a tests/ directory in the same directory as your Dockerfile: -#RUN apt-get install -y python3 -#RUN mkdir tests/ -#COPY tests/ tests/ -#RUN python3 -m unittest discover -s tests From 0f18dc29e712920caec3b249319ee479b802c5ce Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Tue, 3 May 2022 16:16:01 -0400 Subject: [PATCH 13/24] etoki finally works --- etoki/1.2/Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index 5ff34146f..b5d8f9ed6 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -143,11 +143,13 @@ WORKDIR /data FROM app as test -WORKDIR /usr/local/bin/EToKi +ARG ETOKI_VER + +WORKDIR /EToKi-${ETOKI_VER} # Get the help menu up RUN EToKi.py --help # Show the configuration works RUN EToKi.py configure -#RUN bash -e example.bash -RUN EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab +RUN bash example.bash +#RUN EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab From ac94572a4855957a51f403e9c71284eb41de6b2f Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 4 May 2022 13:19:43 -0400 Subject: [PATCH 14/24] etoki test works --- etoki/1.2/Dockerfile | 59 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index b5d8f9ed6..6b10fd1e8 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -21,7 +21,6 @@ FROM staphb/shovill:${SHOVILL_VER} AS shovill FROM staphb/kraken2:${KRAKEN2_VER} AS kraken2 FROM staphb/bowtie2:${BOWTIE_VER} AS bowtie2 FROM 
staphb/lyveset:${LYVESET_VER} AS lyveset -FROM torognes/vsearch:${VSEARCH_VER} AS vsearch FROM staphb/bbtools:${BBTOOLS_VER} AS bbtools FROM staphb/mlst:${MLST_VER} AS mlst FROM staphb/orthofinder:${ORTHOFINDER_VER} AS orthofinder @@ -54,6 +53,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ g++ \ python-setuptools \ zlib1g-dev \ + libbz2-dev \ python-is-python3 \ python3-pip \ python3-dev \ @@ -67,12 +67,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ant \ libssl-dev \ python3-venv \ + autoconf \ + automake \ + make \ wget && \ apt-get autoclean && \ rm -rf /var/lib/apt/lists/* -RUN pip3 install ete3 numba pandas scikit-learn psutil click scipy -RUN pip3 install numpy==1.21.6 +RUN pip3 install ete3 numba pandas scikit-learn psutil click scipy numpy==1.21.6 COPY --from=shovill /skesa/skesa /usr/local/bin/ COPY --from=shovill /megahit /megahit @@ -88,13 +90,12 @@ COPY --from=orthofinder /mmseqs /mmseqs COPY --from=cfsan /gatk /gatk COPY --from=flye /Flye-* /flye COPY --from=samtools /samtools-* /samtools -COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/vsearch -COPY --from=vsearch /usr/local/bin/vsearch /usr/local/bin/usearch COPY --from=raxml /standard-RAxML-8.2.12 /standard-RAxML COPY --from=raxml /raxml_ng /raxml_ng ARG DIAMOND_VER ARG ETOKI_VER +ARG VSEARCH_VER # Diamond RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/diamond-linux64.tar.gz && \ @@ -103,6 +104,21 @@ RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/d chmod +x /usr/local/bin/diamond && \ rm diamond-linux64.tar.gz +# Vsearch sort of copied from https://github.com/torognes/vsearch/blob/v2.21.1/Dockerfile +WORKDIR /opt +RUN git clone https://github.com/torognes/vsearch.git && cd vsearch && git checkout v${VSEARCH_VER} +WORKDIR /opt/vsearch +RUN ./autogen.sh && \ + ./configure CFLAGS="-O3" CXXFLAGS="-O3" && \ + make clean && \ + make && \ + make install && \ + make clean && \ + cd .. 
&& \ + rm -rf /opt/vsearch + +WORKDIR / + ## EToKi itself ## RUN wget https://github.com/zheminzhou/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ ls / && \ @@ -137,7 +153,14 @@ RUN EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/Gen RUN EToKi.py configure --path mmseqs=$(which mmseqs) RUN EToKi.py configure --path megahit=$(which megahit) RUN EToKi.py configure --path spades=$(which spades.py) -RUN EToKi.py configure --usearch /usr/local/bin/usearch +RUN EToKi.py configure --usearch /usr/local/bin/vsearch + +# Swap out usearch for vsearch in the MLSType code +RUN sed -i "s/ublast_cmd = .*/ublast_cmd = 'vsearch --usearch_global {refAA} --db {qryAA} --threads {n_thread} --id 0.8 --maxaccepts 6 --blast6out {aaMatch} --userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(/" /EToKi-${ETOKI_VER}/modules/MLSType.py +RUN sed -i -e "s/qryAA=qryAA/qryAA=qry/" -e "s/refAA=refAA/refAA=ref/" /EToKi-${ETOKI_VER}/modules/MLSType.py +RUN sed -i "s/pp.communicate\(\)/ublast_res = pp.communicate(); print(ublast_res[1]);/" /EToKi-${ETOKI_VER}/modules/MLSType.py +# Show the results of this swap +RUN grep -n -C 5 [uv]search /EToKi-${ETOKI_VER}/modules/MLSType.py WORKDIR /data @@ -150,6 +173,30 @@ WORKDIR /EToKi-${ETOKI_VER} RUN EToKi.py --help # Show the configuration works RUN EToKi.py configure + +ENV MLSTdb="/EToKi-${ETOKI_VER}/examples/Escherichia.Achtman.alleles.fasta" +ENV MLSTalleles="/EToKi-${ETOKI_VER}/examples/Escherichia.Achtman.alleles.fasta" +ENV MLSTtab="/EToKi-${ETOKI_VER}/examples/Escherichia.Achtman.convert.tab" +ENV ECOLI_assembly="/EToKi-${ETOKI_VER}/examples/GCF_000005845.2_ASM584v2_genomic.fna" +ENV ECOLI_assembly2="/EToKi-${ETOKI_VER}/examples/GCF_000214765.2_ASM21476v3_genomic.fna" +ENV ECOLI_assembly3="/EToKi-${ETOKI_VER}/examples/GCF_001566635.1_ASM156663v1_genomic.fna" +RUN gunzip -vf ${ECOLI_assembly} ${ECOLI_assembly2} ${ECOLI_assembly3} + +# Try out MLST +RUN EToKi.py MLSTdb -i 
${MLSTalleles} -r ${MLSTdb} -d ${MLSTtab} +# Show some results of the database creation +RUN ls -lh $MLSTdb $MLSTalleles $MLSTtab && head ${MLSTalleles} ${MLSTdb} ${MLSTtab} +# Run typing +RUN EToKi.py MLSType -i ${ECOLI_assembly3} -r ${MLSTdb} -k G749 -o stdout -d ${MLSTtab} + +# Return the fasta files back to gzip status +RUN gzip -vf ${ECOLI_assembly} ${ECOLI_assembly2} ${ECOLI_assembly3} +# Some of this command will appear like it failed because we are lacking lastdb and kraken RUN bash example.bash + +# Run EToKi uberBlast, the same way that it is called in EToKi mlstDb +#RUN EToKi.py uberBlast -q ${ECOLI_assembly} -r ${ECOLI_assembly2} -f --blastn --diamondSELF --min_id 0.1 --min_ratio 0.1 -t 2 -p -s 0 -e 0,3 -o /dev/stdout +# RUN EToKi.py uberBlast -q ${ECOLI_assembly} -r ${ECOLI_assembly2} -f --blastn --diamondSELF --min_id 0.6 --min_ratio 0.7 -t 2 -p -s 1 -e 0,3 -o /dev/stdout +#RUN bash example.bash #RUN EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab From b9600f9b5bbcb270bcf8148a79e3123d0ac7c68c Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Mon, 23 May 2022 15:45:59 -0400 Subject: [PATCH 15/24] incorporating a suggestion from @erinyoung --- etoki/1.2/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index 5ff34146f..858f4221a 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -32,7 +32,7 @@ FROM staphb/raxml:${RAXML_VER} AS raxml FROM ubuntu:jammy as app -LABEL base.image="ubuntu:focal" +LABEL base.image="ubuntu:jammy" LABEL dockerfile.version="1" LABEL software="EToKi" LABEL software.version=$ETOKI_VER From 5a43e8fbbfdd0e8820543a5a636f1326c782cb75 Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Mon, 23 May 2022 15:52:05 -0400 Subject: [PATCH 16/24] added note that usearch replaced by blast --- etoki/1.2/README.md | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/etoki/1.2/README.md b/etoki/1.2/README.md index 4deb8327a..3bcb25aa2 100644 --- a/etoki/1.2/README.md +++ b/etoki/1.2/README.md @@ -2,6 +2,8 @@ _note_ this text was lifted from the original repo README +_note_ this is a modified version of EToKi mainly to remove `usearch` and replace it with blast + ### Trim genomic reads ~~~~~~~~~~~ python EToKi.py prepare --pe examples/S_R1.fastq.gz,examples/S_R2.fastq.gz -p examples/prep_out From dcaf8d033df8fde4712b0e663e1ab339cb6a0baa Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 25 May 2022 21:20:34 -0400 Subject: [PATCH 17/24] updated to EToKi lskatz fork --- etoki/1.2/Dockerfile | 91 ++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2/Dockerfile index e9e4b4cc4..a60273864 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2/Dockerfile @@ -7,9 +7,9 @@ ARG SHOVILL_VER=1.1.0 ARG KRAKEN2_VER=2.1.2-no-db ARG BOWTIE_VER=2.4.4 ARG LYVESET_VER=1.1.4f -ARG VSEARCH_VER=2.21.1 +#ARG VSEARCH_VER=2.21.1 ARG BBTOOLS_VER=38.96 -ARG MLST_VER=2.19.0 +#ARG MLST_VER=2.19.0 ARG ORTHOFINDER_VER=2.17 ARG CFSAN_VER=2.0.2 ARG FLYE_VER=2.9 @@ -22,8 +22,8 @@ FROM staphb/kraken2:${KRAKEN2_VER} AS kraken2 FROM staphb/bowtie2:${BOWTIE_VER} AS bowtie2 FROM staphb/lyveset:${LYVESET_VER} AS lyveset FROM staphb/bbtools:${BBTOOLS_VER} AS bbtools -FROM staphb/mlst:${MLST_VER} AS mlst -FROM staphb/orthofinder:${ORTHOFINDER_VER} AS orthofinder +#FROM staphb/mlst:${MLST_VER} AS mlst +#FROM staphb/orthofinder:${ORTHOFINDER_VER} AS orthofinder FROM staphb/cfsan-snp-pipeline:${CFSAN_VER} AS cfsan FROM staphb/flye:${FLYE_VER} AS flye FROM staphb/samtools:${SAMTOOLS_VER} AS samtools @@ -47,7 +47,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libncurses5-dev \ libbz2-dev \ liblzma-dev \ - perl \ + perl-base \ libcurl4-gnutls-dev \ gcc \ g++ \ @@ -70,7 +70,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ 
autoconf \ automake \ make \ - wget && \ + wget \ + mmseqs2 \ + ncbi-blast+ && \ apt-get autoclean && \ rm -rf /var/lib/apt/lists/* @@ -84,9 +86,9 @@ COPY --from=kraken2 /kraken2-2* /kraken2 COPY --from=kraken2 /kraken2-db /kraken2-db COPY --from=bowtie2 /opt/bowtie2-* /opt/bowtie2 COPY --from=bbtools /opt/bbmap /opt/bbmap -COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ +#COPY --from=mlst /ncbi-blast-2.9.0+ /ncbi-blast-2.9.0+ COPY --from=lyveset /lyve-SET /lyve-SET -COPY --from=orthofinder /mmseqs /mmseqs +#COPY --from=orthofinder /mmseqs /mmseqs COPY --from=cfsan /gatk /gatk COPY --from=flye /Flye-* /flye COPY --from=samtools /samtools-* /samtools @@ -95,7 +97,7 @@ COPY --from=raxml /raxml_ng /raxml_ng ARG DIAMOND_VER ARG ETOKI_VER -ARG VSEARCH_VER +#ARG VSEARCH_VER # Diamond RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/diamond-linux64.tar.gz && \ @@ -104,33 +106,50 @@ RUN wget https://github.com/bbuchfink/diamond/releases/download/${DIAMOND_VER}/d chmod +x /usr/local/bin/diamond && \ rm diamond-linux64.tar.gz +# mmseqs +#RUN wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz && \ +#RUN wget --no-check-certificate --secure-protocol=TLSv1_2 --debug -v --auth-no-challenge http://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz && \ +# tar -xvf mmseqs-linux-avx2.tar.gz && \ +# rm -rf mmseqs-linux-avx2.tar.gz && \ +# /bin/bash -c "source /mmseqs/util/bash-completion.sh" + + +ENV PATH="/usr/local/bin:/EToKi-${ETOKI_VER}:/flye/bin:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/raxml_ng:/standard-RAxML:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ + LC_ALL=C + +# Test many executables +#RUN mmseqs --version +#RUN flye --version +#RUN samtools --help +#RUN diamond --help && diamond --version + # Vsearch sort of copied from https://github.com/torognes/vsearch/blob/v2.21.1/Dockerfile -WORKDIR /opt -RUN git clone 
https://github.com/torognes/vsearch.git && cd vsearch && git checkout v${VSEARCH_VER} -WORKDIR /opt/vsearch -RUN ./autogen.sh && \ - ./configure CFLAGS="-O3" CXXFLAGS="-O3" && \ - make clean && \ - make && \ - make install && \ - make clean && \ - cd .. && \ - rm -rf /opt/vsearch +#WORKDIR /opt +#RUN git clone https://github.com/torognes/vsearch.git && cd vsearch && git checkout v${VSEARCH_VER} +#WORKDIR /opt/vsearch +#RUN ./autogen.sh && \ + #./configure CFLAGS="-O3" CXXFLAGS="-O3" && \ + #make clean && \ + #make && \ + #make install && \ + #make clean && \ + #cd .. && \ + #rm -rf /opt/vsearch WORKDIR / ## EToKi itself ## -RUN wget https://github.com/zheminzhou/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ - ls / && \ - tar zxvf ${ETOKI_VER}.tar.gz && \ - rm ${ETOKI_VER}.tar.gz -#RUN git clone https://github.com/zheminzhou/EToKi.git -b ${ETOKI_VER} /EToKi-${ETOKI_VER} +#RUN wget https://github.com/zheminzhou/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ +# ls / && \ +# tar zxvf ${ETOKI_VER}.tar.gz && \ +# rm ${ETOKI_VER}.tar.gz +# TODO checkout a specific tag or hashtag +RUN git clone https://github.com/lskatz/EToKi.git /EToKi-${ETOKI_VER} && \ + cd /EToKi-${ETOKI_VER} && \ + cd - WORKDIR /EToKi-${ETOKI_VER} -ENV PATH="/usr/local/bin:/EToKi-${ETOKI_VER}:/flye/bin:/megahit:/pilon:/spades/bin:/kraken2:/opt/bowtie2:/opt/bbmap:/ncbi-blast-2.9.0+/bin:/megahit/megahit_v1.1.4_LINUX_CPUONLY_x86_64-bin:/raxml_ng:/standard-RAxML:/lyve-SET:/lyve-SET/scripts:/mmseqs/bin:/mmseqs/util:$PATH" \ - LC_ALL=C - RUN EToKi.py configure --help RUN EToKi.py configure --path bbduk=/opt/bbmap/bbduk.sh @@ -147,20 +166,15 @@ RUN EToKi.py configure --path raxml=$(which raxmlHPC) RUN EToKi.py configure --path raxml_ng=$(which raxml-ng) RUN EToKi.py configure --path samtools=$(which samtools) RUN EToKi.py configure --path blastn=$(which blastn) +RUN EToKi.py configure --path blastp=$(which blastp) RUN EToKi.py configure --path makeblastdb=$(which makeblastdb) RUN EToKi.py configure 
--path diamond=$(which diamond) RUN EToKi.py configure --path gatk=/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar RUN EToKi.py configure --path mmseqs=$(which mmseqs) RUN EToKi.py configure --path megahit=$(which megahit) RUN EToKi.py configure --path spades=$(which spades.py) -RUN EToKi.py configure --usearch /usr/local/bin/vsearch - -# Swap out usearch for vsearch in the MLSType code -RUN sed -i "s/ublast_cmd = .*/ublast_cmd = 'vsearch --usearch_global {refAA} --db {qryAA} --threads {n_thread} --id 0.8 --maxaccepts 6 --blast6out {aaMatch} --userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(/" /EToKi-${ETOKI_VER}/modules/MLSType.py -RUN sed -i -e "s/qryAA=qryAA/qryAA=qry/" -e "s/refAA=refAA/refAA=ref/" /EToKi-${ETOKI_VER}/modules/MLSType.py -RUN sed -i "s/pp.communicate\(\)/ublast_res = pp.communicate(); print(ublast_res[1]);/" /EToKi-${ETOKI_VER}/modules/MLSType.py -# Show the results of this swap -RUN grep -n -C 5 [uv]search /EToKi-${ETOKI_VER}/modules/MLSType.py +# In the LK version, we are emulating usearch with blastp +RUN EToKi.py configure --usearch $(which blastp) WORKDIR /data @@ -194,8 +208,3 @@ RUN gzip -vf ${ECOLI_assembly} ${ECOLI_assembly2} ${ECOLI_assembly3} # Some of this command will appear like it failed because we are lacking lastdb and kraken RUN bash example.bash -# Run EToKi uberBlast, the same way that it is called in EToKi mlstDb -#RUN EToKi.py uberBlast -q ${ECOLI_assembly} -r ${ECOLI_assembly2} -f --blastn --diamondSELF --min_id 0.1 --min_ratio 0.1 -t 2 -p -s 0 -e 0,3 -o /dev/stdout -# RUN EToKi.py uberBlast -q ${ECOLI_assembly} -r ${ECOLI_assembly2} -f --blastn --diamondSELF --min_id 0.6 --min_ratio 0.7 -t 2 -p -s 1 -e 0,3 -o /dev/stdout -#RUN bash example.bash -#RUN EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab From 6dde5550e5693d134db1b113295928fe62069543 Mon 
Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Thu, 26 May 2022 10:43:52 -0400 Subject: [PATCH 18/24] updated EToKi to v1.2.1 and made a note in the EToKi readme that only MLST is guaranteed --- etoki/{1.2 => 1.2.1}/Dockerfile | 10 ++++++++-- etoki/{1.2 => 1.2.1}/README.md | 4 +++- 2 files changed, 11 insertions(+), 3 deletions(-) rename etoki/{1.2 => 1.2.1}/Dockerfile (94%) rename etoki/{1.2 => 1.2.1}/README.md (99%) diff --git a/etoki/1.2/Dockerfile b/etoki/1.2.1/Dockerfile similarity index 94% rename from etoki/1.2/Dockerfile rename to etoki/1.2.1/Dockerfile index a60273864..cbe6b6dc8 100644 --- a/etoki/1.2/Dockerfile +++ b/etoki/1.2.1/Dockerfile @@ -146,6 +146,7 @@ WORKDIR / # TODO checkout a specific tag or hashtag RUN git clone https://github.com/lskatz/EToKi.git /EToKi-${ETOKI_VER} && \ cd /EToKi-${ETOKI_VER} && \ + git checkout 1.2.1 && \ cd - WORKDIR /EToKi-${ETOKI_VER} @@ -205,6 +206,11 @@ RUN EToKi.py MLSType -i ${ECOLI_assembly3} -r ${MLSTdb} -k G749 -o stdout -d ${M # Return the fasta files back to gzip status RUN gzip -vf ${ECOLI_assembly} ${ECOLI_assembly2} ${ECOLI_assembly3} -# Some of this command will appear like it failed because we are lacking lastdb and kraken -RUN bash example.bash + +# Just take the examples of unit tests that are most relevant to MLST +# from example.bash in the repo +#RUN python EToKi.py MLSTdb -i examples/Escherichia.Achtman.alleles.fasta -r examples/Escherichia.Achtman.references.fasta -d examples/Escherichia.Achtman.convert.tab +#RUN gzip -cd examples/GCF_001566635.1_ASM156663v1_genomic.fna.gz > examples/GCF_001566635.1_ASM156663v1_genomic.fna && \ + + diff --git a/etoki/1.2/README.md b/etoki/1.2.1/README.md similarity index 99% rename from etoki/1.2/README.md rename to etoki/1.2.1/README.md index 3bcb25aa2..fa822506c 100644 --- a/etoki/1.2/README.md +++ b/etoki/1.2.1/README.md @@ -2,7 +2,9 @@ _note_ this text was lifted from the original repo README -_note_ this is a modified version of EToKi mainly to remove 
`usearch` and replace it with blast +_note_ this is a modified version of EToKi mainly to remove `usearch` and replace it with blast. +This is noted by using a custom version of EToKi labeled as `1.2.1` by @lskatz. +Additionally, only MLST methods are tested and there are other modules in EToKi that are not guaranteed to work such as assembly and metagenomics. ### Trim genomic reads ~~~~~~~~~~~ From a66fd67817acd5d1a56a25c4ae00860ca8fa4d7d Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 8 Jun 2022 21:59:58 -0400 Subject: [PATCH 19/24] fixed version in test-etoki.yml to v1.2.1 --- .github/workflows/test-etoki.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-etoki.yml b/.github/workflows/test-etoki.yml index 32738591a..0ce825cf9 100644 --- a/.github/workflows/test-etoki.yml +++ b/.github/workflows/test-etoki.yml @@ -9,7 +9,7 @@ on: workflow_dispatch: pull_request: paths: - - "etoki/1.2/Dockerfile" # Dockerfile path, e.g. 'htslib/1.14/Dockerfile' so that only your image is tested + - "etoki/1.2.1/Dockerfile" # Dockerfile path, e.g. 'htslib/1.14/Dockerfile' so that only your image is tested # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: From a9dd585c370db189b9c1d896d7e5cf2e3e7fc9b6 Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 8 Jun 2022 22:01:47 -0400 Subject: [PATCH 20/24] etoki v1.2.1 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c3b94c5a3..b828dd998 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ To learn more about the docker pull rate limits and the open source software pro | [datasets-sars-cov-2](https://github.com/CDCgov/datasets-sars-cov-2)
[![docker pulls](https://badgen.net/docker/pulls/staphb/datasets-sars-cov-2)](https://hub.docker.com/r/staphb/datasets-sars-cov-2) |
  • 0.6.2
  • 0.6.3
| https://github.com/CDCgov/datasets-sars-cov-2 | | [DSK](https://hub.docker.com/r/staphb/dsk)
[![docker pulls](https://badgen.net/docker/pulls/staphb/dsk)](https://hub.docker.com/r/staphb/dsk) |
  • 0.0.100
| https://gatb.inria.fr/software/dsk/ | | [emm-typing-tool](https://hub.docker.com/r/staphb/emm-typing-tool)
[![docker pulls](https://badgen.net/docker/pulls/staphb/emm-typing-tool)](https://hub.docker.com/r/staphb/emm-typing-tool) |
  • 0.0.1 (no version)
| https://github.com/phe-bioinformatics/emm-typing-tool | -| [EToKi](https://hub.docker.com/r/staphb/etoki)
[![docker pulls](https://badgen.net/docker/pulls/staphb/etoki)](https://hub.docker.com/r/staphb/etoki) |
  • 1.2
| https://github.com/zheminzhou/EToKi | +| [EToKi](https://hub.docker.com/r/staphb/etoki)
[![docker pulls](https://badgen.net/docker/pulls/staphb/etoki)](https://hub.docker.com/r/staphb/etoki) |
  • 1.2.1
| https://github.com/zheminzhou/EToKi | | [FastANI](https://hub.docker.com/r/staphb/fastani)
[![docker pulls](https://badgen.net/docker/pulls/staphb/fastani)](https://hub.docker.com/r/staphb/fastani) |
  • 1.1
  • 1.32
  • 1.33
| https://github.com/ParBLiSS/FastANI | | [FastTree](https://hub.docker.com/r/staphb/fasttree)
[![docker pulls](https://badgen.net/docker/pulls/staphb/fasttree)](https://hub.docker.com/r/staphb/fasttree) |
  • 2.1.11
| http://www.microbesonline.org/fasttree/ | | [FastQC](https://hub.docker.com/r/staphb/fastqc)
[![docker pulls](https://badgen.net/docker/pulls/staphb/fastqc)](https://hub.docker.com/r/staphb/fastqc) |
  • 0.11.8
  • 0.11.9
| https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
https://github.com/s-andrews/FastQC | From 85320705478646fb06a6d47b4c46bd50ee5eae43 Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Wed, 8 Jun 2022 22:02:36 -0400 Subject: [PATCH 21/24] v1.2.1 --- .github/workflows/test-etoki.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-etoki.yml b/.github/workflows/test-etoki.yml index 0ce825cf9..6a5955222 100644 --- a/.github/workflows/test-etoki.yml +++ b/.github/workflows/test-etoki.yml @@ -18,6 +18,6 @@ jobs: build-to-test: uses: ./.github/workflows/build-to-test.yml with: - path_to_context: "./etoki/1.2" # Path to directory with Dockerfile and context, e.g. "./spades/3.12.0" + path_to_context: "./etoki/1.2.1" # Path to directory with Dockerfile and context, e.g. "./spades/3.12.0" dockerfile_name: "Dockerfile" cache: "etoki" # Use the program name as a nickname for a GitHub cache of your image's layers, e.g. "spades". The cache will speed up re-running the workflow. From 781b52ca2f5290be01ac45d3421302198e53db7d Mon Sep 17 00:00:00 2001 From: Lee Katz - Aspen Date: Wed, 8 Jun 2022 22:10:44 -0400 Subject: [PATCH 22/24] added wget command instead of git checkout; v1.2.1 --- etoki/1.2.1/Dockerfile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/etoki/1.2.1/Dockerfile b/etoki/1.2.1/Dockerfile index cbe6b6dc8..18574e1fa 100644 --- a/etoki/1.2.1/Dockerfile +++ b/etoki/1.2.1/Dockerfile @@ -1,6 +1,6 @@ # FYI this is a multistage build, which can get very complicated -ARG ETOKI_VER="1.2" +ARG ETOKI_VER="1.2.1" ARG SKESA_VER=2.4.0 ARG SPADES_VER=3.15.4 ARG SHOVILL_VER=1.1.0 @@ -144,10 +144,13 @@ WORKDIR / # tar zxvf ${ETOKI_VER}.tar.gz && \ # rm ${ETOKI_VER}.tar.gz # TODO checkout a specific tag or hashtag -RUN git clone https://github.com/lskatz/EToKi.git /EToKi-${ETOKI_VER} && \ - cd /EToKi-${ETOKI_VER} && \ - git checkout 1.2.1 && \ - cd - +RUN wget https://github.com/lskatz/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ + tar zxvf 1.2.1.tar.gz + +#RUN git 
clone https://github.com/lskatz/EToKi.git /EToKi-${ETOKI_VER} && \ +# cd /EToKi-${ETOKI_VER} && \ +# git checkout 1.2.1 && \ +# cd - WORKDIR /EToKi-${ETOKI_VER} From f6cf4c3ea469d0658bc4e7f6a7627e6700dc7c8d Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Fri, 17 Jun 2022 20:23:57 -0400 Subject: [PATCH 23/24] added `tar zxvf ${ETOKI_VER}.tar.gz` --- etoki/1.2.1/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etoki/1.2.1/Dockerfile b/etoki/1.2.1/Dockerfile index 18574e1fa..6cde5bad3 100644 --- a/etoki/1.2.1/Dockerfile +++ b/etoki/1.2.1/Dockerfile @@ -145,7 +145,7 @@ WORKDIR / # rm ${ETOKI_VER}.tar.gz # TODO checkout a specific tag or hashtag RUN wget https://github.com/lskatz/EToKi/archive/refs/tags/${ETOKI_VER}.tar.gz && \ - tar zxvf 1.2.1.tar.gz + tar zxvf ${ETOKI_VER}.tar.gz #RUN git clone https://github.com/lskatz/EToKi.git /EToKi-${ETOKI_VER} && \ # cd /EToKi-${ETOKI_VER} && \ From ae897fd739eb607d6c35e691ac1bf8bd90fc123a Mon Sep 17 00:00:00 2001 From: Lee Katz Date: Fri, 17 Jun 2022 20:25:29 -0400 Subject: [PATCH 24/24] Delete test-etoki.yml --- .github/workflows/test-etoki.yml | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 .github/workflows/test-etoki.yml diff --git a/.github/workflows/test-etoki.yml b/.github/workflows/test-etoki.yml deleted file mode 100644 index 6a5955222..000000000 --- a/.github/workflows/test-etoki.yml +++ /dev/null @@ -1,23 +0,0 @@ -# This caller workflow builds an image to the "test" stage. -# Instructions: replace all the stubs in this template with values for your image. -# Some explanations come from: https://github.com/actions/starter-workflows/blob/main/automation/manual.yml - -name: Test etoki image - -# Controls when the action will run. Workflow runs when manually triggered using the UI or when you submit your pull request -on: - workflow_dispatch: - pull_request: - paths: - - "etoki/1.2.1/Dockerfile" # Dockerfile path, e.g. 
'htslib/1.14/Dockerfile' so that only your image is tested - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - - # This job calls a workflow to build the image to the 'test' stage - build-to-test: - uses: ./.github/workflows/build-to-test.yml - with: - path_to_context: "./etoki/1.2.1" # Path to directory with Dockerfile and context, e.g. "./spades/3.12.0" - dockerfile_name: "Dockerfile" - cache: "etoki" # Use the program name as a nickname for a GitHub cache of your image's layers, e.g. "spades". The cache will speed up re-running the workflow.