From b0441f4c43ecfaabd37055979c06e44bf43549eb Mon Sep 17 00:00:00 2001 From: Rad Suchecki Date: Wed, 8 Jan 2020 15:43:16 +1030 Subject: [PATCH] Feature/streamline inputs (#34) * use small genomes to generate examples and stramline input definitions * corrected urls * relaxed allowed target name regex * stingency settings not ensembl specific, moved to main config * refactoring gtf/gff3 fileds def * major re-work of input staging and multitude of related changes * updated repr pep filtering * relaxed req to include supercontigs not just chromosomes * added sequencesToPlace spec to test config * restored core functionality after re-structure * cleanup, comments * added samtools container def * optional faidx process if idx no provided * added data set from non Esembl source * generalised gff3-based pep conversion to Ensembl style, also allows pass-through of already existing records * allowing user-specified chromosome id pattern for block and feature JSON generation * updated and documented test data sets * travis stub * opted for smaller samtools container * hack to handle gz (not bgz) files fro chr lengths * minor * Update README.md * Update .travis.yaml * Update .travis.yaml * Update .travis.yaml * Update .travis.yaml * test profile with local data * travis data download and untar * travis fixes * ubu version for travis * updated dep * for GH actions * docker user change for GH actions * docker groovy test for GHA * docker user * docker grp exists * added go for singularity * added groovy image with ps * reconf * test profile updates - fix for groovy @Grab failing with singularity (read only file-system) - fix errorStrategy config * added Singularity install to GH actions * Singularity dependencies @ GH actions * working around https://github.com/sylabs/singularity/issues/3634 * test singularity pull form docker * explicit use of gawk - may matter on alpine * workaround for nextflow-io/nextflow#1210 sylabs/singularity#3634 * leaner fastx container * fastx and 
reconf * fix path to script, renamed tasks * test wspace path * added missing script, fixed GH actions cmd * ansi-lo on and try docker again * docker workflow test * fix typo * fix typo * fix for permission denied GH actions (?) * fix for groovy grapes in docker * test * test * test * test * another docker test * GH A job.needs experiemnt * GH A tidy * GH A fix indent * GH A fix job * added GH actions CI badge * re-implemented: duplicate emissions if multiple annotations per reference assembly * updated datastes in line with feature dev * another badge ver * fix * added EP datasets * ensure non-empty process out * generalised for different gff3 interpretations * Delete .travis.yaml * Update README.md * Update README.md * At & Bd ref fasta not needed * speeding things up: gawk in jq container and up resources * do not report markers placed outside pseudochromosomes (e.g. on scaffolds) * id pattern match extended to seq placement * redundant-ish * added TOC --- .github/workflows/main.yml | 68 +++++++ README.md | 88 ++++----- bin/excludeSameChromosome.awk | 4 +- bin/filterForRepresentative.awk | 6 +- bin/gff3AndRepr2ensembl_pep.awk | 29 ++- bin/gtfAndRepr2ensembl_pep.awk | 2 +- bin/paf2pretzel.groovy | 14 +- conf/containers.config | 13 +- conf/ensembl-plants.config | 8 - conf/microsporidia.config | 94 +++++++++ conf/requirements.config | 6 +- conf/test-data.config | 94 +++++++++ conf/triticeae.config | 218 +++++++++++--------- dockerfiles/fastx.Dockerfile | 3 + dockerfiles/jq.Dockerfile | 4 + main.nf | 341 ++++++++++++++++---------------- nextflow.config | 65 +++++- pull_containers.nf | 33 ++++ 18 files changed, 727 insertions(+), 363 deletions(-) create mode 100644 .github/workflows/main.yml create mode 100644 conf/microsporidia.config create mode 100644 conf/test-data.config create mode 100644 dockerfiles/fastx.Dockerfile create mode 100644 dockerfiles/jq.Dockerfile create mode 100644 pull_containers.nf diff --git a/.github/workflows/main.yml 
b/.github/workflows/main.yml new file mode 100644 index 0000000..2c745ef --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,68 @@ +name: CI + +on: [push] + +jobs: + docker: + runs-on: ubuntu-18.04 + steps: + - name: Install GraphViz + run: | + sudo apt-get update && sudo apt-get install -y graphviz + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Check out code + uses: actions/checkout@v1 + - name: Test workflow (docker) + run: | + NXF_VER=19.10.0 nextflow run ${GITHUB_WORKSPACE} -profile CI,docker --max_cpus 2 --max_memory 4.GB -ansi-log false + + singularity: + runs-on: ubuntu-18.04 + # runs-on: ubuntu-18.04 + steps: + - name: Check out code + uses: actions/checkout@v1 + - name: Set up Go + uses: actions/setup-go@v1 + with: + go-version: 1.13 + id: go + - name: Install Dependencies for Singularity + run: | + sudo apt-get update && sudo apt-get install -y \ + build-essential \ + libssl-dev \ + uuid-dev \ + libgpgme11-dev \ + squashfs-tools \ + libseccomp-dev \ + pkg-config + - name: Install Singularity + env: + SINGULARITY_VERSION: 3.5.2 + run: | + export GOPATH=/tmp/go + mkdir -p $GOPATH + sudo mkdir -p /usr/local/var/singularity/mnt && \ + mkdir -p $GOPATH/src/github.com/sylabs && \ + cd $GOPATH/src/github.com/sylabs && \ + wget -qO- https://github.com/sylabs/singularity/releases/download/v${SINGULARITY_VERSION}/singularity-${SINGULARITY_VERSION}.tar.gz | \ + tar xzv && \ + cd singularity && \ + ./mconfig -p /usr/local && \ + make -C builddir && \ + sudo make -C builddir install + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + - name: Pull containers + run: | + echo "GITHUB_WORKSPACE: ${GITHUB_WORKSPACE}" + nextflow run ${GITHUB_WORKSPACE}/pull_containers.nf -ansi-log false -profile singularity + - name: Test workflow (singularity) + run: | + NXF_VER=19.10.0 nextflow run ${GITHUB_WORKSPACE} -profile CI,singularity --max_cpus 2 
--max_memory 4.GB -ansi-log false \ No newline at end of file diff --git a/README.md b/README.md index ee90165..2e398d5 100644 --- a/README.md +++ b/README.md @@ -2,23 +2,15 @@ [![GitHub commits since latest release](https://img.shields.io/github/commits-since/plantinformatics/pretzel-input-generator/latest.svg?style=for-the-badge&logo=github)](https://github.com/plantinformatics/pretzel-input-generator/releases) -Note that this description lacks detail which will be added in future releases. - -# Pipeline overview - -`pretzel-input-generator` is a [nextflow](https://www.nextflow.io) pipeline for generating input for [pretzel](https://github.com/plantinformatics/pretzel) from annotated and (mostly) contiguous genome assemblies. The pipeline requires approximately 1 cpu-day, but as many processes can run independently, the real run-time is much shorter if suitable compute resources are available. +![GitHub Workflow Status](https://img.shields.io/github/workflow/status/plantinformatics/pretzel-input-generator/CI?label=CI%20TESTS&logo=github&style=for-the-badge) - - +**Note that this README is partly out-of-date** - [Pipeline overview](#pipeline-overview) - [Default pipeline](#default-pipeline) - - [Quick start example using yeast data](#quick-start-example-using-yeast-data) + - [Quick start example using microsporidia data](#quick-start-example-using-microsporidia-data) - [Input specification (triticeae and other relevant data sets)](#input-specification-triticeae-and-other-relevant-data-sets) - - [Data sources](#data-sources) - - [Remote](#remote) - - [Local](#local) - - [Other considerations](#other-considerations) + - [Disparate triticeae datasets](#disparate-triticeae-datasets) - [Dependencies](#dependencies) - [Execution](#execution) - [Output](#output) @@ -26,7 +18,13 @@ Note that this description lacks detail which will be added in future releases. 
- [Quick-ish start](#quick-ish-start) - [Output](#output-1) - + +# Pipeline overview + +`pretzel-input-generator` is a [nextflow](https://www.nextflow.io) pipeline for generating input for [pretzel](https://github.com/plantinformatics/pretzel) from annotated and (mostly) contiguous genome assemblies. The pipeline requires approximately ??? cpu-???, but as many processes can run independently, the real run-time is much shorter if suitable compute resources are available. + + + # Default pipeline @@ -35,54 +33,34 @@ Designed for EnsemblPlants and similarly formatted data. ![doc/dag.png](doc/dag.png) -## Quick start example using yeast data +## Quick start example using microsporidia data -Requires [nextflow](https://www.nextflow.io) and [Singularity](http://singularity.lbl.gov) +Requires [nextflow](https://www.nextflow.io) and either [Singularity](http://singularity.lbl.gov) or [Docker](https://www.docker.com) ``` nextflow run plantinformatics/pretzel-input-generator \ --profile YEAST,singularity --max_cpus 2 --max_memory 2.GB +-profile MICROSPORIDIA,singularity --max_cpus 2 --max_memory 2.GB ``` -This will pull and process data sets from [Ensembl](https://ensembl.org) specified in [`conf/ensembl-yeast.config`](conf/ensembl-yeast.config) +This will pull and process data sets specified in [`conf/microsporidia.config`](conf/microsporidia.config) -## Input specification (triticeae and other relevant data sets) - -Input files are specified in [conf/triticeae.config](conf/triticeae.config). This can be supplemented/replaced by JSON/YAML formatted input spec. 
- -### Data sources -Currently all input data comes from the following sources: - -* [Ensembl plants](https://plants.ensembl.org) - multiple datasets as specified in [`conf/triticeae.config`](conf/triticeae.config) and -* [International Wheat Genome Sequencing Consortium](https://www.wheatgenome.org/) - * [Triticum aestivum (Chinese Spring) IWGSC RefSeq v1.0 assembly](https://wheat-urgi.versailles.inra.fr/Seq-Repository/Assemblies) -* [The wild emmer wheat sequencing consortium (WEWseq)](http://wewseq.wixsite.com/consortium) - * Zavitan assembly downloaded from [GrainGenes](https://wheat.pw.usda.gov/GG3/wildemmer) -* [European Nucleotide Archive](https://www.ebi.ac.uk/ena) - * [Assembly of chromosome 2D of *Triticum aestivum* line CH Campala *Lr22a*](https://www.ebi.ac.uk/ena/data/view/LS480641) - * [Assembly of *Triticum urartu* ](https://www.ebi.ac.uk/ena/data/view/GCA_003073215) - * Annotation downloaded from [MBKBase](http://www.mbkbase.org/Tu/) - * [Assembly of *Aegilops tauschii* ](https://www.ebi.ac.uk/ena/data/view/GCA_002575655.1) - * Annotation downloaded from [http://aegilops.wheat.ucdavis.edu/ATGSP/annotation/](http://aegilops.wheat.ucdavis.edu/ATGSP/annotation/) -* ...and more... -* -#### Remote +## Input specification (triticeae and other relevant data sets) -The pipeline pulls data from Ensembl, included species and assembly versions are specified in configuration file(s) e.g. [conf/ensembl-plants-data.config](conf/ensembl-plants-data.config). -For each of the data sets the pipeline downloads: +A mix of local and remote files can be specified - see [`conf/microsporidia.config`](conf/microsporidia.config) and the corresponding [`conf/test-data.config`](conf/test-data.config) +There are several paths through the pipeline which are executed depending on input specification and availability of various input file types, e.g. 
-* genome assembly index file (required) +* genome assembly index file * protein sequences (required if pipeline is to generate aliases) -* genome assembly fasta (only required if pipeline is to place markers on assemblies) +* marker sequences +* genome assembly fasta (required if pipeline is to place marker sequences on assemblies) -#### Local +Different paths through the pipeline rely on partly different inputs -Different branches of the pipeline rely on partly different inputs +1. Generation of genome blocks requires a genome assembly index file - all we really need are lengths of pseudo-chromosomes so a two-column `.tsv` file with chromosome names and their lengths will suffice. Also, if genome assembly fasta file is specified, the index will be generated automatically. -1. Generation of genome blocks requires a genome assembly index file - all we really need are lengths of pseudo-chromosomes so a two-column `.tsv` file with chromosome names and their lengths will suffice 2. Placement of gene features on the generated genome blocks and generation of aliases between features requires * gene annotations (either GTF or GFF3) @@ -102,9 +80,10 @@ This follows how protein sequences are annotated on Ensembl plants, but we do no 4. Marker placement requires full reference FASTA file. -### Other considerations -Wherever possible the local assembly files are used as input for the pipeline in their original form - as downloaded from their respective sources. This is however not always possible due to inconsistencies in formatting and varying levels of adherence to standards and conventions. We try to capture additional steps needed to prepare these input data sets for the inclusion in this pipeline in [doc/format_local.md](doc/format_local.md). +### Disparate triticeae datasets + +Wherever possible the assembly files are used as input for the pipeline in their original form - as downloaded from their respective sources. 
This is however not always possible due to inconsistencies in formatting and varying levels of adherence to standards and conventions. We try to capture additional steps needed to prepare these input data sets for the inclusion in this pipeline in [doc/format_local.md](doc/format_local.md). ## Dependencies @@ -114,14 +93,13 @@ Wherever possible the local assembly files are used as input for the pipeline in * [Docker](http://singularity.lbl.gov) * Required software installed. In addition to standard linux tools, these include: * [FASTX-Toolkit](http://hannonlab.cshl.edu/fastx_toolkit/) - * [MMSeqs2](https://github.com/soedinglab/mmseqs2) + * [MMSeqs2](https://github.com/soedinglab/mmseqs2) - if generating aliases * [Minimap2](https://github.com/lh3/minimap2) - if placing markers * `jq` * `groovy` interpreter - When using Singularity or Docker, the required containers are specified in [`conf/containers.conf`](conf/containers.config) - +and pulled by Nextflow as required. ## Execution @@ -131,21 +109,21 @@ Run locally with docker ``` nextflow run plantinformatics/pretzel-input-generator \ --profile YEAST,docker +-profile MICROSPORIDIA,docker ``` Run locally with singularity ``` nextflow run plantinformatics/pretzel-input-generator \ --profile YEAST,singularity +-profile MICROSPORIDIA,singularity ``` Dispatch on a SLURM cluster with singularity ``` nextflow run plantinformatics/pretzel-input-generator \ --profile YEAST,slurm,singularity +-profile MICROSPORIDIA,slurm,singularity ``` ## Output @@ -162,13 +140,13 @@ All generated JSON files generated by the pipeline are output to `results/JSON`. The output files (hopefully) conform to the requirements of [pretzel data structure](https://github.com/plantinformatics/pretzel-data). - The `results/flowinfo` directory contains summaries of pipeline execution and `results/downloads` includes the files downloaded from Ensembl plants. 
``` results ├── downloads ├── flowinfo +├── summary └── JSON ``` @@ -193,4 +171,4 @@ This will pull and process data sets from [DNA Zoo](https://www.dnazoo.org/) spe ## Output -In comparison with the main pipeline the output lacks `*_aliases.json.gz` as features on different genomes are implicitly connected by BUSCOs identifiers. \ No newline at end of file +In comparison with the main pipeline the output lacks `*_aliases.json.gz` as features on different genomes are implicitly connected by BUSCOs identifiers. diff --git a/bin/excludeSameChromosome.awk b/bin/excludeSameChromosome.awk index 2b6f57f..9e00201 100755 --- a/bin/excludeSameChromosome.awk +++ b/bin/excludeSameChromosome.awk @@ -1,9 +1,9 @@ -#!/usr/bin/awk -f +#!/usr/bin/gawk -f BEGIN { OFS="\t"; } -NR==FNR && $3 ~/^chromosome/ { +NR==FNR && $3 ~/^(chromosome|supercontig)/ { #gsub("^>","",$1) split($3,location,":"); idmap[$1]=location[3]; diff --git a/bin/filterForRepresentative.awk b/bin/filterForRepresentative.awk index d870977..49a0aed 100755 --- a/bin/filterForRepresentative.awk +++ b/bin/filterForRepresentative.awk @@ -1,4 +1,4 @@ -#!/usr/bin/awk -f +#!/usr/bin/gawk -f BEGIN { FS = "\t"; @@ -9,8 +9,8 @@ BEGIN { split($1,arr," "); #GET GENE FIELD split(arr[4],gene,":"); #GET GENE FIELD ID=gene[2]; #GET GENE ID - sub(/^>[^ ]+/, ">"ID); #USE GENE ID AS FASTA IDENTIFIER (NOT TRANSCRIPT ID) - if(!(ID in storedIDs) || length($2) > length(StoredSeqLines[ID])) { #FIRST OCCURANCE OT LONGER THAN STORED + sub(/^>[^ ]+/, ">"ID); #USE GENE ID AS FASTA IDENTIFIER (RATER THAN THE TRANSCRIPT ID) + if(!(ID in storedIDs) || length($2) > length(StoredSeqLines[ID])) { #FIRST OCCURANCE OR LONGER THAN STORED storedIdLInes[ID] = $1; # print "storing "$1 storedSeqLines[ID] = $2; diff --git a/bin/gff3AndRepr2ensembl_pep.awk b/bin/gff3AndRepr2ensembl_pep.awk index 0766346..3d954fa 100755 --- a/bin/gff3AndRepr2ensembl_pep.awk +++ b/bin/gff3AndRepr2ensembl_pep.awk @@ -1,4 +1,4 @@ -#!/usr/bin/awk -f +#!/usr/bin/gawk -f BEGIN { 
FS="\t"; @@ -15,7 +15,7 @@ NR==FNR { } NR!=FNR { - if($3 =="mRNA") { + if($3 =="CDS") { gsub("\"",""); split($9,arr,";| "); for(i in arr) { @@ -23,16 +23,33 @@ NR!=FNR { if(pair[1]=="ID") { transcript=pair[2]; } else if(pair[1]=="Parent") { - gene=pair[2]; + parent=pair[2]; + gene=parent gsub(/\.[0-9]+$/,"",gene); + } else if(pair[1] ~ /^protein(_source)?_id$/) { + source=pair[2]; } } - if(transcript in repr && !(gene in printed)) { + # print "p="parent,"g="gene,"t="transcript,"s="source + # if(transcript in repr && !(gene in printed)) { + if(!(parent in printed)) { #IGNORECASE=1; gsub(/chr_?/,"",$1); #IGNORECASE=0; - printed[gene]=1; - print ">"transcript" pep chromosome:"version":"$1":"$4":"$5" gene:"gene"\n"repr[transcript]; + + if(source in repr) { + id = source + } else if(parent in repr) { #dealing with GFF files being inconsistent... + id = parent + } else { + id = "" + } + # if(source in repr && !(gene in printed)) { + # print ">"transcript" pep chromosome:"version":"$1":"$4":"$5" gene:"gene"\n"repr[transcript]; + if(id) { + print ">"id" pep chromosome:"version":"$1":"$4":"$5" gene:"gene"\n"repr[id]; + printed[parent]=1; + } } } } \ No newline at end of file diff --git a/bin/gtfAndRepr2ensembl_pep.awk b/bin/gtfAndRepr2ensembl_pep.awk index 2a0c586..81f586f 100755 --- a/bin/gtfAndRepr2ensembl_pep.awk +++ b/bin/gtfAndRepr2ensembl_pep.awk @@ -1,4 +1,4 @@ -#!/usr/bin/awk -f +#!/usr/bin/gawk -f BEGIN { FS="\t"; diff --git a/bin/paf2pretzel.groovy b/bin/paf2pretzel.groovy index 21154a9..b19e1d6 100755 --- a/bin/paf2pretzel.groovy +++ b/bin/paf2pretzel.groovy @@ -5,7 +5,9 @@ import java.util.zip.GZIPInputStream import java.util.zip.GZIPOutputStream -@Grab('info.picocli:picocli:4.0.0-alpha-3') //command line interface +//@Grab('info.picocli:picocli-groovy:4.1.2') //command line interface +groovy.grape.Grape.grab(group:'info.picocli', module:'picocli-groovy', version:'4.1.2') + @Command(header = [ //Font Name: Calvin S $/@|bold,blue ╔═╗╔═╗╔═╗ ┌┬┐┌─┐ 
╔═╗┬─┐┌─┐┌┬┐┌─┐┌─┐┬ |@/$, @@ -52,6 +54,9 @@ import static picocli.CommandLine.* @Option(names = ["--align-params"], description = ["Params used to generate input PAF alignments"]) @Field private String alignParams +@Option(names = ["--allowed-target-id-pattern"], description = ["Provide target identifier patter if other than common chromosome naming"]) +@Field private String allowedTargetIdPattern + @Option(names = ["-O", "--output"], description = ["JSON output file name"]) @Field private String output = '/dev/stdout' @@ -112,8 +117,12 @@ pafContent.eachLine { line -> // println "${query_identity} >= ${minIdentity} ?" if(query_identity >= minIdentity) { def kosher = true; - if(!(TNAME.toLowerCase() ==~ /^(chr(omosome)?)?(_)?([0-9]+|x|y|i|v).*/)) { + // println "check if TNAME kosher" + // if(!(TNAME.toLowerCase() ==~ /^(ch(romosome)?)?(_)?([0-9]+|x|y|i|v).*/)) { + // if(!(TNAME.toLowerCase() ==~ /^(ch(romosome)?)?(_)?([0-9]+|x|y|i|v|[0-9a-z_\-]).*/)) { + if(!((TNAME.toLowerCase() =~ /^(ch|[0-9]{1,2}|x|y|i|v)/) || (TNAME =~ allowedTargetIdPattern) )) { kosher = false //don't report placement on plasmid or other non-pseudomolecule parts of assembly + // println "${allowedTargetIdPattern} not matching $TNAME" } else if(markerMode && query_identity < 1) { //Not a 100% match, so for markers we check if no MM in last 3 bases - if notMarkerMode the required tag may not be present TAGS.each { tag -> if(tag.startsWith('cs:Z')) { @@ -129,6 +138,7 @@ pafContent.eachLine { line -> } if(kosher) { + // println TNAME def key = TNAME.replaceFirst("^(C|c)(H|h)(R|r)[_]?","") if(!scope.containsKey(key)) { scope << [(key) : []] diff --git a/conf/containers.config b/conf/containers.config index c269f48..eab5420 100644 --- a/conf/containers.config +++ b/conf/containers.config @@ -1,12 +1,10 @@ process { - withLabel: BUSCO { - container = 'rsuchecki/busco:3.0.2_blast2.8.1' - } withLabel: fastx { - container = 'biocontainers/fastxtools:v0.0.14_cv2' + container = 'rsuchecki/fastx:0.0.14' 
+ // container = 'biocontainers/fastxtools:v0.0.14_cv2' } withLabel: groovy { - container = 'groovy:3.0' + container = 'rsuchecki/groovy:3.0_868da92992a46b74552abbbf72b76c8aba3fbc9c' } withLabel: minimap2 { container = 'rsuchecki/minimap2:2.17' @@ -15,6 +13,9 @@ process { container = 'rsuchecki/mmseqs2:version-10' } withLabel: jq { - container = 'stedolan/jq' + container = 'rsuchecki/jq:latest' + } + withLabel: samtools { + container = 'mgibio/samtools:1.9' } } diff --git a/conf/ensembl-plants.config b/conf/ensembl-plants.config index 543036f..13d9d4b 100644 --- a/conf/ensembl-plants.config +++ b/conf/ensembl-plants.config @@ -6,12 +6,4 @@ pepsuffix = ".pep.all.fa.gz" idxsuffix = ".dna.toplevel.fa.gz.fai" fastasuffix = ".dna.toplevel.fa.gz" - - //SEQUENCE ALIGNMENT THRESHOLDS - minIdentity = 0.5 - minCoverage = 0.6 - - //POST ALIGNMENT FILTERING THRESHOLDS - minIdentityFilter = 0.7 - minCoverageFilter = 0.8 } \ No newline at end of file diff --git a/conf/microsporidia.config b/conf/microsporidia.config new file mode 100644 index 0000000..834452b --- /dev/null +++ b/conf/microsporidia.config @@ -0,0 +1,94 @@ + params { + /* + Dataset definitions - these are assemblies for which we want to generate pretzel-compatible JSON files. + + Fields: + species (required) - no spaces, stick to alpphanumeric and underscores + version (required) - version of the assembly + shortName (optional) - name displayed in pretzel over the chromosome axis + + The combination of species+version must be unique among input assemblies. + + In addition, to generate different data set types you will need one or more of the following: + + 1. 
For generation of protein-alignment-based aliases which are most useful for interspecies comparisons + idx - a path or URL to a genome index (fai) file, + or, really just sequence identifiers and their lengths + if idx not defined then 'fasta' field must be defined (see below) + pep - a path or URL to a set of proteins FASTA + if protein definition lines are formatted as pep files from Ensembl genomes + this is enough, otherwise you will also need + gff3 - a gff3 file describing the gene predictions, compatible with content of pep and the underlying genome assembly + + 2. To be able to place a set of sequences on an assembly, its definition should specify + fasta - a path or URL to a genome fasta file + you will also need to define sequences you want to place, see below comments for 'sequencesToPlace' + + + It is recommended to include these (optional) additional fields to capture the origin of your data sets + source + citation + + Furthermore, if chromosome ids in your dataset do not match /^(ch|[0-9]|x|y|i|v)/ + you may specify the following optional field + allowedIdPattern - which could be a regular expression matching your chromosome/supercontig naming pattern + + */ + references = [ + [ + species : "Encephalitozoon_intestinalis_ATCC_50506", + version : "gca_000146465", + shortName : "E. 
intestinalis", //arbitrary display name + pep : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/fasta/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/pep/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.pep.all.fa.gz", + // gff3 : not required as Ensembl-style pep provided // gff3 : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/gff3/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.45.gff3.gz", + idx : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/fasta/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/dna_index/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz.fai", + fasta : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/fasta/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/dna_index/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz", + source: "https://fungi.ensembl.org/Encephalitozoon_intestinalis_atcc_50506_gca_000146465" + ], + [ + species : "Encephalitozoon_cuniculi_ecuniii_l", + version : "gca_001078035", + shortName : "E. 
cuniculi L", //arbitrary display name + pep : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/fasta/fungi_microsporidia1_collection/encephalitozoon_cuniculi_ecuniii_l_gca_001078035/pep/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.pep.all.fa.gz", + //gff3 : (not needed with Ensembl-style pep but including here for testing purposes) + gff3 : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/gff3/fungi_microsporidia1_collection/encephalitozoon_cuniculi_ecuniii_l_gca_001078035/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.45.gff3.gz", + idx : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/fasta/fungi_microsporidia1_collection/encephalitozoon_cuniculi_ecuniii_l_gca_001078035/dna_index/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.dna.toplevel.fa.gz.fai", + source: "https://fungi.ensembl.org/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035" + ], + [ + species : "Encephalitozoon_hellem_ATCC_50504", + version : "2014-10-01", + shortName : "E. hellem", //arbitrary display name + pep : "https://microsporidiadb.org/common/downloads/Current_Release/EhellemATCC50504/fasta/data/MicrosporidiaDB-46_EhellemATCC50504_AnnotatedProteins.fasta", + gff3 : "https://microsporidiadb.org/common/downloads/Current_Release/EhellemATCC50504/gff/data/MicrosporidiaDB-46_EhellemATCC50504.gff", + fasta : "https://microsporidiadb.org/common/downloads/Current_Release/EhellemATCC50504/fasta/data/MicrosporidiaDB-46_EhellemATCC50504_Genome.fasta", + allowedIdPattern : '^CP0027.*', //Must specify chromosome ID prefix if chromosome naming other than /^(ch|[0-9]|x|y|i|v)/ + source: "https://microsporidiadb.org/micro/app/record/organism/NCBITAXON_907965" + ], + [ //Data set without pep specified so will not be used for alias generation, but genome FASTA provided so can be used to place marker/other sequences on + species : "Encephalitozoon_cuniculi_EC2", + version : "GCA_000221265.2", + shortName : "E cuniculi EC2", //arbitrary display name + fasta : 
"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/221/265/GCA_000221265.2_Ence_cuni_EC2_V1/GCA_000221265.2_Ence_cuni_EC2_V1_genomic.fna.gz", + allowedIdPattern : '^AEWQ010000.*', + source: "https://www.ncbi.nlm.nih.gov/assembly/GCA_000221265.2" + ] + ] + + /* + Markers, contigs scaffolds, gene predictions to be placed on all or a subset of assemblies + Fileds: + name + fasta + seqtype - can be one of markers|transcripts|cds|genomic + target: [ [species: '', version: ''], [species: '', version: ''] ] //all data sets if this optional field not specified + */ + sequencesToPlace = [ + [ + name: 'E_cuniculi', + fasta: 'ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/fasta/fungi_microsporidia1_collection/encephalitozoon_cuniculi_ecuniii_l_gca_001078035/cds/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.cds.all.fa.gz', //local or remote, either gz or not + seqtype: 'cds' //markers|transcripts|cds|genomic + ] + ] + } + diff --git a/conf/requirements.config b/conf/requirements.config index 7f14e8e..974d04a 100644 --- a/conf/requirements.config +++ b/conf/requirements.config @@ -3,7 +3,7 @@ process { maxRetries = 3 cpus = 2 memory = 2.GB - time = 1.h + time = { check_max( 1.h * task.attempt * task.attempt, 'time' ) } withLabel: download { errorStrategy = { task.attempt < process.maxRetries ? 'retry' : 'ignore' } } @@ -27,6 +27,10 @@ process { memory = { check_max( 120.GB, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } + withLabel: jq { + time = { check_max( 2.h * task.attempt, 'time' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } withLabel: mem { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 4.GB * task.attempt, 'memory' ) } diff --git a/conf/test-data.config b/conf/test-data.config new file mode 100644 index 0000000..b49dff5 --- /dev/null +++ b/conf/test-data.config @@ -0,0 +1,94 @@ + params { + /* + Dataset definitions - these are assemblies for which we want to generate pretzel-compatible JSON files. 
+ + Fields: + species (required) - no spaces, stick to alphanumeric and underscores + version (required) - version of the assembly + shortName (optional) - name displayed in pretzel over the chromosome axis + + The combination of species+version must be unique among input assemblies. + + In addition, to generate different data set types you will need one or more of the following: + + 1. For generation of protein-alignment-based aliases which are most useful for interspecies comparisons + idx - a path or URL to a genome index (fai) file, + or, really just sequence identifiers and their lengths + if idx not defined then 'fasta' field must be defined (see below) + pep - a path or URL to a set of proteins FASTA + if protein definition lines are formatted as pep files from Ensembl genomes + this is enough, otherwise you will also need + gff3 - a gff3 file describing the gene predictions, compatible with content of pep and the underlying genome assembly + + 2. To be able to place a set of sequences on an assembly, its definition should specify + fasta - a path or URL to a genome fasta file + you will also need to define sequences you want to place, see below comments for 'sequencesToPlace' + + + It is recommended to include these (optional) additional fields to capture the origin of your data sets + source + citation + + Furthermore, if chromosome ids in your dataset do not match /^(ch|[0-9]|x|y|i|v)/ + you may specify the following optional field + allowedIdPattern - which could be a regular expression matching your chromosome/supercontig naming pattern + + */ + references = [ + [ + species : "Encephalitozoon_intestinalis_ATCC_50506", + version : "gca_000146465", + shortName : "E. 
intestinalis", //arbitrary display name + pep : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.pep.all.fa.gz", + // gff3 : not required as Ensembl-style pep provided // gff3 : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/gff3/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.45.gff3.gz", + idx : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz.fai", + fasta : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz", + source: "https://fungi.ensembl.org/Encephalitozoon_intestinalis_atcc_50506_gca_000146465" + ], + [ + species : "Encephalitozoon_cuniculi_ecuniii_l", + version : "gca_001078035", + shortName : "E. cuniculi L", //arbitrary display name + pep : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.pep.all.fa.gz", + //gff3 : (not needed with Ensembl-style pep but including here for testing purposes) + gff3 : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.45.gff3.gz", + idx : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.dna.toplevel.fa.gz.fai", + source: "https://fungi.ensembl.org/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035" + ], + [ + species : "Encephalitozoon_hellem_ATCC_50504", + version : "2014-10-01", + shortName : "E. 
hellem", //arbitrary display name + pep : "testdata/MicrosporidiaDB-46_EhellemATCC50504_AnnotatedProteins.fasta.gz", + gff3 : "testdata/MicrosporidiaDB-46_EhellemATCC50504.gff.gz", + fasta : "testdata/MicrosporidiaDB-46_EhellemATCC50504_Genome.fasta.gz", + allowedIdPattern : '^CP0027.*', //Must specify chromosome ID prefix if chromosome naming other than /^(ch|[0-9]|x|y|i|v)/ + source: "https://microsporidiadb.org/micro/app/record/organism/NCBITAXON_907965" + ], + [ //Data set without pep specified so will not be used for alias generation, but genome FASTA provided so can be used to place marker/other sequences on + species : "Encephalitozoon_cuniculi_EC2", + version : "GCA_000221265.2", + shortName : "E cuniculi EC2", //arbitrary display name + fasta : "testdata/GCA_000221265.2_Ence_cuni_EC2_V1_genomic.fna.gz", + allowedIdPattern : '^AEWQ010000.*', + source: "https://www.ncbi.nlm.nih.gov/assembly/GCA_000221265.2" + ] + ] + + /* + Markers, contigs, scaffolds, gene predictions to be placed on all or a subset of assemblies + Fields: + name + fasta + seqtype - can be one of markers|transcripts|cds|genomic + target: [ [species: '', version: ''], [species: '', version: ''] ] //all data sets if this optional field not specified + */ + sequencesToPlace = [ + [ + name: 'E_cuniculi', + fasta: 'testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.cds.all.fa.gz', //local or remote, either gz or not + seqtype: 'cds' //markers|transcripts|cds|genomic + ] + ] + } + diff --git a/conf/triticeae.config b/conf/triticeae.config index 1240e7a..6ba3cc0 100644 --- a/conf/triticeae.config +++ b/conf/triticeae.config @@ -6,7 +6,37 @@ all content is parsed-in under params.
Alterantively, you can modify this file or add to it */ params { - localAssembly = [ + references = [ + [ + species : "Arabidopsis_thaliana", + version : "TAIR10", + shortName : "TAIR", + pep : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz", + idx : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/dna_index/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz.fai", + // fasta : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/arabidopsis_thaliana/dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz", + source : "https://plants.ensembl.org/Arabidopsis_thaliana", + citation : "https://doi.org/10.1093/nar/gkm965" + ], + [ + species : "Brachypodium_distachyon", + version : "v1.0", + shortName : "Brachy", + pep : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/pep/Brachypodium_distachyon.v1.0.pep.all.fa.gz", + idx : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/dna_index/Brachypodium_distachyon.v1.0.dna.toplevel.fa.gz.fai", + // fasta : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/dna/Brachypodium_distachyon.v1.0.dna.toplevel.fa.gz", + source : "https://plants.ensembl.org/Brachypodium_distachyon", + citation : "https://doi.org/10.1038/nature08747" + ], + [ + species : "Brachypodium_distachyon", + version : "v3.0", + shortName : "BrachyV3", + pep : "ftp://ftp.ensemblgenomes.org/pub/plants/release-43/fasta/brachypodium_distachyon/pep/Brachypodium_distachyon.Brachypodium_distachyon_v3.0.pep.all.fa.gz", + idx : "ftp://ftp.ensemblgenomes.org/pub/plants/release-43/fasta/brachypodium_distachyon/dna_index/Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa.gz.fai", + // fasta : "ftp://ftp.ensemblgenomes.org/pub/plants/release-43/fasta/brachypodium_distachyon/dna/Brachypodium_distachyon.Brachypodium_distachyon_v3.0.dna.toplevel.fa.gz", + source : 
"https://plants.ensembl.org/Brachypodium_distachyon", + citation : "https://doi.org/10.1038/nature08747" + ], [ species : "Aegilops_tauschii", subspecies : "strangulata", @@ -17,7 +47,7 @@ params { HC: "local/AET_High_confidence_gene_protein.fasta", LC: "local/AET_Low_confidence_gene_protein.fasta" ], - gtfgff3 : [ + gff3 : [ HC: "local/AET_High_confidence_gene_mod.gff3", LC: "local/AET_Low_confidence_gene_mod.gff3" ], @@ -35,7 +65,7 @@ params { HC: "local/barley/gene_annotation/Barley_Morex_V2_gene_annotation_PGSB.HC.aa.fasta", LC: "local/barley/gene_annotation/Barley_Morex_V2_gene_annotation_PGSB.LC.aa.fasta" ], - gtfgff3 : [ + gff3 : [ HC: "local/barley/gene_annotation/Barley_Morex_V2_gene_annotation_PGSB.HC.gff3", LC: "local/barley/gene_annotation/Barley_Morex_V2_gene_annotation_PGSB.LC.gff3" ], @@ -52,7 +82,7 @@ params { RGAP : "local/Rice_RGAP/all.pep", IRGSP : "local/Oryza_sativa.IRGSP-1.0.pep.REPR.fa" ], - gtfgff3 : [ + gff3 : [ RGAP: "local/Rice_RGAP/all.gff3" // IRGSP : "local/Oryza_sativa.IRGSP-1.0.43.gff3" //not needed as pep already formatted correctly ], @@ -68,7 +98,7 @@ params { HC: "local/iwgsc_refseqv1.0_HighConf_REPR_PROTEIN_2017Apr03.fa", LC: "local/iwgsc_refseqv1.0_LowConf_REPR_PROTEIN_2017Apr03.fa" ], - gtfgff3 : [ + gff3 : [ HC: "local/iwgsc_refseqv1.0_HighConf_2017Mar13.gff3", LC: "local/iwgsc_refseqv1.0_LowConf_2017Mar13.gff3" ], @@ -89,96 +119,96 @@ params { fasta : "local/iwgsc_refseqv2.0_all_chromosomes.fa", source : "https://wheat-urgi.versailles.inra.fr/Seq-Repository/Assemblies", ], - //10 wheats START ========================================== - [ - species : "Triticum_aestivum", - version : "Julius_MAGIC3_170807", - shortName : "Julius", - idx : "local/10wheats/170807_julius_MAGIC3_pseudomolecules.fasta.gz.fai", - fasta : "local/10wheats/170807_julius_MAGIC3_pseudomolecules.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : 
"Landmark_v1_170831", - shortName : "Landmark", - idx : "local/10wheats/170831_Landmark_pseudomolecules.fasta.gz.fai", - fasta : "local/10wheats/170831_Landmark_pseudomolecules.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "Jagger_v1.1_180529", - shortName : "Jagger", - idx : "local/10wheats/180529_Jagger_pseudomolecule_v1.1.fasta.gz.fai", - fasta : "local/10wheats/180529_Jagger_pseudomolecule_v1.1.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "ArinaLrFor_v3_180808", - shortName : "Arina", - idx : "local/10wheats/180808_ArinaLrFor_pseudomolecules_v3.fasta.gz.fai", - fasta : "local/10wheats/180808_ArinaLrFor_pseudomolecules_v3.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "Stanley_v1.2_180902", - shortName : "Stanley", - idx : "local/10wheats/180902_Stanley_pseudomolecules_v1.2.fasta.gz.fai", - fasta : "local/10wheats/180902_Stanley_pseudomolecules_v1.2.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "SY_Mattis_v1_181016", - shortName : "Mattis", - idx : "local/10wheats/181016_SY_Mattis_pseudomolecule_v1.fasta.gz.fai", - fasta : "local/10wheats/181016_SY_Mattis_pseudomolecule_v1.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "Lancer_v1.0_181120", - shortName : "Lancer", - idx : "local/10wheats/181120_lancer_pseudomolecule_v1.0.fasta.gz.fai", - fasta : "local/10wheats/181120_lancer_pseudomolecule_v1.0.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "Mace_v1.0_181120", - shortName : "Mace", - idx 
: "local/10wheats/181120_mace_pseudomolecule_v1.0.fasta.gz.fai", - fasta : "local/10wheats/181120_mace_pseudomolecule_v1.0.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_aestivum", - version : "Norin61_v1.1_190307", - shortName : "Norin61", - idx : "local/10wheats/190307_Norin61_pseudomolecule_v1.1.fasta.gz.fai", - fasta : "local/10wheats/190307_Norin61_pseudomolecule_v1.1.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], - [ - species : "Triticum_spelta", - version : "Spelt_v1.0_190524", - shortName : "Spelt", - idx : "local/10wheats/190524_spelt_pseudomolecules_v1.0.fasta.gz.fai", - fasta : "local/10wheats/190524_spelt_pseudomolecules_v1.0.fasta.gz", - source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - ], + // //10 wheats START ========================================== + // [ + // species : "Triticum_aestivum", + // version : "Julius_MAGIC3_170807", + // shortName : "Julius", + // idx : "local/10wheats/170807_julius_MAGIC3_pseudomolecules.fasta.gz.fai", + // fasta : "local/10wheats/170807_julius_MAGIC3_pseudomolecules.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_aestivum", + // version : "Landmark_v1_170831", + // shortName : "Landmark", + // idx : "local/10wheats/170831_Landmark_pseudomolecules.fasta.gz.fai", + // fasta : "local/10wheats/170831_Landmark_pseudomolecules.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_aestivum", + // version : "Jagger_v1.1_180529", + // shortName : "Jagger", + // idx : "local/10wheats/180529_Jagger_pseudomolecule_v1.1.fasta.gz.fai", + // fasta : "local/10wheats/180529_Jagger_pseudomolecule_v1.1.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : 
"Triticum_aestivum", + // version : "ArinaLrFor_v3_180808", + // shortName : "Arina", + // idx : "local/10wheats/180808_ArinaLrFor_pseudomolecules_v3.fasta.gz.fai", + // fasta : "local/10wheats/180808_ArinaLrFor_pseudomolecules_v3.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], // [ // species : "Triticum_aestivum", - // version : "", - // shortName : "", - // idx : "local/10wheats/", - // fasta : "local/10wheats/", + // version : "Stanley_v1.2_180902", + // shortName : "Stanley", + // idx : "local/10wheats/180902_Stanley_pseudomolecules_v1.2.fasta.gz.fai", + // fasta : "local/10wheats/180902_Stanley_pseudomolecules_v1.2.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_aestivum", + // version : "SY_Mattis_v1_181016", + // shortName : "Mattis", + // idx : "local/10wheats/181016_SY_Mattis_pseudomolecule_v1.fasta.gz.fai", + // fasta : "local/10wheats/181016_SY_Mattis_pseudomolecule_v1.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_aestivum", + // version : "Lancer_v1.0_181120", + // shortName : "Lancer", + // idx : "local/10wheats/181120_lancer_pseudomolecule_v1.0.fasta.gz.fai", + // fasta : "local/10wheats/181120_lancer_pseudomolecule_v1.0.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_aestivum", + // version : "Mace_v1.0_181120", + // shortName : "Mace", + // idx : "local/10wheats/181120_mace_pseudomolecule_v1.0.fasta.gz.fai", + // fasta : "local/10wheats/181120_mace_pseudomolecule_v1.0.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_aestivum", + // version : "Norin61_v1.1_190307", + // shortName : "Norin61", + // idx : 
"local/10wheats/190307_Norin61_pseudomolecule_v1.1.fasta.gz.fai", + // fasta : "local/10wheats/190307_Norin61_pseudomolecule_v1.1.fasta.gz", + // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // ], + // [ + // species : "Triticum_spelta", + // version : "Spelt_v1.0_190524", + // shortName : "Spelt", + // idx : "local/10wheats/190524_spelt_pseudomolecules_v1.0.fasta.gz.fai", + // fasta : "local/10wheats/190524_spelt_pseudomolecules_v1.0.fasta.gz", // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", - // ], - //10 wheats END ========================================== + // ], + // // [ + // // species : "Triticum_aestivum", + // // version : "", + // // shortName : "", + // // idx : "local/10wheats/", + // // fasta : "local/10wheats/", + // // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", + // // ], + // //10 wheats END ========================================== [ species : "Triticum_aestivum", version : "Campala_Lr22a", @@ -197,7 +227,7 @@ params { HC : "local/TRIDC_WEWseq_PGSB_20160501_Proteins_HighConf_REPR.fasta", LC : "local/TRIDC_WEWseq_PGSB_20160501_Proteins_LowConf_REPR.fasta", ], - gtfgff3 : [ + gtf : [ HC : "local/TRIDC_WEWseq_PGSB_20160501_HighConf.gtf", LC : "local/TRIDC_WEWseq_PGSB_20160501_LowConf.gtf" ], @@ -235,7 +265,7 @@ params { HC : "local/WheatTu.pros.long.HC.fasta", LC : "local/WheatTu.pros.long.LC.fasta", ], - gtfgff3 : [ + gff3 : [ HC : "local/WheatTu.gene_mod.gff", LC : "local/WheatTu.gene_mod.gff" ], diff --git a/dockerfiles/fastx.Dockerfile b/dockerfiles/fastx.Dockerfile new file mode 100644 index 0000000..7066cc7 --- /dev/null +++ b/dockerfiles/fastx.Dockerfile @@ -0,0 +1,3 @@ +FROM alexcoppe/fastx-toolkit-lite + +RUN apk --no-cache add procps coreutils gawk diff --git a/dockerfiles/jq.Dockerfile b/dockerfiles/jq.Dockerfile new file mode 100644 index 0000000..38a36fa --- /dev/null +++ b/dockerfiles/jq.Dockerfile @@ -0,0 +1,4 @@ +FROM stedolan/jq + 
+RUN apt-get update && apt-get install -y gawk \ + && apt-get autoremove -y \ No newline at end of file diff --git a/main.nf b/main.nf index f53dd75..1fd54e7 100644 --- a/main.nf +++ b/main.nf @@ -5,11 +5,22 @@ if(workflow.profile.contains('BUSCOs')) { exit 1 } -//INPUT PARAMS -trialLines = params.trialLines -// eprelease = params.eprelease -import static groovy.json.JsonOutput.* +// import static groovy.json.JsonOutput.* +//For pretty-printing nested maps etc +import groovy.json.JsonGenerator +import groovy.json.JsonSlurper +import groovy.json.JsonOutput + +//Preventing stack overflow on Path objects and other when map -> JSON +JsonGenerator jsonGenerator = new JsonGenerator.Options() + .addConverter(java.nio.file.Path) { java.nio.file.Path p, String key -> p.toUriString() } + .addConverter(Duration) { Duration d, String key -> d.durationInMillis } + .addConverter(java.time.OffsetDateTime) { java.time.OffsetDateTime dt, String key -> dt.toString() } + .addConverter(nextflow.NextflowMeta) { nextflow.NextflowMeta m, String key -> m.toJsonMap() } //incompatible with Nextflow <= 19.04.0 + .excludeFieldsByType(java.lang.Class) // .excludeFieldsByName('class') + // .excludeNulls() + .build() //Otherwise JSON generation triggers stackoverflow when encountering Path objects jsonGenerator = new groovy.json.JsonGenerator.Options() @@ -38,78 +49,27 @@ if (params.help){ exit 0 } -//STATIC(?) ENSEMBL URLs -urlprefix = params.urlprefix -pepsuffix = params.pepsuffix -idxsuffix = params.idxsuffix -fastasuffix = params.fastasuffix // def noKeyOrIsMap //LOCAL marker/contigs to place sets Channel.from(params.sequencesToPlace) -.filter { params.sequencesToPlace != "NA" } .map { [it, file(it.fasta)] } .set{ sequencesToPlaceChannel } -//LOCAL INPUTS -localInput = Channel.create() -localIndices = Channel.create() -localGenomeSeqs = Channel.create() - params.localAssembly.each { - //Genome Fasta optional (?) 
- if(it.containsKey("fasta")) { - localGenomeSeqs << [it,file(it.fasta)] - it.remove('fasta') //preventing cached re runs after fasta added to meta - } - // println(prettyPrint(toJson(it))) - // for (key in ["gtf","gff3"]) { - key = 'gtfgff3' - // if(it.containsKey(key)) { - //IF MORE THAN ONE ANNOTATION PER GENOME - if(it.containsKey("pep")) { - if(it.pep instanceof Map) { // && it.containsKey(key) && it.get(key) instanceof Map) { - it.pep.each {id, pep -> - // println(id+" -> "+pep) - clone = it.clone() - clone.annotation = id - gtfgff3 = (it.containsKey(key) && it.get(key).containsKey(id)) ? file(it.get(key).get(id)) : null - localInput << [clone,gtfgff3,file(pep)] - } - } else { //if (!(it.pep instanceof Map) && !(it.get(key) instanceof Map)){ - gtfgff3 = it.containsKey(key) ? file(it.get(key)) : null - localInput << [it,gtfgff3,file(it.pep)] - // } else { - // exitOnInputMismatch(it) - } - } - //ALL SHOULD HAVE AN INDEX - if(it.containsKey("idx")) { - localIndices << [it,file(it.idx)] - } +//INPUT DATA - } -localInput.close() -localIndices.close() -localGenomeSeqs.close() -// def exitOnInputMismatch(data) { -// println("Malformed input. Expecting number of pep and gtf/gff3 inputs to match for a data set.") -// println("Offending data set: ") -// println(data) -// println("Terminating.") -// System.exit(1) -// } /* Generic method for extracting a string tag or a file basename from a metadata map */ def getAnnotationTagFromMeta(meta, delim = '_') { - return meta.species+delim+meta.version+(meta.containsKey("annotation") ? delim+meta.annotation : "")+(trialLines == null ? "" : delim+trialLines+delim+"trialLines") + return meta.species+delim+meta.version+(meta.containsKey("annotation") ? 
delim+meta.annotation : "") } @@ -117,58 +77,23 @@ def getAnnotationTagFromMeta(meta, delim = '_') { Generic method for extracting a string tag or a file basename from a metadata map */ def getDatasetTagFromMeta(meta, delim = '_') { - return meta.species+delim+meta.version+(trialLines == null ? "" : delim+trialLines+delim+"trialLines") + return meta.species+delim+meta.version } +Channel.from(params.references) + .into { refsChannel1; refsChannel2 ; refsChannel3} -/* -* Download peptide seqs and assembly index files from Ensembl plants -*/ -process fetchRemoteDataFromEnsembl { - tag{meta.subMap(['species','version','release'])} - label 'download' - - input: - set val(species), val(version), val(shortName), val(eprelease) from Channel.from(params.remoteAssembly) - - output: - set val(meta), file("${basename}.idx") into remoteIndices - set val(meta), file("${basename}.pep") into remotePepSeqs - set val(meta), file("${basename}.fasta") into remoteGenomeSeqs - - script: - meta=["species":species, "version":version, "source": "https://plants.ensembl.org/"+species, "release": eprelease, "shortName": shortName] - basename=getDatasetTagFromMeta(meta) - idxurl=urlprefix+eprelease+"/fasta/"+species.toLowerCase()+"/dna_index/"+species+"."+version+idxsuffix - fastaurl=urlprefix+eprelease+"/fasta/"+species.toLowerCase()+"/dna/"+species+"."+version+fastasuffix - pepurl=urlprefix+eprelease+"/fasta/"+species.toLowerCase()+"/pep/"+species+"."+version+pepsuffix - //Someone decided to embed Genus_species in version as well, - //must be kept there to fetch from Ensembl plants but otherwise annoying as makes data set names long and repetitive - //could be solved by explicitly setting pathis in input config (e.g. 
conf/triticeae.config) - meta.version = ((meta.version-meta.species).strip('_')) - if(trialLines == null) { - """ - curl $idxurl > ${basename}.idx - curl $fastaurl | gunzip --stdout > ${basename}.fasta - curl $pepurl | gunzip --stdout > ${basename}.pep - """ - } else { - """ - curl $idxurl > ${basename}.idx - curl $fastaurl | gunzip --stdout | head -n ${trialLines} > ${basename}.fasta - curl $pepurl | gunzip --stdout | head -n ${trialLines} > ${basename}.pep - """ - } -} - process alignToGenome { label 'minimap2' - tag {"${refmeta.subMap(['species','version'])} <- ${seqsmeta.name}"} + input: - set val(refmeta), file(ref), val(seqsmeta), file(seqs) from remoteGenomeSeqs.mix(localGenomeSeqs).combine(sequencesToPlaceChannel) // <========= + tuple val(refmeta), file(ref), val(seqsmeta), file(seqs) from refsChannel2 + .filter { it.containsKey('fasta') } + .map { [it, file(it.fasta)]} + .combine(sequencesToPlaceChannel) output: set val(outmeta), file('*.paf') into alignedSeqsChannel @@ -259,11 +184,39 @@ process generateFeaturesFromSeqAlignmentsJSON { --short-name ${meta.seqs.name} \ --align-tool ${meta.align.tool} \ --align-params "${meta.align.params}" \ + --allowed-target-id-pattern '${meta.ref.allowedIdPattern}' \ --output ${tag}_${meta.seqs.seqtype}.json.gz \ --out-counts ${tag}_${meta.seqs.seqtype}.counts """ } +refsChannel1 + .branch { meta -> //redirect data sets; ones without fai idx will need to have it generated + ready: meta.containsKey('idx') + [meta, file(meta.idx)] + faidx: meta.containsKey('fasta') + [meta, file(meta.fasta)] + } + .set { refs4genomeBlocks1 } + +process faidxAssembly { + tag{tag} + label 'samtools' + + input: + tuple val(meta), file(fasta) from refs4genomeBlocks1.faidx + + output: + tuple val(meta), file("${fasta}.fai") into refs4genomeBlocks2 + + script: + tag=getDatasetTagFromMeta(meta) + """ + #if err, likely due to gzipped not bgzipped fasta then index flat - we just need the lengths not the index! 
+ samtools faidx ${fasta} || (zcat ${fasta} > tmp && samtools faidx tmp && mv tmp.fai ${fasta}.fai) + """ +} + /* * Generate genome blocks definitions JSON for pretzel */ @@ -273,7 +226,7 @@ process generateGenomeBlocksJSON { label 'groovy' input: - set val(meta), file(idx) from localIndices.mix(remoteIndices) + tuple val(meta), file(idx) from refs4genomeBlocks1.ready.mix(refs4genomeBlocks2) output: file "*.json" into genomeBlocksJSON, genomeBlocksStats @@ -305,7 +258,7 @@ process generateGenomeBlocksJSON { genome.meta << ["type" : "Genome"] genome.blocks = [] idx.eachLine { line -> - if(line.toLowerCase() =~ /^(chr|[0-9]{1,2}|x|y|i|v)/ ) { + if(line.toLowerCase() =~ /^(ch|[0-9]{1,2}|x|y|i|v)/ || line ==~ '${meta.allowedIdPattern}' ) { toks = line.split('\\t| ') genome.blocks += [ "scope": toks[0].replaceFirst("^(C|c)(H|h)(R|r)[_]?",""), "featureType": "linear", "range": [1, toks[1].toInteger()] ] } @@ -318,6 +271,69 @@ process generateGenomeBlocksJSON { """ } +refsChannel3 + // .view { + // """ + // ${it} + // ${it.containsKey('gff3')} + // ${it.containsKey('gtf')} + // ${(it.containsKey('gff3') || it.containsKey('gtf'))} + // ${!(it.containsKey('gff3') || it.containsKey('gtf'))} + // """ + // } + .filter { meta -> meta.containsKey('pep') } + .map { meta -> // DUPLICATE EMISSIONS IF MULTIPLE ANNOTATIONS PER REFERENCE ASSEMBLY + if(meta.pep instanceof Map) { + def repeated = [] + meta.pep.each { k,v -> + def item = meta.subMap(meta.keySet().minus(['pep','gff3','gtf'])) + [pep: v, annotation: k] + if(meta.containsKey('gff3') && meta.gff3.containsKey(k)) { + item.gff3 = meta.gff3."${k}" + } else if(meta.containsKey('gtf') && meta.gtf.containsKey(k)) { + item.gtf = meta.gtf."${k}" + } + repeated << item + } + repeated + } else { + meta + } + } + .flatten() + .branch { meta -> //redirect data sets; ones with pep but without gff/gtf are assumed to be in ENSEMBL format + pep4Conversion: (meta.containsKey('gff3') || meta.containsKey('gtf')) + pepEnsembl: 
!(meta.containsKey('gff3') && !meta.containsKey('gtf')) + [meta, file(meta.pep)] + } + .set { refsWithPepChannel } + +// refsWithPepChannel.pep4Conversion.view { it -> groovy.json.JsonOutput.prettyPrint(jsonGenerator.toJson(it))} + +/* + Only keep "representative" splice form for each gene, + current approach selects longest transcript, + we previously relied on ID suffix ".1", some times "-01" + but some of the more recent Ensembl plants data sets + no longer follow this convention +*/ +process filterForRepresentativePeps { + tag{meta.subMap(['species','version'])} + label 'fastx' + input: + set val(meta), file(pep) from refsWithPepChannel.pepEnsembl + + output: + set val(meta), file("${tag}_repr.pep.gz") into representativePepSeqs4Features, representativePepSeqs4Aliases1, representativePepSeqs4Aliases2 + + script: + tag=getAnnotationTagFromMeta(meta) + cmd = "${pep}".endsWith(".gz") ? "zcat" : "cat" + """ + ${cmd} ${pep} | fasta_formatter | paste - - | filterForRepresentative.awk | gzip -c > ${tag}_repr.pep.gz + [ ! -z \$(zcat ${tag}_repr.pep.gz | head -c1) ] || (echo 'Error! Empty output file! ${tag}_repr.pep.gz'; exit 1) + """ +} + /* Given a FASTA with representative peps and the corresponding gtfgff3 @@ -325,62 +341,46 @@ process generateGenomeBlocksJSON { mimicking the ensembl plants (EP) format for such data - this can then be piped into the same processes which we use for chewing through EP data */ -process convertReprFasta2EnsemblPep { +process convertReprFasta2EnsemblPep { //TODO - NOT WORKING IF ENSEMB-FORMATTED INPUT (should not be used here but need to pass-through if already formatted?) 
tag{tag} - // label 'fastx' + label 'fastx' input: - //val arr from localInput - set (val(meta), file(gtfgff3), file(reprPep)) from localInput + tuple (val(meta), file(gtfgff3), file(reprPep)) from refsWithPepChannel.pep4Conversion + //.filter { meta -> meta.containsKey('pep') && (meta.containsKey('gff3') || meta.containsKey('gtf'))} + .map { meta -> [ meta, file( meta.containsKey('gff3') ? meta.gff3 : meta.gtf ), file( meta.pep ) ] } output: - set val(meta), file(pep) into localPepSeqs4Features, localPepSeqs4Aliases1, localPepSeqs4Aliases2 + tuple val(meta), file('pep.gz') into pepSeqs4Features, pepSeqs4Aliases1, pepSeqs4Aliases2 script: tag=getAnnotationTagFromMeta(meta) //TRIAL RUN? ONLY TAKE FIRST n LINES - cmd = trialLines != null ? "head -n ${trialLines}" : "cat" - if(meta.containsKey("gtfgff3") && (gtfgff3.name).matches(".*gtf\$")) { - // println("MATCHED gtf: "+gtfgff3) + + + cmd0 = "${reprPep}".endsWith(".gz") ? "zcat" : "cat" + cmd1 = "${gtfgff3}".endsWith(".gz") ? "zcat" : "cat" + // if(meta.containsKey("gtfgff3") && (gtfgff3.name).matches(".*gtf\$")) { + if(meta.containsKey("gtf")) { """ - ${cmd} ${reprPep} | fasta_formatter | gtfAndRepr2ensembl_pep.awk -vversion="${meta.version}" - ${gtfgff3} > pep + ${cmd0} ${reprPep} | fasta_formatter | gtfAndRepr2ensembl_pep.awk -vversion="${meta.version}" - <(${cmd1} ${gtfgff3}) | gzip > pep.gz + [ ! -z \$(zcat pep.gz | head -c1) ] || (echo 'Error! Empty output file! 
pep.gz'; exit 1) """ - } else if(meta.containsKey("gtfgff3") && (gtfgff3.name).matches(".*gff(3)?\$")) { //if(meta.containsKey("gff3")) { + } else { //if(meta.containsKey("gtfgff3") && (gtfgff3.name).matches(".*gff(3)?\$")) { //if(meta.containsKey("gff3")) { // println("MATCHED gff3: "+gtfgff3) """ - ${cmd} ${reprPep} | fasta_formatter | gff3AndRepr2ensembl_pep.awk -vversion="${meta.version}" - ${gtfgff3} > pep - """ - } else { //ASSUMING ENSEMBL PLANTS-LIKE FORMATTED PEPTIDE FASTA - // println("NOT MATCHED gtfgff3: "+gtfgff3) - """ - cp --no-dereference ${reprPep} pep + ${cmd0} ${reprPep} | fasta_formatter | gff3AndRepr2ensembl_pep.awk -vversion="${meta.version}" - <(${cmd1} ${gtfgff3}) | gzip > pep.gz + [ ! -z \$(zcat pep.gz | head -c1) ] || (echo 'Error! Empty output file! pep.gz'; exit 1) """ + // } else { //ASSUMING ENSEMBL PLANTS-LIKE FORMATTED PEPTIDE FASTA + // // println("NOT MATCHED gtfgff3: "+gtfgff3) + // """ + // cp --no-dereference ${reprPep} pep + // """ } } -/* - Only keep "representative" splice form for each gene, - current approach selects longest transcript, - we previously relied on ID suffix ".1", some times "-01" - but some of the more recent Ensembl plants data sets - no longer follow this convention -*/ -process filterForRepresentativePeps { - tag{meta.subMap(['species','version'])} - // label 'fastx' - input: - set val(meta), file(pep) from remotePepSeqs - - output: - set val(meta), file("${tag}_repr.pep") into remotePepSeqs4Features, remotePepSeqs4Aliases1, remotePepSeqs4Aliases2 - script: - tag=getAnnotationTagFromMeta(meta) - """ - fasta_formatter < ${pep} | paste - - | filterForRepresentative.awk > ${tag}_repr.pep - [ -s ${tag}_repr.pep ] || (echo 'Error! Empty output file! 
${tag}_repr.pep'; exit 1) - """ -} /* @@ -391,24 +391,32 @@ process generateFeaturesJSON { tag{tag} label 'json' label 'groovy' + echo true + errorStrategy 'terminate' input: - set val(meta), file(pep) from localPepSeqs4Features.mix(remotePepSeqs4Features) + set val(meta), file(pep) from representativePepSeqs4Features.mix(pepSeqs4Features) + // set val(meta), file(pep) from refsChannel2.map { meta -> [meta, file(meta.pep)] } output: file "*.json.gz" into featuresJSON file "*.counts" into featuresCounts script: - tag=getAnnotationTagFromMeta(meta) genome=getDatasetTagFromMeta(meta) shortName = (meta.containsKey("shortName") ? meta.shortName+"_genes" : "") shortName +=(meta.containsKey("annotation") ? "_"+meta.annotation : "") //only for cases where multiple annotations per genome + // """ + // ls -la + // """ """ #!/usr/bin/env groovy + import java.util.zip.GZIPInputStream + import java.util.zip.GZIPOutputStream import static groovy.json.JsonOutput.* + pep = new File('${pep}').text out = new File('${tag}_annotation.json') counts = new File('${tag}_annotation.counts') @@ -432,14 +440,18 @@ process generateFeaturesJSON { annotation.parent = "${genome}" annotation.blocks = [] TreeMap scope = [:] //keep keys sorted as the corresponding blocks get displayed in order in pretzel - pep.eachLine { line -> + def pepStream = new FileInputStream(new File('${pep}')) + def inStream = '${pep}'.endsWith('.gz') ? 
new GZIPInputStream(pepStream , 1024) : pepStream + def content = new BufferedReader(new InputStreamReader(inStream, "UTF-8"), 1024); + while ((line = content.readLine()) != null && !line.isEmpty() ) { + // pep.eachLine { line -> if(line =~ /^>/ ) { toks = line.split() location = toks[2].split(":") gene = toks[3].split(":") - key = location[2].replaceFirst("^(C|c)(H|h)(R|r)[_]?","") + key = location[2].replaceFirst("^(C|c)(H|h)(R|r)?[_]?","") //Skip non-chromosome blocks - if(key.toLowerCase() =~ /^(chr|[0-9]|x|y|i|v)/ ) { + if(key.toLowerCase() =~ /^(ch|[0-9]|x|y|i|v)/ || key ==~ '${meta.allowedIdPattern}' ) { if(!scope.containsKey(key)) { scope << [(key) : []] } @@ -467,30 +479,11 @@ process generateFeaturesJSON { """ } - - -// //REPEAT INPUT FOR EACH SUBGENOME -// localPepSeqs4AliasesRep = Channel.create() -// localPepSeqs4Aliases.subscribe onNext: { -// // println it[0] -// if(it[0].containsKey("subgenomes")) { -// for(subgenome in it[0].subgenomes) { -// clone = it[0].clone() -// clone.subgenome = subgenome -// localPepSeqs4AliasesRep << [clone,it[1]] -// } -// } else { -// localPepSeqs4AliasesRep << it -// } -// }, onComplete: { localPepSeqs4Aliases.close(); localPepSeqs4AliasesRep.close() } - - -//COMBINE AND FILTER DUPLICATED CHANNEL TO ALLOW ALL VS ALL DATASETS COMPARISONS -// remotePepSeqs4AliasesCombined = remotePepSeqs4Aliases1.mix(localPepSeqs4AliasesRepSplit1).combine(remotePepSeqs4Aliases2.mix(localPepSeqs4AliasesRepSplit2)) -pepSeqs4AliasesCombined = remotePepSeqs4Aliases1.mix(localPepSeqs4Aliases1).combine(remotePepSeqs4Aliases2.mix(localPepSeqs4Aliases2)) +pepSeqs4AliasesCombined = representativePepSeqs4Aliases1.mix(pepSeqs4Aliases1).combine(representativePepSeqs4Aliases2.mix(pepSeqs4Aliases2)) .filter { getAnnotationTagFromMeta(it[0]) <= getAnnotationTagFromMeta(it[2]) } //[species,version,file.pep] - -// .collect().subscribe{ println it.combinations().each { a, b -> a[0].species < b[0].species} } + // .first() + // .view{ [it[0].species, 
it[2].species] } + // .view { it -> groovy.json.JsonOutput.prettyPrint(jsonGenerator.toJson(it))} /* * Identify best hit for each pep @@ -501,7 +494,7 @@ process pairProteins { errorStrategy 'ignore' input: - set val(metaA), file('pepA'), val(metaB), file('pepB') from pepSeqs4AliasesCombined + set val(metaA), file('pepA.gz'), val(metaB), file('pepB.gz') from pepSeqs4AliasesCombined output: set val(metaA), val(metaB), file("*.tsv"), file(idlines) into pairedProteins @@ -512,12 +505,12 @@ process pairProteins { meta = ["query": tagA, "target": tagB] basename=tagA+"_VS_"+tagB """ - mmseqs easy-search ${pepA} ${pepB} ${basename}.tsv \${TMPDIR:-/tmp}/${basename} \ + mmseqs easy-search pepA.gz pepB.gz ${basename}.tsv \${TMPDIR:-/tmp}/${basename} \ --format-mode 2 \ -c ${params.minCoverage} \ --min-seq-id ${params.minIdentity} \ --threads ${task.cpus} -v 1 \ - && grep --no-filename '^>' ${pepA} ${pepB} | sed 's/^>//' > idlines + && zcat pepA.gz pepB.gz | grep --no-filename '^>' | sed 's/^>//' > idlines """ //'qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen' } @@ -575,9 +568,6 @@ process generateAliasesJSON { // sed -e '1 i\[' -e '$ i\]' } - - - process stats { label 'summary' label 'jq' @@ -595,13 +585,12 @@ process stats { """ jq -r '.blocks[] | (input_filename, .scope, .range[1])' *_genome.json | paste - - - | sort -V > blocks.counts cat *_annotation.counts | sort -V > feature.counts - cat *_markers.counts | sort -V > markers.counts + cat *_{markers,transcripts,cds,genomic}.counts | sort -V > placed.counts grep "" *_aliases.len > aliases.counts """ //jq '.blocks[]' ${f} | jq 'input_filename, .scope, (.features | length)' | paste - - | sort -V } - process pack { label 'archive' executor 'local' diff --git a/nextflow.config b/nextflow.config index 23be095..3a50665 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,12 +10,23 @@ params { outdir = "./results" publishmode = "copy" //NUMBER OF LINES TO BE PROOCESSED FOR 
TRIAL PURPOSES, DEFAULT null WILL PROCESS WHOLE DATASETS - trialLines = null - //Generated datasets are public by default, use --makePrivate or --make-private to make them private + subset = -1 + //Generated dataset JSON files have 'public' field set to true by default, use --makePrivate or --make-private to set public to false makePrivate = false + + //PROTEIN SEQUENCE ALIGNMENT THRESHOLDS + minIdentity = 0.5 + minCoverage = 0.6 + + //POST PROTEIN ALIGNMENT FILTERING THRESHOLDS FOR GENERATING ALIASES + minIdentityFilter = 0.7 + minCoverageFilter = 0.8 + //Empty params - may be used depending on execution profile or runtime use of -params-file json/yaml sequencesToPlace = [] - localAssembly = [] + assembly = [] + + singularitydir = "./singularity-images" } process { @@ -29,18 +40,36 @@ profiles { //INPUT AND LOGIC BUSCOs { //BUSCO-based pipeline includeConfig 'conf/dna_zoo.config' + process { + withLabel: BUSCO { + container = 'rsuchecki/busco:3.0.2_blast2.8.1' + } + } } EP { //EP+MMSeqs2 pipeline includeConfig 'conf/ensembl-plants.config' } TRITICEAE { //USe with EP+MMSeqs2 pipeline - includeConfig 'conf/ensembl-plants.config' - includeConfig 'conf/ensembl-plants-data.config' + // includeConfig 'conf/ensembl-plants.config' + // includeConfig 'conf/ensembl-plants-data.config' includeConfig 'conf/triticeae.config' } YEAST { includeConfig 'conf/ensembl-yeast.config' } + MICROSPORIDIA { + includeConfig 'conf/microsporidia.config' + } + test { + includeConfig 'conf/test-data.config' + process { + errorStrategy = 'terminate' + maxRetries = 0 + } + // docker { + // runOptions = "-u \$(id -u):\$(id -g)" + // } + } //COMPUTE standard { process.executor = 'local' @@ -50,7 +79,23 @@ profiles { docker { enabled = true fixOwnership = true + //runOptions = '-u $(id -u):$(id -g)' + // runOptions = '-u \$(id -u):\$(id -g)' + // runOptions = '-u root:root' + } + // process { + // withLabel: groovy { + // containerOptions = '--volume "$PWD":/home/groovy/.groovy' //otherwise 
grabbing grapes may fail with read-only filesystem error + // } + // } + } + CI { + docker.runOptions = '-u root:root' //apparently required for GH actions but only causes problems with process using bin/paf2pretzel.groovy due to Ivy limitations + process { + errorStrategy = 'terminate' + maxRetries = 0 } + includeConfig 'conf/microsporidia.config' } modules { includeConfig 'conf/modules.config' @@ -67,11 +112,13 @@ profiles { singularity { enabled = true autoMounts = true - cacheDir = "${HOME}/singularity-images" //when distibuting the pipeline probably should point under $workDir + cacheDir = "${params.singularitydir}" //when distibuting the pipeline probably should point under $workDir + } + process { + withLabel: groovy { + containerOptions = '-B "$PWD":/home/groovy/.groovy' //otherwise grabbing grapes may fail with read-only filesystem error + } } - } - singularitymodule { - process.module = 'singularity/3.0.3' } } diff --git a/pull_containers.nf b/pull_containers.nf new file mode 100644 index 0000000..e8dd3d4 --- /dev/null +++ b/pull_containers.nf @@ -0,0 +1,33 @@ +#!/usr/bin/env nextflow + +import nextflow.util.Escape +import nextflow.container.SingularityCache + +def containers = [] +session.getConfig().process.each {k, v -> + if((k.startsWith('withLabel:') || k.startsWith('withName:')) && v.containsKey('container')) { + println "$k -> $v.container" + containers << v.container + } +} + +SingularityCache scache = new SingularityCache() //to get NF-consitent image file names + +process pull_container { + tag { remote } + maxForks 1 + storeDir "${params.singularitydir}" + echo true + +input: + val(remote) from Channel.from(containers) + +output: + file(img) + +script: +img = scache.simpleName(remote) +""" +singularity pull --name ${img} docker://${Escape.path(remote)} +""" +} \ No newline at end of file