Skip to content

Commit

Permalink
Edge v3 (#16)
Browse files Browse the repository at this point in the history
* initial commit

* Prokka and RATT running

* generating plots and KEGG pathway views

* Full workflow, with explicit outputs and TODOs resolved.

* Containerized

* comments

* Nf phage finder (#9)

Merging two workflows with related functionality.

* proof of concept: linked runAnnotation and phageFinder, with phageFinder able to be turned on or off

* restructured early pipeline. added nf-test infrastructure

* testing added for sra2fastq

* Testing for the first few modules

* integrated hostRemoval

* Edits to match EDGE v3 UI

* Nf composition (#15)

This merge will change some workflow inputs, so it'll require additional commits to fix, but this is necessary for the repository restructuring happening.

* runAssembly in pipeline

* host removal testing + cleanup

* tests for runAssembly

* reads to contig restructured as subworkflow. adding nf-test CI

* fixing JDK version for CI

* testing Apptainer for CI

* cleanup

* updated snapshots for JDK 17

* adding debugging output

* more testing output for GH actions

* added Git LFS to testing yml

* adding sharding to tests - checking to see if this resolves space issues with the runner

* removing files from LFS

* basic testing for runReadsToContig

* adding nf-test file

* attempting optimized testing requirements

* reverting testing strategy

* host removal testing accounts for inconsistent file naming

* updated tests for EDGEv3 inputs

* fixed host removal for edgev3 settings

* checking if testing issues related to ubuntu version

* Actions testing uses compatible Apptainer version for ubuntu 24.04

* fixed badly merged snapshot for sra2fastq

* continued snapshot fix
  • Loading branch information
aw-watson authored Jan 7, 2025
1 parent 4957dc2 commit be19942
Show file tree
Hide file tree
Showing 14 changed files with 186 additions and 767 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ jobs:
java-version: '17'
distribution: 'adopt'

- name: Set up Apptainer 1.3.0
- name: Set up Apptainer 1.3.6
uses: eWaterCycle/setup-apptainer@v2
with:
apptainer-version: 1.3.0
apptainer-version: 1.3.6


- name: Setup Nextflow 24.10.1
uses: nf-core/setup-nextflow@v1
with:
version: "24.10.1"
version: "24.10.3"

- name: Install nf-test
run: |
Expand Down
22 changes: 11 additions & 11 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,30 @@ include {READSTOCONTIGS} from './modules/runReadsToContig/runReadsToContig.nf'

workflow {

//input specification

pairedFiles = channel.fromPath(params.pairedFiles, checkIfExists:true)
unpairedFiles = channel.fromPath(params.unpairedFiles, checkIfExists:true)
//input specification
fastqFiles = channel.fromPath(params.shared.inputFastq, checkIfExists:true)
contigs = channel.empty()
if(params.r2c.useAssembledContigs) {
contigs = channel.fromPath(params.inputContigs, checkIfExists:true)
contigs = channel.fromPath(params.shared.inputContigs, checkIfExists:true)
}


if(params.modules.sra2fastq) {
SRA2FASTQ(params.sra2fastq.plus(params.shared))
pairedFiles = pairedFiles.concat(SRA2FASTQ.out.paired).flatten()
unpairedFiles = unpairedFiles.concat(SRA2FASTQ.out.unpaired).flatten()
fastqFiles = fastqFiles.concat(SRA2FASTQ.out.fastq).flatten()
}

COUNTFASTQ(pairedFiles.collect(), unpairedFiles.collect())
COUNTFASTQ(params.shared, fastqFiles.collect())

avgLen = COUNTFASTQ.out.avgReadLen
paired = COUNTFASTQ.out.paired.ifEmpty(params.pairedFiles)
unpaired = COUNTFASTQ.out.unpaired.ifEmpty(params.unpairedFiles)
fastqFiles = COUNTFASTQ.out.fastqFiles


paired = channel.empty()
unpaired = channel.empty()
if(params.modules.faqcs) {
FAQCS(params.faqcs.plus(params.shared),paired,unpaired,avgLen)
FAQCS(params.faqcs.plus(params.shared), fastqFiles,avgLen)

paired = FAQCS.out.paired.ifEmpty(params.pairedFiles)
unpaired = FAQCS.out.unpaired.ifEmpty(params.unpairedFiles)
}
Expand Down
40 changes: 13 additions & 27 deletions modules/countFastq/countFastq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,26 @@ process countFastq {
label "countFastq"

input:
path paired
path unpaired
val settings
path fastq

output:
path "fastqCount.txt", emit: counts
path "all.{1,2}.fastq", emit: allPaired, optional:true
path "all.se.fastq", emit: allUnpaired, optional:true
path "all.*.fastq", emit: allFiles

script:

if(paired.size() > 1 && paired[0] =~ /NO_FILE/) {
paired = paired.tail().join(" ")
file_list = ""
if(settings["pairedFile"]) {
file_list = "-p $fastq"
}
else {
paired = paired.join(" ")
file_list = "-u $fastq"
}
if(unpaired.size() > 1 && unpaired[0] =~ /NO_FILE/) {
unpaired = unpaired.tail().join(" ")
}
else {
unpaired = unpaired.join(" ")
}


paired_list = paired.startsWith("NO_FILE") ? "" : "-p ${paired}"
unpaired_list = unpaired.startsWith("NO_FILE2") ? "" : "-u ${unpaired}"

"""
getAvgLen.pl\
$paired_list\
$unpaired_list\
$file_list\
-d .
"""
}
Expand Down Expand Up @@ -70,19 +59,16 @@ process avgLen {
//calculates average read length and concatenates input files
workflow COUNTFASTQ {
take:
pairedFiles
unpairedFiles
settings
inputFastq

main:

countFastq(pairedFiles, unpairedFiles)
countFastq(settings, inputFastq)
avgReadLen = avgLen(countFastq.out.counts)
paired = countFastq.out.allPaired
unpaired = countFastq.out.allUnpaired

fastqFiles = countFastq.out.allFiles

emit:
avgReadLen
paired
unpaired
fastqFiles
}
13 changes: 10 additions & 3 deletions modules/hostRemoval/hostRemoval.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,17 @@ process hostRemoval {
def prefix = "-prefix ${ref.name.take(ref.name.lastIndexOf('.'))}.clean "
def similarity = settings["similarity"] != null ? "-similarity ${settings["similarity"]} " : ""
def minScore = settings["bwaMemOptions"] != null ? "${settings["bwaMemOptions"]} " : "-T 50 "
def ontFlag = settings["fastqSource"].equalsIgnoreCase("nanopore") ? "-x ont2d " : ""
ontFlag = settings["fastqSource"].equalsIgnoreCase("pacbio") ? "-x pacbio " : ontFlag
ontFlag = ""
if(settings["fastqSource"] != null) {
if(settings["fastqSource"].equalsIgnoreCase("nanopore")) {
ontFlag = "-x ont2d "
}
else if(settings["fastqSource"].equalsIgnoreCase("pacbio")) {
ontFlag = "-x pacbio "
}
}
minScore = ontFlag != "" ? "-T ${settings["minLen"]} " : minScore
def bwaMemOptions = "-bwaMemOptions \"${ontFlag} ${minScore}\" "
bwaMemOptions = "-bwaMemOptions \"${ontFlag} ${minScore}\" "
def cpu = settings["cpus"] != null ? "-cpu ${settings["cpus"]} " : ""

"""
Expand Down
116 changes: 25 additions & 91 deletions modules/runFaQCs/runFaQCs.nf
Original file line number Diff line number Diff line change
@@ -1,49 +1,5 @@
#!/usr/bin/env nextflow

//plotting for trimmed reads from ONT
process nanoplot {
label "qc"
publishDir(
path: "${settings["outDir"]}/QcReads",
mode: 'copy'
)
input:
val settings
path unpaired

output:
path "*" //lots of output plots

script:
"""
NanoPlot --fastq $unpaired --N50 --loglength -t ${settings["cpus"]} -f pdf --outdir . 2>/dev/null
"""

}


//Porechop for removing adapters from ONT or PacBio reads
process porechop {
label "qc"
publishDir(
path: "${settings["outDir"]}/QcReads",
mode: 'copy'
)


input:
val settings
path trimmed
path log
output:
path "*.porechop.fastq", emit: porechopped

script:
"""
porechop -i $trimmed -o ./QC.unpaired.porechop.fastq -t ${settings["cpus"]} > $log
"""
}

//double-checks that any provided adapter file is in FASTA format
process adapterFileCheck {
label "qc"
Expand All @@ -60,8 +16,6 @@ process adapterFileCheck {
}

//main QC process. puts parameters together and runs FaQCs.
//EDGE currently uses a custom script (illumina_fastq_QC.pl) to handle QC for long reads,
//but it was unable to create report files when I attempted using it. For now, all input reads go through FaQCs.
process qc {
label "qc"
publishDir(
Expand All @@ -71,8 +25,7 @@ process qc {

input:
val settings
path paired
path unpaired
path fastq
val validAdapter
path adapter
val avgLen
Expand All @@ -86,84 +39,65 @@ process qc {

script:
//adjust minLength
def min = settings["minLength"]
if(settings["minLength"] < 1) {
min = Math.abs(settings["minLength"] * avgLen.toInteger())
def min = settings["minLen"]
if(settings["minLen"] < 1) {
min = Math.abs(settings["minLen"] * avgLen.toInteger())
}

def qcSoftware = "FaQCs"
// if(params.ontFlag || params.pacbioFlag) {
// qcSoftware = "illumina_fastq_QC.pl"
// }
def pairedArg = paired.name != "NO_FILE" ? "-1 ${paired[0]} -2 ${paired[1]}" : ""
// if(pairedArg != "" && (params.ontFlag || params.pacbioFlag)) {
// pairedArg = "-p $paired"
// }
def unpairedArg = unpaired.name != "NO_FILE2" ? "-u $unpaired" : ""


def inputArg = settings["pairedFile"] ? "-1 ${fastq[0]} -2 ${fastq[1]}" : "-u $fastq"

def adapterArg = ""
if(adapter.name != "NO_FILE3" && validAdapter == "Yes"){
adapterArg = "--adapter --artifactFile $adapter"
}

def polyA = settings["polyA"] ? "--polyA" : ""
def trim = ""
// if(params.ontFlag || params.pacbioFlag) {
// trim = "--trim_only"
// }
def ascii = settings["phredOffset"] != null ? "--ascii ${settings["phredOffset"]}" : ""
def phiX = settings["filtPhiX"] ? "--phiX" : ""

"""
$qcSoftware $pairedArg $unpairedArg \
-q ${settings["qualityCutoff"]} --min_L $min --avg_q ${settings["avgQuality"]} \
-n ${settings["numN"]} --lc ${settings["lowComplexity"]} --5end ${settings["cut5end"]} --3end ${settings["cut3end"]} \
--split_size ${settings["splitSize"]} -d . -t ${settings["cpus"]} \
$qcSoftware $inputArg \
-q ${settings["trimQual"]} --min_L $min --avg_q ${settings["avgQual"]} \
-n ${settings["numN"]} --lc ${settings["filtLC"]} --5end ${settings["trim5end"]} --3end ${settings["trim3end"]} \
--split_size 1000000 -d . -t ${settings["cpus"]} \
$polyA \
$trim \
$adapterArg \
$ascii \
$phiX
1>QC.log 2>&1
"""
}

workflow FAQCS {
take:
settings
paired
unpaired
fastq
avgLen


main:

//adapter setup
adapter_ch = channel.fromPath(settings["adapter"], checkIfExists:true)
adapter_ch = channel.fromPath(settings["artifactFile"], checkIfExists:true)
//checks to see if the provided adapter file is a valid FASTA
adapterFileCheck(adapter_ch)

//main QC process
qc(settings, paired, unpaired, adapterFileCheck.out, adapter_ch, avgLen)
qc(settings, fastq, adapterFileCheck.out, adapter_ch, avgLen)


trimmed = channel.empty()
if(settings["pairedFile"]) {
trimmed = qc.out.pairedQC
}
else {
trimmed = qc.out.unpairedQC
}
paired = qc.out.pairedQC
unpaired = qc.out.unpairedQC

//long read trimming and plotting
if(settings["ontFlag"]) {
nanoplot_ch = channel.empty()
if(settings["porechop"]) {
porechop(settings, unpaired, qc.out.log)
nanoplot(settings, porechop.out.porechopped)
unpaired = porechop.out.porechopped
}
else {
nanoplot(settings, unpaired_ch)
unpaired = porechop.out.porechopped

}
}

emit:
paired
unpaired
trimmed

}
19 changes: 9 additions & 10 deletions modules/sra2fastq/sra2fastq.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env nextflow
//to run: nextflow [OPT: -log /path/to/log file] run sra2fastq.nf -params-file [JSON parameter file]
//to run: nextflow run sra2fastq.nf -params-file [JSON parameter file]
//not supporting filesize or run count restrictions


Expand All @@ -9,7 +9,8 @@ process sraDownload {
tag "$accession"
publishDir "${settings["outDir"]}/SRA_Download", mode: 'copy'

//retries download in case of transient failure, then completes any processes that didn't fail
//retries download in case of transient failure, then completes any downloads that didn't fail

maxRetries 3
errorStrategy { (task.attempt <= maxRetries) ? 'retry' : 'finish' }

Expand All @@ -19,15 +20,14 @@ process sraDownload {
val settings

output:
path "$accession/${accession}.fastq.gz", emit: unpairedSRA, optional:true
path "$accession/${accession}_{1,2}.fastq.gz", emit: pairedSRA, optional:true
path "$accession/${accession}*.fastq.gz", emit: files
path "$accession/${accession}_metadata.txt"
path "$accession/sra2fastq_temp/*", optional: true //needed output?

script:
//conditionally create command-line options based on non-empty parameters, for use in the command below
def clean = settings["clean"] != null ? "--clean True" : ""
def platform_restrict = settings["platformRestrict"] != null ? "--platform_restrict ${settings["platformRestrict"]}" : ""
def clean = settings["clean"] ? "--clean True" : ""
def platform_restrict = settings["fastqSource"] != null ? "--platform_restrict ${settings["fastqSource"]}" : ""

//invoke sra2fastq.py with those options
"""
Expand All @@ -46,10 +46,9 @@ workflow SRA2FASTQ {
accessions_ch = channel.of(settings["accessions"])
sraDownload(accessions_ch.flatten().unique(), settings)

paired = sraDownload.out.pairedSRA
unpaired = sraDownload.out.unpairedSRA
fastq = sraDownload.out.files

emit:
paired
unpaired
fastq

}
Loading

0 comments on commit be19942

Please sign in to comment.