Skip to content

Commit

Permalink
Merge pull request #347 from UPHL-BioNGS/update-20240702
Browse files Browse the repository at this point in the history
Update 20240702
  • Loading branch information
erinyoung authored Jul 9, 2024
2 parents c3740a9 + 5be5d8c commit 6f3584d
Show file tree
Hide file tree
Showing 25 changed files with 860 additions and 621 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/check_versions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ jobs:
echo "New version for $base! Upgrade to $latest_version from $workflow_version." | tee -a versions.txt
issue_text="$issue_text<br>- $base from $workflow_version to $latest_version "
fi
docker rmi $base:latest
done
latest_nextclade_version=$(docker run nextstrain/nextclade:latest nextclade --version | awk '{print $2}')
Expand Down
137 changes: 132 additions & 5 deletions .github/workflows/github_actions.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,134 @@
params.vadr = false

process {
withName:ivar_consensus{
memory = '4 GB'
}
errorStrategy = { task.attempt < 2 ? 'retry' : 'terminate'}
withName:aci{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:artic{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:artic_read_filtering{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:bbnorm{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:bcftools_variants{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:bwa{
publishDir = [ path: "cecret", mode: 'link', pattern: 'logs/*/*log' ]
}
withName:download{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:fasta_prep{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:summary{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:unzip{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:fastp{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:fastqc{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:freyja_variants{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:freyja_demix{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:freyja_aggregate{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:heatcluster{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:igv_reports{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:iqtree2{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:ivar_consensus{
memory = '4 GB'
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:ivar_variants{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:ivar_trim{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:kraken2{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:mafft{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:minimap2{
publishDir = [ path: "cecret", mode: 'link', pattern: 'logs/*/*log' ]
}
withName:multiqc_combine{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:nextclade_dataset{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:nextclade{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:pango_collapse{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:pangolin{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:phytreeviz{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_stats{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_coverage{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_flagstat{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_depth{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_ampliconstats{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_plot_ampliconstats{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_sort{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_filter{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_ampliconclip{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:samtools_markdup{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:seqyclean{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:snpdists{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
withName:vadr{
publishDir = [ path: "cecret", mode: 'link', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }]
}
}

4 changes: 4 additions & 0 deletions .github/workflows/test_kraken2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,9 @@ jobs:
cat cecret*/cecret_results.txt
- name: Kraken2 results
run: |
wc -l cecret/kraken2/*_kraken2_report.txt
- name: Clean
run: rm -rf work .nextflow*
37 changes: 37 additions & 0 deletions .github/workflows/test_mpx_yale.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: Test mpx workflow with yale primers

on: [pull_request, workflow_dispatch]

run-name: mpx_yale

jobs:

test:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@master

- name: Install Nextflow
run: |
wget -qO- get.nextflow.io | bash
sudo mv nextflow /usr/local/bin/
- name: Download reads
run: |
mkdir reads
cd reads
wget -q ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR206/024/SRR20689724/SRR20689724_1.fastq.gz
wget -q ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR206/024/SRR20689724/SRR20689724_2.fastq.gz
cd ../
- name: Run Cecret
run: |
nextflow run . -profile docker,mpx_yale -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2
ls cecret*
head cecret*/cecret_results.txt
- name: Clean
run: rm -rf work .nextflow*
3 changes: 2 additions & 1 deletion .github/workflows/test_primers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ jobs:
'ncov_V3',
'ncov_V4',
'ncov_V4.1',
'ncov_V5.3.2'
'ncov_V5.3.2',
'mpx_yale'
]

steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_profile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Run Cecret
run: |
nextflow run . -profile docker,test -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2 --cleaner 'fastp' --aligner 'minimap2' --mpileup_depth 200 --vadr false
nextflow run . -profile docker,test -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2 --cleaner 'fastp' --aligner 'minimap2' --mpileup_depth 200
ls cecret*
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_profile1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Run Cecret
run: |
nextflow run . -profile docker,test1 -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2 --cleaner 'fastp' --aligner 'minimap2' --mpileup_depth 200 --vadr false
nextflow run . -profile docker,test1 -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2 --cleaner 'fastp' --aligner 'minimap2' --mpileup_depth 200
ls cecret*
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_profile2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Run Cecret
run: |
nextflow run . -profile docker,test2 -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2 --cleaner 'fastp' --aligner 'minimap2' --mpileup_depth 200 --vadr false
nextflow run . -profile docker,test2 -c .github/workflows/github_actions.config --maxcpus 2 --medcpus 2 --cleaner 'fastp' --aligner 'minimap2' --mpileup_depth 200
ls cecret*
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,17 @@ params.minimum_depth = 10
The defaults for Cecret continue to be for SARS-CoV-2, but there are growing demands for a workflow for Monkeypox Virus. As such, there are a few parameters that might benefit the **End User**.

### Using the Monkeypox profile
There are three profiles for Monkeypox Virus sequencing : `mpx`, `mpx_idt` and `mpx_primalseq`. The `mpx` profile has some defaults for a metagenomic-type sequencing, while `mpx_idt` is for libraries prepped with [IDT](https://www.idtdna.com/)'s primers, and `mpx_primalseq` which has been [validated](https://www.medrxiv.org/content/10.1101/2022.10.14.22280783v1.full-text) with Illumina library prep methods and sequencing platforms.
There are four profiles for Monkeypox Virus sequencing: `mpx`, `mpx_idt`, `mpx_yale`, and `mpx_primalseq`. The `mpx` profile has some defaults for a metagenomic-type sequencing, `mpx_idt` is for libraries prepped with [IDT](https://www.idtdna.com/)'s primers, `mpx_yale` is for primalseq amplicons called against Yale's reference, and `mpx_primalseq` has been [validated](https://www.medrxiv.org/content/10.1101/2022.10.14.22280783v1.full-text) with Illumina library prep methods and sequencing platforms.
```
# metagenomic
nextflow run UPHL-BioNGS/Cecret -profile singularity,mpx
# using IDT's primers
nextflow run UPHL-BioNGS/Cecret -profile singularity,mpx_idt
# using primalseq with Yale's reference
nextflow run UPHL-BioNGS/Cecret -profile singularity,mpx_yale
# using Illumina library prep methods and sequencing platforms
nextflow run UPHL-BioNGS/Cecret -profile singularity,mpx_primalseq
```
Expand Down
35 changes: 22 additions & 13 deletions bin/combine_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
aci_file = 'aci_coverage_summary.csv'
ampliconstats_file = 'ampliconstats.summary'
samtools_coverage_file = 'samtools_coverage_summary.tsv'
pangolin_file = 'multiqc_data/multiqc_pangolin.txt'
pangolin_file = 'lineage_report.csv'
pango_collapse_file = 'pango_collapse.csv'
nextclade_file = 'multiqc_data/multiqc_nextclade.txt'
nextclade_file = 'nextclade.csv'
vadr_file = 'vadr.csv'
fastp_file = 'multiqc_data/multiqc_general_stats.txt'
fastq_names_file = 'fastq_names.csv'
Expand Down Expand Up @@ -277,9 +277,17 @@
summary_df = summary_df.drop('sample', axis=1)
columns = columns + ['samtools_meandepth_after_trimming', 'samtools_per_1X_coverage_after_trimming']

def vadr_sample_name(s):
if s.count('.') >=1:
if len(s.split(".")[-1]) > 2:
return ''.join(s.split(".")[:-1])
return s

if exists(vadr_file) :
print("Getting results from vadr file " + vadr_file)
vadr_df = pd.read_csv(vadr_file, dtype = str, usecols = ['name', 'p/f', 'model', 'alerts'], index_col= False)
vadr_df = vadr_df[vadr_df['name'] != 'name']
vadr_df = vadr_df[vadr_df['name'] != 'seq']
vadr_df = vadr_df.add_prefix('vadr_')
vadr_columns = list(vadr_df.columns)
vadr_columns.remove('vadr_name')
Expand All @@ -291,7 +299,8 @@
summary_df.drop('vadr_name', axis=1, inplace=True)
columns = ['vadr_p/f'] + columns + vadr_columns
else:
vadr_df['sample_match'] = vadr_df['vadr_name'].str.replace('Consensus_', '', regex = False).str.split(".").str[0]
vadr_df['sample_match'] = vadr_df['vadr_name'].str.replace('Consensus_', '', regex = False).apply(vadr_sample_name)

summary_df = pd.merge(summary_df, vadr_df, left_on = 'sample_id', right_on = 'sample_match', how = 'outer')
summary_df['sample_id'].fillna(summary_df['sample_match'], inplace=True)
summary_df.drop('vadr_name', axis=1, inplace=True)
Expand All @@ -301,42 +310,42 @@
if exists(nextclade_file) :
print("Getting results from nextclade file " + nextclade_file)

use_cols = ['Sample', 'clade', 'qc_overallstatus', 'qc_overallscore']
use_cols = ['seqName', 'clade', 'qc.overallStatus', 'qc.overallScore']

first = pd.read_table(nextclade_file, sep = '\t' , dtype = str, nrows=1)
first = pd.read_table(nextclade_file, sep = ';' , dtype = str, nrows=1)
if 'clade_who' in first.columns:
use_cols.append('clade_who')
if 'outbreak' in first.columns:
use_cols.append('outbreak')
if 'lineage' in first.columns:
use_cols.append('lineage')

nextclade_df = pd.read_table(nextclade_file, sep = '\t' , dtype = str, usecols = use_cols)
nextclade_df = pd.read_table(nextclade_file, sep = ';' , dtype = str, usecols = use_cols)
nextclade_df=nextclade_df.add_prefix('nextclade_')
nextclade_columns = list(nextclade_df.columns)
nextclade_df['sample_match'] = nextclade_df['nextclade_Sample'].str.replace('Consensus_', '', regex = False)
nextclade_columns.remove('nextclade_Sample')
nextclade_df['sample_match'] = nextclade_df['nextclade_seqName'].str.replace('Consensus_', '', regex = False).str.split(' ').str[0]
nextclade_columns.remove('nextclade_seqName')
nextclade_columns.remove('nextclade_clade')

summary_df = pd.merge(summary_df, nextclade_df, left_on = 'sample_id', right_on = 'sample_match', how = 'outer')
summary_df['sample_id'].fillna(summary_df['sample_match'], inplace=True)
summary_df.drop('nextclade_Sample', axis=1, inplace=True)
summary_df.drop('nextclade_seqName', axis=1, inplace=True)
summary_df.drop('sample_match', axis = 1, inplace = True )
columns = ['nextclade_clade'] + columns + nextclade_columns

if exists(pangolin_file) :
print("Getting results from pangolin file " + pangolin_file)

pangolin_df = pd.read_table(pangolin_file, dtype = str)
pangolin_df = pd.read_csv(pangolin_file, dtype = str)
pangolin_df = pangolin_df.add_prefix('pangolin_')
pangolin_columns = list(pangolin_df.columns)
pangolin_df['sample_match'] = pangolin_df['pangolin_Sample'].str.replace('Consensus_', '', regex= False)
pangolin_columns.remove('pangolin_Sample')
pangolin_df['sample_match'] = pangolin_df['pangolin_taxon'].str.replace('Consensus_', '', regex= False).str.split(' ').str[0]
pangolin_columns.remove('pangolin_taxon')
pangolin_columns.remove('pangolin_lineage')

summary_df = pd.merge(summary_df, pangolin_df, left_on = 'sample_id', right_on = 'sample_match', how = 'outer')
summary_df['sample_id'].fillna(summary_df['sample_match'], inplace=True)
summary_df.drop('pangolin_Sample', axis=1, inplace=True)
summary_df.drop('pangolin_taxon', axis=1, inplace=True)
summary_df.drop('sample_match', axis=1, inplace=True)
columns = ['pangolin_lineage'] + columns + pangolin_columns

Expand Down
2 changes: 1 addition & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ ch_reads.ifEmpty { println("No fastq or fastq.gz files were found at ${param

workflow CECRET {
ch_for_dataset = Channel.empty()
ch_for_version = Channel.from("Cecret version", workflow.manifest.version).first()
ch_for_version = Channel.from("Cecret version", workflow.manifest.version).collect()
ch_prealigned = Channel.empty()
ch_versions = Channel.empty()

Expand Down
2 changes: 1 addition & 1 deletion modules/artic.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ process artic {
# time stamp + capturing tool versions
date > \$log
artic --version >> \$log
artic_version=\$(artic --version)
artic_version=\$(artic --version | awk '{print \$NF}')
cp ${reference} schema/cecret/V1/cecret.reference.fasta
cp ${bed} schema/cecret/V1/cecret.scheme.bed
Expand Down
2 changes: 1 addition & 1 deletion modules/bwa.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ process bwa {
# time stamp + capturing tool versions
date > \$log
echo "bwa \$(bwa 2>&1 | grep Version )" >> \$log
bwa_version="bwa : "\$(bwa 2>&1 | grep Version)
bwa_version=\$(bwa 2>&1 | grep Version | awk '{print \$NF}')
# index the reference fasta file
bwa index ${reference_genome}
Expand Down
4 changes: 2 additions & 2 deletions modules/fastp.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ process fastp {
# time stamp + capturing tool versions
date > \$log
fastp --version >> \$log
cleaner_version="\$(fastp --version 2>&1 | head -n 1)"
cleaner_version=\$(fastp --version 2>&1 | awk '{print \$NF}')
fastp ${args} \
-i ${reads[0]} \
Expand All @@ -63,7 +63,7 @@ process fastp {
# time stamp + capturing tool versions
date > \$log
fastp --version >> \$log
cleaner_version="\$(fastp --version 2>&1 | head -n 1)"
cleaner_version=\$(fastp --version 2>&1 | awk '{print \$NF}')
fastp ${args} \
-i ${reads} \
Expand Down
Loading

0 comments on commit 6f3584d

Please sign in to comment.