diff --git a/modules/nf-core/last/lastal/main.nf b/modules/nf-core/last/lastal/main.nf index edf92a38461..560ada11b32 100644 --- a/modules/nf-core/last/lastal/main.nf +++ b/modules/nf-core/last/lastal/main.nf @@ -13,6 +13,7 @@ process LAST_LASTAL { output: tuple val(meta), path("*.maf.gz"), emit: maf + tuple val(meta), path("*.tsv") , emit: multiqc path "versions.yml" , emit: versions when: @@ -25,15 +26,33 @@ process LAST_LASTAL { """ INDEX_NAME=\$(basename \$(ls $index/*.des) .des) set -o pipefail + + function calculate_psl_metrics() { + awk 'BEGIN { + FS="\t"; # Set field separator as tab + totalMatches = 0; + totalAlignmentLength = 0; + print "Sample\tTotalAlignmentLength\tPercentSimilarity"; # Header for MultiQC + } + { + totalMatches += \$1 + \$3; # Sum matches and repMatches + totalAlignmentLength += \$1 + \$2 + \$3 + \$6 + \$8; # Sum matches, misMatches, repMatches, qBaseInsert, and tBaseInsert + } + END { + percentSimilarity = (totalAlignmentLength > 0) ? (totalMatches / totalAlignmentLength * 100) : 0; + print "$meta.id" "\t" totalAlignmentLength "\t" percentSimilarity; # Data in TSV format + }' + } + lastal \\ -P $task.cpus \\ $trained_params \\ $args \\ ${index}/\$INDEX_NAME \\ - $fastx \\ - | gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz - # gzip needs --no-name otherwise it puts a timestamp in the file, - # which makes its checksum non-reproducible. + $fastx | + tee >(gzip --no-name > ${prefix}.maf.gz) | + maf-convert psl | + calculate_psl_metrics > ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -48,6 +67,7 @@ process LAST_LASTAL { """ INDEX_NAME=STUB echo stub | gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz + touch ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/last/lastal/meta.yml b/modules/nf-core/last/lastal/meta.yml index 7e416f7f342..c14fa27989d 100644 --- a/modules/nf-core/last/lastal/meta.yml +++ b/modules/nf-core/last/lastal/meta.yml @@ -46,6 +46,10 @@ output: type: file description: Gzipped MAF (Multiple Alignment Format) file pattern: "*.{maf.gz}" + - multiqc: + type: file + description: Alignment summary for MultiQC + pattern: "*.tsv" authors: - "@charles-plessy" maintainers: diff --git a/modules/nf-core/last/lastal/tests/main.nf.test.snap b/modules/nf-core/last/lastal/tests/main.nf.test.snap index ba054b089a8..9245a96252e 100644 --- a/modules/nf-core/last/lastal/tests/main.nf.test.snap +++ b/modules/nf-core/last/lastal/tests/main.nf.test.snap @@ -8,10 +8,19 @@ "id": "contigs", "single_end": false }, - "contigs.genome.maf.gz:md5,902274b72657f62d270d284dc211aa7f" + "contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f" ] ], "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74" + ] + ], + "2": [ "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" ], "maf": [ @@ -20,7 +29,16 @@ "id": "contigs", "single_end": false }, - "contigs.genome.maf.gz:md5,902274b72657f62d270d284dc211aa7f" + "contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f" + ] + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74" ] ], "versions": [ @@ -32,7 +50,7 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T23:11:59.764152" + "timestamp": "2024-07-02T17:57:48.589408" }, "sarscov2 - contigs - genome - stub": { "content": [ @@ -47,6 +65,15 @@ ] ], "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" ], "maf": [ @@ -58,6 +85,15 @@ "contigs.STUB.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" ] ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], "versions": [ "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" ] @@ -67,7 +103,7 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T23:12:43.028075" + "timestamp": "2024-07-02T17:58:30.521811" }, "sarscov2 - contigs - genome - withparams": { "content": [ @@ -78,10 +114,19 @@ "id": "contigs", "single_end": false }, - "contigs.genome.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d" + "contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d" ] ], "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200" + ] + ], + "2": [ "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" ], "maf": [ @@ -90,7 +135,16 @@ "id": "contigs", "single_end": false }, - "contigs.genome.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d" + "contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d" + ] + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200" ] ], "versions": [ @@ -102,6 +156,6 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T23:12:21.536568" + "timestamp": "2024-07-02T17:58:09.677672" } } \ No newline at end of file diff --git a/modules/nf-core/last/split/main.nf b/modules/nf-core/last/split/main.nf index 4b29d912fb4..410d16f1d27 100644 --- a/modules/nf-core/last/split/main.nf +++ b/modules/nf-core/last/split/main.nf @@ -12,6 +12,7 @@ process LAST_SPLIT { output: tuple val(meta), path("*.maf.gz"), emit: maf + tuple val(meta), path("*.tsv") , emit: multiqc path "versions.yml" , emit: versions when: @@ -23,7 +24,29 @@ process LAST_SPLIT { if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ set -o pipefail - zcat < $maf | last-split $args | gzip --no-name > ${prefix}.maf.gz + + function calculate_psl_metrics() { + awk 'BEGIN { + FS="\t"; # Set field separator as tab + totalMatches = 0; + totalAlignmentLength = 0; + print "Sample\tTotalAlignmentLength\tPercentSimilarity"; # Header for MultiQC + } + { + totalMatches += \$1 + \$3; # Sum matches and repMatches + totalAlignmentLength += \$1 + \$2 + \$3 + \$6 + \$8; # Sum matches, misMatches, repMatches, qBaseInsert, and tBaseInsert + } + END { + percentSimilarity = (totalAlignmentLength > 0) ? (totalMatches / totalAlignmentLength * 100) : 0; + print "$meta.id" "\t" totalAlignmentLength "\t" percentSimilarity; # Data in TSV format + }' + } + + zcat < $maf | + last-split $args | + tee >(gzip --no-name > ${prefix}.maf.gz) | + maf-convert psl | + calculate_psl_metrics > ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -37,6 +60,7 @@ process LAST_SPLIT { if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ echo stub | gzip --no-name > ${prefix}.maf.gz + touch ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/last/split/meta.yml b/modules/nf-core/last/split/meta.yml index 03b030f9bbe..2e23f8b4c6a 100644 --- a/modules/nf-core/last/split/meta.yml +++ b/modules/nf-core/last/split/meta.yml @@ -40,6 +40,10 @@ output: type: file description: Multiple Aligment Format (MAF) file, compressed with gzip pattern: "*.{maf.gz}" + - multiqc: + type: file + description: Alignment summary for MultiQC + pattern: "*.tsv" authors: - "@aleksandrabliznina" - "@charles-plessy" diff --git a/modules/nf-core/last/split/tests/main.nf.test.snap b/modules/nf-core/last/split/tests/main.nf.test.snap index 85a9c087a03..953a6542a80 100644 --- a/modules/nf-core/last/split/tests/main.nf.test.snap +++ b/modules/nf-core/last/split/tests/main.nf.test.snap @@ -11,6 +11,14 @@ ] ], "1": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8" + ] + ], + "2": [ "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" ], "maf": [ @@ -21,6 +29,14 @@ "sarscov.contigs.genome.maf.gz:md5,689cb18ff7098ff90eaf87017f590208" ] ], + "multiqc": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8" + ] + ], "versions": [ "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" ] @@ -30,7 +46,7 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T17:49:24.045661" + "timestamp": "2024-07-02T11:45:00.535348" }, "sarscov2 - contigs_genome - stub": { "content": [ @@ -44,6 +60,14 @@ ] ], "1": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" ], "maf": [ @@ -54,6 +78,14 @@ "sarscov.contigs.genome.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" ] ], + "multiqc": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], "versions": [ "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" ] @@ -63,6 +95,6 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T17:50:20.139442" + "timestamp": "2024-07-02T11:45:21.243325" } } \ No newline at end of file diff --git a/modules/nf-core/last/train/main.nf b/modules/nf-core/last/train/main.nf index 67938716beb..10bed777300 100644 --- a/modules/nf-core/last/train/main.nf +++ b/modules/nf-core/last/train/main.nf @@ -13,6 +13,7 @@ process LAST_TRAIN { output: tuple val(meta), path("*.train"), emit: param_file + tuple val(meta), path("*.tsv") , emit: multiqc path "versions.yml" , emit: versions when: @@ -31,6 +32,16 @@ process LAST_TRAIN { $fastx \\ > ${prefix}.\$INDEX_NAME.train + echo "id\tsubstitution_percent_identity\tlast -t\tlast -a\tlast -A\tlast -b\tlast -B\tlast -S" > ${prefix}.train.tsv + printf "\$(basename ${prefix}.\$INDEX_NAME.train .target.train)\t" >> ${prefix}.train.tsv + grep 'substitution percent identity' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$5}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -t' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$2}' | sed -e 's/-t//' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -a' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -A' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -b' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -B' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -S' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' >> ${prefix}.train.tsv + cat <<-END_VERSIONS > versions.yml "${task.process}": last: \$(lastdb --version | sed 's/lastdb //') @@ -43,6 +54,7 @@ process LAST_TRAIN { """ INDEX_NAME=STUB touch ${prefix}.\$INDEX_NAME.train + touch ${prefix}.train.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/last/train/meta.yml b/modules/nf-core/last/train/meta.yml index db723be5ba9..d55e82734b3 100644 --- a/modules/nf-core/last/train/meta.yml +++ b/modules/nf-core/last/train/meta.yml @@ -42,8 +42,13 @@ output: type: file description: Trained parameter file pattern: "*.train" + - multiqc: + type: file + description: Alignment parameter summary for MultiQC + pattern: "*.tsv" authors: - "@aleksandrabliznina" - "@charles-plessy" + - "@U13bs1125" maintainers: - "@charles-plessy" diff --git a/modules/nf-core/last/train/tests/main.nf.test.snap b/modules/nf-core/last/train/tests/main.nf.test.snap index d41a8a63ded..703bcfa0152 100644 --- a/modules/nf-core/last/train/tests/main.nf.test.snap +++ b/modules/nf-core/last/train/tests/main.nf.test.snap @@ -12,8 +12,26 @@ ] ], "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ "versions.yml:md5,c5578547acf9e77e1e8f6bf796e32ac2" ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], "param_file": [ [ { @@ -32,7 +50,7 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T23:01:07.091657" + "timestamp": "2024-07-03T18:04:56.173906" }, "sarscov2 - genome - contigs": { "content": [ @@ -47,7 +65,25 @@ ] ], "1": [ - "versions.yml:md5,c5578547acf9e77e1e8f6bf796e32ac2" + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,f09bcd1a111241a3439258a43c2a1a4e" + ] + ], + "2": [ + "versions.yml:md5,38234cf053c708e57cc080990f777411" + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,f09bcd1a111241a3439258a43c2a1a4e" + ] ], "param_file": [ [ @@ -59,7 +95,7 @@ ] ], "versions": [ - "versions.yml:md5,c5578547acf9e77e1e8f6bf796e32ac2" + "versions.yml:md5,38234cf053c708e57cc080990f777411" ] } ], @@ -67,6 +103,6 @@ "nf-test": "0.8.4", "nextflow": "24.04.2" }, - "timestamp": "2024-06-06T23:00:45.493868" + "timestamp": "2024-07-03T18:04:34.914789" } } \ No newline at end of file