Plant-Food-Research-Open · GallVp · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v0.5.0dev - [19-Nov-2024]
+## v0.5.0dev - [20-Nov-2024]
 
 ### `Added`
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -28,17 +28,16 @@ module_order:
   - busco:
       name: "BUSCO"
       info: "Genome completeness statistics"
-      path_filters:
-        - "*seqkit.rmdup.fasta.txt"
+      path_filters_exclude:
+        - "*.proteins.txt"
   - busco:
       name: "BUSCO Annotation"
       info: "Annotation completeness statistics"
-      path_filters_exclude:
-        - "*seqkit.rmdup.fasta.txt"
+      path_filters:
+        - "*.proteins.txt"
   - gffcompare:
 
 extra_fn_clean_exts:
-  - ".seqkit"
-  - ".rmdup"
+  - ".proteins"
   - type: regex
     pattern: "^short_summary\\.specific\\..*_odb10\\."
diff --git a/bin/genepal_report.R b/bin/genepal_report.R
@@ -78,6 +78,16 @@ parse_busco_file <- function(file_path) {
   return(results)
 }
 
+# Parse sample id and lineage from: short_summary.specific.(sample|lineage).(sample|lineage).*
+parse_busco_id_lineage <- function(file_name) {
+  name_components <- strsplit(basename(file_name), ".", fixed = TRUE)[[1]][3:4]
+  # The lineage is the component carrying "_odb"; when that is ambiguous, assume (id, lineage) order
+  # so both values are always length-1 and the ID/Lineage columns built from them stay atomic.
+  lineage_pos <- which(grepl("_odb", name_components, fixed = TRUE))
+  if (length(lineage_pos) != 1) lineage_pos <- 2
+  return(list(id = name_components[3 - lineage_pos], lineage = sub("_odb.*$", "", name_components[lineage_pos])))
+}
+
 parse_busco_folder <- function(folder_path, col_prefix = "Genome") {
   list_of_files <- list.files(folder_path, pattern = "short_summary.specific.*.txt$", full.names = TRUE)
 
@@ -89,8 +99,8 @@ parse_busco_folder <- function(folder_path, col_prefix = "Genome") {
     lapply(parse_busco_file) %>%
     bind_rows() %>%
     mutate(
-      ID = sapply(strsplit(basename(list_of_files), "\\."), `[`, 3),
-      Lineage = sapply(strsplit(basename(list_of_files), "\\."), `[`, 4)
+      ID = list_of_files %>% lapply(parse_busco_id_lineage) %>% sapply(function(x) x$id),
+      Lineage = list_of_files %>% lapply(parse_busco_id_lineage) %>% sapply(function(x) x$lineage)
     ) %>%
     select(
       ID,
@@ -177,17 +187,13 @@ process_protein_clustering <- function(file_path) {
   if (!file.exists(file_path)) {
     return(NULL) # No output if the file doesn't exist
   }
-  df <- read.table(
-    file = file_path,
-    sep = "\t",
-    header = TRUE,
-    nrows = 10,
-    row.names = 1
-  )
+  df <- read.table(file = file_path, sep = "\t", header = TRUE, nrows = 10, row.names = 1)
   df <- df %>%
     rownames_to_column(var = "row_id") %>% # Changed to snake_case
     pivot_longer(cols = -row_id, names_to = "ID", values_to = "Value") %>%
-    pivot_wider(names_from = row_id, values_from = "Value")
+    pivot_wider(names_from = row_id, values_from = "Value") %>%
+    mutate(ID = str_replace(ID, ".pep", ""))
+
   colnames(df) <- gsub(
     x = colnames(df),
     pattern = "genes",

diff --git a/modules.json b/modules.json
@@ -106,7 +106,7 @@
                     },
                     "fasta_gxf_busco_plot": {
                         "branch": "main",
-                        "git_sha": "7bf6fbca23edc94490ffa6709f52b2f71c6fb130",
+                        "git_sha": "5b401d806f8d69599ae88422064aea4f0e432a09",
                         "installed_by": ["subworkflows"]
                     },
                     "gxf_fasta_agat_spaddintrons_spextractsequences": {

diff --git a/subworkflows/gallvp/fasta_gxf_busco_plot/main.nf b/subworkflows/gallvp/fasta_gxf_busco_plot/main.nf
@@ -68,7 +68,7 @@ workflow FASTA_GXF_BUSCO_PLOT {
     ch_versions                                 = ch_versions.mix(BUSCO_ASSEMBLY.out.versions.first())
 
     // MODULE: BUSCO_GENERATEPLOT as PLOT_ASSEMBLY
-    ch_assembly_plot_inputs                     = ch_assembly_short_summaries_txt
+    ch_assembly_plot_summary                    = ch_assembly_short_summaries_txt
                                                 | map { meta, txt ->
                                                     def lineage_name = meta.lineage.split('_odb')[0]
                                                     [
@@ -77,9 +77,8 @@ workflow FASTA_GXF_BUSCO_PLOT {
                                                     ]
                                                 }
                                                 | collectFile
-                                                | collect
 
-    PLOT_ASSEMBLY( ch_assembly_plot_inputs )
+    PLOT_ASSEMBLY( ch_assembly_plot_summary.collect() )
 
     ch_assembly_png                             = PLOT_ASSEMBLY.out.png
     ch_versions                                 = ch_versions.mix(PLOT_ASSEMBLY.out.versions)
@@ -138,18 +137,17 @@ workflow FASTA_GXF_BUSCO_PLOT {
     ch_versions                                 = ch_versions.mix(BUSCO_ANNOTATION.out.versions.first())
 
     // MODULE: BUSCO_GENERATEPLOT as PLOT_ANNOTATION
-    ch_annotation_plot_inputs                   = ch_annotation_short_summaries_txt
+    ch_annotation_plot_summary                  = ch_annotation_short_summaries_txt
                                                 | map { meta, txt ->
                                                     def lineage_name = meta.lineage.split('_odb')[0]
                                                     [
-                                                        "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.txt",
+                                                        "short_summary.specific.${meta.lineage}.${meta.id}_${lineage_name}.proteins.txt",
                                                         txt.text
                                                     ]
                                                 }
                                                 | collectFile
-                                                | collect
 
-    PLOT_ANNOTATION( ch_annotation_plot_inputs )
+    PLOT_ANNOTATION( ch_annotation_plot_summary.collect() )
 
     ch_annotation_png                           = PLOT_ANNOTATION.out.png
     ch_versions                                 = ch_versions.mix(PLOT_ANNOTATION.out.versions)
@@ -159,11 +157,12 @@ workflow FASTA_GXF_BUSCO_PLOT {
     assembly_batch_summary                      = ch_assembly_batch_summary             // channel: [ meta3, txt ]; meta3 ~ meta + [ val(mode), val(lineage) ]
     assembly_short_summaries_txt                = ch_assembly_short_summaries_txt       // channel: [ meta3, txt ]
     assembly_short_summaries_json               = ch_assembly_short_summaries_json      // channel: [ meta3, json ]
+    assembly_plot_summary_txt                   = ch_assembly_plot_summary              // channel: [ text ]
     assembly_png                                = ch_assembly_png                       // channel: [ png ]
     annotation_batch_summary                    = ch_annotation_batch_summary           // channel: [ meta3, txt ]
     annotation_short_summaries_txt              = ch_annotation_short_summaries_txt     // channel: [ meta3, txt ]
     annotation_short_summaries_json             = ch_annotation_short_summaries_json    // channel: [ meta3, json ]
+    annotation_plot_summary_txt                 = ch_annotation_plot_summary            // channel: [ txt ]
     annotation_png                              = ch_annotation_png                     // channel: [ png ]
     versions                                    = ch_versions                           // channel: [ versions.yml ]
 }
-
diff --git a/subworkflows/gallvp/fasta_gxf_busco_plot/meta.yml b/subworkflows/gallvp/fasta_gxf_busco_plot/meta.yml
@@ -64,6 +64,12 @@ output:
         Channel containing BUSCO short summaries corresponding to fasta files
         Structure: [ val(meta), json ]
       pattern: "*.json"
+  - assembly_plot_summary_txt:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to fasta files, renamed so that the sample id includes the lineage
+        Structure: [ txt ]
+      pattern: "*.txt"
   - assembly_png:
       type: file
       description: |
@@ -88,6 +94,12 @@ output:
         Channel containing BUSCO short summaries corresponding to annotation files
         Structure: [ val(meta), json ]
       pattern: "*.json"
+  - annotation_plot_summary_txt:
+      type: file
+      description: |
+        Channel containing BUSCO short summaries corresponding to annotation files, renamed so that the sample id includes the lineage
+        Structure: [ txt ]
+      pattern: "*.txt"
   - annotation_png:
       type: file
       description: |

diff --git a/subworkflows/gallvp/fasta_gxf_busco_plot/tests/main.nf.test.snap b/subworkflows/gallvp/fasta_gxf_busco_plot/tests/main.nf.test.snap
@@ -38,6 +38,11 @@
                 ],
                 "1": [
 
+                ],
+                "10": [
+                    "versions.yml:md5,36b11c442943567e471af0abd474a10b",
+                    "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a",
+                    "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9"
                 ],
                 "2": [
 
@@ -46,6 +51,9 @@
 
                 ],
                 "4": [
+
+                ],
+                "5": [
                     [
                         {
                             "id": "test",
@@ -62,9 +70,6 @@
                         },
                         "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
                     ]
-                ],
-                "5": [
-
                 ],
                 "6": [
 
@@ -73,9 +78,10 @@
 
                 ],
                 "8": [
-                    "versions.yml:md5,36b11c442943567e471af0abd474a10b",
-                    "versions.yml:md5,9435355f913e283f60b4fb7ef77dd52a",
-                    "versions.yml:md5,e9d65e2f2f13175e99c5b7f4ae1013b9"
+
+                ],
+                "9": [
+
                 ],
                 "annotation_batch_summary": [
                     [
@@ -94,6 +100,9 @@
                         },
                         "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
                     ]
+                ],
+                "annotation_plot_summary_txt": [
+
                 ],
                 "annotation_png": [
 
@@ -137,6 +146,9 @@
                         },
                         "test2-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
                     ]
+                ],
+                "assembly_plot_summary_txt": [
+
                 ],
                 "assembly_png": [
 
@@ -155,10 +167,10 @@
             }
         ],
         "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "23.10.1"
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.4"
         },
-        "timestamp": "2024-05-13T16:39:45.021811"
+        "timestamp": "2024-11-20T14:04:28.17169"
     },
     "candidatus_portiera_aleyrodidarum - bacteroides_fragilis - genome": {
         "content": [

diff --git a/workflows/genepal.nf b/workflows/genepal.nf
@@ -246,9 +246,18 @@ workflow GENEPAL {
 
     ch_busco_fasta_summary      = FASTA_GXF_BUSCO_PLOT.out.assembly_short_summaries_txt
     ch_busco_gff_summary        = FASTA_GXF_BUSCO_PLOT.out.annotation_short_summaries_txt
+
+    ch_busco_fasta_plot_summary = FASTA_GXF_BUSCO_PLOT.out.assembly_plot_summary_txt
+    ch_busco_gff_plot_summary   = FASTA_GXF_BUSCO_PLOT.out.annotation_plot_summary_txt
+
     ch_multiqc_files            = ch_multiqc_files
-                                | mix(ch_busco_fasta_summary)
-                                | mix(ch_busco_gff_summary)
+                                | mix(
+                                    ch_busco_fasta_plot_summary.map { file -> [ [], file ] }
+                                )
+                                | mix(
+                                    ch_busco_gff_plot_summary.map { file -> [ [], file ] }
+                                )
+
     ch_versions                 = ch_versions.mix(FASTA_GXF_BUSCO_PLOT.out.versions)
 
     // SUBWORKFLOW: GXF_FASTA_AGAT_SPADDINTRONS_SPEXTRACTSEQUENCES