From 42c4db677fcd6f9e4835a67dd923c2eba07338ad Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Mon, 9 Dec 2024 09:20:14 +0000 Subject: [PATCH 01/11] Added bin/tree_summary.R for new tree plot --- bin/tree_summary.R | 229 ++++++++++++++++++++++++++++++++++ modules/local/tree_summary.nf | 22 ++-- 2 files changed, 239 insertions(+), 12 deletions(-) create mode 100755 bin/tree_summary.R diff --git a/bin/tree_summary.R b/bin/tree_summary.R new file mode 100755 index 0000000..208168b --- /dev/null +++ b/bin/tree_summary.R @@ -0,0 +1,229 @@ +#!/usr/bin/Rscript + +# Written by Chris Wyatt and Fernando Duarte and released under the MIT license. +# Plots a phylogenetic tree alongside BUSCO and Quast stats + +# Load necessary libraries +if (!requireNamespace("argparse", quietly = TRUE)) { + install.packages("argparse") +} +if (!requireNamespace("ggtree", quietly = TRUE)) { + BiocManager::install("ggtree") +} +if (!requireNamespace("ggplot2", quietly = TRUE)) { + install.packages("ggplot2") +} +if (!requireNamespace("cowplot", quietly = TRUE)) { + install.packages("cowplot") +} +if (!requireNamespace("tidyr", quietly = TRUE)) { + install.packages("tidyr") +} +if (!requireNamespace("ggtreeExtra", quietly = TRUE)) { + BiocManager::install("ggtreeExtra") +} +if (!requireNamespace("ggimage", quietly = TRUE)) { + install.packages("ggimage") +} + +library(ggtree) +library(ggplot2) +library(cowplot) +library(argparse) +library(dplyr) +library(tidyr) +library(ggimage) +library(ggtreeExtra) + +# Parse command-line arguments +parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') +parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') +parser$add_argument('busco_file', type = 'character', help = 'Path to processed BUSCO output file') +parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') +parser$add_argument('--text_size', type = 'double', default = 2.5, help = 'Tree tip text size for the plots') +parser$add_argument('--pie_size', type = 'double', default = 0, help = 'Increase pie size of each tip by this fold') +#parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') +args <- parser$parse_args() + +# Read the Newick tree from the file +tree <- read.tree(args$tree_file) + +# Clean tree tip labels +tree$tip.label <- trimws(tree$tip.label) +tree$tip.label <- tolower(tree$tip.label) + +# Read the data table from the file, ensuring species column is read as character +# Load BUSCO +data_busco <- read.csv(args$busco_file, sep = "\t", colClasses = c("Input_file" = "character")) + +# Prepare BUSCO data +data_busco <- data_busco %>% + # Remove extension from Input_file + mutate(Input_file = tools::file_path_sans_ext(Input_file)) %>% + # Rename 'Input_file' to 'species' + rename(species = Input_file) %>% + # Make everything lowercase + mutate(species = tolower(species)) + +# Load Quast data +data_quast <- read.csv(args$quast_file, sep = "\t") + +#Tidy Quast data +data_quast <- data_quast %>% + # Remove the row where species is NA + filter(!is.na(species)) %>% + # Make species lowercase + mutate(species = tolower(species)) %>% + # Convert wide to long format + pivot_longer(cols = c(N50, N90), + names_to = "metric", + values_to = "value") %>% + # Remove any remaining "bar" rows if necessary (check Chris script) + filter(value != "bar") %>% + # Convert value column to numeric if needed + mutate(value = as.numeric(value)) + +# Debugging: Print species names from the tree and the data +cat("Species names in the tree based on nw:\n") +cat(tree$tip.label) +cat("\nSpecies names in data tables:\n") +quast_sp <- unique(data_quast$species) +cat("BUSCO:", data_busco$species, "\nQUAST:", quast_sp, "\n") + +# Debugging: Check if there are any mismatches in species names (might need to be +# changed in the future to support optional stats) +if (all(sort(tree$tip.label) != sort(data_busco$species))) { + stop("Species names in BUSCO and tree labels do not match") +} else if (all(sort(quast_sp) != sort(data_busco$species))) { + stop("Species names in Quast and tree labels do not match") +} + +# Arrange data according to tree labels, so that they are plotted correctly +# For BUSCO +data_busco <- data_busco %>% + arrange(match(species, tree$tip.label)) + +# For Quast +data_quast <- data_quast %>% + arrange(match(species, tree$tip.label)) + +# Node number needed in BUSCO dataframe for nodpie. This is a generic 1-n +# numbering (n=number of species) +data_busco <- data_busco %>% + mutate(node = seq_len(n())) + +# Plot the phylogenetic tree +tree_plot <- ggtree(tree, branch.length = "none") + # No branch length, only topology + geom_tiplab(size=args$text_size) + # Tip font size + ggplot2::xlim(0, 5) + + ggtitle("Phylogenetic Tree") + + theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins + +# Save tree only +ggsave("tree_only.pdf", tree_plot) + +# Create list of pies from BUSCO data +pies <- nodepie(data_busco, + cols = 4:7, + color = c( + "Single" = "steelblue", + "Duplicated" = "orange", + "Fragmented" = "purple", + "Missing" = "red" + ) +) + +print("Check 1") + +# Create a dummy pie as for the legend it seems impossible to extract it +# from nodpie +dummy_pie_data <- data.frame( + Type = factor(c("Single", "Duplicated", "Fragmented", "Missing")), + value = c(1, 1, 1, 1) # Dummy values for equal slices +) + +dummy_pie <- ggplot(dummy_pie_data) + + geom_bar( + aes(x = "", y = value, fill = Type), + stat = "identity", + width = 1 + ) + + coord_polar("y") + + scale_fill_manual( + values = c( + "Single" = "steelblue", + "Duplicated" = "orange", + "Fragmented" = "purple", + "Missing" = "red" + ), + name = "BUSCO" + ) + + theme_void() + + theme(legend.position = "right") # Show the legend + +print("Check 2") + +# Create variable for standarization of legends format +legends_theme <- theme(legend.title = element_text(size = 10, face = "bold"), + legend.text = element_text(size = 8), + legend.key.size = unit(0.5, "cm")) + +# Extract legend of dummy BUSCO pie chart +legend_busco <- cowplot::get_legend(dummy_pie + legends_theme) + +print("Check 2.5") + +# Plot BUSCO pies next to tree tips +p <- ggtree::inset(tree_plot, + pies, + width=(args$pie_size * .16 + .16), + height=(args$pie_size * .6 + .6), + hjust=-1.2) + +print("Check 3") + +# Plot Quast data +p2 <- p + geom_fruit(data=data_quast, + geom = geom_bar, + mapping = aes(y=species, x=value, fill=metric), + position=position_dodgex(), + pwidth=0.38, + orientation="y", + stat="identity", + axis.params = list(axis = "x", text.size = 1.8, hjust = 1, vjust = 0.5, nbreak = 3), + grid.params = list(), + offset = 1, + width = 0.4) + labs(fill = "Quast") + +print("Check 4") + +# Save plot without legend +ggsave("Phyloplot_no_legend.pdf", p2 + theme(legend.position="none")) + +# Extract Quast legend +legend_quast <- cowplot::get_legend(p2 + std_theme) + +# Wrap the legends in fixed-sized containers and adjust margins so that legends +# are perfectly aligned +legend_busco <- ggdraw(legend_busco) + theme(plot.margin = margin(0, 0, 0, 0)) +legend_quast <- ggdraw(legend_quast) + theme(plot.margin = margin(0, 29, 60, 0)) + +# Combine both BUSCO and Quast legends +combined_legends <- plot_grid(legend_busco, + legend_quast, + NULL, # Dummy rows for better positioning + NULL, + nrow = 4, + rel_widths = c(1, 1)) + +# Save legend +ggsave("legend.pdf", plot = combined_legends) + +# Add combined legends to tree plot +p3 <- plot_grid(p2 + theme(legend.position = "none"), + combined_legends, + ncol = 2, + rel_widths = c(4, 1)) + +# Save full plot +ggsave("Phyloplot_complete.pdf", plot = p3) diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index eca2247..993bb80 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -21,24 +21,22 @@ process TREE_SUMMARY { #Remove unwanted extensions in the tree file sed \'s/\\.prot\\.fa\\.largestIsoform//g\' ${tree}/Species_Tree/SpeciesTree_rooted_node_labels.txt > tree.nw - # Combine the BUSCO outputs - head -qn 1 *.txt | head -n 1 > Busco_combined - tail -q -n 1 *.txt >> Busco_combined - cut -f 1,3,4,5,6,7 Busco_combined >> Busco_combined_cut - sed -i \'s/\\.fasta//g\' Busco_combined_cut + # Combine the BUSCO outputs and remove empty tabs + head -qn 1 *.txt | head -n 1 > Busco_combined + tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined + # cut -f 1,3,4,5,6,7 Busco_combined >> Busco_combined_cut + # sed -i \'s/\\.fasta//g\' Busco_combined_cut - busco_2_table.py Busco_combined_cut Busco_to_plot.tsv + # busco_2_table.py Busco_combined_cut Busco_to_plot.tsv # Combine QUAST ouput quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90 -plot_types bar,bar - - #Remove unwanted extensions from Busco tables - sed \'s/.prot.fa.largestIsoform.fa//g\' Busco_to_plot.tsv > Busco_to_plot_final.tsv - sed \'s/.prot.fa.largestIsoform.fa//g\' Quast_to_plot.tsv > Quast_to_plot_final.tsv + + sudo apt-get install libmagick++-dev # Run summary plot - plot_tree_summary2.R tree.nw Busco_to_plot_final.tsv --tree_size 0.6 - plot_tree_summary.R tree.nw Quast_to_plot_final.tsv --tree_size 0.6 + tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv + #plot_tree_summary.R tree.nw Quast_to_plot_final.tsv --tree_size 0.6 cat <<-END_VERSIONS > versions.yml "${task.process}": From eb0fbac6490c2a92bb54ebeed30bcbe54f055a71 Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Mon, 9 Dec 2024 16:41:50 +0000 Subject: [PATCH 02/11] Fixed Quast crop problem --- bin/tree_summary.R | 55 +++++++++++++++++++++++++++++++++-- modules/local/tree_summary.nf | 4 +-- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/bin/tree_summary.R b/bin/tree_summary.R index 208168b..3789c87 100755 --- a/bin/tree_summary.R +++ b/bin/tree_summary.R @@ -42,9 +42,47 @@ parser$add_argument('busco_file', type = 'character', help = 'Path to processed parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') parser$add_argument('--text_size', type = 'double', default = 2.5, help = 'Tree tip text size for the plots') parser$add_argument('--pie_size', type = 'double', default = 0, help = 'Increase pie size of each tip by this fold') +parser$add_argument('--pie_hjust', type = 'double', default = 0.4, help = 'Horizontally adjust pie positions') #parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') args <- parser$parse_args() +# Create a function so that, if the Quast data is off limits and warnings related +# to this pop up , margins will be increased by the value "step" on each iteration +increase_scale_until_no_warnings <- function(plot, initial_limit = 5, step = 0.1, max_iterations = 50) { + # Initialize limit and warning flag + current_limit <- initial_limit + warning_flag <- TRUE + iteration <- 0 + + while (warning_flag && iteration < max_iterations) { + iteration <- iteration + 1 + # Update the plot with the current limit + updated_plot <- plot + scale_x_continuous(limits = c(0, current_limit)) + + # Capture warnings + warnings <- tryCatch({ + print(updated_plot) # Render the plot + NULL # No warnings + }, warning = function(w) { + w$message # Capture warning message + }) + + # Check if there are warnings + if (is.null(warnings)) { + warning_flag <- FALSE # No warnings, exit loop + } else { + # Increase the limit for the next iteration + current_limit <- current_limit + step + } + } + + # Return the final plot + if (iteration >= max_iterations) { + warning("Maximum iterations reached. Plot may still have issues.") + } + return(updated_plot) +} + # Read the Newick tree from the file tree <- read.tree(args$tree_file) @@ -190,18 +228,29 @@ p2 <- p + geom_fruit(data=data_quast, pwidth=0.38, orientation="y", stat="identity", - axis.params = list(axis = "x", text.size = 1.8, hjust = 1, vjust = 0.5, nbreak = 3), + axis.params = list( + axis = "x", + text.size = 1.8, + hjust = 1, + vjust = 0.5, + nbreak = 3 + ), grid.params = list(), offset = 1, - width = 0.4) + labs(fill = "Quast") + width = 0.4) + + labs(fill = "Quast") + + scale_x_continuous(limits=c(0,5)) # limits=c(0,5) by default for p2 print("Check 4") +# +p2 <- increase_scale_until_no_warnings(p2) + # Save plot without legend ggsave("Phyloplot_no_legend.pdf", p2 + theme(legend.position="none")) # Extract Quast legend -legend_quast <- cowplot::get_legend(p2 + std_theme) +legend_quast <- cowplot::get_legend(p2 + legends_theme) # Wrap the legends in fixed-sized containers and adjust margins so that legends # are perfectly aligned diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index 993bb80..f44a6b7 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -2,7 +2,7 @@ process TREE_SUMMARY { tag "$meta.id" label 'process_single' - container = 'ecoflowucl/genomeqc_tree:v1.3' + container = 'fduarte001/genomeqc_tree:0.1' publishDir "$params.outdir/tree_plots" , mode: "${params.publish_dir_mode}", pattern:"Phyloplot_*.pdf" input: @@ -31,8 +31,6 @@ process TREE_SUMMARY { # Combine QUAST ouput quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90 -plot_types bar,bar - - sudo apt-get install libmagick++-dev # Run summary plot tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv From a8619010a4d20f8facb648f6ea0fd8afb240411d Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Mon, 9 Dec 2024 16:42:44 +0000 Subject: [PATCH 03/11] Removed redundant lines --- modules/local/tree_summary.nf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index f44a6b7..696695f 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -24,17 +24,12 @@ process TREE_SUMMARY { # Combine the BUSCO outputs and remove empty tabs head -qn 1 *.txt | head -n 1 > Busco_combined tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined - # cut -f 1,3,4,5,6,7 Busco_combined >> Busco_combined_cut - # sed -i \'s/\\.fasta//g\' Busco_combined_cut - - # busco_2_table.py Busco_combined_cut Busco_to_plot.tsv # Combine QUAST ouput quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90 -plot_types bar,bar # Run summary plot tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv - #plot_tree_summary.R tree.nw Quast_to_plot_final.tsv --tree_size 0.6 cat <<-END_VERSIONS > versions.yml "${task.process}": From dc0897a29201fccf013af1e7418a9843f4dbea4e Mon Sep 17 00:00:00 2001 From: Fernando Duarte Date: Mon, 9 Dec 2024 17:53:35 +0000 Subject: [PATCH 04/11] Changed R script name --- bin/plot_tree_summary.R | 371 ++++++++++++++++++++-------------- bin/plot_tree_summary.R.old | 219 ++++++++++++++++++++ bin/tree_summary.R | 278 ------------------------- modules/local/tree_summary.nf | 4 +- 4 files changed, 436 insertions(+), 436 deletions(-) create mode 100755 bin/plot_tree_summary.R.old delete mode 100755 bin/tree_summary.R diff --git a/bin/plot_tree_summary.R b/bin/plot_tree_summary.R index 7028126..af2f358 100755 --- a/bin/plot_tree_summary.R +++ b/bin/plot_tree_summary.R @@ -1,7 +1,7 @@ #!/usr/bin/Rscript -# Written by Chris Wyatt and released under the MIT license. -# Prints a tree with QUAST N50 results on tips of branches +# Written by Chris Wyatt and Fernando Duarte and released under the MIT license. +# Plots a phylogenetic tree alongside BUSCO and Quast stats # Load necessary libraries if (!requireNamespace("argparse", quietly = TRUE)) { @@ -19,6 +19,12 @@ if (!requireNamespace("cowplot", quietly = TRUE)) { if (!requireNamespace("tidyr", quietly = TRUE)) { install.packages("tidyr") } +if (!requireNamespace("ggtreeExtra", quietly = TRUE)) { + BiocManager::install("ggtreeExtra") +} +if (!requireNamespace("ggimage", quietly = TRUE)) { + install.packages("ggimage") +} library(ggtree) library(ggplot2) @@ -26,22 +32,57 @@ library(cowplot) library(argparse) library(dplyr) library(tidyr) - -# Function to extract legend from a ggplot -extract_legend <- function(plot) { - g <- ggplotGrob(plot) - legend <- g$grobs[which(sapply(g$grobs, function(x) x$name) == "guide-box")] - return(legend) -} +library(ggimage) +library(ggtreeExtra) # Parse command-line arguments parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') -parser$add_argument('data_file', type = 'character', help = 'Path to the CSV file with statistic and true/false data') -parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') -parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') +parser$add_argument('busco_file', type = 'character', help = 'Path to processed BUSCO output file') +parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') +parser$add_argument('--text_size', type = 'double', default = 2.5, help = 'Tree tip text size for the plots') +parser$add_argument('--pie_size', type = 'double', default = 0, help = 'Increase pie size of each tip by this fold') +parser$add_argument('--pie_hjust', type = 'double', default = 0.4, help = 'Horizontally adjust pie positions') +#parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') args <- parser$parse_args() +# Create a function so that, if the Quast data is off limits and warnings related +# to this pop up , margins will be increased by the value "step" on each iteration +increase_scale_until_no_warnings <- function(plot, initial_limit = 5, step = 0.1, max_iterations = 50) { + # Initialize limit and warning flag + current_limit <- initial_limit + warning_flag <- TRUE + iteration <- 0 + + while (warning_flag && iteration < max_iterations) { + iteration <- iteration + 1 + # Update the plot with the current limit + updated_plot <- plot + scale_x_continuous(limits = c(0, current_limit)) + + # Capture warnings + warnings <- tryCatch({ + print(updated_plot) # Render the plot + NULL # No warnings + }, warning = function(w) { + w$message # Capture warning message + }) + + # Check if there are warnings + if (is.null(warnings)) { + warning_flag <- FALSE # No warnings, exit loop + } else { + # Increase the limit for the next iteration + current_limit <- current_limit + step + } + } + + # Return the final plot + if (iteration >= max_iterations) { + warning("Maximum iterations reached. Plot may still have issues.") + } + return(updated_plot) +} + # Read the Newick tree from the file tree <- read.tree(args$tree_file) @@ -50,170 +91,188 @@ tree$tip.label <- trimws(tree$tip.label) tree$tip.label <- tolower(tree$tip.label) # Read the data table from the file, ensuring species column is read as character -data <- read.csv(args$data_file, sep = "\t", colClasses = c("species" = "character")) +# Load BUSCO +data_busco <- read.csv(args$busco_file, sep = "\t", colClasses = c("Input_file" = "character")) + +# Prepare BUSCO data +data_busco <- data_busco %>% + # Remove extension from Input_file + mutate(Input_file = tools::file_path_sans_ext(Input_file)) %>% + # Rename 'Input_file' to 'species' + rename(species = Input_file) %>% + # Make everything lowercase + mutate(species = tolower(species)) -# Clean data species names -data$species <- trimws(data$species) -data$species <- tolower(data$species) +# Load Quast data +data_quast <- read.csv(args$quast_file, sep = "\t") -# Extract the column headers and the second row to determine plot types -plot_types <- as.character(data[1, ]) -data <- data[-1, ] # Remove the second row used for plot types +#Tidy Quast data +data_quast <- data_quast %>% + # Remove the row where species is NA + filter(!is.na(species)) %>% + # Make species lowercase + mutate(species = tolower(species)) %>% + # Convert wide to long format + pivot_longer(cols = c(N50, N90), + names_to = "metric", + values_to = "value") %>% + # Remove any remaining "bar" rows if necessary (check Chris script) + filter(value != "bar") %>% + # Convert value column to numeric if needed + mutate(value = as.numeric(value)) # Debugging: Print species names from the tree and the data cat("Species names in the tree based on nw:\n") -print(tree$tip.label) -cat("\nSpecies names in the data table before processing:\n") -print(data$species) +cat(tree$tip.label) +cat("\nSpecies names in data tables:\n") +quast_sp <- unique(data_quast$species) +cat("BUSCO:", data_busco$species, "\nQUAST:", quast_sp, "\n") -# Debugging: Print the data frame to check its structure -cat("\nData frame:\n") -print(data) - -# Ensure species names match the tree tips -missing_in_tree <- setdiff(data$species, tree$tip.label) -missing_in_data <- setdiff(tree$tip.label, data$species) - -if (length(missing_in_tree) > 0) { - cat("Species in data but not in tree:\n") - print(missing_in_tree) +# Debugging: Check if there are any mismatches in species names (might need to be +# changed in the future to support optional stats) +if (all(sort(tree$tip.label) != sort(data_busco$species))) { + stop("Species names in BUSCO and tree labels do not match") +} else if (all(sort(quast_sp) != sort(data_busco$species))) { + stop("Species names in Quast and tree labels do not match") } -if (length(missing_in_data) > 0) { - cat("Species in tree but not in data:\n") - print(missing_in_data) -} +# Arrange data according to tree labels, so that they are plotted correctly +# For BUSCO +data_busco <- data_busco %>% + arrange(match(species, tree$tip.label)) -if (length(missing_in_tree) > 0 || length(missing_in_data) > 0) { - stop("Species names in the data do not match the tree tips") -} +# For Quast +data_quast <- data_quast %>% + arrange(match(species, tree$tip.label)) + +# Node number needed in BUSCO dataframe for nodpie. This is a generic 1-n +# numbering (n=number of species) +data_busco <- data_busco %>% + mutate(node = seq_len(n())) # Plot the phylogenetic tree -tree_plot <- ggtree(tree) + - geom_tiplab() + - coord_cartesian(clip="off") + +tree_plot <- ggtree(tree, branch.length = "none") + # No branch length, only topology + geom_tiplab(size=args$text_size) + # Tip font size + ggplot2::xlim(0, 5) + ggtitle("Phylogenetic Tree") + - theme(plot.margin = margin(10, 150, 10, 10)) # Increase margins + theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins -pdf ("Tree_only.pdf") -tree_plot -dev.off() +# Save tree only +ggsave("tree_only.pdf", tree_plot) +# Create list of pies from BUSCO data +pies <- nodepie(data_busco, + cols = 4:7, + color = c( + "Single" = "steelblue", + "Duplicated" = "orange", + "Fragmented" = "purple", + "Missing" = "red" + ) +) -# Extract the exact tree tip names:axis.text -tree$tip.label <- get_taxa_name(tree_plot) +print("Check 1") -# Reorder the data based on the order of species in the tree -data <- data %>% arrange(match(species, tree$tip.label)) +# Create a dummy pie as for the legend it seems impossible to extract it +# from nodpie +dummy_pie_data <- data.frame( + Type = factor(c("Single", "Duplicated", "Fragmented", "Missing")), + value = c(1, 1, 1, 1) # Dummy values for equal slices +) -# Debugging: Print plot types -cat("\nPlot types:\n") -print(plot_types) +dummy_pie <- ggplot(dummy_pie_data) + + geom_bar( + aes(x = "", y = value, fill = Type), + stat = "identity", + width = 1 + ) + + coord_polar("y") + + scale_fill_manual( + values = c( + "Single" = "steelblue", + "Duplicated" = "orange", + "Fragmented" = "purple", + "Missing" = "red" + ), + name = "BUSCO" + ) + + theme_void() + + theme(legend.position = "right") # Show the legend -# Debugging: Print the reordered data frame to check its structure -cat("\nReordered data frame:\n") -print(data) +print("Check 2") -write.table(data, "Reordered_output_tree.tsv", sep="\t", quote=F) +# Create variable for standarization of legends format +legends_theme <- theme(legend.title = element_text(size = 10, face = "bold"), + legend.text = element_text(size = 8), + legend.key.size = unit(0.5, "cm")) -# Check if the number of unique species matches the number of tree tips -if (length(unique(data$species)) != length(tree$tip.label)) { - warning("The number of unique species in the data does not match the number of tree tips.") -} +# Extract legend of dummy BUSCO pie chart +legend_busco <- cowplot::get_legend(dummy_pie + legends_theme) -# Find column headers -column_headers <- colnames(data) - -# Create plots based on the plot types -plots <- list() -legend_plot <- NULL -for (i in 2:length(column_headers)) { - column_name <- column_headers[i] - plot_type <- plot_types[i] - - if (plot_type == "bar") { - bar_plot <- ggplot(data, aes(x = as.numeric(!!sym(column_name)), y = factor(species, levels = rev(tree$tip.label)))) + - geom_bar(stat = "identity", fill = "darkblue") + - geom_text(aes(label = as.numeric(!!sym(column_name)), x = 0), hjust = -0.1, color = "white", size = args$text_size) + # Place text at the base of the bar - theme_minimal() + - theme(axis.title.y = element_blank(), - axis.text.y = element_blank(), - axis.ticks.y = element_blank(), - axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - panel.grid.major = element_blank(), # Remove vertical grid lines - panel.grid.minor = element_blank(), # Remove minor grid lines - panel.background = element_blank(), # Remove grey background - plot.background = element_blank(), # Remove any outer plot background - plot.margin = margin(0, 0, 0, 0)) + - labs(x = column_name) + - ggtitle(column_name) - plots[[length(plots) + 1]] <- bar_plot - } else if (plot_type == "text") { - text_plot <- ggplot(data, aes(y = factor(species, levels = rev(tree$tip.label)), x = 1, label = !!sym(column_name))) + - geom_text(size = args$text_size, hjust = 0) + - theme_void() + - labs(x = NULL, y = NULL) + - theme(axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - plot.margin = margin(0, 0, 0, 0)) + - ggtitle(column_name) - plots[[length(plots) + 1]] <- text_plot - } else if (plot_type == "stacked") { - # Split the stacked values into separate columns - stacked_data <- data %>% - separate(column_name, into = paste0(column_name, "_", 1:4), sep = ",", convert = TRUE, extra = "drop") %>% - pivot_longer(cols = starts_with(column_name), names_to = "stack", values_to = "value") %>% - mutate(stack = factor(stack, levels = paste0(column_name, "_", 1:4))) - - stacked_plot <- ggplot(stacked_data, aes(x = value, y = factor(species, levels = rev(tree$tip.label)), fill = stack)) + - geom_bar(stat = "identity", position = "fill") + - theme_minimal() + - theme(axis.title.y = element_blank(), - axis.text.y = element_blank(), - axis.ticks.y = element_blank(), - axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - panel.grid.major.y = element_blank(), # Remove horizontal grid lines - panel.grid.minor.y = element_blank(), - plot.margin = margin(0, 0, 0, 0)) + - labs(x = column_name) + - ggtitle(column_name) - - # Extract the legend from the stacked plot - legend_plot <- extract_legend(stacked_plot) - - # Remove the legend from the stacked plot - stacked_plot <- stacked_plot + theme(legend.position = "none") - - plots[[length(plots) + 1]] <- stacked_plot - } else { - cat("Unknown plot type:", plot_type, "for column:", column_name, "\n") - } -} +print("Check 2.5") -# Debugging: Print the list of plots -cat("List of plots:\n") -print(plots) - -# Combine the plots if there are any -if (length(plots) > 0) { - combined_plot <- plot_grid(tree_plot, plot_grid(plotlist = plots, ncol = length(plots)), ncol = 2, rel_widths = c(args$tree_size, 1 - args$tree_size)) # Adjust widths based on tree_size - - # Save the combined plot to a PDF file - ggsave("Phyloplot_quast.pdf", plot = combined_plot, width = 10, height = 8) - - # Save the legend to a separate PDF file - if (!is.null(legend_plot)) { - legend_plot <- cowplot::plot_grid(legend_plot) - ggsave("Legend.pdf", plot = legend_plot, width = 10, height = 2) - } -} else { - cat("No plots to combine.\n") -} +# Plot BUSCO pies next to tree tips +p <- ggtree::inset(tree_plot, + pies, + width=(args$pie_size * .16 + .16), + height=(args$pie_size * .6 + .6), + hjust=-1.2) + +print("Check 3") + +# Plot Quast data +p2 <- p + geom_fruit(data=data_quast, + geom = geom_bar, + mapping = aes(y=species, x=value, fill=metric), + position=position_dodgex(), + pwidth=0.38, + orientation="y", + stat="identity", + axis.params = list( + axis = "x", + text.size = 1.8, + hjust = 1, + vjust = 0.5, + nbreak = 3 + ), + grid.params = list(), + offset = 1, + width = 0.4) + + labs(fill = "Quast") + + scale_x_continuous(limits=c(0,5)) # limits=c(0,5) by default for p2 + +print("Check 4") + +# +p2 <- increase_scale_until_no_warnings(p2) + +# Save plot without legend +ggsave("Phyloplot_no_legend.pdf", p2 + theme(legend.position="none")) + +# Extract Quast legend +legend_quast <- cowplot::get_legend(p2 + legends_theme) + +# Wrap the legends in fixed-sized containers and adjust margins so that legends +# are perfectly aligned +legend_busco <- ggdraw(legend_busco) + theme(plot.margin = margin(0, 0, 0, 0)) +legend_quast <- ggdraw(legend_quast) + theme(plot.margin = margin(0, 29, 60, 0)) + +# Combine both BUSCO and Quast legends +combined_legends <- plot_grid(legend_busco, + legend_quast, + NULL, # Dummy rows for better positioning + NULL, + nrow = 4, + rel_widths = c(1, 1)) + +# Save legend +ggsave("legend.pdf", plot = combined_legends) + +# Add combined legends to tree plot +p3 <- plot_grid(p2 + theme(legend.position = "none"), + combined_legends, + ncol = 2, + rel_widths = c(4, 1)) -# Print warnings -warnings() \ No newline at end of file +# Save full plot +ggsave("Phyloplot_complete.pdf", plot = p3) diff --git a/bin/plot_tree_summary.R.old b/bin/plot_tree_summary.R.old new file mode 100755 index 0000000..3439030 --- /dev/null +++ b/bin/plot_tree_summary.R.old @@ -0,0 +1,219 @@ +#!/usr/bin/Rscript + +# Written by Chris Wyatt and released under the MIT license. +# Prints a tree with QUAST N50 results on tips of branches + +# Load necessary libraries +if (!requireNamespace("argparse", quietly = TRUE)) { + install.packages("argparse") +} +if (!requireNamespace("ggtree", quietly = TRUE)) { + BiocManager::install("ggtree") +} +if (!requireNamespace("ggplot2", quietly = TRUE)) { + install.packages("ggplot2") +} +if (!requireNamespace("cowplot", quietly = TRUE)) { + install.packages("cowplot") +} +if (!requireNamespace("tidyr", quietly = TRUE)) { + install.packages("tidyr") +} + +library(ggtree) +library(ggplot2) +library(cowplot) +library(argparse) +library(dplyr) +library(tidyr) + +# Function to extract legend from a ggplot +extract_legend <- function(plot) { + g <- ggplotGrob(plot) + legend <- g$grobs[which(sapply(g$grobs, function(x) x$name) == "guide-box")] + return(legend) +} + +# Parse command-line arguments +parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') +parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') +parser$add_argument('data_file', type = 'character', help = 'Path to the CSV file with statistic and true/false data') +parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') +parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') +args <- parser$parse_args() + +# Read the Newick tree from the file +tree <- read.tree(args$tree_file) + +# Clean tree tip labels +tree$tip.label <- trimws(tree$tip.label) +tree$tip.label <- tolower(tree$tip.label) + +# Read the data table from the file, ensuring species column is read as character +data <- read.csv(args$data_file, sep = "\t", colClasses = c("species" = "character")) + +# Clean data species names +data$species <- trimws(data$species) +data$species <- tolower(data$species) + +# Extract the column headers and the second row to determine plot types +plot_types <- as.character(data[1, ]) +data <- data[-1, ] # Remove the second row used for plot types + +# Debugging: Print species names from the tree and the data +cat("Species names in the tree based on nw:\n") +print(tree$tip.label) +cat("\nSpecies names in the data table before processing:\n") +print(data$species) + +# Debugging: Print the data frame to check its structure +cat("\nData frame:\n") +print(data) + +# Ensure species names match the tree tips +missing_in_tree <- setdiff(data$species, tree$tip.label) +missing_in_data <- setdiff(tree$tip.label, data$species) + +if (length(missing_in_tree) > 0) { + cat("Species in data but not in tree:\n") + print(missing_in_tree) +} + +if (length(missing_in_data) > 0) { + cat("Species in tree but not in data:\n") + print(missing_in_data) +} + +if (length(missing_in_tree) > 0 || length(missing_in_data) > 0) { + stop("Species names in the data do not match the tree tips") +} + +# Plot the phylogenetic tree +tree_plot <- ggtree(tree) + + geom_tiplab() + + coord_cartesian(clip="off") + + ggtitle("Phylogenetic Tree") + + theme(plot.margin = margin(10, 150, 10, 10)) # Increase margins + +pdf ("Tree_only.pdf") +tree_plot +dev.off() + + +# Extract the exact tree tip names:axis.text +tree$tip.label <- get_taxa_name(tree_plot) + +# Reorder the data based on the order of species in the tree +data <- data %>% arrange(match(species, tree$tip.label)) + +# Debugging: Print plot types +cat("\nPlot types:\n") +print(plot_types) + +# Debugging: Print the reordered data frame to check its structure +cat("\nReordered data frame:\n") +print(data) + +write.table(data, "Reordered_output_tree.tsv", sep="\t", quote=F) + +# Check if the number of unique species matches the number of tree tips +if (length(unique(data$species)) != length(tree$tip.label)) { + warning("The number of unique species in the data does not match the number of tree tips.") +} + +# Find column headers +column_headers <- colnames(data) + +# Create plots based on the plot types +plots <- list() +legend_plot <- NULL +for (i in 2:length(column_headers)) { + column_name <- column_headers[i] + plot_type <- plot_types[i] + + if (plot_type == "bar") { + bar_plot <- ggplot(data, aes(x = as.numeric(!!sym(column_name)), y = factor(species, levels = rev(tree$tip.label)))) + + geom_bar(stat = "identity", fill = "darkblue") + + geom_text(aes(label = as.numeric(!!sym(column_name)), x = 0), hjust = -0.1, color = "white", size = args$text_size) + # Place text at the base of the bar + theme_minimal() + + theme(axis.title.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + panel.grid.major = element_blank(), # Remove vertical grid lines + panel.grid.minor = element_blank(), # Remove minor grid lines + panel.background = element_blank(), # Remove grey background + plot.background = element_blank(), # Remove any outer plot background + plot.margin = margin(0, 0, 0, 0)) + + labs(x = column_name) + + ggtitle(column_name) + plots[[length(plots) + 1]] <- bar_plot + } else if (plot_type == "text") { + text_plot <- ggplot(data, aes(y = factor(species, levels = rev(tree$tip.label)), x = 1, label = !!sym(column_name))) + + geom_text(size = args$text_size, hjust = 0) + + theme_void() + + labs(x = NULL, y = NULL) + + theme(axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + plot.margin = margin(0, 0, 0, 0)) + + ggtitle(column_name) + plots[[length(plots) + 1]] <- text_plot + } else if (plot_type == "stacked") { + # Split the stacked values into separate columns + stacked_data <- data %>% + separate(column_name, into = paste0(column_name, "_", 1:4), sep = ",", convert = TRUE, extra = "drop") %>% + pivot_longer(cols = starts_with(column_name), names_to = "stack", values_to = "value") %>% + mutate(stack = factor(stack, levels = paste0(column_name, "_", 1:4))) + + stacked_plot <- ggplot(stacked_data, aes(x = value, y = factor(species, levels = rev(tree$tip.label)), fill = stack)) + + geom_bar(stat = "identity", position = "fill") + + theme_minimal() + + theme(axis.title.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + panel.grid.major.y = element_blank(), # Remove horizontal grid lines + panel.grid.minor.y = element_blank(), + plot.margin = margin(0, 0, 0, 0)) + + labs(x = column_name) + + ggtitle(column_name) + + # Extract the legend from the stacked plot + legend_plot <- extract_legend(stacked_plot) + + # Remove the legend from the stacked plot + stacked_plot <- stacked_plot + theme(legend.position = "none") + + plots[[length(plots) + 1]] <- stacked_plot + } else { + cat("Unknown plot type:", plot_type, "for column:", column_name, "\n") + } +} + +# Debugging: Print the list of plots +cat("List of plots:\n") +print(plots) + +# Combine the plots if there are any +if (length(plots) > 0) { + combined_plot <- plot_grid(tree_plot, plot_grid(plotlist = plots, ncol = length(plots)), ncol = 2, rel_widths = c(args$tree_size, 1 - args$tree_size)) # Adjust widths based on tree_size + + # Save the combined plot to a PDF file + ggsave("Phyloplot_quast.pdf", plot = combined_plot, width = 10, height = 8) + + # Save the legend to a separate PDF file + if (!is.null(legend_plot)) { + legend_plot <- cowplot::plot_grid(legend_plot) + ggsave("Legend.pdf", plot = legend_plot, width = 10, height = 2) + } +} else { + cat("No plots to combine.\n") +} + +# Print warnings +warnings() diff --git a/bin/tree_summary.R b/bin/tree_summary.R deleted file mode 100755 index 3789c87..0000000 --- a/bin/tree_summary.R +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/Rscript - -# Written by Chris Wyatt and Fernando Duarte and released under the MIT license. -# Plots a phylogenetic tree alongside BUSCO and Quast stats - -# Load necessary libraries -if (!requireNamespace("argparse", quietly = TRUE)) { - install.packages("argparse") -} -if (!requireNamespace("ggtree", quietly = TRUE)) { - BiocManager::install("ggtree") -} -if (!requireNamespace("ggplot2", quietly = TRUE)) { - install.packages("ggplot2") -} -if (!requireNamespace("cowplot", quietly = TRUE)) { - install.packages("cowplot") -} -if (!requireNamespace("tidyr", quietly = TRUE)) { - install.packages("tidyr") -} -if (!requireNamespace("ggtreeExtra", quietly = TRUE)) { - BiocManager::install("ggtreeExtra") -} -if (!requireNamespace("ggimage", quietly = TRUE)) { - install.packages("ggimage") -} - -library(ggtree) -library(ggplot2) -library(cowplot) -library(argparse) -library(dplyr) -library(tidyr) -library(ggimage) -library(ggtreeExtra) - -# Parse command-line arguments -parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') -parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') -parser$add_argument('busco_file', type = 'character', help = 'Path to processed BUSCO output file') -parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') -parser$add_argument('--text_size', type = 'double', default = 2.5, help = 'Tree tip text size for the plots') -parser$add_argument('--pie_size', type = 'double', default = 0, help = 'Increase pie size of each tip by this fold') -parser$add_argument('--pie_hjust', type = 'double', default = 0.4, help = 'Horizontally adjust pie positions') -#parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') -args <- parser$parse_args() - -# Create a function so that, if the Quast data is off limits and warnings related -# to this pop up , margins will be increased by the value "step" on each iteration -increase_scale_until_no_warnings <- function(plot, initial_limit = 5, step = 0.1, max_iterations = 50) { - # Initialize limit and warning flag - current_limit <- initial_limit - warning_flag <- TRUE - iteration <- 0 - - while (warning_flag && iteration < max_iterations) { - iteration <- iteration + 1 - # Update the plot with the current limit - updated_plot <- plot + scale_x_continuous(limits = c(0, current_limit)) - - # Capture warnings - warnings <- tryCatch({ - print(updated_plot) # Render the plot - NULL # No warnings - }, warning = function(w) { - w$message # Capture warning message - }) - - # Check if there are warnings - if (is.null(warnings)) { - warning_flag <- FALSE # No warnings, exit loop - } else { - # Increase the limit for the next iteration - current_limit <- current_limit + step - } - } - - # Return the final plot - if (iteration >= max_iterations) { - warning("Maximum iterations reached. Plot may still have issues.") - } - return(updated_plot) -} - -# Read the Newick tree from the file -tree <- read.tree(args$tree_file) - -# Clean tree tip labels -tree$tip.label <- trimws(tree$tip.label) -tree$tip.label <- tolower(tree$tip.label) - -# Read the data table from the file, ensuring species column is read as character -# Load BUSCO -data_busco <- read.csv(args$busco_file, sep = "\t", colClasses = c("Input_file" = "character")) - -# Prepare BUSCO data -data_busco <- data_busco %>% - # Remove extension from Input_file - mutate(Input_file = tools::file_path_sans_ext(Input_file)) %>% - # Rename 'Input_file' to 'species' - rename(species = Input_file) %>% - # Make everything lowercase - mutate(species = tolower(species)) - -# Load Quast data -data_quast <- read.csv(args$quast_file, sep = "\t") - -#Tidy Quast data -data_quast <- data_quast %>% - # Remove the row where species is NA - filter(!is.na(species)) %>% - # Make species lowercase - mutate(species = tolower(species)) %>% - # Convert wide to long format - pivot_longer(cols = c(N50, N90), - names_to = "metric", - values_to = "value") %>% - # Remove any remaining "bar" rows if necessary (check Chris script) - filter(value != "bar") %>% - # Convert value column to numeric if needed - mutate(value = as.numeric(value)) - -# Debugging: Print species names from the tree and the data -cat("Species names in the tree based on nw:\n") -cat(tree$tip.label) -cat("\nSpecies names in data tables:\n") -quast_sp <- unique(data_quast$species) -cat("BUSCO:", data_busco$species, "\nQUAST:", quast_sp, "\n") - -# Debugging: Check if there are any mismatches in species names (might need to be -# changed in the future to support optional stats) -if (all(sort(tree$tip.label) != sort(data_busco$species))) { - stop("Species names in BUSCO and tree labels do not match") -} else if (all(sort(quast_sp) != sort(data_busco$species))) { - stop("Species names in Quast and tree labels do not match") -} - -# Arrange data according to tree labels, so that they are plotted correctly -# For BUSCO -data_busco <- data_busco %>% - arrange(match(species, tree$tip.label)) - -# For Quast -data_quast <- data_quast %>% - arrange(match(species, tree$tip.label)) - -# Node number needed in BUSCO dataframe for nodpie. This is a generic 1-n -# numbering (n=number of species) -data_busco <- data_busco %>% - mutate(node = seq_len(n())) - -# Plot the phylogenetic tree -tree_plot <- ggtree(tree, branch.length = "none") + # No branch length, only topology - geom_tiplab(size=args$text_size) + # Tip font size - ggplot2::xlim(0, 5) + - ggtitle("Phylogenetic Tree") + - theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins - -# Save tree only -ggsave("tree_only.pdf", tree_plot) - -# Create list of pies from BUSCO data -pies <- nodepie(data_busco, - cols = 4:7, - color = c( - "Single" = "steelblue", - "Duplicated" = "orange", - "Fragmented" = "purple", - "Missing" = "red" - ) -) - -print("Check 1") - -# Create a dummy pie as for the legend it seems impossible to extract it -# from nodpie -dummy_pie_data <- data.frame( - Type = factor(c("Single", "Duplicated", "Fragmented", "Missing")), - value = c(1, 1, 1, 1) # Dummy values for equal slices -) - -dummy_pie <- ggplot(dummy_pie_data) + - geom_bar( - aes(x = "", y = value, fill = Type), - stat = "identity", - width = 1 - ) + - coord_polar("y") + - scale_fill_manual( - values = c( - "Single" = "steelblue", - "Duplicated" = "orange", - "Fragmented" = "purple", - "Missing" = "red" - ), - name = "BUSCO" - ) + - theme_void() + - theme(legend.position = "right") # Show the legend - -print("Check 2") - -# Create variable for standarization of legends format -legends_theme <- theme(legend.title = element_text(size = 10, face = "bold"), - legend.text = element_text(size = 8), - legend.key.size = unit(0.5, "cm")) - -# Extract legend of dummy BUSCO pie chart -legend_busco <- cowplot::get_legend(dummy_pie + legends_theme) - -print("Check 2.5") - -# Plot BUSCO pies next to tree tips -p <- ggtree::inset(tree_plot, - pies, - width=(args$pie_size * .16 + .16), - height=(args$pie_size * .6 + .6), - hjust=-1.2) - -print("Check 3") - -# Plot Quast data -p2 <- p + geom_fruit(data=data_quast, - geom = geom_bar, - mapping = aes(y=species, x=value, fill=metric), - position=position_dodgex(), - pwidth=0.38, - orientation="y", - stat="identity", - axis.params = list( - axis = "x", - text.size = 1.8, - hjust = 1, - vjust = 0.5, - nbreak = 3 - ), - grid.params = list(), - offset = 1, - width = 0.4) + - labs(fill = "Quast") + - scale_x_continuous(limits=c(0,5)) # limits=c(0,5) by default for p2 - -print("Check 4") - -# -p2 <- increase_scale_until_no_warnings(p2) - -# Save plot without legend -ggsave("Phyloplot_no_legend.pdf", p2 + theme(legend.position="none")) - -# Extract Quast legend -legend_quast <- cowplot::get_legend(p2 + legends_theme) - -# Wrap the legends in fixed-sized containers and adjust margins so that legends -# are perfectly aligned -legend_busco <- ggdraw(legend_busco) + theme(plot.margin = margin(0, 0, 0, 0)) -legend_quast <- ggdraw(legend_quast) + theme(plot.margin = margin(0, 29, 60, 0)) - -# Combine both BUSCO and Quast legends -combined_legends <- plot_grid(legend_busco, - legend_quast, - NULL, # Dummy rows for better positioning - NULL, - nrow = 4, - rel_widths = c(1, 1)) - -# Save legend -ggsave("legend.pdf", plot = combined_legends) - -# Add combined legends to tree plot -p3 <- plot_grid(p2 + theme(legend.position = "none"), - combined_legends, - ncol = 2, - rel_widths = c(4, 1)) - -# Save full plot -ggsave("Phyloplot_complete.pdf", plot = p3) diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index 696695f..54bc4c2 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -20,7 +20,7 @@ process TREE_SUMMARY { #Remove unwanted extensions in the tree file sed \'s/\\.prot\\.fa\\.largestIsoform//g\' ${tree}/Species_Tree/SpeciesTree_rooted_node_labels.txt > tree.nw - + # Combine the BUSCO outputs and remove empty tabs head -qn 1 *.txt | head -n 1 > Busco_combined tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined @@ -29,7 +29,7 @@ process TREE_SUMMARY { quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90 -plot_types bar,bar # Run summary plot - tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv + plot_tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": From 3d99a0d769133862acb56e478c9c06884d64108b Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Fri, 27 Dec 2024 13:46:27 +0000 Subject: [PATCH 05/11] Update tree plot module --- bin/gene_overlaps.R | 5 ++++- modules/local/tree_summary.nf | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/gene_overlaps.R b/bin/gene_overlaps.R index 80f3ed2..11a80de 100755 --- a/bin/gene_overlaps.R +++ b/bin/gene_overlaps.R @@ -2,6 +2,7 @@ # Code written by Chris Wyatt with some editing by ChatGPT +library(dplyr) library(GenomicRanges) # Parse command-line arguments @@ -19,6 +20,9 @@ read_gff_to_granges <- function(gff_file) { gff_data <- read.delim(gff_file, header = FALSE, comment.char = "#") colnames(gff_data) <- c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute") + gff_data <- gff_data %>% + filter(strand %in% c("-","+")) + gr <- GRanges( seqnames = gff_data$seqname, ranges = IRanges(start = gff_data$start, end = gff_data$end), @@ -67,7 +71,6 @@ for (i in seq_len(length(overlap_results))) { if (query_idx != subject_idx) { query_gene <- genes[query_idx] subject_gene <- genes[subject_idx] - overlap_range <- intersect(ranges(query_gene), ranges(subject_gene)) overlap_length <- width(overlap_range) diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index 54bc4c2..4bffe87 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -26,7 +26,7 @@ process TREE_SUMMARY { tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined # Combine QUAST ouput - quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90 -plot_types bar,bar + quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90,"Total length","GC (%)" -plot_types bar,bar,bar,bar # Run summary plot plot_tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv From 32d3ea69b9e8112b37256ee4f937d284ac2fc961 Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Mon, 30 Dec 2024 18:46:20 +0100 Subject: [PATCH 06/11] Added number_seqs.nf --- bin/gene_overlaps_table.py | 62 +++++++++++++++++++++ conf/modules.config | 7 +++ modules/local/gene_overlaps.nf | 2 +- modules/local/number_seqs.nf | 27 +++++++++ modules/local/tree_summary.nf | 13 ++++- subworkflows/local/genome_and_annotation.nf | 13 ++++- 6 files changed, 119 insertions(+), 5 deletions(-) create mode 100755 bin/gene_overlaps_table.py create mode 100644 modules/local/number_seqs.nf diff --git a/bin/gene_overlaps_table.py b/bin/gene_overlaps_table.py new file mode 100755 index 0000000..946c121 --- /dev/null +++ b/bin/gene_overlaps_table.py @@ -0,0 +1,62 @@ +#!/usr/bin/python3 + +# Written using ChatGPT + +import pandas as pd +import argparse + +# Set up the argument parser +parser = argparse.ArgumentParser(description='Extract gene statistics from files.') +parser.add_argument('input_files', nargs='+', help='List of input files.') +parser.add_argument('output_file', help='Path to save the output TSV file.') +parser.add_argument('--include-sense', action='store_true', help='Include sense genes count.') +parser.add_argument('--include-antisense', action='store_true', help='Include antisense genes count.') + +# Parse the arguments +args = parser.parse_args() + +# Initialize an empty list to store the results +results = [] + +# Process each input file +for file in args.input_files: + try: + # Load the file into a DataFrame + df = pd.read_csv(file, sep='\t', header=None, names=['Statistic', 'Count']) + + # Extract required statistics + total_genes = df.loc[df['Statistic'] == 'Total number of genes', 'Count'].values[0] + overlapping_genes = df.loc[df['Statistic'] == 'Total number of overlapping genes', 'Count'].values[0] + + # Optional statistics + sense_genes = df.loc[df['Statistic'] == 'Number of genes fully contained in sense direction', 'Count'].values[0] if args.include_sense else "NA" + antisense_genes = df.loc[df['Statistic'] == 'Number of genes fully contained in antisense direction', 'Count'].values[0] if args.include_antisense else "NA" + + # Collect results in a dictionary + entry = { + 'File': file, + 'Total_genes': total_genes, + 'Overlapping_genes': overlapping_genes, + } + if args.include_sense: + entry['Sense_genes'] = sense_genes + if args.include_antisense: + entry['Antisense_genes'] = antisense_genes + + results.append(entry) + except Exception as e: + print(f"Error processing {file}: {e}") + continue + +# Convert the results to a DataFrame +columns = ['File', 'Total_genes', 'Overlapping_genes'] +if args.include_sense: + columns.append('Fully_contained_sense_genes') +if args.include_antisense: + columns.append('Fully_contained_antisense_genes') + +result_df = pd.DataFrame(results, columns=columns) + +# Write the result to the output file +result_df.to_csv(args.output_file, sep='\t', index=False) +print(f"Extraction completed successfully. Output saved to {args.output_file}.") \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index 3079ef8..1a418ef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -175,4 +175,11 @@ process { ] } + withName: 'NUMBER_SEQS' { + publishDir = [ + path: { "${params.outdir}/number_seqs" }, + mode: params.publish_dir_mode + ] + } + } diff --git a/modules/local/gene_overlaps.nf b/modules/local/gene_overlaps.nf index 1c81522..54085f1 100644 --- a/modules/local/gene_overlaps.nf +++ b/modules/local/gene_overlaps.nf @@ -8,7 +8,7 @@ process GENE_OVERLAPS { output: tuple val(meta), path("Summary.*.tsv"), emit: overlap_detailed_summary - tuple val(meta), path("Count.*.tsv"), emit: overlap_counts + tuple val(meta), path("Count.*.tsv") , emit: overlap_counts path "versions.yml", emit: versions when: diff --git a/modules/local/number_seqs.nf b/modules/local/number_seqs.nf new file mode 100644 index 0000000..0b20c5c --- /dev/null +++ b/modules/local/number_seqs.nf @@ -0,0 +1,27 @@ +process NUMBER_SEQS { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::coreutils=8.31" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path( "${meta.id}.n_seqs.tsv" ), emit: n_seqs + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + # Add header + echo -e "species\tchromosome_count\tsequence_count" > ${prefix}.n_seqs.tsv + + # Use grep to cunt the number of chromosomes and sequences + echo -e "${prefix}\t\$(grep -i '^>.*\\(chromosome\\|chr\\)' $fasta | wc -l)\t\$(grep -c '^>' $fasta)" >> ${prefix}.n_seqs.tsv + """ + +} \ No newline at end of file diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index 4bffe87..0e6d727 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -13,16 +13,23 @@ process TREE_SUMMARY { path( "P*.pdf" ), emit: figure path( "versions.yml" ), emit: versions - script: def prefix = task.ext.prefix ?: "${meta.id}" - """ + """ #Remove unwanted extensions in the tree file sed \'s/\\.prot\\.fa\\.largestIsoform//g\' ${tree}/Species_Tree/SpeciesTree_rooted_node_labels.txt > tree.nw + # Combine GENE OVERLAPS outputs + gene_overlaps_table.py Count.*.tsv gene_stats.tsv --include-sense --include-antisense + + # Combine NUMBER SEQS outputs + # Add header + echo -e "species\tchromosome_count\tsequence_count" > number_seqs.tsv + tail -n +2 -q *.n_seqs.tsv >> number_seqs.tsv + # Combine the BUSCO outputs and remove empty tabs - head -qn 1 *.txt | head -n 1 > Busco_combined + head -qn 1 *.txt | head -n 1 > Busco_combined tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined # Combine QUAST ouput diff --git a/subworkflows/local/genome_and_annotation.nf b/subworkflows/local/genome_and_annotation.nf index 08b3a7c..a325bc8 100644 --- a/subworkflows/local/genome_and_annotation.nf +++ b/subworkflows/local/genome_and_annotation.nf @@ -9,6 +9,7 @@ include { GFFREAD } from '../../modules/nf-core/gffr include { ORTHOFINDER } from '../../modules/nf-core/orthofinder/main' include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator/main' include { GENE_OVERLAPS } from '../../modules/local/gene_overlaps' +include { NUMBER_SEQS } from '../../modules/local/number_seqs' workflow GENOME_AND_ANNOTATION { @@ -52,12 +53,21 @@ workflow GENOME_AND_ANNOTATION { | combine(ch_gff_agat, by:0) // by:0 | Only combine when both channels share the same id | combine(ch_gff_long, by:0) | multiMap { - meta, fasta, gff_unfilt, gff_filt -> // null is is probably not necessary + meta, fasta, gff_unfilt, gff_filt -> // "null" probably not necessary fasta : fasta ? tuple( meta, file(fasta) ) : null // channel: [ val(meta), [ fasta ] ] gff_unfilt : gff_unfilt ? tuple( meta, file(gff_unfilt) ) : null // channel: [ val(meta), [ gff ] ], unfiltered gff_filt : gff_filt ? tuple( meta, file(gff_filt) ) : null // channel: [ val(meta), [ gff ] ], filtered for longest isoform } + // + // Check number of sequences + // + + NUMBER_SEQS ( + ch_input.fasta + ) + ch_tree_data = ch_tree_data.mix(NUMBER_SEQS.out.n_seqs.collect { meta, file -> file }) + // // Run AGAT Spstatistics // @@ -74,6 +84,7 @@ workflow GENOME_AND_ANNOTATION { GENE_OVERLAPS { ch_input.gff_filt } + ch_tree_data = ch_tree_data.mix(GENE_OVERLAPS.out.overlap_counts.collect { meta, file -> file }) // // MODULE: Run Quast From 871db0e4beccf70481d9b79091938b7056c10bd5 Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Mon, 30 Dec 2024 20:42:15 +0100 Subject: [PATCH 07/11] Fixed bin/gene_overlaps_table.py --- bin/gene_overlaps_table.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/gene_overlaps_table.py b/bin/gene_overlaps_table.py index 946c121..9df003e 100755 --- a/bin/gene_overlaps_table.py +++ b/bin/gene_overlaps_table.py @@ -31,7 +31,8 @@ # Optional statistics sense_genes = df.loc[df['Statistic'] == 'Number of genes fully contained in sense direction', 'Count'].values[0] if args.include_sense else "NA" antisense_genes = df.loc[df['Statistic'] == 'Number of genes fully contained in antisense direction', 'Count'].values[0] if args.include_antisense else "NA" - + print(sense_genes) + # Collect results in a dictionary entry = { 'File': file, @@ -39,9 +40,9 @@ 'Overlapping_genes': overlapping_genes, } if args.include_sense: - entry['Sense_genes'] = sense_genes + entry['Fully_contained_sense_genes'] = sense_genes if args.include_antisense: - entry['Antisense_genes'] = antisense_genes + entry['Fully_contained_antisense_genes'] = antisense_genes results.append(entry) except Exception as e: From 6138cdfbc76e34cdaa2755d52e5f5dd1f9dd39c2 Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Mon, 13 Jan 2025 10:38:40 +0000 Subject: [PATCH 08/11] Added contigs column to quast table --- modules/local/tree_summary.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index 0e6d727..638f991 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -33,7 +33,7 @@ process TREE_SUMMARY { tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined # Combine QUAST ouput - quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90,"Total length","GC (%)" -plot_types bar,bar,bar,bar + quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90,"Total length","GC (%)","# contigs" -plot_types bar,bar,bar,bar # Run summary plot plot_tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv From 484194372d7c63e0797420fbba8f6cabe01440cb Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Thu, 16 Jan 2025 13:54:55 +0000 Subject: [PATCH 09/11] Fixed tree plot not working --- bin/plot_tree_summary.R | 586 +++++++++++++++++---------- conf/modules.config | 12 +- modules/local/tree_summary.nf | 10 +- nextflow.config | 1 + nextflow_schema.json | 735 +++++++++++++++++----------------- 5 files changed, 759 insertions(+), 585 deletions(-) diff --git a/bin/plot_tree_summary.R b/bin/plot_tree_summary.R index af2f358..a8c2c7b 100755 --- a/bin/plot_tree_summary.R +++ b/bin/plot_tree_summary.R @@ -1,87 +1,84 @@ #!/usr/bin/Rscript -# Written by Chris Wyatt and Fernando Duarte and released under the MIT license. -# Plots a phylogenetic tree alongside BUSCO and Quast stats +# Written by Chris Wyatt and Fernando Duarte and released under the MIT license. +# Plots the phylogenetic tree with BUSCO result in pie charts and Quast -# Load necessary libraries -if (!requireNamespace("argparse", quietly = TRUE)) { - install.packages("argparse") -} -if (!requireNamespace("ggtree", quietly = TRUE)) { - BiocManager::install("ggtree") -} -if (!requireNamespace("ggplot2", quietly = TRUE)) { - install.packages("ggplot2") -} -if (!requireNamespace("cowplot", quietly = TRUE)) { - install.packages("cowplot") -} -if (!requireNamespace("tidyr", quietly = TRUE)) { - install.packages("tidyr") -} -if (!requireNamespace("ggtreeExtra", quietly = TRUE)) { - BiocManager::install("ggtreeExtra") +#--------FUNCTIONS-------# + +# Function to rotate legends +rotate_grob <- function(grob, angle) { + gTree(children = gList(grob), vp = viewport(angle = angle)) } -if (!requireNamespace("ggimage", quietly = TRUE)) { - install.packages("ggimage") + +# Function to plot tree and plots +build_tree_plot <- function(tree, n, plots, legends, xlimit, rigth_margin, bottom_margin) { #xlim for legends, use same xlim as barplots (new_xlim) + # Update xlim for the tree plot + tree <- tree + ggplot2::xlim(0, n * max(tree$data$branch.length)) + + # Initialize combined plot with the tree + combined_plots <- tree + widths <- c(10 - length(plots)) # Dynamic width for the tree + + # Add each additional plot + for (plot in plots) { + combined_plots <- combined_plots | plot + widths <- c(widths, 1) # Same widths for plots and legends + } + + # Initialize combined legends with empty plot aligned w/ tree + combined_legends <- plot_spacer() + xlimit + + # Add each additional legend + for (legend in legends) { + if (length(legend) != 0) { + legend <- legend#wrap_elements(rotate_grob(legend, -45)) + xlimit + } else { + legend <- plot_spacer() + xlimit + } + combined_legends <- combined_legends | legend + } + + # Apply the layout widths + combined_plots <- combined_plots + plot_layout(widths = widths) + + combined_legends <- combined_legends + + plot_layout(widths = widths) + + theme(plot.margin = margin(0, rigth_margin, bottom_margin, 0)) + #theme(plot.margin = margin(0, 15, 60, 0)) + + combined_plots <- combined_plots / combined_legends + + plot_layout(heights = c(0.99, 0.01)) + + return(combined_plots) } library(ggtree) library(ggplot2) -library(cowplot) +#library(cowplot) +library(patchwork) library(argparse) library(dplyr) library(tidyr) -library(ggimage) -library(ggtreeExtra) +#library(ggtreeExtra) +library(scatterpie) +library(scales) +#library(grid) +#library(gridExtra) # Parse command-line arguments parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') parser$add_argument('busco_file', type = 'character', help = 'Path to processed BUSCO output file') parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') -parser$add_argument('--text_size', type = 'double', default = 2.5, help = 'Tree tip text size for the plots') -parser$add_argument('--pie_size', type = 'double', default = 0, help = 'Increase pie size of each tip by this fold') -parser$add_argument('--pie_hjust', type = 'double', default = 0.4, help = 'Horizontally adjust pie positions') -#parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') +parser$add_argument('genes_file', type = 'character', help = 'Path to gene stats output file') +parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') +parser$add_argument('--tree_size', type = 'double', default = 4.5, help = 'x axis width for tree plot') args <- parser$parse_args() -# Create a function so that, if the Quast data is off limits and warnings related -# to this pop up , margins will be increased by the value "step" on each iteration -increase_scale_until_no_warnings <- function(plot, initial_limit = 5, step = 0.1, max_iterations = 50) { - # Initialize limit and warning flag - current_limit <- initial_limit - warning_flag <- TRUE - iteration <- 0 - - while (warning_flag && iteration < max_iterations) { - iteration <- iteration + 1 - # Update the plot with the current limit - updated_plot <- plot + scale_x_continuous(limits = c(0, current_limit)) - - # Capture warnings - warnings <- tryCatch({ - print(updated_plot) # Render the plot - NULL # No warnings - }, warning = function(w) { - w$message # Capture warning message - }) - - # Check if there are warnings - if (is.null(warnings)) { - warning_flag <- FALSE # No warnings, exit loop - } else { - # Increase the limit for the next iteration - current_limit <- current_limit + step - } - } +#setwd("/Users/fernando/work/repositories/pipelines/make_better_tree_plot") - # Return the final plot - if (iteration >= max_iterations) { - warning("Maximum iterations reached. Plot may still have issues.") - } - return(updated_plot) -} +# Avoid scientific notation in all plots +options(scipen = 999) # Read the Newick tree from the file tree <- read.tree(args$tree_file) @@ -90,6 +87,9 @@ tree <- read.tree(args$tree_file) tree$tip.label <- trimws(tree$tip.label) tree$tip.label <- tolower(tree$tip.label) +# Capitalize first letter of tip labels +tree$tip.label <- sub("^(\\w)(.*)", "\\U\\1\\L\\2", tree$tip.label, perl = TRUE) + # Read the data table from the file, ensuring species column is read as character # Load BUSCO data_busco <- read.csv(args$busco_file, sep = "\t", colClasses = c("Input_file" = "character")) @@ -99,180 +99,342 @@ data_busco <- data_busco %>% # Remove extension from Input_file mutate(Input_file = tools::file_path_sans_ext(Input_file)) %>% # Rename 'Input_file' to 'species' - rename(species = Input_file) %>% - # Make everything lowercase - mutate(species = tolower(species)) + rename(species = Input_file) -# Load Quast data +# Load Quast data_quast <- read.csv(args$quast_file, sep = "\t") -#Tidy Quast data +# Change header of GC% and contigs column +colnames(data_quast)[5] <- "GC" +colnames(data_quast)[6] <- "Sequences" + +#Prepare Quast data data_quast <- data_quast %>% # Remove the row where species is NA filter(!is.na(species)) %>% - # Make species lowercase - mutate(species = tolower(species)) %>% - # Convert wide to long format - pivot_longer(cols = c(N50, N90), - names_to = "metric", - values_to = "value") %>% # Remove any remaining "bar" rows if necessary (check Chris script) - filter(value != "bar") %>% - # Convert value column to numeric if needed - mutate(value = as.numeric(value)) + filter(N50 != "bar") %>% + # Total length values to Mb + mutate(Total.length = (as.numeric(Total.length)/1000000)) %>% + # Create new col with numbers of GC bp + mutate(GC = as.numeric(Total.length)*as.numeric(GC)/100) %>% + # Rename column to make it shorter + rename(Length = Total.length) + +# Load gene stats +data_genes <- read.csv(args$genes_file, sep = "\t") + +# Prepare gene stats +data_genes <- data_genes %>% + # Rename columns + rename(species = File) %>% + rename(Total = Total_genes) %>% + rename(Overlapping = Overlapping_genes) %>% + # Remove prefix and suffix + mutate(species = gsub("Count\\.|\\.tsv", "", species)) # Debugging: Print species names from the tree and the data cat("Species names in the tree based on nw:\n") -cat(tree$tip.label) +print(tree$tip.label) cat("\nSpecies names in data tables:\n") quast_sp <- unique(data_quast$species) -cat("BUSCO:", data_busco$species, "\nQUAST:", quast_sp, "\n") +cat("BUSCO:", data_busco$species, "\nQUAST:", quast_sp) -# Debugging: Check if there are any mismatches in species names (might need to be -# changed in the future to support optional stats) +# Debugging: Check if there are any mismatches in species names if (all(sort(tree$tip.label) != sort(data_busco$species))) { stop("Species names in BUSCO and tree labels do not match") } else if (all(sort(quast_sp) != sort(data_busco$species))) { stop("Species names in Quast and tree labels do not match") } -# Arrange data according to tree labels, so that they are plotted correctly -# For BUSCO -data_busco <- data_busco %>% - arrange(match(species, tree$tip.label)) +# Get order of tips +tree_plot <- ggtree(tree) # Temporary plot for get_taxa_name() +tips_order <- rev(get_taxa_name(tree_plot)) -# For Quast +# Arrange data according to tree labels +data_busco <- data_busco %>% + arrange(match(species, tips_order)) data_quast <- data_quast %>% - arrange(match(species, tree$tip.label)) - -# Node number needed in BUSCO dataframe for nodpie. This is a generic 1-n -# numbering (n=number of species) + arrange(match(species, tips_order)) %>% + mutate(node = 1:length(species)) +data_genes <- data_genes %>% + arrange(match(species, tips_order)) %>% + mutate(node = 1:length(species)) + +# Match names with new tree tips (only necessary for Quast) +# data_quast$species <- gsub("_", " ", data_quast$species) + +# Tidy Quast data +# For N50/N90 +data_quast_n5090 <- data_quast %>% + # Convert wide to long format + pivot_longer(cols = c(N50, N90), + names_to = "metric", + values_to = "value") %>% + # Convert value column to numeric if needed + mutate(value = as.numeric(value)) %>% + mutate(value = (as.numeric(value)/1000000)) # Values in Mb + +# For GC content and length +data_quast_len <- data_quast %>% + pivot_longer(cols = c(GC, Length), + names_to = "metric", + values_to = "value") + +# Tidy gene stats data +data_genes <- data_genes %>% + pivot_longer(cols = c(Total, Overlapping), + names_to = "stat", + values_to = "value") + +# Add node column data_busco <- data_busco %>% - mutate(node = seq_len(n())) + mutate(node = 1:length(species)) # Node number needed for nodpie -# Plot the phylogenetic tree -tree_plot <- ggtree(tree, branch.length = "none") + # No branch length, only topology - geom_tiplab(size=args$text_size) + # Tip font size - ggplot2::xlim(0, 5) + - ggtitle("Phylogenetic Tree") + - theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins +# This is for the synteny paper, remove "_" and changes the first letter to uppercase +tree$tip.label <- gsub("_", " ", tree$tip.label) + +# Plot number of chromosomes/sequences +ch_plot <- ggplot(data_quast, aes(x=1, y=node)) + + geom_text(aes(label = Sequences)) + + theme_void() + + ggtitle("Sequence\nnumber") + + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = -2.2)) + +# Now with the bar plots. Set standard theme for all barplots +barplots_theme <- theme_classic() + + theme( + axis.text.y=element_blank(), + axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1, size = 6), + axis.ticks.y=element_blank(), + axis.line.x = element_line(), + axis.line.y = element_blank() + ) + +# Prepare Quast data for plotting +data_quast_n50 <- data_quast_n5090[data_quast_n5090$metric %in% "N50",] +#data_quast_n90 <- data_quast_n5090[data_quast_n5090$metric %in% "N90",] + +# Plot Quast data genome size +len_plot <- ggplot( + data_quast_len, + aes(y=value, x=node) +) + + geom_col( + aes(fill=metric), + position = position_stack(reverse = TRUE), + width = 0.7 + ) + + scale_fill_manual(values = c("brown1", "cornflowerblue")) + + ggtitle("Genome\nsize") + + barplots_theme + + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = -5)) + + coord_flip() + #Flip plot + xlab(NULL) + + ylab(NULL) + +# Extract legend +legend_len <- cowplot::get_legend( + len_plot + + theme(legend.position = "right", + legend.justification = c(0, 1.2), # This is what actually move the legend, play with it, default position is c(1,0.5) + legend.title = element_blank(), + legend.key.size = unit(0.2, "cm"), + legend.background = element_rect(fill = NA), + legend.text = element_text(size = 8)) +) + +# Display the legend alone +cowplot::ggdraw() + cowplot::draw_grob(legend_len) + +# Remove legend +len_plot <- len_plot + guides(fill="none") + +# Plot Quast data N50 +n50_plot <- ggplot( + data_quast_n50, + aes(y=value, x=node) +) + + geom_col( + position = position_stack(reverse = TRUE), # For GC% + width = 0.7, + fill = "steelblue" + ) + + ggtitle("N50") + + barplots_theme + + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = -0.4)) + + coord_flip() + + xlab(NULL) + + ylab(NULL) + +# Remove legend +n50_plot <- n50_plot + guides(fill="none") + +# Create the scatterpie plot +pies_plot <- ggplot() + + geom_scatterpie( + aes(x = 1, y = node, group = species, r = 0.4), # r determines the radius of the pies + data = data_busco, + cols = c("Single", "Duplicated", "Fragmented", "Missing"), + color = NA + ) + + scale_fill_manual(values = c("deepskyblue", "orange", "darkorchid4", "firebrick1")) + + coord_fixed() + + theme_void() + + ggtitle("BUSCO") + + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = 0.05)) + +# Extract legend +legend_busco <- cowplot::get_legend( + pies_plot + + #guides(fill=guide_legend(ncol=2)) + + theme(legend.position = "right", + legend.justification = c(2.5, 1.08), + legend.title = element_blank(), + legend.key.size = unit(0.2, "cm"), + #legend.background = element_rect(fill = NA), # I don't know why this doesn't work, if I set this to NA an outline appears around the legend + legend.text = element_text(size = 8)) +) -# Save tree only -ggsave("tree_only.pdf", tree_plot) - -# Create list of pies from BUSCO data -pies <- nodepie(data_busco, - cols = 4:7, - color = c( - "Single" = "steelblue", - "Duplicated" = "orange", - "Fragmented" = "purple", - "Missing" = "red" - ) +# Display the legend alone +cowplot::ggdraw() + cowplot::draw_grob(legend_busco) + +# Remove lenged for pieplot +pies_plot <- pies_plot + guides(fill="none") + +# Plot gene stats + +gene_plot <- ggplot( + data_genes, + aes(y=value, x=node) +) + + geom_col( + aes(fill=stat), + position = position_stack(reverse = TRUE), + width = 0.7 + ) + + scale_fill_manual(values = c("indianred1", "lightsteelblue")) + + ggtitle("Gene\nnumber") + + barplots_theme + + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = -5)) + + coord_flip() + #Flip plot + scale_y_continuous(breaks = pretty_breaks(n = 3)) + + xlab(NULL) + + ylab(NULL) + + theme(legend.position = "bottom", legend.direction = "vertical", legend.title = element_blank()) #+ + +# Extract legend +legend_gene <- cowplot::get_legend( + gene_plot + + theme(legend.position = "right", + legend.justification = c(0, 1.2), + legend.title = element_blank(), + legend.key.size = unit(0.2, "cm"), + legend.background = element_rect(fill = NA), + legend.text = element_text(size = 8)) ) -print("Check 1") +# Display the legend alone +cowplot::ggdraw() + cowplot::draw_grob(legend_gene) + +# Remove legend from pie plot +pies_plot <- pies_plot + guides(fill="none") -# Create a dummy pie as for the legend it seems impossible to extract it -# from nodpie -dummy_pie_data <- data.frame( - Type = factor(c("Single", "Duplicated", "Fragmented", "Missing")), - value = c(1, 1, 1, 1) # Dummy values for equal slices +# Remove legend from gene plot +gene_plot <- gene_plot + guides(fill="none") + +# To match tips with plots it's necessary to set the same ylim for all plots +# Select the biggest range to avoid cropping +p_ranges_y <- c( + ggplot_build(ch_plot)$layout$panel_scales_y[[1]]$range$range, + ggplot_build(len_plot)$layout$panel_scales_x[[1]]$range$range, + ggplot_build(n50_plot)$layout$panel_scales_x[[1]]$range$range, + ggplot_build(pies_plot)$layout$panel_scales_y[[1]]$range$range ) -dummy_pie <- ggplot(dummy_pie_data) + - geom_bar( - aes(x = "", y = value, fill = Type), - stat = "identity", - width = 1 - ) + - coord_polar("y") + - scale_fill_manual( - values = c( - "Single" = "steelblue", - "Duplicated" = "orange", - "Fragmented" = "purple", - "Missing" = "red" - ), - name = "BUSCO" - ) + - theme_void() + - theme(legend.position = "right") # Show the legend - -print("Check 2") - -# Create variable for standarization of legends format -legends_theme <- theme(legend.title = element_text(size = 10, face = "bold"), - legend.text = element_text(size = 8), - legend.key.size = unit(0.5, "cm")) - -# Extract legend of dummy BUSCO pie chart -legend_busco <- cowplot::get_legend(dummy_pie + legends_theme) - -print("Check 2.5") - -# Plot BUSCO pies next to tree tips -p <- ggtree::inset(tree_plot, - pies, - width=(args$pie_size * .16 + .16), - height=(args$pie_size * .6 + .6), - hjust=-1.2) - -print("Check 3") - -# Plot Quast data -p2 <- p + geom_fruit(data=data_quast, - geom = geom_bar, - mapping = aes(y=species, x=value, fill=metric), - position=position_dodgex(), - pwidth=0.38, - orientation="y", - stat="identity", - axis.params = list( - axis = "x", - text.size = 1.8, - hjust = 1, - vjust = 0.5, - nbreak = 3 - ), - grid.params = list(), - offset = 1, - width = 0.4) + - labs(fill = "Quast") + - scale_x_continuous(limits=c(0,5)) # limits=c(0,5) by default for p2 - -print("Check 4") - -# -p2 <- increase_scale_until_no_warnings(p2) - -# Save plot without legend -ggsave("Phyloplot_no_legend.pdf", p2 + theme(legend.position="none")) - -# Extract Quast legend -legend_quast <- cowplot::get_legend(p2 + legends_theme) - -# Wrap the legends in fixed-sized containers and adjust margins so that legends -# are perfectly aligned -legend_busco <- ggdraw(legend_busco) + theme(plot.margin = margin(0, 0, 0, 0)) -legend_quast <- ggdraw(legend_quast) + theme(plot.margin = margin(0, 29, 60, 0)) - -# Combine both BUSCO and Quast legends -combined_legends <- plot_grid(legend_busco, - legend_quast, - NULL, # Dummy rows for better positioning - NULL, - nrow = 4, - rel_widths = c(1, 1)) - -# Save legend -ggsave("legend.pdf", plot = combined_legends) - -# Add combined legends to tree plot -p3 <- plot_grid(p2 + theme(legend.position = "none"), - combined_legends, - ncol = 2, - rel_widths = c(4, 1)) - -# Save full plot -ggsave("Phyloplot_complete.pdf", plot = p3) +# Set new ylim based on the highest value taking into account both plots +# Add -0,1 and 0.5 to avoid the cropping of first and last pies +new_ylim <- ylim(c(min(p_ranges_y), max(p_ranges_y))) +# A new xlim is needed for barplots (equivalent to ylim), as these are flipped +# using coord_flip() +new_xlim <- xlim(c(min(p_ranges_y), max(p_ranges_y))) + +#n=5 + +#tree_plot <- tree_plot + ggplot2::xlim(0, n*max(tree2$edge.length)) + new_ylim + +# Set new ylim for chromosome tree +ch_plot <- ch_plot + new_ylim + +# Set new xlim for Quast genome size (equivalent to ylim) +len_plot <- len_plot + new_xlim + +# Set new xlim for Quast N50 (equivalent to ylim) +n50_plot <- n50_plot + new_xlim + +# Set new ylim for Quast pies +pies_plot <- pies_plot + new_ylim + +# Set new xlim for gene stats (equivalent to ylim) +gene_plot <- gene_plot + new_xlim + +# Build tree +tree_plot <- ggtree(tree) + + # Tip font size, should be an arg + geom_tiplab(size=3, fontface = "italic", align = TRUE) + + # This avoids the cropping of tips, it takes into consideration + # the max edge length to set the xlim + # This is later reset, the value here is for tree alone + # Might need to be increased a bit + ggplot2::xlim(0, 5*max(tree$edge.length)) + #2.5 as default + theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins + +# Set new ylim and xlim for tree +tree_plot <- tree_plot + new_ylim + +# Call the function +final_plot <- build_tree_plot( + tree = tree_plot, + n = args$tree_size, # Only affects tree_plot (default 4.5) + plots = list(ch_plot, len_plot, gene_plot, n50_plot, pies_plot), + legends = list(NULL, legend_len, legend_gene, NULL, legend_busco), + new_xlim, + 15, + 60 +) + +pdf("tree_plot.pdf") +final_plot +dev.off() + +######OLD CODE##### +# Now to add annotations. This is a random example + +#Vspecies_sy <- data_busco[c(1,5,8,15,17),]$species +#species_sy <- gsub("_", " ", species_sy) + +#tree_plot2 <- tree_plot + +#tree_plot2$data$label <- ifelse(tree_plot2$data$label %in% species_sy, paste0(tree_plot2$data$label, " *"), tree_plot2$data$label) + +#annotation$offset <- 0.35 + +#tree_plot2 <- tree_plot + geom_point(data = annotation, aes(x = offset+0.35, y = node), shape = 18, color = "green3", size = 3) +# Padding label names for right alignment +#padding_labels <- data.frame(label = tree$tip.label, +# newlabel = label_pad(tree$tip.label), +# newlabel2 = label_pad(tree$tip.label, pad = " ")) +# Plot the phylogenetic tree prob not necessary +#tree_plot <- ggtree(tree) %<+% padding_labels + +#tree_plot <- tree_plot + + # Tip font size, should be an arg +# geom_tiplab(aes(label=newlabel2), size=3, family='mono', fontface = "italic", align = TRUE, linetype = NULL) + + # This avoids the cropping of tips, it takes into consideration + # the max edge length to set the xlim + # This is later reset, the value here is for tree alone + # Might need to be increased a bit +# ggplot2::xlim(0, 5*max(tree2$edge.length)) + #2.5 as default +# theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins + +#tree_plot diff --git a/conf/modules.config b/conf/modules.config index 1a418ef..c7bfb05 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -117,13 +117,23 @@ process { withName: 'SORT_BY_LENGTH' { ext.prefix = { "${meta.id}_sorted" } publishDir = [ - path: { "${params.outdir}/tidk_explore" }, + path: { "${params.outdir}/seqkit_sorted" }, mode: params.publish_dir_mode, enabled: params.save_sorted_seqs, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'FILTER_BY_LENGTH' { + ext.prefix = { "${meta.id}_filtered" } + publishDir = [ + path: { "${params.outdir}/seqkit_filtered" }, + mode: params.publish_dir_mode, + enabled: params.save_filtered_seqs, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'TIDK_EXPLORE' { ext.args = "--minimum 5 --maximum 12" publishDir = [ diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index 638f991..989db17 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -2,15 +2,15 @@ process TREE_SUMMARY { tag "$meta.id" label 'process_single' - container = 'fduarte001/genomeqc_tree:0.1' - publishDir "$params.outdir/tree_plots" , mode: "${params.publish_dir_mode}", pattern:"Phyloplot_*.pdf" + container = 'fduarte001/genomeqc_tree:0.3' + publishDir "$params.outdir/tree_plots" , mode: "${params.publish_dir_mode}", pattern:"*.pdf" input: tuple val(meta), path(tree) path multiqc_files output: - path( "P*.pdf" ), emit: figure + path( "*.pdf" ), emit: figure path( "versions.yml" ), emit: versions script: @@ -33,10 +33,10 @@ process TREE_SUMMARY { tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined # Combine QUAST ouput - quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90,"Total length","GC (%)","# contigs" -plot_types bar,bar,bar,bar + quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90,"Total length","GC (%)","# contigs" -plot_types bar,bar,bar,bar,bar # Run summary plot - plot_tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv + plot_tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv gene_stats.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index a48e393..1ce14df 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { // seqkit options save_sorted_seqs = false + save_filtered_seqs = false // Genome only option genome_only = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 92d67e2..22f216a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,377 +1,378 @@ { - "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/ecoflow/genomeqc/master/nextflow_schema.json", - "title": "ecoflow/genomeqc pipeline parameters", - "description": "A pipeline to compare multiple genomes and annotations", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], - "properties": { - "input": { - "type": "string", - "format": "file-path", - "exists": true, - "schema": "assets/schema_input.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", - "fa_icon": "fas fa-file-csv" - }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" - } - } - }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, - "fasta": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "$schema": "https://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/ecoflow/genomeqc/master/nextflow_schema.json", + "title": "ecoflow/genomeqc pipeline parameters", + "description": "A pipeline to compare multiple genomes and annotations", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": [ + "input", + "outdir" + ], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_input.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row.", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + } + } + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + }, + "fasta": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "description": "Path to FASTA genome file.", + "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "fa_icon": "far fa-file-code" + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + } + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + } + } + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } + } + }, + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], + "hidden": true + }, + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", + "hidden": true + }, + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true + }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", + "hidden": true + }, + "multiqc_config": { + "type": "string", + "format": "file-path", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "validationShowHiddenParams": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient more.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." + }, + "pipelines_testdata_base_path": { + "type": "string", + "fa_icon": "far fa-check-circle", + "description": "Base URL or local path to location of pipeline test dataset files", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/", + "hidden": true + }, + "groups": { + "type": "string", + "default": "all", + "description": "A string with NCBI taxonomic groups to download. Can be a comma-separated list. Options are 'all', 'archaea', 'bacteria', 'fungi', 'invertebrate', 'metagenomes', 'plant', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'" + }, + "busco_mode": { + "type": "string", + "default": "proteins", + "description": "A flag to set the busco mode. Either genome, proteins, transcriptome. Default: proteins" + }, + "busco_lineage": { + "type": "string", + "default": "hymenoptera_odb10", + "description": "A flag to set the busco lineage. Default: auto" + }, + "busco_lineages_path": { + "type": "string", + "description": "A flag to set the BUSCO lineages directory (optional)" + }, + "busco_config": { + "type": "string", + "description": "A path to a BUSCO config file (optional)" + }, + "genome_only": { + "type": "boolean", + "description": "Run genomeqc on genomes only" + }, + "kvalue": { + "type": "integer", + "default": 21, + "description": "k size for meryl (merqury)" + }, + "run_merqury": { + "type": "boolean", + "description": "Run meryl/merqury step?" + }, + "skip_tidk": { + "type": "boolean", + "description": "Do not run TIDK", + "hidden": true, + "help_text": "You may wish to turn off the tidk subworkflow" + } + } } - } }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog" + "allOf": [ + { + "$ref": "#/definitions/input_output_options" }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", - "fa_icon": "fas fa-users-cog" + { + "$ref": "#/definitions/reference_genome_options" }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + { + "$ref": "#/definitions/institutional_config_options" }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + { + "$ref": "#/definitions/max_job_request_options" }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" + { + "$ref": "#/definitions/generic_options" } - } - }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + ], + "properties": { + "save_sorted_seqs": { + "type": "boolean", + "description": "Publish sorted fasta genome files from seqkit" + }, + "save_validated_annotation": { + "type": "boolean", + "description": "Publish gff files validated by AGAT" + }, + "save_assembly": { + "type": "boolean", + "description": "Publish genomes and/or annotations of user-supplied RefSeq IDs" + }, + "save_extracted_seqs": { + "type": "boolean", + "description": "Publish extracted protein fasta files by GFFREAD" + }, + "save_orthofinder_results": { + "type": "boolean", + "description": "Publish orthofinder results" + }, + "save_longest_isoform": { + "type": "boolean", + "description": "Publish longest protein isoform fasta files" + }, + "save_filtered_seqs": { + "type": "boolean", + "description": "Publish filtered fasta genome files from seqkit" } - } - }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "version": { - "type": "boolean", - "description": "Display version and exit.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], - "hidden": true - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", - "hidden": true - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, - "hook_url": { - "type": "string", - "description": "Incoming hook URL for messaging service", - "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", - "hidden": true - }, - "multiqc_config": { - "type": "string", - "format": "file-path", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "multiqc_logo": { - "type": "string", - "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", - "fa_icon": "fas fa-image", - "hidden": true - }, - "multiqc_methods_description": { - "type": "string", - "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "validationShowHiddenParams": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "validationFailUnrecognisedParams": { - "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters fails when an unrecognised parameter is found.", - "hidden": true, - "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." - }, - "validationLenientMode": { - "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters in lenient more.", - "hidden": true, - "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." - }, - "pipelines_testdata_base_path": { - "type": "string", - "fa_icon": "far fa-check-circle", - "description": "Base URL or local path to location of pipeline test dataset files", - "default": "https://raw.githubusercontent.com/nf-core/test-datasets/", - "hidden": true - }, - "groups": { - "type": "string", - "default": "all", - "description": "A string with NCBI taxonomic groups to download. Can be a comma-separated list. Options are 'all', 'archaea', 'bacteria', 'fungi', 'invertebrate', 'metagenomes', 'plant', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'" - }, - "busco_mode": { - "type": "string", - "default": "proteins", - "description": "A flag to set the busco mode. Either genome, proteins, transcriptome. Default: proteins" - }, - "busco_lineage": { - "type": "string", - "default": "hymenoptera_odb10", - "description": "A flag to set the busco lineage. Default: auto" - }, - "busco_lineages_path": { - "type": "string", - "description": "A flag to set the BUSCO lineages directory (optional)" - }, - "busco_config": { - "type": "string", - "description": "A path to a BUSCO config file (optional)" - }, - "genome_only": { - "type": "boolean", - "description": "Run genomeqc on genomes only" - }, - "kvalue": { - "type": "integer", - "default": 21, - "description": "k size for meryl (merqury)" - }, - "run_merqury": { - "type": "boolean", - "description": "Run meryl/merqury step?" - }, - "skip_tidk": { - "type": "boolean", - "description": "Do not run TIDK", - "hidden": true, - "help_text": "You may wish to turn off the tidk subworkflow" - }, - "karyotype": { - "type": "string", - "description": "Path to the karyotype file for ideogram plotting", - "help_text": "A tab-delimited file containing chromosome information for ideogram plotting", - "fa_icon": "fas fa-dna" - } - } - } - }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" - }, - { - "$ref": "#/definitions/reference_genome_options" - }, - { - "$ref": "#/definitions/institutional_config_options" - }, - { - "$ref": "#/definitions/max_job_request_options" - }, - { - "$ref": "#/definitions/generic_options" - } - ], - "properties": { - "save_sorted_seqs": { - "type": "boolean", - "description": "Publish sorted fasta genome files from TIDK Explore subworkflow" - }, - "save_validated_annotation": { - "type": "boolean", - "description": "Publish gff files validated by AGAT" - }, - "save_assembly": { - "type": "boolean", - "description": "Publish genomes and/or annotations of user-supplied RefSeq IDs" - }, - "save_extracted_seqs": { - "type": "boolean", - "description": "Publish extracted protein fasta files by GFFREAD" - }, - "save_orthofinder_results": { - "type": "boolean", - "description": "Publish orthofinder results" - }, - "save_longest_isoform": { - "type": "boolean", - "description": "Publish longest protein isoform fasta files" } - } -} +} \ No newline at end of file From 545004869f1b5998ebfdce34359699c0dd6a34b1 Mon Sep 17 00:00:00 2001 From: FernandoDuarteF Date: Fri, 17 Jan 2025 16:38:24 +0000 Subject: [PATCH 10/11] Added formula to prevent truncated labels --- bin/plot_tree_summary.R | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/bin/plot_tree_summary.R b/bin/plot_tree_summary.R index a8c2c7b..e203011 100755 --- a/bin/plot_tree_summary.R +++ b/bin/plot_tree_summary.R @@ -5,7 +5,7 @@ #--------FUNCTIONS-------# -# Function to rotate legends +# Function to rotate legends (not used) rotate_grob <- function(grob, angle) { gTree(children = gList(grob), vp = viewport(angle = angle)) } @@ -13,7 +13,7 @@ rotate_grob <- function(grob, angle) { # Function to plot tree and plots build_tree_plot <- function(tree, n, plots, legends, xlimit, rigth_margin, bottom_margin) { #xlim for legends, use same xlim as barplots (new_xlim) # Update xlim for the tree plot - tree <- tree + ggplot2::xlim(0, n * max(tree$data$branch.length)) + tree <- tree + ggplot2::xlim(0, n) # Initialize combined plot with the tree combined_plots <- tree @@ -72,11 +72,9 @@ parser$add_argument('busco_file', type = 'character', help = 'Path to processed parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') parser$add_argument('genes_file', type = 'character', help = 'Path to gene stats output file') parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') -parser$add_argument('--tree_size', type = 'double', default = 4.5, help = 'x axis width for tree plot') +parser$add_argument('--tree_size', type = 'double', default = 0.0005, help = 'Change x axis limits for tree plot (useful when tree labels appear truncated)') args <- parser$parse_args() -#setwd("/Users/fernando/work/repositories/pipelines/make_better_tree_plot") - # Avoid scientific notation in all plots options(scipen = 999) @@ -116,6 +114,8 @@ data_quast <- data_quast %>% filter(N50 != "bar") %>% # Total length values to Mb mutate(Total.length = (as.numeric(Total.length)/1000000)) %>% + #Change Sequence values to integers + mutate(Sequences = as.integer(Sequences)) %>% # Create new col with numbers of GC bp mutate(GC = as.numeric(Total.length)*as.numeric(GC)/100) %>% # Rename column to make it shorter @@ -225,8 +225,8 @@ len_plot <- ggplot( position = position_stack(reverse = TRUE), width = 0.7 ) + - scale_fill_manual(values = c("brown1", "cornflowerblue")) + - ggtitle("Genome\nsize") + + scale_fill_manual(labels = c("GC %", "Length"), values = c("brown1", "cornflowerblue")) + + ggtitle("Genome\nsize (Mb)") + barplots_theme + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = -5)) + coord_flip() + #Flip plot @@ -260,7 +260,7 @@ n50_plot <- ggplot( width = 0.7, fill = "steelblue" ) + - ggtitle("N50") + + ggtitle("N50 (Mb)") + barplots_theme + theme(plot.title = element_text(size = 9, hjust = 0.5, vjust = -0.4)) + coord_flip() + @@ -359,11 +359,7 @@ new_ylim <- ylim(c(min(p_ranges_y), max(p_ranges_y))) # using coord_flip() new_xlim <- xlim(c(min(p_ranges_y), max(p_ranges_y))) -#n=5 - -#tree_plot <- tree_plot + ggplot2::xlim(0, n*max(tree2$edge.length)) + new_ylim - -# Set new ylim for chromosome tree +# Set new ylim for sequnces ch_plot <- ch_plot + new_ylim # Set new xlim for Quast genome size (equivalent to ylim) @@ -382,20 +378,22 @@ gene_plot <- gene_plot + new_xlim tree_plot <- ggtree(tree) + # Tip font size, should be an arg geom_tiplab(size=3, fontface = "italic", align = TRUE) + - # This avoids the cropping of tips, it takes into consideration - # the max edge length to set the xlim - # This is later reset, the value here is for tree alone - # Might need to be increased a bit - ggplot2::xlim(0, 5*max(tree$edge.length)) + #2.5 as default - theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins + theme(plot.margin = margin(10, 10, 10, 10)) + # Increase margins + coord_cartesian(clip="off") # Set new ylim and xlim for tree tree_plot <- tree_plot + new_ylim +# Set value for tree xlim to avoid the truncation of labels: +# Why "^2*0.001"? ^2 is because the relatin between number of characters and the number +# of pixels is close to beexponential, not proportional. 0.001 would be the length +# per character in the x axis scale. Script should allow to change this value +m = max(tree_plot$data$x) + max(nchar(tree_plot$data$label))^2*args$tree_size + # Call the function final_plot <- build_tree_plot( tree = tree_plot, - n = args$tree_size, # Only affects tree_plot (default 4.5) + n = m, # Only affects tree_plot plots = list(ch_plot, len_plot, gene_plot, n50_plot, pies_plot), legends = list(NULL, legend_len, legend_gene, NULL, legend_busco), new_xlim, From 8d8314b7b71ac7ba4ec17b63ab8c8babdbfad08c Mon Sep 17 00:00:00 2001 From: Fernando Duarte Date: Mon, 20 Jan 2025 10:10:13 +0000 Subject: [PATCH 11/11] Updated README.md --- README.md | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 4af6489..c82dfa4 100644 --- a/README.md +++ b/README.md @@ -14,51 +14,55 @@ The pipeline takes a list of genomes and annotations (from raw files or Refseq IDs), and runs commonly used tools to assess their quality. -There are three different ways you can run this pipeline. 1. Genome only, 2. Annotation only, or 3. Genome and Annotation. **Only Genome plus Annotation is functional** +There are three different ways you can run this pipeline. 1. Genome only, 2. Annotation only, or 3. Genome and Annotation. **Currently, only Genome plus Annotation is functional** -**Genome and Annnotation:** +**Genome and Annotation:** 1. Downloads the genome and gene annotation files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own genomes/annotations 2. Describes genome assembly: -2a. `[BUSCO_BUSCO]`: Determines how complete is the genome compared to expected (protein mode). -2b. `[BUSCO_IDEOGRAM]`: Plots the location of BUSCO markers on the assembly. -2c. `[QUAST]`: Determines the N50, how contiguous the genome is. -2d. More options -3. Describes your annotation : `[AGAT]`: Gene, feature, length, averages, counts. -4. Extract longest protein fasta sequences `[GFFREAD]`. -5. Finds orthologous genes `[ORTHOFINDER]`. -6. Summary with MulitQC. - -> [!WARNING] -> We strongly suggest users to specify the lineage using the `--busco_lineage` parameter, as setting the lineage to `auto` (default value) might cause problems with `[BUSCO]` during the leneage determination step. - -> [!NOTE] -> `BUSCO_IDEOGRAM` will only plot those chromosomes -or scaffolds- that contain single copy markers. + 1. `[BUSCO_BUSCO]`: Determines how complete is the genome compared to expected number of single copy markers (protein mode). + 2. `[BUSCO_IDEOGRAM]`: Plots the location of markers on the assembly. + 3. `[QUAST]`: Computes contiguity and integrity statistics: N50, N90, GC%, number of sequences. + 4. More options... +3. Describes annotation : `[AGAT]`: Gene, feature, length, averages, counts. +4. Finds the number of overlapping genes `[GENE_OVERLAPS]`. +5. Extracts longest protein isoform `[GFFREAD]`. +6. Finds orthologous genes `[ORTHOFINDER]`. +7. Plots an orthology-based phylogenetic tree `[TREE_SUMMARY]`, as well as other relevant stats from the above steps. +8. Summary with MulitQC `[MULTIQC]`. **Genome Only (in development):** 1. Downloads the genome files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own genomes 2. Describes genome assembly: -2a. `[BUSCO_BUSCO]`: Determines how complete is the genome compared to expected (genome mode). -2b. `[QUAST]`: Determines the N50, how contiguous the genome is. -2c. More options -3. Summary with MulitQC. + 1. `[BUSCO_BUSCO]`: Determines how complete is the genome compared to expected number of single copy markers (protein mode). + 2. `[BUSCO_IDEOGRAM]`: Plots the location of markers on the assembly. + 3. `[QUAST]`: Computes contiguity and integrity stats: N50, N90, GC%, number of sequences. +3. Summary with MulitQC `[MULTIQC]`. **Annnotation Only (in development):** 1. Downloads the gene annotation files from NCBI `[NCBIGENOMEDOWNLOAD]` - Or you provide your own annotations. 2. Describes your annotation : `[AGAT]`: Gene, feature, length, averages, counts. 3. Summary with MulitQC. -In addition to the three different modes described above, it is also possible to run the pipeline with or without sequencing reads. When supplying sequencing reads, Merqury can also be run. [Merqury](https://github.com/marbl/merqury) is a tool for genome quality assessment that uses k-mer counts from raw sequencing data to evaluate the accuracy and completeness of a genome assembly. Meryl is the companion tool that efficiently counts and stores k-mers from sequencing reads, enabling Merqury to estimate metrics like assembly completeness and base accuracy. These tools provide a k-mer-based approach to assess assembly quality, helping to identify potential errors or gaps.​ +**Merqury** + +Optionally, user can also run the pipeline **Genome only** and **Genome and Annotation** pipelines with **Merqury** by supplying sequencing reads (in _fastq_ format). [Merqury](https://github.com/marbl/merqury) is a tool for genome quality assessment that uses k-mer counts from raw sequencing data to evaluate the accuracy and completeness of a genome assembly. Meryl is the companion tool that enables Merqury to estimate metrics like assembly completeness and base accuracy. These tools provide a k-mer-based approach to assess assembly quality, helping to identify potential errors or gaps.​ To run the pipeline with reads, you must supply a single FASTQ file for each genome in the samplesheet, alongside the `--run_merqury` flag. It is assumed that reads used to create the assembly are from long read technology such as PacBio or ONT, and are therefore single end. If reads are in a .bam file, they must be converted to FASTQ format first. If you have paired end reads, these must be interleaved first. +> [!WARNING] +> We strongly suggest users to specify the lineage using the `--busco_lineage` parameter, as setting the lineage to `auto` (value by default) might cause problems with `[BUSCO]` during the lineage determination step. + +> [!NOTE] +> `BUSCO_IDEOGRAM` will only plot those chromosomes -or scaffolds- that contain single copy markers. + ## Usage > [!NOTE]