diff --git a/bin/gene_overlaps.R b/bin/gene_overlaps.R index 80f3ed2..11a80de 100755 --- a/bin/gene_overlaps.R +++ b/bin/gene_overlaps.R @@ -2,6 +2,7 @@ # Code written by Chris Wyatt with some editing by ChatGPT +library(dplyr) library(GenomicRanges) # Parse command-line arguments @@ -19,6 +20,9 @@ read_gff_to_granges <- function(gff_file) { gff_data <- read.delim(gff_file, header = FALSE, comment.char = "#") colnames(gff_data) <- c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute") + gff_data <- gff_data %>% + filter(strand %in% c("-","+")) + gr <- GRanges( seqnames = gff_data$seqname, ranges = IRanges(start = gff_data$start, end = gff_data$end), @@ -67,7 +71,6 @@ for (i in seq_len(length(overlap_results))) { if (query_idx != subject_idx) { query_gene <- genes[query_idx] subject_gene <- genes[subject_idx] - overlap_range <- intersect(ranges(query_gene), ranges(subject_gene)) overlap_length <- width(overlap_range) diff --git a/bin/gene_overlaps_table.py b/bin/gene_overlaps_table.py new file mode 100755 index 0000000..9df003e --- /dev/null +++ b/bin/gene_overlaps_table.py @@ -0,0 +1,63 @@ +#!/usr/bin/python3 + +# Written using ChatGPT + +import pandas as pd +import argparse + +# Set up the argument parser +parser = argparse.ArgumentParser(description='Extract gene statistics from files.') +parser.add_argument('input_files', nargs='+', help='List of input files.') +parser.add_argument('output_file', help='Path to save the output TSV file.') +parser.add_argument('--include-sense', action='store_true', help='Include sense genes count.') +parser.add_argument('--include-antisense', action='store_true', help='Include antisense genes count.') + +# Parse the arguments +args = parser.parse_args() + +# Initialize an empty list to store the results +results = [] + +# Process each input file +for file in args.input_files: + try: + # Load the file into a DataFrame + df = pd.read_csv(file, sep='\t', header=None, names=['Statistic', 'Count']) + + # Extract required statistics + total_genes = df.loc[df['Statistic'] == 'Total number of genes', 'Count'].values[0] + overlapping_genes = df.loc[df['Statistic'] == 'Total number of overlapping genes', 'Count'].values[0] + + # Optional statistics + sense_genes = df.loc[df['Statistic'] == 'Number of genes fully contained in sense direction', 'Count'].values[0] if args.include_sense else "NA" + antisense_genes = df.loc[df['Statistic'] == 'Number of genes fully contained in antisense direction', 'Count'].values[0] if args.include_antisense else "NA" + print(sense_genes) + + # Collect results in a dictionary + entry = { + 'File': file, + 'Total_genes': total_genes, + 'Overlapping_genes': overlapping_genes, + } + if args.include_sense: + entry['Fully_contained_sense_genes'] = sense_genes + if args.include_antisense: + entry['Fully_contained_antisense_genes'] = antisense_genes + + results.append(entry) + except Exception as e: + print(f"Error processing {file}: {e}") + continue + +# Convert the results to a DataFrame +columns = ['File', 'Total_genes', 'Overlapping_genes'] +if args.include_sense: + columns.append('Fully_contained_sense_genes') +if args.include_antisense: + columns.append('Fully_contained_antisense_genes') + +result_df = pd.DataFrame(results, columns=columns) + +# Write the result to the output file +result_df.to_csv(args.output_file, sep='\t', index=False) +print(f"Extraction completed successfully. Output saved to {args.output_file}.") \ No newline at end of file diff --git a/bin/plot_tree_summary.R b/bin/plot_tree_summary.R index 7028126..af2f358 100755 --- a/bin/plot_tree_summary.R +++ b/bin/plot_tree_summary.R @@ -1,7 +1,7 @@ #!/usr/bin/Rscript -# Written by Chris Wyatt and released under the MIT license. -# Prints a tree with QUAST N50 results on tips of branches +# Written by Chris Wyatt and Fernando Duarte and released under the MIT license. +# Plots a phylogenetic tree alongside BUSCO and Quast stats # Load necessary libraries if (!requireNamespace("argparse", quietly = TRUE)) { @@ -19,6 +19,12 @@ if (!requireNamespace("cowplot", quietly = TRUE)) { if (!requireNamespace("tidyr", quietly = TRUE)) { install.packages("tidyr") } +if (!requireNamespace("ggtreeExtra", quietly = TRUE)) { + BiocManager::install("ggtreeExtra") +} +if (!requireNamespace("ggimage", quietly = TRUE)) { + install.packages("ggimage") +} library(ggtree) library(ggplot2) @@ -26,22 +32,57 @@ library(cowplot) library(argparse) library(dplyr) library(tidyr) - -# Function to extract legend from a ggplot -extract_legend <- function(plot) { - g <- ggplotGrob(plot) - legend <- g$grobs[which(sapply(g$grobs, function(x) x$name) == "guide-box")] - return(legend) -} +library(ggimage) +library(ggtreeExtra) # Parse command-line arguments parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') -parser$add_argument('data_file', type = 'character', help = 'Path to the CSV file with statistic and true/false data') -parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') -parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') +parser$add_argument('busco_file', type = 'character', help = 'Path to processed BUSCO output file') +parser$add_argument('quast_file', type = 'character', help = 'Path to processed Quast output file') +parser$add_argument('--text_size', type = 'double', default = 2.5, help = 'Tree tip text size for the plots') +parser$add_argument('--pie_size', type = 'double', default = 0, help = 'Increase pie size of each tip by this fold') +parser$add_argument('--pie_hjust', type = 'double', default = 0.4, help = 'Horizontally adjust pie positions') +#parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') args <- parser$parse_args() +# Create a function so that, if the Quast data is off limits and warnings related +# to this pop up , margins will be increased by the value "step" on each iteration +increase_scale_until_no_warnings <- function(plot, initial_limit = 5, step = 0.1, max_iterations = 50) { + # Initialize limit and warning flag + current_limit <- initial_limit + warning_flag <- TRUE + iteration <- 0 + + while (warning_flag && iteration < max_iterations) { + iteration <- iteration + 1 + # Update the plot with the current limit + updated_plot <- plot + scale_x_continuous(limits = c(0, current_limit)) + + # Capture warnings + warnings <- tryCatch({ + print(updated_plot) # Render the plot + NULL # No warnings + }, warning = function(w) { + w$message # Capture warning message + }) + + # Check if there are warnings + if (is.null(warnings)) { + warning_flag <- FALSE # No warnings, exit loop + } else { + # Increase the limit for the next iteration + current_limit <- current_limit + step + } + } + + # Return the final plot + if (iteration >= max_iterations) { + warning("Maximum iterations reached. Plot may still have issues.") + } + return(updated_plot) +} + # Read the Newick tree from the file tree <- read.tree(args$tree_file) @@ -50,170 +91,188 @@ tree$tip.label <- trimws(tree$tip.label) tree$tip.label <- tolower(tree$tip.label) # Read the data table from the file, ensuring species column is read as character -data <- read.csv(args$data_file, sep = "\t", colClasses = c("species" = "character")) +# Load BUSCO +data_busco <- read.csv(args$busco_file, sep = "\t", colClasses = c("Input_file" = "character")) + +# Prepare BUSCO data +data_busco <- data_busco %>% + # Remove extension from Input_file + mutate(Input_file = tools::file_path_sans_ext(Input_file)) %>% + # Rename 'Input_file' to 'species' + rename(species = Input_file) %>% + # Make everything lowercase + mutate(species = tolower(species)) -# Clean data species names -data$species <- trimws(data$species) -data$species <- tolower(data$species) +# Load Quast data +data_quast <- read.csv(args$quast_file, sep = "\t") -# Extract the column headers and the second row to determine plot types -plot_types <- as.character(data[1, ]) -data <- data[-1, ] # Remove the second row used for plot types +#Tidy Quast data +data_quast <- data_quast %>% + # Remove the row where species is NA + filter(!is.na(species)) %>% + # Make species lowercase + mutate(species = tolower(species)) %>% + # Convert wide to long format + pivot_longer(cols = c(N50, N90), + names_to = "metric", + values_to = "value") %>% + # Remove any remaining "bar" rows if necessary (check Chris script) + filter(value != "bar") %>% + # Convert value column to numeric if needed + mutate(value = as.numeric(value)) # Debugging: Print species names from the tree and the data cat("Species names in the tree based on nw:\n") -print(tree$tip.label) -cat("\nSpecies names in the data table before processing:\n") -print(data$species) +cat(tree$tip.label) +cat("\nSpecies names in data tables:\n") +quast_sp <- unique(data_quast$species) +cat("BUSCO:", data_busco$species, "\nQUAST:", quast_sp, "\n") -# Debugging: Print the data frame to check its structure -cat("\nData frame:\n") -print(data) - -# Ensure species names match the tree tips -missing_in_tree <- setdiff(data$species, tree$tip.label) -missing_in_data <- setdiff(tree$tip.label, data$species) - -if (length(missing_in_tree) > 0) { - cat("Species in data but not in tree:\n") - print(missing_in_tree) +# Debugging: Check if there are any mismatches in species names (might need to be +# changed in the future to support optional stats) +if (all(sort(tree$tip.label) != sort(data_busco$species))) { + stop("Species names in BUSCO and tree labels do not match") +} else if (all(sort(quast_sp) != sort(data_busco$species))) { + stop("Species names in Quast and tree labels do not match") } -if (length(missing_in_data) > 0) { - cat("Species in tree but not in data:\n") - print(missing_in_data) -} +# Arrange data according to tree labels, so that they are plotted correctly +# For BUSCO +data_busco <- data_busco %>% + arrange(match(species, tree$tip.label)) -if (length(missing_in_tree) > 0 || length(missing_in_data) > 0) { - stop("Species names in the data do not match the tree tips") -} +# For Quast +data_quast <- data_quast %>% + arrange(match(species, tree$tip.label)) + +# Node number needed in BUSCO dataframe for nodpie. This is a generic 1-n +# numbering (n=number of species) +data_busco <- data_busco %>% + mutate(node = seq_len(n())) # Plot the phylogenetic tree -tree_plot <- ggtree(tree) + - geom_tiplab() + - coord_cartesian(clip="off") + +tree_plot <- ggtree(tree, branch.length = "none") + # No branch length, only topology + geom_tiplab(size=args$text_size) + # Tip font size + ggplot2::xlim(0, 5) + ggtitle("Phylogenetic Tree") + - theme(plot.margin = margin(10, 150, 10, 10)) # Increase margins + theme(plot.margin = margin(10, 10, 10, 10)) # Increase margins -pdf ("Tree_only.pdf") -tree_plot -dev.off() +# Save tree only +ggsave("tree_only.pdf", tree_plot) +# Create list of pies from BUSCO data +pies <- nodepie(data_busco, + cols = 4:7, + color = c( + "Single" = "steelblue", + "Duplicated" = "orange", + "Fragmented" = "purple", + "Missing" = "red" + ) +) -# Extract the exact tree tip names:axis.text -tree$tip.label <- get_taxa_name(tree_plot) +print("Check 1") -# Reorder the data based on the order of species in the tree -data <- data %>% arrange(match(species, tree$tip.label)) +# Create a dummy pie as for the legend it seems impossible to extract it +# from nodpie +dummy_pie_data <- data.frame( + Type = factor(c("Single", "Duplicated", "Fragmented", "Missing")), + value = c(1, 1, 1, 1) # Dummy values for equal slices +) -# Debugging: Print plot types -cat("\nPlot types:\n") -print(plot_types) +dummy_pie <- ggplot(dummy_pie_data) + + geom_bar( + aes(x = "", y = value, fill = Type), + stat = "identity", + width = 1 + ) + + coord_polar("y") + + scale_fill_manual( + values = c( + "Single" = "steelblue", + "Duplicated" = "orange", + "Fragmented" = "purple", + "Missing" = "red" + ), + name = "BUSCO" + ) + + theme_void() + + theme(legend.position = "right") # Show the legend -# Debugging: Print the reordered data frame to check its structure -cat("\nReordered data frame:\n") -print(data) +print("Check 2") -write.table(data, "Reordered_output_tree.tsv", sep="\t", quote=F) +# Create variable for standarization of legends format +legends_theme <- theme(legend.title = element_text(size = 10, face = "bold"), + legend.text = element_text(size = 8), + legend.key.size = unit(0.5, "cm")) -# Check if the number of unique species matches the number of tree tips -if (length(unique(data$species)) != length(tree$tip.label)) { - warning("The number of unique species in the data does not match the number of tree tips.") -} +# Extract legend of dummy BUSCO pie chart +legend_busco <- cowplot::get_legend(dummy_pie + legends_theme) -# Find column headers -column_headers <- colnames(data) - -# Create plots based on the plot types -plots <- list() -legend_plot <- NULL -for (i in 2:length(column_headers)) { - column_name <- column_headers[i] - plot_type <- plot_types[i] - - if (plot_type == "bar") { - bar_plot <- ggplot(data, aes(x = as.numeric(!!sym(column_name)), y = factor(species, levels = rev(tree$tip.label)))) + - geom_bar(stat = "identity", fill = "darkblue") + - geom_text(aes(label = as.numeric(!!sym(column_name)), x = 0), hjust = -0.1, color = "white", size = args$text_size) + # Place text at the base of the bar - theme_minimal() + - theme(axis.title.y = element_blank(), - axis.text.y = element_blank(), - axis.ticks.y = element_blank(), - axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - panel.grid.major = element_blank(), # Remove vertical grid lines - panel.grid.minor = element_blank(), # Remove minor grid lines - panel.background = element_blank(), # Remove grey background - plot.background = element_blank(), # Remove any outer plot background - plot.margin = margin(0, 0, 0, 0)) + - labs(x = column_name) + - ggtitle(column_name) - plots[[length(plots) + 1]] <- bar_plot - } else if (plot_type == "text") { - text_plot <- ggplot(data, aes(y = factor(species, levels = rev(tree$tip.label)), x = 1, label = !!sym(column_name))) + - geom_text(size = args$text_size, hjust = 0) + - theme_void() + - labs(x = NULL, y = NULL) + - theme(axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - plot.margin = margin(0, 0, 0, 0)) + - ggtitle(column_name) - plots[[length(plots) + 1]] <- text_plot - } else if (plot_type == "stacked") { - # Split the stacked values into separate columns - stacked_data <- data %>% - separate(column_name, into = paste0(column_name, "_", 1:4), sep = ",", convert = TRUE, extra = "drop") %>% - pivot_longer(cols = starts_with(column_name), names_to = "stack", values_to = "value") %>% - mutate(stack = factor(stack, levels = paste0(column_name, "_", 1:4))) - - stacked_plot <- ggplot(stacked_data, aes(x = value, y = factor(species, levels = rev(tree$tip.label)), fill = stack)) + - geom_bar(stat = "identity", position = "fill") + - theme_minimal() + - theme(axis.title.y = element_blank(), - axis.text.y = element_blank(), - axis.ticks.y = element_blank(), - axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - panel.grid.major.y = element_blank(), # Remove horizontal grid lines - panel.grid.minor.y = element_blank(), - plot.margin = margin(0, 0, 0, 0)) + - labs(x = column_name) + - ggtitle(column_name) - - # Extract the legend from the stacked plot - legend_plot <- extract_legend(stacked_plot) - - # Remove the legend from the stacked plot - stacked_plot <- stacked_plot + theme(legend.position = "none") - - plots[[length(plots) + 1]] <- stacked_plot - } else { - cat("Unknown plot type:", plot_type, "for column:", column_name, "\n") - } -} +print("Check 2.5") -# Debugging: Print the list of plots -cat("List of plots:\n") -print(plots) - -# Combine the plots if there are any -if (length(plots) > 0) { - combined_plot <- plot_grid(tree_plot, plot_grid(plotlist = plots, ncol = length(plots)), ncol = 2, rel_widths = c(args$tree_size, 1 - args$tree_size)) # Adjust widths based on tree_size - - # Save the combined plot to a PDF file - ggsave("Phyloplot_quast.pdf", plot = combined_plot, width = 10, height = 8) - - # Save the legend to a separate PDF file - if (!is.null(legend_plot)) { - legend_plot <- cowplot::plot_grid(legend_plot) - ggsave("Legend.pdf", plot = legend_plot, width = 10, height = 2) - } -} else { - cat("No plots to combine.\n") -} +# Plot BUSCO pies next to tree tips +p <- ggtree::inset(tree_plot, + pies, + width=(args$pie_size * .16 + .16), + height=(args$pie_size * .6 + .6), + hjust=-1.2) + +print("Check 3") + +# Plot Quast data +p2 <- p + geom_fruit(data=data_quast, + geom = geom_bar, + mapping = aes(y=species, x=value, fill=metric), + position=position_dodgex(), + pwidth=0.38, + orientation="y", + stat="identity", + axis.params = list( + axis = "x", + text.size = 1.8, + hjust = 1, + vjust = 0.5, + nbreak = 3 + ), + grid.params = list(), + offset = 1, + width = 0.4) + + labs(fill = "Quast") + + scale_x_continuous(limits=c(0,5)) # limits=c(0,5) by default for p2 + +print("Check 4") + +# +p2 <- increase_scale_until_no_warnings(p2) + +# Save plot without legend +ggsave("Phyloplot_no_legend.pdf", p2 + theme(legend.position="none")) + +# Extract Quast legend +legend_quast <- cowplot::get_legend(p2 + legends_theme) + +# Wrap the legends in fixed-sized containers and adjust margins so that legends +# are perfectly aligned +legend_busco <- ggdraw(legend_busco) + theme(plot.margin = margin(0, 0, 0, 0)) +legend_quast <- ggdraw(legend_quast) + theme(plot.margin = margin(0, 29, 60, 0)) + +# Combine both BUSCO and Quast legends +combined_legends <- plot_grid(legend_busco, + legend_quast, + NULL, # Dummy rows for better positioning + NULL, + nrow = 4, + rel_widths = c(1, 1)) + +# Save legend +ggsave("legend.pdf", plot = combined_legends) + +# Add combined legends to tree plot +p3 <- plot_grid(p2 + theme(legend.position = "none"), + combined_legends, + ncol = 2, + rel_widths = c(4, 1)) -# Print warnings -warnings() \ No newline at end of file +# Save full plot +ggsave("Phyloplot_complete.pdf", plot = p3) diff --git a/bin/plot_tree_summary.R.old b/bin/plot_tree_summary.R.old new file mode 100755 index 0000000..3439030 --- /dev/null +++ b/bin/plot_tree_summary.R.old @@ -0,0 +1,219 @@ +#!/usr/bin/Rscript + +# Written by Chris Wyatt and released under the MIT license. +# Prints a tree with QUAST N50 results on tips of branches + +# Load necessary libraries +if (!requireNamespace("argparse", quietly = TRUE)) { + install.packages("argparse") +} +if (!requireNamespace("ggtree", quietly = TRUE)) { + BiocManager::install("ggtree") +} +if (!requireNamespace("ggplot2", quietly = TRUE)) { + install.packages("ggplot2") +} +if (!requireNamespace("cowplot", quietly = TRUE)) { + install.packages("cowplot") +} +if (!requireNamespace("tidyr", quietly = TRUE)) { + install.packages("tidyr") +} + +library(ggtree) +library(ggplot2) +library(cowplot) +library(argparse) +library(dplyr) +library(tidyr) + +# Function to extract legend from a ggplot +extract_legend <- function(plot) { + g <- ggplotGrob(plot) + legend <- g$grobs[which(sapply(g$grobs, function(x) x$name) == "guide-box")] + return(legend) +} + +# Parse command-line arguments +parser <- ArgumentParser(description = 'Plot phylogenetic tree with statistics and true/false data') +parser$add_argument('tree_file', type = 'character', help = 'Path to the Newick formatted tree file') +parser$add_argument('data_file', type = 'character', help = 'Path to the CSV file with statistic and true/false data') +parser$add_argument('--text_size', type = 'double', default = 3, help = 'Text size for the plots') +parser$add_argument('--tree_size', type = 'double', default = 0.3, help = 'Proportion of the plot width for the tree') +args <- parser$parse_args() + +# Read the Newick tree from the file +tree <- read.tree(args$tree_file) + +# Clean tree tip labels +tree$tip.label <- trimws(tree$tip.label) +tree$tip.label <- tolower(tree$tip.label) + +# Read the data table from the file, ensuring species column is read as character +data <- read.csv(args$data_file, sep = "\t", colClasses = c("species" = "character")) + +# Clean data species names +data$species <- trimws(data$species) +data$species <- tolower(data$species) + +# Extract the column headers and the second row to determine plot types +plot_types <- as.character(data[1, ]) +data <- data[-1, ] # Remove the second row used for plot types + +# Debugging: Print species names from the tree and the data +cat("Species names in the tree based on nw:\n") +print(tree$tip.label) +cat("\nSpecies names in the data table before processing:\n") +print(data$species) + +# Debugging: Print the data frame to check its structure +cat("\nData frame:\n") +print(data) + +# Ensure species names match the tree tips +missing_in_tree <- setdiff(data$species, tree$tip.label) +missing_in_data <- setdiff(tree$tip.label, data$species) + +if (length(missing_in_tree) > 0) { + cat("Species in data but not in tree:\n") + print(missing_in_tree) +} + +if (length(missing_in_data) > 0) { + cat("Species in tree but not in data:\n") + print(missing_in_data) +} + +if (length(missing_in_tree) > 0 || length(missing_in_data) > 0) { + stop("Species names in the data do not match the tree tips") +} + +# Plot the phylogenetic tree +tree_plot <- ggtree(tree) + + geom_tiplab() + + coord_cartesian(clip="off") + + ggtitle("Phylogenetic Tree") + + theme(plot.margin = margin(10, 150, 10, 10)) # Increase margins + +pdf ("Tree_only.pdf") +tree_plot +dev.off() + + +# Extract the exact tree tip names:axis.text +tree$tip.label <- get_taxa_name(tree_plot) + +# Reorder the data based on the order of species in the tree +data <- data %>% arrange(match(species, tree$tip.label)) + +# Debugging: Print plot types +cat("\nPlot types:\n") +print(plot_types) + +# Debugging: Print the reordered data frame to check its structure +cat("\nReordered data frame:\n") +print(data) + +write.table(data, "Reordered_output_tree.tsv", sep="\t", quote=F) + +# Check if the number of unique species matches the number of tree tips +if (length(unique(data$species)) != length(tree$tip.label)) { + warning("The number of unique species in the data does not match the number of tree tips.") +} + +# Find column headers +column_headers <- colnames(data) + +# Create plots based on the plot types +plots <- list() +legend_plot <- NULL +for (i in 2:length(column_headers)) { + column_name <- column_headers[i] + plot_type <- plot_types[i] + + if (plot_type == "bar") { + bar_plot <- ggplot(data, aes(x = as.numeric(!!sym(column_name)), y = factor(species, levels = rev(tree$tip.label)))) + + geom_bar(stat = "identity", fill = "darkblue") + + geom_text(aes(label = as.numeric(!!sym(column_name)), x = 0), hjust = -0.1, color = "white", size = args$text_size) + # Place text at the base of the bar + theme_minimal() + + theme(axis.title.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + panel.grid.major = element_blank(), # Remove vertical grid lines + panel.grid.minor = element_blank(), # Remove minor grid lines + panel.background = element_blank(), # Remove grey background + plot.background = element_blank(), # Remove any outer plot background + plot.margin = margin(0, 0, 0, 0)) + + labs(x = column_name) + + ggtitle(column_name) + plots[[length(plots) + 1]] <- bar_plot + } else if (plot_type == "text") { + text_plot <- ggplot(data, aes(y = factor(species, levels = rev(tree$tip.label)), x = 1, label = !!sym(column_name))) + + geom_text(size = args$text_size, hjust = 0) + + theme_void() + + labs(x = NULL, y = NULL) + + theme(axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + plot.margin = margin(0, 0, 0, 0)) + + ggtitle(column_name) + plots[[length(plots) + 1]] <- text_plot + } else if (plot_type == "stacked") { + # Split the stacked values into separate columns + stacked_data <- data %>% + separate(column_name, into = paste0(column_name, "_", 1:4), sep = ",", convert = TRUE, extra = "drop") %>% + pivot_longer(cols = starts_with(column_name), names_to = "stack", values_to = "value") %>% + mutate(stack = factor(stack, levels = paste0(column_name, "_", 1:4))) + + stacked_plot <- ggplot(stacked_data, aes(x = value, y = factor(species, levels = rev(tree$tip.label)), fill = stack)) + + geom_bar(stat = "identity", position = "fill") + + theme_minimal() + + theme(axis.title.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank(), + panel.grid.major.y = element_blank(), # Remove horizontal grid lines + panel.grid.minor.y = element_blank(), + plot.margin = margin(0, 0, 0, 0)) + + labs(x = column_name) + + ggtitle(column_name) + + # Extract the legend from the stacked plot + legend_plot <- extract_legend(stacked_plot) + + # Remove the legend from the stacked plot + stacked_plot <- stacked_plot + theme(legend.position = "none") + + plots[[length(plots) + 1]] <- stacked_plot + } else { + cat("Unknown plot type:", plot_type, "for column:", column_name, "\n") + } +} + +# Debugging: Print the list of plots +cat("List of plots:\n") +print(plots) + +# Combine the plots if there are any +if (length(plots) > 0) { + combined_plot <- plot_grid(tree_plot, plot_grid(plotlist = plots, ncol = length(plots)), ncol = 2, rel_widths = c(args$tree_size, 1 - args$tree_size)) # Adjust widths based on tree_size + + # Save the combined plot to a PDF file + ggsave("Phyloplot_quast.pdf", plot = combined_plot, width = 10, height = 8) + + # Save the legend to a separate PDF file + if (!is.null(legend_plot)) { + legend_plot <- cowplot::plot_grid(legend_plot) + ggsave("Legend.pdf", plot = legend_plot, width = 10, height = 2) + } +} else { + cat("No plots to combine.\n") +} + +# Print warnings +warnings() diff --git a/conf/modules.config b/conf/modules.config index 3079ef8..1a418ef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -175,4 +175,11 @@ process { ] } + withName: 'NUMBER_SEQS' { + publishDir = [ + path: { "${params.outdir}/number_seqs" }, + mode: params.publish_dir_mode + ] + } + } diff --git a/modules/local/gene_overlaps.nf b/modules/local/gene_overlaps.nf index 1c81522..54085f1 100644 --- a/modules/local/gene_overlaps.nf +++ b/modules/local/gene_overlaps.nf @@ -8,7 +8,7 @@ process GENE_OVERLAPS { output: tuple val(meta), path("Summary.*.tsv"), emit: overlap_detailed_summary - tuple val(meta), path("Count.*.tsv"), emit: overlap_counts + tuple val(meta), path("Count.*.tsv") , emit: overlap_counts path "versions.yml", emit: versions when: diff --git a/modules/local/number_seqs.nf b/modules/local/number_seqs.nf new file mode 100644 index 0000000..0b20c5c --- /dev/null +++ b/modules/local/number_seqs.nf @@ -0,0 +1,27 @@ +process NUMBER_SEQS { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::coreutils=8.31" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9' : + 'biocontainers/gnu-wget:1.18--h36e9172_9' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path( "${meta.id}.n_seqs.tsv" ), emit: n_seqs + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + # Add header + echo -e "species\tchromosome_count\tsequence_count" > ${prefix}.n_seqs.tsv + + # Use grep to cunt the number of chromosomes and sequences + echo -e "${prefix}\t\$(grep -i '^>.*\\(chromosome\\|chr\\)' $fasta | wc -l)\t\$(grep -c '^>' $fasta)" >> ${prefix}.n_seqs.tsv + """ + +} \ No newline at end of file diff --git a/modules/local/tree_summary.nf b/modules/local/tree_summary.nf index eca2247..0e6d727 100644 --- a/modules/local/tree_summary.nf +++ b/modules/local/tree_summary.nf @@ -2,7 +2,7 @@ process TREE_SUMMARY { tag "$meta.id" label 'process_single' - container = 'ecoflowucl/genomeqc_tree:v1.3' + container = 'fduarte001/genomeqc_tree:0.1' publishDir "$params.outdir/tree_plots" , mode: "${params.publish_dir_mode}", pattern:"Phyloplot_*.pdf" input: @@ -13,32 +13,30 @@ process TREE_SUMMARY { path( "P*.pdf" ), emit: figure path( "versions.yml" ), emit: versions - script: def prefix = task.ext.prefix ?: "${meta.id}" - """ + """ #Remove unwanted extensions in the tree file sed \'s/\\.prot\\.fa\\.largestIsoform//g\' ${tree}/Species_Tree/SpeciesTree_rooted_node_labels.txt > tree.nw - - # Combine the BUSCO outputs - head -qn 1 *.txt | head -n 1 > Busco_combined - tail -q -n 1 *.txt >> Busco_combined - cut -f 1,3,4,5,6,7 Busco_combined >> Busco_combined_cut - sed -i \'s/\\.fasta//g\' Busco_combined_cut - busco_2_table.py Busco_combined_cut Busco_to_plot.tsv + # Combine GENE OVERLAPS outputs + gene_overlaps_table.py Count.*.tsv gene_stats.tsv --include-sense --include-antisense - # Combine QUAST ouput - quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90 -plot_types bar,bar + # Combine NUMBER SEQS outputs + # Add header + echo -e "species\tchromosome_count\tsequence_count" > number_seqs.tsv + tail -n +2 -q *.n_seqs.tsv >> number_seqs.tsv + + # Combine the BUSCO outputs and remove empty tabs + head -qn 1 *.txt | head -n 1 > Busco_combined + tail -q -n 1 *.txt | sed -E 's/\t+/\t/g' | sed 's/\t\$//g' >> Busco_combined - #Remove unwanted extensions from Busco tables - sed \'s/.prot.fa.largestIsoform.fa//g\' Busco_to_plot.tsv > Busco_to_plot_final.tsv - sed \'s/.prot.fa.largestIsoform.fa//g\' Quast_to_plot.tsv > Quast_to_plot_final.tsv + # Combine QUAST ouput + quast_2_table.py *quast.tsv -o Quast_to_plot.tsv -col N50,N90,"Total length","GC (%)" -plot_types bar,bar,bar,bar # Run summary plot - plot_tree_summary2.R tree.nw Busco_to_plot_final.tsv --tree_size 0.6 - plot_tree_summary.R tree.nw Quast_to_plot_final.tsv --tree_size 0.6 + plot_tree_summary.R tree.nw Busco_combined Quast_to_plot.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/genome_and_annotation.nf b/subworkflows/local/genome_and_annotation.nf index 08b3a7c..a325bc8 100644 --- a/subworkflows/local/genome_and_annotation.nf +++ b/subworkflows/local/genome_and_annotation.nf @@ -9,6 +9,7 @@ include { GFFREAD } from '../../modules/nf-core/gffr include { ORTHOFINDER } from '../../modules/nf-core/orthofinder/main' include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator/main' include { GENE_OVERLAPS } from '../../modules/local/gene_overlaps' +include { NUMBER_SEQS } from '../../modules/local/number_seqs' workflow GENOME_AND_ANNOTATION { @@ -52,12 +53,21 @@ workflow GENOME_AND_ANNOTATION { | combine(ch_gff_agat, by:0) // by:0 | Only combine when both channels share the same id | combine(ch_gff_long, by:0) | multiMap { - meta, fasta, gff_unfilt, gff_filt -> // null is is probably not necessary + meta, fasta, gff_unfilt, gff_filt -> // "null" probably not necessary fasta : fasta ? tuple( meta, file(fasta) ) : null // channel: [ val(meta), [ fasta ] ] gff_unfilt : gff_unfilt ? tuple( meta, file(gff_unfilt) ) : null // channel: [ val(meta), [ gff ] ], unfiltered gff_filt : gff_filt ? tuple( meta, file(gff_filt) ) : null // channel: [ val(meta), [ gff ] ], filtered for longest isoform } + // + // Check number of sequences + // + + NUMBER_SEQS ( + ch_input.fasta + ) + ch_tree_data = ch_tree_data.mix(NUMBER_SEQS.out.n_seqs.collect { meta, file -> file }) + // // Run AGAT Spstatistics // @@ -74,6 +84,7 @@ workflow GENOME_AND_ANNOTATION { GENE_OVERLAPS { ch_input.gff_filt } + ch_tree_data = ch_tree_data.mix(GENE_OVERLAPS.out.overlap_counts.collect { meta, file -> file }) // // MODULE: Run Quast