diff --git a/.gitignore b/.gitignore index 64933bc..f0dcf21 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ testing* testing/ work/ log +out diff --git a/bin/gene_overlaps.R b/bin/gene_overlaps.R old mode 100644 new mode 100755 index 8e7e1c9..85b44dd --- a/bin/gene_overlaps.R +++ b/bin/gene_overlaps.R @@ -1,7 +1,12 @@ #!/usr/bin/env Rscript # Code written by Chris Wyatt with some editing by ChatGPT -#!/usr/bin/env Rscript + +# Load required libraries +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager", repos = "https://cloud.r-project.org/") +} +BiocManager::install(c("GenomicRanges", "IRanges", "GenomeInfoDb", "BiocGenerics")) library(GenomicRanges) @@ -38,15 +43,23 @@ genes <- gr[gr$feature == "gene"] # Find overlaps overlap_results <- findOverlaps(genes, genes, ignore.strand = FALSE) +# Initialize counts +sense_count_within <- 0 +antisense_count_within <- 0 + # Initialize a results data frame results <- data.frame( - gene1_id = character(), - gene1_start = integer(), - gene1_end = integer(), - gene2_id = character(), - gene2_start = integer(), - gene2_end = integer(), - overlap_percent = numeric(), + query_gene = character(), + subject_gene = character(), + query_start = integer(), + query_end = integer(), + subject_start = integer(), + subject_end = integer(), + query_strand = character(), + subject_strand = character(), + overlap_type = character(), + query_overlap_pct = numeric(), + subject_overlap_pct = numeric(), stringsAsFactors = FALSE ) @@ -56,27 +69,47 @@ for (i in seq_len(length(overlap_results))) { subject_idx <- subjectHits(overlap_results)[i] if (query_idx != subject_idx) { - gene1 <- genes[query_idx] - gene2 <- genes[subject_idx] + query_gene <- genes[query_idx] + subject_gene <- genes[subject_idx] - overlap_range <- intersect(ranges(gene1), ranges(gene2)) + overlap_range <- intersect(ranges(query_gene), ranges(subject_gene)) overlap_length <- width(overlap_range) - gene1_length <- width(ranges(gene1)) - gene2_length <- width(ranges(gene2)) + query_length <- width(ranges(query_gene)) + subject_length <- width(ranges(subject_gene)) + + query_overlap_pct <- (overlap_length / query_length) * 100 + subject_overlap_pct <- (overlap_length / subject_length) * 100 + + # Determine overlap type + if (strand(query_gene) == strand(subject_gene)) { + overlap_type <- "sense" + } else { + overlap_type <- "antisense" + } - overlap_percent <- (overlap_length / gene1_length) * 100 + # Increment counters for fully contained overlaps + if (query_overlap_pct == 100 && overlap_type == "sense") { + sense_count_within <- sense_count_within + 1 + } else if (query_overlap_pct == 100 && overlap_type == "antisense") { + antisense_count_within <- antisense_count_within + 1 + } + # Append to results results <- rbind( results, data.frame( - gene1_id = gene1$attribute, - gene1_start = start(gene1), - gene1_end = end(gene1), - gene2_id = gene2$attribute, - gene2_start = start(gene2), - gene2_end = end(gene2), - overlap_percent = overlap_percent, + query_gene = query_gene$attribute, + subject_gene = subject_gene$attribute, + query_start = start(query_gene), + query_end = end(query_gene), + subject_start = start(subject_gene), + subject_end = end(subject_gene), + query_strand = as.character(strand(query_gene)), + subject_strand = as.character(strand(subject_gene)), + overlap_type = overlap_type, + query_overlap_pct = query_overlap_pct, + subject_overlap_pct = subject_overlap_pct, stringsAsFactors = FALSE ) ) @@ -86,4 +119,7 @@ for (i in seq_len(length(overlap_results))) { # Write results to output file write.table(results, file = output_table, sep = "\t", row.names = FALSE, quote = FALSE) +# Print the summary +cat("Number of genes fully contained in sense direction:", sense_count_within, "\n") +cat("Number of genes fully contained in antisense direction:", antisense_count_within, "\n") cat("Overlap analysis complete. Results saved to:", output_table, "\n") diff --git a/modules/local/gene_overlaps.nf b/modules/local/gene_overlaps.nf index 0d46d81..563afc4 100644 --- a/modules/local/gene_overlaps.nf +++ b/modules/local/gene_overlaps.nf @@ -1,7 +1,7 @@ process GENE_OVERLAPS { tag "$meta.id" label 'process_single' - container = 'ecoflowucl/gene_overlap:v1' + container = 'ecoflowucl/gene_overlap:v1.0' input: tuple val(meta), path(gff) @@ -17,7 +17,7 @@ process GENE_OVERLAPS { """ #Run overlap R script - gene_overlaps.R $gff My_output + gene_overlaps.R $gff Summary.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/genome_and_annotation.nf b/subworkflows/local/genome_and_annotation.nf index f5b0f77..bb9df2d 100644 --- a/subworkflows/local/genome_and_annotation.nf +++ b/subworkflows/local/genome_and_annotation.nf @@ -8,6 +8,7 @@ include { PLOT_BUSCO_IDEOGRAM } from '../../modules/local/plot_b include { GFFREAD } from '../../modules/nf-core/gffread/main' include { ORTHOFINDER } from '../../modules/nf-core/orthofinder/main' include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator/main' +include { GENE_OVERLAPS } from '../../modules/local/gene_overlaps' workflow GENOME_AND_ANNOTATION { @@ -66,6 +67,14 @@ workflow GENOME_AND_ANNOTATION { ) ch_versions = ch_versions.mix(AGAT_SPSTATISTICS.out.versions.first()) + // + // MODULE: Run gene overlap module + // + + GENE_OVERLAPS { + ch_input.gff_filt + } + // // MODULE: Run Quast // @@ -111,7 +120,7 @@ workflow GENOME_AND_ANNOTATION { } | collect // Collect all fasta in a single tuple | map { fastas -> - [[id:"orthofinder"], fastas] + [[id:"orthofinder"], fastas] } // Run orthofinder @@ -142,7 +151,7 @@ workflow GENOME_AND_ANNOTATION { // Prepare BUSCO output BUSCO_BUSCO.out.full_table.map { meta, full_tables -> full_tables }.view() - + //BUSCO_BUSCO.out.full_table.map { meta, full_tables -> full_tables.collect { it -> it.toString().split('/')[-2] } }.view() //ch_busco_full_table = BUSCO_BUSCO.out.full_table