Merge pull request #20 from jsalignon/revisions

Revisions
jsalignon · Apr 22, 2024 · 6db3423 · 6db3423
2 parents 2f4f23c + 8847b3d
commit 6db3423
Show file tree

Hide file tree

Showing 44 changed files with 710 additions and 84 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,28 @@ Types of changes
     Security in case of vulnerabilities. -->
 
 
+## [v0.9.0](https://github.com/jsalignon/cactus/releases/tag/v0.9.0) - Gymnocalycium, 22-04-2024
+
+### Added
+
+ - [#20](https://github.com/jsalignon/cactus/pull/20) - Added a tutorial section in the documentation and an associated script.
+ - [#20](https://github.com/jsalignon/cactus/pull/20) - Added figures for the manuscript: genome track plots, examples of figures and tables from the worm test dataset.
+ - [#20, commit 8ab3377](https://github.com/jsalignon/cactus/commit/8ab33772fd9493a542478ebc1746ae3eb9af0cc3) - Added Peak Annotation filters to keep only peaks at less than 3, 8 or 30 kilobases to their closest gene.
+ - [#20, commit 7f797f8](https://github.com/jsalignon/cactus/commit/7f797f892db6d83f9409eea33d5cf658ebe6ad4b) - Added ChIPseeker options to ignore upstream or downstream of peak for closest gene annotation.
+ - [#19](https://github.com/jsalignon/cactus/pull/19) - Added quickstart scripts.
+
+### Changed
+
+### Fixed
+
+ - [#20, commit 250afb6](https://github.com/jsalignon/cactus/commit/250afb6174c3e976ec9eda72da480732daf8a938#diff-6401496ba455b9488ffa902a6e4d7732b2c60ff2d77c5c3ef96b28a7ac7d3b28) - Fixing a typo which prevented bigWig files from being saved.
+
+### Deprecated
+
+### Removed
+
+### Dependencies
+
 
 ## [v0.8.6](https://github.com/jsalignon/cactus/releases/tag/v0.8.6) - Pygmaeocereus, 30-11-2023
 

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 
 <img src="/docs/images/logo_cactus.png" width="400" />
 
-* [Introduction](/README.md): [Quick Start](/docs/1_Intro/Quick_start.md), [Flowchart](/docs/1_Intro/Flowchart.md), [Outputs structure](/docs/1_Intro/Outputs_structure.md)
+* [Introduction](/README.md): [Quick Start](/docs/1_Intro/Quick_start.md), [Tutorial](/docs/1_Intro/tutorial.md), [Flowchart](/docs/1_Intro/Flowchart.md), [Outputs structure](/docs/1_Intro/Outputs_structure.md)
 * [Install](/docs/2_Install/2_Install.md): [Dependencies](/docs/2_Install/Dependencies.md), [Containers](/docs/2_Install/Containers.md), [References](/docs/2_Install/References.md), [Test datasets](/docs/2_Install/Test_datasets.md)
 * [Inputs](/docs/3_Inputs/3_Inputs.md): [Data](/docs/3_Inputs/Data.md), [Design](/docs/3_Inputs/Design.md), [Parameters](/docs/3_Inputs/Parameters.md)
 * [1. Preprocessing](/docs/4_Prepro/4_Prepro.md): [ATAC reads](/docs/4_Prepro/ATAC_reads.md), [ATAC peaks](/docs/4_Prepro/ATAC_peaks.md), [mRNA](/docs/4_Prepro/mRNA.md)

diff --git a/case_study/2__running_the_case_study.sh b/case_study/2__running_the_case_study.sh
@@ -8,14 +8,15 @@
 homedir=~
 eval homedir=$homedir
 cactus_dir=$homedir/workspace/cactus
-test_dir=$cactus_dir/test_datasets  
+test_ds_dir=$cactus_dir/test_datasets  
 # case_study_dir=$test_ds_dir/case_study/data
 case_study_dir=$test_ds_dir/application_note
 
 ## setting up variables
 cpu_nb=30
 memory_size='300G'
-cactus_version="v0.8.6" ; latest_flag=''
+# cactus_version="v0.8.6" ; latest_flag=''
+cactus_version="revisions" ; latest_flag=''
 
 # nextflow drop jsalignon/cactus # if needed
 figshare_version=v4
@@ -26,7 +27,8 @@ heatmaps_filters_motifs="c( 30, 20, 10, T, 2, 'ward.D', F )"
 heatmaps_params__peaks_self="c( 0.05, T, 'none', T, 50, 'UUDD', 2.5 )"
 common__heatmaps_ggplot="c( 12, 12, 12 )" 
 heatmaps_ggplot__chrom_states="c( 8, 10, 7 )"
-res_prefix="results/2023_11_26__"
+# res_prefix="results/2023_11_26__"
+res_prefix="results/2024_04_09__"
 
 ## Initialization
 #################

diff --git a/case_study/3__getting_figure_panels.sh b/case_study/3__getting_figure_panels.sh
@@ -72,6 +72,14 @@ cp $case_study_dir_human_2_pro_3/Heatmaps__chrom_states/ATAC__all__1.3__ctl__chr
 # Tab S1: reprogramming genes defined in Figure 6L of the Kolundzic et al. study
 cp $case_study_dir_human_1/Tables_Merged/2_Differential_Analysis/res_simple.xlsx $case_study_copy_dir
 
+# Fig S2: HA-HE gene tracks
+find "$case_study_dir/worm/results/2024_04_09___worm_run_1/Processed_Data" -regex ".*\(hmg4\|ctl\).*.bw" -exec cp "{}" $case_study_copy_dir \;
+# find "$case_study_dir/worm/results/2024_04_09___worm_run_1/Processed_Data" -regex ".*\(hmg4\|ctl\).*.bw" -exec bash -c 'cp "$0" "${1}/$(basename "${0%.*}").bigWig"' {} "$case_study_copy_dir" \;
+cp "$case_study_dir/worm/results/2024_04_09___worm_run_1/Tables_Individual/2_Differential_Analysis/res_filter/hmg4_vs_ctl__res_filter.xlsx" $case_study_copy_dir
+cp "$case_study_dir/worm/results/2024_04_09___worm_run_1/Processed_Data/2_Differential_Analysis/DA_split__bed_regions/both_ATAC__all__up__1.3__hmg4_vs_ctl__regions.bed" $case_study_copy_dir
+cp "$case_study_dir/worm/results/2024_04_09___worm_run_1/Processed_Data/2_Differential_Analysis/DA_split__bed_regions/both_ATAC__all__down__1.3__hmg4_vs_ctl__regions.bed" $case_study_copy_dir
+
+
 
 # converting the heavy plots to png
 pdftoppm -rx 300 -ry 300 -png \

diff --git a/case_study/4__making_figures.R b/case_study/4__making_figures.R
@@ -13,6 +13,10 @@ R
 library(ggplot2)
 library(magrittr)
 library(data.table)
+library(Gviz)
+library(rtracklayer)
+library(purrr)
+library(openxlsx)
 
 readRDS1 <- function(x) readRDS(paste0('panels/', x))
 pdf_a4   <- function(x, ...) pdf(x, paper = 'a4r', ...)
@@ -24,6 +28,10 @@ update_text_size <- function(px) {
 		)
 }
 
+
+
+
+
 # Fig 5: peak_self heatmaps
 
 p1 = readRDS1('mRNA__Null__1.3__ctl__peaks_self__heatmap__worm.rds') + ggtitle('worm, DEGs') 
@@ -34,7 +42,7 @@ p4 = readRDS1('ATAC__all__1.3__ctl__peaks_self__heatmap__human.rds') + ggtitle('
 pp = ggpubr::ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = T, 
 	legend = 'right', labels = letters[1:4])
 
-pdf('Figure_5.pdf', width = 7.5, height = 7.5)
+pdf('Figure_4.pdf', width = 7.5, height = 7.5)
 	print(pp)
 dev.off()
 
@@ -59,7 +67,7 @@ p1 = p1 + theme(
 pp = ggpubr::ggarrange(p1, p2, p3, p4, ncol = 4, nrow = 1, common.legend = T, 
 	legend = 'right', labels = letters[1:4], widths = c(1.19, 1.3, 1.02, 1.4))
 
-pdf('Figure_6.pdf', width = 7, height = 5.7)
+pdf('Figure_5.pdf', width = 7, height = 5.7)
 	print(pp)
 dev.off()
 
@@ -75,7 +83,7 @@ p2 = p2 + theme(plot.title = element_text(hjust = 1))
 pp = ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = T, 
 	legend = 'right', labels = letters[1:2])
 
-pdf('Figure_7.pdf', width = 7, height = 7)
+pdf('Figure_6.pdf', width = 7, height = 7)
 	print(pp)
 dev.off()
 
@@ -99,7 +107,7 @@ p1 = p1 + theme(
 pp = ggpubr::ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = T, 
 	legend = 'right', labels = letters[1:3], widths = c(1.2, 1.03, 1.27))
 
-pdf('Figure_8.pdf', width = 7, height = 3.5)
+pdf('Figure_7.pdf', width = 7, height = 3.5)
 	print(pp)
 dev.off()
 
@@ -120,3 +128,195 @@ dt_dw = dt_dw[order(-L2FC), c('COMP', 'gene_name', 'gene_id', 'pval', 'padj', 'L
 
 openxlsx::write.xlsx(list(up = dt_up, down = dt_dw), 'Table_S1.xlsx')
 
+
+
+# Supplementary Figures showing all outputs from Cactus
+
+png_to_gg <- function(png_file, margin = 20) {
+	ggplot2::ggplot() + ggplot2::annotation_custom(
+		grid::rasterGrob(png::readPNG(png_file), 
+			width = ggplot2::unit(1,"npc"), 
+			height = ggplot2::unit(1,"npc")), 
+		-Inf, Inf, -Inf, Inf) + theme(plot.margin = 
+    margin(t = margin, r = margin, b = margin, l = margin, unit = "pt"))
+}
+
+grep1 <- function(x) grep(x, v_png_files, value = T)
+
+plot_nrow_by_ncol <- function(v_files, nrow = 4, ncol = 3, v_labels = 'auto',
+	margin = 10){
+	lp = lapply(v_files, png_to_gg, margin = margin)
+	p1 = ggpubr::ggarrange(plotlist = lp, nrow = nrow, ncol = ncol, 
+			labels = v_labels)
+	return(p1)
+}
+
+plot_figures <- function(v_files, nrow = 4, ncol = 3, v_labels = 'auto'){
+	plot_nrow_by_ncol(v_files, nrow = nrow, ncol = ncol, v_labels = v_labels)
+}
+
+plot_tables <- function(v_files, nrow = 6, ncol = 2, v_labels = v_labels,
+	margin = 15){
+	plot_nrow_by_ncol(v_files, nrow = nrow, ncol = ncol, v_labels = v_labels,
+		margin = margin)
+}
+
+v_png_files = c(list.files('../../docs/examples/png', full.names = T), 
+				list.files('../../docs/examples/xlsx_png', full.names = T))
+
+v_files_1_QC_reads = grep1('\\/(pca|spearman|ctl_1__(reads|insert))')
+v_files_1_QC_satur = grep1('\\/ctl_1__saturation_curve')
+v_files_1_QC_peaks = grep1('(\\/(ctl_1__(peaks|average)|ATAC__peaks))')
+v_files_1_QC       = c(v_files_1_QC_reads, v_files_1_QC_satur, 
+	v_files_1_QC_peaks)
+p_QC = plot_figures(v_files_1_QC)
+pdf('Figure_S1.pdf', width = 7, height = 10)
+	print(p_QC)
+dev.off()
+
+
+v_files_2_DA_fig   = grep1('\\/hmg4_vs_ctl__') %>% c(grep1('venn'))
+v_files_2_DA_fig %<>% .[. != '../../docs/examples/png/hmg4_vs_ctl__ATAC_other_plots.png']
+v_files_2_DA_fig %<>% .[. != '../../docs/examples/png/hmg4_vs_ctl__mRNA_other_plots.png']
+v_files_2_DA_fig %<>% sort %>% rev
+v_files_2_DA_tab   = c(grep1('xlsx_png\\/ATAC'), grep1('xlsx_png\\/mRNA'), 
+	grep1('xlsx_png\\/res'))
+length(v_files_2_DA_fig) # 15
+length(v_files_2_DA_fig) - 12 # 3
+length(v_files_2_DA_tab) #  7
+length(v_files_2_DA_tab) + length(v_files_2_DA_fig) - 12 #  10
+p_DA_1   = plot_figures(v_files_2_DA_fig[1:12])
+p_DA_2_1 = plot_figures(v_files_2_DA_fig[13:15], nrow = 1,
+	v_labels = letters[12 + (1:3)])
+p_DA_2_2 = plot_tables(v_files_2_DA_tab, v_labels = letters[12 + (4:10)], 
+	nrow = 5)
+p_DA_2 = ggpubr::ggarrange(p_DA_2_1, p_DA_2_2, nrow = 2, ncol = 1, 
+	heights = c(0.25, 0.75) )
+pdf('Figure_S2.pdf', width = 7, height = 10)
+	print(p_DA_1)
+	print(p_DA_2)
+dev.off()
+
+
+v_files_3_EN_fig   = c(grep1('__barplot\\.'), grep1('__heatmap\\.'))
+v_files_3_EN_tab   = grep1('xlsx_png\\/(CHIP|motifs|func|peaks|genes|chrom)')
+length(v_files_3_EN_fig) # 14
+length(v_files_3_EN_fig) - 12 # 2
+length(v_files_3_EN_tab) #  7
+p_EN_1   = plot_figures(v_files_3_EN_fig[1:12])
+p_EN_2_1 = plot_figures(v_files_3_EN_fig[13:14], nrow = 1,
+	v_labels = letters[12 + (1:2)])
+p_EN_2_2 = plot_tables(v_files_3_EN_tab, 
+
+	v_labels = letters[12 + (3:10)], nrow = 5)
+p_EN_2 = ggpubr::ggarrange(p_EN_2_1, p_EN_2_2, nrow = 2, ncol = 1, 
+	heights = c(0.25, 0.75) )
+pdf('Figure_S3.pdf', width = 7, height = 10)
+	print(p_EN_1)
+	print(p_EN_2)
+dev.off()
+
+
+# Fig S2: HA-HE gene tracks
+
+# https://jnicolaus.com/tutorials-old/gviztutorial.html
+# https://ivanek.github.io/Gviz/reference/plotTracks.html
+
+
+
+
+get_ensGenes <- function(genome, chromosome, start, end){
+	ensGenes = UcscTrack(genome = genome, chromosome = chromosome, 
+                      track="NCBI RefSeq", table = "refGene", 
+                       # track="Ensembl Genes", table = "ensemblSource",
+                      from = start, to = end,
+                      trackType = "GeneRegionTrack", 
+                      rstarts = "exonStarts", rends = "exonEnds",
+                      gene = "name2", symbol = "name2", 
+                      transcript = "name", strand = "strand", 
+                      fill = "#960000", showId = TRUE, geneSymbol = TRUE)
+	# ?GeneRegionTrack
+	# https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=2127429502_LdAOU7oroxVxnxNJPIAbY9BcGbOW&clade=worm&org=C.+elegans&db=ce11&hgta_group=genes&hgta_track=refSeqComposite&hgta_table=refGene&hgta_regionType=genome&position=chrII%3A14%2C646%2C376-14%2C667%2C875&hgta_outputType=primaryTable&hgta_outFileName=
+	return(ensGenes)
+}
+
+
+get_bigwig_plot <- function(file, name, genome, chromosome, start, end, 
+	ylim = NULL){
+	bw_med1 <- import.bw(file, as="GRanges")
+	bw_med1@seqnames@values %<>% paste0('chr', .) %>% as.factor
+	bw_med1@seqinfo@seqnames %<>% paste0('chr', .)
+	bw_med1@seqnames
+
+	options(ucscChromosomeNames = F)
+	dTrack2 <- DataTrack(bw_med1, genome = genome,
+	  chromosome = chromosome,from = start, to = end, name = name, type = 'l', 
+	  ylim = ylim, 
+		col.histogram  = ifelse(grepl('ctl', name), '#FDE725FF', '#440154FF'),
+		fill.histogram = ifelse(grepl('ctl', name), '#FDE725FF', '#440154FF'))
+	return(dTrack2)
+}
+
+
+get_peak_track <- function(type){
+	peak_file = paste0('panels/', 'both_ATAC__all__', type, 
+		'__1.3__hmg4_vs_ctl__regions.bed')
+	if(type == 'up')   track_name = 'HA-HE' 
+	if(type == 'down') track_name = 'LA-LE' 
+	ref.df = read.table(peak_file, header = FALSE,
+                     stringsAsFactors = FALSE) 
+	ref.gr=GRanges(seqnames = ref.df[,1], ranges = IRanges(start = ref.df[,2], 
+					end = ref.df[,3]), strand = ref.df[,6], name = ref.df[,4])
+	ptrack = AnnotationTrack(ref.gr, name = track_name)
+	return(ptrack)
+}
+
+
+
+
+plot_samples_and_ref <- function(chromosome, start, end, title, type, 
+	ylim = NULL){
+
+	genome = 'ce11'
+
+	# gtrack <- GenomeAxisTrack()
+
+	l_bw_p = imap(l_bw, ~get_bigwig_plot(.x, .y, genome, chromosome, start, end,
+	ylim))
+
+	ptrack = get_peak_track(type)
+
+	ensGenes = get_ensGenes(genome, chromosome, start, end)
+
+	plotTracks(c(l_bw_p, list(ptrack, ensGenes)), from = start, to = end, main = title,
+		type = 'histogram', background.title = 'darkgrey')
+
+}
+
+l_bw = list.files('panels', pattern = '(ctl|hmg4).*.bw', 
+	full.names = T)[c(4:6, 1:3)] %>% setNames(., gsub('.*\\/', '', .) %>% 
+	gsub('\\.bw', '', .)) %>% as.list
+
+# # checking the top genes
+# # dt = openxlsx::read.xlsx('panels/hmg4_vs_ctl__res_filter.xlsx') %>% setDT
+# dt = openxlsx::read.xlsx('panels/hmg4_vs_ctl__res_filter.xlsx') %>% setDT
+# dt[ET == 'both_ATAC' & L2FC > 0]$gene_name[1:4] # "R03H10.6" "F08G2.5"  "zip-2"    "tsp-1"
+# dt[ET == 'both_ATAC' & L2FC < 0]$gene_name[1:4] # "nhr-76"  "ceh-60"  "F11E6.8" "ZK381.2"
+
+
+pdf('tmp1.pdf', width = 7, height = 7)
+	plot_samples_and_ref('chrIII', 8236501, 8239980, 
+		title = 'Example of a HA-HE gene', 
+		type = 'up', ylim = c(0, 500)) # tsp-1
+	plot_samples_and_ref('chrIV', 626537, 634963,
+		title = 'Example of a LA-LE gene', 
+		type = 'down', ylim = c(0, 500)) # nhr-76
+dev.off()
+
+
+system('qpdf --rotate=+90 tmp1.pdf rotated.pdf')
+system('a5toa4 rotated.pdf')
+system('qpdf --rotate=+270 rotated-sidebyside.pdf Figure_S6.pdf')
+file.remove(c('tmp1.pdf', 'rotated.pdf', 'rotated-sidebyside.pdf'))
+
+
diff --git a/conf/run_defaults.config b/conf/run_defaults.config
@@ -81,6 +81,8 @@ params {
 	chipseeker__overlap             = 'all'
 	chipseeker__ignore_overlap      = 'FALSE'
 	chipseeker__annotation_priority = "c('Promoter', '5UTR', '3UTR', 'Exon', 'Intron', 'Downstream', 'Intergenic')"
+	chipseeker__ignore_upstream     = 'FALSE'
+	chipseeker__ignore_downstream   = 'FALSE'
 
 
 	///// c. mRNA

diff --git a/docs/1_Intro/Flowchart.md b/docs/1_Intro/Flowchart.md
@@ -1,7 +1,7 @@
 
 <img src="/docs/images/logo_cactus.png" width="400" />
 
-* [Introduction](/README.md): [Quick Start](/docs/1_Intro/Quick_start.md), [Flowchart](/docs/1_Intro/Flowchart.md), [Outputs structure](/docs/1_Intro/Outputs_structure.md)
+* [Introduction](/README.md): [Quick Start](/docs/1_Intro/Quick_start.md), [Tutorial](/docs/1_Intro/tutorial.md), [Flowchart](/docs/1_Intro/Flowchart.md), [Outputs structure](/docs/1_Intro/Outputs_structure.md)
 * [Install](/docs/2_Install/2_Install.md): [Dependencies](/docs/2_Install/Dependencies.md), [Containers](/docs/2_Install/Containers.md), [References](/docs/2_Install/References.md), [Test datasets](/docs/2_Install/Test_datasets.md)
 * [Inputs](/docs/3_Inputs/3_Inputs.md): [Data](/docs/3_Inputs/Data.md), [Design](/docs/3_Inputs/Design.md), [Parameters](/docs/3_Inputs/Parameters.md)
 * [1. Preprocessing](/docs/4_Prepro/4_Prepro.md): [ATAC reads](/docs/4_Prepro/ATAC_reads.md), [ATAC peaks](/docs/4_Prepro/ATAC_peaks.md), [mRNA](/docs/4_Prepro/mRNA.md)
@@ -29,4 +29,3 @@
 [Sleuth]: https://doi.org/10.1038/nmeth.4324
 [MACS2]: https://doi.org/10.1101/496521 
 [DiffBind]: https://doi.org/10.1038/nature10730
-
diff --git a/docs/1_Intro/Outputs_structure.md b/docs/1_Intro/Outputs_structure.md
@@ -1,7 +1,7 @@
 
 <img src="/docs/images/logo_cactus.png" width="400" />
 
-* [Introduction](/README.md): [Quick Start](/docs/1_Intro/Quick_start.md), [Flowchart](/docs/1_Intro/Flowchart.md), [Outputs structure](/docs/1_Intro/Outputs_structure.md)
+* [Introduction](/README.md): [Quick Start](/docs/1_Intro/Quick_start.md), [Tutorial](/docs/1_Intro/tutorial.md), [Flowchart](/docs/1_Intro/Flowchart.md), [Outputs structure](/docs/1_Intro/Outputs_structure.md)
 * [Install](/docs/2_Install/2_Install.md): [Dependencies](/docs/2_Install/Dependencies.md), [Containers](/docs/2_Install/Containers.md), [References](/docs/2_Install/References.md), [Test datasets](/docs/2_Install/Test_datasets.md)
 * [Inputs](/docs/3_Inputs/3_Inputs.md): [Data](/docs/3_Inputs/Data.md), [Design](/docs/3_Inputs/Design.md), [Parameters](/docs/3_Inputs/Parameters.md)
 * [1. Preprocessing](/docs/4_Prepro/4_Prepro.md): [ATAC reads](/docs/4_Prepro/ATAC_reads.md), [ATAC peaks](/docs/4_Prepro/ATAC_peaks.md), [mRNA](/docs/4_Prepro/mRNA.md)