-
Notifications
You must be signed in to change notification settings - Fork 3
/
GenestoJunctions_v2.R
executable file
·103 lines (87 loc) · 5.34 KB
/
GenestoJunctions_v2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#GenestoJunctions_v2.R: The next code will get the unique genes associated to each junction and will associate this list of genes to each junctions in the original bed file
#Additionally, we will study the type of each junction (1: Annotated, 2: New connection, 3: 5ss, 4: 3ss, 5: New junction)
#V2: This version has been adapted for been used for change_gtf.sh
#arg[1]: Scripts dir --> Root to the path of Junckey scripts
#arg[2]: Input file --> SJ.out.bed: file with all the junctions
#arg[3]: Input file --> SJ.out.enriched.unique.bed: file with the junctions which falls in exons, according to the annotation
#arg[4]: Input file --> SJ.out.enriched.filtered.bed: file with the exons that overlaps exactly with the junction
#arg[5]: Ouput file --> SJ.out.junction.type.bed: output file with the list junction and the type of each junction
#arg[6]: Output file --> SJ.out.geneAnnotated.bed: output file with the whole information
#install.packages("data.table")
suppressMessages(require(data.table))
########################
#Gene annotation
########################
cat("\tGenestoJunctions_v2.R: Gene annotation...\n")
CHARACTER_command_args <- commandArgs(trailingOnly=TRUE)
scripts_path <- CHARACTER_command_args[1]
# scripts_path <- "/genomics/users/juanluis/comprna/Junckey"
original_file_bed <- fread(CHARACTER_command_args[2])
# original_file_bed <- fread("/projects_rg/SCM/tables_out2/TCGA-05-4244-01A-01R-1107-07/SJ.out.bed")
colnames(original_file_bed) <- c("chrom","start","end","id","unique_junction_reads","strand","annotated")
original_file_bed <- as.data.frame(original_file_bed)
file_unique <- fread(CHARACTER_command_args[3],header=FALSE,sep=" ",quote="")
# file_unique <- fread("/projects_rg/SCM/tables_out2/TCGA-05-4244-01A-01R-1107-07/SJ.out.enriched.unique.bed",header=FALSE,sep=" ")
file_unique <- as.data.frame(file_unique)
#Remove the " and ; from the genes column
file_unique$V2 <- unlist(lapply(file_unique$V2,function(x)gsub("\"|\\;","",x,perl=TRUE)))
#Remove those with 0 gene associated
file_unique_f <- file_unique[which(file_unique$V2!="0"),]
#There are ids with more than one gene associated
table <- table(as.character(file_unique_f$V1))
duplicated_ids <- row.names(table[which(table!=1)])
duplicated_ids2 <- file_unique_f[which(as.character(file_unique_f$V1)%in%duplicated_ids),]
#Sort the file by V1
duplicated_ids2_sorted <- duplicated_ids2[order(duplicated_ids2$V1),]
table2 <- table(as.character(duplicated_ids2_sorted$V1))
#For each id, take all the id genes and paste it in one line
id <- ""
list_genes <- ""
matrix_output <- matrix(data="",nrow=length(table2),ncol=2)
colnames(matrix_output) <- c("Id","Genes")
cont <- 1
i <- 1
for(i in 1:nrow(duplicated_ids2_sorted)){
if(i==1){
id <- as.character(duplicated_ids2_sorted[i,]$V1)
list_genes <- as.character(duplicated_ids2_sorted[i,]$V2)
}
else if(id!=as.character(duplicated_ids2_sorted[i,]$V1)){
matrix_output[cont,1] <- id
matrix_output[cont,2] <- list_genes
cont <- cont + 1
id <- as.character(duplicated_ids2_sorted[i,]$V1)
list_genes <- as.character(duplicated_ids2_sorted[i,]$V2)
}
else{
list_genes <- paste0(list_genes,",",as.character(duplicated_ids2_sorted[i,]$V2))
}
}
matrix_output[cont,1] <- id
matrix_output[cont,2] <- list_genes
df_output <- as.data.frame(matrix_output)
#Associate this lists to the original SJ.out.enriched.unique.bed
file_unique2 <- merge(file_unique_f,df_output,by.x="V1",by.y="Id",all.x=TRUE)
file_unique2$Genes_final <- ifelse(is.na(file_unique2$Genes),as.character(file_unique2$V2),as.character(file_unique2$Genes))
file_unique3 <- unique(file_unique2[,c("V1","Genes_final")])
#Associate this info to the original bed file
original_file_bed_final <- merge(original_file_bed,file_unique3,by.x="id",by.y="V1",all.x=TRUE)
colnames(original_file_bed_final)[8] <- "Associated_genes"
# write.table(original_file_bed_final,file=CHARACTER_command_args[3],sep="\t",quote=FALSE,row.names=FALSE)
########################
#Type of the junctions
########################
#Call the next python script for obtaining the type of our junctions
work.dir <- getwd()
system(paste0("module load Python; python ", scripts_path, "/GenestoJunctions.py ",CHARACTER_command_args[4], " ",CHARACTER_command_args[5],"; module unload Python;"))
# system(paste0("python ", scripts_path, "/GenestoJunctions.py ","/projects_rg/SCLC_cohorts/George/STAR/v2/S00035T/SJ.out.enriched.filtered.bed"," ","/projects_rg/SCLC_cohorts/George/STAR/v2/S00035T/SJ.out.junction.type.bed"))
output_df <- as.data.frame(fread(CHARACTER_command_args[5],header = FALSE,sep="\t"))
# output_df <- as.data.frame(fread("/projects_rg/SCLC_cohorts/George/STAR/v2/S00035T/SJ.out.junction.type.bed",header = FALSE,sep="\t"))
#Associate this info to the original bed file
original_file_bed_final2 <- merge(original_file_bed_final,output_df,by.x="id",by.y="V1",all.x=TRUE)
colnames(original_file_bed_final2)[9] <- "Type_junction"
original_file_bed_final2$Type_junction <- ifelse(is.na(original_file_bed_final2$Type_junction),5,original_file_bed_final2$Type_junction)
write.table(original_file_bed_final2,file=CHARACTER_command_args[6],sep="\t",quote=FALSE,row.names=FALSE)
# write.table(original_file_bed_final2,file="/projects_rg/SCLC_cohorts/George/STAR/v2/S00035T/SJ.out.geneAnnotated.bed",sep="\t",quote=FALSE,row.names=FALSE)
cat("\tGenestoJunctions_v2.R: Saved SJ.out.geneAnnotated.bed\n")
cat("\tGenestoJunctions_v2.R: Finish\n")