-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathCGRPhylo_pipeline.r
68 lines (46 loc) · 2.51 KB
/
CGRPhylo_pipeline.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
############################
##### CGRPhylo Pipeline #####
#############################
setwd("./")
file <- "Input_recom_SARS_cov2.fasta"
library("seqinr")
fastafile <- seqinr::read.fasta(file = file, seqtype = "DNA", as.string = TRUE, set.attributes = FALSE)
source('cgat_function.r')
source('distances_n_other.r')
source('cgrplot.r')
##########################################################################
######## Filtering of the sequencing with a specified number of 'N' bases
##########################################################################
library(stringr)
N_filter <- 50 ## filter sequence with n bases > this value
fasta_filtered <- fastafile_new(fastafile, N_filter) ## create filtered sequence file using fastafile_new function
#write fasta file from filtered sequences
seqinr::write.fasta(sequences=fasta_filtered,names =names(fasta_filtered),file.out=paste("recombinant_XBB.1_Filter",N_filter,".fasta",sep = ''))
#write fasta file from filtered sequences
#seqinr::write.fasta(sequences=fastafile_new,names =sequence_new,file.out =paste("Filtered_N",N_filter,"_new_seq.fasta",sep = ''))
#######################################################################################
############ Create frequency matrix object for each sequence; one by one #############
######################################################################################
k_mer <- 6 ## define the value of K
Freq_mat_obj <- list()
sequence_new <- names(fastafile_new)
source('cgat_function.r')
for(n in 1:length(fastafile_new)) {
##skip passing whole data to the function ##increase speed ## "fastafile_new" name is Locked
Freq_mat_obj[[n]] <- cgat(k_mer,n, nchar(fastafile[[n]])) # executing one seq at a time #k-mer,seq_length,trimmed_length
print(paste("processing sequence : ",n , sequence_new[n], sep=" "))
}
names(Freq_mat_obj) <- sequence_new
#################################################################################
############### calculates distances ############################################
################################################################################
j <- length(sequence_new)
distance <- matrix(0,j,j) ## defining the empty matrix from distance calculations
row.names(distance) <- sequence_new[1:j]
colnames(distance) <- sequence_new[1:j]
for(n in 1:j) {
for(cr in 1:j) {
distance[n,cr] <- matrixDistance(Freq_used[[n]],Freq_used[[cr]],distance_type ='Euclidean' ) ##'Euclidean','S_Euclidean' and Manhattan
}}
distance <- round(distance, 5)
#plot(hclust(as.dist(distance)))