diff --git a/00_merge_area-size.Rmd b/00_merge_area-size.Rmd
new file mode 100644
index 0000000..785016a
--- /dev/null
+++ b/00_merge_area-size.Rmd
@@ -0,0 +1,230 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+
+```{r}
+wd <- dirname(getwd())
+
+#Set working directory and folder structure
+data_folder <- file.path(wd,'clinical_data')
+
+#set plot folder for results
+plot_folder <- file.path(wd,"results","merged")
+
+set.seed(101100)
+
+```
+
+
+```{r}
+wd <-dirname(getwd())
+
+t86a <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_86A.csv")))
+t86b <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_86B.csv")))
+t86c <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_86C.csv")))
+
+t87a <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_87A.csv")))
+t87b <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_87B.csv")))
+t87c <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_87C.csv")))
+
+t88a <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_88A.csv")))
+t88b <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_88B.csv")))
+t88c <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_88C.csv")))
+
+t175a <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_175A.csv")))
+t175b <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_175B.csv")))
+t175c <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_175C.csv")))
+
+t176a <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_176A.csv")))
+t176b <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_176B.csv")))
+t176c <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_176C.csv")))
+
+t178a <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_178A.csv")))
+t178b <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_178B.csv")))
+t178c <- read.csv(file=file.path(wd,"clinical_data", paste("Resized_AreaMeasurements_Image_178C.csv")))
+
+
+t86a$TMA <- "86A"
+t86b$TMA <- "86B"
+t86c$TMA <- "86C"
+
+t87a$TMA <- "87A"
+t87b$TMA <- "87B"
+t87c$TMA <- "87C"
+
+t88a$TMA <- "88A"
+t88b$TMA <- "88B"
+t88c$TMA <- "88C"
+
+t175a$TMA <- "175A"
+t175b$TMA <- "175B"
+t175c$TMA <- "175C"
+
+t176a$TMA <- "176A"
+t176b$TMA <- "176B"
+t176c$TMA <- "176C"
+
+t178a$TMA <- "178A"
+t178b$TMA <- "178B"
+t178c$TMA <- "178C"
+
+t.area <-rbind(t86a,t86b,t86c,t87a,t87b,t87c,t88a,t88b,t88c,t175a,t175b,t175c,t176a,t176b,t176c,t178a,t178b,t178c)
+t.area$ImageID <- t.area$Metadata_acid
+t.area <- t.area %>% select(c(AreaOccupied_AreaOccupied_StromaBinary,AreaOccupied_AreaOccupied_TumourBinary,AreaOccupied_AreaOccupied_TumourStromaBinary,TMA, ImageID)) #ImageNumber
+
+t.area <-t.area %>% unite("TMA_ImageID", c(TMA, ImageID), remove=T)
+colnames(t.area) <- c("Area_px_Stroma","Area_px_Tumour","Area_px_Core", "TMA_ImageID")
+
+range(t.area$Area_px_Core)/1000000
+hist(t.area$Area_px_Core/1000000)
+summary(t.area$Area_px_Core/1000000)
+t.area.mm <-
+  t.area %>% 
+  dplyr::mutate_at(vars(Area_px_Stroma:Area_px_Core),
+            .funs = funs(. /1000000))
+
+t.area.mm[t.area.mm$Area_px_Core>1,]
+t.area.mm$ImageID <-NULL
+colnames(t.area.mm) <- c("Area_mm_Stroma","Area_mm_Tumour","Area_mm_Core", "TMA_ImageID")
+area <-left_join(t.area, t.area.mm, by="TMA_ImageID")
+
+write.csv(area, file=file.path(data_folder, "area.csv"))
+```
+
+
+```{r}
+colData(roi.sce) <-as.data.frame(colData(roi.sce)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+colData(pat.sce) <-as.data.frame(colData(pat.sce)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+colData(roi.sce) <-as.data.frame(colData(roi.sce)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+colData(roi.pat.sce) <-as.data.frame(colData(roi.pat.sce)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+colData(immune.u1) <-as.data.frame(colData(immune.u1)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+colData(tumour.final) <-as.data.frame(colData(tumour.final)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+
+colData(vessel.sce) <-as.data.frame(colData(vessel.sce)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+colData(vessel.vessel_pat.sce) <-as.data.frame(colData(vessel.vessel_pat.sce)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+
+colData(tumour.final) <-as.data.frame(colData(tumour.final)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+#immune
+immune.o <-immune.final
+immune.final <-immune.o
+colData(immune.final) <-as.data.frame(colData(immune.final)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+dat.sce <- as_tibble(colData(immune.final))
+
+dat.sce<-dat.sce %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE)
+
+dat.sce <-DataFrame(dat.sce)
+dat.sce<-merge(dat.sce, area, by="TMA_ImageID")
+rownames(dat.sce) <-paste(dat.sce$TmaID, dat.sce$TmaBlock, dat.sce$ImageID, dat.sce$CellNumber, sep='_')
+colData(immune.final) <- dat.sce
+colnames(immune.final) <-rownames(dat.sce)
+colData(immune.final)
+colnames(colData(immune.final))
+  
+#tcells
+tcells.o <-tcells.final
+tcells.final <-tcells.o
+colData(tcells.final) <-as.data.frame(colData(tcells.final)) %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE) %>% left_join(area, by="TMA_ImageID") %>% DataFrame()
+
+
+dat.sce <- as_tibble(colData(tcells.final))
+dat.sce$Area_mm_Core <-NULL
+dat.sce$Area_mm_Stroma <-NULL
+dat.sce$Area_mm_Tumour <-NULL
+dat.sce$Area_px_Core <-NULL
+dat.sce$Area_px_Stroma <-NULL
+dat.sce$Area_px_Tumour <-NULL
+dat.sce$TMA_Image <-NULL
+dat.sce<-dat.sce %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE)
+
+dat.sce <-DataFrame(dat.sce)
+dat.sce<-merge(dat.sce, area, by="TMA_ImageID")
+rownames(dat.sce) <-paste(dat.sce$TmaID, dat.sce$TmaBlock, dat.sce$ImageID, dat.sce$CellNumber, sep='_')
+colData(tcells.final) <- dat.sce
+colnames(tcells.final) <-rownames(dat.sce)
+colData(tcells.final)
+colnames(colData(tcells.final))
+
+#tumour
+dat.sce <- as_tibble(colData(tumour))
+
+dat.sce<-dat.sce %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE)
+
+dat.sce <-DataFrame(dat.sce)
+dat.sce<-merge(dat.sce, area, by="TMA_ImageID")
+rownames(dat.sce) <-paste(dat.sce$TmaID, dat.sce$TmaBlock, dat.sce$ImageID, dat.sce$CellNumber, sep='_')
+colData(tumour) <- dat.sce
+colnames(tumour) <-rownames(dat.sce)
+colData(tumour)
+colnames(colData(tumour))
+  
+#fibro
+dat.sce <- as_tibble(colData(fibro.final.new))
+
+dat.sce<-dat.sce %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE)
+
+dat.sce <-DataFrame(dat.sce)
+dat.sce<-merge(dat.sce, area, by="TMA_ImageID")
+rownames(dat.sce) <-paste(dat.sce$TmaID, dat.sce$TmaBlock, dat.sce$ImageID, dat.sce$CellNumber, sep='_')
+colData(fibro.final.new) <- dat.sce
+colnames(fibro.final.new) <-rownames(dat.sce)
+colData(fibro.final.new)
+colnames(colData(fibro.final.new))
+  
+#vessel
+dat.sce <- as_tibble(colData(vessel))
+
+dat.sce<-dat.sce %>% 
+  unite(TMA_ImageID, c(TMA, ImageID), sep = "_", remove = FALSE)
+
+dat.sce <-DataFrame(dat.sce)
+dat.sce<-merge(dat.sce, area, by="TMA_ImageID")
+rownames(dat.sce) <-paste(dat.sce$TmaID, dat.sce$TmaBlock, dat.sce$ImageID, dat.sce$CellNumber, sep='_')
+colData(vessel) <- dat.sce
+colnames(vessel) <-rownames(dat.sce)
+colData(vessel)
+colnames(colData(vessel))
+length(unique(tumour[,tumour$TMA=="88_A"]$ImageID))  
+order(unique(tumour[,tumour$TMA=="88_A"]$ImageID))
+fibro$Area_mm
+```
+
+```{r}
+wd <- dirname(getwd())
+
+#Set working directory and folder structure
+data_folder <- file.path(wd,'TMA_all',"SCE")
+
+
+set.seed(101100)
+
+saveRDS(tumour,file=file.path(data_folder,paste("tumour_sce_merge_minus_ctrl_area.rds")))
+saveRDS(immune,file=file.path(data_folder,paste("immune_sce_merge_minus_ctrl_area.rds")))
+saveRDS(tcells,file=file.path(data_folder,paste("tcells_sce_merge_minus_ctrl_area.rds")))
+saveRDS(fibro,file=file.path(data_folder,paste("fibro_sce_merge_minus_ctrl_area.rds")))
+saveRDS(vessel,file=file.path(data_folder,paste("vessel_sce_merge_minus_ctrl_area.rds")))
+```
\ No newline at end of file
diff --git a/00_prepare_clinicaldata.Rmd b/00_prepare_clinicaldata.Rmd
new file mode 100644
index 0000000..3f6fd8b
--- /dev/null
+++ b/00_prepare_clinicaldata.Rmd
@@ -0,0 +1,139 @@
+---
+title: "Prepare clinical data and position data"
+output: html_notebook
+---
+```{r install needed packages}
+#install.packages("zoo")              
+library("zoo")    
+```
+
+```{r Set file structure}
+wd <- (getwd())
+data_folder <- file.path(dirname(wd),'clinical_data')
+```
+
+
+```{r Read in clinical data}
+clinical.data <-read.csv(file=file.path(data_folder, paste("TMA_86_87_88_175_176_178_USZ.csv")))
+```
+
+```{r Read in position files}
+p86 <- read.csv(file=file.path(data_folder, paste("zTMA_86.csv")),header=T, na.strings=c("","NA"))
+p87 <- read.csv(file=file.path(data_folder, paste("zTMA_87.csv")),header=T, na.strings=c("","NA"))
+p88 <- read.csv(file=file.path(data_folder, paste("zTMA_88.csv")),header=T, na.strings=c("","NA"))
+
+p175 <-read.csv(file=file.path(data_folder, paste("zTMA_175.csv")),header=T, na.strings=c("","NA"))
+p176 <-read.csv(file=file.path(data_folder, paste("zTMA_176.csv")),header=T, na.strings=c("","NA"))
+p178 <- read.csv(file=file.path(data_folder, paste("zTMA_178.csv")),header=T, na.strings=c("","NA"))
+```
+
+```{r Add missing patient numbers and ctrls}
+p86$Patient_Nr <-na.locf(p86$Patient_Nr)
+p87$Patient_Nr <-na.locf(p87$Patient_Nr)
+p88$Patient_Nr <-na.locf(p88$Patient_Nr)
+
+p175$Patient_Nr <-na.locf(p175$Patient_Nr)
+p176$Patient_Nr <-na.locf(p176$Patient_Nr)
+p178$Patient_Nr <-na.locf(p178$Patient_Nr)
+```
+
+```{r Add TMA column to position data}
+p86["TMA"] <-"86"
+p87["TMA"] <-"87"
+p88["TMA"] <-"88"
+
+p175["TMA"] <-"175"
+p176["TMA"] <-"176"
+p178["TMA"] <-"178"
+```
+
+```{r merge clinical data and position data}
+clinical.data_pos <-clinical.data
+
+#change Nr. to Patient_Nr
+names(clinical.data_pos)[names(clinical.data_pos) == 'Nr.'] <- 'Patient_Nr'
+
+#combine all position files rowwise
+p_combined <- rbind(p86,p87,p88,p175,p176,p178)
+
+clinical.data_pos <-merge(p_combined, clinical.data_pos, by.x=c("Patient_Nr", "TMA"), by.y=c("Patient_Nr","TMA")) 
+```
+ 
+```{r save combined clincal and position data as csv file}
+#write.csv(clinical.data_pos, file=file.path(data_folder,paste("combined_clinical_position_data.csv")))
+
+clinical.data <- read.csv( file=file.path(data_folder,paste("combined_clinical_position_data.csv")))
+
+table(clinical.data$DX.name)
+clinical.data$DX.name[clinical.data$DX.name == "Adeno-Ca"] <- "Adenocarcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "Adeno-Ca "] <- "Adenocarcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "PE-Ca"] <- "Squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "adenosquam. Ca"] <- "Adeno squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "ASQ"] <- "Adeno squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "Adeno squamous cell carcinoma"] <- "Adeno squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "Adenosquamöses CA"] <- "Adeno squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "Adeno/squamous carcinoma"] <- "Adeno squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "LC"] <- "Large cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "SCC"] <- "Squamous cell carcinoma"
+clinical.data$DX.name[clinical.data$DX.name == "LCNEC"] <- "Large cell neuroendocrine carcinoma"
+
+unique(clinical.data$DX.name)
+#add factor levels
+levels(clinical.data$Gender) <- list(male=1, female=2)
+levels(clinical.data$Grade) <- list("missing"=0,"Grade 1"=1,  "Grade 2"=2,"Grade 3"=3)
+levels(clinical.data$Vessel) <- list("negative"=0, "positive"=1)
+levels(clinical.data$Pleura) <- list("negative"=0, "positive"=1)
+levels(clinical.data$R) <- list("R0"=0, "R1"=1)
+levels(clinical.data$Relapse) <- list("no"=0, "yes"=1)
+levels(clinical.data$Ev.O) <- list("alive"=0, "dead"=1)
+levels(clinical.data$T.new )<- list("1a"=1, "1b"=2,"2a"=3, "2b"=4, "3"=5, "4"=6)
+levels(clinical.data$M.new) <- list("no"=0, "1a"=1, "1b"=2)
+levels(clinical.data$N) <- list("N0"=0, "N1"=1,"N2"=2, "N3"=3)
+
+levels(clinical.data$Stage) <- list("1a"=1, "1b"=2,"2a"=3, "2b"=4, "3a"=5, "3b"=6,"4"=7)
+levels(clinical.data$Chemo) <- list("Neoadjuvant no"=0, "Neoadjuvant yes"=1)
+levels(clinical.data$Radio) <- list("Neoadjuvant no"=0, "Neoadjuvant yes"=1)
+levels(clinical.data$Chemo3) <- list("Adjuvant no"=0, "Adjuvant yes"=1)
+levels(clinical.data$Radio4) <- list("Adjuvant no"=0, "Adjuvant loc"=1, "Other"=2, "Both"=3)
+levels(clinical.data$Chemo3) <- list("Post relapse no"=0, "Post relapse yes"=1)
+levels(clinical.data$Radio6) <- list("Post relapse no"=0, "Post relapse loc"=1, "Post relapse other"=2, "Post relapse both"=3)
+levels(clinical.data$Smok) <- list("no"=0, "currently"=1, "former"=2, "unknown"=3)
+
+cols <- c("TMA","Gender","Typ","Grade","Vessel","Pleura","T.new","N","M.new","Stage","R","Chemo","Radio","Chemo3","Radio4","Relapse","Chemo5","Radio6","DFS","Ev.O" ,"Smok"   )
+clinical.data[cols] <- lapply(clinical.data[cols], as.factor)
+
+head(clinical.data)
+
+clinical.data$ROI <- paste(clinical.data$Grid, clinical.data$x.y.localisation, sep="")
+head(clinical.data$ROI)
+clinical.data$RoiID <-paste(clinical.data$TMA, clinical.data$ROI, sep="_")
+clinical.data$Patient_ID <- paste(clinical.data$TMA, clinical.data$Patient_Nr,sep="_")
+
+
+clinical.data$LN.Met <- ifelse(clinical.data$N ==0, "No LN Metastases", "LN Metastases")
+clinical.data$Dist.Met <- ifelse(clinical.data$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+clinical.data$NeoAdj <- ifelse(clinical.data$Radio==1 |clinical.data$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+clinical.data$X <-NULL
+clinical.data$TmaBlock <- clinical.data$Grid
+clinical.data$Grid <-NULL
+clinical.data$x.localisation <-NULL
+clinical.data$y.localisation <-NULL
+
+wd <-"/mnt"
+
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data"))
+
+write.csv(clinical.data, file=file.path(data_folder,"clinical_data.csv"))
+```
+
+
+```{r save position files}
+write.csv(p86, file=file.path(data_folder, paste("zTMA_86.csv")))
+write.csv(p87, file=file.path(data_folder, paste("zTMA_87.csv")))
+write.csv(p88, file=file.path(data_folder, paste("zTMA_88.csv")))
+
+write.csv(p175, file=file.path(data_folder, paste("zTMA_175.csv")))
+write.csv(p176, file=file.path(data_folder, paste("zTMA_176.csv")))
+write.csv(p178, file=file.path(data_folder, paste("zTMA_178.csv")))
+```
+
diff --git a/01_Merge-sce-objects.Rmd b/01_Merge-sce-objects.Rmd
new file mode 100644
index 0000000..6ce5e3f
--- /dev/null
+++ b/01_Merge-sce-objects.Rmd
@@ -0,0 +1,111 @@
+---
+title: "R Notebook - merge all single cell objects"
+output: html_notebook
+---
+
+```{r, Set wd}
+#set working directory 
+wd <-"/mnt"
+
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+
+sce_86_A <- readRDS(file=file.path(data_folder, "sce_86_A_2022.rds"))
+sce_86_B <- readRDS(file=file.path(data_folder, "sce_86_B_2022.rds"))
+sce_86_C <- readRDS(file=file.path(data_folder, "sce_86_C_2022.rds"))
+
+sce_87_A <- readRDS(file=file.path(data_folder, "sce_87_A_2022.rds"))
+sce_87_B <- readRDS(file=file.path(data_folder, "sce_87_B_2022.rds"))
+sce_87_C <- readRDS(file=file.path(data_folder, "sce_87_C_2022.rds"))
+
+sce_88_A <- readRDS(file=file.path(data_folder, "sce_88_A_2022.rds"))
+sce_88_B <- readRDS(file=file.path(data_folder, "sce_88_B_2022.rds"))
+sce_88_C <- readRDS(file=file.path(data_folder, "sce_88_C_2022.rds"))
+
+sce_175_A <- readRDS(file=file.path(data_folder, "sce_175_A_2022.rds"))
+sce_175_B <- readRDS(file=file.path(data_folder, "sce_175_B_2022.rds"))
+sce_175_C <- readRDS(file=file.path(data_folder, "sce_175_C_2022.rds"))
+
+sce_176_A <- readRDS(file=file.path(data_folder, "sce_176_A_2022.rds"))
+sce_176_B <- readRDS(file=file.path(data_folder, "sce_176_B_2022.rds"))
+sce_176_C <- readRDS(file=file.path(data_folder, "sce_176_C_2022.rds"))
+
+sce_178_A <- readRDS(file=file.path(data_folder, "sce_178_A_2022.rds"))
+sce_178_B <- readRDS(file=file.path(data_folder, "sce_178_B_2022.rds"))
+sce_178_C <- readRDS(file=file.path(data_folder, "sce_178_C_2022.rds"))
+```
+
+```{r}
+tma86 <-cbind(sce_86_A,sce_86_B,sce_86_C)
+saveRDS(tma86,file=file.path(data_folder,"merge_86_sces.rds"))
+rm(sce_86_A,sce_86_B,sce_86_C)
+
+tma87 <-cbind(sce_87_A,sce_87_B,sce_87_C)
+saveRDS(tma87,file=file.path(data_folder,"merge_87_sces.rds"))
+rm(sce_87_A,sce_87_B,sce_87_C)
+
+tma88 <-cbind(sce_88_A,sce_88_B,sce_88_C)
+saveRDS(tma88,file=file.path(data_folder,"merge_88_sces.rds"))
+rm(sce_88_A,sce_88_B,sce_88_C)
+
+tma175 <-cbind(sce_175_A,sce_175_B,sce_175_C)
+saveRDS(tma175,file=file.path(data_folder,"merge_175_sces.rds"))
+rm(sce_175_A,sce_175_B,sce_175_C)
+
+tma176 <-cbind(sce_176_A,sce_176_B,sce_176_C)
+saveRDS(tma176,file=file.path(data_folder,"merge_176_sces.rds"))
+rm(sce_176_A,sce_176_B,sce_176_C)
+
+tma178 <-cbind(sce_178_A,sce_178_B,sce_178_C)
+saveRDS(tma178,file=file.path(data_folder,"merge_178_sces.rds"))
+rm(sce_178_A,sce_178_B,sce_178_C)
+```
+
+```{r}
+tma80s <- cbind(tma86, tma87, tma88)
+saveRDS(tma80s,file=file.path(data_folder,"merge_80s_sces_RAW.rds"))
+rm(tma86, tma87, tma88)
+
+tma170s <- cbind(tma175, tma176, tma178)
+saveRDS(tma170s,file=file.path(data_folder,"merge_170s_sces_RAW.rds"))
+rm(tma175, tma176, tma178)
+
+tma_all <- cbind(tma80s, tma170s)
+saveRDS(tma_all, file=file.path(data_folder, "merge_all_TMAs_sce_RAW.rds"))
+```
+
+
+
+
+
+#Add clinical data to counts data
+
+```{r, Set wd}
+#set working directory 
+wd <-"/mnt"
+
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+sce_86_A <- readRDS(file=file.path(data_folder, "sce_86_A_counts_RAW.rds"))
+sce_86_B <- readRDS(file=file.path(data_folder, "sce_86_B_counts_RAW.rds"))
+sce_86_C <- readRDS(file=file.path(data_folder, "sce_86_C_counts_RAW.rds"))
+
+sce_87_A <- readRDS(file=file.path(data_folder, "sce_87_A_counts_RAW.rds"))
+sce_87_B <- readRDS(file=file.path(data_folder, "sce_87_B_counts_RAW.rds"))
+sce_87_C <- readRDS(file=file.path(data_folder, "sce_87_C_counts_RAW.rds"))
+
+sce_88_A <- readRDS(file=file.path(data_folder, "sce_88_A_counts_RAW.rds"))
+sce_88_B <- readRDS(file=file.path(data_folder, "sce_88_B_counts_RAW.rds"))
+sce_88_C <- readRDS(file=file.path(data_folder, "sce_88_C_counts_RAW.rds"))
+
+sce_175_A <- readRDS(file=file.path(data_folder, "sce_175_A_counts_RAW.rds"))
+sce_175_B <- readRDS(file=file.path(data_folder, "sce_175_B_counts_RAW.rds"))
+sce_175_C <- readRDS(file=file.path(data_folder, "sce_175_C_counts_RAW.rds"))
+
+sce_176_A <- readRDS(file=file.path(data_folder, "sce_176_A_counts_RAW.rds"))
+sce_176_B <- readRDS(file=file.path(data_folder, "sce_176_B_counts_RAW.rds"))
+sce_176_C <- readRDS(file=file.path(data_folder, "sce_176_C_counts_RAW.rds"))
+
+sce_178_A <- readRDS(file=file.path(data_folder, "sce_178_A_counts_RAW.rds"))
+sce_178_B <- readRDS(file=file.path(data_folder, "sce_178_B_counts_RAW.rds"))
+sce_178_C <- readRDS(file=file.path(data_folder, "sce_178_C_counts_RAW.rds"))
+```
\ No newline at end of file
diff --git a/02_glm_tumour-non-tumour_perTMA.Rmd b/02_glm_tumour-non-tumour_perTMA.Rmd
new file mode 100644
index 0000000..e8b8740
--- /dev/null
+++ b/02_glm_tumour-non-tumour_perTMA.Rmd
@@ -0,0 +1,772 @@
+title: "R Notebook - classify tumour non tumour using glm() per TMA Block (86_A, 87_B...)"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+set.seed(101100)
+```
+
+```{r, Set wd and load data}
+#set working directory 
+wd <-getwd() %>% dirname
+
+data_folder <-(file.path(wd,"sce_objects","RAW"))
+
+sce_86_A <- readRDS(file=file.path(data_folder, "sce_86_A_counts_RAW.rds"))
+sce_86_B <- readRDS(file=file.path(data_folder, "sce_86_B_counts_RAW.rds"))
+sce_86_C <- readRDS(file=file.path(data_folder, "sce_86_C_counts_RAW.rds"))
+
+sce_87_A_c <- readRDS(file=file.path(data_folder, "sce_87_A_counts_RAW.rds"))
+sce_87_B_c <- readRDS(file=file.path(data_folder, "sce_87_B_counts_RAW.rds"))
+sce_87_C_c <- readRDS(file=file.path(data_folder, "sce_87_C_counts_RAW.rds"))
+
+sce_88_A <- readRDS(file=file.path(data_folder, "sce_88_A_counts_RAW.rds"))
+sce_88_B <- readRDS(file=file.path(data_folder, "sce_88_B_counts_RAW.rds"))
+sce_88_C <- readRDS(file=file.path(data_folder, "sce_88_C_counts_RAW.rds"))
+
+sce_175_A <- readRDS(file=file.path(data_folder, "sce_175_A_counts_RAW.rds"))
+sce_175_B <- readRDS(file=file.path(data_folder, "sce_175_B_counts_RAW.rds"))
+sce_175_C <- readRDS(file=file.path(data_folder, "sce_175_C_counts_RAW.rds"))
+
+sce_176_A <- readRDS(file=file.path(data_folder, "sce_176_A_counts_RAW.rds"))
+sce_176_B <- readRDS(file=file.path(data_folder, "sce_176_B_counts_RAW.rds"))
+sce_176_C <- readRDS(file=file.path(data_folder, "sce_176_C_counts_RAW.rds"))
+
+sce_178_A <- readRDS(file=file.path(data_folder, "sce_178_A_counts_RAW.rds"))
+sce_178_B <- readRDS(file=file.path(data_folder, "sce_178_B_counts_RAW.rds"))
+sce_178_C <- readRDS(file=file.path(data_folder, "sce_178_C_counts_RAW.rds"))
+```
+
+```{r}
+tma86 <-cbind(sce_86_A,sce_86_B,sce_86_C)
+saveRDS(tma86,file=file.path(data_folder,"merge_86_sces.rds"))
+rm(sce_86_A,sce_86_B,sce_86_C)
+
+tma87 <-cbind(sce_87_A,sce_87_B,sce_87_C)
+saveRDS(tma87,file=file.path(data_folder,"merge_87_sces.rds"))
+rm(sce_87_A,sce_87_B,sce_87_C)
+
+tma88 <-cbind(sce_88_A,sce_88_B,sce_88_C)
+saveRDS(tma88,file=file.path(data_folder,"merge_88_sces.rds"))
+rm(sce_88_A,sce_88_B,sce_88_C)
+
+tma175 <-cbind(sce_175_A,sce_175_B,sce_175_C)
+saveRDS(tma175,file=file.path(data_folder,"merge_175_sces.rds"))
+rm(sce_175_A,sce_175_B,sce_175_C)
+
+tma176 <-cbind(sce_176_A,sce_176_B,sce_176_C)
+saveRDS(tma176,file=file.path(data_folder,"merge_176_sces.rds"))
+rm(sce_176_A,sce_176_B,sce_176_C)
+
+tma178 <-cbind(sce_178_A,sce_178_B,sce_178_C)
+saveRDS(tma178,file=file.path(data_folder,"merge_178_sces.rds"))
+rm(sce_178_A,sce_178_B,sce_178_C)
+```
+
+```{r}
+tma80s <- cbind(tma86, tma87, tma88)
+saveRDS(tma80s,file=file.path(data_folder,"merge_80s_sces_RAW.rds"))
+rm(tma86, tma87, tma88)
+
+tma170s <- cbind(tma175, tma176, tma178)
+saveRDS(tma170s,file=file.path(data_folder,"merge_170s_sces_RAW.rds"))
+rm(tma175, tma176, tma178)
+
+tma_all <- cbind(tma80s, tma170s)
+saveRDS(tma_all, file=file.path(data_folder, "merge_all_TMAs_sce_RAW.rds"))
+```
+
+#Save data objects into workingfile folder
+```{r}
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","workingfiles_TMA"))
+
+saveRDS(sce_86_A, file=file.path(data_folder, "sce_86_A_workingfile.rds"))
+saveRDS(sce_86_B, file=file.path(data_folder, "sce_86_B_workingfile.rds"))
+saveRDS(sce_86_C, file=file.path(data_folder, "sce_86_C_workingfile.rds"))
+
+saveRDS(sce_87_A, file=file.path(data_folder, "sce_87_A_workingfile.rds"))
+saveRDS(sce_87_B, file=file.path(data_folder, "sce_87_B_workingfile.rds"))
+saveRDS(sce_87_C, file=file.path(data_folder, "sce_87_C_workingfile.rds"))
+
+saveRDS(sce_88_A, file=file.path(data_folder, "sce_88_A_workingfile.rds"))
+saveRDS(sce_88_B, file=file.path(data_folder, "sce_88_B_workingfile.rds"))
+saveRDS(sce_88_C, file=file.path(data_folder, "sce_88_C_workingfile.rds"))
+
+saveRDS(sce_175_A, file=file.path(data_folder, "sce_175_A_workingfile.rds"))
+saveRDS(sce_175_B, file=file.path(data_folder, "sce_175_B_workingfile.rds"))
+saveRDS(sce_175_C, file=file.path(data_folder, "sce_175_C_workingfile.rds"))
+
+saveRDS(sce_176_A, file=file.path(data_folder, "sce_176_A_workingfile.rds"))
+saveRDS(sce_176_B, file=file.path(data_folder, "sce_176_B_workingfile.rds"))
+saveRDS(sce_176_C, file=file.path(data_folder, "sce_176_C_workingfile.rds"))
+
+saveRDS(sce_178_A, file=file.path(data_folder, "sce_178_A_workingfile.rds"))
+saveRDS(sce_178_B, file=file.path(data_folder, "sce_178_B_workingfile.rds"))
+saveRDS(sce_178_C, file=file.path(data_folder, "sce_178_C_workingfile.rds"))
+```
+
+```{r, Define good markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(sce_86_A)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+print(good.marker)
+```
+
+#GLM per TmaBlock separating cells into tumour/non-tumour
+#86
+```{r, glm tumour non tumour 86A}
+dat.counts <-as.data.frame(t((assay(sce_86_A,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+library(mclust)
+glm.86.A <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.86.A)
+table(glm.86.A$classification)
+
+sce_86_A$mclust <- glm.86.A$classification
+```
+
+```{r, glm tumour non tumour 86B}
+dat.counts <-as.data.frame(t((assay(sce_86_B,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.86.B <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.86.B)
+table(glm.86.B$classification)
+
+sce_86_B$mclust <- glm.86.B$classification
+```
+
+```{r, glm tumour non tumour 86C}
+dat.counts <-as.data.frame(t((assay(sce_86_C,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.86.C <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.86.C)
+table(glm.86.C$classification)
+
+sce_86_C$mclust <- glm.86.C$classification
+```
+#87
+```{r, glm tumour non tumour 87A}
+dat.counts <-as.data.frame(t((assay(sce_87_A,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.87.A <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.87.A)
+table(glm.87.A$classification)
+
+sce_87_A$mclust <- glm.87.A$classification
+```
+
+```{r, glm tumour non tumour 87B}
+dat.counts <-as.data.frame(t((assay(sce_87_B,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.87.B <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.87.B)
+table(glm.87.B$classification)
+
+sce_87_B$mclust <- glm.87.B$classification
+```
+
+```{r, glm tumour non tumour 87C}
+dat.counts <-as.data.frame(t((assay(sce_87_C,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.87.C <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.87.C)
+table(glm.87.C$classification)
+
+sce_87_C$mclust <- glm.87.C$classification
+```
+
+#88
+```{r, glm tumour non tumour 88A}
+dat.counts <-as.data.frame(t((assay(sce_88_A,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.88.A <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.88.A)
+table(glm.88.A$classification)
+
+sce_88_A$mclust <- glm.88.A$classification
+```
+
+```{r, glm tumour non tumour 88B}
+dat.counts <-as.data.frame(t((assay(sce_88_B,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.88.B <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.88.B)
+table(glm.88.B$classification)
+
+sce_88_B$mclust <- glm.88.B$classification
+```
+
+```{r, glm tumour non tumour 88C}
+dat.counts <-as.data.frame(t((assay(sce_88_C,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.88.C <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.88.C)
+table(glm.88.C$classification)
+
+sce_88_C$mclust <- glm.88.C$classification
+```
+
+#175
+```{r, glm tumour non tumour 175A}
+dat.counts <-as.data.frame(t((assay(sce_175_A,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.175.A <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.175.A)
+table(glm.175.A$classification)
+
+sce_175_A$mclust <- glm.175.A$classification
+```
+
+```{r, glm tumour non tumour 175B}
+dat.counts <-as.data.frame(t((assay(sce_175_B,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.175.B <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.175.B)
+table(glm.175.B$classification)
+
+sce_175_B$mclust <- glm.175.B$classification
+```
+
+```{r, glm tumour non tumour 175C}
+dat.counts <-as.data.frame(t((assay(sce_175_C,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.175.C <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.175.C)
+table(glm.175.C$classification)
+
+sce_175_C$mclust <- glm.175.C$classification
+```
+
+#176
+```{r, glm tumour non tumour 176A}
+dat.counts <-as.data.frame(t((assay(sce_176_A,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.176.A <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.176.A)
+table(glm.176.A$classification)
+
+sce_176_A$mclust <- glm.176.A$classification
+```
+
+```{r, glm tumour non tumour 176B}
+dat.counts <-as.data.frame(t((assay(sce_176_B,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.176.B <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.176.B)
+table(glm.176.B$classification)
+
+sce_176_B$mclust <- glm.176.B$classification
+```
+
+```{r, glm tumour non tumour 176C}
+dat.counts <-as.data.frame(t((assay(sce_176_C,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.176.C <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.176.C)
+table(glm.176.C$classification)
+
+sce_176_C$mclust <- glm.176.C$classification
+```
+
+#178
+```{r, glm tumour non tumour 178A}
+dat.counts <-as.data.frame(t((assay(sce_178_A,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.178.A <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.178.A)
+table(glm.178.A$classification)
+
+sce_178_A$mclust <- glm.178.A$classification
+```
+
+```{r, glm tumour non tumour 178B}
+dat.counts <-as.data.frame(t((assay(sce_178_B,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.178.B <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.178.B)
+table(glm.178.B$classification)
+
+sce_178_B$mclust <- glm.178.B$classification
+```
+
+```{r, glm tumour non tumour 178C, fig.width=12, fig.height=12}
+dat.counts <-as.data.frame(t((assay(sce_178_C,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Pan Cytokeratin + Keratin Epithelial`)
+
+ggplot(dat.counts.panCK, aes(x=`Pan Cytokeratin + Keratin Epithelial`)) + 
+  geom_density()
+
+glm.178.C <-Mclust(dat.counts.panCK$`Pan Cytokeratin + Keratin Epithelial`,G=2)
+#plot(glm.178.C)
+table(glm.178.C$classification)
+
+sce_178_C$mclust <- glm.178.C$classification
+```
+
+
+```{r, glm tumour non tumour 178C check expression, fig.width=12, fig.height=12}
+all.marker <-rownames(sce_87_C)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+print(good.marker)
+
+#check expression
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+agg_sce <- aggregateAcrossCells(sce_178_C, ids=sce_178_C$mclust, average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+
+scater::plotHeatmap(agg_sce,
+              #features = fibro.marker,
+              features=good.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              #symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("mclust"),
+              main=paste0("Heatmap - tumour no tumour"))
+
+cellnb <- 100000
+subsample <-  colnames(sce_178_C)[sample(length(colnames(sce_178_C)), cellnb)]
+sce_sub <- sce_178_C[, colnames(sce_178_C) %in% subsample]
+p <-50
+for(i in p){
+sce_sub <- runUMAP(sce_sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+}
+
+dat <-as.data.frame(reducedDims(sce_sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(sce_sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+
+cluster <- "mclust"
+p <-plotReducedDim(sce_sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+##ggsave(filename=file.path(plot_folder, paste("sub_Fibro_CAF-Clusters_UMAP_p50.png",sep="")), plot=p)
+```
+
+
+```{r, save mclust results to workingfile}
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","workingfiles_TMA"))
+
+saveRDS(sce_86_A, file=file.path(data_folder, "sce_86_A_workingfile.rds"))
+saveRDS(sce_86_B, file=file.path(data_folder, "sce_86_B_workingfile.rds"))
+saveRDS(sce_86_C, file=file.path(data_folder, "sce_86_C_workingfile.rds"))
+
+saveRDS(sce_87_A, file=file.path(data_folder, "sce_87_A_workingfile.rds"))
+saveRDS(sce_87_B, file=file.path(data_folder, "sce_87_B_workingfile.rds"))
+saveRDS(sce_87_C, file=file.path(data_folder, "sce_87_C_workingfile.rds"))
+
+saveRDS(sce_88_A, file=file.path(data_folder, "sce_88_A_workingfile.rds"))
+saveRDS(sce_88_B, file=file.path(data_folder, "sce_88_B_workingfile.rds"))
+saveRDS(sce_88_C, file=file.path(data_folder, "sce_88_C_workingfile.rds"))
+
+saveRDS(sce_175_A, file=file.path(data_folder, "sce_175_A_workingfile.rds"))
+saveRDS(sce_175_B, file=file.path(data_folder, "sce_175_B_workingfile.rds"))
+saveRDS(sce_175_C, file=file.path(data_folder, "sce_175_C_workingfile.rds"))
+
+saveRDS(sce_176_A, file=file.path(data_folder, "sce_176_A_workingfile.rds"))
+saveRDS(sce_176_B, file=file.path(data_folder, "sce_176_B_workingfile.rds"))
+saveRDS(sce_176_C, file=file.path(data_folder, "sce_176_C_workingfile.rds"))
+
+saveRDS(sce_178_A, file=file.path(data_folder, "sce_178_A_workingfile.rds"))
+saveRDS(sce_178_B, file=file.path(data_folder, "sce_178_B_workingfile.rds"))
+saveRDS(sce_178_C, file=file.path(data_folder, "sce_178_C_workingfile.rds"))
+```
+#
+```{r, split tumour and non tumour and save out}
+wd <-"/mnt"
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","tumour_non-tumour_from_TMA"))
+#86
+sce_86_A_tumour <- sce_86_A[, sce_86_A$mclust==2]
+sce_86_A_NONtumour <- sce_86_A[, sce_86_A$mclust==1]
+sce_86_B_tumour <- sce_86_B[, sce_86_B$mclust==2]
+sce_86_B_NONtumour <- sce_86_B[, sce_86_B$mclust==1]
+sce_86_C_tumour <- sce_86_C[, sce_86_C$mclust==2]
+sce_86_C_NONtumour <- sce_86_C[, sce_86_C$mclust==1]
+
+saveRDS(sce_86_A_tumour, file=file.path(data_folder, "sce_86_A_tumour.rds"))
+saveRDS(sce_86_B_tumour, file=file.path(data_folder, "sce_86_B_tumour.rds"))
+saveRDS(sce_86_C_tumour, file=file.path(data_folder, "sce_86_C_tumour.rds"))
+
+saveRDS(sce_86_A_NONtumour, file=file.path(data_folder, "sce_86_A_NON-tumour.rds"))
+saveRDS(sce_86_B_NONtumour, file=file.path(data_folder, "sce_86_B_NON-tumour.rds"))
+saveRDS(sce_86_C_NONtumour, file=file.path(data_folder, "sce_86_C_NON-tumour.rds"))
+
+
+#87
+sce_87_A_tumour <- sce_87_A[, sce_87_A$mclust==2]
+sce_87_A_NONtumour <- sce_87_A[, sce_87_A$mclust==1]
+sce_87_B_tumour <- sce_87_B[, sce_87_B$mclust==2]
+sce_87_B_NONtumour <- sce_87_B[, sce_87_B$mclust==1]
+sce_87_C_tumour <- sce_87_C[, sce_87_C$mclust==2]
+sce_87_C_NONtumour <- sce_87_C[, sce_87_C$mclust==1]
+
+
+saveRDS(sce_87_A_tumour, file=file.path(data_folder, "sce_87_A_tumour.rds"))
+saveRDS(sce_87_B_tumour, file=file.path(data_folder, "sce_87_B_tumour.rds"))
+saveRDS(sce_87_C_tumour, file=file.path(data_folder, "sce_87_C_tumour.rds"))
+
+saveRDS(sce_87_A_NONtumour, file=file.path(data_folder, "sce_87_A_NON-tumour.rds"))
+saveRDS(sce_87_B_NONtumour, file=file.path(data_folder, "sce_87_B_NON-tumour.rds"))
+saveRDS(sce_87_C_NONtumour, file=file.path(data_folder, "sce_87_C_NON-tumour.rds"))
+
+#88
+sce_88_A_tumour <- sce_88_A[, sce_88_A$mclust==2]
+sce_88_A_NONtumour <- sce_88_A[, sce_88_A$mclust==1]
+sce_88_B_tumour <- sce_88_B[, sce_88_B$mclust==2]
+sce_88_B_NONtumour <- sce_88_B[, sce_88_B$mclust==1]
+sce_88_C_tumour <- sce_88_C[, sce_88_C$mclust==2]
+sce_88_C_NONtumour <- sce_88_C[, sce_88_C$mclust==1]
+
+saveRDS(sce_88_A_tumour, file=file.path(data_folder, "sce_88_A_tumour.rds"))
+saveRDS(sce_88_B_tumour, file=file.path(data_folder, "sce_88_B_tumour.rds"))
+saveRDS(sce_88_C_tumour, file=file.path(data_folder, "sce_88_C_tumour.rds"))
+
+saveRDS(sce_88_A_NONtumour, file=file.path(data_folder, "sce_88_A_NON-tumour.rds"))
+saveRDS(sce_88_B_NONtumour, file=file.path(data_folder, "sce_88_B_NON-tumour.rds"))
+saveRDS(sce_88_C_NONtumour, file=file.path(data_folder, "sce_88_C_NON-tumour.rds"))
+
+#175
+sce_175_A_tumour <- sce_175_A[, sce_175_A$mclust==2]
+sce_175_A_NONtumour <- sce_175_A[, sce_175_A$mclust==1]
+sce_175_B_tumour <- sce_175_B[, sce_175_B$mclust==2]
+sce_175_B_NONtumour <- sce_175_B[, sce_175_B$mclust==1]
+sce_175_C_tumour <- sce_175_C[, sce_175_C$mclust==2]
+sce_175_C_NONtumour <- sce_175_C[, sce_175_C$mclust==1]
+
+saveRDS(sce_175_A_tumour, file=file.path(data_folder, "sce_175_A_tumour.rds"))
+saveRDS(sce_175_B_tumour, file=file.path(data_folder, "sce_175_B_tumour.rds"))
+saveRDS(sce_175_C_tumour, file=file.path(data_folder, "sce_175_C_tumour.rds"))
+
+saveRDS(sce_175_A_NONtumour, file=file.path(data_folder, "sce_175_A_NON-tumour.rds"))
+saveRDS(sce_175_B_NONtumour, file=file.path(data_folder, "sce_175_B_NON-tumour.rds"))
+saveRDS(sce_175_C_NONtumour, file=file.path(data_folder, "sce_175_C_NON-tumour.rds"))
+
+#176
+sce_176_A_tumour <- sce_176_A[, sce_176_A$mclust==2]
+sce_176_A_NONtumour <- sce_176_A[, sce_176_A$mclust==1]
+sce_176_B_tumour <- sce_176_B[, sce_176_B$mclust==2]
+sce_176_B_NONtumour <- sce_176_B[, sce_176_B$mclust==1]
+sce_176_C_tumour <- sce_176_C[, sce_176_C$mclust==2]
+sce_176_C_NONtumour <- sce_176_C[, sce_176_C$mclust==1]
+
+saveRDS(sce_176_A_tumour, file=file.path(data_folder, "sce_176_A_tumour.rds"))
+saveRDS(sce_176_B_tumour, file=file.path(data_folder, "sce_176_B_tumour.rds"))
+saveRDS(sce_176_C_tumour, file=file.path(data_folder, "sce_176_C_tumour.rds"))
+
+saveRDS(sce_176_A_NONtumour, file=file.path(data_folder, "sce_176_A_NON-tumour.rds"))
+saveRDS(sce_176_B_NONtumour, file=file.path(data_folder, "sce_176_B_NON-tumour.rds"))
+saveRDS(sce_176_C_NONtumour, file=file.path(data_folder, "sce_176_C_NON-tumour.rds"))
+
+#178
+sce_178_A_tumour <- sce_178_A[, sce_178_A$mclust==2]
+sce_178_A_NONtumour <- sce_178_A[, sce_178_A$mclust==1]
+sce_178_B_tumour <- sce_178_B[, sce_178_B$mclust==2]
+sce_178_B_NONtumour <- sce_178_B[, sce_178_B$mclust==1]
+sce_178_C_tumour <- sce_178_C[, sce_178_C$mclust==2]
+sce_178_C_NONtumour <- sce_178_C[, sce_178_C$mclust==1]
+
+saveRDS(sce_178_A_tumour, file=file.path(data_folder, "sce_178_A_tumour.rds"))
+saveRDS(sce_178_B_tumour, file=file.path(data_folder, "sce_178_B_tumour.rds"))
+saveRDS(sce_178_C_tumour, file=file.path(data_folder, "sce_178_C_tumour.rds"))
+
+saveRDS(sce_178_A_NONtumour, file=file.path(data_folder, "sce_178_A_NON-tumour.rds"))
+saveRDS(sce_178_B_NONtumour, file=file.path(data_folder, "sce_178_B_NON-tumour.rds"))
+saveRDS(sce_178_C_NONtumour, file=file.path(data_folder, "sce_178_C_NON-tumour.rds"))
+
+
+
+rm(sce_86_A,sce_86_B, sce_86_C,
+   sce_87_A,sce_87_B, sce_87_C,
+   sce_88_A,sce_88_B, sce_88_C,
+   sce_175_A,sce_175_B, sce_175_C,
+   sce_176_A,sce_176_B, sce_176_C,
+   sce_178_A,sce_178_B, sce_178_C)
+
+```
+
+```{r, Set wd and load data}
+#set working directory 
+wd <-"/mnt"
+
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","tumour_non-tumour_from_TMA"))
+
+sce_86_A_tumour <- readRDS(file=file.path(data_folder, "sce_86_A_tumour.rds"))
+sce_86_B_tumour <- readRDS(file=file.path(data_folder, "sce_86_B_tumour.rds"))
+sce_86_C_tumour <- readRDS(file=file.path(data_folder, "sce_86_C_tumour.rds"))
+
+sce_87_A_tumour <- readRDS(file=file.path(data_folder, "sce_87_A_tumour.rds"))
+sce_87_B_tumour <- readRDS(file=file.path(data_folder, "sce_87_B_tumour.rds"))
+sce_87_C_tumour <- readRDS(file=file.path(data_folder, "sce_87_C_tumour.rds"))
+
+sce_88_A_tumour <- readRDS(file=file.path(data_folder, "sce_88_A_tumour.rds"))
+sce_88_B_tumour <- readRDS(file=file.path(data_folder, "sce_88_B_tumour.rds"))
+sce_88_C_tumour <- readRDS(file=file.path(data_folder, "sce_88_C_tumour.rds"))
+
+sce_175_A_tumour <- readRDS(file=file.path(data_folder, "sce_175_A_tumour.rds"))
+sce_175_B_tumour <- readRDS(file=file.path(data_folder, "sce_175_B_tumour.rds"))
+sce_175_C_tumour <- readRDS(file=file.path(data_folder, "sce_175_C_tumour.rds"))
+
+sce_176_A_tumour <- readRDS(file=file.path(data_folder, "sce_176_A_tumour.rds"))
+sce_176_B_tumour <- readRDS(file=file.path(data_folder, "sce_176_B_tumour.rds"))
+sce_176_C_tumour <- readRDS(file=file.path(data_folder, "sce_176_C_tumour.rds"))
+
+sce_178_A_tumour <- readRDS(file=file.path(data_folder, "sce_178_A_tumour.rds"))
+sce_178_B_tumour <- readRDS(file=file.path(data_folder, "sce_178_B_tumour.rds"))
+sce_178_C_tumour <- readRDS(file=file.path(data_folder, "sce_178_C_tumour.rds"))
+```
+
+#merge data objects into tumour vs non.tumour
+```{r, merge tumour}
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","tumour_non-tumour_from_TMA"))
+
+all.tumouor <-cbind(sce_86_A_tumour ,
+      sce_86_B_tumour ,
+      sce_86_C_tumour ,
+      sce_87_A_tumour ,
+      sce_87_B_tumour ,
+      sce_87_C_tumour ,
+      sce_88_A_tumour ,
+      sce_88_B_tumour ,
+      sce_88_C_tumour ,
+      sce_175_A_tumour ,
+      sce_175_B_tumour ,
+      sce_175_C_tumour ,
+      sce_176_A_tumour ,
+      sce_176_B_tumour ,
+      sce_176_C_tumour ,
+      sce_178_A_tumour ,
+      sce_178_B_tumour ,
+      sce_178_C_tumour 
+      )
+saveRDS(all.tumouor,file=file.path(data_folder,"merge_all-TUMOUR.rds"))
+metadata(all.tumouor) <-list()
+all.tumour$TMA <-paste(all.tumour$TmaID, all.tumour$TmaBlock)
+rm(sce_86_A_tumour ,
+      sce_86_B_tumour ,
+      sce_86_C_tumour ,
+      sce_87_A_tumour ,
+      sce_87_B_tumour ,
+      sce_87_C_tumour ,
+      sce_88_A_tumour ,
+      sce_88_B_tumour ,
+      sce_88_C_tumour ,
+      sce_175_A_tumour ,
+      sce_175_B_tumour ,
+      sce_175_C_tumour ,
+      sce_176_A_tumour ,
+      sce_176_B_tumour ,
+      sce_176_C_tumour ,
+      sce_178_A_tumour ,
+      sce_178_B_tumour ,
+      sce_178_C_tumour 
+      )
+```
+
+```{r, Set wd and load Non tumour data}
+#set working directory 
+wd <-"/mnt"
+
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","tumour_non-tumour_from_TMA"))
+
+sce_86_A_NONtumour <- readRDS(file=file.path(data_folder, "sce_86_A_NON-tumour.rds"))
+sce_86_B_NONtumour <- readRDS(file=file.path(data_folder, "sce_86_B_NON-tumour.rds"))
+sce_86_C_NONtumour <- readRDS(file=file.path(data_folder, "sce_86_C_NON-tumour.rds"))
+
+sce_87_A_NONtumour <- readRDS(file=file.path(data_folder, "sce_87_A_NON-tumour.rds"))
+sce_87_B_NONtumour <- readRDS(file=file.path(data_folder, "sce_87_B_NON-tumour.rds"))
+sce_87_C_NONtumour <- readRDS(file=file.path(data_folder, "sce_87_C_NON-tumour.rds"))
+
+sce_88_A_NONtumour <- readRDS(file=file.path(data_folder, "sce_88_A_NON-tumour.rds"))
+sce_88_B_NONtumour <- readRDS(file=file.path(data_folder, "sce_88_B_NON-tumour.rds"))
+sce_88_C_NONtumour <- readRDS(file=file.path(data_folder, "sce_88_C_NON-tumour.rds"))
+
+sce_175_A_NONtumour <- readRDS(file=file.path(data_folder, "sce_175_A_NON-tumour.rds"))
+sce_175_B_NONtumour <- readRDS(file=file.path(data_folder, "sce_175_B_NON-tumour.rds"))
+sce_175_C_NONtumour <- readRDS(file=file.path(data_folder, "sce_175_C_NON-tumour.rds"))
+
+sce_176_A_NONtumour <- readRDS(file=file.path(data_folder, "sce_176_A_NON-tumour.rds"))
+sce_176_B_NONtumour <- readRDS(file=file.path(data_folder, "sce_176_B_NON-tumour.rds"))
+sce_176_C_NONtumour <- readRDS(file=file.path(data_folder, "sce_176_C_NON-tumour.rds"))
+
+sce_178_A_NONtumour <- readRDS(file=file.path(data_folder, "sce_178_A_NON-tumour.rds"))
+sce_178_B_NONtumour <- readRDS(file=file.path(data_folder, "sce_178_B_NON-tumour.rds"))
+sce_178_C_NONtumour <- readRDS(file=file.path(data_folder, "sce_178_C_NON-tumour.rds"))
+```
+
+```{r, merge NON tumour}
+data_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","tumour_non-tumour_from_TMA"))
+
+all.NONtumouor <-cbind(sce_86_A_NONtumour,
+      sce_86_B_NONtumour,
+      sce_86_C_NONtumour,
+      sce_87_A_NONtumour,
+      sce_87_B_NONtumour,
+      sce_87_C_NONtumour,
+      sce_88_A_NONtumour,
+      sce_88_B_NONtumour,
+      sce_88_C_NONtumour,
+      sce_175_A_NONtumour,
+      sce_175_B_NONtumour,
+      sce_175_C_NONtumour,
+      sce_176_A_NONtumour,
+      sce_176_B_NONtumour,
+      sce_176_C_NONtumour,
+      sce_178_A_NONtumour,
+      sce_178_B_NONtumour,
+      sce_178_C_NONtumour
+      )
+metadata(all.NONtumouor) <-list()
+
+saveRDS(all.NONtumouor,file=file.path(data_folder,"merge_all-NONtumour.rds"))
+rm(sce_86_A_NONtumour,sce_86_B_NONtumour,sce_86_C_NONtumour,sce_87_A_NONtumour,sce_87_B_NONtumour,sce_87_C_NONtumour,sce_88_A_NONtumour,sce_88_B_NONtumour,sce_88_C_NONtumour,sce_175_A_NONtumour,sce_175_B_NONtumour,sce_175_C_NONtumour,sce_176_A_NONtumour,sce_176_B_NONtumour,sce_176_C_NONtumour,sce_178_A_NONtumour,sce_178_B_NONtumour,sce_178_C_NONtumour)
+```
+
+```{r, merge all}
+all.cells <- cbind(all.tumouor, all.NONtumouor)
+saveRDS(all.cells,file=file.path(data_folder, "all-cells_merged.rds"))
+
+all.tumouor$TMA <-paste(all.tumouor$TmaID, all.tumouor$TmaBlock,sep="")
+
+all.NONtumouor$TMA <-paste(all.NONtumouor$TmaID, all.NONtumouor$TmaBlock,sep="")
+
+all.cells$TMA <-paste(all.cells$TmaID, all.cells$TmaBlock,sep="")
+
+table(all.NONtumouor$mclust, all.NONtumouor$TMA) %>% data.frame()
+head(colData(all.NONtumouor))
+
+table(all.tumouor$mclust, all.tumouor$TMA) %>% data.frame()
+```
+
diff --git a/03_Check_tumour_non-tumouor_afterGLM.Rmd b/03_Check_tumour_non-tumouor_afterGLM.Rmd
new file mode 100644
index 0000000..36af62e
--- /dev/null
+++ b/03_Check_tumour_non-tumouor_afterGLM.Rmd
@@ -0,0 +1,921 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+```{r, Set wd and load data}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","tumour_non-tumour_from_TMA"))
+all.cells <- readRDS(file=file.path(data_folder, "all-cells_merged.rds"))
+
+all.tumour <- readRDS(file=file.path(data_folder, "merge_all-TUMOUR.rds"))
+```
+
+
+```{r, Define good markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.cells)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+print(good.marker)
+```
+
+```{r}
+all.cells$Tma_ac <- paste(all.cells$TMA, all.cells$acID, sep="_")
+length(unique(all.cells$Tma_ac))
+```
+
+
+```{r, sub tumour final, warning=F, message=F, echo=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.cells)), all.cells$Tma_ac)
+length(unique(all.cells$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.cells.sub <- all.cells[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.cells.sub))[2]/dim(assay(all.cells))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.cells.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.cells.sub, file=file.path(data_folder, paste("all_cells_sub.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.cells.sub <- readRDS(file=file.path(data_folder, paste("all_cells_sub.rds")))
+```
+
+```{r, calculate umap, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+
+for(i in p){
+all.cells.sub <- runUMAP(all.cells.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.cells.sub, file=file.path(data_folder, paste("all_cells_sub.rds")))
+
+}
+saveRDS(all.cells.sub, file=file.path(data_folder, paste("all_cells_sub.rds")))
+
+
+for(i in p){
+all.cells.sub <- runTSNE(all.cells.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.cells.sub, file=file.path(data_folder, paste("all_cells_sub.rds")))
+}
+```
+
+
+**UMAP with Tumour markers**
+```{r,plot umap tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.cells.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.cells.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.cells.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.cells.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r, plot umap tumour cluster, fig.width=6, fig.height=4, warning=F, message=F, echo=F}
+cluster <- "mclust"
+p <-plotReducedDim(all.cells.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+##ggsave(filename=file.path(plot_folder, paste("sub_Fibro_CAF-Clusters_UMAP_p50.png",sep="")), plot=p)
+
+cluster <- "mclust"
+p <-plotReducedDim(all.cells.sub, "tSNE_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") 
+plot(p)
+```
+##TUMOUR
+```{r}
+all.tumour <- readRDS(file=file.path(data_folder, "merge_all-TUMOUR.rds"))
+
+all.tumour$TMA <- paste(all.tumour$TmaID, all.tumour$TmaBlock, sep="")
+
+all.tumour$Tma_ac <- paste(all.tumour$TMA, all.tumour$acID, sep="_")
+length(unique(all.tumour$Tma_ac))
+head(colData(all.tumour))
+```
+
+
+```{r, sub tumour final, warning=F, message=F, echo=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.tumour)), all.tumour$Tma_ac)
+length(unique(all.tumour$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.tumour.sub <- all.tumour[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.tumour.sub))[2]/dim(assay(all.tumour))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.tumour.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.tumour.sub <- readRDS(file=file.path(data_folder, paste("all_tumour_sub.rds")))
+```
+
+```{r, calculate umap, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+
+for(i in p){
+all.tumour.sub <- runUMAP(all.tumour.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+
+}
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+
+p=50
+for(i in p){
+all.tumour.sub <- runTSNE(all.tumour.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+}
+```
+
+
+**UMAP with Tumour markers**
+```{r,plot umap tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.tumour.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.tumour.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.tumour.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.tumour.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+
+
+```{r Clustering using Rphenoannoy, fig.width=25, fig.height=12}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(30,35,40)
+#i<-10
+for (i in k) {
+    all.tumour$RPmembership <- factor(Rphenoannoy(data = t(assay(all.tumour[rownames(all.tumour) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_tumour_k",i)
+  colnames(colData(all.tumour))[which(names(colData(all.tumour)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.tumour, ids=all.tumour[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.tumour, file=file.path(data_folder, "merge_all-TUMOUR_workingfile.rds"))
+}
+all.tumour <-readRDS( file=file.path(data_folder, "merge_all-TUMOUR_workingfile.rds"))
+
+```
+
+```{r, add clustering results to subset tumour}
+rp_df <- data.frame("CellID"=all.tumour$CellID, "rp_tumour_k30"=all.tumour$rp_tumour_k30,
+                    "rp_tumour_k35"=all.tumour$rp_tumour_k35,
+                    "rp_tumour_k40"=all.tumour$rp_tumour_k40 )
+
+cur_DF <- as_tibble(colData(all.tumour.sub)) %>% left_join(rp_df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(all.tumour.sub$ImageNumber, all.tumour.sub$CellNumber))
+
+colData(all.tumour.sub) <- cur_DF
+rownames(colData(all.tumour.sub)) <-all.tumour.sub$CellID
+```
+```{r,define tumour non tumour from clustering}
+is.nt <-c(39,12,53,104,47,20,1,97,52,110,150,128,71,32,147,34,138,133,101,8,115)#145
+all.tumour$tumour_nontumour <-ifelse(all.tumour$rp_tumour_k40 %in% is.nt, "non_tumour","tumour")
+table(all.tumour$tumour_nontumour)
+
+saveRDS(all.tumour, file=file.path(data_folder, "merge_all-TUMOUR_workingfile.rds"))
+
+all.tumour.sub$tumour_nontumour <-ifelse(all.tumour.sub$rp_tumour_k40 %in% is.nt, "non_tumour","tumour")
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub_workingfile.rds")))
+table(all.tumour$tumour_nontumour)
+```
+
+
+```{r, plot umap all.tumour cluster, fig.width=6, fig.height=4, warning=F, message=F, echo=F}
+cluster <- "tumour_nontumour"
+p <-plotReducedDim(all.tumour.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+##ggsave(filename=file.path(plot_folder, paste("sub_Fibro_CAF-Clusters_UMAP_p50.png",sep="")), plot=p)
+
+cluster <- "rp_tumour_k40"
+p <-plotReducedDim(all.tumour.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+#cluster <- "mclust"
+p <-plotReducedDim(all.tumour.sub, "tSNE_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") 
+plot(p)
+```
+
+```{r, fig.width=25, fig.height=20}
+agg_sce <-aggregateAcrossCells(all.tumour[, all.tumour$tumour_nontumour=="non_tumour"], ids=all.tumour[, all.tumour$tumour_nontumour=="non_tumour"][[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+
+scater::plotHeatmap(all.tumour[, all.tumour$rp_tumour_k40==47],
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+```
+
+```{r, subset tumour into tumour_tumour and tumour_non tumour}
+tumour_tumour <- all.tumour[, all.tumour$tumour_nontumour=="tumour"]
+tumour_NONtumour <- all.tumour[, all.tumour$tumour_nontumour=="non_tumour"]
+
+saveRDS(tumour_tumour, file=file.path(data_folder, "tumour", paste("merge_all-tumour_TUMOUR.rds")))
+saveRDS(tumour_NONtumour, file=file.path(data_folder, "tumour", paste("merge_all-tumour_NONtumour.rds")))
+
+tumour_tumour <-readRDS( file=file.path(data_folder, "tumour", paste("merge_all-tumour_TUMOUR.rds")))
+
+```
+
+##NON TUMOUR
+```{r}
+all.NONtumour <- readRDS(file=file.path(data_folder, "merge_all-NONtumour.rds"))
+
+all.NONtumour$TMA <- paste(all.NONtumour$TmaID, all.NONtumour$TmaBlock, sep="")
+
+all.NONtumour$Tma_ac <- paste(all.NONtumour$TMA, all.NONtumour$acID, sep="_")
+length(unique(all.NONtumour$Tma_ac))
+head(colData(all.NONtumour))
+```
+
+
+```{r, sub non tumour, warning=F, message=F, echo=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.NONtumour)), all.NONtumour$Tma_ac)
+length(unique(all.NONtumour$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.NONtumour.sub <- all.NONtumour[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.NONtumour.sub))[2]/dim(assay(all.NONtumour))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.NONtumour.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.NONtumour.sub, file=file.path(data_folder, paste("all_NONtumour_sub.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.NONtumour.sub <- readRDS(file=file.path(data_folder, paste("all_NONtumour_sub.rds")))
+```
+
+```{r, calculate umap, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+
+for(i in p){
+all.NONtumour.sub <- runUMAP(all.NONtumour.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.NONtumour.sub, file=file.path(data_folder, paste("all_NONtumour_sub.rds")))
+
+}
+saveRDS(all.NONtumour.sub, file=file.path(data_folder, paste("all_NONtumour_sub.rds")))
+
+p=50
+for(i in p){
+all.NONtumour.sub <- runTSNE(all.NONtumour.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.NONtumour.sub, file=file.path(data_folder, paste("all_NONtumour_sub.rds")))
+}
+```
+
+
+**UMAP with Tumour markers**
+```{r,plot umap tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.NONtumour.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.NONtumour.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.NONtumour.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.NONtumour.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+
+
+```{r Clustering using Rphenoannoy, fig.width=15, fig.height=15}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(30)
+#i<-30
+for (i in k) {
+    all.NONtumour$RPmembership <- factor(Rphenoannoy(data = t(assay(all.NONtumour[rownames(all.NONtumour) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_NONtumour_k",i)
+  colnames(colData(all.NONtumour))[which(names(colData(all.NONtumour)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.NONtumour, ids=all.NONtumour[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = c(cluster),
+              main=paste0("Heatmap NON Tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.NONtumour, file=file.path(data_folder, paste("merge_all-NONtumour_workingfile.rds",sep="")))
+}
+```
+
+```{r}
+
+colnames(colData(all.NONtumour))[!colnames(colData(all.NONtumour)) %in%colnames(colData(tumour_tumour))]
+colnames(colData(all.NONtumour))
+
+tumour_NONtumour$rp_tumour_k30 <-NULL
+tumour_NONtumour$rp_tumour_k35 <-NULL
+tumour_NONtumour$rp_tumour_k40 <-NULL
+
+tumour_NONtumour$tumour_nontumour <-NULL
+all.NONtumour$rp_NONtumour_k30 <-NULL
+all.NONtumour.final <- cbind(all.NONtumour, tumour_NONtumour)
+
+all.NONtumour.final$tumour_nontumour <-"non tumour"
+saveRDS(all.NONtumour.final, file=file.path(data_folder,"non_tumour", paste("merge_all-NONtumour_final.rds",sep="")))
+
+
+tumour_tumour$rp_tumour_k30 <-NULL
+tumour_tumour$rp_tumour_k35 <-NULL
+tumour_tumour$rp_tumour_k40 <-NULL
+
+tumour_tumour$tumour_nontumour <-NULL
+tumour_tumour$tumour_nontumour <-"tumour"
+
+colnames(colData(all.NONtumour.final))[!colnames(colData(all.NONtumour.final)) %in%colnames(colData(tumour_tumour))]
+
+all.final <- cbind(tumour_tumour,all.NONtumour.final)
+saveRDS(all.final, file=file.path(data_folder, paste("merge_all-cells-tumourVSnontumour_final.rds",sep="")))
+```
+#FINAL NON TUMOUR CLUSTERING to distinguish between stroma / immune (and eventual left over tumour cells (which would need to be remerged to the tumour_tumour dataset))
+```{r}
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","tumour_non-tumour_from_TMA"))
+all.NONtumour.final <- readRDS(file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final.rds"))
+
+all.NONtumour.final$Tma_ac %>% unique() %>% length()
+ac_clinical$Tma_ac %>% unique() %>% length()
+
+unique(all.NONtumour.final$Tma_ac)[!unique(all.NONtumour.final$Tma_ac) %in%ac_clinical$Tma_ac]
+unique(ac_clinical$Tma_ac)[!unique(ac_clinical$Tma_ac) %in%all.NONtumour.final$Tma_ac]
+```
+
+
+```{r, Define good markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.NONtumour.final)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+print(good.marker)
+```
+
+```{r, sub non tumour, warning=F, message=F, echo=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.NONtumour.final)), all.NONtumour.final$Tma_ac)
+length(unique(all.NONtumour.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.NONtumour.final.sub <- all.NONtumour.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.NONtumour.final.sub))[2]/dim(assay(all.NONtumour.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.NONtumour.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.NONtumour.final.sub, file=file.path(data_folder,"non_tumour", paste("all_NONtumour-final_sub.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.NONtumour.final.sub <- readRDS(file=file.path(data_folder, paste("all_NONtumour-final_sub.rds")))
+```
+
+```{r, calculate umap, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(50)
+
+for(i in p){
+all.NONtumour.final.sub <- runUMAP(all.NONtumour.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.NONtumour.final.sub, file=file.path(data_folder,"non_tumour", paste("all_NONtumour-final_sub.rds")))
+
+}
+saveRDS(all.NONtumour.final.sub, file=file.path(data_folder,"non_tumour", paste("all_NONtumour-final_sub.rds")))
+
+p=50
+for(i in p){
+all.NONtumour.final.sub <- runTSNE(all.NONtumour.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.NONtumour.final.sub, file=file.path(data_folder,"non_tumour", paste("all_NONtumour-final_sub.rds")))
+}
+```
+
+
+**UMAP with Tumour markers**
+```{r,plot umap tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.NONtumour.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.NONtumour.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.NONtumour.final.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.NONtumour.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+
+
+```{r Clustering using Rphenoannoy, fig.width=20, fig.height=15}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(30,35,40)
+#i<-30
+for (i in k) {
+  #  all.NONtumour.final$RPmembership <- factor(Rphenoannoy(data = t(assay(all.NONtumour.final[rownames(all.NONtumour.final) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+   cluster <- paste0("rp_NONtumour-final_k",i)
+#  colnames(colData(all.NONtumour.final))[which(names(colData(all.NONtumour.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.NONtumour.final, ids=all.NONtumour.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = c(cluster),
+              main=paste0("Heatmap final NON Tumour cells, ",cluster))
+    
+  #save sce clustering
+ # saveRDS(all.NONtumour.final, file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final_workingfile.rds"))
+}
+```
+
+
+```{r}
+is.tumour <- c(35, 73,66,75,51,53)
+is.immune <- c(59,48,70,58,69,23,50,44,64,30,6,60,12,56,27,47,31,4,29,62,65,26,52,74,39,15,17,22,38,39,21)
+is.stroma <- c(53,16,45,46,35,3,7,28,8,18,24,11,36,20,13,10,41,25,55,54,68,71,42,40,61,57,19,51,14,34,63,43,67,37,33,32,1,2,72, 49)
+is.undef <- c(9,5)
+
+all.NONtumour.final$cell_category[all.NONtumour.final$`rp_NONtumour-final_k30` %in% is.tumour] <- "tumour"
+all.NONtumour.final$cell_category[all.NONtumour.final$`rp_NONtumour-final_k30` %in% is.immune] <- "immune"
+all.NONtumour.final$cell_category[all.NONtumour.final$`rp_NONtumour-final_k30` %in% is.stroma] <- "stroma"
+all.NONtumour.final$cell_category[all.NONtumour.final$`rp_NONtumour-final_k30` %in% is.undef] <- "undefined"
+
+table(all.NONtumour.final$cell_category)
+
+```
+
+```{r, add clustering results to subset tumour}
+rp_df <- data.frame("CellID"=all.NONtumour.final$CellID, "rp_NONtumour-final_k30"=all.NONtumour.final$`rp_NONtumour-final_k30`,
+                    "cell_category"=all.NONtumour.final$cell_category)
+
+cur_DF <- as_tibble(colData(all.NONtumour.final.sub)) %>% left_join(rp_df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(all.NONtumour.final.sub$ImageNumber, all.NONtumour.final.sub$CellNumber))
+
+colData(all.NONtumour.final.sub) <- cur_DF
+rownames(colData(all.NONtumour.final.sub)) <-all.NONtumour.final.sub$CellID
+
+cluster <- "cell_category"
+p <-plotReducedDim(all.NONtumour.final.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+```
+
+
+```{r, fig.width=20, fig.height=10}
+cluster <- paste0("rp_NONtumour-final_k",30)
+agg_sce <-aggregateAcrossCells(all.NONtumour.final, ids=all.NONtumour.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("cell_category"),
+              main=paste0("Heatmap final NON Tumour cells, ",cluster))
+```
+```{r}
+saveRDS(all.NONtumour.final, file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final_workingfile.rds"))
+
+all.NONtumour.final_TUMOUR <- all.NONtumour.final[,all.NONtumour.final$cell_category=="tumour"]
+all.NONtumour.final_IMMUNE <- all.NONtumour.final[,all.NONtumour.final$cell_category=="immune"]
+all.NONtumour.final_STROMA <- all.NONtumour.final[,all.NONtumour.final$cell_category=="stroma"]
+all.NONtumour.final_UNDEF <- all.NONtumour.final[,all.NONtumour.final$cell_category=="undefined"]
+
+saveRDS(all.NONtumour.final_TUMOUR, file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final_workingfile_TUMOUR.rds"))
+saveRDS(all.NONtumour.final_IMMUNE, file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final_workingfile_IMMUNE.rds"))
+saveRDS(all.NONtumour.final_STROMA, file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final_workingfile_STROMA.rds"))
+saveRDS(all.NONtumour.final_UNDEF, file=file.path(data_folder,"non_tumour", "merge_all-NONtumour_final_workingfile_UNDEFINED.rds"))
+
+all.NONtumour.final_minusT <- all.NONtumour.final[,all.NONtumour.final$cell_category!="tumour"]
+
+```
+
+```{r, merge non.tumour_TUMOUR back to tumour_tumour}
+colnames(colData(all.NONtumour.final_TUMOUR))[!colnames(colData(all.NONtumour.final_TUMOUR)) %in%colnames(colData(tumour_tumour))]
+colnames(colData(tumour_tumour))[!colnames(colData(tumour_tumour)) %in%colnames(colData(all.NONtumour.final_TUMOUR))]
+
+all.NONtumour.final_TUMOUR$cell_category <-NULL
+all.NONtumour.final_TUMOUR$`rp_NONtumour-final_k30` <-NULL
+all.NONtumour.final_TUMOUR$`rp_NONtumour-final_k35` <-NULL
+all.NONtumour.final_TUMOUR$`rp_NONtumour-final_k40` <-NULL
+
+tumour_tumour$rp_tumour_k30 <-NULL
+tumour_tumour$rp_tumour_k35 <-NULL
+tumour_tumour$rp_tumour_k40 <-NULL
+
+tumour.final <-cbind(tumour_tumour, all.NONtumour.final_TUMOUR)
+
+saveRDS(tumour.final, file=file.path(data_folder,"tumour", "FINAL_merge_all_tumour-final_after-nontumourclustering.rds"))
+
+
+
+colnames(colData(all.NONtumour.final_minusT))[!colnames(colData(all.NONtumour.final_minusT)) %in%colnames(colData(tumour.final))]
+colnames(colData(tumour.final))[!colnames(colData(tumour.final)) %in%colnames(colData(all.NONtumour.final_minusT))]
+
+tumour.final$cell_category <-"tumour"
+all.NONtumour.final_minusT$`rp_NONtumour-final_k30` <-NULL
+all.NONtumour.final_minusT$`rp_NONtumour-final_k35` <-NULL
+all.NONtumour.final_minusT$`rp_NONtumour-final_k40` <-NULL
+
+all.category <- cbind(tumour.final,all.NONtumour.final_minusT)
+all.category$cat <- ifelse(all.category$cell_category=="tumour","tumour","non_tumour")
+
+all.category
+```
+
diff --git a/04_02_cluster_Tcells.Rmd b/04_02_cluster_Tcells.Rmd
new file mode 100644
index 0000000..860cb5a
--- /dev/null
+++ b/04_02_cluster_Tcells.Rmd
@@ -0,0 +1,556 @@
+---
+title: "R Notebook - Cluster T cells"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries, message=F, echo=F, warning=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+
+```{r, Set wd and load data}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune","Tcell"))
+
+#RAW
+#all.tcell <- readRDS(file=file.path(data_folder, "tcells_RAW.rds"))
+
+#workingfile
+all.tcell <- readRDS(file=file.path(data_folder, "all_tcell_workingfile.rds"))
+#saveRDS(all.tcell, file=file.path(data_folder, "all_tcell_workingfile.rds"))
+```
+
+
+```{r, Define tcell markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.tcell)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+btcell.marker <-c("CD20","Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"Ki-67","CD279 (PD-1)") #to kick out final B cells from merge cluster
+
+tcell.marker <-c("Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"Ki-67","CD279 (PD-1)")
+
+print(tcell.marker)
+```
+
+```{r, subset tcell all, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.tcell)), all.tcell$Tma_ac)
+length(unique(all.tcell$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.tcell.sub <- all.tcell[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.tcell.sub))[2]/dim(assay(all.tcell))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.tcell.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.tcell.sub, file=file.path(data_folder, paste("all_tcell_sub.rds")))
+```
+
+```{r, load subset all tcells, message=FALSE, warning=FALSE, echo=FALSE}
+all.tcell.sub <- readRDS(file=file.path(data_folder, paste("all_tcell_sub.rds")))
+```
+
+```{r, calculate umap all tcells, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.tcell.sub <- runUMAP(all.tcell.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.tcell.sub, file=file.path(data_folder, paste("all_tcell_sub.rds")))
+
+}
+saveRDS(all.tcell.sub, file=file.path(data_folder, paste("all_tcell_sub.rds")))
+
+
+for(i in p){
+all.tcell.sub <- runTSNE(all.tcell.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.tcell.sub, file=file.path(data_folder, paste("all_tcell_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap all t cells immune marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.tcell.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.tcell.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% btcell.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne t cell marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE,eval=F}
+dat <-as.data.frame(reducedDims(all.tcell.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.tcell.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r Clustering all tcell using Rphenoannoy, fig.width=25, fig.height=12 , message=F, echo=F, warning=F, eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(20)
+#i<-10
+for (i in k) {
+    all.tcell$RPmembership <- factor(Rphenoannoy(data = t(assay(all.tcell[rownames(all.tcell) %in% immune.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_tcell_immune_k",i)
+    cluster <- paste0("rp_tcell_tb_k",i)
+  colnames(colData(all.tcell))[which(names(colData(all.tcell)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.tcell, ids=all.tcell[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = btcell.marker,
+              #features = immune.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.tcell, file=file.path(data_folder, "all_tcell_workingfile.rds"))
+}
+#all.tcell <-readRDS( file=file.path(data_folder, "all_tcell_workingfile.rds"))
+
+```
+```{r, add clustering results to subset all t cells, message=F, echo=F, warning=F, eval=F}
+rp_df <- data.frame("CellID"=all.tcell$CellID, "rp_tcell_tb_k20"=all.tcell$rp_tcell_tb_k20)
+
+cur_DF <- as_tibble(colData(all.tcell.sub)) %>% left_join(rp_df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(all.tcell.sub$ImageNumber, all.tcell.sub$CellNumber))
+
+colData(all.tcell.sub) <- cur_DF
+rownames(colData(all.tcell.sub)) <-all.tcell.sub$CellID
+
+```
+
+```{r, plot all t cell sub cluster on umap, message=F, echo=F, warning=F, eval=F}
+cluster <- "rp_tcell_tb_k20"
+p <-plotReducedDim(all.tcell.sub[, all.tcell.sub$rp_tcell_tb_k20==17], "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+```
+```{r, assign labels, B cells and T cells, message=F, echo=F, warning=F}
+is.bcell <-17
+all.tcell.bcell <- all.tcell[, all.tcell$rp_tcell_tb_k20==17]
+#saveRDS(all.tcell.bcell, file=file.path(data_folder, "tcells_bcells.RDS"))
+
+tcell.final <- all.tcell[, all.tcell$rp_tcell_tb_k20 != 17]
+#saveRDS(tcell.final, file=file.path(data_folder, "tcell_final_workingfile.RDS"))
+
+tcell.final$rp_tcell_immune_k20 <-NULL
+colnames(colData(tcell.final))
+```
+
+
+#################
+#T cells final
+
+```{r, Set wd and load data t cell final, message=F, echo=F, warning=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune","Tcell"))
+
+#RAW
+tcell.final <- readRDS(file=file.path(data_folder, "tcell_final.RDS"))
+
+#workingfile
+tcell.final <- readRDS(file=file.path(data_folder, "tcell_final_workingfile.rds"))
+#saveRDS(tcell.final, file=file.path(data_folder, "tcell_final_workingfile.rds"))
+
+colnames(colData(tcell.final))
+```
+
+
+```{r, Define tcell markers final, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(tcell.final)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+
+tcell.marker <-c("Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"Ki-67","CD279 (PD-1)")
+
+print(tcell.marker)
+```
+
+```{r, subset tcell final, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(tcell.final)), tcell.final$Tma_ac)
+length(unique(tcell.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+tcell.final.sub <- tcell.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(tcell.final.sub))[2]/dim(assay(tcell.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  tcell.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final_sub.rds")))
+```
+
+```{r, load subset t cell final, message=FALSE, warning=FALSE, echo=FALSE}
+tcell.final.sub <- readRDS(file=file.path(data_folder, paste("tcell_final_sub.rds")))
+```
+
+```{r, calculate umap t cell final, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+tcell.final.sub <- runUMAP(tcell.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final_sub.rds")))
+
+}
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final_sub.rds")))
+
+
+for(i in p){
+tcell.final.sub <- runTSNE(tcell.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap t cell final t cell marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(tcell.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(tcell.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r, Clustering t cell final using Rphenoannoy, fig.width=25, fig.height=12, message=F, echo=F, warning=F, eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(35)
+#i<-30
+for (i in k) {
+    tcell.final$RPmembership <- factor(Rphenoannoy(data = t(assay(tcell.final[rownames(tcell.final) %in% tcell.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_tcell_k",i)
+  colnames(colData(tcell.final))[which(names(colData(tcell.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(tcell.final, ids=tcell.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = tcell.marker,
+              #features = immune.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(tcell.final, file=file.path(data_folder, "tcell_final_workingfile.rds"))
+}
+#tcell.final <-readRDS( file=file.path(data_folder, "all_tcell_workingfile.rds"))
+
+```
+```{r, assign cluster levels T cell final,fig.width=12, fig.height=10, message=F, echo=F, warning=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+is.immune <- c(1,9,3)
+is.CD8 <- c(10,20,6,21,12,13,7,5)
+is.CD8.TCF <- c(22,19)
+is.CD8.IDO <- c(17)
+is.CD8.ki67 <- c(2)
+is.CD4.PD1 <-c(23)
+is.CD4.Treg <-c(18)
+is.CD4 <-c(4,11,14)
+is.CD4.ki67 <-c(8)
+is.CD4.TCF <-c(16)
+is.CD4.IDO <-c(15)
+
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.immune] <-"Immune"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD8] <-"CD8"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD8.TCF] <-"TCF1/7_CD8"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD8.IDO] <-"IDO_CD8"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD8.ki67] <-"ki67_CD8"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD4.PD1] <-"PD1_CD4"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD4.Treg] <-"CD4_Treg"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD4] <-"CD4"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD4.ki67] <-"ki67_CD4"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD4.TCF] <-"TCF1/7_CD4"
+tcell.final$TcellType[tcell.final$rp_tcell_k30%in% is.CD4.IDO] <-"IDO_CD4"
+
+agg_sce <-aggregateAcrossCells(tcell.final, ids=tcell.final$rp_tcell_k30, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = tcell.marker,
+              #features = immune.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("TcellType"))
+    
+tcell.final.immune <- tcell.final[, tcell.final$TcellType=="Immune"]
+#saveRDS(tcell.final.immune, file=file.path(data_folder, "tcell-final-immune.rds"))
+tcell.final <- tcell.final[, tcell.final$TcellType!="Immune"]
+
+#saveRDS(tcell.final, file=file.path(data_folder, "FINAL_Tcells-minusImmune_RAW.rds"))
+#saveRDS(tcell.final, file=file.path(data_folder, "FINAL_Tcells-minusImmune_workingfile.rds"))
+
+tcell.final$TcellCategory <- ifelse(tcell.final$rp_tcell_k30 %in%is.CD8 |
+                                      tcell.final$rp_tcell_k30 %in%is.CD8.TCF|
+                                      tcell.final$rp_tcell_k30 %in%is.CD8.IDO|
+                                      tcell.final$rp_tcell_k30 %in%is.CD8.ki67,
+                                    "CD8","CD4")
+
+table(tcell.final$TcellCategory)
+table(tcell.final$TcellType)
+```
+
+```{r, subset tcell final minus immune, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(tcell.final)), tcell.final$Tma_ac)
+length(unique(tcell.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+tcell.final.sub <- tcell.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(tcell.final.sub))[2]/dim(assay(tcell.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  tcell.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final-minusImmune_sub.rds")))
+```
+
+```{r, load subset t cells minus immune, message=FALSE, warning=FALSE, echo=FALSE}
+tcell.final.sub <- readRDS(file=file.path(data_folder, paste("tcell_final-minusImmune_sub.rds")))
+```
+
+```{r, calculate umap t cells minus immune, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+tcell.final.sub <- runUMAP(tcell.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final-minusImmune_sub.rds")))
+
+}
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final-minusImmune_sub.rds")))
+
+
+for(i in p){
+tcell.final.sub <- runTSNE(tcell.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(tcell.final.sub, file=file.path(data_folder, paste("tcell_final-minusImmune_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap t cell minus immune t cell marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(tcell.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(tcell.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% tcell.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+```{r, plot umap with t cell category and types minus immune, message=F, warning=F, echo=F}
+#Tell Category
+cluster<- "TcellCategory"
+p <-plotReducedDim(tcell.final.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+
+#Tell Type
+cluster<- "TcellType"
+p <-plotReducedDim(tcell.final.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+```
diff --git a/04_cluster_immune-cells.Rmd b/04_cluster_immune-cells.Rmd
new file mode 100644
index 0000000..61c4702
--- /dev/null
+++ b/04_cluster_immune-cells.Rmd
@@ -0,0 +1,1123 @@
+---
+title: "R Notebook - Analyse Immune cells"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries,echo=F, warning=FALSE, message=FALSE}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+
+```{r, Set wd and load data,echo=F, warning=FALSE, message=FALSE}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+#RAW
+#all.immune <- readRDS(file=file.path(data_folder, "all_immune_RAW.rds"))
+
+#workingfile
+all.immune <- readRDS(file=file.path(data_folder, "all_immune_workingfile.rds"))
+
+#saveRDS(all.immune, file=file.path(results_folder, "all_immune_RAW.rds"))
+#saveRDS(all.immune, file=file.path(data_folder, "all_immune_workingfile.rds"))
+
+```
+
+
+```{r, Define immune markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.immune)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+immune.marker <-c("Myeloperoxidase (MPO)" ,"HLA-DR","CD20","CD68","Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"CD15" ,"Ki-67","CD279 (PD-1)")
+
+print(immune.marker)
+```
+
+```{r, subset all.immune,echo=F, warning=FALSE, message=FALSE, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.immune)), all.immune$Tma_ac)
+length(unique(all.immune$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.immune.sub <- all.immune[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.immune.sub))[2]/dim(assay(all.immune))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.immune.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.immune.sub, file=file.path(data_folder, paste("all_immune_sub.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.immune.sub <- readRDS(file=file.path(data_folder, paste("all_immune_sub.rds")))
+```
+
+```{r, calculate umap, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.immune.sub <- runUMAP(all.immune.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.immune.sub, file=file.path(data_folder, paste("all_immune_sub.rds")))
+
+}
+saveRDS(all.immune.sub, file=file.path(data_folder, paste("all_immune_sub.rds")))
+
+
+for(i in p){
+all.immune.sub <- runTSNE(all.immune.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.immune.sub, file=file.path(data_folder, paste("all_immune_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.immune.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.immune.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with immune markers**
+```{r,plot tsne tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE, eval=F}
+dat <-as.data.frame(reducedDims(all.immune.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.immune.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r, Clustering using Rphenoannoy,echo=F, warning=FALSE, message=FALSE,eval=F, fig.width=25, fig.height=12}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(20)
+#i<-10
+for (i in k) {
+    all.immune$RPmembership <- factor(Rphenoannoy(data = t(assay(all.immune[rownames(all.immune) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_immune_all_k",i)
+  colnames(colData(all.immune))[which(names(colData(all.immune)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.immune, ids=all.immune[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.immune, file=file.path(data_folder, "all_immune_workingfile.rds"))
+}
+#all.immune <-readRDS( file=file.path(data_folder, "all_immune_workingfile.rds"))
+
+```
+
+```{r, HM all immune tumour, fig.width=20, fig.height=10, message=F, warning=F, echo=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+table(all.immune$rp_immune_all_k20)
+is.tumour <- c(60,22,73,71)
+all.immune$tumour_immune <-ifelse(all.immune$rp_immune_all_k20 %in% is.tumour, "tumour","immune")
+table(all.immune$tumour_immune)
+
+cluster="rp_immune_all_k20"
+agg_sce <-aggregateAcrossCells(all.immune, ids=all.immune[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster,"tumour_immune"),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+immune.final <- all.immune[, all.immune$tumour_immune=="immune"]
+immune.tumour <- all.immune[, all.immune$tumour_immune=="tumour"]
+
+#saveRDS(immune.final, file=file.path(data_folder, "final_immune_RAW.rds"))
+#saveRDS(immune.tumour, file=file.path(data_folder, "all_immune_workingfile_TUMOUR.rds"))
+```
+
+
+################################################################################################################################
+#carrying on using immune.final from here
+```{r, immune final read in, message=F, warning=F, echo=F}
+#immune.final <-readRDS(file=file.path(data_folder, "final_immune_RAW.rds"))
+#saveRDS(immune.final,file=file.path(data_folder, "final_immune_workingfile.rds"))
+
+immune.marker <-c("Myeloperoxidase (MPO)" ,"HLA-DR","CD20","CD68","Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"CD15" ,"Ki-67","CD279 (PD-1)")
+```
+
+```{r, subset immune.final, message=F, warning=F, echo=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(immune.final)), immune.final$Tma_ac)
+length(unique(immune.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+immune.final.sub <- immune.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(immune.final.sub))[2]/dim(assay(immune.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  immune.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub.rds")))
+```
+
+```{r, load subset immune.final, message=FALSE, warning=FALSE, echo=FALSE}
+immune.final.sub <- readRDS(file=file.path(data_folder, paste("immune_final_sub.rds")))
+```
+
+```{r, calculate umap immune.final, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+immune.final.sub <- runUMAP(immune.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub.rds")))
+
+}
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub.rds")))
+
+
+for(i in p){
+immune.final.sub <- runTSNE(immune.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap immune.final sub immune marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(immune.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(immune.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+```{r Clustering immune.final using Rphenoannoy, fig.width=25, fig.height=12, message=F, warning=F, echo=F, eval=F}
+immune.final$rp_immune_all_k20 <-NULL
+immune.final$`rp_NONtumour-final_k30` <-NULL
+immune.final$`rp_NONtumour-final_k35` <-NULL
+immune.final$`rp_NONtumour-final_k40` <-NULL
+
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(20)
+#i<-10
+for (i in k) {
+    immune.final$RPmembership <- factor(Rphenoannoy(data = t(assay(immune.final[rownames(immune.final) %in% immune.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_immune_final_k",i)
+  colnames(colData(immune.final))[which(names(colData(immune.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(immune.final, ids=immune.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(immune.final, file=file.path(data_folder, "final_immune_workingfile.rds"))
+}
+#immune.final <-readRDS( file=file.path(data_folder, "final_immune_workingfile.rds"))
+immune.final <-readRDS( file=file.path(wd,"sce_objects","immune", "final_immune_workingfile.rds"))
+```
+
+
+##########################
+immune + stroma.immune
+#add immune cells from stromal clustering
+```{r, merge immune cells from stromal clustering, message=F, warning=F, echo=F, eval=F}
+all.stroma.immune <-readRDS(file=file.path(wd,"sce_objects","stroma", "all_stroma_workingfile_IMMUNE.rds"))
+colnames(colData(all.stroma.immune))[!colnames(colData(all.stroma.immune)) %in%colnames(colData(immune.final))]
+all.stroma.immune$`rp_NONtumour-final_k30` <-NULL
+all.stroma.immune$`rp_NONtumour-final_k35` <-NULL
+all.stroma.immune$`rp_NONtumour-final_k40` <-NULL
+all.stroma.immune$`rp_stroma_all-stroma-marker_k20` <-NULL
+all.stroma.immune$rp_stroma_all_k20 <-NULL
+
+colnames(colData(immune.final))[!colnames(colData(immune.final)) %in%colnames(colData(all.stroma.immune))]
+immune.final$rp_immune_all_k20 <-NULL
+immune.final$tumour_immune <-NULL
+immune.final$rp_immune_final_k20 <-NULL
+
+immune.final <- cbind(immune.final, all.stroma.immune)
+saveRDS(immune.final, file=file.path(data_folder, "final_immune_RAW_afterStromaMerge.rds"))
+```
+
+```{r, read immune final after immune stroma merge, message=F, warning=F, echo=F}
+immune.final <-readRDS(file=file.path(data_folder, "final_immune_RAW_afterStromaMerge.rds"))
+immune.final <-readRDS(file=file.path(data_folder, "final_immune_afterStromaMerge_workingfile.rds"))
+
+#saveRDS(immune.final,file=file.path(data_folder, "final_immune_afterStromaMerge_workingfile.rds"))
+
+immune.marker <-c("Myeloperoxidase (MPO)" ,"HLA-DR","CD20","CD68","Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"CD15" ,"Ki-67","CD279 (PD-1)")
+```
+
+```{r, subset immune final after stroma merge, message=F, warning=F, echo=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(immune.final)), immune.final$Tma_ac)
+length(unique(immune.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+immune.final.sub <- immune.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(immune.final.sub))[2]/dim(assay(immune.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  immune.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub_aftermerge.rds")))
+```
+
+```{r, load subset immune final after stroma merge, message=FALSE, warning=FALSE, echo=FALSE}
+immune.final.sub <- readRDS(file=file.path(data_folder, paste("immune_final_sub.rds")))
+```
+
+```{r, calculate umap immune final after stroma merge, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+immune.final.sub <- runUMAP(immune.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub_aftermerge.rds")))
+
+}
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub_aftermerge.rds")))
+
+
+for(i in p){
+immune.final.sub <- runTSNE(immune.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("immune_final_sub_aftermerge.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap immune after stroma merge immune marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(immune.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(immune.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% immune.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+```{r, Clustering immune final after stroma merge using Rphenoannoy, fig.width=25, fig.height=12, message=F, warning=F, echo=F, eval=F}
+immune.final$cell_category <-NULL
+immune.final$tumour_nontumour <-NULL
+
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(15)
+#i<-20
+for (i in k) {
+    immune.final$RPmembership <- factor(Rphenoannoy(data = t(assay(immune.final[rownames(immune.final) %in% immune.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_immune_final_k",i)
+  colnames(colData(immune.final))[which(names(colData(immune.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(immune.final, ids=immune.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = immune.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap immune cells, ",cluster))
+    
+  #save sce clustering
+saveRDS(immune.final,file=file.path(data_folder, "final_immune_afterStromaMerge_workingfile.rds"))
+}
+#immune.final <-readRDS( file=file.path(wd,"sce_objects","immune", "final_immune_afterStromaMerge_workingfile.rds"))
+```
+
+```{r, add clustering results to subset immune final after stroma merge, message=F, warning=F, echo=F, eval=F}
+rp_df <- data.frame("CellID"=immune.final$CellID, "rp_immune_final_k15"=immune.final$rp_immune_final_k15,
+                    "rp_immune_final_k20"=immune.final$rp_immune_final_k20,
+                    "rp_immune_final_k30"=immune.final$rp_immune_final_k30,
+                    "som_25"=immune.final$som_25)
+
+cur_DF <- as_tibble(colData(immune.final.sub)) %>% left_join(rp_df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(immune.final.sub$ImageNumber, immune.final.sub$CellNumber))
+
+colData(immune.final.sub) <- cur_DF
+rownames(colData(immune.final.sub)) <-immune.final.sub$CellID
+```
+
+
+```{r, cluster immune cells using FLOWSOM, echo=F, eval=FALSE, warning=F, message=F, fig.width=15, fig.height=10, eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+assay(immune.final, "exprs")<-assay(immune.final, "c_counts_asinh")
+#run FlowSOM
+re <- CATALYST::cluster(immune.final, features =immune.marker, verbose = FALSE, maxK = 40)
+
+cl <-c(25)
+#i <-5
+for (i in cl){
+  #i=35
+  cluster <- paste0("som_",i)
+  immune.final[[cluster]] <- as.factor(cluster_ids(re, paste0("meta",i)))
+  agg_sce <- aggregateAcrossCells(immune.final, ids=immune.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+
+  #plot Heatmap
+  scater::plotHeatmap(agg_sce,
+              #features = fibro.marker,
+              features=immune.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              #symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("ids"),
+              main=paste0("Heatmap immune, ",cluster))
+}
+saveRDS(immune.final,file=file.path(data_folder, "final_immune_afterStromaMerge_workingfile.rds"))
+
+```
+
+```{r,Plot HM FLOWSOM cluster immune final after stroma merge, echo=F, warning=F, message=F, fig.width=15, fig.height=10}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+cl <-c(25)
+#i <-5
+for (i in cl){
+  #i=35
+  cluster <- paste0("som_",i)
+  agg_sce <- aggregateAcrossCells(immune.final, ids=immune.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+
+  #plot Heatmap
+  scater::plotHeatmap(agg_sce,
+              #features = fibro.marker,
+              features=immune.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              #symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("ids"),
+              main=paste0("Heatmap immune, ",cluster))
+}
+```
+
+```{r, subset immune final into t cell and non immune T cell, message=F, echo=F, warning=F}
+is.tcell <- c(3,4,5,8,10)
+tcell.sce <- immune.final[, immune.final$som_25 %in% is.tcell]
+immune.nontcell <-immune.final[, !immune.final$som_25 %in% is.tcell]
+unique(tcell.sce$som_25)
+#saveRDS(tcell.sce, file=file.path(data_folder,"Tcell","tcells_RAW.rds"))
+#saveRDS(immune.nontcell, file=file.path(data_folder,"IMMUNE_immune_nontcells_RAW.rds"))
+
+```
+
+```{r, after T cell clustering merge all non T cell immune cells, message=F, echo=F, warning=F, eval=F}
+#B cells and immune cells from T cell sce
+tcell.bcells <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/immune/Tcell/tcells_bcells.RDS")
+tcell.immune <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/immune/Tcell/tcell-final-immune.rds")
+
+colnames(colData(tcell.immune))[!colnames(colData(tcell.immune)) %in%colnames(colData(immune.nontcell))]
+tcell.immune$TcellCategory <-NULL
+tcell.immune$TcellType <-NULL
+tcell.immune$rp_tcell_tb_k20 <-NULL
+tcell.immune$rp_tcell_k20 <-NULL
+tcell.immune$rp_tcell_k15 <-NULL
+tcell.immune$rp_tcell_k30 <-NULL
+tcell.immune$rp_tcell_k35 <-NULL
+
+tcell.bcells$TcellCategory <-NULL
+tcell.bcells$TcellType <-NULL
+tcell.bcells$rp_tcell_tb_k20 <-NULL
+tcell.bcells$rp_tcell_k20 <-NULL
+tcell.bcells$rp_tcell_k15 <-NULL
+tcell.bcells$rp_tcell_k30 <-NULL
+tcell.bcells$rp_tcell_k35 <-NULL
+tcell.bcells$rp_tcell_all_k20 <-NULL
+tcell.bcells$rp_tcell_immune_k20 <-NULL
+
+colnames(colData(immune.nontcell))[!colnames(colData(immune.nontcell)) %in%colnames(colData(tcell.bcells))]
+immune.nontcell$som_30 <-NULL
+immune.nontcell$som_1 <-NULL
+immune.nontcell$som_2 <-NULL
+immune.nontcell$som_3 <-NULL
+immune.nontcell$som_4 <-NULL
+immune.nontcell$som_5 <-NULL
+immune.nontcell$som_6 <-NULL
+immune.nontcell$som_7 <-NULL
+immune.nontcell$som_8 <-NULL
+immune.nontcell$som_9 <-NULL
+
+
+immune.final <- cbind(immune.nontcell, tcell.bcells, tcell.immune)
+saveRDS(immune.final, file=file.path(data_folder,"NonTcell_immune_RAW_afterStroma_Tcell-merge.rds"))
+saveRDS(immune.final, file=file.path(data_folder,"NonTcell_immune_afterStroma_Tcell-merge_workingfile.rds"))
+```
+
+#####################################################################################################################################
+#after remerging the no T cells from the T cell data. 
+```{r, Set wd and load data non T cell immune, message=F, echo=F, warning=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+#RAW
+#immune.final <- readRDS(file=file.path(data_folder, "NonTcell_immune_RAW_afterStroma_Tcell-merge.rds"))
+
+#workingfile
+immune.final <- readRDS(file=file.path(data_folder, "NonTcell_immune_afterStroma_Tcell-merge_workingfile.rds"))
+set.seed(101100)
+```
+
+```{r, Define immune markers non T cell immune, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(immune.final)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+immune.marker <-c("Myeloperoxidase (MPO)" ,"HLA-DR","CD20","CD68","CD45RA + CD45R0"  ,"CD15")
+
+print(immune.marker)
+```
+
+```{r, subset non T cell immune, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(immune.final)), immune.final$Tma_ac)
+length(unique(immune.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+immune.final.sub <- immune.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(immune.final.sub))[2]/dim(assay(immune.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  immune.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("non_tcell_immune_sub.rds")))
+```
+
+```{r, load subset non t cell immune, message=FALSE, warning=FALSE, echo=FALSE}
+immune.final.sub <- readRDS(file=file.path(data_folder, paste("non_tcell_immune_sub.rds")))
+colnames(colData(immune.final.sub))
+immune.final.sub$rp_immune_final_k15<-NULL
+```
+
+```{r, calculate umap non T cell immune, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+immune.final.sub <- runUMAP(immune.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("non_tcell_immune_sub.rds")))
+
+}
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("non_tcell_immune_sub.rds")))
+
+
+for(i in p){
+immune.final.sub <- runTSNE(immune.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(immune.final.sub, file=file.path(data_folder, paste("non_tcell_immune_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap non t cell immune marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(immune.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(immune.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne immune marker non T cell immune, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE, eval=F}
+dat <-as.data.frame(reducedDims(immune.final.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(immune.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r Clustering non T cell immune using Rphenoannoy, fig.width=12, fig.height=3, message=F, echo=F, warning=F, eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(20)
+#i<-50
+for (i in k) {
+    immune.final$RPmembership <- factor(Rphenoannoy(data = t(assay(immune.final[rownames(immune.final) %in% immune.marker,],"c_counts_asinh")),k = i)[[2]]$membership)
+    cluster <- paste0("rp_immune_k",i)
+ colnames(colData(immune.final))[which(names(colData(immune.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(immune.final, ids=immune.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = immune.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+ saveRDS(immune.final, file=file.path(data_folder, "NonTcell_immune_afterStroma_Tcell-merge_workingfile.rds"))
+}
+#immune.final <-readRDS( file=file.path(data_folder, "NonTcell_immune_afterStroma_Tcell-merge_workingfile.rds"))
+
+```
+
+```{r, add cell categories non t cell immune, fig.width=12, fig.height=4, message=F, echo=F, warning=F}
+is.bcell <- c(25,26,31,21,19,22,20)
+is.neutro <- c(17,18,27,32,23)
+is.myeloid <-c(11,12,14,28,30,10,16,6,9,24,5,1,13,7)
+is.other <-c(29,2,8,3,4,15)
+
+immune.final$immune_category[immune.final$rp_immune_k20 %in% is.bcell]<-"Bcell"
+immune.final$immune_category[immune.final$rp_immune_k20 %in% is.neutro]<-"Neutrophil"
+immune.final$immune_category[immune.final$rp_immune_k20 %in% is.myeloid]<-"Myeloid"
+immune.final$immune_category[immune.final$rp_immune_k20 %in% is.other]<-"other"
+cluster<-"rp_immune_k20"
+agg_sce <-aggregateAcrossCells(immune.final, ids=immune.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = immune.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("immune_category"),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+
+```
+
+```{r, add clustering results to subset no  tcell immune, message=F, echo=F, warning=F}
+immune.final.sub$rp_immune_k20 <-NULL
+immune.final.sub$immune_category <-NULL
+rp_df <- data.frame("CellID"=immune.final$CellID,
+                    "rp_immune_k20"=immune.final$rp_immune_k20,
+                    "immune_category"=immune.final$immune_category)
+
+cur_DF <- as_tibble(colData(immune.final.sub)) %>% left_join(rp_df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(immune.final.sub$ImageNumber, immune.final.sub$CellNumber))
+
+colData(immune.final.sub) <- cur_DF
+rownames(colData(immune.final.sub)) <-immune.final.sub$CellID
+```
+
+```{r, non t cell immune sub umap clusters, message=F, echo=F, warning=F}
+cluster <- "rp_immune_k20"
+p <-plotReducedDim(immune.final.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+
+cluster<- "immune_category"
+p <-plotReducedDim(immune.final.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+```
+```{r, non t cell immune sub final mod, message=F, echo=F, warning=F, eval=F}
+colnames(colData(immune.final))
+
+immune.final$rp_immune_k30 <- NULL
+immune.final$rp_immune_k40<- NULL
+immune.final$rp_immune_k50<- NULL
+immune.final$rp_immune_k20.1<- NULL
+immune.final$rp_immune_final_k15<- NULL
+immune.final$rp_immune_final_k20<- NULL
+immune.final$rp_immune_final_k30<- NULL
+immune.final$rp_immune_final_k10<- NULL
+
+saveRDS(immune.final, file=file.path(data_folder, "NonTcell_immune_afterStroma_Tcell-merge_workingfile.rds"))
+
+```
+
+#merge non t cell immune and t cell
+```{r, merge non t cell immune and t cell sce after clustering back, message=F, echo=F, warning=F, eval=F}
+
+tcell.final <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/immune/Tcell/FINAL_Tcells-minusImmune_workingfile.rds")
+tcell.final$immune_category <-"Tcell"
+
+immune.final$TcellCategory <-immune.final$immune_category
+immune.final$TcellType<-immune.final$immune_category
+immune.final$rp_immune_k20 <-NULL
+tcell.final$rp_immune_final_k10 <-NULL
+tcell.final$rp_immune_final_k15 <-NULL
+tcell.final$rp_immune_final_k20 <-NULL
+tcell.final$rp_tcell_k15 <-NULL
+tcell.final$rp_tcell_k20 <-NULL
+tcell.final$rp_tcell_k30 <-NULL
+tcell.final$rp_tcell_k35 <-NULL
+tcell.final$rp_tcell_tb_k20 <-NULL
+tcell.final$rp_immune_final_k30<-NULL
+
+colnames(colData(immune.final))[!colnames(colData(immune.final)) %in%colnames(colData(tcell.final))]
+colnames(colData(tcell.final))[!colnames(colData(tcell.final)) %in%colnames(colData(immune.final))]
+
+all.immune.cells <-cbind(immune.final, tcell.final)
+#saveRDS(all.immune.cells, file=file.path(data_folder, "FINAL_ALL_IMMUNE_CELLS_incTcellTypes.RDS"))
+
+```
+
+#############################################################################################################################################################################
+#All immune cells including T cell types
+
+
+```{r, Set wd and load data all immune combined, message=F, echo=F, warning=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+#RAW
+#all.immune.cells <- readRDS(file=file.path(data_folder, "FINAL_ALL_IMMUNE_CELLS_incTcellTypes.RDS"))
+
+#workingfile
+all.immune.cells <- readRDS(file=file.path(data_folder, "FINAL_ALL_IMMUNE_CELLS_incTcellTypes_workingfile.RDS"))
+
+#saveRDS(all.immune.cells, file=file.path(data_folder, "FINAL_ALL_IMMUNE_CELLS_incTcellTypes_workingfile.RDS"))
+```
+
+
+```{r, Define immune markers all immune, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.immune.cells)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+immune.marker <-c("Myeloperoxidase (MPO)" ,"HLA-DR","CD20","CD68","Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4" ,"CD15" ,"Ki-67","CD279 (PD-1)")
+
+print(immune.marker)
+```
+
+```{r, subset all immune including T cells, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.immune.cells)), all.immune.cells$Tma_ac)
+length(unique(all.immune.cells$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.immune.cells.sub <- all.immune.cells[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.immune.cells.sub))[2]/dim(assay(all.immune.cells))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.immune.cells.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.immune.cells.sub, file=file.path(data_folder, paste("all_immune_types_sub.rds")))
+```
+
+```{r, load subset immune plus T celss, message=FALSE, warning=FALSE, echo=FALSE}
+all.immune.cells.sub <- readRDS(file=file.path(data_folder,paste("all_immune_types_sub.rds")))
+```
+
+```{r, calculate umap all immune plus T cells, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.immune.cells.sub <- runUMAP(all.immune.cells.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.immune.cells.sub, file=file.path(data_folder, paste("all_immune_types_sub.rds")))
+
+}
+saveRDS(all.immune.cells.sub, file=file.path(data_folder, paste("all_immune_types_sub.rds")))
+
+
+for(i in p){
+all.immune.cells.sub <- runTSNE(all.immune.cells.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.immune.cells.sub, file=file.path(data_folder, paste("all_immune_types_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap all immune plus T cells immune marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.immune.cells.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.immune.cells.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% immune.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 5)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne immune plus T cells immune marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE, eval=F}
+dat <-as.data.frame(reducedDims(all.immune.cells.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.immune.cells.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% immune.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+```{r, plot immune category on immune plus t cell umap, message=F, echo=F, warning=F}
+#Immune category
+cluster<- "immune_category"
+p <-plotReducedDim(all.immune.cells.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+
+#Tcell category
+cluster<- "TcellCategory"
+p <-plotReducedDim(all.immune.cells.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+
+#Immune category
+cluster<- "TcellType"
+p <-plotReducedDim(all.immune.cells.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+plot(p)
+```
+
diff --git a/05_cluster_stroma-cells.Rmd b/05_cluster_stroma-cells.Rmd
new file mode 100644
index 0000000..37874b5
--- /dev/null
+++ b/05_cluster_stroma-cells.Rmd
@@ -0,0 +1,421 @@
+---
+title: "R Notebook - Analyse Stromal cells"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries, message=F, echo=F, warning=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+
+```{r, Set wd and load data all stroma, message=F, echo=F, warning=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","stroma"))
+
+#RAW
+#all.stroma <- readRDS(file=file.path(data_folder, "all_stroma_RAW.rds"))
+
+#workingfile
+all.stroma <- readRDS(file=file.path(data_folder, "all_stroma_workingfile.rds"))
+#saveRDS(all.stroma, file=file.path(data_folder, "all_stroma_RAW.rds"))
+
+#saveRDS(all.stroma, file=file.path(data_folder, "all_stroma_workingfile.rds"))
+
+```
+
+
+```{r, Define stroma markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.stroma)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+stroma.marker <-c("FSP1 / S100A4","SMA","FAP","Cadherin-11","Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 "VCAM1","Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73","MMP9","p75 (CD271)","CD10","Vimentin","CD248 / Endosialin","PDGFR-b","CD34","CXCL12","CCL21","Ki-67","Caveolin-1","CD146",
+                 "vWF + CD31","LYVE-1" ,  "PNAd"   )
+print(stroma.marker)
+```
+
+```{r, subset all.stroma, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.stroma)), all.stroma$Tma_ac)
+length(unique(all.stroma$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.stroma.sub <- all.stroma[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.stroma.sub))[2]/dim(assay(all.stroma))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.stroma.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.stroma.sub, file=file.path(data_folder, paste("all_stroma_sub.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.stroma.sub <- readRDS(file=file.path(data_folder, paste("all_stroma_sub.rds")))
+```
+
+```{r, calculate umap all stroma sub, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.stroma.sub <- runUMAP(all.stroma.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.stroma.sub, file=file.path(data_folder, paste("all_stroma_sub.rds")))
+
+}
+saveRDS(all.stroma.sub, file=file.path(data_folder, paste("all_stroma_sub.rds")))
+
+
+for(i in p){
+all.stroma.sub <- runTSNE(all.stroma.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.stroma.sub, file=file.path(data_folder, paste("all_stroma_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap all.stroma.sub. good marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.stroma.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.stroma.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne all stroma sub good marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE, eval=F}
+dat <-as.data.frame(reducedDims(all.stroma.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.stroma.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r Clustering all stroma using Rphenoannoy, fig.width=25, fig.height=12, message=F, echo=F, warning=F, eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(20)
+#i<-10
+for (i in k) {
+    all.stroma$RPmembership <- factor(Rphenoannoy(data = t(assay(all.stroma[rownames(all.stroma) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_stroma_all_k",i)
+  colnames(colData(all.stroma))[which(names(colData(all.stroma)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.stroma, ids=all.stroma[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.stroma, file=file.path(data_folder, "all_stroma_workingfile.rds"))
+}
+#all.stroma <-readRDS( file=file.path(data_folder, "all_stroma_workingfile.rds"))
+
+```
+```{r, categorise all stroma clusters, message=F, echo=F, warning=F}
+is.tumour <- c(68)
+is.immune <- c(23,53)
+not.stroma <- c(68,23,53)
+all.stroma.tumour <- all.stroma[, all.stroma$rp_stroma_all_k20 %in% is.tumour]
+all.stroma.immune <- all.stroma[, all.stroma$rp_stroma_all_k20 %in% is.immune]
+all.stroma.final <- all.stroma[, !all.stroma$rp_stroma_all_k20 %in% not.stroma]
+
+#saveRDS(all.stroma.tumour, file=file.path(data_folder, "all_stroma_workingfile_TUMOUR.rds"))
+#saveRDS(all.stroma.immune, file=file.path(data_folder, "all_stroma_workingfile_IMMUNE.rds"))
+#saveRDS(all.stroma.final, file=file.path(data_folder, "FINAL_all_stroma_RAW.rds"))
+#stroma.final <-all.stroma.final
+#rm(all.stroma.final, all.stroma.immune, all.stroma.tumour)
+```
+
+
+
+
+################################################################################################################################
+#carrying on using stroma.final from here
+```{r, read in final stroma, message=F, echo=F, warning=F}
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","stroma"))
+
+#RAW
+#stroma.final <-readRDS(file=file.path(data_folder, "FINAL_all_stroma_RAW.rds"))
+
+#Workingfile
+stroma.final <-readRDS(file=file.path(data_folder, "FINAL_all_stroma_workingfile.rds"))
+
+#saveRDS(stroma.final,file=file.path(data_folder,"FINAL_all_stroma_workingfile.rds"))
+
+stroma.marker <-c("FSP1 / S100A4","SMA","FAP","Cadherin-11","Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                  "VCAM1","Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73","MMP9","p75 (CD271)",
+                  "CD10","Vimentin","CD248 / Endosialin","PDGFR-b","CD34","CXCL12","CCL21","Ki-67","Caveolin-1",
+                  "CD146", "vWF + CD31","LYVE-1" ,  "PNAd" )
+
+```
+
+```{r, subset stroma final, message=F, echo=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(stroma.final)), stroma.final$Tma_ac)
+length(unique(stroma.final$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+stroma.final.sub <- stroma.final[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(stroma.final.sub))[2]/dim(assay(stroma.final))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  stroma.final.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(stroma.final.sub, file=file.path(data_folder, paste("stroma_final_sub.rds")))
+```
+
+```{r, load final stroma subset, message=FALSE, warning=FALSE, echo=FALSE}
+stroma.final.sub <- readRDS(file=file.path(data_folder, paste("stroma_final_sub.rds")))
+```
+
+```{r, calculate umap final stroma, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+stroma.final.sub <- runUMAP(stroma.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(stroma.final.sub, file=file.path(data_folder, paste("stroma_final_sub.rds")))
+
+}
+saveRDS(stroma.final.sub, file=file.path(data_folder, paste("stroma_final_sub.rds")))
+
+
+for(i in p){
+stroma.final.sub <- runTSNE(stroma.final.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(stroma.final.sub, file=file.path(data_folder, paste("stroma_final_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap final stroma stroma marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(stroma.final.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(stroma.final.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% stroma.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+```{r Clustering stroma final,using Rphenoannoy, fig.width=25, fig.height=12, message=F, echo=F, warning=F, eval=F}
+stroma.final$rp_stroma_all_k20 <-NULL
+stroma.final$`rp_stroma_all-stroma-marker_k20` <-NULL
+stroma.final$`rp_NONtumour-final_k35` <-NULL
+stroma.final$`rp_NONtumour-final_k40` <-NULL
+
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(20,25,30,35,40)
+#i<-10
+for (i in k) {
+   # stroma.final$RPmembership <- factor(Rphenoannoy(data = t(assay(stroma.final[rownames(stroma.final) %in% stroma.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_stroma_final_k",i)
+ # colnames(colData(stroma.final))[which(names(colData(stroma.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(stroma.final, ids=stroma.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = stroma.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  #saveRDS(stroma.final,file=file.path(data_folder,"FINAL_all_stroma_workingfile.rds"))
+
+}
+#stroma.final <-readRDS( file=file.path(data_folder, "final_stroma_workingfile.rds"))
+
+```
+
+
+```{r,HM stroma final, fig.width=25, fig.height=12, message=F, echo=F, warning=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+is.vessel <- c(40,34,14,22,47,50,31,23,46,49,27,16,32,43,48,11,62,23,61,10)
+
+stroma.final$vessel <- ifelse(stroma.final$rp_stroma_final_k20 %in% is.vessel, "vessel","CAF")
+
+agg_sce <-aggregateAcrossCells(stroma.final, ids=stroma.final$rp_stroma_final_k20, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = stroma.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("vessel"))
+    
+stroma.vessel <- stroma.final[, stroma.final$vessel =="vessel"]
+stroma.CAF <- stroma.final[, stroma.final$vessel =="CAF"]
+
+#saveRDS(stroma.vessel, file=file.path(data_folder, "vessel","stroma_final_VESSEL_workingfile.rds"))
+#saveRDS(stroma.vessel, file=file.path(data_folder, "vessel","stroma_final_VESSEL_RAW.rds"))
+
+#saveRDS(stroma.vessel, file=file.path(data_folder,"stroma_final_VESSEL.rds"))
+
+#saveRDS(stroma.CAF, file=file.path(data_folder,"stroma_final_CAF.rds"))
+```
diff --git a/06_cluster_vessel-cells.Rmd b/06_cluster_vessel-cells.Rmd
new file mode 100644
index 0000000..67555b7
--- /dev/null
+++ b/06_cluster_vessel-cells.Rmd
@@ -0,0 +1,531 @@
+---
+title: "R Notebook"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries, message=F, warning=F, echo=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+
+```{r, Set wd and load data, message=F, warning=F, echo=F,eval=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","stroma","vessel"))
+
+#RAW
+all.vessel<- readRDS(file=file.path(data_folder, "stroma_final_VESSEL_RAW.rds"))
+
+#workingfile
+all.vessel <- readRDS(file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+saveRDS(all.vessel, file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+```
+
+
+```{r, Define stroma_vessel markers, echo=F, warning=F, message=FALSE, eval=F}
+all.marker <-rownames(all.vessel)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+stroma.marker <-c("FSP1 / S100A4","SMA","FAP","Cadherin-11","Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 "VCAM1","Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73","MMP9","p75 (CD271)","CD10","Vimentin","CD248 / Endosialin","PDGFR-b","CD34","CXCL12","CCL21","Ki-67","Caveolin-1","CD146",
+                 "vWF + CD31","LYVE-1" ,  "PNAd"   )
+
+vessel.marker <-c("SMA","FAP", "VCAM1","p75 (CD271)",
+                  "Vimentin","CD248 / Endosialin",
+                  "PDGFR-b","CD34","CXCL12","CCL21",
+                  "CD146","vWF + CD31","LYVE-1" ,"PNAd")
+print(vessel.marker)
+```
+
+```{r, subset vessel, message=F, warning=F, echo=F,eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.vessel)), all.vessel$Tma_ac)
+length(unique(all.vessel$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.vessel.sub <- all.vessel[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.vessel.sub))[2]/dim(assay(all.vessel))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.vessel.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.vessel.sub, file=file.path(data_folder, paste("all_vessel_sub.rds")))
+```
+
+```{r, load subset vessel, message=FALSE, warning=FALSE, echo=FALSE, eval=F}
+all.vessel.sub <- readRDS(file=file.path(data_folder, paste("all_vessel_sub.rds")))
+```
+
+```{r, calculate umap vessel, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.vessel.sub <- runUMAP(all.vessel.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.vessel.sub, file=file.path(data_folder, paste("all_vessel_sub.rds")))
+
+}
+saveRDS(all.vessel.sub, file=file.path(data_folder, paste("all_vessel_sub.rds")))
+
+
+for(i in p){
+all.vessel.sub <- runTSNE(all.vessel.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.vessel.sub, file=file.path(data_folder, paste("all_vessel_sub.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap vessel stroma marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE, eval=F}
+dat <-as.data.frame(reducedDims(all.vessel.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.vessel.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne vessel stroma marker, fig.width=12, fig.height=8, message=F, warning=F, echo=F,eval=F}
+dat <-as.data.frame(reducedDims(all.vessel.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.vessel.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r Clustering vessel stroma cells using Rphenoannoy, fig.width=25, fig.height=12, message=F, warning=F, echo=F,eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(15,20)
+#i<-10
+for (i in k) {
+    all.vessel$RPmembership <- factor(Rphenoannoy(data = t(assay(all.vessel[rownames(all.vessel) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_vessel_all_k",i)
+  colnames(colData(all.vessel))[which(names(colData(all.vessel)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.vessel, ids=all.vessel[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.vessel, file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+
+}
+#all.vessel <-readRDS( file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+
+```
+
+
+```{r, categorise vessel cells,message=F, warning=F, echo=F,eval=F}
+is.tumour <- c(43,41,44)
+all.vessel.tumour <- all.vessel[, all.vessel$rp_vessel_all_k20 %in% is.tumour]
+all.vessel.final <- all.vessel[, !all.vessel$rp_vessel_all_k20 %in% is.tumour]
+
+saveRDS(all.vessel.tumour, file=file.path(data_folder, "all_vessel_workingfile_TUMOUR.rds"))
+saveRDS(all.vessel.final, file=file.path(data_folder, "all_vessel_workingfile_VESSEL.rds"))
+```
+
+
+```{r Clustering vessel only using Rphenoannoy, fig.width=15, fig.height=5, message=F, warning=F, echo=F,eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(15,20,30)
+#i<-20
+for (i in k) {
+    all.vessel.final$RPmembership <- factor(Rphenoannoy(data = t(assay(all.vessel.final[rownames(all.vessel.final) %in% good.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_vessel_vessel_k",i)
+  colnames(colData(all.vessel.final))[which(names(colData(all.vessel.final)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.vessel.final, ids=all.vessel.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features =vessel.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.vessel.final, file=file.path(data_folder, "all_vessel_workingfile_VESSEL.rds"))
+
+}
+#all.vessel.final <-readRDS( file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+
+```
+
+```{r, vessel type vessel only, fig.width=15, fig.height=5, message=F, warning=F, echo=F,eval=F}
+is.vessel <- c(31,27,13,28,12,21,5,18,11,38,8,35,34,43,9,39,30,6)
+
+all.vessel.final$vesseltype <- ifelse(all.vessel.final$rp_vessel_vessel_k20 %in%is.vessel, "vessel","CAF")
+
+ cluster <- paste0("rp_vessel_vessel_k20")
+  agg_sce <-aggregateAcrossCells(all.vessel.final, ids=all.vessel.final[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features =vessel.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("vesseltype"),
+              main=paste0("Heatmap vessel cells, ",cluster))
+    
+table(all.vessel.final$vesseltype)
+
+vessel.vessel <- all.vessel.final[, all.vessel.final$rp_vessel_vessel_k20 %in% is.vessel]
+vessel.CAF <- all.vessel.final[, !all.vessel.final$rp_vessel_vessel_k20 %in% is.vessel]
+
+saveRDS(vessel.vessel, file=file.path(data_folder, "vessel-VESSEL.rds"))
+saveRDS(vessel.CAF, file=file.path(wd,"sce_objects","stroma", "vessel-CAF.rds"))
+```
+
+###After adding HEV from CAF clustering
+
+
+
+```{r, Set wd and load data final vessel, message=F, warning=F, echo=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+
+#RAW
+#all.vessel<- readRDS(file=file.path(data_folder, "VESSEL_CLINICAL-DATA_FILTERED.rds"))
+
+#workingfile
+data_folder <-(file.path(wd,"sce_objects","stroma","vessel"))
+all.vessel <-readRDS(file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+#all.vessel <- readRDS(file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+#saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+  saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+```
+
+
+```{r, Define final vessel markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.vessel)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+
+vessel.marker <-c("SMA", "VCAM1","CCL21",
+                  "CD146","vWF + CD31","LYVE-1" ,"PNAd")
+print(vessel.marker)
+```
+
+
+
+
+```{r, calculate umap vessel final, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.vessel <- runUMAP(all.vessel,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+}
+
+
+#for(i in p){
+#all.vessel <- runTSNE(all.vessel,
+ #                exprs_values = "c_counts_asinh",
+ #                name = paste0("tSNE_p", i),
+ #                #use_dimred="PCA_20",
+ #               perplexity = i)
+#saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+#}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap vessel.final vessel marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.vessel)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.vessel,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% vessel.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+**tsne with Tumour markers**
+```{r,plot tsne vessel final marker, fig.width=12, fig.height=8, message=F, warning=F, echo=F,eval=F}
+dat <-as.data.frame(reducedDims(all.vessel.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.vessel.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+
+
+```{r Clustering vessel final cells using Rphenoannoy, fig.width=12, fig.height=6, message=F, warning=F, echo=F,eval=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#marker.clustering <- good.marker[!good.marker %in% "Pan Cytokeratin + Keratin Epithelial"]
+#cellnb <- 250000
+#subsample <-  colnames(sce)[sample(length(colnames(sce)), cellnb)]
+#sce_sub <- sce[, colnames(sce) %in% subsample]
+
+#set ks for clustering
+#i <- 100
+k <- c(50)
+#i<-10
+for (i in k) {
+    all.vessel$RPmembership <- factor(Rphenoannoy(data = t(assay(all.vessel[rownames(all.vessel) %in% vessel.marker,],"c_counts_asinh")), k = i)[[2]]$membership)
+    cluster <- paste0("rp_vessel_all_k",i)
+  colnames(colData(all.vessel))[which(names(colData(all.vessel)) == "RPmembership")] <- paste0(cluster)
+  agg_sce <-aggregateAcrossCells(all.vessel, ids=all.vessel[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = vessel.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c(cluster),
+              main=paste0("Heatmap tumour cells, ",cluster))
+    
+  #save sce clustering
+  saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+}
+#all.vessel <-readRDS( file=file.path(data_folder, "stroma_final_VESSEL_workingfile.rds"))
+
+```
+
+
+```{r, categorise vessel final,echo=F, message=F, warning=F}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+is.blood <- c(17,19,10,20,14,7,8,12,9,11)
+is.lymph <-c(16,18,15,13,22,21)
+is.HEV <-c(3,6,1,2,4,5)
+
+all.vessel$vessel_type[all.vessel$rp_vessel_all_k50 %in% is.HEV] <-"HEV"
+all.vessel$vessel_type[all.vessel$rp_vessel_all_k50 %in% is.lymph] <-"Lymphatic"
+all.vessel$vessel_type[all.vessel$rp_vessel_all_k50 %in% is.blood] <-"Blood"
+
+agg_sce <-aggregateAcrossCells(all.vessel, ids=all.vessel$rp_vessel_all_k50, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = vessel.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("vessel_type"))
+    
+  agg_sce <-aggregateAcrossCells(all.vessel, ids=all.vessel$vessel_type, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+ 
+    scater::plotHeatmap(agg_sce,
+              features = vessel.marker,
+              #features = all.marker,
+              #features = marker.clustering,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("vessel_type"))
+    
+table(all.vessel$vessel_type)
+```
+
+```{r, save vessel final, eval=F, message=F, warning=F, echo=F}
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+all.vessel$rp_vessel_all_k50 <-NULL
+colnames(colData(all.vessel))
+  saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+```
+
diff --git a/07_cluster_CAFs.Rmd b/07_cluster_CAFs.Rmd
new file mode 100644
index 0000000..3e70a6d
--- /dev/null
+++ b/07_cluster_CAFs.Rmd
@@ -0,0 +1,641 @@
+---
+title: "R Notebook"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries, echo=F, message=F, warning=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+
+```{r, Set wd and load data, echo=F, message=F, warning=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","stroma","CAF"))
+
+#RAW
+#stroma.fibro<- readRDS(file=file.path(data_folder, "stroma_final_CAF.rds"))
+#vessel.CAF <-readRDS(file=file.path(data_folder, "vessel-CAF.rds"))
+#all.fibro <- readRDS(file=file=file.path(data_folder,"all_fibros_RAW.rds"))
+#workingfile
+all.fibro <- readRDS(file=file.path(data_folder,"all_fibros_workingfile.rds"))
+```
+
+```{r, merge CAFs from vessel, echo=F, message=F, warning=F, eval=F}
+colnames(colData(stroma.fibro))[!colnames(colData(stroma.fibro)) %in%colnames(colData(vessel.CAF))]
+colnames(colData(vessel.CAF))[!colnames(colData(vessel.CAF)) %in%colnames(colData(stroma.fibro))]
+
+vessel.CAF$rp_vessel_all_k15 <-NULL
+vessel.CAF$rp_vessel_all_k20 <-NULL
+vessel.CAF$rp_vessel_vessel_k15 <-NULL
+vessel.CAF$rp_vessel_vessel_k20 <-NULL
+vessel.CAF$rp_vessel_vessel_k30 <-NULL
+vessel.CAF$vesseltype <-NULL
+
+all.fibro <-cbind(stroma.fibro, vessel.CAF)
+
+#saveRDS(all.fibro, file=file.path(data_folder,"all_fibros_RAW.rds"))
+#saveRDS(all.fibro, file=file.path(data_folder,"all_fibros_workingfile.rds"))
+```
+
+```{r, Define stroma markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.fibro)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+fibro.marker <-c("FSP1 / S100A4","SMA","FAP","Cadherin-11","Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 "VCAM1","Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73","MMP9","p75 (CD271)","CD10","Vimentin","CD248 / Endosialin","PDGFR-b","CD34","CXCL12","CCL21","Ki-67","Caveolin-1","CD146")
+
+
+print(fibro.marker)
+```
+
+```{r, subset CAF all, echo=F, message=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.fibro)), all.fibro$Tma_ac)
+length(unique(all.fibro$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.fibro.sub <- all.fibro[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.fibro.sub))[2]/dim(assay(all.fibro))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.fibro.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("all_fibro_sub.rds")))
+```
+
+```{r, load subset CAF all, message=FALSE, warning=FALSE, echo=FALSE}
+all.fibro.sub <- readRDS(file=file.path(data_folder, paste("all_fibro_sub.rds")))
+all.fibro.sub <- readRDS(file=file.path(data_folder, paste("final_fibro_sub.rds")))
+```
+
+```{r, calculate umap CAF all, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,25,50,75,100)
+#p=50
+for(i in p){
+all.fibro.sub <- runUMAP(all.fibro.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("all_fibro_sub.rds")))
+
+}
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("all_fibro_sub.rds")))
+
+
+for(i in p){
+all.fibro.sub <- runTSNE(all.fibro.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("all_fibro_sub.rds")))
+}
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("all_fibro_sub.rds")))
+
+```
+
+
+**UMAP with good markers**
+```{r,plot umap CAF all good marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+fibro.marker <-c("FSP1 / S100A4","SMA","FAP","Cadherin-11","Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 "VCAM1","Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73","MMP9","p75 (CD271)","CD10","Vimentin","CD248 / Endosialin","PDGFR-b","CD34","CXCL12","CCL21","Ki-67","Caveolin-1","CD146")
+
+fibro.marker <-c("FSP1 / S100A4","SMA","FAP","Cadherin-11","Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 "Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73","CD10","Vimentin","CD248 / Endosialin","PDGFR-b","CD34","Ki-67","Caveolin-1","CD146")
+
+fibro.marker.sub <- c("SMA","FAP","Ki-67","PDGFR-b","CD10","Carbonic Anhydrase IX","CD146","CD73","CD34")
+u.map <-c("UMAP_p10","UMAP_p25","UMAP_p50","UMAP_p75","UMAP_p100")
+
+i="UMAP_p10"
+i = "tSNE_p100"
+for(i in u.map){
+dat <-as.data.frame(reducedDims(all.fibro.sub)[[i]])
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.fibro.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% fibro.marker.sub], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 3)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+ggsave(filename=file.path(plot_folder, paste0("",i,"CAFmarker",".png",sep="")), plot=p, width=8, height=6)
+plot(p)
+
+
+#CAFType
+cluster <- "cell_type"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") +scale_color_d3("category20")# text_by = paste(cluster),text_colour ="black",
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, paste0("",i,"_CAF_types.png")))
+#CAFSubType
+cluster <- "cell_subtype"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP")+scale_color_d3("category20") 
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, paste0("",i,"_CAF_subtypes.png")))
+
+}
+```
+**tsne with good markers**
+```{r,plot umap CAF all good marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+
+t.sne <-c("tSNE_p10","tSNE_p25","tSNE_p50","tSNE_p75","tSNE_p100")
+
+for(i in t.sne){
+dat <-as.data.frame(reducedDims(all.fibro.sub)[[i]])
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.fibro.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% fibro.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+ggsave(filename=file.path(plot_folder, paste0("tSNE_p",i,"CAFmarker",".png",sep="")), plot=p, width=16, height=10)
+plot(p)
+
+
+#CAFType
+cluster <- "cell_type"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="black", point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") +scale_color_d3("category20")
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, paste0("tSNE_",i,"_CAF_types.png")))
+#CAFSubType
+cluster <- "cell_subtype"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE")+scale_color_d3("category20") 
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, paste0("tSNE_p",i,"_CAF_subtypes.png")))
+
+}
+```
+```{r}
+df <- data.frame("CellID"=fibro.sce$CellID,
+                 "CAFtype" =fibro.sce$cell_type,
+                 "CAFsubtype"= fibro.sce$cell_subtype)
+
+cur_DF <- as_tibble(colData(all.fibro.sub)) %>% left_join(df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(all.fibro.sub$ImageNumber, all.fibro.sub$CellNumber))
+
+colData(all.fibro.sub) <- cur_DF
+rownames(colData(all.fibro.sub)) <-all.fibro.sub$CellID
+```
+
+#FLOWSOM clustering
+
+```{r, cluster CAFs using FLOWSOM, echo=F, eval=FALSE, warning=F, message=F, fig.width=15, fig.height=8}
+fibro.marker.cluster <-c("SMA","FAP", "Cadherin-11", "Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 #"VCAM1",
+                 "Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73",
+                 #"MMP9",
+                 "CD10","Vimentin","CD248 / Endosialin",
+                 #"LYVE-1",
+                 "PDGFR-b","CD34","CXCL12","CCL21","Ki-67",
+                 #"Caveolin-1",
+                 "CD146","PNAd")
+
+
+
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+assay(all.fibro, "exprs")<-assay(all.fibro, "c_counts_asinh")
+
+#run FlowSOM
+re2 <- CATALYST::cluster(all.fibro, features =fibro.marker.cluster, verbose = FALSE, maxK = 50)
+fibro.test <- all.fibro
+fibro.test2 <- all.fibro
+cl <-c(30:45)
+#i <-5
+for (i in cl){
+  #i=35
+  cluster <- paste0("som_",i)
+  fibro.test2[[cluster]] <- as.factor(cluster_ids(re2, paste0("meta",i)))
+  agg_sce <- aggregateAcrossCells(fibro.test2, ids=fibro.test2[[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+#plot Heatmap
+  scater::plotHeatmap(agg_sce,
+              #features = fibro.marker,
+              features=fibro.marker.cluster,
+              exprs_values = "c_counts_asinh_scaled",
+              #symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("ids"),
+              main=paste0("Heatmap Fibros, ",cluster))
+}
+```
+
+**Heatmap fibroblast cluster after FLOWSOM clustering som_30**
+```{r, assign CAF levels, fig.width=15, fig.height=8, echo=F, message=FALSE,warning=FALSE, eval=FALSE}
+fibro.test2 <-readRDS(file=file.path(data_folder,"all_fibros_workingfile.rds"))
+
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+is.HEV <-c(17,10,9)
+is.iCAF.CD248 <-c(6,22,35,44)
+is.vCAF <-c(1,2,3,4,5)
+is.tpCAF.CD10 <-c(8,16)
+is.tpCAF.CD73 <-c(21)
+is.iCAF.CD34 <-c(7,32)
+is.IDO.CAF <-c(13)
+is.mCAF.Col.Cdh <-c(29,19,31)
+is.mCAF.MMP11 <-c(39,14,40,12,11,19)
+is.SMA.CAF <-c(24,27,25,20,18)
+is.Collagen.CAF <-c(37,30,33)
+is.dCAF <-c(28,43)
+is.PDPN.CAF <-c(34,23)
+is.hypox.tpCAF <-c(38,45)
+is.hCAF <-c(46,36,41,42)
+is.other <-c(15,26)
+
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.HEV] <-"HEV"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.iCAF.CD248] <-"iCAF_CD248"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.vCAF] <-"vCAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.tpCAF.CD10] <-"tpCAF_CD10"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.tpCAF.CD73] <-"tpCAF_CD73"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.iCAF.CD34] <-"iCAF_CD34"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.IDO.CAF] <-"IDO_CAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.mCAF.Col.Cdh] <-"mCAF_Col_Cdh"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.mCAF.MMP11] <-"mCAF_MMP11"
+
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.SMA.CAF] <-"SMA_CAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.hypox.tpCAF] <-"hypoxic_tpCAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.Collagen.CAF] <-"Collagen_CAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.dCAF] <-"dCAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.PDPN.CAF] <-"PDPN_CAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.hCAF] <-"hypoxic_CAF"
+fibro.test2$CAFsubtype[fibro.test2$som_45 %in% is.other] <-"other"
+
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.HEV] <-"HEV"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.iCAF.CD248] <-"iCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.vCAF] <-"vCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.tpCAF.CD10] <-"tpCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.tpCAF.CD73] <-"tpCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.iCAF.CD34] <-"iCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.IDO.CAF] <-"IDO_CAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.mCAF] <-"mCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.SMA.CAF] <-"SMA_CAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.hypox.tpCAF] <-"hypoxic_tpCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.Collagen.CAF] <-"Collagen_CAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.dCAF] <-"dCAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.PDPN.CAF] <-"PDPN_CAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.hCAF] <-"hypoxic_CAF"
+fibro.test2$CAFtype[fibro.test2$som_45 %in% is.other] <-"other"
+```
+
+
+```{r, plot HM CAF all, fig.width=15, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+#saveRDS(fibro.test2, file=file.path(data_folder,"all_fibros_workingfile.rds"))
+fibro.marker.cluster <-c("SMA","FAP", "Cadherin-11", "Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 #"VCAM1",
+                 "Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73",
+                 #"MMP9",
+                 "CD10","Vimentin","CD248 / Endosialin",
+                 #"LYVE-1",
+                 "PDGFR-b","CD34","CXCL12","CCL21","Ki-67",
+                 #"Caveolin-1",
+                 "CD146","PNAd")
+i <-45
+cluster <- paste0("som_",i)
+agg_sce <- aggregateAcrossCells(all.fibro[,all.fibro$CAFtype !="other"&all.fibro$CAFtype !="HEV"], ids=all.fibro[,all.fibro$CAFtype !="other"&all.fibro$CAFtype !="HEV"][[cluster]], average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+
+ #plot Heatmap
+scater::plotHeatmap(agg_sce,
+              #features = good.marker,
+              features=fibro.marker.cluster,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("CAFtype","CAFsubtype"),
+              main=paste0("Heatmap Fibros, ",cluster))
+
+agg_sce <- aggregateAcrossCells(all.fibro, ids=all.fibro$CAFtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+
+ #plot Heatmap
+scater::plotHeatmap(agg_sce,
+              #features = good.marker,
+              features=fibro.marker.cluster,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("CAFtype"),
+              main=paste0("Heatmap Fibros, ",cluster))
+
+agg_sce <- aggregateAcrossCells(all.fibro, ids=all.fibro$CAFsubtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled")
+
+ #plot Heatmap
+scater::plotHeatmap(agg_sce,
+              #features = good.marker,
+              features=fibro.marker.cluster,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("CAFsubtype"),
+              main=paste0("Heatmap Fibros, ",cluster))
+
+table(all.fibro$CAFtype)
+table(all.fibro$CAFsubtype)
+```
+
+
+
+```{r, subset CAF final, message=F, warning=F, echo=F}
+#fibro.test2.PNAd <- fibro.test2[,fibro.test2$CAFtype=="HEV"]
+#fibro.test2.other<- fibro.test2[,fibro.test2$CAFtype=="other"]
+
+fibro.final <- all.fibro[,all.fibro$CAFtype!="HEV"&
+                             all.fibro$CAFtype!="other"]
+
+#saveRDS(fibro.test2.PNAd, file=file.path(data_folder,"all_fibros_workingfile_HEV.rds"))
+
+#saveRDS(fibro.test2.other, file=file.path(data_folder,"all_fibros_workingfile_OTHER.rds"))
+
+#saveRDS(fibro.final, file=file.path(data_folder,"all_fibros_workingfile_FIBROs.rds"))
+
+#saveRDS(fibro.final, file=file.path(data_folder,"FINAL_Fibros.rds"))
+
+```
+
+
+```{r, add clustering results to subset fibro final, message=F, warning=F, echo=F}
+rp_df <- data.frame("CellID"=all.fibro$CellID, "som_45"=all.fibro$som_45)
+rp_df <- data.frame("CellID"=all.fibro$CellID, "CAFtype"=all.fibro$CAFtype, "CAFsubtype"=all.fibro$CAFsubtype)
+
+cur_DF <- as_tibble(colData(all.fibro.sub)) %>% left_join(rp_df, by = "CellID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(all.fibro.sub$ImageNumber, all.fibro.sub$CellNumber))
+
+colData(all.fibro.sub) <- cur_DF
+rownames(colData(all.fibro.sub)) <-all.fibro.sub$CellID
+```
+
+```{r, plot UMAP ALL FIBROS types, message=F, warning=F, echo=F, eval=F}
+#cluster <- "som_40"
+#plotReducedDim(all.fibro.sub[, all.fibro.sub$som_40==15], "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+
+#cluster <- "som_45"
+#plotReducedDim(all.fibro.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+
+#CAFType
+cluster <- "CAFtype"
+plotReducedDim(all.fibro.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+
+#CAFSubType
+cluster <- "CAFsubtype"
+plotReducedDim(all.fibro.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+```
+#Fibro final tsne
+```{r, subset CAF all, echo=F, message=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(fibro.sce)), fibro.sce$Tma_ac)
+length(unique(fibro.sce$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.fibro.sub <- fibro.sce[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.fibro.sub))[2]/dim(assay(fibro.sce))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.fibro.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("final_fibro_sub.rds")))
+```
+
+```{r, load subset CAF all, message=FALSE, warning=FALSE, echo=FALSE}
+all.fibro.sub <- readRDS(file=file.path(data_folder, paste("final_fibro_sub.rds")))
+```
+
+```{r, calculate umap CAF all, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,25,50,75,100)
+#p=50
+for(i in p){
+all.fibro.sub <- runUMAP(all.fibro.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("final_fibro_sub.rds")))
+
+}
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("final_fibro_sub.rds")))
+
+
+for(i in p){
+all.fibro.sub <- runTSNE(all.fibro.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("final_fibro_sub.rds")))
+}
+saveRDS(all.fibro.sub, file=file.path(data_folder, paste("final_fibro_sub.rds")))
+
+```
+
+
+**UMAP with good markers**
+```{r,plot umap CAF all good marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+
+plot_folder <- file.path(data_folder,"plots")
+u.map <-c("UMAP_p10","UMAP_p25","UMAP_p50","UMAP_p75","UMAP_p100")
+
+for( i in u.map){
+dat <-as.data.frame(reducedDims(all.fibro.sub)[[i]])
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.fibro.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% fibro.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+ggsave(filename=file.path(plot_folder, paste0("fibro_sub_fibro-Marker_",i,".png")), plot=p, width=16, height=10)
+plot(p)
+
+
+#CAFType
+cluster <- "cell_type"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") 
+plot(p)
+ggsave(p,file=file.path(plot_folder,paste0( "fibro_sub_",i,"CAFtype.png")) )
+#CAFSubType
+cluster <- "cell_subtype"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") 
+plot(p)
+ggsave(p,file=file.path(plot_folder,paste0( "fibro_sub_",i,"CAFsubtype.png")) )
+
+}
+```
+**tsne with good markers**
+```{r,plot umap CAF all good marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+
+t.sne <-c("tSNE_p10","tSNE_p25","tSNE_p50","tSNE_p75","tSNE_p100")
+
+for(i in t.sne){
+dat <-as.data.frame(reducedDims(all.fibro.sub)[[i]])
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.fibro.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% fibro.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+
+#CAFType
+cluster <- "cell_type"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") 
+plot(p)
+#CAFSubType
+cluster <- "cell_subtype"
+p <-plotReducedDim(all.fibro.sub, paste(i), colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="tSNE") 
+plot(p)
+}
+```
+
+
+
+```{r}
+install.packages("circlize")
+library(circlize)
+origin <- paste0("orig ", sample(c(1:10), 20, replace = T))
+destination <- paste0("dest ", sample(c(1:10), 20, replace = T))
+data <- data.frame(origin, destination)
+
+# Transform input data in a adjacency matrix
+adjacencyData <- with(data, table(origin, destination))
+ 
+# Charge the circlize library
+library(circlize)
+ 
+# Make the circular plot
+chordDiagram(adjacencyData, transparency = 0.5)
+```
+```{r}
+library(scales)
+show_col(pal_d3("category20")(20))
+```
+
diff --git a/08_merge-all-cells_post-Clustering.Rmd b/08_merge-all-cells_post-Clustering.Rmd
new file mode 100644
index 0000000..ead6da0
--- /dev/null
+++ b/08_merge-all-cells_post-Clustering.Rmd
@@ -0,0 +1,369 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+```{r}
+final.fibro <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL_Fibros.rds")
+final.fibro.other <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_fibros_workingfile_OTHER.rds")
+final.fibro.HEV <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_fibros_workingfile_HEV.rds")
+
+final.vessel <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/vessel-VESSEL.rds")
+final.vessel.tumour <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_vessel_workingfile_TUMOUR.rds")
+
+final.stroma.tumour <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_stroma_workingfile_TUMOUR.rds")
+
+final.immune.tumour <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_immune_workingfile_TUMOUR.rds")
+final.immune.nonTcell <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/NonTcell_immune_afterStroma_Tcell-merge_workingfile.rds")
+final.immune.Tcell <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL_Tcells-minusImmune_workingfile.rds")
+
+
+final.tumour.tumour <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/merge_all-tumour_TUMOUR.rds")
+
+final.nontumour.undefined <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/merge_all-NONtumour_final_workingfile_UNDEFINED.rds")
+final.nontumour.tumour <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/merge_all-NONtumour_final_workingfile_TUMOUR.rds")
+
+```
+
+```{r}
+#final.fibro
+colData(final.fibro) %>% colnames()
+final.fibro$cell_category <- "Fibroblast"
+final.fibro$cell_type <- final.fibro$CAFtype
+final.fibro$cell_subtype <- final.fibro$CAFsubtype
+
+cur_DF <- colData(final.fibro)%>% data.frame() %>% select(-contains(c("som_","rp_","CAF"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.fibro$ImageNumber, final.fibro$CellNumber))
+
+colData(final.fibro) <- cur_DF
+
+#final.fibro.other
+colData(final.fibro.other) %>% colnames()
+final.fibro.other$cell_category <- "Other"
+final.fibro.other$cell_type <- "Other"
+final.fibro.other$cell_subtype <- "Other"
+
+cur_DF <- colData(final.fibro.other)%>% data.frame() %>% select(-contains(c("som_","rp_","CAF"))) %>% DataFrame()
+
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.fibro.other$ImageNumber, final.fibro.other$CellNumber))
+
+colData(final.fibro.other) <- cur_DF
+
+#final.fibro.HEV
+colData(final.fibro.HEV) %>% colnames()
+final.fibro.HEV$cell_category <- "vessel"
+final.fibro.HEV$cell_type <- "HEV"
+final.fibro.HEV$cell_subtype <- "HEV"
+
+cur_DF <- colData(final.fibro.HEV)%>% data.frame() %>% select(-contains(c("som_","rp_","CAF"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.fibro.HEV$ImageNumber, final.fibro.HEV$CellNumber))
+
+colData(final.fibro.HEV) <- cur_DF
+
+#final.vessel
+colData(final.vessel) %>% colnames()
+final.vessel$cell_category <- "vessel"
+final.vessel$cell_type <- "vessel"
+final.vessel$cell_subtype <- "vessel"
+
+cur_DF <- colData(final.vessel)%>% data.frame() %>% select(-contains(c("som_","rp_","vessel","tumour"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.vessel$ImageNumber, final.vessel$CellNumber))
+
+colData(final.vessel) <- cur_DF
+
+
+#final.vessel.tumour.tumour
+colData(final.vessel.tumour) %>% colnames()
+final.vessel.tumour$cell_category <- "Tumour"
+final.vessel.tumour$cell_type <- "Tumour"
+final.vessel.tumour$cell_subtype <- "Tumour"
+
+cur_DF <- colData(final.vessel.tumour)%>% data.frame() %>% select(-contains(c("som_","rp_","vessel","tumour"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.vessel.tumour$ImageNumber, final.vessel.tumour$CellNumber))
+
+colData(final.vessel.tumour) <- cur_DF
+
+
+#final.stroma.tumour.tumour
+colData(final.stroma.tumour) %>% colnames()
+final.stroma.tumour$cell_category <- "Tumour"
+final.stroma.tumour$cell_type <- "Tumour"
+final.stroma.tumour$cell_subtype <- "Tumour"
+
+cur_DF <- colData(final.stroma.tumour)%>% data.frame() %>% select(-contains(c("som_","rp_","vessel","tumour"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.stroma.tumour$ImageNumber, final.stroma.tumour$CellNumber))
+
+colData(final.stroma.tumour) <- cur_DF
+
+#final.immune.tumour.tumour
+colData(final.immune.tumour) %>% colnames()
+final.immune.tumour$cell_category <- "Tumour"
+final.immune.tumour$cell_type <- "Tumour"
+final.immune.tumour$cell_subtype <- "Tumour"
+
+cur_DF <- colData(final.immune.tumour)%>% data.frame() %>% select(-contains(c("som_","rp_","vessel","tumour"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.immune.tumour$ImageNumber, final.immune.tumour$CellNumber))
+
+colData(final.immune.tumour) <- cur_DF
+
+
+#final.immune.nonTcell.tumour
+colData(final.immune.nonTcell) %>% colnames()
+final.immune.nonTcell$cell_category <- "Immune"
+final.immune.nonTcell$cell_type <- final.immune.nonTcell$immune_category
+final.immune.nonTcell$cell_subtype <- final.immune.nonTcell$immune_category
+
+cur_DF <- colData(final.immune.nonTcell)%>% data.frame() %>% select(-contains(c("som_","rp_","vessel","tumour","immune"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.immune.nonTcell$ImageNumber, final.immune.nonTcell$CellNumber))
+
+colData(final.immune.nonTcell) <- cur_DF
+
+#final.immune.Tcell
+colData(final.immune.Tcell) %>% colnames()
+final.immune.Tcell$cell_category <- "T cell"
+final.immune.Tcell$cell_type <- final.immune.Tcell$TcellCategory
+final.immune.Tcell$cell_subtype <-  final.immune.Tcell$TcellType
+
+cur_DF <- colData(final.immune.Tcell)%>% data.frame() %>% select(-contains(c("som_","rp_","immune","tumour","Tcell"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.immune.Tcell$ImageNumber, final.immune.Tcell$CellNumber))
+
+colData(final.immune.Tcell) <- cur_DF
+
+#final.tumour.tumour
+colData(final.tumour.tumour) %>% colnames()
+final.tumour.tumour$cell_category <- "Tumour"
+final.tumour.tumour$cell_type <- "Tumour"
+final.tumour.tumour$cell_subtype <-  "Tumour"
+
+cur_DF <- colData(final.tumour.tumour)%>% data.frame() %>% select(-contains(c("som_","rp_","immune","tumour","Tcell"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.tumour.tumour$ImageNumber, final.tumour.tumour$CellNumber))
+
+colData(final.tumour.tumour) <- cur_DF
+
+#final.nontumour.tumour
+colData(final.nontumour.tumour) %>% colnames()
+final.nontumour.tumour$cell_category <- "Tumour"
+final.nontumour.tumour$cell_type <- "Tumour"
+final.nontumour.tumour$cell_subtype <-  "Tumour"
+
+cur_DF <- colData(final.nontumour.tumour)%>% data.frame() %>% select(-contains(c("som_","rp_","immune","tumour","Tcell"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.nontumour.tumour$ImageNumber, final.nontumour.tumour$CellNumber))
+
+colData(final.nontumour.tumour) <- cur_DF
+
+#final.nontumour.undefined
+colData(final.nontumour.undefined) %>% colnames()
+final.nontumour.undefined$cell_category <- "Other"
+final.nontumour.undefined$cell_type <- "Other"
+final.nontumour.undefined$cell_subtype <-  "Other"
+
+cur_DF <- colData(final.nontumour.undefined)%>% data.frame() %>% select(-contains(c("som_","rp_","immune","tumour","Tcell"))) %>% DataFrame()
+colnames(cur_DF)
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(final.nontumour.undefined$ImageNumber, final.nontumour.undefined$CellNumber))
+
+colData(final.nontumour.undefined) <- cur_DF
+
+```
+
+```{r}
+
+final.fibro %>%colData() %>%colnames() %>%length()
+assay(final.fibro, "exprs") <-NULL
+
+final.fibro.other %>%colData() %>%colnames()%>%length()
+assay(final.fibro.other, "exprs") <-NULL
+
+final.fibro.HEV  %>%colData() %>%colnames()%>%length()
+assay(final.fibro.HEV, "exprs") <-NULL
+
+final.vessel  %>%colData() %>%colnames()%>%length()
+assay(final.vessel, "exprs") <-NULL
+
+final.vessel.tumour %>%colData() %>%colnames()%>%length()
+assay(final.vessel.tumour, "exprs") <-NULL
+
+final.stroma.tumour %>%colData() %>%colnames()%>%length()
+assay(final.stroma.tumour, "exprs") <-NULL
+
+final.immune.tumour  %>%colData() %>%colnames()%>%length()
+assay(final.immune.tumour, "exprs") <-NULL
+
+final.immune.nonTcell  %>%colData() %>%colnames()%>%length()
+assay(final.immune.nonTcell, "exprs") <-NULL
+
+final.immune.Tcell %>%colData() %>%colnames()%>%length()
+assay(final.immune.Tcell, "exprs") <-NULL
+
+
+final.tumour.tumour %>%colData() %>%colnames()%>%length()
+assay(final.tumour.tumour, "exprs") <-NULL
+
+final.nontumour.undefined  %>%colData() %>%colnames()%>%length()
+assay(final.nontumour.undefined, "exprs") <-NULL
+
+final.nontumour.tumour  %>%colData() %>%colnames()%>%length()
+assay(final.nontumour.tumour, "exprs") <-NULL
+
+all.cells <- cbind(final.fibro,final.fibro.other,final.fibro.HEV,final.vessel,final.vessel.tumour,final.stroma.tumour,final.immune.tumour,final.immune.nonTcell,final.immune.Tcell,final.tumour.tumour,final.nontumour.undefined,final.nontumour.tumour)
+
+saveRDS(all.cells,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_cells_combined_types.rds")
+
+
+all.cells$cell_category[all.cells$cell_type=="other"] <-"Other"
+all.cells$cell_type[all.cells$cell_type=="other"] <-"Other"
+
+table(all.cells$cell_category)
+rm(final.fibro,final.fibro.other,final.fibro.HEV,final.vessel,final.vessel.tumour,final.stroma.tumour,final.immune.tumour,final.immune.nonTcell,final.immune.Tcell,final.tumour.tumour,final.nontumour.undefined,final.nontumour.tumour)
+```
+
+
+#add clinical data
+```{r}
+wd <-dirname(getwd())
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+clinical.data$TMA <-clinical.data$TMA.x
+clinical.data$TMA.x <-NULL
+clinical.data$TMA.y <-NULL
+clinical.data$X.1 <-NULL
+clinical.data$X <-NULL
+head(clinical.data)
+
+unique(clinical.data$DX.name)
+clinical.data$DX.name[clinical.data$Patient_ID=="Control"] <-"Control"
+table(clinical.data$DX.name)
+clinical.data$Patient_ID %>% unique() %>% length() #1071 PATIENTS IN TOTAL
+
+area <- read.csv(file=file.path(wd,"clinical_data", "area.csv"))
+area$X <- NULL
+area$Tma_ac <- area$TMA_ImageID
+area$TMA_ImageID <-NULL
+
+area_clinical <- left_join(clinical.data, area, by="Tma_ac")
+length(unique(area$Tma_ac))
+length(unique(clinical.data$Tma_ac))
+
+length(unique(area_clinical$Tma_ac))
+length(unique(area_clinical$Patient_ID))
+length(unique(clinical.data$Patient_ID))
+head(area_clinical)
+
+colnames(area_clinical)[colnames(area_clinical) %in% colnames(colData(all.cells))]
+
+area_clinical$Patient_ID[!area_clinical$Patient_ID %in% unique(all.cells$Patient_ID)]
+unique(all.cells$Patient_ID)[!unique(all.cells$Patient_ID) %in% area_clinical$Patient_ID]
+
+all.cells[, is.na(all.cells$Patient_ID)]$Tma_ac %>% unique()
+
+saveRDS(all.cells,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_cells_combined_types_CLINICAL-DATA.rds")
+
+all.cells.filtered <- all.cells[, all.cells$Tma_ac!="88A_1"&all.cells$Tma_ac!="88A_2"&all.cells$Tma_ac!="88A_3"&all.cells$Tma_ac!="88A_4"&all.cells$Tma_ac!="86B_24"]
+
+all.filtered <-all.cells[, all.cells$Tma_ac!="88A_1"&all.cells$Tma_ac!="88A_2"&all.cells$Tma_ac!="88A_3"&all.cells$Tma_ac!="88A_4"&all.cells$Tma_ac!="86B_24"]
+
+all.cells[, all.cells$Tma_ac=="88A_1"|
+            all.cells$Tma_ac=="88A_2"|
+            all.cells$Tma_ac=="88A_3"|
+            all.cells$Tma_ac=="88A_4"|
+            all.cells$Tma_ac=="86B_24"]
+
+saveRDS(all.filtered,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/all_cells_combined_types_CLINICAL-DATA_FILTERED.rds")
+
+```
+
+
+
+```{r, add clustering results to subset tumour}
+cur_DF <- as_tibble(colData(all.cells)) %>% left_join(area_clinical, by = c("Tma_ac","TMA","acID")) %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(all.cells$ImageNumber, all.cells$CellNumber))
+
+colData(all.cells) <- cur_DF
+rownames(colData(all.cells)) <-all.cells$CellID
+
+unique(all.cells$DX.name)
+head(colData(all.cells))
+all.cells$Patient_ID %>% unique() %>% length()
+length(unique(all.filtered[,all.filtered$Patient_ID!="Control" ]$Patient_ID))
+#1056 patients in total!
+
+length(unique(clinical.data$Patient_ID))
+length(unique(all.cells$Patient_ID))
+length(unique(all.filtered$Patient_ID))
+
+
+length(unique(clinical.data$Patient_ID))
+tcell.sce$DX.name[is.na(tcell.sce$DX.name)]<-"NA"
+
+
+table(all.filtered$cell_category)
+```
+```{r}
+all.fibro <- all.filtered[, all.filtered$cell_category=="Fibroblast"]
+all.vessel <- all.filtered[, all.filtered$cell_category=="vessel"]
+all.tumour <- all.filtered[, all.filtered$cell_category=="Tumour"]
+all.immune <- all.filtered[, all.filtered$cell_category=="Immune"|all.filtered$cell_category=="T cell"]
+all.tcell<- all.filtered[, all.filtered$cell_category=="T cell"]
+all.immune.nonT <- all.filtered[, all.filtered$cell_category=="Immune"]
+all.other <- all.filtered[, all.filtered$cell_category=="Other"]
+
+unique(all.filtered$cell_category)
+
+table(all.filtered$cell_category)
+
+
+
+saveRDS(all.fibro,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/FIBRO_CLINICAL-DATA_FILTERED.rds")
+
+saveRDS(all.vessel,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/VESSEL_CLINICAL-DATA_FILTERED.rds")
+
+saveRDS(all.tumour,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/TUMOUR_CLINICAL-DATA_FILTERED.rds")
+
+saveRDS(all.other,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/OTHER_CLINICAL-DATA_FILTERED.rds")
+
+saveRDS(all.immune,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/IMMUNE_CLINICAL-DATA_FILTERED.rds")
+
+
+saveRDS(all.immune.nonT,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/IMMUNE_nonT_CLINICAL-DATA_FILTERED.rds")
+
+saveRDS(all.tcell,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/TCELL-only_CLINICAL-DATA_FILTERED.rds")
+
+saveRDS(all.filtered,"~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/all_cells_combined_types_CLINICAL-DATA_FILTERED.rds")
+
+all.filtered <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/all_cells_combined_types_CLINICAL-DATA_FILTERED.rds")
+length(unique(all.filtered$Patient_ID))
+```
+```{r}
+df <- colData(all.filtered)%>% data.frame() %>% select(c(CellID, cell_category, Compartment))
+
+df$Mask <- ifelse(df$Compartment>0, "Tumour","Stroma")
+i <-"Tumour"
+  table(df$cell_category, df$Mask)
+
+```
\ No newline at end of file
diff --git a/09_cluster_tumour_cells.Rmd b/09_cluster_tumour_cells.Rmd
new file mode 100644
index 0000000..f1350e4
--- /dev/null
+++ b/09_cluster_tumour_cells.Rmd
@@ -0,0 +1,259 @@
+---
+title: "R Notebook - Cluster Tumour cells hypoxic non hypoxic"
+output:
+  html_document:
+    df_print: paged
+---
+
+
+```{r, import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+```{r, Set wd and load data final vessel, message=F, warning=F, echo=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+
+#RAW
+#all.tumour <-readRDS(file=file.path(data_folder, "TUMOUR_CLINICAL-DATA_FILTERED.rds"))
+
+data_folder <-(file.path(wd,"sce_objects","Tumour"))
+
+#saveRDS(all.tumour, file=file.path(data_folder, "FINAL_All_Tumour_clustered.rds"))
+
+#workingfile
+all.tumour <- readRDS(file=file.path(data_folder, "FINAL_All_Tumour_clustered.rds"))
+
+```
+
+Tumour marker
+```{r, Define final vessel markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.tumour)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+
+tumour.marker <-c("Carbonic Anhydrase IX","Pan Cytokeratin + Keratin Epithelial")
+print(tumour.marker)
+```
+
+#GLM per TmaBlock separating cells into tumour/non-tumour
+
+```{r, glm tumour non tumour 86A}
+dat.counts <-as.data.frame(t((assay(all.tumour,"c_counts_asinh"))))
+dat.counts$CellID <- rownames(dat.counts)
+dat.counts.panCK <- dat.counts %>% select(CellID, `Carbonic Anhydrase IX`)
+
+ggplot(dat.counts.panCK, aes(x=`Carbonic Anhydrase IX`)) + 
+  geom_density()
+
+glm.tumour <-Mclust(dat.counts.panCK$`Carbonic Anhydrase IX`,G=2)
+#plot(glm.tumour)
+table(glm.tumour$classification)
+
+all.tumour$mclust <- glm.tumour$classification
+```
+
+```{r, subset tumour all, echo=F, message=F, warning=F, eval=F}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.tumour)), all.tumour$Tma_ac)
+length(unique(all.tumour$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.tumour.sub <- all.tumour[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.tumour.sub))[2]/dim(assay(all.tumour))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.tumour.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+```
+
+```{r, load subset tumour all, message=FALSE, warning=FALSE, echo=FALSE}
+all.tumour.sub <- readRDS(file=file.path(data_folder, paste("all_tumour_sub.rds")))
+```
+
+```{r, calculate umap tumour final, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.tumour.sub <- runUMAP(all.tumour.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+
+}
+
+
+#for(i in p){
+#all.tumour.sub <- runTSNE(all.tumour.sub,
+ #                exprs_values = "c_counts_asinh",
+ #                name = paste0("tSNE_p", i),
+ #                #use_dimred="PCA_20",
+ #               perplexity = i)
+#saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+#}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap tumour.final tumour marker, fig.width=12, fig.height=8, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.tumour.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.tumour.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% tumour.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+```
+Expression of CA9 and panCK by tumour type
+```{r, Vln plot CA9 and PanCK, echo=F, warnings=F, message=F}
+p <- ggplot(dat.all.long, aes(x=target, y=counts)) + 
+  geom_violin()+
+p
+
+
+dat <-as.data.frame(reducedDims(all.tumour.sub)$`UMAP_p50`, all.tumour.sub$CellID)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.tumour.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+cell_df <- data.frame("CellID"=all.tumour.sub$CellID,"mclust"= all.tumour.sub$mclust)
+cell_df$cell <-paste("X",cell_df$CellID, sep="")
+
+dat.all <- left_join(dat.all, cell_df, by="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% tumour.marker], names_to = "target", values_to = "counts")
+
+p <- ggplot(dat.all.long, aes(x=target, y=counts, fill=mclust)) + 
+  geom_violin()
+p
+```
+
+```{r, umap tumour final hypoxic vs normal mclust, echo=F, warnings=F, message=F}
+all.tumour$mclust <- as.factor(all.tumour$mclust)
+all.tumour.sub$mclust <- as.factor(all.tumour.sub$mclust)
+
+cluster <- "mclust"
+plotReducedDim(all.tumour.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+```
+
+```{r, umap tumour final hypoxic vs normal, echo=F, warnings=F, message=F}
+all.tumour$tumour_type <- ifelse(all.tumour$mclust=="1", "normal", "hypoxic")
+all.tumour.sub$tumour_type <- ifelse(all.tumour.sub$mclust=="1", "normal", "hypoxic")
+
+
+cluster <- "tumour_type"
+plotReducedDim(all.tumour.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster),text_colour ="red", point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="UMAP") 
+```
+
+```{r, save RDS and csv, echo=F, warnings=F, message=F, eval=F}
+saveRDS(all.tumour, file=file.path(data_folder, "FINAL_All_Tumour_clustered.rds"))
+saveRDS(all.tumour.sub, file=file.path(data_folder, paste("all_tumour_sub.rds")))
+
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+
+cell_df <- data.frame("CellID"=all.tumour$CellID, "Tumour_Type"=all.tumour$tumour_type)
+write.csv(cell_df, file=file.path(data_folder, "tumour_celltype.csv"))
+```
+
diff --git a/Analysis_Distances.Rmd b/Analysis_Distances.Rmd
new file mode 100644
index 0000000..c9caae1
--- /dev/null
+++ b/Analysis_Distances.Rmd
@@ -0,0 +1,5164 @@
+---
+title: "R Notebook - Immune cells intra- vs extra-tumour"
+output: html_notebook
+---
+
+
+```{r Load libraries, echo=F, message=F, warning=F}
+library(ggpubr)
+library(dplyr)
+library(pals)
+#library(tidyverse)
+library(RColorBrewer)
+library(qwraps2)
+library(imcRtools)
+library(table1)
+library(SingleCellExperiment)
+#library(uwot)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(ggsci)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+set.seed(101100)
+library(spicyR)
+```
+
+```{r}
+#set working directory 
+wd <-dirname(getwd())
+
+#clinical.data
+#data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+#all.sce <- readRDS("/mnt/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_minusOther.rds")
+#all.sce$DX.name[is.na(all.sce$DX.name)]<-"NA"
+
+#all.sce$tcell_subtype <-all.sce$cell_subtype
+#all.sce$tcell_subtype[all.sce$cell_category=="Fibroblast"] <- all.sce[, all.sce$cell_category=="Fibroblast"]$cell_type
+#unique(all.sce$tcell_subtype)
+"FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_minusOther.rds"
+
+#all clustered minus 5-10% of lowest images
+data_folder <-file.path(wd,"sce_objects","merge_plus_tumour")
+
+all.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+all.sce_pat <-readRDS(file=file.path(data_folder, paste("ALL_plus-Tumour_sce_pat_rem.rds",sep="")))
+
+all.sce_pat.roi$DX.name[is.na(all.sce_pat.roi$DX.name)]<-"NA"
+all.sce_pat$DX.name[is.na(all.sce_pat$DX.name)]<-"NA"
+
+#merged by category pat_roi_rem
+all.pat.roi<-readRDS(file=file.path(wd, "sce_objects", "merge_by_category", "all_merge_Tumour_Immune_Tcell_CAF_Vessel_pat-roi-rem.rds"))
+
+all.pat.roi$DX.name[is.na(all.pat.roi$DX.name)]<-"NA"
+
+
+#DISTANCES
+data_folder <- file.path(wd, "sce_objects","Distances")
+plot_folder <- file.path(wd, "plots")
+all.sce_pat.roi <-readRDS(file=file.path(data_folder,"all_minus_other_distances.rds"))
+
+```
+
+
+```{r}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+```{r, fig.width=18, fig.height=18}
+#SMA
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+df <-left_join(df, df_strat, by="Patient_ID")
+df$mCAF_G[is.na(df$mCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-150)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-150 to -120"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-120 to -90"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-90 to -60"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-60 to -30"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-30 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 30"
+df$Compartment[df$Distance > 30] <-">30"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-150)","-150 to -120","-120 to -90","-90 to -60", "-60 to -30", "-30 to -0","0 to 30", ">30"))
+
+df <- df%>%subset(is.na(SMA_CAF_G)!=T& Celltype!="SMA_CAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~SMA_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+        ggplot(aes(x= SMA_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_smaCAFden.pdf"), width=12, height=12)
+  
+#iCAF
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+df <-left_join(df, df_strat, by="Patient_ID")
+df$mCAF_G[is.na(df$mCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$iCAF_CAF_G[is.na(df$iCAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-150)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-150 to -120"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-120 to -90"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-90 to -60"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-60 to -30"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-30 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 30"
+df$Compartment[df$Distance > 30] <-">30"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-150)","-150 to -120","-120 to -90","-90 to -60", "-60 to -30", "-30 to -0","0 to 30", ">30"))
+
+df <- df%>%subset(is.na(iCAF_G)!=T& Celltype!="iCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(iCAF_G=="iCAF high"|iCAF_G=="iCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~iCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(iCAF_G=="iCAF high"|iCAF_G=="iCAF low")%>%
+        ggplot(aes(x= iCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_iCAFden.pdf"), width=12, height=12)
+
+#vCAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(vCAF_G=="vCAF high"|vCAF_G=="vCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~vCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(vCAF_G=="vCAF high"|vCAF_G=="vCAF low")%>%
+        ggplot(aes(x= vCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_vCAFden.pdf"), width=12, height=12)
+
+#mCAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~mCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+        ggplot(aes(x= mCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_mCAFden.pdf"), width=12, height=12)
+
+#dCAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(dCAF_G=="dCAF high"|dCAF_G=="dCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~dCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(dCAF_G=="dCAF high"|dCAF_G=="dCAF low")%>%
+        ggplot(aes(x= dCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_dCAFden.pdf"), width=12, height=12)
+
+#tpCAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(tpCAF_G=="tpCAF high"|tpCAF_G=="tpCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~tpCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(tpCAF_G=="tpCAF high"|tpCAF_G=="tpCAF low")%>%
+        ggplot(aes(x= tpCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_tpCAFden.pdf"), width=12, height=12)
+
+#IDO CAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(IDO_CAF_G=="IDO_CAF high"|IDO_CAF_G=="IDO_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~IDO_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(IDO_CAF_G=="IDO_CAF high"|IDO_CAF_G=="IDO_CAF low")%>%
+        ggplot(aes(x= IDO_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_IDO_CAFden.pdf"), width=12, height=12)
+
+#hypoxic_tpCAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(hypoxic_tpCAF_G=="hypoxic_tpCAF high"|hypoxic_tpCAF_G=="hypoxic_tpCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~hypoxic_tpCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(hypoxic_tpCAF_G=="hypoxic_tpCAF high"|hypoxic_tpCAF_G=="hypoxic_tpCAF low")%>%
+        ggplot(aes(x= hypoxic_tpCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_hypoxic_tpCAFden.pdf"), width=12, height=12)
+
+#hypoxic_CAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(hypoxic_CAF_G=="hypoxic_CAF high"|hypoxic_CAF_G=="hypoxic_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~hypoxic_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(hypoxic_CAF_G=="hypoxic_CAF high"|hypoxic_CAF_G=="hypoxic_CAF low")%>%
+        ggplot(aes(x= hypoxic_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_hypoxic_CAFden.pdf"), width=12, height=12)
+
+#PDPN_CAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(PDPN_CAF_G=="PDPN_CAF high"|PDPN_CAF_G=="PDPN_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~PDPN_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(PDPN_CAF_G=="PDPN_CAF high"|PDPN_CAF_G=="PDPN_CAF low")%>%
+        ggplot(aes(x= PDPN_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_PDPN_CAFden.pdf"), width=12, height=12)
+
+#Collagen_CAF
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(Collagen_CAF_G=="Collagen_CAF high"|Collagen_CAF_G=="Collagen_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~Collagen_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(Collagen_CAF_G=="Collagen_CAF high"|Collagen_CAF_G=="Collagen_CAF low")%>%
+        ggplot(aes(x= Collagen_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_Collagen_CAFden.pdf"), width=12, height=12)
+
+```
+
+```{r, fig.width=18, fig.height=18}
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+df <-left_join(df, df_strat, by="Patient_ID")
+df$mCAF_G[is.na(df$mCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-150)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-150 to -120"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-120 to -90"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-90 to -60"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-60 to -30"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-30 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 30"
+df$Compartment[df$Distance > 30] <-">30"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-150)","-150 to -120","-120 to -90","-90 to -60", "-60 to -30", "-30 to -0","0 to 30", ">30"))
+
+df <- df%>%subset(is.na(SMA_CAF_G)!=T& Celltype!="SMA_CAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+test <- test %>% subset(Celltype=="CD4"|Celltype=="CD8"|Celltype=="Bcell"|Celltype=="Neutrophil"|Celltype=="Myeloid")  
+pvalues <- test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~SMA_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+    p <-  test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+        ggplot(aes(x= SMA_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+    plot(p)        
+ggsave(plot=p, file= file.path(plot_folder, "Boxplot_DistanceBins_CD4_CD8_Neutrophil_Myeloid_Bcell_smaCAFden.pdf"), width=12, height=12)
+      
+
+#Immune types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$mCAF_G[is.na(df$mCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(mCAF_G)!=T& Celltype!="mCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~mCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+        ggplot(aes(x= mCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+            
+  #tpCAF
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(tpCAF_G)!=T& Celltype!="tpCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(tpCAF_G=="tpCAF high"|tpCAF_G=="tpCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~tpCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(tpCAF_G=="tpCAF high"|tpCAF_G=="tpCAF low")%>%
+        ggplot(aes(x= tpCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+            
+
+      
+
+#Immune types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(tpCAF_G)!=T& Celltype!="tpCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(tpCAF_G=="tpCAF high"|tpCAF_G=="tpCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~tpCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(tpCAF_G=="tpCAF high"|tpCAF_G=="tpCAF low")%>%
+        ggplot(aes(x= tpCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+            
+   #iCAF
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$iCAF_G[is.na(df$iCAF_G)==T] <-"NA"
+df$iCAF_G[is.na(df$iCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(iCAF_G)!=T& Celltype!="iCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(iCAF_G=="iCAF high"|iCAF_G=="iCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~iCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(iCAF_G=="iCAF high"|iCAF_G=="iCAF low")%>%
+        ggplot(aes(x= iCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+            
+
+      
+
+#Immune types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$iCAF_G[is.na(df$iCAF_G)==T] <-"NA"
+df$iCAF_G[is.na(df$iCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(iCAF_G)!=T& Celltype!="iCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(iCAF_G=="iCAF high"|iCAF_G=="iCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~iCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(iCAF_G=="iCAF high"|iCAF_G=="iCAF low")%>%
+        ggplot(aes(x= iCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+  
+      
+       #SMA_CAF
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(SMA_CAF_G)!=T& Celltype!="SMA_CAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+
+pvalues <- test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~SMA_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+        ggplot(aes(x= SMA_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+            
+
+      
+
+#Immune types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(SMA_CAF_G)!=T& Celltype!="SMA_CAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~SMA_CAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(SMA_CAF_G=="SMA_CAF high"|SMA_CAF_G=="SMA_CAF low")%>%
+        ggplot(aes(x= SMA_CAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+  
+      
+```
+
+
+patients with immune cells intratumoural
+```{r fig.width=20, fig.height=20}
+#all.sce_pat.roi <- all.cells
+df <- data.frame(all.sce_pat.roi$CellID,
+                 all.sce_pat.roi$Compartment,
+                 all.sce_pat.roi$cell_category,
+                 all.sce_pat.roi$DX.name,
+                 all.sce_pat.roi$RoiID)
+
+all.sce_pat.roi$immune_category <- all.sce_pat.roi$cell_category
+all.sce_pat.roi$immune_category[all.sce_pat.roi$immune_category=="T cell"] <-"Immune"
+all.sce_pat.roi$immune_category <- factor(all.sce_pat.roi$immune_category, levels = c("Fibroblast", "Immune", "Other","Tumour","vessel"))
+
+
+#Cell categories tumuor, immune, fibro, vessel, other
+df <- data.frame(all.sce_pat.roi$CellID,
+                 all.sce_pat.roi$Compartment,
+                 all.sce_pat.roi$immune_category,
+                 all.sce_pat.roi$DX.name,
+                 all.sce_pat.roi$RoiID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","ROI")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df %>% subset(Celltype=="Tumour") %>% ggplot(aes(x=Distance))+geom_density()+geom_vline(xintercept=0)+xlim(c(-100,150))
+
+roi_rem <-df[df$Distance >300,]$ROI %>%unique()
+
+df <-df[!df$ROI %in% roi_rem,]
+#+  scale_x_continuous(trans = "log10")
+
+
+p <-df%>%#subset(Celltype!="Other") %>%
+  ggplot(aes(x=Distance, color=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  scale_colour_tableau()+
+ geom_vline(xintercept = 0,color = 'black', linetype="dashed")+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, "Distance_cell_categories.pdf"), width=8, height=4)
+
+
+#Fibro figure
+#Cell categories tumuor, immune, fibro, vessel, other
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$RoiID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","ROI")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df %>% subset(Celltype=="Tumour") %>% ggplot(aes(x=Distance))+geom_density()+geom_vline(xintercept=0)+xlim(c(-100,150))
+
+roi_rem <-df[df$Distance >300,]$ROI %>%unique()
+
+df <-df[!df$ROI %in% roi_rem,]
+#+  scale_x_continuous(trans = "log10")
+
+category20 <- c("#1F77B4","#FF7F0E","#2CA02C","#D62728","#9467BD","#8C564B","#E377C2","#7F7F7FFF","#BCBD22","#17BECF","#AEC7E8")
+p <-df%>%#subset(Celltype=="tpCAF"|Celltype=="mCAF"|Celltype=="tpCAF"|Celltype=="SMA_CAF"|Celltype=="iCAF")%>%
+  ggplot(aes(x=Distance, color=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  scale_colour_tableau()+
+  scale_colour_manual(values=as.vector(category20)[1:11])+
+ geom_vline(xintercept = 0,color = 'black', linetype="dashed")+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, "Distance_CAF_tumourstromaborder.pdf"), width=8, height=4)
+  #facet_wrap(~Type)
+  #scale_x_log10()
+unique(df$all.sce_pat.roi.DX.name)
+summary(df$Distance)
+
+
+#Fibroblast Types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$cell_type,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$Grade,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$Patient_ID )
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Grade","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+
+df%>%
+  subset(Celltype=="tpCAF"|Celltype=="mCAF"|Celltype=="mCAF"|Celltype=="SMA_CAF"|Celltype=="iCAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  #facet_grid(mCAF_G~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance < (-200)] <-"-200"
+df$Compartment[df$Distance > (-200) & df$Distance < (-100)] <-"-200 to -100"
+df$Compartment[df$Distance > (-100) & df$Distance < (-50)] <-"-100 to -50"
+df$Compartment[df$Distance > (-50) & df$Distance < (-40)] <-"-50 to -40"
+df$Compartment[df$Distance > (-40) & df$Distance < (-30)] <-"-40 to -30"
+df$Compartment[df$Distance > (-30) & df$Distance < (-20)] <-"-30 to -20"
+df$Compartment[df$Distance > (-20) & df$Distance < (-10)] <-"-20 to -10"
+df$Compartment[df$Distance > (-10) & df$Distance < (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance < (10)] <-"0 to 10"
+df$Compartment[df$Distance > 10 & df$Distance < (20)] <-"10 to 20"
+df$Compartment[df$Distance > 20 & df$Distance < (30)] <-"20 to 30"
+df$Compartment[df$Distance > 20] <-">20"
+
+df$Compartment <- factor(df$Compartment, levels = c("-200","-200 to -100","-100 to -50","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20", "20 to 30", ">20"))
+
+df %>%
+  count(Celltype, Type,Grade, Compartment) %>%
+  group_by(Celltype,Type,Grade) %>%
+  mutate(freq = n / sum(n))%>%
+  ggplot(aes(fill=Compartment, x=freq, y=Celltype)) + 
+    geom_bar(position="fill", stat="identity")+
+  scale_fill_igv()+
+  scale_fill_manual(values=palette(glasbey(32)))+
+  facet_grid(Grade~Type)
+
+
+df_t <-prop.table(table(df$Celltype,df$Compartment),1) %>% data.frame
+
+colnames(df_t) <- c("Celltype","Distance","Freq")
+df_t$Distance <- factor(df_t$Distance, levels = c("-200",
+                                                        "-200 to -100",
+                                                        "-100 to -50",
+                                                        "-50 to -40",
+                                                        "-40 to -30",
+                                                        "-30 to -20",
+                                                        "-20 to -10",
+                                                        "-10 to -0",
+                                                        "0 to 10",
+                                                        "10 to 20",
+                                                        "20 to 30",
+                                                        ">20"))
+p <- ggplot(df_t, aes(fill=Distance, x=Freq, y=Celltype)) + 
+    geom_bar(position="fill", stat="identity")+
+  scale_fill_igv()+
+  scale_fill_manual(values=palette(glasbey(32)))
+plot(p)
+  
+df%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  facet_grid(Grade~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+
+df %>% ggplot(aes(x=Distance))+geom_density()+geom_vline(xintercept=0)+xlim(c(-100,150))
+
+
+#+  scale_x_continuous(trans = "log10")
+
+df%>%
+  subset(Celltype=="tpCAF"|Celltype=="mCAF"|Celltype=="SMA_CAF"|Celltype=="iCAF"|Celltype=="vCAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  #facet_grid(Type~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+df%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  facet_grid(Grade~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+unique(df$Celltype)
+
+#Tcell types
+#Fibroblast Types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[, all.sce_pat.roi$cell_category=="T cell"|all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df_strat <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+
+df <-left_join(df, df_strat, by="Patient_ID")
+df$mCAF_G[is.na(df$mCAF_G)==T] <-"NA"
+df$tpCAF_G[is.na(df$tpCAF_G)==T] <-"NA"
+df$SMA_CAF_G[is.na(df$SMA_CAF_G)==T] <-"NA"
+df%>%subset(mCAF_G!="NA")%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=mCAF_G, fill=mCAF_G)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  facet_wrap(~Celltype)+
+  #facet_grid(mCAF_G~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+df%>% subset(mCAF_G =="mCAF high"|mCAF_G=="mCAF low")%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(y=Distance, x=Celltype, color=mCAF_G, fill=mCAF_G)) +
+  geom_boxplot()+
+  ylim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  facet_grid(mCAF_G~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+#here
+df <-df %>% subset(Distance >-(600))
+pvalues <- df %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+      group_by(Celltype) %>% 
+      summarise(p=wilcox.test(Distance~mCAF_G, paired=F)$p.value)
+      df <- left_join(df, pvalues, by.x = c("Celltype"), by.y =c("Celltype"), all.x = TRUE)
+      df$p.wt <- paste0('p=',round(df$p, digits=3))
+
+      #Plot list, all js together over i
+      df %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+        ggplot(aes(x= mCAF_G, y = Distance, colour=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(~Celltype+p.wt)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+      
+      
+df %>% ggplot(aes(x=Distance))+geom_density()+geom_vline(xintercept=0)+xlim(c(-100,150))
+
+
+
+
+#test
+
+df$Compartment <- df$Distance
+df$Compartment[df$Distance <= (-150)] <-"(-210)"
+df$Compartment[df$Distance > (-150) & df$Distance <= (-120)] <-"-50 to -40"
+df$Compartment[df$Distance > (-120) & df$Distance <= (-90)] <-"-40 to -30"
+df$Compartment[df$Distance > (-90) & df$Distance <= (-60)] <-"-30 to -20"
+df$Compartment[df$Distance > (-60) & df$Distance <= (-30)] <-"-20 to -10"
+df$Compartment[df$Distance > (-30) & df$Distance <= (0)] <-"-10 to -0"
+
+df$Compartment[df$Distance > 0 & df$Distance <= (30)] <-"0 to 10"
+df$Compartment[df$Distance > 30] <-"10 to 20"
+
+df$Compartment <- factor(df$Compartment, levels = c("(-210)","-50 to -40","-40 to -30","-30 to -20", "-20 to -10", "-10 to -0","0 to 10", "10 to 20"))
+
+df <- df%>%subset(is.na(mCAF_G)!=T& Celltype!="mCAF")
+test <-df %>%
+  dplyr::group_by(Patient_ID, Celltype,Compartment)%>%
+  dplyr::summarise(n=n()) %>%
+  dplyr::group_by(Patient_ID, Celltype)%>%
+  dplyr::mutate(freq=n/sum(n))%>%
+  dplyr::ungroup()
+
+test <-left_join(test, df_strat, by = "Patient_ID")
+  
+pvalues <- test %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+      group_by(Celltype,Compartment) %>% 
+      summarise(p=wilcox.test(jitter(freq)~mCAF_G, paired=F)$p.value)
+      test <- left_join(test, pvalues, by.x = c("Celltype","Compartment"), by.y =c("Celltype","Compartment"), all.x = TRUE)
+      test$p.wt <- paste0('p=',round(test$p, digits=3))
+ 
+      test %>% subset(mCAF_G=="mCAF high"|mCAF_G=="mCAF low")%>%
+        ggplot(aes(x= mCAF_G, y = freq, color=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Celltype~Compartment)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        #ylim(c(-400,400))+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+            
+      
+      
+      
+      
+#+  scale_x_continuous(trans = "log10")
+
+df%>%
+  subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  #facet_grid(Type~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+df%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  facet_grid(Type~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+unique(df$Celltype)
+
+
+#Immune types
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$DX.name)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type")
+df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+df %>% ggplot(aes(x=Distance))+geom_density()+geom_vline(xintercept=0)+xlim(c(-100,150))
+
+
+#+  scale_x_continuous(trans = "log10")
+
+df%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+ # facet_grid(Type~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()
+
+
+df%>%
+  #subset(Celltype=="tpCAF"|Celltype=="IDO_CAF")%>%
+  ggplot(aes(x=Distance, color=Celltype, fill=Celltype)) +
+  geom_density(alpha=0.3,size=1)+
+  xlim(c(-100,150))+
+  #facet_wrap(~Type+Celltype)+
+  facet_grid(Type~Celltype)+
+  scale_fill_igv()+
+  scale_color_igv()
+  #scale_x_log10()f
+
+
+
+# install.packages("car")
+library(car)
+df_s <- df%>% subset(mCAF_G!="NA"&Distance>(-200)&Distance<200)
+df_s$mCAF_G <- as.factor(df_s$mCAF_G)
+densityPlot(df_s$Distance ~ df_s$mCAF_G,
+            legend = list(title = "Species"),
+            xlab = "Flipper length",
+            xlim = c(-200, 200))
+
+# install.packages("sm")
+library(sm)
+
+sm.density.compare(x = df_s$Distance,
+                   group = df_s$mCAF_G,
+                   model = "equal")
+```
+
+```{r}
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$cell_type,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Fibroblast"]$RoiID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","ROI")
+head(df)
+roi_rem <-df[df$Distance >300,]$ROI %>%unique()
+
+df <-df[!df$ROI %in% roi_rem,]
+
+df %>% ggplot(aes(x=Celltype, y=Distance, fill=Celltype))+geom_boxplot()+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+
+kruskal.test(Distance ~Celltype, data=df)
+
+library(FSA)
+library(dunn.test)
+dunnTest(Distance ~Celltype, data=df,
+              method="bonferroni")
+
+dunn.test(df$Distance, g=df$Celltype,
+              method="bonferroni")
+
+library(ggpubr)
+ggqqplot(df$Distance)
+
+one.way <- aov(Distance~Celltype, data = df)
+
+summary(one.way)
+TukeyHSD(one.way)
+```
+
+```{r}
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="Immune"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+#df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+
+df$Patient_ID <-factor(df$Patient_ID)
+df_2 <- df %>% 
+  group_by(Patient_ID) %>% 
+  mutate(Invasion = if_else(any(Distance >0), "Invasion","No Invasion"))
+df_immune_inv <-df_2[!duplicated(df_2$Patient_ID),] %>% select(Patient_ID, Invasion)
+
+table(df_immune_inv$Invasion)
+
+#T cells
+df <- data.frame(all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$CellID,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$Compartment,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$cell_subtype,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$DX.name,
+                 all.sce_pat.roi[,all.sce_pat.roi$cell_category=="T cell"]$Patient_ID)
+
+colnames(df) <- c("CellID","Distance","Celltype","Type","Patient_ID")
+#df <- df %>% subset(Type=="Adenocarcinoma"|Type=="Squamous cell carcinoma")
+head(df)
+
+
+df$Patient_ID <-factor(df$Patient_ID)
+df_2 <- df %>% 
+  group_by(Patient_ID) %>% 
+  mutate(Invasion = if_else(any(Distance >30), "Invasion","No Invasion"))
+df_tcell_inv <-df_2[!duplicated(df_2$Patient_ID),] %>% select(Patient_ID, Invasion)
+
+table(df_tcell_inv$Invasion)
+
+```
+
+#Spatial analysis using imcRtools
+```{r, fig.width=12, fig.height=12}
+library(imcRtools)
+test <- all.filtered.sub
+tma87A <- all.sce_pat.roi[, all.sce_pat.roi$TMA =="87A"]
+tma87A <- buildSpatialGraph(tma87A,
+                            img_id = "RoiID",
+                            type = "knn",
+                            k = 10, k_max_dist = 20,
+                            name="knn_k10_dist20",
+                            coords = c("Center_X", "Center_Y"))
+#save different ks and max_ks under different names
+
+colPairNames(tma87A)
+
+library(ggplot2)
+library(ggraph)
+
+plotSpatial(tma87A[, tma87A$acID ==2],
+            img_id = "RoiID",
+            node_color_by = "cell_category",
+            node_shape_by = "RoiID",
+            node_size_by = "Area",
+            draw_edges = TRUE,
+           # colPairName = "knn_interaction_graph",
+           colPairName="knn_k15_dist20",
+            directed = FALSE,
+            coords=c("Center_X", "Center_Y"))
+
+plotSpatial(tma87A[, tma87A$acID ==2],
+            img_id = "RoiID",
+            node_color_by = "cell_category",
+            node_shape_by = "RoiID",
+            node_size_by = "Area",
+            draw_edges = TRUE,
+           # colPairName = "knn_interaction_graph",
+           colPairName="knn_k10_dist20",
+            directed = FALSE,
+            coords=c("Center_X", "Center_Y"))
+
+unique(tma87A$cell_category)
+
+
+tma87A <- aggregateNeighbors(tma87A,
+                                  colPairName = "knn_k15_dist20",
+                                  aggregate_by = "metadata",
+                                  count_by = "cell_category")
+head(tma87A$aggregatedNeighbors)
+
+
+cur_cluster <- kmeans(tma87A$aggregatedNeighbors, centers = 3)
+tma87A$clustered_neighbors <- factor(cur_cluster$cluster)
+
+plotSpatial(tma87A[, tma87A$acID ==2],
+            img_id = "RoiID",
+            coords=c("Center_X", "Center_Y"),
+            node_color_by = "cell_category",
+            node_size_fix = 4,
+            edge_width_fix = 2,
+            edge_color_by = "clustered_neighbors",
+            draw_edges = TRUE,
+            colPairName = "knn_k15_dist20",
+            directed = FALSE,
+            nodes_first = FALSE) +
+    scale_color_brewer(palette = "Set2") +
+    scale_edge_color_brewer(palette = "Set1")
+
+
+out <- countInteractions(tma87A,
+                         group_by = "RoiID",
+                         label = "cell_category",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out
+
+out <- testInteractions(tma87A,
+                        group_by = "RoiID",
+                        label = "cell_category",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+```
+
+
+#Spatial analysis using imcRtools
+```{r, fig.width=50, fig.height=50}
+library(imcRtools)
+#test <- all.filtered.sub
+#tma87A <- all.sce_pat.roi[, all.sce_pat.roi$TMA =="87A"]
+
+all.sce_pat.roi <- buildSpatialGraph(all.sce_pat.roi,
+                            img_id = "RoiID",
+                            type = "knn",
+                            k = 15, k_max_dist = 20,
+                            name="knn_k15_dist20",
+                            coords = c("Center_X", "Center_Y"))
+
+all.sce_pat.roi <- buildSpatialGraph(all.sce_pat.roi,
+                            img_id = "RoiID",
+                            type = "knn",
+                            k = 10, k_max_dist = 20,
+                            name="knn_k10_dist20",
+                            coords = c("Center_X", "Center_Y"))
+#save different ks and max_ks under different names
+
+#colPairNames(tma87A)
+
+library(ggplot2)
+library(ggraph)
+
+plotSpatial(all.sce_pat.roi[, all.sce_pat.roi$acID ==2],
+            img_id = "RoiID",
+            node_color_by = "tcell_subtype",
+            node_shape_by = "RoiID",
+            node_size_by = "Area",
+            draw_edges = TRUE,
+           # colPairName = "knn_interaction_graph",
+           colPairName="knn_k15_dist20",
+            directed = FALSE,
+            coords=c("Center_X", "Center_Y"))
+
+plotSpatial(all.sce_pat.roi[, all.sce_pat.roi$acID ==2],
+            img_id = "RoiID",
+            node_color_by = "tcell_subtype",
+            node_shape_by = "RoiID",
+            node_size_by = "Area",
+            draw_edges = TRUE,
+           # colPairName = "knn_interaction_graph",
+           colPairName="knn_k10_dist20",
+            directed = FALSE,
+            coords=c("Center_X", "Center_Y"))
+
+unique(all.sce_pat.roi$tcell_subtype)
+
+
+all.sce_pat.roi <- aggregateNeighbors(all.sce_pat.roi,
+                                  colPairName = "knn_k15_dist20",
+                                  aggregate_by = "metadata",
+                                  count_by = "tcell_subtype")
+head(all.sce_pat.roi$aggregatedNeighbors)
+df <-all.sce_pat.roi$aggregatedNeighbors
+rownames(df) <-colnames(all.sce_pat.roi)
+df$cell_type <- all.sce_pat.roi$tcell_subtype
+df <- data.frame(df)
+df$CellID <- rownames(df)
+df
+data.frame("CellID"=all.sce_pat.roi$CellID, "Type"=all.sce_pat.roi$tcell_subtype)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 30)
+all.sce_pat.roi$clustered_neighbors_30 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 29)
+all.sce_pat.roi$clustered_neighbors_29 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 28)
+all.sce_pat.roi$clustered_neighbors_28 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 27)
+all.sce_pat.roi$clustered_neighbors_27 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 26)
+all.sce_pat.roi$clustered_neighbors_26 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 25)
+all.sce_pat.roi$clustered_neighbors_25 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 24)
+all.sce_pat.roi$clustered_neighbors_24 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 23)
+all.sce_pat.roi$clustered_neighbors_23 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 22)
+all.sce_pat.roi$clustered_neighbors_22 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 21)
+all.sce_pat.roi$clustered_neighbors_21 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 20)
+all.sce_pat.roi$clustered_neighbors_20 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 19)
+all.sce_pat.roi$clustered_neighbors_19 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 18)
+all.sce_pat.roi$clustered_neighbors_18 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 17)
+all.sce_pat.roi$clustered_neighbors_17 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 16)
+all.sce_pat.roi$clustered_neighbors_16 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 15)
+all.sce_pat.roi$clustered_neighbors_15 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 14)
+all.sce_pat.roi$clustered_neighbors_14 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 13)
+all.sce_pat.roi$clustered_neighbors_13 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 12)
+all.sce_pat.roi$clustered_neighbors_12 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 11)
+all.sce_pat.roi$clustered_neighbors_11 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 10)
+all.sce_pat.roi$clustered_neighbors_10 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 9)
+all.sce_pat.roi$clustered_neighbors_9 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 8)
+all.sce_pat.roi$clustered_neighbors_8 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 7)
+all.sce_pat.roi$clustered_neighbors_7 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(all.sce_pat.roi$aggregatedNeighbors, centers = 6)
+all.sce_pat.roi$clustered_neighbors_6 <- factor(cur_cluster$cluster)
+
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+saveRDS(all.sce_pat.roi, file=file.path(data_folder,"all_minus_other_distances.rds"))
+all.sce_pat.roi <-readRDS(file=file.path(data_folder,"all_minus_other_distances.rds"))
+
+plotSpatial(all.sce_pat.roi[, all.sce_pat.roi$acID ==2],
+            img_id = "RoiID",
+            coords=c("Center_X", "Center_Y"),
+            node_color_by = "tcell_subtype",
+            node_size_fix = 3,
+            edge_width_fix = 2,
+            edge_color_by = "clustered_neighbors_15",
+            draw_edges = TRUE,
+            colPairName = "knn_k15_dist20",
+            directed = FALSE,
+            nodes_first = FALSE) +
+  scale_color_igv()#+
+   # scale_color_brewer(palette = "Set2") +
+    #scale_edge_color_brewer(palette = "Set1")
+
+
+clus <- c(5:30)
+clus=20
+for(i in clus){
+  p_clus <- paste0("clustered_neighbors_",i)
+p <-plotSpatial(all.sce_pat.roi[, all.sce_pat.roi$acID ==1],
+            img_id = "RoiID",
+            coords=c("Center_X", "Center_Y"),
+            node_color_by = "tcell_subtype",
+            node_size_fix = 3,
+            edge_width_fix = 2,
+            edge_color_by = paste(p_clus),
+            draw_edges = TRUE,
+            colPairName = "knn_k15_dist20",
+            directed = FALSE,
+            nodes_first = FALSE) +
+  scale_color_igv()#+
+   # scale_color_brewer(palette = "Set2") +
+    #scale_edge_color_brewer(palette = "Set1")
+plot(p)
+}
+df <-round(prop.table(table(all.sce_pat.roi$clustered_neighbors_5, all.sce_pat.roi$tcell_subtype),1)*100, digits = 2)
+
+pheatmap(scale(df))
+
+data_folder <-file.path(wd,"sce_objects","merge_plus_tumour")
+out <- countInteractions(all.sce_pat.roi,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_all-sce-pat-roi.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_all-sce-pat-roi.csv"))
+
+out <- testInteractions(all.sce_pat.roi,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_all-sce-pat-roi.csv"))
+
+length(unique(all.sce_pat.roi[, all.sce_pat.roi$TmaID==86]$tcell_subtype))
+
+tma86 <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==86]
+saveRDS(tma86, file=file.path(data_folder, "tma86_distances.rds"))
+tma86<- readRDS(file=file.path(data_folder, "tma86_distances.rds"))
+
+tma87 <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==87]
+saveRDS(tma87, file=file.path(data_folder, "tma87_distances.rds"))
+tma87<- readRDS(file=file.path(data_folder, "tma87_distances.rds"))
+
+tma88 <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==88]
+saveRDS(tma88, file=file.path(data_folder, "tma88_distances.rds"))
+
+
+tma175 <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==175]
+saveRDS(tma175, file=file.path(data_folder, "tma175_distances.rds"))
+
+tma176 <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==176]
+saveRDS(tma176, file=file.path(data_folder, "tma176_distances.rds"))
+
+tma178 <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==178]
+saveRDS(tma178, file=file.path(data_folder, "tma178_distances.rds"))
+
+```
+
+
+```{r, fig.width=8, fig.height=8}
+clus <- c(10:30)
+p_clus <- paste0("clustered_neighbors_",clus)
+
+for(i in p_clus){
+df <-round(prop.table(table(all.sce_pat.roi[[i]], all.sce_pat.roi$tcell_subtype),1)*100, digits = 2)
+
+pheatmap(t(scale(df))) %>%print
+
+}
+
+clus <- c(15)
+p_clus <- paste0("clustered_neighbors_",clus)
+
+for(i in p_clus){
+  
+  df <-round(prop.table(table(all.sce_pat.roi[[i]], all.sce_pat.roi$tcell_subtype),1)*100, digits = 2)
+
+  df <-df %>% data.frame
+  colnames(df) <- c("Cluster","CellType","Freq")
+  print(df %>% pivot_wider(id_cols=Cluster,values_from=Freq, names_from=CellType))
+  p <- ggplot(df, aes(fill=CellType, x=Freq, y=Cluster)) + 
+    geom_bar(position="fill", stat="identity")+
+  scale_fill_igv()+
+  scale_fill_manual(values=palette(glasbey(32)))
+  plot(p)
+}
+```
+
+
+
+
+```{r tma86}
+out <- countInteractions(tma86,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma86_histocat.csv"))
+##out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma86.csv"))
+
+out <- testInteractions(tma86,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma86_histocat.csv"))
+```
+
+
+```{r tma87}
+data_folder <- file.path(wd, "sce_objects","Distances")
+
+tma87 <-readRDS(file=file.path(data_folder, "tma87_distances.rds"))
+
+out <- countInteractions(tma87,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma87_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma87.csv"))
+
+out <- testInteractions(tma87,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma87_histocat.csv"))
+```
+
+
+```{r tma88}
+out <- countInteractions(tma88,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma88_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma88.csv"))
+
+out <- testInteractions(tma88,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma88_histocat.csv"))
+
+out <- countInteractions(tma175,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma175_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma175.csv"))
+
+out <- testInteractions(tma175,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma175_histocat.csv"))
+
+out <- countInteractions(tma176,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma176_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma176.csv"))
+
+out <- testInteractions(tma176,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma176_histocat.csv"))
+
+out <- countInteractions(tma178,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma178_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma178.csv"))
+
+out <- testInteractions(tma178,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma178_histocat.csv"))
+```
+
+
+```{r, fig.width=8, fig.height=8}
+df <-round(prop.table(table(all.sce_pat.roi$clustered_neighbors_10, all.sce_pat.roi$tcell_subtype),1)*100, digits = 2)
+
+pheatmap(t(scale(df)))
+
+```
+
+```{r}
+df
+```
+
+```{r}
+df <-data.frame(round(prop.table(table(all.sce_pat.roi$Patient_ID,
+                            all.sce_pat.roi$clustered_neighbors_10),1)*100, digits = 2))
+
+df <- data.frame(table(all.sce_pat.roi$Patient_ID,
+                            all.sce_pat.roi$clustered_neighbors_10))
+colnames(df) <- c("Patient_ID","Spatial_Cluster","n")
+
+df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+
+df_clin <- data.frame("Patient_ID"=all.sce_pat.roi$Patient_ID,"Grade"=all.sce_pat.roi$Grade)
+df_clin = df_clin[!duplicated(df_clin$Patient_ID),]
+
+df <- left_join(df, df_clin, by="Patient_ID")
+
+clinical.data  %>% select(paste(i))
+i ="Grade"
+
+```
+
+```{r}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+```{r  KW W Spatial cluster 5, fig.width=15, fig.height=10,message=FALSE, warning=FALSE, echo=FALSE}
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj","Smok")
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Smok")
+
+
+#categories <- "Grade"
+plot_list <- list()
+
+df <- data.frame(table(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                            all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                              all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$clustered_neighbors_15))
+
+colnames(df) <- c("Patient_ID","Spatial_Cluster","n")
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+df <-df[!df$Patient_ID%in% neoadj_pat,]
+
+df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+
+
+for (i in (categories)) {
+df_clin <- clinical.data %>% select(paste(categories),"Patient_ID" )
+
+    t <- left_join(df, df_clin, by="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+  
+  for (j in unique(t$Spatial_Cluster)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+
+    tdat <-subset(tdat, Spatial_Cluster==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Spatial_Cluster) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Spatial_Cluster, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+
+      tdat <-subset(tdat, Spatial_Cluster==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Spatial_Cluster) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Spatial_Cluster", by.y ="Spatial_Cluster", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Spatial_Cluster+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Spatial_Cluster"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Spatial_Cluster")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Spatial_Cluster)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Spatial_Cluster)))
+ # dev.off()
+}
+```
+
+
+
+
+
+```{r  KW W Spatial cluster 10, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+#categories <- "Grade"
+plot_list <- list()
+
+   df <- data.frame(table(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                            all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                            all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$clustered_neighbors_10))
+colnames(df) <- c("Patient_ID","Spatial_Cluster","n")
+
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+df <-df[!df$Patient_ID%in% neoadj_pat,]
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+
+df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+
+for (i in (categories)) {
+
+
+df_clin <- clinical.data %>% select(paste(categories),"Patient_ID" )
+
+    t <- left_join(df, df_clin, by="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+  
+  for (j in unique(t$Spatial_Cluster)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+
+    tdat <-subset(tdat, Spatial_Cluster==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Spatial_Cluster) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Spatial_Cluster, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+
+      tdat <-subset(tdat, Spatial_Cluster==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Spatial_Cluster) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Spatial_Cluster", by.y ="Spatial_Cluster", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Spatial_Cluster+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Spatial_Cluster"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Spatial_Cluster")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Spatial_Cluster)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Spatial_Cluster)))
+ # dev.off()
+}
+```
+
+
+
+```{r  KW W Spatial cluster 15, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+#categories <- "Grade"
+plot_list <- list()
+
+df <- data.frame(table(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                            all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                              all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$clustered_neighbors_15))
+   
+colnames(df) <- c("Patient_ID","Spatial_Cluster","n")
+
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+df <-df[!df$Patient_ID%in% neoadj_pat,]
+
+df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+
+
+for (i in (categories)) {
+
+df_clin <- clinical.data %>% select(paste(categories),"Patient_ID" )
+
+    t <- left_join(df, df_clin, by="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+  
+  for (j in unique(t$Spatial_Cluster)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+
+    tdat <-subset(tdat, Spatial_Cluster==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Spatial_Cluster) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Spatial_Cluster, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Spatial_Cluster <-droplevels(tdat$Spatial_Cluster)
+
+      tdat <-subset(tdat, Spatial_Cluster==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Spatial_Cluster) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Spatial_Cluster", by.y ="Spatial_Cluster", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Spatial_Cluster+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Spatial_Cluster"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Spatial_Cluster")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Spatial_Cluster)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Spatial_Cluster)))
+ # dev.off()
+}
+```
+
+
+
+```{r, fig.width=8, fig.height=8}
+df <-round(prop.table(table(all.sce_pat.roi$clustered_neighbors_15, all.sce_pat.roi$tcell_subtype),1)*100, digits = 2)
+
+pheatmap(t(scale(df)))
+pheatmap(t(df))
+
+```
+
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df$X <-NULL
+  
+colData(all.sce_pat.roi)<-as.data.frame(colData(all.sce_pat.roi)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce_pat.roi))
+#if necessary: change group_id labels
+
+dat <-left_join(dat, df, by="Patient_ID")
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+#dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+#dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+#dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+caf_groups <- colnames(df[,-1])
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met",caf_groups)
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"clustered_neighbors_15"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = paste(j), values_from =paste(j), values_fn = list(clustered_neighbors_15=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "Neighbour_Cluster")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(Neighbour_Cluster,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Neighbour_Cluster", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=Neighbour_Cluster))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+
+
+#MINUS TUMOUR
+
+```{r}
+data_folder <-file.path(wd,"sce_objects","final objects with categories","FINAL")
+
+sce <- readRDS(file=file.path(data_folder, "ALL_minus-Tumour_sce_pat_roi_rem.rds"))
+
+data_folder <-file.path(wd,"sce_objects","merge_minus_Tumour")
+sce2 <- readRDS(file=file.path(data_folder, "ALL_minus-Tumour_sce_pat_roi_rem.rds"))
+sce2$cell_subtype%>% unique
+
+
+
+data_folder <-file.path(wd,"sce_objects","merge_minus_Tumour")
+unique(all.sce$cell_category)
+all.sce[, all.sce$cell_category!="Other"]
+saveRDS(all.sce_pat.roi,file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+saveRDS(all.sce_roi,file=file.path(data_folder, paste("ALL_minus-Tumour_sce_roi_rem.rds",sep="")))
+saveRDS(all.sce_pat,file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_rem.rds",sep="")))
+
+sce <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+#all.sce_roi <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_roi_rem.rds",sep="")))
+#all.sce_pat <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_rem.rds",sep="")))
+
+```
+
+#Spatial analysis using imcRtools
+```{r, fig.width=50, fig.height=50}
+library(imcRtools)
+#test <- all.filtered.sub
+#tma87A <- sce[, sce$TMA =="87A"]
+
+sce$tcell_subtype <-sce$cell_subtype
+sce$tcell_subtype[sce$cell_category=="Fibroblast"] <- sce[, sce$cell_category=="Fibroblast"]$cell_type
+#unique(all.sce$tcell_subtype)
+sce <- buildSpatialGraph(sce,
+                            img_id = "RoiID",
+                            type = "knn",
+                            k = 15, k_max_dist = 20,
+                            name="knn_k15_dist20",
+                            coords = c("Center_X", "Center_Y"))
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+saveRDS(sce, file=file.path(data_folder,"all_minus_TUMOURother_distances.rds"))
+
+#save different ks and max_ks under different names
+
+#colPairNames(tma87A)
+
+library(ggplot2)
+library(ggraph)
+
+plotSpatial(sce[, sce$acID ==2],
+            img_id = "RoiID",
+            node_color_by = "tcell_subtype",
+            node_shape_by = "RoiID",
+            node_size_by = "Area",
+            draw_edges = TRUE,
+           # colPairName = "knn_interaction_graph",
+           colPairName="knn_k15_dist20",
+            directed = FALSE,
+            coords=c("Center_X", "Center_Y"))
+
+unique(sce$tcell_subtype)
+
+
+sce <- aggregateNeighbors(sce,
+                          colPairName = "knn_k15_dist20",
+                                  aggregate_by = "metadata",
+                                  count_by = "tcell_subtype")
+head(sce$aggregatedNeighbors)
+df <-sce$aggregatedNeighbors
+rownames(df) <-colnames(sce)
+df$cell_type <- sce$tcell_subtype
+df <- data.frame(df)
+df$CellID <- rownames(df)
+df
+data.frame("CellID"=sce$CellID, "Type"=sce$tcell_subtype)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 5)
+sce$clustered_neighbors_5 <- factor(cur_cluster$cluster)
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 10)
+sce$clustered_neighbors_10 <- factor(cur_cluster$cluster)
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 15)
+sce$clustered_neighbors_15 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 20)
+sce$clustered_neighbors_20 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 30)
+sce$clustered_neighbors_30 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 29)
+sce$clustered_neighbors_29 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 28)
+sce$clustered_neighbors_28 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 27)
+sce$clustered_neighbors_27 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 26)
+sce$clustered_neighbors_26 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 25)
+sce$clustered_neighbors_25 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 24)
+sce$clustered_neighbors_24 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 23)
+sce$clustered_neighbors_23 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 22)
+sce$clustered_neighbors_22 <- factor(cur_cluster$cluster)
+
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 21)
+sce$clustered_neighbors_21 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 20)
+sce$clustered_neighbors_20 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 19)
+sce$clustered_neighbors_19 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 18)
+sce$clustered_neighbors_18 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 17)
+sce$clustered_neighbors_17 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 16)
+sce$clustered_neighbors_16 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 15)
+sce$clustered_neighbors_15 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 14)
+sce$clustered_neighbors_14 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 13)
+sce$clustered_neighbors_13 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 12)
+sce$clustered_neighbors_12 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 11)
+sce$clustered_neighbors_11 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 10)
+sce$clustered_neighbors_10 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 9)
+sce$clustered_neighbors_9 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 8)
+sce$clustered_neighbors_8 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 7)
+sce$clustered_neighbors_7 <- factor(cur_cluster$cluster)
+
+cur_cluster <- kmeans(sce$aggregatedNeighbors, centers = 6)
+sce$clustered_neighbors_6 <- factor(cur_cluster$cluster)
+
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+saveRDS(sce, file=file.path(data_folder,"all_minus_TUMOURother_distances.rds"))
+sce <-readRDS(file=file.path(data_folder,"all_minus_TUMOURother_distances.rds"))
+
+plotSpatial(sce[, sce$acID ==2],
+            img_id = "RoiID",
+            coords=c("Center_X", "Center_Y"),
+            node_color_by = "tcell_subtype",
+            node_size_fix = 3,
+            edge_width_fix = 2,
+            edge_color_by = "clustered_neighbors_15",
+            draw_edges = TRUE,
+            colPairName = "knn_k15_dist20",
+            directed = FALSE,
+            nodes_first = FALSE) +
+  scale_color_igv()#+
+   # scale_color_brewer(palette = "Set2") +
+    #scale_edge_color_brewer(palette = "Set1")
+
+2165101+2875218
+4991390
+sce$tcell_subtype %>% table
+```
+
+```{r, fig.width=8, fig.height=8}
+clus <- c(10:30)
+p_clus <- paste0("clustered_neighbors_",clus)
+
+for(i in p_clus){
+df <-round(prop.table(table(sce[[i]], sce$tcell_subtype),1)*100, digits = 2)
+
+pheatmap(t(scale(df))) %>%print
+
+}
+
+clus <- c(30)
+p_clus <- paste0("clustered_neighbors_",clus)
+
+
+for(i in p_clus){
+  
+  df <-round(prop.table(table(sce[[i]], sce$tcell_subtype),1)*100, digits = 2)
+  df <-df %>% data.frame
+  colnames(df) <- c("Cluster","CellType","Freq")
+print(df %>% pivot_wider(id_cols=Cluster,values_from=Freq, names_from=CellType))
+  p <- ggplot(df, aes(fill=CellType, x=Freq, y=Cluster)) + 
+    geom_bar(position="fill", stat="identity")+
+  scale_fill_igv()+
+  scale_fill_manual(values=palette(glasbey(32)))
+  plot(p)
+}
+
+```
+
+```{r, fig.width=50, fig.height=50}
+df <-round(prop.table(table(sce$clustered_neighbors_5, sce$tcell_subtype),1)*100, digits = 2)
+
+pheatmap(scale(df))
+
+data_folder <-file.path(wd,"sce_objects","merge_plus_tumour")
+out <- countInteractions(sce,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_all-sce-pat-roi.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_all-sce-pat-roi.csv"))
+
+out <- testInteractions(sce,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_all-sce-pat-roi.csv"))
+
+length(unique(sce[, sce$TmaID==86]$tcell_subtype))
+
+tma86 <-sce[, sce$TmaID==86]
+saveRDS(tma86, file=file.path(data_folder, "tma86_minusTUMOUR_distances.rds"))
+tma86<- readRDS(file=file.path(data_folder, "tma86_minusTUMOUR_distances.rds"))
+
+tma87 <-sce[, sce$TmaID==87]
+saveRDS(tma87, file=file.path(data_folder, "tma87_minusTUMOUR_distances.rds"))
+tma87<- readRDS(file=file.path(data_folder, "tma87_minusTUMOUR_distances.rds"))
+
+tma88 <-sce[, sce$TmaID==88]
+saveRDS(tma88, file=file.path(data_folder, "tma88_minusTUMOUR_distances.rds"))
+
+
+tma175 <-sce[, sce$TmaID==175]
+saveRDS(tma175, file=file.path(data_folder, "tma175_minusTUMOUR_distances.rds"))
+
+tma176 <-sce[, sce$TmaID==176]
+saveRDS(tma176, file=file.path(data_folder, "tma176_minusTUMOUR_distances.rds"))
+
+tma178 <-sce[, sce$TmaID==178]
+saveRDS(tma178, file=file.path(data_folder, "tma178_minusTUMOUR_distances.rds"))
+
+unique(tma87$cell_category)
+```
+
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=8, fig.height=6, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df$X <-NULL
+  
+colData(sce)<-as.data.frame(colData(sce)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(sce))
+#if necessary: change group_id labels
+
+dat <-left_join(dat, df, by="Patient_ID")
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+#dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+#dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+#dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+caf_groups <- colnames(df[,-1])
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met",caf_groups)
+category <- "LN.Met"
+#category <- "tpCAF_G"
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"clustered_neighbors_30"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = paste(j), values_from =paste(j), values_fn = list(clustered_neighbors_30=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "Neighbour_Cluster")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(Neighbour_Cluster,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Neighbour_Cluster", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=Neighbour_Cluster))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+```{r, fig.width=50, fig.height=50}
+```
+
+```{r histocat}
+out <- countInteractions(tma86,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma86_minusTUMOUR_histocat.csv"))
+##out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma86.csv"))
+
+out <- testInteractions(tma86,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma86_minusTUMOUR_histocat.csv"))
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+
+tma87 <-readRDS(file=file.path(data_folder, "tma87_distances.rds"))
+
+out <- countInteractions(tma87,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma87_minusTUMOUR_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma87.csv"))
+
+out <- testInteractions(tma87,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma87_minusTUMOUR_histocat.csv"))
+
+out <- countInteractions(tma88,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma88_minusTUMOUR_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma88.csv"))
+
+out <- testInteractions(tma88,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma88_minusTUMOUR_histocat.csv"))
+
+out <- countInteractions(tma175,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma175_minusTUMOUR_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma175.csv"))
+
+out <- testInteractions(tma175,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma175_minusTUMOUR_histocat.csv"))
+
+out <- countInteractions(tma176,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma176_minusTUMOUR_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma176.csv"))
+
+out <- testInteractions(tma176,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma176_minusTUMOUR_histocat.csv"))
+
+out <- countInteractions(tma178,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "histocat",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma178_minusTUMOUR_histocat.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma178.csv"))
+
+out <- testInteractions(tma178,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "histocat",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma178_minusTUMOUR_histocat.csv"))
+
+#classic
+out <- countInteractions(tma86,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma86_minusTUMOUR_classic.csv"))
+##out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma86.csv"))
+
+out <- testInteractions(tma86,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma86_minusTUMOUR_classic.csv"))
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+
+#tma87 <-readRDS(file=file.path(data_folder, "tma87_distances.rds"))
+
+out <- countInteractions(tma87,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma87_minusTUMOUR_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma87.csv"))
+
+out <- testInteractions(tma87,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma87_minusTUMOUR_classic.csv"))
+
+out <- countInteractions(tma88,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma88_minusTUMOUR_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma88.csv"))
+
+out <- testInteractions(tma88,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma88_minusTUMOUR_classic.csv"))
+
+out <- countInteractions(tma175,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma175_minusTUMOUR_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma175.csv"))
+
+out <- testInteractions(tma175,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma175_minusTUMOUR_classic.csv"))
+
+out <- countInteractions(tma176,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma176_minusTUMOUR_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma176.csv"))
+
+out <- testInteractions(tma176,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma176_minusTUMOUR_classic.csv"))
+
+out <- countInteractions(tma178,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma178_minusTUMOUR_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma178.csv"))
+
+out <- testInteractions(tma178,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma178_minusTUMOUR_classic.csv"))
+```
+
+```{r merge histocat minus tumour}
+out_86 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma86_minusTUMOUR_histocat.csv"))
+out_87 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma87_minusTUMOUR_histocat.csv"))
+out_88 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma88_minusTUMOUR_histocat.csv"))
+out_175 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma175_minusTUMOUR_histocat.csv"))
+out_176 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma176_minusTUMOUR_histocat.csv"))
+out_178 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma178_minusTUMOUR_histocat.csv"))
+
+out_minusT_h <- rbind(out_86,out_87,out_88,out_175,out_176,out_178)
+write.csv(out_minusT_h, file=file.path(data_folder, "all_out_minusTumour_histocat.csv"))
+
+out_86 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma86_minusTUMOUR_classic.csv"))
+out_87 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma87_minusTUMOUR_classic.csv"))
+out_88 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma88_minusTUMOUR_classic.csv"))
+out_175 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma175_minusTUMOUR_classic.csv"))
+out_176 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma176_minusTUMOUR_classic.csv"))
+out_178 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma178_minusTUMOUR_classic.csv"))
+
+out_minusT_c <- rbind(out_86,out_87,out_88,out_175,out_176,out_178)
+write.csv(out_minusT_c, file=file.path(data_folder, "all_out_minusTumour_classic.csv"))
+out_minusT_c <- read.csv(file=file.path(data_folder, "all_out_minusTumour_classic.csv"))
+
+out_178$from_label %>% unique()
+```
+
+```{r merge histocat incl tumour}
+out_86 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma86_histocat.csv"))
+out_87 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma87_histocat.csv"))
+out_88 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma88_histocat.csv"))
+out_175 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma175_histocat.csv"))
+out_176 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma176_histocat.csv"))
+out_178 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma178_histocat.csv"))
+
+out_h <- rbind(out_86,out_87,out_88,out_175,out_176,out_178)
+write.csv(out_h, file=file.path(data_folder, "all_out_histocat.csv"))
+out_h <- read.csv(file=file.path(data_folder, "all_out_histocat.csv"))
+
+out_86 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma86.csv"))
+out_87 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma87.csv"))
+out_88 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma88.csv"))
+out_175 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma175.csv"))
+out_176 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma176.csv"))
+out_178 <- read.csv(file= file.path(data_folder, "testInteractions_tcellsub_tma178.csv"))
+
+out_c <- rbind(out_86,out_87,out_88,out_175,out_176,out_178)
+write.csv(out_c, file=file.path(data_folder, "all_out_classic.csv"))
+
+out_c <- read.csv(file=file.path(data_folder, "all_out_classic.csv"))
+
+```
+
+```{r}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$RoiID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+
+
+```{r all classical, fig.width=12, fig.height=10}
+out_c$RoiID <- out_c$group_by
+cur_test <- left_join(out_c, clinical.data, by="RoiID")
+
+cur_test$LN.Met[is.na(cur_test$LN.Met)]<-"NA"
+cur_test$Grade[is.na(cur_test$Grade)]<-"NA"
+cur_test$Dist.Met[is.na(cur_test$Dist.Met)]<-"NA"
+cur_test$M.new[is.na(cur_test$M.new)]<-"NA"
+
+table(cur_test$LN.Met)
+
+cur_test %>% subset(DX.name=="Adenocarcinoma"| DX.name=="Squamous cell carcinoma")%>% subset(Grade !="NA") %>% as_tibble() %>%#)
+  group_by(Grade, from_label, to_label, DX.name) %>%
+  summarize(mean_sig = mean(sigval, na.rm = TRUE)) %>%
+  ggplot() + 
+    geom_tile(aes(from_label, to_label, fill = mean_sig)) + facet_grid( DX.name~ Grade) +
+  scale_fill_gradient2(limits=c(-1, 1),low = "dark blue", mid = "white", high = "dark red") +
+  theme_classic(base_size = 15) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 0))
+
+#not split
+p <-cur_test %>% subset(DX.name=="Adenocarcinoma"| DX.name=="Squamous cell carcinoma") %>% as_tibble() %>%#)
+  group_by(from_label, to_label, DX.name) %>%
+  summarize(mean_sig = mean(sigval, na.rm = TRUE)) %>%
+  ggplot() + 
+    geom_tile(aes(from_label, to_label, fill = mean_sig)) + #facet_grid( DX.name~ Grade) +
+  scale_fill_gradient2(limits=c(-1, 1),low = "dark blue", mid = "white", high = "dark red") +
+  theme_classic(base_size = 15) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 0))
+ggsave(plot=p, file=file.path(plot_folder, "Neighbourhood_classical.pdf"), width=8, height=7)
+```
+
+
+```{r all histocat, fig.width=18, fig.height=12}
+out_h$RoiID <- out_h$group_by
+cur_test <- left_join(out_h, clinical.data, by="RoiID")
+
+cur_test$LN.Met[is.na(cur_test$LN.Met)]<-"NA"
+cur_test$Grade[is.na(cur_test$Grade)]<-"NA"
+cur_test$Dist.Met[is.na(cur_test$Dist.Met)]<-"NA"
+cur_test$M.new[is.na(cur_test$M.new)]<-"NA"
+
+table(cur_test$LN.Met)
+
+#not split
+p <-cur_test %>% subset(DX.name=="Adenocarcinoma"| DX.name=="Squamous cell carcinoma") %>% as_tibble() %>%#)
+  group_by(from_label, to_label, DX.name) %>%
+  summarize(mean_sig = mean(sigval, na.rm = TRUE)) %>%
+  ggplot() + 
+    geom_tile(aes(from_label, to_label, fill = mean_sig)) + #facet_grid( DX.name~ Grade) +
+  scale_fill_gradient2(limits=c(-1, 1),low = "dark blue", mid = "white", high = "dark red") +
+  theme_classic(base_size = 15) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 0))
+ggsave(plot=p, file=file.path(plot_folder, "Neighbourhood_histocat.pdf"), width=8, height=6)
+
+#split
+cur_test %>% subset(DX.name=="Adenocarcinoma"| DX.name=="Squamous cell carcinoma")%>% subset(Grade !="NA") %>% as_tibble() %>%#)
+  group_by(Grade, from_label, to_label, DX.name) %>%
+  summarize(mean_sig = mean(sigval, na.rm = TRUE)) %>%
+  ggplot() + 
+    geom_tile(aes(from_label, to_label, fill = mean_sig)) + facet_grid( DX.name~ Grade) +
+  scale_fill_gradient2(limits=c(-1, 1),low = "dark blue", mid = "white", high = "dark red") +
+  theme_classic(base_size = 15) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 0))
+ggsave(plot=p, file=file.path(plot_folder, "Neighbourhood_histocat.pdf"), width=8, height=7)
+
+
+```
+
+```{r all histocat, fig.width=18, fig.height=12}
+out_minusT_h$RoiID <- out_minusT_h$group_by
+cur_test <- left_join(out_c, clinical.data, by="RoiID")
+
+cur_test$LN.Met[is.na(cur_test$LN.Met)]<-"NA"
+cur_test$Grade[is.na(cur_test$Grade)]<-"NA"
+cur_test$Dist.Met[is.na(cur_test$Dist.Met)]<-"NA"
+cur_test$M.new[is.na(cur_test$M.new)]<-"NA"
+
+table(cur_test$LN.Met)
+
+cur_test %>% subset(DX.name=="Adenocarcinoma"| DX.name=="Squamous cell carcinoma")%>% subset(Grade !="NA") %>% as_tibble() %>%#)
+  group_by(Grade, from_label, to_label, DX.name) %>%
+  summarize(mean_sig = mean(sigval, na.rm = TRUE)) %>%
+  ggplot() + 
+    geom_tile(aes(from_label, to_label, fill = mean_sig)) + facet_grid( DX.name~ Grade) +
+  scale_fill_gradient2(limits=c(-1, 1),low = "dark blue", mid = "white", high = "dark red") +
+  theme_classic(base_size = 15) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 0))
+
+
+out_minusT_h$from_label %>% unique()
+```
+
+```{r minus tumour classic, fig.width=18, fig.height=12}
+out_minusT_c$RoiID <- out_minusT_c$group_by
+cur_test <- left_join(out_c, clinical.data, by="RoiID")
+
+cur_test$LN.Met[is.na(cur_test$LN.Met)]<-"NA"
+cur_test$Grade[is.na(cur_test$Grade)]<-"NA"
+cur_test$Dist.Met[is.na(cur_test$Dist.Met)]<-"NA"
+cur_test$M.new[is.na(cur_test$M.new)]<-"NA"
+
+table(cur_test$LN.Met)
+
+cur_test %>% subset(DX.name=="Adenocarcinoma"| DX.name=="Squamous cell carcinoma")%>% subset(Grade !="NA") %>% as_tibble() %>%#)
+  group_by(Grade, from_label, to_label, DX.name) %>%
+  summarize(mean_sig = mean(sigval, na.rm = TRUE)) %>%
+  ggplot() + 
+    geom_tile(aes(from_label, to_label, fill = mean_sig)) + facet_grid( DX.name~ Grade) +
+  scale_fill_gradient2(limits=c(-1, 1),low = "dark blue", mid = "white", high = "dark red") +
+  theme_classic(base_size = 15) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 0))
+
+
+out_minusT_c$from_label %>% unique()
+```
+
+
+
+
+####### AC SQC split
+```{r}
+data_folder <- file.path(wd, "sce_objects","Distances")
+all.sce_pat.roi <-readRDS(file=file.path(data_folder,"all_minus_other_distances.rds"))
+
+tma86_ac <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==86&all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+saveRDS(tma86_ac, file=file.path(data_folder, "tma86_ac_distances.rds"))
+#tma86<- readRDS(file=file.path(data_folder, "tma86_ac_distances.rds"))
+
+tma87_ac <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==87&all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+saveRDS(tma87_ac, file=file.path(data_folder, "tma87_ac_distances.rds"))
+#tma87<- readRDS(file=file.path(data_folder, "tma87_ac_distances.rds"))
+
+tma88_ac <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==88&all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+saveRDS(tma88_ac, file=file.path(data_folder, "tma88_ac_distances.rds"))
+
+
+tma175_ac <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==175&all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+saveRDS(tma175_ac, file=file.path(data_folder, "tma175_ac_distances.rds"))
+
+tma176_ac <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==176&all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+saveRDS(tma176_ac, file=file.path(data_folder, "tma176_ac_distances.rds"))
+
+tma178_ac <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==178&all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+saveRDS(tma178_ac, file=file.path(data_folder, "tma178_ac_distances.rds"))
+
+
+tma86_sqc <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==86&all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+saveRDS(tma86_sqc, file=file.path(data_folder, "tma86_sqc_distances.rds"))
+#tma86<- readRDS(file=file.path(data_folder, "tma86_sqc_distances.rds"))
+
+tma87_sqc <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==87&all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+saveRDS(tma87_sqc, file=file.path(data_folder, "tma87_sqc_distances.rds"))
+#tma87<- readRDS(file=file.path(data_folder, "tma87_sqc_distances.rds"))
+
+tma88_sqc <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==88&all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+saveRDS(tma88_sqc, file=file.path(data_folder, "tma88_sqc_distances.rds"))
+
+
+tma175_sqc <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==175&all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+saveRDS(tma175_sqc, file=file.path(data_folder, "tma175_sqc_distances.rds"))
+
+tma176_sqc <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==176&all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+saveRDS(tma176_sqc, file=file.path(data_folder, "tma176_sqc_distances.rds"))
+
+tma178_sqc <-all.sce_pat.roi[, all.sce_pat.roi$TmaID==178&all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+saveRDS(tma178_sqc, file=file.path(data_folder, "tma178_sqc_distances.rds"))
+
+```
+
+```{r}
+library(imcRtools)
+out <- countInteractions(tma86_ac,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma86_AC_classic.csv"))
+##out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma86.csv"))
+
+out <- testInteractions(tma86_ac,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma86_AC_classic.csv"))
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+
+#tma87 <-readRDS(file=file.path(data_folder, "tma87_distances.rds"))
+
+out <- countInteractions(tma87_ac,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma87_AC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma87.csv"))
+
+out <- testInteractions(tma87_ac,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma87_AC_classic.csv"))
+
+out <- countInteractions(tma88_ac,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma88_AC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma88.csv"))
+
+out <- testInteractions(tma88_ac,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma88_AC_classic.csv"))
+
+out <- countInteractions(tma175_ac,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma175_AC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma175.csv"))
+
+out <- testInteractions(tma175_ac,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma175_AC_classic.csv"))
+
+out <- countInteractions(tma176_ac,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma176_AC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma176.csv"))
+
+out <- testInteractions(tma176_ac,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma176_AC_classic.csv"))
+
+out <- countInteractions(tma178_ac,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma178_AC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma178.csv"))
+
+out <- testInteractions(tma178_ac,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma178_AC_classic.csv"))
+
+#sqc
+out <- countInteractions(tma86_sqc,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma86_SQC_classic.csv"))
+##out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma86.csv"))
+
+out <- testInteractions(tma86_sqc,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma86_SQC_classic.csv"))
+
+data_folder <- file.path(wd, "sce_objects","Distances")
+
+#tma87 <-readRDS(file=file.path(data_folder, "tma87_distances.rds"))
+
+out <- countInteractions(tma87_sqc,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma87_SQC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma87.csv"))
+
+out <- testInteractions(tma87_sqc,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma87_SQC_classic.csv"))
+
+out <- countInteractions(tma88_sqc,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma88_SQC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma88.csv"))
+
+out <- testInteractions(tma88_sqc,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma88_SQC_classic.csv"))
+
+out <- countInteractions(tma175_sqc,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma175_SQC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma175.csv"))
+
+out <- testInteractions(tma175_sqc,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma175_SQC_classic.csv"))
+
+out <- countInteractions(tma176_sqc,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma176_SQC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma176.csv"))
+
+out <- testInteractions(tma176_sqc,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma176_SQC_classic.csv"))
+
+out <- countInteractions(tma178_sqc,
+                         group_by = "RoiID",
+                         label = "tcell_subtype",
+                         method = "classic",
+                         colPairName = "knn_k15_dist20")
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"countInteractions_tcellsub_tma178_SQC_classic.csv"))
+#out <- read.csv(file=file.path(data_folder,"countInteractions_tcellsub_tma178.csv"))
+
+out <- testInteractions(tma178_sqc,
+                        group_by = "RoiID",
+                        label = "tcell_subtype",
+                        method = "classic",
+                        colPairName = "knn_k15_dist20")
+out
+out <- data.frame(out)
+write.csv(out,file=file.path(data_folder,"testInteractions_tcellsub_tma178_SQC_classic.csv"))
+```
+
+```{r}
+
+if (!require("BiocManager"))
+    install.packages("BiocManager")
+BiocManager::install("spicyR")
+library(spicyR)
+
+```
+```{r}
+
+if (!require("BiocManager"))
+    install.packages("BiocManager")
+BiocManager::install("spicyR")
+library(spicyR)
+
+```
+
+```{r Grade, fig.width=12, fig.height=12}
+
+dat.counts <-as.data.frame(t((assay(all.sce_pat.roi,"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi$Center_Y
+
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi$Center_Y
+dat.counts$imageID <- all.sce_pat.roi$RoiID
+dat.counts$cellType <- all.sce_pat.roi$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$RoiID),]
+clinical.data$imageID <- clinical.data$RoiID
+phenoData <- clinical.data[clinical.data$RoiID %in% unique(all.sce_pat.roi$RoiID),]
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender")
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+#Grade
+spicyTest <- spicy(cellExp, 
+                   condition = "Grade", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_Grade.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_Grade.pdf"))
+ 
+ 
+```
+
+
+```{r Grade, fig.width=12, fig.height=12}
+
+dat.counts <-as.data.frame(t((assay(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_Y
+dat.counts$imageID <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID
+dat.counts$cellType <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$RoiID),]
+clinical.data$imageID <- clinical.data$RoiID
+phenoData <- clinical.data[clinical.data$RoiID %in% unique(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID),]
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo")
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+#Type
+spicyTest <- spicy(cellExp, 
+                   condition = "DX.name", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_type.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_type.pdf"))
+ 
+#Relapse
+spicyTest <- spicy(cellExp, 
+                   condition = "Relapse", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_Relapse.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_Relapse.pdf"))
+ 
+ #NeoAdj
+spicyTest <- spicy(cellExp, 
+                   condition = "NeoAdj", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_NeoAdj.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_NeoAdj.pdf"))
+ 
+ #LN.Met
+spicyTest <- spicy(cellExp, 
+                   condition = "LN.Met", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_LNMet.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_LNMet.pdf"))
+ 
+ #Dist.Met
+spicyTest <- spicy(cellExp, 
+                   condition = "Dist.Met", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_DistMet.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_DistMet.pdf"))
+ 
+ #Radido
+spicyTest <- spicy(cellExp, 
+                   condition = "Radio", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_Radio.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_Radio.pdf"))
+ 
+#Chemo
+spicyTest <- spicy(cellExp, 
+                   condition = "Chemo", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_Chemo.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_Chemo.pdf"))
+ 
+ 
+```
+
+```{r Grade AC, fig.width=12, fig.height=12}
+all.sce_pat.roi$Grade[is.na(all.sce_pat.roi$Grade)]<-"NA"
+
+#sub_sce <- all.sce_pat.roi[,all.sce_pat.roi$Grade=="2"|
+#                             all.sce_pat.roi$Grade=="3"]
+dat.counts <-as.data.frame(t((assay(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$Center_Y
+dat.counts$imageID <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$RoiID
+dat.counts$cellType <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$RoiID),]
+clinical.data$imageID <- clinical.data$RoiID
+phenoData <- clinical.data[clinical.data$RoiID %in% unique(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]$RoiID),]
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo")
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+#Grade
+spicyTest <- spicy(cellExp, 
+                   condition = "Grade", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_Grade.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+topPairs(spicyTest)
+
+ggsave(plot=p, file=file.path(wd, "plots","spicyR_AC_Grade.pdf"))
+ 
+ 
+#Relapse
+spicyTest <- spicy(cellExp, 
+                   condition = "Relapse", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_Relapse.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_Relapse.pdf"))
+ 
+ #NeoAdj
+spicyTest <- spicy(cellExp, 
+                   condition = "NeoAdj", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_NeoAdj.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_AC_NeoAdj.pdf"))
+ 
+ #LN.Met
+spicyTest <- spicy(cellExp, 
+                   condition = "LN.Met", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_LNMet.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_AC_LNMet.pdf"))
+ 
+#Dist.Met
+spicyTest <- spicy(cellExp, 
+                   condition = "Dist.Met", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_DistMet.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_AC_DistMet.pdf"))
+ 
+#Radio
+spicyTest <- spicy(cellExp, 
+                   condition = "Radio", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_Radio.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_AC_Radio.pdf"))
+ 
+#Chemo
+spicyTest <- spicy(cellExp, 
+                   condition = "Chemo", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_AC_Chemo.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_AC_Chemo.pdf"))
+ 
+ 
+```
+
+```{r Dist.Met AC, fig.width=12, fig.height=12}
+all.sce_pat.roi$Grade[is.na(all.sce_pat.roi$Grade)]<-"NA"
+
+#sub_sce <- all.sce_pat.roi[,all.sce_pat.roi$Grade=="2"|
+#                             all.sce_pat.roi$Grade=="3"]
+dat.counts <-as.data.frame(t((assay(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Center_Y
+dat.counts$imageID <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID
+dat.counts$cellType <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$RoiID),]
+clinical.data$imageID <- clinical.data$RoiID
+phenoData <- clinical.data[clinical.data$RoiID %in% unique(all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID),]
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo")
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+#Grade
+spicyTest <- spicy(cellExp, 
+                   condition = "Grade", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_Grade.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+topPairs(spicyTest)
+
+ggsave(plot=p, file=file.path(wd, "plots","spicyR_SCQ_Grade.pdf"))
+ 
+ 
+#Relapse
+spicyTest <- spicy(cellExp, 
+                   condition = "Relapse", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_Relapse.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_Relapse.pdf"))
+ 
+ #NeoAdj
+spicyTest <- spicy(cellExp, 
+                   condition = "NeoAdj", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_NeoAdj.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_SCQ_NeoAdj.pdf"))
+ 
+ #LN.Met
+spicyTest <- spicy(cellExp, 
+                   condition = "LN.Met", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_LNMet.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_SCQ_LNMet.pdf"))
+ 
+#Dist.Met
+spicyTest <- spicy(cellExp, 
+                   condition = "Dist.Met", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_DistMet.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_SCQ_DistMet.pdf"))
+ 
+#Radio
+spicyTest <- spicy(cellExp, 
+                   condition = "Radio", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_Radio.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_SCQ_Radio.pdf"))
+ 
+#Chemo
+spicyTest <- spicy(cellExp, 
+                   condition = "Chemo", 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", "spicyR_SCQ_Chemo.rds"))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(all.sce_pat.roi$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots","spicyR_SQC_Chemo.pdf"))
+ 
+
+```
+
+```{r}
+#all prop
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+sub_sce <- all.sce_pat.roi
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+  
+dat.counts <-as.data.frame(t((assay(sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+dat.counts$imageID <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID
+dat.counts$cellType <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+caf_groups <- df %>% select(-c(Patient_ID, X)) %>% colnames()
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- df %>% select(-c(Patient_ID, X)) %>% colnames()
+caf_groups <- c("IDO_CAF_G","PDPN_CAF_G")
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = paste(i), 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("spicyR_",i,"all_prop.rds")))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots",paste0("spicyR_",i,"_all_prop.pdf")))
+}
+
+
+#sqc prop
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Squamous cell carcinoma.csv"))
+sub_sce <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+dat.counts <-as.data.frame(t((assay(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce[,sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+dat.counts$imageID <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID
+dat.counts$cellType <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+caf_groups <- df %>% select(-c(Patient_ID, X)) %>% colnames()
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$RoiID),]
+clinical.data$imageID <- clinical.data$RoiID
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- df %>% select(-c(Patient_ID, X)) %>% colnames()
+
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = i, 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("spicyR_",i,"_SQC_prop.rds")))
+
+pdf(file=file.path(wd, "plots",paste0("spicyR_",i,"_SQC_prop.pdf")))
+signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$tcell_subtype))))
+
+dev.off()
+}
+
+
+
+#ac prop
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Adenocarcinoma.csv"))
+
+sub_sce <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+  
+dat.counts <-as.data.frame(t((assay(sub_sce[, sub_sce$DX.name=="Adenocarcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce[,sub_sce$DX.name=="Adenocarcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$Center_Y
+dat.counts$imageID <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$RoiID
+dat.counts$cellType <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- df %>% select(-c(Patient_ID, X)) %>% colnames()
+
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = paste(i), 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("spicyR_",i,"_AC_prop.rds")))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots",paste0("spicyR_",i,"_AC_prop.pdf")))
+}
+
+#all density
+
+#ddensity
+
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+mCAF_low_PID <-df[df$mCAF_G =="mCAF low"|df$mCAF_G =="mCAF high",]$Patient_ID
+#sub_sce <- all.sce_pat.roi
+sub_sce <- all.sce[,all.sce$Patient_ID %in% mCAF_low_PID]
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+#cur_DF$mCAF_G[is.na(cur_DF$mCAF_G)] <- "NA"
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+#sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% mCAF_low_PID]
+
+dat.counts <-as.data.frame(t((assay(sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+dat.counts$imageID <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID
+dat.counts$cellType <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"| sub_sce$DX.name=="Squamous cell carcinoma"]$cell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+caf_groups <- df %>% select(-c(Patient_ID, X,hypoxic_tpCAF_G)) %>% colnames()
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- df %>% select(-c(Patient_ID, X,hypoxic_tpCAF_G)) %>% colnames()
+#here
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+
+sub_sce <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"| all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+  
+dat.counts <-as.data.frame(t((assay(sub_sce,"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce$Center_Y
+dat.counts$imageID <- sub_sce$RoiID
+dat.counts$cellType <- sub_sce$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- c("iCAF_G","vCAF_G","dCAF_G","tpCAF_G","hypoxic_CAF_G","Collagen_CAF_G","PDPN_CAF_G","SMA_CAF_G","IDO_CAF_G")
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = paste(i), 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("NEWspicyR_",i,"all_tumour.rds")))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$cell_subtype))))
+p
+ ggsave(plot=p, file=file.path(wd, "plots",paste0("NEWspicyR_",i,"_all_tumour.pdf")))
+}
+
+caf_groups <-"mCAF_G"
+
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = paste(i), 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("spicyR_",i,"all_mCAF_low_density.rds")))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$cell_subtype))))
+p
+ ggsave(plot=p, file=file.path(wd, "plots",paste0("spicyR_",i,"_all_mCAF_low_density.pdf")))
+}
+
+#sqc density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_density_split_Squamous cell carcinoma.csv"))
+sub_sce <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+  
+dat.counts <-as.data.frame(t((assay(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce[,sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$Center_Y
+dat.counts$imageID <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID
+dat.counts$cellType <- sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- df %>% select(-c(Patient_ID, X,hypoxic_tpCAF_G)) %>% colnames()
+
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = paste(i), 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("spicyR_",i,"_SQC_density.rds")))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots",paste0("spicyR_",i,"_SQC_density.pdf")))
+}
+
+
+#ac density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Adenocarcinoma.csv"))
+
+sub_sce <- all.sce_pat.roi[, all.sce_pat.roi$DX.name=="Adenocarcinoma"]
+cur_DF <- as_tibble(colData(sub_sce)) %>% left_join(df, by = "Patient_ID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sub_sce$ImageNumber, sub_sce$CellNumber))
+
+sub_sce <- sub_sce[,sub_sce$Patient_ID %in% df$Patient_ID]
+  
+dat.counts <-as.data.frame(t((assay(sub_sce[, sub_sce$DX.name=="Adenocarcinoma"],"c_counts_asinh_scaled"))))
+dat.counts$cellID <- rownames(dat.counts)
+
+colnames(dat.counts)
+dat.counts$MPO <- dat.counts$`Myeloperoxidase (MPO)`
+dat.counts$IDO <- dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)`
+dat.counts$CA9 <- dat.counts$`Carbonic Anhydrase IX`
+dat.counts$CD45 <- dat.counts$`CD45RA + CD45R0`
+dat.counts$CD248 <- dat.counts$`CD248 / Endosialin`
+dat.counts$p75 <- dat.counts$`p75 (CD271)`
+dat.counts$panCK <- dat.counts$`Pan Cytokeratin + Keratin Epithelial`
+dat.counts$vWF <- dat.counts$`vWF + CD31`
+dat.counts$LYVE1 <- dat.counts$`LYVE-1`
+dat.counts$ki67 <- dat.counts$`Ki-67`
+dat.counts$Collagen <- dat.counts$`Collagen I + Fibronectin`
+dat.counts$HLADR <- dat.counts$`HLA-DR`
+dat.counts$TCF17 <- dat.counts$`TCF1/TCF7`
+dat.counts$Cdh6 <- dat.counts$`Cadherin-6`
+dat.counts$Cdh11 <- dat.counts$`Cadherin-11`
+dat.counts$PD1 <- dat.counts$`CD279 (PD-1)`
+dat.counts$PDGFRb <- dat.counts$`PDGFR-b`
+dat.counts$Cav1 <- dat.counts$`Caveolin-1`
+dat.counts$Iridium191 <- dat.counts$Iridium_191
+dat.counts$Iridium193 <- dat.counts$Iridium_193
+dat.counts$Histone <- dat.counts$`Histone H3`
+dat.counts$FSP <- dat.counts$`FSP1 / S100A4`
+
+dat.counts$`Myeloperoxidase (MPO)` <- NULL
+dat.counts$`Indoleamine 2- 3-dioxygenase (IDO)` <- NULL
+dat.counts$`Carbonic Anhydrase IX` <- NULL
+dat.counts$`CD45RA + CD45R0` <- NULL
+dat.counts$`CD248 / Endosialin` <- NULL
+dat.counts$`p75 (CD271)` <- NULL
+dat.counts$`Pan Cytokeratin + Keratin Epithelial` <- NULL
+dat.counts$`vWF + CD31` <- NULL
+dat.counts$`LYVE-1` <- NULL
+dat.counts$`Ki-67` <- NULL
+dat.counts$`Collagen I + Fibronectin` <- NULL
+dat.counts$`HLA-DR` <- NULL
+dat.counts$`TCF1/TCF7` <- NULL
+dat.counts$`Cadherin-6` <- NULL
+dat.counts$`Cadherin-11` <- NULL
+dat.counts$`CD279 (PD-1)` <- NULL
+dat.counts$`PDGFR-b` <- NULL
+dat.counts$`Caveolin-1` <- NULL
+dat.counts$Iridium_191 <- NULL
+dat.counts$Iridium_193 <- NULL
+dat.counts$`Histone H3` <- NULL
+dat.counts$`FSP1 / S100A4` <-NULL
+
+colnames(dat.counts) <- paste("Intensity_Mean", colnames(dat.counts), sep = "_")
+dat.counts$cellID <- dat.counts$Intensity_Mean_cellID
+dat.counts$Intensity_Mean_cellID <-NULL
+dat.counts$AreaShape_Center_X <- sub_sce[,sub_sce$DX.name=="Adenocarcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$Center_Y
+
+dat.counts$AreaShape_Center_X <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$Center_X
+dat.counts$AreaShape_Center_Y <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$Center_Y
+dat.counts$imageID <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$RoiID
+dat.counts$cellType <- sub_sce[, sub_sce$DX.name=="Adenocarcinoma"]$tcell_subtype
+
+cellExp <- SegmentedCells(dat.counts, cellProfiler = TRUE)
+cellSum <- cellSummary(cellExp)
+cellSummary(cellExp) <- cellSum
+
+
+dat <- cur_DF %>% data.frame()
+dat = dat[!duplicated(dat$RoiID),]
+dat$imageID <- dat$RoiID
+
+#phenoData <- clinical.data[clinical.data$RoiID %in% unique(sub_sce[, sub_sce$DX.name=="Squamous cell carcinoma"]$RoiID),]
+phenoData <- dat
+
+cols <- c("DX.name", "Grade", "Stage", "T.new","M.new","N","Relapse","Smok","LN.Met","Dist.Met","NeoAdj","Gender","Radio","Chemo",paste(caf_groups))
+phenoData[cols] <- lapply(phenoData[cols], factor)  ## as.factor() could also be used
+
+
+phenoData <- DataFrame(phenoData)
+imagePheno(cellExp) <- phenoData
+
+caf_groups <- df %>% select(-c(Patient_ID, X,hypoxic_tpCAF_G)) %>% colnames()
+
+for(i in caf_groups){
+spicyTest <- spicy(cellExp, 
+                   condition = paste(i), 
+                   subject = "Patient_ID")#
+
+saveRDS(spicyTest, file = file.path(wd, "sce_objects","Distances", paste0("spicyR_",i,"_AC_density.rds")))
+
+ p <-signifPlot(spicyTest, 
+           breaks=c(-3, 3, 0.5),
+           marksToPlot = c(paste(unique(sub_sce$tcell_subtype))))
+
+ ggsave(plot=p, file=file.path(wd, "plots",paste0("spicyR_",i,"_AC_density.pdf")))
+}
+
+```
+
+
+
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=6, fig.height=6, warning=F, message=F,echo=F}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df$X <-NULL
+  
+colData(all.sce_pat.roi)<-as.data.frame(colData(all.sce_pat.roi)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce_pat.roi))
+#if necessary: change group_id labels
+
+dat <-left_join(dat, df, by="Patient_ID")
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+#dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+#dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+#dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+caf_groups <- colnames(df[,-1])
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met",caf_groups)
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"cell_subtype"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = paste(j), values_from =paste(j), values_fn = list(cell_subtype=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "Neighbour_Cluster")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(Neighbour_Cluster,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Neighbour_Cluster", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=Neighbour_Cluster))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
\ No newline at end of file
diff --git a/Analysis_Fibro.Rmd b/Analysis_Fibro.Rmd
new file mode 100644
index 0000000..ea76026
--- /dev/null
+++ b/Analysis_Fibro.Rmd
@@ -0,0 +1,7340 @@
+---
+title: "R Notebook - Analysis Fibros"
+output:
+  html_document:
+    df_print: paged
+---
+```{r load libraries, echo=F, message=F, warning=F}
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+#library(uwot)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+set.seed(101100)
+```
+
+
+
+```{r  Set wd and load data}
+#set working directory 
+wd <-dirname(getwd())
+
+#clinical.data
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+plot_folder <-(file.path(wd,"plots"))
+
+fibro.sce <- readRDS(file=file.path(data_folder, "FIBRO_CLINICAL-DATA_FILTERED.rds"))
+fibro.sce$DX.name[is.na(fibro.sce$DX.name)]<-"NA"
+
+```
+
+Define clinical data
+```{r clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+Table with Immune cell numbers including Fibros removing undefined
+```{r  Table with immune cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All Fibros together
+tbl <- as.data.frame(table(fibro.sce[,fibro.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Fibro number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+#print(tbl)
+
+summary(tbl$`Fibro number overall`) #3-6259
+tbl[tbl$`Fibro number overall` <=100,] #43 patients have less than 100 F
+
+#Lowest 10% of all patients' Fibro numbers
+tbl[order(tbl$`Fibro number overall`),]$`Fibro number overall`[1:100] #1-23
+
+
+#Highest 10% of all patients' Fibro numbers
+tbl[order(-tbl$`Fibro number overall`),]$`Fibro number overall`[1:100] #760-2378
+
+tbl[tbl$`Fibro number overall` <=100,] #43 patients have less than 100 Fibros=10%. 77 patients have less than 15 Fibros
+
+all_fibro_pat <- tbl[tbl$`Fibro number overall` <=100,]$`Patient ID`
+
+length(unique(fibro.sce$Patient_ID))
+```
+
+Remove ROIs
+```{r  Table with Immune cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(fibro.sce[,fibro.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Fibro number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Fibro numbers per ROI overall including Fibros-excluding undefined.csv")))
+#print(tbl)
+summary(tbl$`Fibro number overall`) #1-5075
+tbl[tbl$`Fibro number overall` <=10,] #93 images have less than 50 Fibros (10% of median)
+#Lowest 10% of all patients' Fibro numbers
+tbl[order(tbl$`Fibro number overall`),]$`Fibro number overall`[1:200] #1-110
+
+
+#Highest 10% of all patients' Fibro numbers
+tbl[order(-tbl$`Fibro number overall`),]$`Fibro number overall`[1:200] #1385-5075
+#per image cut at 50 Fibros per image equals lowest 5% -> ensures that there's at least 50 Fibros per patient
+
+all_fibro_roi <- tbl[tbl$`Fibro number overall` <=50,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+fibro.sce_pat <- fibro.sce[,fibro.sce$Patient_ID!="Control"&
+                               !fibro.sce$Patient_ID%in%all_fibro_pat]
+length(unique(fibro.sce_pat$Patient_ID)) #1025
+
+#Roi removal
+fibro.sce_roi <- fibro.sce[,fibro.sce$Patient_ID!="Control"&
+                               !fibro.sce$RoiID%in%all_fibro_roi]
+length(unique(fibro.sce_roi$Patient_ID)) #1039
+
+#Patient & Roi removal
+fibro.sce_pat.roi <- fibro.sce[,fibro.sce$Patient_ID!="Control"&
+                               !fibro.sce$Patient_ID%in%all_fibro_pat&
+                               !fibro.sce$RoiID%in%all_fibro_roi]
+length(unique(fibro.sce_pat.roi$Patient_ID)) #1025
+```
+
+```{r  Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","stroma","CAF"))
+
+saveRDS(fibro.sce_pat.roi,file=file.path(data_folder, paste("Fibro_sce_pat_roi_rem.rds",sep="")))
+saveRDS(fibro.sce_roi,file=file.path(data_folder, paste("Fibro_sce_roi_rem.rds",sep="")))
+saveRDS(fibro.sce_pat,file=file.path(data_folder, paste("Fibro_sce_pat_rem.rds",sep="")))
+
+fibro.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Fibro_sce_pat_roi_rem.rds",sep="")))
+fibro.sce_roi <-readRDS(file=file.path(data_folder, paste("Fibro_sce_roi_rem.rds",sep="")))
+fibro.sce_pat <-readRDS(file=file.path(data_folder, paste("Fibro_sce_pat_rem.rds",sep="")))
+
+table(fibro.sce_pat.roi$cell_type)
+```
+
+
+##Analysis
+#Proportions Fibro CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Fibro Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Fibro category proportions
+```{r  optimal number of clusters for non Fibro category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+
+p <-fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+plot(p)
+#ggsave(p, file=file.path(plot_folder, "Silhouette_optClusters_Meta4.pdf"), width=6, height=4)
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+fibro.sce_pat$DX.name[is.na(fibro.sce_pat$DX.name)] <- "NA"
+
+tdat <-as.data.frame(table( fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+mycol <-c("#1D76B5","#EF7D20","#2CA237","#D6272A","#8F67A9","#8D564C","#D278AF","#7F7F7F","#BCBC20","#34B9C9","#AEC6E8")
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+#scale_fill_tableau("Tableau 20")+
+  scale_fill_manual(values=mycol)
+plot(p)
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =4) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+#scale_fill_tableau("Tableau 20")+
+  theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  scale_fill_manual(values=mycol)
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_CAF_type_proportions_K4.pdf")), width=12, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 4))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")+
+   scale_fill_manual(values=mycol)
+ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_caf-category_proportions_K4.pdf")))
+plot(p)
+
+write.csv(ct, file=file.path(plot_folder, "patient_groups_k4.csv"))
+```
+
+```{r  heatmap metacluster cell category expression proportions}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "freq",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+pheatmap(scale(t(tdat_ct_w_m)))
+
+pheatmap(t(tdat_ct_w_m))
+
+cal_z_score <- function(x){
+  (x - mean(x)) / sd(x)
+}
+ 
+data_subset_norm <- t(apply(tdat_ct_w_m, 2, cal_z_score))
+
+pdf(file.path(plot_folder, "metacluster_k4_CAF.pdf"))
+pheatmap(data_subset_norm)
+dev.off()
+
+ggsave(p, file=file.path(plot_folder, "metacluster_k4_CAF.pdf"))
+
+p <- ggplot(tdat_ct, aes(x=as.factor(metacluster), y=freq)) + 
+    geom_boxplot(fill="slateblue", alpha=0.2) + 
+    facet_wrap(~Phenotype,scales="free_y")+
+    xlab("metacluster")
+ggsave(p, file=file.path(plot_folder, "metacluster_k4_CAF.pdf"))
+tdat_ct %>%
+  group_by(metacluster, Phenotype) %>%summarise(Mean=mean(freq*100), Max=max(freq*100), Min=min(freq*100), Median=median(freq*100), Std=sd(freq*100))
+```
+```{r, fig.width=8, fig.height=8}
+chisq_dat <- left_join(tdat_wide_ct, ct)
+
+chisq_dat$Relapse <-factor(chisq_dat$Relapse) 
+chisq_dat$Grade <-factor(chisq_dat$Grade)
+chisq_dat$DX.name <-factor(chisq_dat$DX.name) 
+
+levels(chisq_dat$Relapse) <- c("No Relapse", "Relapse")
+levels(chisq_dat$DX.name) <- c("LUAC", "LUSC")
+levels(chisq_dat$Grade) <- c("Grade 1", "Grade 2","Grade 3")
+
+var <- c("DX.name","Grade","Dist.Met","LN.Met","Relapse", "Gender")
+
+for(i in var){
+  print(i)
+  chisq <- chisq.test(chisq_dat$metacluster, chisq_dat[[i]])
+chisq %>% print
+chisq$observed
+
+corrplot(chisq$residuals, is.cor = FALSE,col=colorRampPalette(c("white","lightblue","red"))(100))
+pdf(file=file.path(plot_folder, paste("Chisq_meta4_",i,".pdf")), width=4, height=4)
+corrplot(chisq$residuals, is.cor = FALSE,col=colorRampPalette(c("white","lightblue","red"))(100))
+
+dev.off()
+}
+```
+
+Differential abundance metaclusters
+
+
+
+## **Survival analysis** - Fibro category
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+tdat_wide_ct <- left_join(tdat_wide, clinical.data, by="Patient_ID")
+
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+caf_strat <-surv_dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+surv <- "metacluster"
+#surv <-"metacluster"
+```
+
+
+#CoxPH for Fibro category proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   PDPN_CAF + IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+iCAF+tpCAF+mCAF+hypoxic_CAF+SMA_CAF+Collagen_CAF+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+#scale_colour_tableau()
+  scale_color_jco()
+#
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, "CAF_coxph.pdf"), width=6, height=4)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE, fig.height=5, fig.width=5}
+#out.width="50%",
+tdat_wide_ct <- left_join(tdat_wide, clinical.data, by="Patient_ID")
+
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+surv_dat <- left_join(surv_dat, ct)
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+surv <- "metacluster"
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+ 
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+print(pw)
+print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+   symbols = c("****", "***", "**", "*", "+", "."),
+    abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               #conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               palette = "jco",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#PDF
+pdf(file=file.path(plot_folder, paste0("Survival_CAF_Prop_hi-low_OS over",i,"_per_",k,".pdf")), width=6, height=6)
+print(p,newpage=FALSE)
+dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+print(pw)
+print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+   symbols = c("****", "***", "**", "*", "+", "."),
+   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               palette="jco",
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#PDF
+pdf(file=file.path(plot_folder, paste0("Survival_CAF_Prop_hi-low_DFS over",i,"_per_",k,".pdf")), width=6, height=6)
+print(p,newpage=FALSE)
+dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+#clean
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis clean, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+
+dat<-tdat_wide_ct %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% as_tibble()
+#if necessary: change group_id labels
+dat <-left_join(dat, ct)
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+
+
+subtype <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met","Relapse","DX.name")
+
+#define the naming subtype aka Celltype
+
+j <-"metacluster"
+
+plot_list <- list()
+for (i in subtype) {
+  #for(k in unique(dat$DX.name)){    #uncomment this if you want to split your loop e.g. by tumour type
+   # dat.l <-subset(dat, DX.name==k) #uncomment this if you want to split your loop e.g. by tumour type
+
+dat.l <-dat
+k="both tumour types" # comment this if you want to split by e.g. tumour type
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "metacluster", values_from ="metacluster", values_fn = list(metacluster=length),names_prefix = "")
+
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Celltype", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+
+  #} #uncomment this if you want to split by e.g. tumour type
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save individual pdf plots for each variable
+  pdf(file=file.path(plot_folder, paste0("DA_Celltype_n_over_",i,".pdf")))
+  gridExtra::grid.arrange(grobs = plot_list)
+  dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+
+surv_dat
+tdat_wide_ct
+```
+
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy
+```{r  KW W Fibro category proportions WO neo, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+#sce.all <- readRDS(file=file.path(data_folder, "FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_minusOther.rds"))
+sce.all$DX.name <- as.factor(sce.all$DX.name)
+sce.all$DX.name %>% replace_na("NA")
+sce.all$DX.name[is.na(sce.all$DX.name)] <- "NA"
+
+df <- as.data.frame(colData(sce.all[,sce.all$Patient_ID!="Control"&
+                                          sce.all$DX.name=="Adenocarcinoma"|
+                                          sce.all$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[df$Patient_ID%in% tdat_wide_ct$Patient_ID,]
+
+categories <- c("metacluster") 
+#categories <- c("DX.name", "Grade") 
+i <- c("metacluster") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-left_join(t, ct, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+     theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(ct)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p<0.05))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ pdf(file=file.path(plot_folder, paste0("KW_W_CAF_TME",i,".pdf")), width=15, height=15)
+  gridExtra::grid.arrange(grobs = plot_list, ncol=round(length(unique(t$Phenotype))/5))
+ dev.off()
+}
+```
+
+```{r, fig.width=6,fig.height=4}
+sce.all$OS_c <-ifelse(sce.all$OS >1231, "long OS","short OS")
+sce.all$DFS_c <-ifelse(sce.all$DFS >1427, "long DFS","short DFS")
+
+library(diffcyt)
+library(edgeR)
+library(ggplot2)
+library(ggthemes)
+#BiocManager::install(c("diffcyt", "edgeR"))
+#Patient Number must be numeric
+library(tidyr)
+library(dplyr)
+#groupId must be factor
+sce.all$DX.name <- as.factor(sce.all$DX.name)
+colData(sce.all)<-as.data.frame(colData(sce.all)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(sce.all))
+
+#if necessary: change group_id labels
+
+dat$DX.name <- as.factor(dat$DX.name)
+
+dat <- dat[is.na(dat$DX.name)==F,]
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+dat$DX.name %>% table
+
+subtype <-c("OS_c","DFS_c")
+
+#define the naming subtype aka Celltype
+
+j <-"cell_type"
+#subtype="OS_c"
+j <- "cell_subtype"
+plot_list <- list()
+
+for (i in subtype) {
+  #for(k in unique(dat$DX.name)){    #uncomment this if you want to split your loop e.g. by tumour type
+   # dat.l <-subset(dat, DX.name==k) #uncomment this if you want to split your loop e.g. by tumour type
+
+dat.l <-dat
+k="both tumour types" # comment this if you want to split by e.g. tumour type
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "cell_subtype", values_from ="cell_subtype", values_fn = list(cell_subtype=length),names_prefix = "")
+
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+library(ggthemes)
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Celltype", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+
+  #} #uncomment this if you want to split by e.g. tumour type
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save individual pdf plots for each variable
+  pdf(file=file.path(plot_folder, paste0("DA_all_Celltype_n_over_OS",i,".pdf")))
+  gridExtra::grid.arrange(grobs = plot_list)
+  dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+
+```{r, fig.width=8, fig.height=8}
+df_t <- table(df$cell_subtype, df$Patient_ID)
+
+df_t <- prop.table(df_t, margin=2)
+df_t <- as.data.frame(df_t)
+colnames(df_t) <- c("Phenotype", "Patient_ID","freq")
+df_ct<-left_join(df_t, ct, by.x="Patient_ID", by.y="Patient_ID")
+    
+
+head(df_ct)
+df_ct_w <- pivot_wider(df_ct,id_cols=c("metacluster"),names_from = "Phenotype", values_from ="freq", values_fn = mean,names_prefix = "")
+
+df_ct_w_m <- as.matrix(df_ct_w[,-1])
+rownames(df_ct_w_m) <-df_ct_w$metacluster
+
+pheatmap(scale(t(df_ct_w_m)))
+
+pheatmap(t(df_ct_w_m))
+
+
+cal_z_score <- function(x){
+  (x - mean(x)) / sd(x)
+}
+ 
+data_subset_norm <- t(apply(df_ct_w_m, 2, cal_z_score))
+pheatmap(data_subset_norm)
+
+pdf(file.path(plot_folder, "metacluster_k4_CAF_TME.pdf"))
+pheatmap(data_subset_norm)
+dev.off()
+
+#remove CAFs
+
+df_ct_w_s <-df_ct_w %>% select(-contains(c("CAF")))
+
+df_ct_w_s_m <- as.matrix(df_ct_w_s[,-1])
+rownames(df_ct_w_s_m) <-df_ct_w_s$metacluster
+
+data_subset_norm <- t(apply(df_ct_w_s_m, 2, cal_z_score))
+pheatmap(data_subset_norm)
+
+pdf(file.path(plot_folder, "metacluster_k4_CAF_TME_minusCAF.pdf"))
+pheatmap(data_subset_norm)
+dev.off()
+```
+
+
+#####
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+
+caf_strat <-dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat",paste0("CAF_strat_hi_low_proportion_",k,".csv")))
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               palette="jco",
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#PDF
+pdf(file=file.path(plot_folder, paste0("Survival_CAF_Prop_hi-low_OS over",i,"_per_",k,".pdf")), width=4, height=4)
+print(p,newpage=FALSE)
+dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               palette="jco",
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#PDF
+pdf(file=file.path(plot_folder, paste0("Survival_CAF_Prop_hi-low_DFS over",i,"_per_",k,".pdf")), width=4, height=4)
+print(p,newpage=FALSE)
+dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+} 
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=8, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+#correct
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_jco()
+p
+ggsave(plot=p, file=file.path(plot_folder, "Lasso_coxph_CAF_prop.pdf"), width = 8, height = 6)
+
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_jco()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival Fibro category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+
+
+caf_strat <-surv_dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_proportion.csv"))
+
+```
+
+## Survival Fibro category proportions high mid low not split
+```{r  survival Fibro category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Fibro category proportions high mid low split by tumour type
+```{r  survival Fibro category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+
+caf_strat <-dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat",paste0("CAF_strat_hi_mid_low_proportion_",k,".csv")))
+
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=8, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_tableau()
+p
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Fibro categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy
+```{r  KW W Fibro category proportions WO neo, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+#categories <- c("DX.name", "Grade") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+     theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p<0.05))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Proportions_WOneo",i,".pdf")), width=10, height=6)
+  gridExtra::grid.arrange(grobs = plot_list, ncol=round(length(unique(t$Phenotype))/2))
+  dev.off()
+}
+```
+
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy SPLIT by tumour type
+```{r  KW W Fibro category proportions WO neo SPLIT by tumour type, fig.width=12, fig.height=18, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","Stage","Grade","Gender") 
+#categories <- c("DX.name", "Grade") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype, DX.name) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+     # facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      facet_grid(Phenotype~DX.name)+
+      scale_color_viridis_d()+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+     theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat$i <-tdat[[i]]
+      tdat$i <- as.factor(tdat$i)
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+
+      pvalues <- tdat %>% 
+      group_by(Phenotype,DX.name) %>% 
+      summarise(p=wilcox.test(freq~i, paired=F)$p.value)
+      tdat <- left_join(tdat, pvalues, by.x = c("DX.name","Phenotype"), by.y =c("Phenotype","DX.name"), all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      
+#res <- tdat %>% group_by(DX.name) %>% 
+#       do(w = wilcox.test(freq~i, data=., paired=FALSE)) %>% 
+ #      summarise(DX.name, Wilcox = w$p.value)
+
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p<0.05))+#.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        facet_grid(Phenotype~DX.name+p.wt)+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+#here now
+```{r  KW W Fibro category proportions with NEO, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+categories <- "Grade"
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      scale_color_jco()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      #labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p <0.05))+#.data[[i]])
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        #labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        labs(x=paste(i),  fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=4) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Proportions_Wneo",i,"_new.pdf")), width=10, height=6)
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6)#ncol=round(length(unique(t$Phenotype))/2)
+  dev.off()
+}
+```
+
+
+```{r  KW W Fibro category proportions with NEO SPLIT by tumour type, fig.width=12, fig.height=18, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype, DX.name) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+     # facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      facet_grid(Phenotype~DX.name)+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+     theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat$i <-tdat[[i]]
+      tdat$i <- as.factor(tdat$i)
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+
+      pvalues <- tdat %>% 
+      group_by(Phenotype,DX.name) %>% 
+      summarise(p=wilcox.test(freq~i, paired=F)$p.value)
+      tdat <- left_join(tdat, pvalues, by.x = c("DX.name","Phenotype"), by.y =c("Phenotype","DX.name"), all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p<0.05))+#colour=.data[[i]]
+        geom_boxplot()+
+        geom_point()+
+       # facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        facet_grid(Phenotype~DX.name+p.wt)+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## **Fibro category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro category densities patients metaclusters:
+```{r  Fibro category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+
+  #remove super high density outliers (total density >10000)  
+ # tdat <-tdat[ !tdat$Patient_ID%in%surv_excl,]
+
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+
+#tdat_wide <-tdat_wide[ !tdat_wide$Patient_ID%in%surv_excl,]
+
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+##ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE,fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=8))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+
+
+```
+
+```{r  heatmap metacluster cell category expression density}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "density",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+pheatmap(scale(t(tdat_ct_w_m)))
+
+pheatmap(t(tdat_ct_w_m))
+```
+
+
+## Survival analysis over Fibro category densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:12])
+#surv_excl <-surv_dat[surv_dat$total_density>10000,]$Patient_ID
+
+#surv_dat <-surv_dat[ !surv_dat$Patient_ID%in%surv_excl]
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+
+caf_strat <-surv_dat %>% select(contains(c("_G", "Patient_ID")))
+write.csv(caf_strat,file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+```
+
+#CoxPH for Fibro category density corrected for Stage, Grade and M
+```{r  Coxph Fibro category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   IDO_CAF + SMA_CAF+PDPN_CAF +hypoxic_CAF+hypoxic_tpCAF+tpCAF+ dCAF+iCAF+vCAF+mCAF+Collagen_CAF+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+scale_colour_jco()
+#
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, "CAF_density_coxph.pdf"), width=6, height=4)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=4}
+#Not split
+#surv <-c("metacluster")
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+#surv <-c("metacluster")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               palette="jco",
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+pdf(file=file.path(plot_folder, paste0("Survival_Density-CAF_hi-low_OS over",i,"_per_",k,".pdf")), width=4, height=4)
+print(p,newpage=FALSE)
+dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               palette="jco",
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+pdf(file=file.path(plot_folder, paste0("Survival_Density-CAF_hi-low_DFS over",i,"_per_",k,".pdf")), width=4, height=4)
+print(p,newpage=FALSE)
+dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+caf_strat <- dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat", paste0("CAF_strat_hi_low_density_split_",k,".csv")))
+  }
+}
+
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               palette="jco",
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+pdf(file=file.path(plot_folder, paste0("Survival_Density-CAF_hi-low_OS over",i,"_per_",k,".pdf")), width=4, height=4)
+print(p,newpage=FALSE)
+dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               palette="jco",
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+pdf(file=file.path(plot_folder, paste0("Survival_Density-CAF_hi-low_DFS over",i,"_per_",k,".pdf")), width=4, height=4)
+print(p,newpage=FALSE)
+dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=8, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_jco()
+p
+ggsave(plot=p, file=file.path(plot_folder, "Lasso_coxph_CAF-density.pdf"))
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Fibro_Density_high-low_Lasso.pdf")), plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:12])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <=summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+
+caf_strat <- dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat", paste0("CAF_strat_hi_mid_low_density.csv")))
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+#surv <-"metacluster"
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+
+caf_strat <- dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat", paste0("CAF_strat_hi_mid_low_density_split_",k,".csv")))
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=8, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_tableau()
+p
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro category density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=8}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+#categories <- c("DX.name","Grade")
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+#categories <- c("DX.name","Grade")
+
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+        
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=p<0.05))+ #.data[[i]]
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        #facet_grid(Phenotype~DX.name+p.wt)+
+        scale_color_viridis_d()+
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Densities_WOneo",i,".pdf")), width=12, height=8)
+ gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ dev.off()
+}
+```
+
+
+```{r  KW W Fibro category density WO neo split by tumour type,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=18}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+#categories <- c("DX.name","Grade")
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","Stage","Grade","Gender")
+#categories <- c("DX.name","Grade")
+
+plot_list <- list()
+
+
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat$i <-tdat[[i]]
+        tdat$i <- as.factor(tdat$i)
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+        
+      pvalues <- tdat %>% 
+      group_by(Phenotype, DX.name) %>% 
+      summarise(p=wilcox.test(density~i, paired=F)$p.value)
+      tdat <- left_join(tdat, pvalues, by.x = c("Phenotype","DX.name"), by.y =c("Phenotype","DX.name"), all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=p<0.05))+#.data[[i]]
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Phenotype~DX.name+p.wt)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Densities_WOneo",i,".pdf")), width=12, height=8)
+ #gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+# dev.off()
+}
+
+
+```
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro Category density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=8}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+     stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), color=p>0.05))+#.data[[i]]
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #facet_grid(Phenotype~DX.name+p.wt)+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+         scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(T, F)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Densities_Wneo",i,".pdf")), width=12, height=8)
+ gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ dev.off()
+}
+```
+
+
+```{r  KW Wilcox Fibro Category density W neo split by tumouor,warning=FALSE, message=FALSE, echo=FALSE,fig.width=12, fig.height=18}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype, DX.name) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      #facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      facet_grid(Phenotype~DX.name)+
+      scale_color_viridis_d()+
+      theme_bw()+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat$i <-tdat[[i]]
+        tdat$i <- as.factor(tdat$i)
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+        
+      pvalues <- tdat %>% 
+      group_by(Phenotype, DX.name) %>% 
+      summarise(p=wilcox.test(density~i, paired=F)$p.value)
+      tdat <- left_join(tdat, pvalues, by.x = c("Phenotype","DX.name"), by.y =c("Phenotype","DX.name"), all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=p<0.05))+#colour=.data[[i]]
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        #facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        facet_grid(Phenotype~DX.name+p.wt)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Densities_WOneo",i,".pdf")), width=12, height=8)
+ #gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+# dev.off()
+}
+
+
+```
+
+```{r}
+#proportions
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Squamous cell carcinoma.csv"))
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Adenocarcinoma.csv"))
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+```
+
+```{r}
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+unique(colnames(df))
+table(df$tpCAF_G, df$Collagen_CAF_G)
+table(df$tpCAF_G, df$hypoxic_CAF_G)
+table(df$tpCAF_G, df$mCAF_G)
+table(df$tpCAF_G, df$SMA_CAF_G)
+table(df$tpCAF_G, df$tpCAF_G)
+table(df$tpCAF_G, df$iCAF_G)
+table(df$tpCAF_G, df$vCAF_G)
+table(df$tpCAF_G, df$dCAF_G)
+table(df$tpCAF_G, df$hypoxic_tpCAF_G)
+table(df$tpCAF_G, df$IDO_CAF_G)
+table(df$tpCAF_G, df$PDPN_CAF_G)
+
+table(df$Collagen_CAF_G, df$hypoxic_CAF_G)
+table(df$Collagen_CAF_G, df$mCAF_G)
+table(df$Collagen_CAF_G, df$SMA_CAF_G)
+table(df$Collagen_CAF_G, df$tpCAF_G)
+table(df$Collagen_CAF_G, df$iCAF_G)
+table(df$Collagen_CAF_G, df$vCAF_G)
+table(df$Collagen_CAF_G, df$dCAF_G)
+table(df$Collagen_CAF_G, df$hypoxic_tpCAF_G)
+table(df$Collagen_CAF_G, df$IDO_CAF_G)
+table(df$Collagen_CAF_G, df$PDPN_CAF_G)
+
+table(df$hypoxic_CAF_G, df$mCAF_G)
+table(df$hypoxic_CAF_G, df$SMA_CAF_G)
+table(df$hypoxic_CAF_G, df$tpCAF_G)
+table(df$hypoxic_CAF_G, df$iCAF_G)
+table(df$hypoxic_CAF_G, df$vCAF_G)
+table(df$hypoxic_CAF_G, df$dCAF_G)
+table(df$hypoxic_CAF_G, df$hypoxic_tpCAF_G)
+table(df$hypoxic_CAF_G, df$IDO_CAF_G)
+table(df$hypoxic_CAF_G, df$PDPN_CAF_G)
+
+table(df$mCAF_G, df$SMA_CAF_G)
+table(df$mCAF_G, df$tpCAF_G)
+table(df$mCAF_G, df$iCAF_G)
+table(df$mCAF_G, df$vCAF_G)
+table(df$mCAF_G, df$dCAF_G)
+table(df$mCAF_G, df$hypoxic_tpCAF_G)
+table(df$mCAF_G, df$IDO_CAF_G)
+table(df$mCAF_G, df$PDPN_CAF_G)
+
+table(df$iCAF_G, df$tpCAF_G)
+table(df$iCAF_G, df$iCAF_G)
+table(df$iCAF_G, df$vCAF_G)
+table(df$iCAF_G, df$dCAF_G)
+table(df$iCAF_G, df$hypoxic_tpCAF_G)
+table(df$iCAF_G, df$IDO_CAF_G)
+table(df$iCAF_G, df$PDPN_CAF_G)
+
+table(df$vCAF_G, df$dCAF_G)
+table(df$vCAF_G, df$hypoxic_tpCAF_G)
+table(df$vCAF_G, df$IDO_CAF_G)
+table(df$vCAF_G, df$PDPN_CAF_G)
+
+table(df$hypoxic_tpCAF_G, df$IDO_CAF_G)
+table(df$hypoxic_tpCAF_G, df$PDPN_CAF_G)
+
+table(df$IDO_CAF_G, df$PDPN_CAF_G)
+
+
+clinical.data
+```
+
+```{r}
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+#Area_px_Stroma
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+
+  #remove super high density outliers (total density >10000)  
+ # tdat <-tdat[ !tdat$Patient_ID%in%surv_excl,]
+#CAF type distribution over all patients
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+
+#tdat_wide <-tdat_wide[ !tdat_wide$Patient_ID%in%surv_excl,]
+
+```
+
+## All Density here
+```{r density all strat high low, fig.width=4, fig.height=4, message=F, warning=F}
+#here
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+#Density
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density.csv"))
+
+
+df$X <-NULL
+tdat_n <- tdat %>% 
+  group_by(Patient_ID) %>% 
+  summarise(density = sum(density))
+tdat_n$Phenotype <- "Total Stroma density"
+tdat_n$Phenotype <- as.factor(tdat_n$Phenotype)
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- left_join(tdat_n, df, by="Patient_ID")
+
+ac_sqc <- clinical.data[clinical.data$DX.name=="Adenocarcinoma"|clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G") #,"Density_G"
+plot_list <- list()
+
+
+for (i in (categories)) {
+hl_m <- left_join(tdat_n, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l$i <- df_l[[i]]
+      df_l$i <-as.factor(df_l$i)
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+      df_l <- left_join(df_l, clinical.data, by="Patient_ID")
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      #group_by(Phenotype,Grade) %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(density~i, paired=F)$p.value)
+     # df_l <- merge(df_l, pvalues, by.x = c("Phenotype","Grade"), by.y =c("Phenotype","Grade"), all.x = TRUE)
+      df_l <- merge(df_l, pvalues, by.x = c("Phenotype"), by.y =c("Phenotype"), all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #facet_grid(Phenotype~Grade+p.wt)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=1) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+## SQC Density
+```{r density all strat high low, fig.width=4, fig.height=4, message=F, warning=F}
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Squamous cell carcinoma.csv"))
+
+#Density
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density.csv"))
+
+
+df$X <-NULL
+tdat_n <- tdat %>% 
+  group_by(Patient_ID) %>% 
+  summarise(density = sum(density))
+tdat_n$Phenotype <- "Total Stroma density"
+tdat_n$Phenotype <- as.factor(tdat_n$Phenotype)
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- left_join(tdat_n, df, by="Patient_ID")
+
+ac_sqc <- clinical.data[clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G") #,"Density_G"
+plot_list <- list()
+
+
+for (i in (categories)) {
+hl_m <- left_join(tdat_n, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l$i <- df_l[[i]]
+      df_l$i <-as.factor(df_l$i)
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+      df_l <- left_join(df_l, clinical.data, by="Patient_ID")
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+     # group_by(Phenotype,Grade) %>% 
+      group_by(Phenotype) %>%   
+      summarise(p=wilcox.test(density~i, paired=F)$p.value)
+     # df_l <- merge(df_l, pvalues, by.x = c("Phenotype","Grade"), by.y =c("Phenotype","Grade"), all.x = TRUE)
+      df_l <- merge(df_l, pvalues, by.x = c("Phenotype"), by.y =c("Phenotype"), all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #facet_grid(Phenotype~Grade+p.wt)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=1) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+## AC Density
+```{r density all strat high low, fig.width=4, fig.height=4, message=F, warning=F}
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Adenocarcinoma.csv"))
+
+#Density
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density.csv"))
+
+
+df$X <-NULL
+tdat_n <- tdat %>% 
+  group_by(Patient_ID) %>% 
+  summarise(density = sum(density))
+tdat_n$Phenotype <- "Total Stroma density"
+tdat_n$Phenotype <- as.factor(tdat_n$Phenotype)
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- left_join(tdat_n, df, by="Patient_ID")
+
+ac_sqc <- clinical.data[clinical.data$DX.name=="Adenocarcinoma",]$Patient_ID
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G") #,"Density_G"
+plot_list <- list()
+
+
+for (i in (categories)) {
+hl_m <- left_join(tdat_n, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l$i <- df_l[[i]]
+      df_l$i <-as.factor(df_l$i)
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+      df_l <- left_join(df_l, clinical.data, by="Patient_ID")
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+     # group_by(Phenotype,Grade) %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(density~i, paired=F)$p.value)
+     # df_l <- merge(df_l, pvalues, by.x = c("Phenotype","Grade"), by.y =c("Phenotype","Grade"), all.x = TRUE)
+      df_l <- merge(df_l, pvalues, by.x = c("Phenotype"), by.y =c("Phenotype"), all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=p<0.05))+#.data[[i]]
+        #geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #facet_grid(Phenotype~Grade+p.wt)+
+        #ggtitle("paired")+
+        scale_color_tableau()+
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=1) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+#Distance binning
+```{r, Distance binning}
+dat.fibro <-as_tibble(colData(fibro.sce_pat.roi))
+#dat.fibro$Distance <- dat.fibro$Compartment
+dat.fibro$cell_type <-factor(dat.fibro$cell_type)
+#comp
+dat.fibro$Distance_comp <- cut(dat.fibro$Compartment, breaks=c((-10000000), 0, 10000000), labels=c( "Stroma", "Tumour"))
+
+#10px
+dat.fibro$Distance_30px <- cut(dat.fibro$Compartment, breaks=c(-10000, -150, -120, -90,-60, -30, 0, 30, 60, 90, 1000), labels=c("<(-150)","-150 - (-120)", "-120 - (-90)", "-90 - (-60)", "-60 - (-30)", "-30 - 0", "0 - 30", "30 - 60", "60 - 90", ">90" ))
+dat.fibro$Distance_30px <- factor(dat.fibro$Distance_30px, levels = c("<(-150)","-150 - (-120)", "-120 - (-90)", "-90 - (-60)", "-60 - (-30)", "-30 - 0", "0 - 30", "30 - 60", "60 - 90", ">90"))
+
+```
+
+#looping through distances based on phenotype
+```{r, looping through distances, fig.height=12, fig.width=25}
+#dat.fibro <-as_tibble(colData(sub.sce))
+#get distance binning from column names
+categories <- grep("Distance_", names(dat.fibro), value=TRUE)
+categories <- c("Distance_30px")
+i <- c("Distance_30px")
+
+df <- dat.fibro
+
+plot_list <- list()
+
+for (i in (categories)) {
+  #plot_list <-list()
+  t<- table(df$cell_type, df$Patient_ID, as.matrix(df[[i]]))
+  t <-as.data.frame(t)
+  colnames(t) <- c("Phenotype", "Patient", i,"n")
+  t$Phenotype <-factor(t$Phenotype)
+  t[[i]] <-factor(t[[i]], levels=levels(df[[i]]))
+   #calculate frequency over Phenotype
+  t<-t %>%
+    #dplyr::count(Phenotype, i,Patient) %>%
+    dplyr::group_by(Patient) %>%
+    dplyr::mutate(freq = n / sum(n))
+  
+  tdat<-t
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat <-subset(tdat, Phenotype==j)
+    tdat$i <-tdat[[i]]
+    
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs(x="Distance", y="Proportion", fill="Area_Phenotype")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+     # stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+      tdat <-subset(tdat, Phenotype==j)
+  detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+  pvalues <- tdat %>% 
+  group_by(Phenotype) %>% 
+  summarise(p=wilcox.test(freq~i)$p.value)
+  tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+  tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+  #Plot list, all js together over i
+  plot_list[[j]]<-
+    ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+    geom_boxplot()+
+    geom_point()+
+    facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+    theme(axis.title.x=element_text("Phenotype"))+
+    scale_color_viridis_d()+
+    theme_bw()+
+    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+    labs(x="Distance", y="Proportion", fill="Area_Phenotype")+
+    theme(axis.text.x =element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+    theme(legend.position = "none")
+    }
+   }  
+  #plot
+  suppressWarnings(gridExtra::grid.arrange(grobs = plot_list, ncol=round(length(unique(t$Phenotype))/2)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(results_folder, paste0("CAF_Type_",i,"_new.pdf")),width=12, height=6)
+  #gridExtra::grid.arrange(grobs = plot_list, ncol=round(length(unique(t$Phenotype))/2))
+  #dev.off()
+} 
+```
+
+
+
+#looping through phenotypes based on distances
+```{r, looping through distances, fig.height=12, fig.width=25}
+#dat.fibro <-as_tibble(colData(sub.sce))
+#get distance binning from column names
+categories <- grep("Distance_", names(dat.fibro), value=TRUE)
+categories <- c("Distance_comp")
+i <- c("Distance_30px")
+categories <- c("Distance_30px")
+
+df <- dat.fibro
+
+plot_list <- list()
+
+
+
+
+
+for (i in (categories)) {
+  #plot_list <-list()
+  t<- table(df$cell_type, df$Patient_ID,df[[i]])
+  t <-as.data.frame(t)
+  colnames(t) <- c("Phenotype", "Patient", i,"n")
+  t$Phenotype <-factor(t$Phenotype)
+  t[[i]] <-factor(t[[i]], levels=levels(df[[i]]))
+   #calculate frequency over Phenotype
+  t<-t %>%
+    #dplyr::count(Phenotype, i,Patient) %>%
+    dplyr::group_by(Patient, Distance_30px) %>%
+    dplyr::mutate(freq = n / sum(n))
+  t$freq[is.na(t$freq)==T] <-0
+  tdat<-t
+  
+  for (j in unique(t$Distance_30px)){
+    tdat<-t
+    tdat <-subset(tdat, Distance_30px==j)
+    tdat$i <-tdat$Phenotype
+    
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Distance_30px) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= Phenotype, y = freq, colour=Phenotype))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Distance_30px, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs(x="Distance", y="Proportion", fill="Area_Phenotype")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+    # stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+   
+  #plot
+  suppressWarnings(gridExtra::grid.arrange(grobs = plot_list, ncol=round(length(unique(t$Phenotype))/2)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(results_folder, paste0("CAF_Type_",i,"_new.pdf")),width=12, height=6)
+  #gridExtra::grid.arrange(grobs = plot_list, ncol=round(length(unique(t$Phenotype))/2))
+  #dev.off()
+} 
+
+dat.fibro %>%
+  select(Patient_ID,CellType, Compartment) %>%
+  group_by(CellType) %>%
+ summarise(across( fun=mean))
+
+
+dat.fibro %>%
+  group_by(CellType) %>%
+  summarise(mean = mean(Compartment), n = n())
+
+summary(dat.fibro$Compartment)
+stat(dat.fibro$Compartment, dat.fibro$CellType)
+```
+########################################################################################################################################################
+
+##Analysis
+#Proportions Fibro CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Fibro Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Fibro category proportions
+```{r  optimal number of clusters for non Fibro category subtype proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro subtype proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro subtype proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")
+#plot(p)
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 14) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+##ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_subtype_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro subtype proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Fibro category
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro subtype proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+surv_dat$mCAF_MMP11_G <- ifelse(surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+surv_dat$mCAF_Col_Cdh_G <- ifelse(surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+surv_dat$tpCAF_CD10_G <- ifelse(surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+surv_dat$tpCAF_CD73_G <- ifelse(surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+surv_dat$iCAF_CD248_G <- ifelse(surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+surv_dat$iCAF_CD34_G <- ifelse(surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G","iCAF_CD34_G", "iCAF_CD248_G","metacluster")
+```
+
+
+#CoxPH for Fibro category proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro subtype proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   mCAF_MMP11+PDPN_CAF +mCAF_Col_Cdh+iCAF_CD34+iCAF_CD248+ IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+tpCAF_CD10+tpCAF_CD73+hypoxic_CAF+SMA_CAF+Collagen_CAF+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro subtype proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro subtype proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G","iCAF_CD34_G", "iCAF_CD248_G","metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+dat$mCAF_MMP11_G <- ifelse(dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+dat$mCAF_Col_Cdh_G <- ifelse(dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+dat$tpCAF_CD10_G <- ifelse(dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+dat$tpCAF_CD73_G <- ifelse(dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+dat$iCAF_CD248_G <- ifelse(dat$iCAF_CD248 >summary(dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+dat$iCAF_CD34_G <- ifelse(dat$iCAF_CD34 >summary(dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r  coxph Fibro subtype proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correrct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_tableau()
+p
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival Fibro subtype proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh <=summary(surv_dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[2]& surv_dat$mCAF_Col_Cdh<=summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 <=summary(surv_dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[2]& surv_dat$mCAF_MMP11<=summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 <=summary(surv_dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[2]& surv_dat$tpCAF_CD10<=summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 <=summary(surv_dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[2]& surv_dat$tpCAF_CD73<=summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 <=summary(surv_dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[2]& surv_dat$iCAF_CD34<=summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 <=summary(surv_dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[2]& surv_dat$iCAF_CD248<=summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","SMA_CAF_G","hypoxic_CAF_G","Collagen_CAF_G","tpCAF_CD73_G","tpCAF_CD10_G","mCAF_Col_Cdh_G","mCAF_MMP11_G","iCAF_CD34_G", "iCAF_CD248_G", "metacluster")
+
+```
+
+## Survival Fibro category proportions high mid low not split
+```{r  survival Fibro subtype proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Fibro category proportions high mid low split by tumour type
+```{r  survival Fibro subtype proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh <=summary(dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[2]& dat$mCAF_Col_Cdh<=summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 <=summary(dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[2]& dat$mCAF_MMP11<=summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 <=summary(dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[2]& dat$tpCAF_CD10<=summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 <=summary(dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[2]& dat$tpCAF_CD73<=summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+dat$iCAF_CD34_G[dat$iCAF_CD34 <=summary(dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[2]& dat$iCAF_CD34<=summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+dat$iCAF_CD248_G[dat$iCAF_CD248 <=summary(dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[2]& dat$iCAF_CD248<=summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph Fibro subtype proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_tableau()
+p
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Fibro category proportions with clinical parameters
+```{r  Correlation of non Fibro subtype proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE, eval=F}
+fibro.sce_pat$NeoAdj <- ifelse(fibro.sce_pat$Chemo==1 |fibro.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Fibro categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy
+```{r  KW W Fibro subtype proportions WO neo, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p<0.05))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r  KW W Fibro subtype proportions with NEO, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(fibro.sce_pat[,fibro.sce_pat$Patient_ID!="Control"&
+                                          fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=p<0.05))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **Fibro subtype**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro subtype densities patients metaclusters:
+```{r  Fibro subtype density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro subtype, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro subtype together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro subtype densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 10) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+##ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-subtype-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro subtype densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Fibro subtype densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro subtype densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:15])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+surv_dat$mCAF_MMP11_G <- ifelse(surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+surv_dat$mCAF_Col_Cdh_G <- ifelse(surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+surv_dat$tpCAF_CD10_G <- ifelse(surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+surv_dat$tpCAF_CD73_G <- ifelse(surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+surv_dat$iCAF_CD248_G <- ifelse(surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+surv_dat$iCAF_CD34_G <- ifelse(surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+```
+
+#CoxPH for Fibro subtype density corrected for Stage, Grade and M
+```{r  Coxph Fibro subtype density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   IDO_CAF + SMA_CAF+PDPN_CAF+Collagen_CAF +hypoxic_CAF+hypoxic_tpCAF+tpCAF_CD10+tpCAF_CD73+ dCAF+iCAF_CD34+iCAF_CD248+vCAF+mCAF_MMP11+mCAF_Col_Cdh+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro subtype densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro subtype densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+dat$mCAF_MMP11_G <- ifelse(dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+dat$mCAF_Col_Cdh_G <- ifelse(dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+dat$tpCAF_CD10_G <- ifelse(dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+dat$tpCAF_CD73_G <- ifelse(dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+dat$iCAF_CD248_G <- ifelse(dat$iCAF_CD248 >summary(dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+dat$iCAF_CD34_G <- ifelse(dat$iCAF_CD34 >summary(dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro subtype densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_tableau()
+p
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-subtype_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro subtype densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro subtype densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:15])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh <=summary(surv_dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[2]& surv_dat$mCAF_Col_Cdh<=summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 <=summary(surv_dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[2]& surv_dat$mCAF_MMP11<=summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 <=summary(surv_dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[2]& surv_dat$tpCAF_CD10<=summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 <=summary(surv_dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[2]& surv_dat$tpCAF_CD73<=summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 <=summary(surv_dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[2]& surv_dat$iCAF_CD34<=summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 <=summary(surv_dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[2]& surv_dat$iCAF_CD248<=summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro subtype densities split by tumour type
+```{r  survival Fibro subtype densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro subtype densities split by tumour type
+```{r  survival Fibro subtype densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_CD34_G","iCAF_CD248_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh <=summary(dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[2]& dat$mCAF_Col_Cdh<=summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 <=summary(dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[2]& dat$mCAF_MMP11<=summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 <=summary(dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[2]& dat$tpCAF_CD10<=summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 <=summary(dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[2]& dat$tpCAF_CD73<=summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+dat$iCAF_CD34_G[dat$iCAF_CD34 <=summary(dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[2]& dat$iCAF_CD34<=summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+dat$iCAF_CD248_G[dat$iCAF_CD248 <=summary(dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[2]& dat$iCAF_CD248<=summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro subtype densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_tableau()
+p
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-subtype_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro subtype density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=18, fig.height=12}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:15])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:16])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=p<0.05))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-subtype_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro subtype density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=18, fig.height=12}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:15])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:16])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=p<0.05))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_colour_manual(values = setNames(c(palette("Tableau 10")[1:2]),c(F, T)))+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-subtype_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation Fibro subtype densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of CAF type densities",
+        mar=c(0,0,3,0))
+
+col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
+
+pdf(file=file.path(plot_folder, "CAF_density_corrplot_upper.pdf"), width=6, height=6)
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         type="upper",
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+        #col=rev(brewer.pal(8,"RdBu")),
+        col=rev(COL2('RdBu', 8)),
+        method="square",
+         pch.col = "white",
+         #type="upper",
+       #  title = "Correlation of CAF type densities",
+        mar=c(0,0,3,0))
+dev.off()
+
+#col=wes_palette("Zissou1", 100, type = "continuous"),
+```
+
+
+```{r  Correlations T cell type densities amongst eachother per roi, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of T cell type densities",
+        mar=c(0,0,3,0))
+```
+
+
+
+
+```{r  Correlations proportions per image CAF Type densities, fig.width=10, fig.height=10, message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$DX.name!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$DX.name!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation CAF Type densities",
+        mar=c(0,0,3,0))
+```
+
+
+
+```{r  Correlations proportions per image CAF types,message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_roi$NeoAdj <- ifelse(fibro.sce_roi$Chemo==1 |fibro.sce_roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_roi[,fibro.sce_roi$DX.name!="Control"&
+                                          fibro.sce_roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_roi[,fibro.sce_roi$DX.name!="Control"&
+                                          fibro.sce_roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$RoiID <-droplevels(tdat$RoiID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+#tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions",
+        mar=c(0,0,3,0))
+
+pdf(file=file.path(plot_folder, "CAF_proportion_corrplot.pdf"), width=6, height=6)
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         type="lower",
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+        #col=rev(brewer.pal(8,"RdBu")),
+        col=rev(COL2('RdBu', 8)),
+        method="square",
+         pch.col = "white",
+         #type="upper",
+        # title = "Correlation of CAF type proportions",
+        mar=c(0,0,3,0))
+dev.off()
+```
+
+```{r  Correlations proportions per patient CAF types,message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$DX.name!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$DX.name!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+rownames(tdat_wide)<-tdat_wide$Patient_ID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions per patient",
+        mar=c(0,0,3,0))
+```
+
+
+#Differential abundance loop
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+
+  
+colData(fibro.sce_pat[, fibro.sce_pat$DX.name=="Adenocarcinoma"| fibro.sce_pat$DX.name=="Squamous cell carcinoma"])<-as.data.frame(colData(fibro.sce_pat[, fibro.sce_pat$DX.name=="Adenocarcinoma"| fibro.sce_pat$DX.name=="Squamous cell carcinoma"])) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(fibro.sce_pat[, fibro.sce_pat$DX.name=="Adenocarcinoma"| fibro.sce_pat$DX.name=="Squamous cell carcinoma"]))
+#if necessary: change group_id labels
+
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#subtype <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+
+subtype <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met","Radio","DX.name")
+#subtype <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met","Radio","NeoAdj")
+
+#subtype <- "Relapse"
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming subtype aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"cell_type"
+
+plot_list <- list()
+for (i in subtype) {
+  #for(k in unique(dat$DX.name)){
+  # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "cell_type", values_from ="cell_type", values_fn = list(cell_type=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  scale_fill_jco()+
+  labs(x="Fibroblast Type", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+##ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+ # }
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+ pdf(file=file.path(plot_folder, paste0("DA_CAF_n_over_",i,"_NOTsplit.pdf")), width=6, height=4)
+  gridExtra::grid.arrange(grobs = plot_list)
+  dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+#clean
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis clean, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+
+colData(fibro.sce_pat)<-as.data.frame(colData(fibro.sce_pat)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(fibro.sce_pat))
+#if necessary: change group_id labels
+
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+
+
+subtype <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met","Relapse")
+
+#define the naming subtype aka Celltype
+
+j <-"cell_type"
+
+plot_list <- list()
+for (i in subtype) {
+  for(k in unique(dat$DX.name)){    #uncomment this if you want to split your loop e.g. by tumour type
+    dat.l <-subset(dat, DX.name==k) #uncomment this if you want to split your loop e.g. by tumour type
+
+#dat.l <-dat
+#k="both tumour types" # comment this if you want to split by e.g. tumour type
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "cell_type", values_from ="cell_type", values_fn = list(cell_type=length),names_prefix = "")
+
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Celltype", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+
+  } #uncomment this if you want to split by e.g. tumour type
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save individual pdf plots for each variable
+  pdf(file=file.path(plot_folder, paste0("NEW_DA_Celltype_n_over_",i,"_in_",k,".pdf")))
+  gridExtra::grid.arrange(grobs = plot_list)
+  dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+#############################################################################################################################################
+#only mCAF and SMA high CAF
+
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro subtype proportions ordered by hierarchical clustering only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat[tdat$Phenotype=="mCAF"|tdat$Phenotype=="SMA_CAF"|tdat$Phenotype=="iCAF",]
+tdat <-tdat[tdat$Phenotype=="mCAF"|tdat$Phenotype=="SMA_CAF",]
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro subtype proportions only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 10")
+#plot(p)
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-subtype_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =2) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 10")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+##ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro subtype proportions only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 10")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-subtype_proportions.pdf")))
+plot(p)
+
+#ordered by prop
+p <-ggplot(tdat_ct,aes(y=reorder(freq, desc()),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 10")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+plot(p)
+```
+```{r  heatmap metacluster cell subtype expression proportions}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "freq",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+#pheatmap(scale(t(tdat_ct_w_m)))
+library(pheatmap)
+pheatmap(t(tdat_ct_w_m))
+```
+
+## **Survival analysis** - Fibro subtype
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro subtype proportions high-low only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+#surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+
+caf_strat <-surv_dat %>% select(contains(c("Patient_ID","_G")))
+#write.csv(caf_strat, file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+surv <-c("SMA_CAF_G","mCAF_G","metacluster")
+#surv <-"metacluster"
+```
+
+
+#CoxPH for Fibro subtype proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro subtype proportions only mCAF SMA, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+#res.cox <- coxph(Surv(time, status) ~   iCAF+mCAF+SMA_CAF+ Stage+Grade+M.new, data =  surv_dat)
+res.cox <- coxph(Surv(time, status) ~   mCAF+SMA_CAF+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro subtype proportions not split high low only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+ 
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+  #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro subtype proportions split high low only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+    #dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+
+#caf_strat <-dat %>% select(contains(c("Patient_ID","_G")))
+##write.csv(caf_strat, file=file.path(wd,"patient_strat",paste0("CAF_strat_hi_low_proportion_",k,".csv")))
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
diff --git a/Analysis_Fibro_DensitiesperStroma.Rmd b/Analysis_Fibro_DensitiesperStroma.Rmd
new file mode 100644
index 0000000..6796899
--- /dev/null
+++ b/Analysis_Fibro_DensitiesperStroma.Rmd
@@ -0,0 +1,3213 @@
+---
+title: "R Notebook - Analysis Fibros density calculated by stroma area only"
+output:
+  html_document:
+    df_print: paged
+---
+```{r load libraries, echo=F, message=F, warning=F}
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+#library(uwot)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+set.seed(101100)
+```
+
+
+
+```{r  Set wd and load data}
+#set working directory 
+wd <-dirname(getwd())
+
+#clinical.data
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+fibro.sce <- readRDS(file=file.path(data_folder, "FIBRO_CLINICAL-DATA_FILTERED.rds"))
+fibro.sce$DX.name[is.na(fibro.sce$DX.name)]<-"NA"
+
+```
+
+Define clinical data
+```{r clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+
+```
+Table with Immune cell numbers including Fibros removing undefined
+```{r  Table with immune cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All Fibros together
+tbl <- as.data.frame(table(fibro.sce[,fibro.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Fibro number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+##print(tbl)
+
+summary(tbl$`Fibro number overall`) #3-6259
+tbl[tbl$`Fibro number overall` <=100,] #43 patients have less than 100 F
+
+#Lowest 10% of all patients' Fibro numbers
+tbl[order(tbl$`Fibro number overall`),]$`Fibro number overall`[1:100] #1-23
+
+
+#Highest 10% of all patients' Fibro numbers
+tbl[order(-tbl$`Fibro number overall`),]$`Fibro number overall`[1:100] #760-2378
+
+tbl[tbl$`Fibro number overall` <=100,] #43 patients have less than 100 Fibros=10%. 77 patients have less than 15 Fibros
+
+all_fibro_pat <- tbl[tbl$`Fibro number overall` <=100,]$`Patient ID`
+
+length(unique(fibro.sce$Patient_ID))
+```
+
+Remove ROIs
+```{r  Table with Immune cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(fibro.sce[,fibro.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Fibro number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Fibro numbers per ROI overall including Fibros-excluding undefined.csv")))
+#print(tbl)
+summary(tbl$`Fibro number overall`) #1-5075
+tbl[tbl$`Fibro number overall` <=10,] #93 images have less than 50 Fibros (10% of median)
+#Lowest 10% of all patients' Fibro numbers
+tbl[order(tbl$`Fibro number overall`),]$`Fibro number overall`[1:200] #1-110
+
+
+#Highest 10% of all patients' Fibro numbers
+tbl[order(-tbl$`Fibro number overall`),]$`Fibro number overall`[1:200] #1385-5075
+#per image cut at 50 Fibros per image equals lowest 5% -> ensures that there's at least 50 Fibros per patient
+
+all_fibro_roi <- tbl[tbl$`Fibro number overall` <=50,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+fibro.sce_pat <- fibro.sce[,fibro.sce$Patient_ID!="Control"&
+                               !fibro.sce$Patient_ID%in%all_fibro_pat]
+length(unique(fibro.sce_pat$Patient_ID)) #1025
+
+#Roi removal
+fibro.sce_roi <- fibro.sce[,fibro.sce$Patient_ID!="Control"&
+                               !fibro.sce$RoiID%in%all_fibro_roi]
+length(unique(fibro.sce_roi$Patient_ID)) #1039
+
+#Patient & Roi removal
+fibro.sce_pat.roi <- fibro.sce[,fibro.sce$Patient_ID!="Control"&
+                               !fibro.sce$Patient_ID%in%all_fibro_pat&
+                               !fibro.sce$RoiID%in%all_fibro_roi]
+length(unique(fibro.sce_pat.roi$Patient_ID)) #1025
+```
+
+```{r  Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","stroma","CAF"))
+
+saveRDS(fibro.sce_pat.roi,file=file.path(data_folder, paste("Fibro_sce_pat_roi_rem.rds",sep="")))
+saveRDS(fibro.sce_roi,file=file.path(data_folder, paste("Fibro_sce_roi_rem.rds",sep="")))
+saveRDS(fibro.sce_pat,file=file.path(data_folder, paste("Fibro_sce_pat_rem.rds",sep="")))
+
+fibro.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Fibro_sce_pat_roi_rem.rds",sep="")))
+fibro.sce_roi <-readRDS(file=file.path(data_folder, paste("Fibro_sce_roi_rem.rds",sep="")))
+fibro.sce_pat <-readRDS(file=file.path(data_folder, paste("Fibro_sce_pat_rem.rds",sep="")))
+```
+
+
+
+## **Fibro category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro category densities patients metaclusters:
+```{r  Fibro category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+
+  #remove super high density outliers (total density >10000)  
+ # tdat <-tdat[ !tdat$Patient_ID%in%surv_excl,]
+#CAF type distribution over all patients
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+
+#tdat_wide <-tdat_wide[ !tdat_wide$Patient_ID%in%surv_excl,]
+
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+##ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=8))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+
+
+```
+```{r  heatmap metacluster cell category expression density}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "density",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+pheatmap(scale(t(tdat_ct_w_m)))
+
+pheatmap(t(tdat_ct_w_m))
+```
+## Survival analysis over Fibro category densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:12])
+#surv_excl <-surv_dat[surv_dat$total_density>10000,]$Patient_ID
+
+#surv_dat <-surv_dat[ !surv_dat$Patient_ID%in%surv_excl]
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+
+caf_strat <-surv_dat %>% select(contains(c("_G", "Patient_ID")))
+write.csv(caf_strat,file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+```
+
+#CoxPH for Fibro category density corrected for Stage, Grade and M
+```{r  Coxph Fibro category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   IDO_CAF + SMA_CAF+PDPN_CAF+Collagen_CAF +hypoxic_CAF+hypoxic_tpCAF+tpCAF+ dCAF+iCAF+vCAF+mCAF+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+#surv <-c("metacluster")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+   #symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+caf_strat <- dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat", paste0("CAF_strat_hi_low_density_split_",k,".csv")))
+  }
+}
+
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%"}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Fibro_Density_high-low_Lasso.pdf")), plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:12])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <=summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+
+caf_strat <- dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat", paste0("CAF_strat_hi_mid_low_density.csv")))
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+#surv <-"metacluster"
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+
+caf_strat <- dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat", paste0("CAF_strat_hi_mid_low_density_split_",k,".csv")))
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=12, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro category density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+categories <- c("DX.name","Grade")
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+categories <- c("DX.name","Grade")
+
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_CAF_Densities_WOneo",i,".pdf")), width=12, height=8)
+ #gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ #dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro Category density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation Fibro category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+
+
+## **Fibro subtype**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro subtype densities patients metaclusters:
+```{r  Fibro subtype density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro subtype, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro subtype together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro subtype densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 10) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+##ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-subtype-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro subtype densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Fibro subtype densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro subtype densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:15])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+surv_dat$mCAF_MMP11_G <- ifelse(surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+surv_dat$mCAF_Col_Cdh_G <- ifelse(surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+surv_dat$tpCAF_CD10_G <- ifelse(surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+surv_dat$tpCAF_CD73_G <- ifelse(surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+surv_dat$iCAF_CD248_G <- ifelse(surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+surv_dat$iCAF_CD34_G <- ifelse(surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+```
+
+#CoxPH for Fibro subtype density corrected for Stage, Grade and M
+```{r  Coxph Fibro subtype density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   IDO_CAF + SMA_CAF+PDPN_CAF+Collagen_CAF +hypoxic_CAF+hypoxic_tpCAF+tpCAF_CD10+tpCAF_CD73+ dCAF+iCAF_CD34+iCAF_CD248+vCAF+mCAF_MMP11+mCAF_Col_Cdh+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro subtype densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_CD34_G","iCAF_CD248_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro subtype densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+dat$mCAF_MMP11_G <- ifelse(dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+dat$mCAF_Col_Cdh_G <- ifelse(dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+dat$tpCAF_CD10_G <- ifelse(dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+dat$tpCAF_CD73_G <- ifelse(dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+dat$iCAF_CD248_G <- ifelse(dat$iCAF_CD248 >summary(dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+dat$iCAF_CD34_G <- ifelse(dat$iCAF_CD34 >summary(dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro subtype densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-subtype_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro subtype densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro subtype densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:15])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh <=summary(surv_dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[2]& surv_dat$mCAF_Col_Cdh<=summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 <=summary(surv_dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[2]& surv_dat$mCAF_MMP11<=summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 <=summary(surv_dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[2]& surv_dat$tpCAF_CD10<=summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 <=summary(surv_dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[2]& surv_dat$tpCAF_CD73<=summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 <=summary(surv_dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[2]& surv_dat$iCAF_CD34<=summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 <=summary(surv_dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[2]& surv_dat$iCAF_CD248<=summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro subtype densities split by tumour type
+```{r  survival Fibro subtype densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro subtype densities split by tumour type
+```{r  survival Fibro subtype densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_CD34_G","iCAF_CD248_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh <=summary(dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[2]& dat$mCAF_Col_Cdh<=summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 <=summary(dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[2]& dat$mCAF_MMP11<=summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 <=summary(dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[2]& dat$tpCAF_CD10<=summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 <=summary(dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[2]& dat$tpCAF_CD73<=summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+dat$iCAF_CD34_G[dat$iCAF_CD34 <=summary(dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[2]& dat$iCAF_CD34<=summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+dat$iCAF_CD248_G[dat$iCAF_CD248 <=summary(dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[2]& dat$iCAF_CD248<=summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   #symbols = c("****", "***", "**", "*", "+", "."),
+#   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro subtype densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+##ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-subtype_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro subtype density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=15, fig.height=10}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:15])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:16])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=8) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-subtype_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro subtype density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=15, fig.height=10}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:15])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:16])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=8) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-subtype_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation Fibro subtype densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of T cell type densities",
+        mar=c(0,0,3,0))
+```
+
+
+```{r  Correlations T cell type densities amongst eachother per roi, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of T cell type densities",
+        mar=c(0,0,3,0))
+```
+
+
+
+
+```{r  Correlations proportions per image CAF Type densities, fig.width=10, fig.height=10, message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$DX.name!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$DX.name!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Stroma,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation CAF Type densities",
+        mar=c(0,0,3,0))
+```
+
+
+
+
+
+#Differential abundance loop
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+
+  
+colData(fibro.sce_pat)<-as.data.frame(colData(fibro.sce_pat)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(fibro.sce_pat))
+#if necessary: change group_id labels
+
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#subtype <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+
+subtype <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met")
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming subtype aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"cell_type"
+
+plot_list <- list()
+for (i in subtype) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "cell_type", values_from ="cell_type", values_fn = list(cell_type=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Fibroblast Type", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+##ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+
+
+#############################################################################################################################################
+#only mCAF and SMA high CAF
+
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro subtype proportions ordered by hierarchical clustering only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce_pat[, fibro.sce_pat$Patient_ID!="Control"&
+                                           fibro.sce_pat$DX.name=="Adenocarcinoma"|
+                                           fibro.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat[tdat$Phenotype=="mCAF"|tdat$Phenotype=="SMA_CAF",]
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro subtype proportions only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")
+#plot(p)
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-subtype_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =2) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+##ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro subtype proportions only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-subtype_proportions.pdf")))
+plot(p)
+```
+```{r  heatmap metacluster cell subtype expression proportions}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "freq",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+pheatmap(scale(t(tdat_ct_w_m)))
+
+pheatmap(t(tdat_ct_w_m))
+```
+
+## **Survival analysis** - Fibro subtype
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro subtype proportions high-low only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+caf_strat <-surv_dat %>% select(contains(c("Patient_ID","_G")))
+write.csv(caf_strat, file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+surv <-c("SMA_CAF_G","mCAF_G", "metacluster")
+#surv <-"metacluster"
+```
+
+
+#CoxPH for Fibro subtype proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro subtype proportions only mCAF SMA, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   mCAF+SMA_CAF+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro subtype proportions not split high low only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+ 
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+   #symbols = c("****", "***", "**", "*", "+", "."),
+    #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+   #symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro subtype proportions split high low only mCAF SMA, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+#caf_strat <-dat %>% select(contains(c("Patient_ID","_G")))
+#write.csv(caf_strat, file=file.path(wd,"patient_strat",paste0("CAF_strat_hi_low_proportion_",k,".csv")))
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  #symbols = c("****", "***", "**", "*", "+", "."),
+   ##abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+##print(pw)
+##print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  #symbols = c("****", "***", "**", "*", "+", "."),
+ #  #abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
\ No newline at end of file
diff --git a/Analysis_Immune-nonT.Rmd b/Analysis_Immune-nonT.Rmd
new file mode 100644
index 0000000..6eb66ed
--- /dev/null
+++ b/Analysis_Immune-nonT.Rmd
@@ -0,0 +1,2175 @@
+---
+title: "R Notebook - Analysis Immune cells"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+set.seed(101100)
+```
+
+
+
+```{r, Set wd and load data}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+#RAW
+immune.sce <- readRDS(file=file.path(data_folder, "FINAL_Tcells-minusImmune_workingfile.rds"))
+immune.sce <- readRDS(file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_area_RAW.rds"))
+
+#workingfile
+saveRDS(immune.sce, file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_area_RAW.rds"))
+saveRDS(immune.sce, file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_workingfile.rds"))
+
+#clinical.data
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+immune.sce <- readRDS(file=file.path(data_folder, "IMMUNE_nonT_CLINICAL-DATA_FILTERED.rds"))
+immune.sce$DX.name[is.na(immune.sce$DX.name)]<-"NA"
+```
+
+
+Define clinical data
+```{r, clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+
+Table with Immune cell numbers including Immune cells removing undefined
+```{r, Table with immune cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(immune.sce[,immune.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+print(tbl)
+
+summary(tbl$`Immune cell number overall`) #3-6259
+tbl[tbl$`Immune cell number overall` <=100,] #31 patients have less than 100 Immune cells=3% but 10% of median/ 89 patients have less than 300 Immune cells
+
+#Lowest 10% of all patients' Immune cell numbers
+tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:100] #1-67
+
+
+#Highest 10% of all patients' Immune cell numbers
+tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:100] #1503-7186
+
+tbl[tbl$`Immune cell number overall` <=50,] #77 patients have less than 60 Immune cells=10%.
+
+all_immune_pat <- tbl[tbl$`Immune cell number overall` <=50,]$`Patient ID`
+
+length(unique(immune.sce$Patient_ID))
+```
+
+Remove ROIs
+```{r, Table with Immune cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(immune.sce[,immune.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per ROI overall including Immune cells-excluding undefined.csv")))
+print(tbl)
+summary(tbl$`Immune cell number overall`) #1-6569
+tbl[tbl$`Immune cell number overall` <=25,] #204 images have less than 50 Immune cells (10% of median)
+#Lowest 10% of all patients' Immune cell numbers
+tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1-25
+
+
+#Highest 10% of all patients' Immune cell numbers
+tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1385-5075
+#per image cut at 50 Immune cells per image equals lowest 5% -> ensures that there's at least 50 Immune cells per patient
+
+all_immune_roi <- tbl[tbl$`Immune cell number overall` <=25,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r, patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+immune.sce_pat <- immune.sce[,immune.sce$Patient_ID!="Control"&
+                               !immune.sce$Patient_ID%in%all_immune_pat]
+length(unique(immune.sce_pat$Patient_ID)) #991
+
+#Roi removal
+immune.sce_roi <- immune.sce[,immune.sce$Patient_ID!="Control"&
+                               !immune.sce$RoiID%in%all_immune_roi]
+length(unique(immune.sce_roi$Patient_ID)) #1016
+
+#Patient & Roi removal
+immune.sce_pat.roi <- immune.sce[,immune.sce$Patient_ID!="Control"&
+                               !immune.sce$Patient_ID%in%all_immune_pat&
+                               !immune.sce$RoiID%in%all_immune_roi]
+length(unique(immune.sce_pat.roi$Patient_ID)) #991
+
+```
+
+```{r, Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+saveRDS(immune.sce_pat.roi,file=file.path(data_folder, paste("Immune_sce_pat_roi_rem.rds",sep="")))
+saveRDS(immune.sce_roi,file=file.path(data_folder, paste("Immune_sce_roi_rem.rds",sep="")))
+saveRDS(immune.sce_pat,file=file.path(data_folder, paste("Immune_sce_pat_rem.rds",sep="")))
+
+immune.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Immune_sce_pat_roi_rem.rds",sep="")))
+immune.sce_roi <-readRDS(file=file.path(data_folder, paste("Immune_sce_roi_rem.rds",sep="")))
+immune.sce_pat <-readRDS(file=file.path(data_folder, paste("Immune_sce_pat_rem.rds",sep="")))
+```
+
+
+##Analysis
+#Proportions Immune cell CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Immune cell Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Immune cell category proportions
+```{r, optimal number of clusters for non Immune cell category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r, Immune cell category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r, Barplot Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 4) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r, merge hc metacluster into t_dat Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 4))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Immune cell category
+Immune cell categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r, survival data Immune cell category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G", "metacluster")
+```
+
+
+#CoxPH for Immune cell category proportions corrected for Stage, Grade and M
+```{r, Coxph Immune cell category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Neutrophil+Bcell + Myeloid + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r, survival Immune cell category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv <-c("Myeloid_G","Neutrophil_G","Bcell_G" ,"metacluster")
+```
+
+## Survival Immune cell category proportions high mid low not split
+```{r, survival Immune cell category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Immune cell category proportions high mid low split by tumour type
+```{r, survival Immune cell category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Immune cell category proportions with clinical parameters
+```{r, Correlation of non Immune cell category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+immune.sce_pat$NeoAdj <- ifelse(immune.sce_pat$Chemo==1 |immune.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Immune cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Immune cell category ~ clinical data excluding neoadjuvant therapy
+```{r, KW W Immune cell category proportions WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r, KW W Immune cell category proportions with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **Immune cell category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Immune cell category densities patients metaclusters:
+```{r, Immune cell category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r, Calculate densities Immune cell category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Immune cell category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r,Barolot Immune cell category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 2) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r,Immune cell category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Immune cell category densities high low.  
+- High > median
+- Low < median
+```{r, surv_dat Immune cell category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:4])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+```
+
+#CoxPH for Immune cell category density corrected for Stage, Grade and M
+```{r, Coxph Immune cell category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Myeloid+ Bcell+ Neutrophil + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("Neutrophil_G","Bcell_G","Myeloid_G","metacluster", "Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r, coxph Immune cell category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G",paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Immune cell category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r, surv_dat Immune cell category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:4])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv <-c("Bcell_G","Myeloid_G","Neutrophil_G","Density_G","metacluster")
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("Bcell_G","Myeloid_G","Neutrophil_G","Density_G","metacluster")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G","Density_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <=summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[5]] <-"Bcell high"
+dat$Bcell_G[dat$Bcell <=summary(dat$Bcell)[2]] <-"Bcell low"
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[2]& dat$Bcell<summary(dat$Bcell)[5]] <-"Bcell medium"
+
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[5]] <-"Neutrophil high"
+dat$Neutrophil_G[dat$Neutrophil <=summary(dat$Neutrophil)[2]] <-"Neutrophil low"
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[2]& dat$Neutrophil<summary(dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[5]] <-"Myeloid high"
+dat$Myeloid_G[dat$Myeloid <=summary(dat$Myeloid)[2]] <-"Myeloid low"
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[2]& dat$Myeloid<summary(dat$Myeloid)[5]] <-"Myeloid medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r,coxph Immune cell category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r, KW W Immune cell category density WO neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:4])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:5])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r, KW Wilcox Immune cell Category density W neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:4])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:5])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r, Correlation Immune cell category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:4])
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
diff --git a/Analysis_Immune_Tcell_category.Rmd b/Analysis_Immune_Tcell_category.Rmd
new file mode 100644
index 0000000..6b72c64
--- /dev/null
+++ b/Analysis_Immune_Tcell_category.Rmd
@@ -0,0 +1,2174 @@
+---
+title: "R Notebook -  Analysis Immune + T cell category"
+output:
+  html_document:
+    df_print: paged
+---
+
+
+```{r, import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+```{r}
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+
+immune.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_pat_roi_rem.rds",sep="")))
+immune.sce_roi <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_roi_rem.rds",sep="")))
+immune.sce_pat <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_pat_rem.rds",sep="")))
+```
+
+
+Define clinical data
+```{r, clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+## **Immune cell Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Immune cell category proportions
+```{r, optimal number of clusters for non Immune cell category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r, Immune cell category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r, Barplot Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 9) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r, merge hc metacluster into t_dat Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 9))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Immune cell category
+Immune cell categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r, survival data Immune cell category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G","CD4_G","CD8_G", "metacluster")
+```
+
+
+#CoxPH for Immune cell category proportions corrected for Stage, Grade and M
+```{r, Coxph Immune cell category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Neutrophil+Bcell+CD4+CD8 + Myeloid + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("Bcell_G","Neutrophil_G","CD4_G","CD8_G","Myeloid_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r, survival Immune cell category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<=summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<=summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv <-c("Myeloid_G","Neutrophil_G","Bcell_G" ,"CD4_G","CD8_G","metacluster")
+```
+
+## Survival Immune cell category proportions high mid low not split
+```{r, survival Immune cell category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Immune cell category proportions high mid low split by tumour type
+```{r, survival Immune cell category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<=summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<=summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Immune cell category proportions with clinical parameters
+```{r, Correlation of non Immune cell category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+immune.sce_pat$NeoAdj <- ifelse(immune.sce_pat$Chemo==1 |immune.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Immune cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Immune cell category ~ clinical data excluding neoadjuvant therapy
+```{r, KW W Immune cell category proportions WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r, KW W Immune cell category proportions with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **Immune cell category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Immune cell category densities patients metaclusters:
+```{r, Immune cell category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r, Calculate densities Immune cell category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Immune cell category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r,Barolot Immune cell category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 2) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r,Immune cell category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Immune cell category densities high low.  
+- High > median
+- Low < median
+```{r, surv_dat Immune cell category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:6])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+
+```
+
+#CoxPH for Immune cell category density corrected for Stage, Grade and M
+```{r, Coxph Immune cell category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Myeloid+ Bcell+ Neutrophil +CD4+CD8+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("Neutrophil_G","Bcell_G","Myeloid_G","metacluster", "CD8_G","CD4_G","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r, coxph Immune cell category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G",paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Immune cell category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r, surv_dat Immune cell category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:6])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv <-c("Bcell_G","CD4_G","CD8_G","Myeloid_G","Neutrophil_G","Density_G","metacluster")
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("Bcell_G","CD4_G","CD8_G","Myeloid_G","Neutrophil_G","Density_G","metacluster")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G","CD4_G","CD8_G","Density_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <=summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[5]] <-"Bcell high"
+dat$Bcell_G[dat$Bcell <=summary(dat$Bcell)[2]] <-"Bcell low"
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[2]& dat$Bcell<summary(dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[5]] <-"Neutrophil high"
+dat$Neutrophil_G[dat$Neutrophil <=summary(dat$Neutrophil)[2]] <-"Neutrophil low"
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[2]& dat$Neutrophil<summary(dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[5]] <-"Myeloid high"
+dat$Myeloid_G[dat$Myeloid <=summary(dat$Myeloid)[2]] <-"Myeloid low"
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[2]& dat$Myeloid<summary(dat$Myeloid)[5]] <-"Myeloid medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r,coxph Immune cell category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r, KW W Immune cell category density WO neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:6])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:7])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r, KW Wilcox Immune cell Category density W neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:6])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:7])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r, Correlation Immune cell category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:6])
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Tcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
diff --git a/Analysis_Immune_Tcell_types.Rmd b/Analysis_Immune_Tcell_types.Rmd
new file mode 100644
index 0000000..79088e1
--- /dev/null
+++ b/Analysis_Immune_Tcell_types.Rmd
@@ -0,0 +1,2347 @@
+---
+title: "R Notebook - Analysis Immune + T cell Types"
+output:
+  html_document:
+    df_print: paged
+---
+
+
+```{r, import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+
+```{r}
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+
+immune.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_pat_roi_rem.rds",sep="")))
+immune.sce_roi <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_roi_rem.rds",sep="")))
+immune.sce_pat <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_pat_rem.rds",sep="")))
+```
+
+
+Define clinical data
+```{r, clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+#Immune T cell type
+
+
+##Analysis
+#Proportions Immune cell CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Immune cell Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Immune cell category proportions
+```{r, optimal number of clusters for non Immune cell category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r, Immune cell category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r, Barplot Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 9) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_subtype_proportions.pdf")), width=6, height=6)
+```
+
+```{r, merge hc metacluster into t_dat Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 9))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Immune cell category
+Immune cell categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r, survival data Immune cell category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G", "metacluster","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G")
+```
+
+
+#CoxPH for Immune cell category proportions corrected for Stage, Grade and M
+```{r, Coxph Immune cell category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Neutrophil+Bcell+ CD4 + CD8+`TCF1/7_CD4`+ki67_CD8+`TCF1/7_CD8`+PD1_CD4+IDO_CD8 +ki67_CD4+IDO_CD4+CD4_Treg + Myeloid + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r, survival Immune cell category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <=summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <=summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <=summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <=summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <=summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <=summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <=summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <=summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G", "metacluster","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G")
+```
+
+## Survival Immune cell category proportions high mid low not split
+```{r, survival Immune cell category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Immune cell category proportions high mid low split by tumour type
+```{r, survival Immune cell category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <=summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <=summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <=summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <=summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <=summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <=summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <=summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <=summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Immune cell category proportions with clinical parameters
+```{r, Correlation of non Immune cell category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE, eval=F}
+immune.sce_pat$NeoAdj <- ifelse(immune.sce_pat$Chemo==1 |immune.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Tcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Immune cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Immune cell category ~ clinical data excluding neoadjuvant therapy
+```{r, KW W Immune cell category proportions WO neo, fig.width=15, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r, KW W Immune cell category proportions with NEO, fig.width=15, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **Immune cell category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Immune cell category densities patients metaclusters:
+```{r, Immune cell category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r, Calculate densities Immune cell category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Immune cell category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r,Barolot Immune cell category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 2) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r,Immune cell category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Immune cell category densities high low.  
+- High > median
+- Low < median
+```{r, surv_dat Immune cell category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:14])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+surv <-c("Neutrophil_G","Myeloid_G","Bcell_G","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+
+```
+
+#CoxPH for Immune cell category density corrected for Stage, Grade and M
+```{r, Coxph Immune cell category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Myeloid+ Bcell+ Neutrophil + CD4 + CD8+`TCF1/7_CD4`+ki67_CD8+`TCF1/7_CD8`+PD1_CD4+IDO_CD8 +ki67_CD4+IDO_CD4+CD4_Treg+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r, coxph Immune cell category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G",paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Immune cell category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r, surv_dat Immune cell category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:14])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("Neutrophil_G","Myeloid_G","Bcell_G","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+
+#High Med Low
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[5]] <-"Neutrophil high"
+dat$Neutrophil_G[dat$Neutrophil <summary(dat$Neutrophil)[2]] <-"Neutrophil low"
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[2]& dat$Neutrophil<summary(dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[5]] <-"Myeloid high"
+dat$Myeloid_G[dat$Myeloid <summary(dat$Myeloid)[2]] <-"Myeloid low"
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[2]& dat$Myeloid<summary(dat$Myeloid)[5]] <-"Myeloid medium"
+
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[5]] <-"Bcell high"
+dat$Bcell_G[dat$Bcell <summary(dat$Bcell)[2]] <-"Bcell low"
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[2]& dat$Bcell<summary(dat$Bcell)[5]] <-"Bcell medium"
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[5]] <-"CD4 high"
+dat$CD4_G[dat$CD4 <summary(dat$CD4)[2]] <-"CD4 low"
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[2]& dat$CD4<summary(dat$CD4)[5]] <-"CD4 medium"
+
+dat$CD4_Treg_G[dat$CD4_Treg >summary(dat$CD4_Treg)[5]] <-"CD4 Treg high"
+dat$CD4_Treg_G[dat$CD4_Treg <summary(dat$CD4_Treg)[2]] <-"CD4 Treg low"
+dat$CD4_Treg_G[dat$CD4_Treg >summary(dat$CD4_Treg)[2]& dat$CD4_Treg<summary(dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+dat$CD4_IDO_G[dat$IDO_CD4 >summary(dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+dat$CD4_IDO_G[dat$IDO_CD4 <summary(dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+dat$CD4_IDO_G[dat$IDO_CD4 >summary(dat$IDO_CD4)[2]& dat$IDO_CD4<summary(dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+dat$CD4_PD1_G[dat$PD1_CD4 >summary(dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+dat$CD4_PD1_G[dat$PD1_CD4 <summary(dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+dat$CD4_PD1_G[dat$PD1_CD4 >summary(dat$PD1_CD4)[2]& dat$PD1_CD4<summary(dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` <summary(dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[2]& dat$`TCF1/7_CD4`<summary(dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+dat$CD4_ki67_G[dat$ki67_CD4 >summary(dat$ki67_CD4)[5]] <-"CD4 dividing high"
+dat$CD4_ki67_G[dat$ki67_CD4 <summary(dat$ki67_CD4)[2]] <-"CD4 dividing low"
+dat$CD4_ki67_G[dat$ki67_CD4 >summary(dat$ki67_CD4)[2]& dat$ki67_CD4<summary(dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[5]] <-"CD8 high"
+dat$CD8_G[dat$CD8 <summary(dat$CD8)[2]] <-"CD8 low"
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[2]& dat$CD8<summary(dat$CD8)[5]] <-"CD8 medium"
+
+dat$CD8_IDO_G[dat$IDO_CD8 >summary(dat$IDO_CD8)[5]] <-"CD8 IDO high"
+dat$CD8_IDO_G[dat$IDO_CD8 <summary(dat$IDO_CD8)[2]] <-"CD8 IDO low"
+dat$CD8_IDO_G[dat$IDO_CD8 >summary(dat$IDO_CD8)[2]& dat$IDO_CD8<summary(dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` <summary(dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[2]& dat$`TCF1/7_CD8`<summary(dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+dat$CD8_ki67_G[dat$ki67_CD8 >summary(dat$ki67_CD8)[5]] <-"CD8 dividing high"
+dat$CD8_ki67_G[dat$ki67_CD8 <summary(dat$ki67_CD8)[2]] <-"CD8 dividing low"
+dat$CD8_ki67_G[dat$ki67_CD8 >summary(dat$ki67_CD8)[2]& dat$ki67_CD8<summary(dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r,coxph Immune cell category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r, KW W Immune cell category density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=10}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:14])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:15])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r, KW Wilcox Immune cell Category density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=10}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:14])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:15])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r, Correlation Immune cell category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE, eval=F}
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:5])
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Tcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
diff --git a/Analysis_Immune_Types.Rmd b/Analysis_Immune_Types.Rmd
new file mode 100644
index 0000000..3544067
--- /dev/null
+++ b/Analysis_Immune_Types.Rmd
@@ -0,0 +1,2252 @@
+---
+title: "R Notebook - Analysis Immune cell types"
+output:
+  html_document:
+    df_print: paged
+---
+```{r, import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+
+
+```{r, Set wd and load data}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+#RAW
+#immune.sce <- readRDS(file=file.path(data_folder, "FINAL_Tcells-minusImmune_workingfile.rds"))
+#immune.sce <- readRDS(file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_area_RAW.rds"))
+
+#workingfile
+#saveRDS(immune.sce, file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_area_RAW.rds"))
+#saveRDS(immune.sce, file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_workingfile.rds"))
+
+#clinical.data
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+immune.sce <- readRDS(file=file.path(data_folder, "IMMUNE_CLINICAL-DATA_FILTERED.rds"))
+immune.sce$DX.name[is.na(immune.sce$DX.name)]<-"NA"
+```
+
+
+Define clinical data
+```{r, clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+
+Table with Immune cell numbers including Immune cells removing undefined
+```{r, Table with immune cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(immune.sce[,immune.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+#print(tbl)
+
+summary(tbl$`Immune cell number overall`) #3-6259
+tbl[tbl$`Immune cell number overall` <=100,] #31 patients have less than 100 Immune cells=3% but 10% of median/ 89 patients have less than 300 Immune cells
+
+#Lowest 10% of all patients' Immune cell numbers
+tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:100] #1-67
+
+
+#Highest 10% of all patients' Immune cell numbers
+tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:100] #1503-7186
+
+tbl[tbl$`Immune cell number overall` <=100,] #97 patients have less than 100 Immune cells=10%.
+
+all_immune_pat <- tbl[tbl$`Immune cell number overall` <=100,]$`Patient ID`
+
+length(unique(immune.sce$Patient_ID))
+```
+
+Remove ROIs
+```{r, Table with Immune cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(immune.sce[,immune.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per ROI overall including Immune cells-excluding undefined.csv")))
+#print(tbl)
+summary(tbl$`Immune cell number overall`) #1-6569
+tbl[tbl$`Immune cell number overall` <=50,] #214 images have less than 50 Immune cells (10% of median)
+#Lowest 10% of all patients' Immune cell numbers
+tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1-47
+
+
+#Highest 10% of all patients' Immune cell numbers
+tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1385-5075
+#per image cut at 50 Immune cells per image equals lowest 5% -> ensures that there's at least 50 Immune cells per patient
+
+all_immune_roi <- tbl[tbl$`Immune cell number overall` <=50,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r, patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+immune.sce$immune_type <- immune.sce$cell_type
+immune.sce$immune_type[immune.sce$immune_type=="CD4"|
+                         immune.sce$immune_type=="CD8"] <-"Tcell"
+#Patient removal
+immune.sce_pat <- immune.sce[,immune.sce$Patient_ID!="Control"&
+                               !immune.sce$Patient_ID%in%all_immune_pat]
+length(unique(immune.sce_pat$Patient_ID)) #972
+
+#Roi removal
+immune.sce_roi <- immune.sce[,immune.sce$Patient_ID!="Control"&
+                               !immune.sce$RoiID%in%all_immune_roi]
+length(unique(immune.sce_roi$Patient_ID)) #1009
+
+#Patient & Roi removal
+immune.sce_pat.roi <- immune.sce[,immune.sce$Patient_ID!="Control"&
+                               !immune.sce$Patient_ID%in%all_immune_pat&
+                               !immune.sce$RoiID%in%all_immune_roi]
+length(unique(immune.sce_pat.roi$Patient_ID)) #972
+
+```
+
+```{r, Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","immune"))
+
+saveRDS(immune.sce_pat.roi,file=file.path(data_folder, paste("Immune_all_sce_pat_roi_rem.rds",sep="")))
+saveRDS(immune.sce_roi,file=file.path(data_folder, paste("Immune_all_sce_roi_rem.rds",sep="")))
+saveRDS(immune.sce_pat,file=file.path(data_folder, paste("Immune_all_sce_pat_rem.rds",sep="")))
+
+immune.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_pat_roi_rem.rds",sep="")))
+immune.sce_roi <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_roi_rem.rds",sep="")))
+immune.sce_pat <-readRDS(file=file.path(data_folder, paste("Immune_all_sce_pat_rem.rds",sep="")))
+```
+
+
+##Analysis
+#Proportions Immune cell CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Immune cell Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Immune cell category proportions
+```{r, optimal number of clusters for non Immune cell category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$immune_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r, Immune cell category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            immune.sce_pat[, immune.sce_pat$Patient_ID!="Control"&
+                                           immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                           immune.sce_pat$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r, Barplot Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 3) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_immune_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r, merge hc metacluster into t_dat Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 3))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Immune cell category
+Immune cell categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r, survival data Immune cell category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Tcell_G <- ifelse(surv_dat$Tcell >summary(surv_dat$Tcell)[3],"Tcell high","Tcell low")
+
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G","Tcell_G", "metacluster")
+```
+
+
+#CoxPH for Immune cell category proportions corrected for Stage, Grade and M
+```{r, Coxph Immune cell category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Neutrophil+Bcell+Tcell + Myeloid + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("Bcell_G","Neutrophil_G","Tcell_G","Myeloid_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Tcell_G <- ifelse(surv_dat$Tcell >summary(surv_dat$Tcell)[3],"Tcell high","Tcell low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r, survival Immune cell category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$Tcell_G[surv_dat$Tcell >summary(surv_dat$Tcell)[5]] <-"Tcell high"
+surv_dat$Tcell_G[surv_dat$Tcell <=summary(surv_dat$Tcell)[2]] <-"Tcell low"
+surv_dat$Tcell_G[surv_dat$Tcell >summary(surv_dat$Tcell)[2]& surv_dat$Tcell<=summary(surv_dat$Tcell)[5]] <-"Tcell medium"
+
+surv <-c("Myeloid_G","Neutrophil_G","Bcell_G" ,"Tcell_G","metacluster")
+```
+
+## Survival Immune cell category proportions high mid low not split
+```{r, survival Immune cell category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Immune cell category proportions high mid low split by tumour type
+```{r, survival Immune cell category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+
+surv_dat$Tcell_G[surv_dat$Tcell >summary(surv_dat$Tcell)[5]] <-"Tcell high"
+surv_dat$Tcell_G[surv_dat$Tcell <=summary(surv_dat$Tcell)[2]] <-"Tcell low"
+surv_dat$Tcell_G[surv_dat$Tcell >summary(surv_dat$Tcell)[2]& surv_dat$Tcell<=summary(surv_dat$Tcell)[5]] <-"Tcell medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r, coxph Immune cell category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Immune cell category proportions with clinical parameters
+```{r, Correlation of non Immune cell category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+immune.sce_pat$NeoAdj <- ifelse(immune.sce_pat$Chemo==1 |immune.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Tcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Immune cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Immune cell category ~ clinical data excluding neoadjuvant therapy
+```{r, KW W Immune cell category proportions WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$immune_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r, KW W Immune cell category proportions with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(immune.sce_pat[,immune.sce_pat$Patient_ID!="Control"&
+                                          immune.sce_pat$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$immune_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **Immune cell category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Immune cell category densities patients metaclusters:
+```{r, Immune cell category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r, Calculate densities Immune cell category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Immune cell category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r,Barolot Immune cell category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 2) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r,Immune cell category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Immune cell category densities high low.  
+- High > median
+- Low < median
+```{r, surv_dat Immune cell category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:5])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Tcell_G <- ifelse(surv_dat$Tcell >summary(surv_dat$Tcell)[3],"Tcell high","Tcell low")
+
+```
+
+#CoxPH for Immune cell category density corrected for Stage, Grade and M
+```{r, Coxph Immune cell category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Myeloid+ Bcell+ Neutrophil +Tcell+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival Immune cell category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("Neutrophil_G","Bcell_G","Myeloid_G","metacluster", "Tcell_G","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival Immune cell category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"Bcell high","Bcell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Tcell_G <- ifelse(surv_dat$Tcell >summary(surv_dat$Tcell)[3],"Tcell high","Tcell low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r, coxph Immune cell category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G",paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Immune cell category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r, surv_dat Immune cell category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:5])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+
+surv_dat$Tcell_G[surv_dat$Tcell >summary(surv_dat$Tcell)[5]] <-"Tcell high"
+surv_dat$Tcell_G[surv_dat$Tcell <summary(surv_dat$Tcell)[2]] <-"Tcell low"
+surv_dat$Tcell_G[surv_dat$Tcell >summary(surv_dat$Tcell)[2]& surv_dat$Tcell<summary(surv_dat$Tcell)[5]] <-"Tcell medium"
+
+surv <-c("Bcell_G","Tcell_G","Myeloid_G","Neutrophil_G","Density_G","metacluster")
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("Bcell_G","Myeloid_G","Neutrophil_G","Tcell_G","Density_G","metacluster")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+}
+```
+
+## Immune cell category densities split by tumour type
+```{r, survival Immune cell category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("Bcell_G","Neutrophil_G","Myeloid_G","Tcell_G","Density_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <=summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[5]] <-"Bcell high"
+dat$Bcell_G[dat$Bcell <=summary(dat$Bcell)[2]] <-"Bcell low"
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[2]& dat$Bcell<summary(dat$Bcell)[5]] <-"Bcell medium"
+
+dat$Tcell_G[dat$Tcell >summary(dat$Tcell)[5]] <-"Tcell high"
+dat$Tcell_G[dat$Tcell <=summary(dat$Tcell)[2]] <-"Tcell low"
+dat$Tcell_G[dat$Tcell >summary(dat$Tcell)[2]& dat$Tcell<summary(dat$Tcell)[5]] <-"Tcell medium"
+
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[5]] <-"Neutrophil high"
+dat$Neutrophil_G[dat$Neutrophil <=summary(dat$Neutrophil)[2]] <-"Neutrophil low"
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[2]& dat$Neutrophil<summary(dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[5]] <-"Myeloid high"
+dat$Myeloid_G[dat$Myeloid <=summary(dat$Myeloid)[2]] <-"Myeloid low"
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[2]& dat$Myeloid<summary(dat$Myeloid)[5]] <-"Myeloid medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+#km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+#km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-immune_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Immune cell categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r,coxph Immune cell category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r, KW W Immune cell category density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:5])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:6])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r, KW Wilcox Immune cell Category density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:5])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:6])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r, Correlation Immune cell category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+immune.sce_pat.roi$NeoAdj <- ifelse(immune.sce_pat.roi$Chemo==1 |immune.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           immune.sce_pat.roi[,immune.sce_pat.roi$Patient_ID!="Control"&
+                                          immune.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          immune.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$immune_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=immune.sce_pat.roi$RoiID,
+                                "Area"=immune.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=immune.sce_pat.roi$Patient_ID,
+                                "DX.name"=immune.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:5])
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$Neutrophil,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Bcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Tcell,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Myeloid,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
diff --git a/Analysis_Tcells.Rmd b/Analysis_Tcells.Rmd
new file mode 100644
index 0000000..0f07fa7
--- /dev/null
+++ b/Analysis_Tcells.Rmd
@@ -0,0 +1,4486 @@
+---
+title: "R Notebook - Analysis T cells"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+
+
+```{r, Set wd and load data, echo=F, warnings=F, message=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","immune","Tcell"))
+
+#RAW
+#tcell.sce <- readRDS(file=file.path(data_folder, "FINAL_Tcells-minusImmune_workingfile.rds"))
+#tcell.sce <- readRDS(file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_area_RAW.rds"))
+
+#workingfile
+#saveRDS(tcell.sce, file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_area_RAW.rds"))
+#saveRDS(tcell.sce, file=file.path(data_folder, "FINAL_Analysis_Tcell-clinicaldata_workingfile.rds"))
+
+#clinical.data
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+tcell.sce <- readRDS(file=file.path(data_folder, "TCELL-only_CLINICAL-DATA_FILTERED.rds"))
+tcell.sce$DX.name[is.na(tcell.sce$DX.name)]<-"NA"
+
+```
+
+```{r, load clinical data, echo=F, warnings=F, message=F}
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+clinical.data$TMA <-clinical.data$TMA.x
+clinical.data$TMA.x <-NULL
+clinical.data$TMA.y <-NULL
+clinical.data$X.1 <-NULL
+clinical.data$X <-NULL
+head(clinical.data)
+
+unique(clinical.data$DX.name)
+clinical.data$DX.name[clinical.data$Patient_ID=="Control"] <-"Control"
+table(clinical.data$DX.name)
+clinical.data$Patient_ID %>% unique() %>% length() #1071 PATIENTS IN TOTAL
+
+area <- read.csv(file=file.path(wd,"clinical_data", "area.csv"))
+area$X <- NULL
+area$Tma_ac <- area$TMA_ImageID
+area$TMA_ImageID <-NULL
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+```
+
+Table with Immune cell numbers including T cells removing undefined
+```{r, Table with immune cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All T cells together
+tbl <- as.data.frame(table(tcell.sce[,tcell.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "T cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+#print(tbl)
+
+#summary(tbl$`T cell number overall`) #3-6259
+#tbl[tbl$`T cell number overall` <=100,] #31 patients have less than 100 T cells=3% but 10% of median/ 89 patients have less than 300 T cells
+
+#Lowest 10% of all patients' T cell numbers
+#tbl[order(tbl$`T cell number overall`),]$`T cell number overall`[1:100] #1-23
+
+
+#Highest 10% of all patients' T cell numbers
+#tbl[order(-tbl$`T cell number overall`),]$`T cell number overall`[1:100] #760-2378
+
+#tbl[tbl$`T cell number overall` <=15,] #102 patients have less than 100 T cells=10%. 77 patients have less than 15 T cells
+
+all_tcell_pat <- tbl[tbl$`T cell number overall` <=20,]$`Patient ID`
+
+length(unique(tcell.sce$Patient_ID))
+```
+
+Remove ROIs
+```{r, Table with Immune cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(tcell.sce[,tcell.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per ROI overall including T cells-excluding undefined.csv")))
+#print(tbl)
+#summary(tbl$`Immune cell number overall`) #1-5075
+#tbl[tbl$`Immune cell number overall` <=10,] #93 images have less than 50 Immune cells (10% of median)
+#Lowest 10% of all patients' Immune cell numbers
+#tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1-110
+
+
+#Highest 10% of all patients' Immune cell numbers
+#tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1385-5075
+#per image cut at 50 Immune cells per image equals lowest 5% -> ensures that there's at least 50 Immune cells per patient
+
+all_tcell_roi <- tbl[tbl$`Immune cell number overall` <=10,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r, patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+tcell.sce_pat <- tcell.sce[,tcell.sce$Patient_ID!="Control"&
+                               !tcell.sce$Patient_ID%in%all_tcell_pat]
+length(unique(tcell.sce_pat$Patient_ID)) #954
+
+#Roi removal
+tcell.sce_roi <- tcell.sce[,tcell.sce$Patient_ID!="Control"&
+                               !tcell.sce$RoiID%in%all_tcell_roi]
+length(unique(tcell.sce_roi$Patient_ID)) #987
+
+#Patient & Roi removal
+tcell.sce_pat.roi <- tcell.sce[,tcell.sce$Patient_ID!="Control"&
+                               !tcell.sce$Patient_ID%in%all_tcell_pat&
+                               !tcell.sce$RoiID%in%all_tcell_roi]
+length(unique(tcell.sce_pat.roi$Patient_ID)) #954
+
+```
+
+```{r, Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","immune","Tcell"))
+
+#saveRDS(tcell.sce_pat.roi,file=file.path(data_folder, paste("Tcell_sce_pat_roi_rem.rds",sep="")))
+#saveRDS(tcell.sce_roi,file=file.path(data_folder, paste("Tcell_sce_roi_rem.rds",sep="")))
+#saveRDS(tcell.sce_pat,file=file.path(data_folder, paste("Tcell_sce_pat_rem.rds",sep="")))
+
+tcell.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("Tcell_sce_pat_roi_rem.rds",sep="")))
+tcell.sce_roi <-readRDS(file=file.path(data_folder, paste("Tcell_sce_roi_rem.rds",sep="")))
+tcell.sce_pat <-readRDS(file=file.path(data_folder, paste("Tcell_sce_pat_rem.rds",sep="")))
+```
+
+
+##Analysis
+#Proportions T cell CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **T cell Category**
+
+## **Proportions**
+Optimal number of patient metaclusters T cell category proportions
+```{r, optimal number of clusters for non T cell category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r, T cell category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r, Barplot T cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 3) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r, merge hc metacluster into t_dat T cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 3))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - T cell category
+T cell categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r, survival data T cell category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv <-c("CD8_G","CD4_G", "metacluster")
+```
+
+
+#CoxPH for T cell category proportions corrected for Stage, Grade and M
+```{r, Coxph T cell category proportions corrected, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   CD4 + CD8 + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival T cell category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival T cell category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("CD8_G","CD4_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$CD4_G <- ifelse(dat$CD4 >summary(dat$CD4)[3],"CD4 high","CD4 low")
+dat$CD8_G <- ifelse(dat$CD8 >summary(dat$CD8)[3],"CD8 high","CD8 low")
+   
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r, coxph T cell category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("CD4_G","CD8_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("CD4_G","CD8_G","metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r, survival T cell category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv <-c("CD8_G","CD4_G", "metacluster")
+```
+
+## Survival T cell category proportions high mid low not split
+```{r, survival t cell category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival T cell category proportions high mid low split by tumour type
+```{r, survival t cell category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[5]] <-"CD4 high"
+dat$CD4_G[dat$CD4 <summary(dat$CD4)[2]] <-"CD4 low"
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[2]& dat$CD4<summary(dat$CD4)[5]] <-"CD4 medium"
+
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[5]] <-"CD8 high"
+dat$CD8_G[dat$CD8 <summary(dat$CD8)[2]] <-"CD8 low"
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[2]& dat$CD8<summary(dat$CD8)[5]] <-"CD8 medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r, coxph T cell category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("CD4_G","CD8_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("CD4_G","CD8_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of T cell category proportions with clinical parameters
+```{r, Correlation of non T cell category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+tcell.sce_pat$NeoAdj <- ifelse(tcell.sce_pat$Chemo==1 |tcell.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for T cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon T cell category ~ clinical data excluding neoadjuvant therapy
+```{r, KW W T cell category proportions WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r, KW W T cell category proportions with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **T cell category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of T cell category densities patients metaclusters:
+```{r, T cell category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r, Calculate densities T cell category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by T cell category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r,Barolot T cell category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 2) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r,T cell category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=2))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over T cell category densities high low.  
+- High > median
+- Low < median
+```{r, surv_dat T cell category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- surv_dat$CD4+surv_dat$CD8
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+```
+
+#CoxPH for T cell category density corrected for Stage, Grade and M
+```{r, Coxph T cell category density corrected, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   CD4 + CD8 + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival T cell category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("CD8_G","CD4_G","metacluster", "Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival T cell category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$CD4_G <- ifelse(dat$CD4 >summary(dat$CD4)[3],"CD4 high","CD4 low")
+dat$CD8_G <- ifelse(dat$CD8 >summary(dat$CD8)[3],"CD8 high","CD8 low")   
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r, coxph T cell category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G","CD4_G","CD8_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("CD4_G","CD8_G","Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival T cell category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r, surv_dat T cell category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- surv_dat$CD4+surv_dat$CD8
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv <-c("CD4_G","CD8_G","Density_G","metacluster")
+```
+
+## T cell category densities split by tumour type
+```{r, survival t cell category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("CD4_G","CD8_G","Density_G","metacluster")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## T cell category densities split by tumour type
+```{r, survival t cell category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("CD4_G","CD8_G","Density_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[5]] <-"CD4 high"
+dat$CD4_G[dat$CD4 <summary(dat$CD4)[2]] <-"CD4 low"
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[2]& dat$CD4<summary(dat$CD4)[5]] <-"CD4 medium"
+
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[5]] <-"CD8 high"
+dat$CD8_G[dat$CD8 <summary(dat$CD8)[2]] <-"CD8 low"
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[2]& dat$CD8<summary(dat$CD8)[5]] <-"CD8 medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r,coxph T cell category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G","CD4_G","CD8_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("CD4_G","CD8_G","Density_G","metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r, KW W T cell category density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+tdat <-tdat_wide %>% pivot_longer(cols=c("CD4","CD8","total_density"), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r, KW Wilcox T cell Category density W neo,warning=F,message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+tdat <-tdat_wide %>% pivot_longer(cols=c("CD4","CD8","total_density"), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r, Correlation T cell category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+#T cell Type
+
+##Analysis
+#Proportions T cell CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **T cell Category**
+
+## **DDensity**
+Optimal number of patient metaclusters T cell category proportions
+```{r, optimal number of clusters for non T cell category type density clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r, T cell category density ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            tcell.sce_pat[, tcell.sce_pat$Patient_ID!="Control"&
+                                           tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                           tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r, Barplot T cell category density, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot density ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_subtype_density.pdf")), width=6, height=6)
+```
+
+```{r, merge hc metacluster into t_dat T cell category density, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 8))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_density.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - T cell category
+T cell categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r, survival data T cell category density high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+```
+
+
+#CoxPH for T cell category density corrected for Stage, Grade and M
+```{r, Coxph T cell category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   CD4 + CD8+`TCF1/7_CD4`+ki67_CD8+`TCF1/7_CD8`+PD1_CD4+IDO_CD8 +ki67_CD4+IDO_CD4+CD4_Treg+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival T cell category density not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival T cell category density split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster") 
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+
+#high low by median
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r, coxph T cell category density, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r, survival T cell category density classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$PD1_CD4_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$PD1_CD4_G[surv_dat$PD1_CD4 <summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$PD1_CD4_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+```
+
+## Survival T cell category density high mid low not split
+```{r, survival t cell category density not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival T cell category density high mid low split by tumour type
+```{r, survival t cell category density split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$PD1_CD4_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$PD1_CD4_G[surv_dat$PD1_CD4 <summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$PD1_CD4_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r, coxph T cell category density high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of T cell category density with clinical parameters
+```{r, Correlation of non T cell category density and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+tcell.sce_pat$NeoAdj <- ifelse(tcell.sce_pat$Chemo==1 |tcell.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for T cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon T cell category ~ clinical data excluding neoadjuvant therapy
+```{r, KW W T cell type density WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_density_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r, KW W T cell category density with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(tcell.sce_pat[,tcell.sce_pat$Patient_ID!="Control"&
+                                          tcell.sce_pat$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_density_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+Correlation of T cell category density per image
+```{r, Correlations density per image T cell category, message=FALSE, warning=FALSE, echo=FALSE, eval=F}
+tcell.sce_roi$NeoAdj <- ifelse(tcell.sce_roi$Chemo==1 |tcell.sce_roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_roi[,tcell.sce_roi$Patient_ID!="Control"&
+                                          tcell.sce_roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_roi[,tcell.sce_roi$Patient_ID!="Control"&
+                                          tcell.sce_roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$RoiID <-droplevels(tdat$RoiID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CD4+ and CD8+ T cells",
+        mar=c(0,0,3,0))
+```
+
+
+## **T cell Type**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of T cell density densities patients metaclusters:
+```{r, T cell density density optimal number of clusters type, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r, Calculate densities T cell type, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by T cell density together with hierarchical clustering tree coloured by patient metaclusters.
+```{r,Barolot T cell type densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-density-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r,T cell density densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=8))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over T cell density densities high low.  
+- High > median
+- Low < median
+```{r, surv_dat T cell density densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:11])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+```
+
+#CoxPH for T cell density density corrected for Stage, Grade and M
+```{r, Coxph T cell density density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   CD4 + CD8+`TCF1/7_CD4`+ki67_CD8+`TCF1/7_CD8`+PD1_CD4+IDO_CD8 +ki67_CD4+IDO_CD4+CD4_Treg + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r, survival T cell density densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r, survival T cell density densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$total_density <- rowSums(surv_dat[,2:11])
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r, coxph T cell density densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-density_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival T cell density densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r, surv_dat T cell density densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:11])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <=summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <=summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <=summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <=summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <=summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <=summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <=summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <=summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <=summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+```
+
+## T cell density densities split by tumour type
+```{r, survival t cell density densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## T cell density densities split by tumour type
+```{r, survival t cell density densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <=summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <=summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <=summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <=summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <=summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <=summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <=summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <=summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <=summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and T cell categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r,coxph T cell density densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-density_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r, KW W T cell density density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:11])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:12])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-density_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r, KW Wilcox T cell density density W neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:11])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:12])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-density_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r, Correlation T cell density densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+
+## Correlation of T cell type densites amongst eachother
+```{r, Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+tcell.sce_pat.roi$NeoAdj <- ifelse(tcell.sce_pat.roi$Chemo==1 |tcell.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           tcell.sce_pat.roi[,tcell.sce_pat.roi$Patient_ID!="Control"&
+                                          tcell.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          tcell.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=tcell.sce_pat.roi$RoiID,
+                                "Area"=tcell.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=tcell.sce_pat.roi$Patient_ID,
+                                "DX.name"=tcell.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of T cell type densities",
+        mar=c(0,0,3,0))
+```
diff --git a/Analysis_all_clustered_prop_den.Rmd b/Analysis_all_clustered_prop_den.Rmd
new file mode 100644
index 0000000..3c17f28
--- /dev/null
+++ b/Analysis_all_clustered_prop_den.Rmd
@@ -0,0 +1,3069 @@
+---
+title: "R Notebook- All clustered, all merged"
+output:
+  html_document:
+    df_print: paged
+---
+```{r Load libraries, echo=F, message=F, warning=F}
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+#library(uwot)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(ggsci)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+set.seed(101100)
+
+```
+
+```{r}
+#set working directory 
+wd <-dirname(getwd())
+
+#clinical.data
+#data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+#all.sce <- readRDS("/mnt/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_minusOther.rds")
+#all.sce$DX.name[is.na(all.sce$DX.name)]<-"NA"
+
+#all.sce$tcell_subtype <-all.sce$cell_subtype
+#all.sce$tcell_subtype[all.sce$cell_category=="Fibroblast"] <- all.sce[, all.sce$cell_category=="Fibroblast"]$cell_type
+#unique(all.sce$tcell_subtype)
+
+data_folder <-file.path(wd,"sce_objects","merge_plus_tumour")
+
+all.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+all.sce_pat <-readRDS(file=file.path(data_folder, paste("ALL_plus-Tumour_sce_pat_rem.rds",sep="")))
+unique(all.sce_pat.roi$cell_category)
+```
+
+
+
+```{r}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+```{r, eval=F}
+#All Fibros together
+tbl <- as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+print(tbl)
+
+summary(tbl$`Cell number overall`) #3-6259
+tbl[tbl$`Cell number overall` <=100,] #43 patients have less than 100 F
+
+#Lowest 10% of all patients' Cell numbers
+tbl[order(tbl$`Cell number overall`),]$`Cell number overall`[1:100] #2-2114 = lowest 10%, 5-503 = lowest 5%
+
+
+#Highest 10% of all patients' Cell numbers
+tbl[order(-tbl$`Cell number overall`),]$`Cell number overall`[1:100] #760-2378
+
+tbl[tbl$`Cell number overall` <=100,] #43 patients have less than 100 Fibros=10%. 77 patients have less than 15 Fibros
+
+all_cells_pat <- tbl[tbl$`Cell number overall` <=2000,]$`Patient ID`
+length(all_cells_pat)
+length(unique(all.sce$Patient_ID))
+```
+
+```{r, eval=F}
+#All Immune cells together
+tbl <- as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Cell numbers per ROI overall including Fibros-excluding undefined.csv")))
+print(tbl)
+summary(tbl$`Cell number overall`) #1-5075
+tbl[tbl$`Cell number overall` <=100,] #93 images have less than 50 Fibros (10% of median)
+#Lowest 10% of all patients' Cell numbers
+tbl[order(tbl$`Cell number overall`),]$`Cell number overall`[1:200] #1-110
+
+
+#Highest 10% of all patients' Cell numbers
+tbl[order(-tbl$`Cell number overall`),]$`Cell number overall`[1:200] #1385-5075
+#per image cut at 50 Fibros per image equals lowest 5% -> ensures that there's at least 50 Fibros per patient
+all_cells_roi <- tbl[tbl$`Cell number overall` <=1000,]$`ROI ID`
+length(all_cells_roi)
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE, eval=F}
+#Patient removal
+all.sce_pat <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$Patient_ID%in%all_cells_pat]
+length(unique(all.sce_pat$Patient_ID)) #1025
+
+#Roi removal
+all.sce_roi <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$RoiID%in%all_cells_roi]
+length(unique(all.sce_roi$Patient_ID)) #1039
+
+#Patient & Roi removal
+all.sce_pat.roi <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$Patient_ID%in%all_cells_pat&
+                               !all.sce$RoiID%in%all_cells_roi]
+length(unique(all.sce_pat.roi$Patient_ID)) #1025
+```
+
+```{r  Save SCE patient and roi removed, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-file.path(wd,"sce_objects","merge_plus_tumour")
+unique(all.sce$cell_category)
+all.sce[, all.sce$cell_category!="Other"]
+saveRDS(all.sce_pat.roi,file=file.path(data_folder, paste("ALL_plus-Tumour_sce_pat_roi_rem.rds",sep="")))
+saveRDS(all.sce_roi,file=file.path(data_folder, paste("ALL_plusTumour_sce_roi_rem.rds",sep="")))
+saveRDS(all.sce_pat,file=file.path(data_folder, paste("ALL_plus-Tumour_sce_pat_rem.rds",sep="")))
+
+all.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+all.sce_roi <-readRDS(file=file.path(data_folder, paste("ALL_plusTumour_sce_roi_rem.rds",sep="")))
+all.sce_pat <-readRDS(file=file.path(data_folder, paste("ALL_plus-Tumour_sce_pat_rem.rds",sep="")))
+```
+
+
+
+##Analysis
+#Proportions Fibro CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Fibro Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Fibro category proportions
+```{r  optimal number of clusters for non Fibro category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4, eval=F}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =15) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 15))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-left_join(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+```{r, fig.width=12, fig.height=6}
+m <- match(order.histo,clin$SaNr)
+fq_df <- cbind(data.table("sample" = order.histo),clin[m,sel.clin])
+fq_df$sample <- factor(fq_df$sample,levels = order.histo)
+
+tdat_pat <- unique(tdat$Patient_ID)
+df <- clinical.data %>% select(Grade, DX.name, Patient_ID, T.new, N, M.new,Relapse) %>%
+  reshape2::melt(id.vars =c("Patient_ID"), variable.name = "clinical_features", value.name = "value")
+length(unique(df$Patient_ID))
+
+df <-df[df$Patient_ID %in% tdat_pat,]
+
+df$Patient_ID <- factor(df$Patient_ID, levels = hc$labels[hc$order])
+
+p<- ggplot(df, aes(x=Patient_ID, y=clinical_features, fill= value))+
+  geom_tile(color="white")+
+     # scale_fill_manual(values = color_clusters_3)+
+   theme_bw()+ theme(
+    panel.grid = element_blank(), 
+    legend.key.size = unit(4, "mm"),
+    axis.text = element_text(color = "black"),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    axis.title.x = element_blank(),
+    axis.ticks.x = element_blank())
+p
+
+df <- clinical.data %>% select(Grade, DX.name, Patient_ID, T.new, N, M.new,Relapse)
+df <-df[df$Patient_ID %in% tdat_pat,]
+
+df$Patient_ID <- factor(df$Patient_ID, levels = hc$labels[hc$order])
+df$Relapse <- factor(df$Relapse)
+df$Grade <- factor(df$Grade)
+df$T.new <- factor(df$T.new)
+df$N <- factor(df$N)
+df$M.new <- factor(df$M.new)
+
+Heatmap(df$DX.name, name = "Type", width = unit(5, "mm"))+
+Heatmap(df$Grade, name = "Grade", width = unit(5, "mm"))+
+Heatmap(df$T.new, name = "T", width = unit(5, "mm"))+
+Heatmap(df$N, name = "N", width = unit(5, "mm"))+
+Heatmap(df$M.new, name = "M", width = unit(5, "mm"))+
+Heatmap(df$Relapse, name = "Relapse", width = unit(5, "mm"))
+
+col<- palette("Tableau 10")
+```
+
+```{r cell proportions over tumour size, fig.width=12, fig.height=12}
+c.param <- c("Size")
+for(i in c.param){
+  print(i)
+  for(k in unique(colnames(tdat_wide_ct[,2:30]))){
+    print(k)
+print(cor.test(tdat_wide_ct[[k]],tdat_wide_ct[[i]], method="pearson", exact=F))
+  }
+}
+
+tdat_c <- left_join(tdat, clinical.data, by="Patient_ID")
+
+
+tdat_c %>% ggplot(aes(x=Size, y=freq, color=Phenotype))+facet_wrap(~Phenotype, scales="free_y")+geom_point()+geom_smooth(method=lm,color="darkred")
+```
+
+## **Survival analysis** - Fibro category
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+surv_dat$normal_G <- ifelse(surv_dat$normal >summary(surv_dat$normal)[3],"normal high","normal low")
+surv_dat$hypoxic_G <- ifelse(surv_dat$hypoxic >summary(surv_dat$hypoxic)[3],"hypoxic high","hypoxic low")
+
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Bcell_G","Neutrophil_G","Myeloid_G", "metacluster","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","normal_G","hypoxic_G")
+```
+
+
+#CoxPH for Fibro category proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   PDPN_CAF + IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+iCAF+tpCAF+mCAF+hypoxic_CAF+SMA_CAF+Collagen_CAF+Neutrophil+Bcell+ CD4 + CD8+`TCF1/7_CD4`+ki67_CD8+`TCF1/7_CD8`+PD1_CD4+IDO_CD8 +ki67_CD4+IDO_CD4+CD4_Treg + Myeloid + normal+hypoxic+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()+
+  scale_y_continuous(trans = "log10")
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+dat$Neutrophil_G <- ifelse(dat$Neutrophil >summary(dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+dat$Bcell_G <- ifelse(dat$Bcell >summary(dat$Bcell)[3],"B cell high","B cell low")
+dat$Myeloid_G <- ifelse(dat$Myeloid >summary(dat$Myeloid)[3],"Myeloid high","Myeloid low")
+dat$CD4_G <- ifelse(dat$CD4 >summary(dat$CD4)[3],"CD4 high","CD4 low")
+dat$CD4_Treg_G <- ifelse(dat$CD4_Treg >summary(dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+dat$CD4_IDO_G <- ifelse(dat$IDO_CD4 >summary(dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+dat$CD4_PD1_G <- ifelse(dat$PD1_CD4 >summary(dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+dat$CD4_TCF_G <- ifelse(dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+dat$CD4_ki67_G <- ifelse(dat$ki67_CD4 >summary(dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+dat$CD8_IDO_G <- ifelse(dat$IDO_CD8 >summary(dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+dat$CD8_TCF_G <- ifelse(dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+dat$CD8_G <- ifelse(dat$CD8 >summary(dat$CD8)[3],"CD8 high","CD8 low")
+dat$CD8_ki67_G <- ifelse(dat$ki67_CD8 >summary(dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+dat$hypoxic_G <- ifelse(dat$hypoxic >summary(dat$hypoxic)[3],"hypoxic high","hypoxic low")
+dat$normal_G <- ifelse(dat$normal >summary(dat$normal)[3],"normal high","normal low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=10, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = sqrt(estimate),color =p.value < 0.05)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()+
+scale_y_continuous(trans = "log10")
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival Fibro category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[5]] <-"normal high"
+surv_dat$normal_G[surv_dat$normal <=summary(surv_dat$normal)[2]] <-"normal low"
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[2]& surv_dat$normal<=summary(surv_dat$normal)[5]] <-"normal medium"
+
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[5]] <-"hypoxic high"
+surv_dat$hypoxic_G[surv_dat$hypoxic <=summary(surv_dat$hypoxic)[2]] <-"hypoxic low"
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[2]& surv_dat$hypoxic<=summary(surv_dat$hypoxic)[5]] <-"hypoxic medium"
+
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <=summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <=summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <=summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <=summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <=summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <=summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <=summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <=summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+
+```
+
+## Survival Fibro category proportions high mid low not split
+```{r  survival Fibro category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Fibro category proportions high mid low split by tumour type
+```{r  survival Fibro category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+dat$normal_G[dat$normal >summary(dat$normal)[5]] <-"normal high"
+dat$normal_G[dat$normal <=summary(dat$normal)[2]] <-"normal low"
+dat$normal_G[dat$normal >summary(dat$normal)[2]& dat$normal<=summary(dat$normal)[5]] <-"normal medium"
+
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[5]] <-"hypoxic high"
+dat$hypoxic_G[dat$hypoxic <=summary(dat$hypoxic)[2]] <-"hypoxic low"
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[2]& dat$hypoxic<=summary(dat$hypoxic)[5]] <-"hypoxic medium"
+
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[5]] <-"Bcell high"
+dat$Bcell_G[dat$Bcell <=summary(dat$Bcell)[2]] <-"Bcell low"
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[2]& dat$Bcell<=summary(dat$Bcell)[5]] <-"Bcell medium"
+
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[5]] <-"Myeloid high"
+dat$Myeloid_G[dat$Myeloid <=summary(dat$Myeloid)[2]] <-"Myeloid low"
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[2]& dat$Myeloid<=summary(dat$Myeloid)[5]] <-"Myeloid medium"
+
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[5]] <-"Neutrophil high"
+dat$Neutrophil_G[dat$Neutrophil <=summary(dat$Neutrophil)[2]] <-"Neutrophil low"
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[2]& dat$Neutrophil<=summary(dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[5]] <-"CD4 high"
+dat$CD4_G[dat$CD4 <=summary(dat$CD4)[2]] <-"CD4 low"
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[2]& dat$CD4<summary(dat$CD4)[5]] <-"CD4 medium"
+
+dat$CD4_Treg_G[dat$CD4_Treg >summary(dat$CD4_Treg)[5]] <-"CD4 Treg high"
+dat$CD4_Treg_G[dat$CD4_Treg <=summary(dat$CD4_Treg)[2]] <-"CD4 Treg low"
+dat$CD4_Treg_G[dat$CD4_Treg >summary(dat$CD4_Treg)[2]& dat$CD4_Treg<summary(dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+dat$CD4_IDO_G[dat$IDO_CD4 >summary(dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+dat$CD4_IDO_G[dat$IDO_CD4 <=summary(dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+dat$CD4_IDO_G[dat$IDO_CD4 >summary(dat$IDO_CD4)[2]& dat$IDO_CD4<summary(dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+dat$CD4_PD1_G[dat$PD1_CD4 >summary(dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+dat$CD4_PD1_G[dat$PD1_CD4 <=summary(dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+dat$CD4_PD1_G[dat$PD1_CD4 >summary(dat$PD1_CD4)[2]& dat$PD1_CD4<summary(dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` <=summary(dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[2]& dat$`TCF1/7_CD4`<summary(dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+dat$CD4_ki67_G[dat$ki67_CD4 >summary(dat$ki67_CD4)[5]] <-"CD4 dividing high"
+dat$CD4_ki67_G[dat$ki67_CD4 <=summary(dat$ki67_CD4)[2]] <-"CD4 dividing low"
+dat$CD4_ki67_G[dat$ki67_CD4 >summary(dat$ki67_CD4)[2]& dat$ki67_CD4<summary(dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[5]] <-"CD8 high"
+dat$CD8_G[dat$CD8 <=summary(dat$CD8)[2]] <-"CD8 low"
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[2]& dat$CD8<summary(dat$CD8)[5]] <-"CD8 medium"
+
+dat$CD8_IDO_G[dat$IDO_CD8 >summary(dat$IDO_CD8)[5]] <-"CD8 IDO high"
+dat$CD8_IDO_G[dat$IDO_CD8 <=summary(dat$IDO_CD8)[2]] <-"CD8 IDO low"
+dat$CD8_IDO_G[dat$IDO_CD8 >summary(dat$IDO_CD8)[2]& dat$IDO_CD8<summary(dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` <=summary(dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[2]& dat$`TCF1/7_CD8`<summary(dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+dat$CD8_ki67_G[dat$ki67_CD8 >summary(dat$ki67_CD8)[5]] <-"CD8 dividing high"
+dat$CD8_ki67_G[dat$ki67_CD8 <=summary(dat$ki67_CD8)[2]] <-"CD8 dividing low"
+dat$CD8_ki67_G[dat$ki67_CD8 >summary(dat$ki67_CD8)[2]& dat$ki67_CD8<summary(dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()+scale_y_continuous(trans = "log10")
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Fibro categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy
+```{r  KW W Fibro category proportions WO neo,  fig.width=12, fig.height=30, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+#categories="N"
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$tcell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r  KW W Fibro category proportions with NEO, fig.width=12, fig.height=30, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$tcell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+```{r  Correlations proportions per image,message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$RoiID <-droplevels(tdat$RoiID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated proportions",
+        mar=c(0,0,3,0))
+```
+
+## **Fibro category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro category densities patients metaclusters:
+```{r  Fibro category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4, eval=F}
+#Calculate Densities
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 10) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-left_join(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+```{r density over size, fig.width=12, fig.height=12}
+c.param <- c("Size")
+for(i in c.param){
+  print(i)
+  for(k in unique(colnames(tdat_wide_ct[,2:30]))){
+    print(k)
+print(cor.test(tdat_wide_ct[[k]],tdat_wide_ct[[i]], method="spearman", exact=F))
+  }
+}
+
+tdat_c <- left_join(tdat, clinical.data, by="Patient_ID")
+
+
+tdat_c %>% ggplot(aes(x=Size, y=density, color=Phenotype))+facet_wrap(~Phenotype, scales="free_y")+geom_point()+geom_smooth(method=lm,color="darkred")
+```
+## Survival analysis over Fibro category densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:30])
+
+#high low
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+surv_dat$normal_G <- ifelse(surv_dat$normal >summary(surv_dat$normal)[3],"normal high","normal low")
+surv_dat$hypoxic_G <- ifelse(surv_dat$hypoxic >summary(surv_dat$hypoxic)[3],"hypoxic high","hypoxic low")
+
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Bcell_G","Neutrophil_G","Myeloid_G", "metacluster","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G","normal_G","hypoxic_G")
+```
+
+#CoxPH for Fibro category density corrected for Stage, Grade and M
+```{r  Coxph Fibro category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   PDPN_CAF + IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+iCAF+tpCAF+mCAF+hypoxic_CAF+SMA_CAF+Collagen_CAF+Neutrophil+Bcell+ CD4 + CD8+`TCF1/7_CD4`+ki67_CD8+`TCF1/7_CD8`+PD1_CD4+IDO_CD8 +ki67_CD4+IDO_CD4+CD4_Treg + Myeloid + normal+hypoxic+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = sqrt(estimate),color =p.value < 0.05)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+dat$Neutrophil_G <- ifelse(dat$Neutrophil >summary(dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+dat$Bcell_G <- ifelse(dat$Bcell >summary(dat$Bcell)[3],"B cell high","B cell low")
+dat$Myeloid_G <- ifelse(dat$Myeloid >summary(dat$Myeloid)[3],"Myeloid high","Myeloid low")
+dat$CD4_G <- ifelse(dat$CD4 >summary(dat$CD4)[3],"CD4 high","CD4 low")
+dat$CD4_Treg_G <- ifelse(dat$CD4_Treg >summary(dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+dat$CD4_IDO_G <- ifelse(dat$IDO_CD4 >summary(dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+dat$CD4_PD1_G <- ifelse(dat$PD1_CD4 >summary(dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+dat$CD4_TCF_G <- ifelse(dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+dat$CD4_ki67_G <- ifelse(dat$ki67_CD4 >summary(dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+dat$CD8_IDO_G <- ifelse(dat$IDO_CD8 >summary(dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+dat$CD8_TCF_G <- ifelse(dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+dat$CD8_G <- ifelse(dat$CD8 >summary(dat$CD8)[3],"CD8 high","CD8 low")
+dat$CD8_ki67_G <- ifelse(dat$ki67_CD8 >summary(dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+dat$normal_G <- ifelse(dat$normal >summary(dat$normal)[3],"normal high","normal low")
+dat$hypoxic_G <- ifelse(dat$hypoxic >summary(dat$hypoxic)[3],"hypoxic high","hypoxic low")
+
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:30])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <=summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[5]] <-"normal high"
+surv_dat$normal_G[surv_dat$normal <=summary(surv_dat$normal)[2]] <-"normal low"
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[2]& surv_dat$normal<=summary(surv_dat$normal)[5]] <-"normal medium"
+
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[5]] <-"hypoxic high"
+surv_dat$hypoxic_G[surv_dat$hypoxic <=summary(surv_dat$hypoxic)[2]] <-"hypoxic low"
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[2]& surv_dat$hypoxic<=summary(surv_dat$hypoxic)[5]] <-"hypoxic medium"
+
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[5]] <-"Bcell high"
+surv_dat$Bcell_G[surv_dat$Bcell <=summary(surv_dat$Bcell)[2]] <-"Bcell low"
+surv_dat$Bcell_G[surv_dat$Bcell >summary(surv_dat$Bcell)[2]& surv_dat$Bcell<=summary(surv_dat$Bcell)[5]] <-"Bcell medium"
+
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[5]] <-"Myeloid high"
+surv_dat$Myeloid_G[surv_dat$Myeloid <=summary(surv_dat$Myeloid)[2]] <-"Myeloid low"
+surv_dat$Myeloid_G[surv_dat$Myeloid >summary(surv_dat$Myeloid)[2]& surv_dat$Myeloid<=summary(surv_dat$Myeloid)[5]] <-"Myeloid medium"
+
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[5]] <-"Neutrophil high"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil <=summary(surv_dat$Neutrophil)[2]] <-"Neutrophil low"
+surv_dat$Neutrophil_G[surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[2]& surv_dat$Neutrophil<=summary(surv_dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[5]] <-"CD4 high"
+surv_dat$CD4_G[surv_dat$CD4 <=summary(surv_dat$CD4)[2]] <-"CD4 low"
+surv_dat$CD4_G[surv_dat$CD4 >summary(surv_dat$CD4)[2]& surv_dat$CD4<summary(surv_dat$CD4)[5]] <-"CD4 medium"
+
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[5]] <-"CD4 Treg high"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg <=summary(surv_dat$CD4_Treg)[2]] <-"CD4 Treg low"
+surv_dat$CD4_Treg_G[surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[2]& surv_dat$CD4_Treg<summary(surv_dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 <=summary(surv_dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+surv_dat$CD4_IDO_G[surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[2]& surv_dat$IDO_CD4<summary(surv_dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 <=summary(surv_dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+surv_dat$CD4_PD1_G[surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[2]& surv_dat$PD1_CD4<summary(surv_dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` <=summary(surv_dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+surv_dat$CD4_TCF_G[surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[2]& surv_dat$`TCF1/7_CD4`<summary(surv_dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing high"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 <=summary(surv_dat$ki67_CD4)[2]] <-"CD4 dividing low"
+surv_dat$CD4_ki67_G[surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[2]& surv_dat$ki67_CD4<summary(surv_dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[5]] <-"CD8 high"
+surv_dat$CD8_G[surv_dat$CD8 <=summary(surv_dat$CD8)[2]] <-"CD8 low"
+surv_dat$CD8_G[surv_dat$CD8 >summary(surv_dat$CD8)[2]& surv_dat$CD8<summary(surv_dat$CD8)[5]] <-"CD8 medium"
+
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO high"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 <=summary(surv_dat$IDO_CD8)[2]] <-"CD8 IDO low"
+surv_dat$CD8_IDO_G[surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[2]& surv_dat$IDO_CD8<summary(surv_dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` <=summary(surv_dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+surv_dat$CD8_TCF_G[surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[2]& surv_dat$`TCF1/7_CD8`<summary(surv_dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing high"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 <=summary(surv_dat$ki67_CD8)[2]] <-"CD8 dividing low"
+surv_dat$CD8_ki67_G[surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[2]& surv_dat$ki67_CD8<summary(surv_dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+dat$normal_G[dat$normal >summary(dat$normal)[5]] <-"normal high"
+dat$normal_G[dat$normal <=summary(dat$normal)[2]] <-"normal low"
+dat$normal_G[dat$normal >summary(dat$normal)[2]& dat$normal<=summary(dat$normal)[5]] <-"normal medium"
+
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[5]] <-"hypoxic high"
+dat$hypoxic_G[dat$hypoxic <=summary(dat$hypoxic)[2]] <-"hypoxic low"
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[2]& dat$hypoxic<=summary(dat$hypoxic)[5]] <-"hypoxic medium"
+
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[5]] <-"Bcell high"
+dat$Bcell_G[dat$Bcell <=summary(dat$Bcell)[2]] <-"Bcell low"
+dat$Bcell_G[dat$Bcell >summary(dat$Bcell)[2]& dat$Bcell<=summary(dat$Bcell)[5]] <-"Bcell medium"
+
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[5]] <-"Myeloid high"
+dat$Myeloid_G[dat$Myeloid <=summary(dat$Myeloid)[2]] <-"Myeloid low"
+dat$Myeloid_G[dat$Myeloid >summary(dat$Myeloid)[2]& dat$Myeloid<=summary(dat$Myeloid)[5]] <-"Myeloid medium"
+
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[5]] <-"Neutrophil high"
+dat$Neutrophil_G[dat$Neutrophil <=summary(dat$Neutrophil)[2]] <-"Neutrophil low"
+dat$Neutrophil_G[dat$Neutrophil >summary(dat$Neutrophil)[2]& dat$Neutrophil<=summary(dat$Neutrophil)[5]] <-"Neutrophil medium"
+
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[5]] <-"CD4 high"
+dat$CD4_G[dat$CD4 <=summary(dat$CD4)[2]] <-"CD4 low"
+dat$CD4_G[dat$CD4 >summary(dat$CD4)[2]& dat$CD4<summary(dat$CD4)[5]] <-"CD4 medium"
+
+dat$CD4_Treg_G[dat$CD4_Treg >summary(dat$CD4_Treg)[5]] <-"CD4 Treg high"
+dat$CD4_Treg_G[dat$CD4_Treg <=summary(dat$CD4_Treg)[2]] <-"CD4 Treg low"
+dat$CD4_Treg_G[dat$CD4_Treg >summary(dat$CD4_Treg)[2]& dat$CD4_Treg<summary(dat$CD4_Treg)[5]] <-"CD4 T reg medium"
+
+dat$CD4_IDO_G[dat$IDO_CD4 >summary(dat$IDO_CD4)[5]] <-"IDO_CD4 high"
+dat$CD4_IDO_G[dat$IDO_CD4 <=summary(dat$IDO_CD4)[2]] <-"IDO_CD4 low"
+dat$CD4_IDO_G[dat$IDO_CD4 >summary(dat$IDO_CD4)[2]& dat$IDO_CD4<summary(dat$IDO_CD4)[5]] <-"IDO_CD4 medium"
+
+dat$CD4_PD1_G[dat$PD1_CD4 >summary(dat$PD1_CD4)[5]] <-"PD1_CD4 high"
+dat$CD4_PD1_G[dat$PD1_CD4 <=summary(dat$PD1_CD4)[2]] <-"PD1_CD4 low"
+dat$CD4_PD1_G[dat$PD1_CD4 >summary(dat$PD1_CD4)[2]& dat$PD1_CD4<summary(dat$PD1_CD4)[5]] <-"PD1_CD4 medium"
+
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7 high"
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` <=summary(dat$`TCF1/7_CD4`)[2]] <-"CD4 TCF1/7  low"
+dat$CD4_TCF_G[dat$`TCF1/7_CD4` >summary(dat$`TCF1/7_CD4`)[2]& dat$`TCF1/7_CD4`<summary(dat$`TCF1/7_CD4`)[5]] <-"CD4 TCF1/7  medium"
+
+dat$CD4_ki67_G[dat$ki67_CD4 >summary(dat$ki67_CD4)[5]] <-"CD4 dividing high"
+dat$CD4_ki67_G[dat$ki67_CD4 <=summary(dat$ki67_CD4)[2]] <-"CD4 dividing low"
+dat$CD4_ki67_G[dat$ki67_CD4 >summary(dat$ki67_CD4)[2]& dat$ki67_CD4<summary(dat$ki67_CD4)[5]] <-"CD4 dividing medium"
+
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[5]] <-"CD8 high"
+dat$CD8_G[dat$CD8 <=summary(dat$CD8)[2]] <-"CD8 low"
+dat$CD8_G[dat$CD8 >summary(dat$CD8)[2]& dat$CD8<summary(dat$CD8)[5]] <-"CD8 medium"
+
+dat$CD8_IDO_G[dat$IDO_CD8 >summary(dat$IDO_CD8)[5]] <-"CD8 IDO high"
+dat$CD8_IDO_G[dat$IDO_CD8 <=summary(dat$IDO_CD8)[2]] <-"CD8 IDO low"
+dat$CD8_IDO_G[dat$IDO_CD8 >summary(dat$IDO_CD8)[2]& dat$IDO_CD8<summary(dat$IDO_CD8)[5]] <-"CD8 IDO medium"
+
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 high"
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` <=summary(dat$`TCF1/7_CD8`)[2]] <-"CD8 TCF1/7 low"
+dat$CD8_TCF_G[dat$`TCF1/7_CD8` >summary(dat$`TCF1/7_CD8`)[2]& dat$`TCF1/7_CD8`<summary(dat$`TCF1/7_CD8`)[5]] <-"CD8 TCF1/7 medium"
+
+dat$CD8_ki67_G[dat$ki67_CD8 >summary(dat$ki67_CD8)[5]] <-"CD8 dividing high"
+dat$CD8_ki67_G[dat$ki67_CD8 <=summary(dat$ki67_CD8)[2]] <-"CD8 dividing low"
+dat$CD8_ki67_G[dat$ki67_CD8 >summary(dat$ki67_CD8)[2]& dat$ki67_CD8<summary(dat$ki67_CD8)[5]] <-"CD8 dividing medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro category density WO neo,warning=FALSE, message=FALSE, echo=FALSE,  fig.width=12, fig.height=30}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:30])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:31])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(), legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(), legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro Category density W neo,warning=FALSE, message=FALSE, echo=FALSE,  fig.width=12, fig.height=30}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:30])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:31])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(), legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(), legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=5) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+
+
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="kendall"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation densities",
+        mar=c(0,0,3,0))
+```
+
+
+
+
+
+#######
+```{r  KW W testing CAF groups high low proportions, warning=FALSE, message=FALSE, echo=F, eval=F,fig.width=15, fig.height=15}
+#high.low proportions
+df <- read.csv(file=file.path(data_folder, paste("CAF_hi-low_median_proportions.csv")))
+#df <- read.csv(file=file.path(data_folder,  paste("CAF_hi-mid-low-quantiles_proportions.csv")))
+df <- read.csv(file=file.path(data_folder, paste("CAF_hi-low_3rdquantile_proportions.csv")))
+df$X <-NULL
+
+hl_m <- merge(tdat_sub, df, by="Patient_ID")
+
+categories <- c("dCAF_G","iCAF_G","d_tpCAF_G","hCAF_G", "tpCAF_G","SMA_CAF_G","IDO_CAF_G","vCAF_G","mCAF_G") 
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- merge(tdat, df, by="Patient_ID")
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$freq~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Proportion_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+
+
+
+```{r}
+#all.sce$immune_category <- all.sce$cell_category
+#all.sce$immune_category[all.sce$immune_category=="T cell"] <-"Immune"
+#unique(all.sce$immune_category)
+cat <- c("Immune","T cell","Tumour","vessel") #"Fibroblast",
+#cat <-"Fibroblast"
+#i<-"vessel"
+tdat <-data.frame()
+tdat_wide <-data.frame("Patient_ID"=unique(all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID))
+for(i in cat){
+  dat.sce <- all.sce_pat.roi[, all.sce_pat.roi$cell_category== paste(i)]
+  df <-as.data.frame(table(dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                          dat.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                     dat.sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+  colnames(df)<-c("Patient_ID","Phenotype","n")
+  df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+df$Phenotype <-droplevels(df$Phenotype)
+df$Patient_ID <-droplevels(df$Patient_ID)
+
+tdat <- rbind.data.frame(df, tdat)
+df_wide <- pivot_wider(df,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+  #tdat$Type <- paste(i)
+  
+  tdat_wide <- left_join(tdat_wide,df_wide)
+}
+
+
+sig <- cor.mtest(tdat_wide[complete.cases(tdat_wide), ][,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[complete.cases(tdat_wide), ][,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions per patient",
+        mar=c(0,0,3,0))
+```
+
+
+#Differential abundance loop
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=10, fig.height=6, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+
+  
+colData(all.sce_pat)<-as.data.frame(colData(all.sce_pat)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce_pat))
+#if necessary: change group_id labels
+
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met")
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"tcell_subtype"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "tcell_subtype", values_from ="tcell_subtype", values_fn = list(tcell_subtype=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Fibroblast Type", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
diff --git a/Analysis_all_minus_Tumour.Rmd b/Analysis_all_minus_Tumour.Rmd
new file mode 100644
index 0000000..e7630f0
--- /dev/null
+++ b/Analysis_all_minus_Tumour.Rmd
@@ -0,0 +1,5181 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+```{r Load libraries}
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+#library(uwot)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+set.seed(101100)
+
+```
+
+```{r}
+#set working directory 
+wd <-dirname(getwd())
+
+#clinical.data
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+all.sce <- readRDS(file=file.path(data_folder, "all_cells_combined_types_CLINICAL-DATA_FILTERED.rds"))
+all.sce$DX.name[is.na(all.sce$DX.name)]<-"NA"
+
+all.sce <-all.sce[,all.sce$cell_category!="Tumour"& all.sce$cell_category!="Other"]
+all.sce <-all.sce[, all.sce$cell_category!="Other"]
+table(all.sce$cell_category)
+saveRDS(all.sce, file=file.path(data_folder, "all_minus_Tumour-Other_CLINICAL-DATA_FILTERED.rds"))
+all.sce <- readRDS(file=file.path(data_folder, "all_minus_Tumour-Other_CLINICAL-DATA_FILTERED.rds"))
+```
+
+
+
+```{r}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+```{r}
+#All Fibros together
+tbl <- as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+print(tbl)
+
+summary(tbl$`Cell number overall`) #3-6259
+tbl[tbl$`Cell number overall` <=100,] #43 patients have less than 100 F
+
+#Lowest 10% of all patients' Cell numbers
+tbl[order(tbl$`Cell number overall`),]$`Cell number overall`[1:100] #5-822 = lowest 10%, 5-503 = lowest 5%
+
+
+#Highest 10% of all patients' Cell numbers
+tbl[order(-tbl$`Cell number overall`),]$`Cell number overall`[1:100] #760-2378
+
+tbl[tbl$`Cell number overall` <=100,] #43 patients have less than 100 Fibros=10%. 77 patients have less than 15 Fibros
+
+all_cells_pat <- tbl[tbl$`Cell number overall` <=500,]$`Patient ID`
+length(all_cells_pat)
+length(unique(all.sce$Patient_ID))
+```
+
+```{r}
+#All Immune cells together
+tbl <- as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Cell numbers per ROI overall including Fibros-excluding undefined.csv")))
+print(tbl)
+summary(tbl$`Cell number overall`) #1-5075
+tbl[tbl$`Cell number overall` <=100,] #93 images have less than 50 Fibros (10% of median)
+#Lowest 10% of all patients' Cell numbers
+tbl[order(tbl$`Cell number overall`),]$`Cell number overall`[1:200] #1-110
+
+
+#Highest 10% of all patients' Cell numbers
+tbl[order(-tbl$`Cell number overall`),]$`Cell number overall`[1:200] #1385-5075
+#per image cut at 50 Fibros per image equals lowest 5% -> ensures that there's at least 50 Fibros per patient
+all_cells_roi <- tbl[tbl$`Cell number overall` <=250,]$`ROI ID`
+length(all_cells_roi)
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+all.sce_pat <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$Patient_ID%in%all_cells_pat]
+length(unique(all.sce_pat$Patient_ID)) #1025
+
+#Roi removal
+all.sce_roi <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$RoiID%in%all_cells_roi]
+length(unique(all.sce_roi$Patient_ID)) #1039
+
+#Patient & Roi removal
+all.sce_pat.roi <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$Patient_ID%in%all_cells_pat&
+                               !all.sce$RoiID%in%all_cells_roi]
+length(unique(all.sce_pat.roi$Patient_ID)) #1025
+```
+
+```{r  Save SCE patient and roi removed, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-file.path(wd,"sce_objects","merge_minus_Tumour")
+unique(all.sce$cell_category)
+all.sce[, all.sce$cell_category!="Other"]
+saveRDS(all.sce_pat.roi,file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+saveRDS(all.sce_roi,file=file.path(data_folder, paste("ALL_minus-Tumour_sce_roi_rem.rds",sep="")))
+saveRDS(all.sce_pat,file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_rem.rds",sep="")))
+
+all.sce_pat.roi <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_roi_rem.rds",sep="")))
+all.sce_roi <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_roi_rem.rds",sep="")))
+all.sce_pat <-readRDS(file=file.path(data_folder, paste("ALL_minus-Tumour_sce_pat_rem.rds",sep="")))
+```
+
+```{r}
+unique(all.sce$cell_subtype)
+all.sce$cell_subtype[all.sce$cell_category=="Fibroblast"] <- all.sce[, all.sce$cell_category=="Fibroblast"]$cell_type
+```
+
+##Analysis
+#Proportions Fibro CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Fibro Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Fibro category proportions
+```{r  optimal number of clusters for non Fibro category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =15) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 15))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Fibro category
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+```
+
+
+#CoxPH for Fibro category proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   PDPN_CAF + IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+iCAF+tpCAF+mCAF+hypoxic_CAF+SMA_CAF+Collagen_CAF+Bcell+CD4+CD8+Myeloid+Neutrophil+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = sqrt(estimate),color =p.value < 0.05)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival Fibro category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster")
+
+```
+
+## Survival Fibro category proportions high mid low not split
+```{r  survival Fibro category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Fibro category proportions high mid low split by tumour type
+```{r  survival Fibro category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+ 
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Fibro category proportions with clinical parameters
+```{r  Correlation of non Fibro category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+all.sce_pat$NeoAdj <- ifelse(all.sce_pat$Chemo==1 |all.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+
+cor.test(tdat_wide_as$iCAF,tdat_wide_as$Grade, method="spearman", exact=F)$p.value
+
+
+tdat_wide_as_sub <-tdat_wide_as %>% select(c("Patient_ID",contains("CAF"),"Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage"))
+rownames(tdat_wide_as_sub)<-tdat_wide_as_sub$Patient_ID
+
+library(corrplot)
+cor(tdat_wide_as_sub[,-1], method="spearman")
+
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions per patient",
+        mar=c(0,0,3,0))
+
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Fibro categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy
+```{r  KW W Fibro category proportions WO neo, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r  KW W Fibro category proportions with NEO, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **Fibro category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro category densities patients metaclusters:
+```{r  Fibro category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 10) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Fibro category densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:12])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+
+```
+
+#CoxPH for Fibro category density corrected for Stage, Grade and M
+```{r  Coxph Fibro category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   IDO_CAF + SMA_CAF+PDPN_CAF+Collagen_CAF +hypoxic_CAF+hypoxic_tpCAF+tpCAF+ dCAF+iCAF+vCAF+mCAF+Bcell+CD4+CD8+Myeloid+Neutrophil+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = sqrt(estimate),color =p.value < 0.05)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$mCAF_G <- ifelse(dat$mCAF >summary(dat$mCAF)[3],"mCAF high","mCAF low")
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$iCAF_G <- ifelse(dat$iCAF >summary(dat$iCAF)[3],"iCAF high","iCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$tpCAF_G <- ifelse(dat$tpCAF >summary(dat$tpCAF)[3],"tpCAF high","tpCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:12])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <=summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[5]] <-"mCAF high"
+surv_dat$mCAF_G[surv_dat$mCAF <=summary(surv_dat$mCAF)[2]] <-"mCAF low"
+surv_dat$mCAF_G[surv_dat$mCAF >summary(surv_dat$mCAF)[2]& surv_dat$mCAF<=summary(surv_dat$mCAF)[5]] <-"mCAF medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[5]] <-"tpCAF high"
+surv_dat$tpCAF_G[surv_dat$tpCAF <=summary(surv_dat$tpCAF)[2]] <-"tpCAF low"
+surv_dat$tpCAF_G[surv_dat$tpCAF >summary(surv_dat$tpCAF)[2]& surv_dat$tpCAF<=summary(surv_dat$tpCAF)[5]] <-"tpCAF medium"
+
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[5]] <-"iCAF high"
+surv_dat$iCAF_G[surv_dat$iCAF <=summary(surv_dat$iCAF)[2]] <-"iCAF low"
+surv_dat$iCAF_G[surv_dat$iCAF >summary(surv_dat$iCAF)[2]& surv_dat$iCAF<=summary(surv_dat$iCAF)[5]] <-"iCAF medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[5]] <-"mCAF high"
+dat$mCAF_G[dat$mCAF <=summary(dat$mCAF)[2]] <-"mCAF low"
+dat$mCAF_G[dat$mCAF >summary(dat$mCAF)[2]& dat$mCAF<=summary(dat$mCAF)[5]] <-"mCAF medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[5]] <-"tpCAF high"
+dat$tpCAF_G[dat$tpCAF <=summary(dat$tpCAF)[2]] <-"tpCAF low"
+dat$tpCAF_G[dat$tpCAF >summary(dat$tpCAF)[2]& dat$tpCAF<=summary(dat$tpCAF)[5]] <-"tpCAF medium"
+
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[5]] <-"iCAF high"
+dat$iCAF_G[dat$iCAF <=summary(dat$iCAF)[2]] <-"iCAF low"
+dat$iCAF_G[dat$iCAF >summary(dat$iCAF)[2]& dat$iCAF<=summary(dat$iCAF)[5]] <-"iCAF medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro category density WO neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro Category density W neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:12])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:13])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation Fibro category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+
+
+########################################################################################################################################################
+
+##Analysis
+#Proportions Fibro CATEGORY
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **Fibro Category**
+
+## **Proportions**
+Optimal number of patient metaclusters Fibro category proportions
+```{r  optimal number of clusters for non Fibro category type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce_pat[, all.sce_pat$Patient_ID!="Control"&
+                                           all.sce_pat$DX.name=="Adenocarcinoma"|
+                                           all.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 14) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_subtype_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_tcell-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - Fibro category
+Fibro categories (CD4 and CD8) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data Fibro category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+surv_dat$mCAF_MMP11_G <- ifelse(surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+surv_dat$mCAF_Col_Cdh_G <- ifelse(surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+surv_dat$tpCAF_CD10_G <- ifelse(surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+surv_dat$tpCAF_CD73_G <- ifelse(surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+surv_dat$iCAF_CD248_G <- ifelse(surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+surv_dat$iCAF_CD34_G <- ifelse(surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G","iCAF_CD34_G", "iCAF_CD248_G","metacluster")
+```
+
+
+#CoxPH for Fibro category proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   mCAF_MMP11+PDPN_CAF +mCAF_Col_Cdh+iCAF_CD34+iCAF_CD248+ IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+tpCAF_CD10+tpCAF_CD73+hypoxic_CAF+SMA_CAF+Collagen_CAF+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G","iCAF_CD34_G", "iCAF_CD248_G","metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+    
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+dat$mCAF_MMP11_G <- ifelse(dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+dat$mCAF_Col_Cdh_G <- ifelse(dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+dat$tpCAF_CD10_G <- ifelse(dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+dat$tpCAF_CD73_G <- ifelse(dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+dat$iCAF_CD248_G <- ifelse(dat$iCAF_CD248 >summary(dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+dat$iCAF_CD34_G <- ifelse(dat$iCAF_CD34 >summary(dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival Fibro category proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh <=summary(surv_dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[2]& surv_dat$mCAF_Col_Cdh<=summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 <=summary(surv_dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[2]& surv_dat$mCAF_MMP11<=summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 <=summary(surv_dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[2]& surv_dat$tpCAF_CD10<=summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 <=summary(surv_dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[2]& surv_dat$tpCAF_CD73<=summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 <=summary(surv_dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[2]& surv_dat$iCAF_CD34<=summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 <=summary(surv_dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[2]& surv_dat$iCAF_CD248<=summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","SMA_CAF_G","hypoxic_CAF_G","Collagen_CAF_G","tpCAF_CD73_G","tpCAF_CD10_G","mCAF_Col_Cdh_G","mCAF_MMP11_G","iCAF_CD34_G", "iCAF_CD248_G", "metacluster")
+
+```
+
+## Survival Fibro category proportions high mid low not split
+```{r  survival Fibro category proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival Fibro category proportions high mid low split by tumour type
+```{r  survival Fibro category proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh <=summary(dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[2]& dat$mCAF_Col_Cdh<=summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 <=summary(dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[2]& dat$mCAF_MMP11<=summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 <=summary(dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[2]& dat$tpCAF_CD10<=summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 <=summary(dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[2]& dat$tpCAF_CD73<=summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+dat$iCAF_CD34_G[dat$iCAF_CD34 <=summary(dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[2]& dat$iCAF_CD34<=summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+dat$iCAF_CD248_G[dat$iCAF_CD248 <=summary(dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[2]& dat$iCAF_CD248<=summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph Fibro category proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of Fibro category proportions with clinical parameters
+```{r  Correlation of non Fibro category proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+all.sce_pat$NeoAdj <- ifelse(all.sce_pat$Chemo==1 |all.sce_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for Fibro categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon Fibro category ~ clinical data excluding neoadjuvant therapy
+```{r  KW W Fibro category proportions WO neo, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r  KW W Fibro category proportions with NEO, fig.width=12, fig.height=10, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.sce_pat[,all.sce_pat$Patient_ID!="Control"&
+                                          all.sce_pat$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$cell_subtype, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+#HERE
+## **Fibro category**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of Fibro category densities patients metaclusters:
+```{r  Fibro category density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+all.sce_pat.roi$tcell_subtype <- all.sce_pat.roi$cell_subtype
+all.sce_pat.roi$tcell_subtype[all.sce_pat.roi$tcell_subtype=="iCAF_CD248"|all.sce_pat.roi$tcell_subtype=="iCAF_CD34"] <-"iCAF"
+all.sce_pat.roi$tcell_subtype[all.sce_pat.roi$tcell_subtype=="tpCAF_CD10"|all.sce_pat.roi$tcell_subtype=="tpCAF_CD73"] <-"tpCAF"
+all.sce_pat.roi$tcell_subtype[all.sce_pat.roi$tcell_subtype=="mCAF_MMP11"|all.sce_pat.roi$tcell_subtype=="mCAF_Col_Cdh"] <-"mCAF"
+
+all.sce_pat$tcell_subtype <- all.sce_pat$cell_subtype
+all.sce_pat$tcell_subtype[all.sce_pat$tcell_subtype=="iCAF_CD248"|all.sce_pat$tcell_subtype=="iCAF_CD34"] <-"iCAF"
+all.sce_pat$tcell_subtype[all.sce_pat$tcell_subtype=="tpCAF_CD10"|all.sce_pat$tcell_subtype=="tpCAF_CD73"] <-"tpCAF"
+all.sce_pat$tcell_subtype[all.sce_pat$tcell_subtype=="mCAF_MMP11"|all.sce_pat$tcell_subtype=="mCAF_Col_Cdh"] <-"mCAF"
+
+all.sce_roi$tcell_subtype <- all.sce_roi$cell_subtype
+all.sce_roi$tcell_subtype[all.sce_roi$tcell_subtype=="iCAF_CD248"|all.sce_roi$tcell_subtype=="iCAF_CD34"] <-"iCAF"
+all.sce_roi$tcell_subtype[all.sce_roi$tcell_subtype=="tpCAF_CD10"|all.sce_roi$tcell_subtype=="tpCAF_CD73"] <-"tpCAF"
+all.sce_roi$tcell_subtype[all.sce_roi$tcell_subtype=="mCAF_MMP11"|all.sce_roi$tcell_subtype=="mCAF_Col_Cdh"] <-"mCAF"
+
+#Calculate Densities
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities Fibro category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+library(ggsci)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+#+theme(legend.position = "none")+
+scale_fill_igv()
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+#+theme(legend.position = "none")
+scale_fill_igv()
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r Fibro category densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=8))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+scale_fill_igv()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over Fibro category densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat Fibro category densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:27])
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+surv_dat$mCAF_MMP11_G <- ifelse(surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+surv_dat$mCAF_Col_Cdh_G <- ifelse(surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+surv_dat$tpCAF_CD10_G <- ifelse(surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+surv_dat$tpCAF_CD73_G <- ifelse(surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+surv_dat$iCAF_CD248_G <- ifelse(surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+surv_dat$iCAF_CD34_G <- ifelse(surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+```
+
+#CoxPH for Fibro category density corrected for Stage, Grade and M
+```{r  Coxph Fibro category density, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+res.cox <- coxph(Surv(time, status) ~   CD8+CD4+CD4_Treg+IDO_CD4+PD1_CD4+`TCF1/7_CD4`+ki67_CD4+IDO_CD8+`TCF1/7_CD8`+ki67_CD8+IDO_CAF + SMA_CAF+PDPN_CAF+Collagen_CAF +hypoxic_CAF+hypoxic_tpCAF+tpCAF+vessel+HEV+Bcell+Myeloid+Bcell+ dCAF+iCAF+vCAF+mCAF+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival Fibro category densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("metacluster")
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+print(pw)
+print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+   symbols = c("****", "***", "**", "*", "+", "."),
+   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival Fibro category densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Collagen_CAF_G <- ifelse(dat$Collagen_CAF >summary(dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+dat$hypoxic_CAF_G <- ifelse(dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+
+dat$mCAF_MMP11_G <- ifelse(dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[3],"mCAF_MMP11 high","mCAF_MMP11 low")
+dat$mCAF_Col_Cdh_G <- ifelse(dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[3],"mCAF_Col_Cdh high","mCAF_Col_Cdh low")
+
+dat$SMA_CAF_G <- ifelse(dat$SMA_CAF >summary(dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+
+dat$tpCAF_CD10_G <- ifelse(dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[3],"tpCAF_CD10 high","tpCAF_CD10 low")
+dat$tpCAF_CD73_G <- ifelse(dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[3],"tpCAF_CD73 high","tpCAF_CD73 low")
+
+dat$iCAF_CD248_G <- ifelse(dat$iCAF_CD248 >summary(dat$iCAF_CD248)[3],"iCAF_CD248 high","iCAF_CD248 low")
+dat$iCAF_CD34_G <- ifelse(dat$iCAF_CD34 >summary(dat$iCAF_CD34)[3],"iCAF_CD34 high","iCAF_CD34 low")
+
+dat$vCAF_G <- ifelse(dat$vCAF >summary(dat$vCAF)[3],"vCAF high","vCAF low")
+dat$dCAF_G <- ifelse(dat$dCAF >summary(dat$dCAF)[3],"dCAF high","dCAF low")
+dat$hypoxic_tpCAF_G <- ifelse(dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+dat$IDO_CAF_G <- ifelse(dat$IDO_CAF >summary(dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+dat$PDPN_CAF_G <- ifelse(dat$PDPN_CAF >summary(dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the densities (continuous).   
+```{r  coxph Fibro category densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival Fibro category densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat Fibro category densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- rowSums(surv_dat[,2:15])
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh <=summary(surv_dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+surv_dat$mCAF_Col_Cdh_G[surv_dat$mCAF_Col_Cdh >summary(surv_dat$mCAF_Col_Cdh)[2]& surv_dat$mCAF_Col_Cdh<=summary(surv_dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 <=summary(surv_dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+surv_dat$mCAF_MMP11_G[surv_dat$mCAF_MMP11 >summary(surv_dat$mCAF_MMP11)[2]& surv_dat$mCAF_MMP11<=summary(surv_dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[5]] <-"vCAF high"
+surv_dat$vCAF_G[surv_dat$vCAF <=summary(surv_dat$vCAF)[2]] <-"vCAF low"
+surv_dat$vCAF_G[surv_dat$vCAF >summary(surv_dat$vCAF)[2]& surv_dat$vCAF<=summary(surv_dat$vCAF)[5]] <-"vCAF medium"
+
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 <=summary(surv_dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+surv_dat$tpCAF_CD10_G[surv_dat$tpCAF_CD10 >summary(surv_dat$tpCAF_CD10)[2]& surv_dat$tpCAF_CD10<=summary(surv_dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 <=summary(surv_dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+surv_dat$tpCAF_CD73_G[surv_dat$tpCAF_CD73 >summary(surv_dat$tpCAF_CD73)[2]& surv_dat$tpCAF_CD73<=summary(surv_dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 <=summary(surv_dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+surv_dat$iCAF_CD34_G[surv_dat$iCAF_CD34 >summary(surv_dat$iCAF_CD34)[2]& surv_dat$iCAF_CD34<=summary(surv_dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 <=summary(surv_dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+surv_dat$iCAF_CD248_G[surv_dat$iCAF_CD248 >summary(surv_dat$iCAF_CD248)[2]& surv_dat$iCAF_CD248<=summary(surv_dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[5]] <-"dCAF high"
+surv_dat$dCAF_G[surv_dat$dCAF <=summary(surv_dat$dCAF)[2]] <-"dCAF low"
+surv_dat$dCAF_G[surv_dat$dCAF >summary(surv_dat$dCAF)[2]& surv_dat$dCAF<=summary(surv_dat$dCAF)[5]] <-"dCAF medium"
+
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF <=summary(surv_dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+surv_dat$Collagen_CAF_G[surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[2]& surv_dat$Collagen_CAF<=summary(surv_dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF <=summary(surv_dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+surv_dat$hypoxic_CAF_G[surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[2]& surv_dat$hypoxic_CAF<=summary(surv_dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF high"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF <=summary(surv_dat$SMA_CAF)[2]] <-"SMA_CAF low"
+surv_dat$SMA_CAF_G[surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[2]& surv_dat$SMA_CAF<=summary(surv_dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF <=summary(surv_dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+surv_dat$hypoxic_tpCAF_G[surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[2]& surv_dat$hypoxic_tpCAF<=summary(surv_dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF high"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF <=summary(surv_dat$IDO_CAF)[2]] <-"IDO_CAF low"
+surv_dat$IDO_CAF_G[surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[2]& surv_dat$IDO_CAF<=summary(surv_dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF <=summary(surv_dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+surv_dat$PDPN_CAF_G[surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[2]& surv_dat$PDPN_CAF<=summary(surv_dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Fibro category densities split by tumour type
+```{r  survival Fibro category densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_CD34_G","iCAF_CD248_G","tpCAF_CD10_G","tpCAF_CD73_G","SMA_CAF_G","mCAF_MMP11_G","mCAF_Col_Cdh_G","hypoxic_CAF_G","Collagen_CAF_G", "metacluster","Density_G")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh high"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh <=summary(dat$mCAF_Col_Cdh)[2]] <-"mCAF_Col_Cdh low"
+dat$mCAF_Col_Cdh_G[dat$mCAF_Col_Cdh >summary(dat$mCAF_Col_Cdh)[2]& dat$mCAF_Col_Cdh<=summary(dat$mCAF_Col_Cdh)[5]] <-"mCAF_Col_Cdh medium"
+
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 high"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 <=summary(dat$mCAF_MMP11)[2]] <-"mCAF_MMP11 low"
+dat$mCAF_MMP11_G[dat$mCAF_MMP11 >summary(dat$mCAF_MMP11)[2]& dat$mCAF_MMP11<=summary(dat$mCAF_MMP11)[5]] <-"mCAF_MMP11 medium"
+
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[5]] <-"vCAF high"
+dat$vCAF_G[dat$vCAF <=summary(dat$vCAF)[2]] <-"vCAF low"
+dat$vCAF_G[dat$vCAF >summary(dat$vCAF)[2]& dat$vCAF<=summary(dat$vCAF)[5]] <-"vCAF medium"
+
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 high"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 <=summary(dat$tpCAF_CD10)[2]] <-"tpCAF_CD10 low"
+dat$tpCAF_CD10_G[dat$tpCAF_CD10 >summary(dat$tpCAF_CD10)[2]& dat$tpCAF_CD10<=summary(dat$tpCAF_CD10)[5]] <-"tpCAF_CD10 medium"
+
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 high"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 <=summary(dat$tpCAF_CD73)[2]] <-"tpCAF_CD73 low"
+dat$tpCAF_CD73_G[dat$tpCAF_CD73 >summary(dat$tpCAF_CD73)[2]& dat$tpCAF_CD73<=summary(dat$tpCAF_CD73)[5]] <-"tpCAF_CD73 medium"
+
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 high"
+dat$iCAF_CD34_G[dat$iCAF_CD34 <=summary(dat$iCAF_CD34)[2]] <-"iCAF_CD34 low"
+dat$iCAF_CD34_G[dat$iCAF_CD34 >summary(dat$iCAF_CD34)[2]& dat$iCAF_CD34<=summary(dat$iCAF_CD34)[5]] <-"iCAF_CD34 medium"
+
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 high"
+dat$iCAF_CD248_G[dat$iCAF_CD248 <=summary(dat$iCAF_CD248)[2]] <-"iCAF_CD248 low"
+dat$iCAF_CD248_G[dat$iCAF_CD248 >summary(dat$iCAF_CD248)[2]& dat$iCAF_CD248<=summary(dat$iCAF_CD248)[5]] <-"iCAF_CD248 medium"
+
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[5]] <-"dCAF high"
+dat$dCAF_G[dat$dCAF <=summary(dat$dCAF)[2]] <-"dCAF low"
+dat$dCAF_G[dat$dCAF >summary(dat$dCAF)[2]& dat$dCAF<=summary(dat$dCAF)[5]] <-"dCAF medium"
+
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF high"
+dat$Collagen_CAF_G[dat$Collagen_CAF <=summary(dat$Collagen_CAF)[2]] <-"Collagen_CAF low"
+dat$Collagen_CAF_G[dat$Collagen_CAF >summary(dat$Collagen_CAF)[2]& dat$Collagen_CAF<=summary(dat$Collagen_CAF)[5]] <-"Collagen_CAF medium"
+
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF high"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF <=summary(dat$hypoxic_CAF)[2]] <-"hypoxic_CAF low"
+dat$hypoxic_CAF_G[dat$hypoxic_CAF >summary(dat$hypoxic_CAF)[2]& dat$hypoxic_CAF<=summary(dat$hypoxic_CAF)[5]] <-"hypoxic_CAF medium"
+
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[5]] <-"SMA_CAF high"
+dat$SMA_CAF_G[dat$SMA_CAF <=summary(dat$SMA_CAF)[2]] <-"SMA_CAF low"
+dat$SMA_CAF_G[dat$SMA_CAF >summary(dat$SMA_CAF)[2]& dat$SMA_CAF<=summary(dat$SMA_CAF)[5]] <-"SMA_CAF medium"
+
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF high"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF <=summary(dat$hypoxic_tpCAF)[2]] <-"hypoxic_tpCAF low"
+dat$hypoxic_tpCAF_G[dat$hypoxic_tpCAF >summary(dat$hypoxic_tpCAF)[2]& dat$hypoxic_tpCAF<=summary(dat$hypoxic_tpCAF)[5]] <-"hypoxic_tpCAF medium"
+
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[5]] <-"IDO_CAF high"
+dat$IDO_CAF_G[dat$IDO_CAF <=summary(dat$IDO_CAF)[2]] <-"IDO_CAF low"
+dat$IDO_CAF_G[dat$IDO_CAF >summary(dat$IDO_CAF)[2]& dat$IDO_CAF<=summary(dat$IDO_CAF)[5]] <-"IDO_CAF medium"
+
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF high"
+dat$PDPN_CAF_G[dat$PDPN_CAF <=summary(dat$PDPN_CAF)[2]] <-"PDPN_CAF low"
+dat$PDPN_CAF_G[dat$PDPN_CAF >summary(dat$PDPN_CAF)[2]& dat$PDPN_CAF<=summary(dat$PDPN_CAF)[5]] <-"PDPN_CAF medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-cell_subtype_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high mid low as well as the densities (continuous).   
+```{r coxph Fibro category densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tcell-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W Fibro category density WO neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[2:15])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:16])), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox Fibro Category density W neo,warning=FALSE, messages=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- rowSums(tdat_wide[,2:15])
+tdat <-tdat_wide %>% pivot_longer(cols=c(colnames(tdat_wide[,2:16])), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=7) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tcell-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation Fibro category densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$CD4 + tdat_wide$CD8
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$CD4,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$CD8,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
+
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of T cell type densities",
+        mar=c(0,0,3,0))
+```
+
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of T cell type densities",
+        mar=c(0,0,3,0))
+```
+
+
+
+#######
+```{r  KW W testing CAF groups high low proportions, warning=FALSE, message=FALSE, echo=F, fig.width=15, fig.height=15}
+#high.low proportions
+df <- read.csv(file=file.path(data_folder, paste("CAF_hi-low_median_proportions.csv")))
+#df <- read.csv(file=file.path(data_folder,  paste("CAF_hi-mid-low-quantiles_proportions.csv")))
+df <- read.csv(file=file.path(data_folder, paste("CAF_hi-low_3rdquantile_proportions.csv")))
+df$X <-NULL
+
+hl_m <- merge(tdat_sub, df, by="Patient_ID")
+
+categories <- c("dCAF_G","iCAF_G","d_tpCAF_G","hCAF_G", "tpCAF_G","SMA_CAF_G","IDO_CAF_G","vCAF_G","mCAF_G") 
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- merge(tdat, df, by="Patient_ID")
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$freq~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Proportion_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+
+```{r  Correlations proportions per image CAF types,message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$RoiID <-droplevels(tdat$RoiID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions",
+        mar=c(0,0,3,0))
+```
+
+```{r}
+all.sce$immune_category <- all.sce$cell_category
+all.sce$immune_category[all.sce$immune_category=="T cell"] <-"Immune"
+unique(all.sce$immune_category)
+cat <- c("Immune","T cell") #"Fibroblast",
+cat <-"Fibroblast"
+#i<-"vessel"
+tdat <-data.frame()
+tdat_wide <-data.frame("Patient_ID"=unique(all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID))
+for(i in cat){
+  dat.sce <- all.sce_pat.roi[, all.sce_pat.roi$cell_category== paste(i)]
+  df <-as.data.frame(table(dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                          dat.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                     dat.sce$DX.name=="Squamous cell carcinoma"]$cell_type))
+  colnames(df)<-c("Patient_ID","Phenotype","n")
+  df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+df$Phenotype <-droplevels(df$Phenotype)
+df$Patient_ID <-droplevels(df$Patient_ID)
+
+tdat <- rbind.data.frame(df, tdat)
+df_wide <- pivot_wider(df,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+  #tdat$Type <- paste(i)
+  
+  tdat_wide <- left_join(tdat_wide,df_wide)
+}
+
+
+sig <- cor.mtest(tdat_wide[complete.cases(tdat_wide), ][,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[complete.cases(tdat_wide), ][,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions per patient",
+        mar=c(0,0,3,0))
+```
+
+```{r  Correlations proportions per patient CAF types,message=FALSE,warning=FALSE,echo=FALSE, fig.width=12, fig.height=12}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           all.sce_pat.roi[,all.sce_pat.roi$DX.name!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+rownames(tdat_wide)<-tdat_wide$Patient_ID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions per patient",
+        mar=c(0,0,3,0))
+```
+
+
+#Differential abundance loop
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+
+  
+colData(all.sce_pat)<-as.data.frame(colData(all.sce_pat)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce_pat))
+#if necessary: change group_id labels
+
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met")
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"cell_type"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "cell_type", values_from ="cell_type", values_fn = list(cell_type=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Fibroblast Type", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
diff --git a/Analysis_merged_by_category_T_I_C_V_patroirem.Rmd b/Analysis_merged_by_category_T_I_C_V_patroirem.Rmd
new file mode 100644
index 0000000..7a89b85
--- /dev/null
+++ b/Analysis_merged_by_category_T_I_C_V_patroirem.Rmd
@@ -0,0 +1,1701 @@
+---
+title: "R Notebook"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r  import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(ggsci)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+```{r}
+wd <-dirname(getwd())
+set.seed(101100)
+```
+
+
+```{r load data objects, eval=F, echo=F}
+tcell.pat.roi <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/immune/Tcell/Tcell_sce_pat_roi_rem.rds")
+fibro.pat.roi <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/stroma/CAF/Fibro_sce_pat_roi_rem.rds")
+immune.pat.roi <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/immune/Immune_sce_pat_roi_rem.rds")
+vessel.pat.roi <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/stroma/vessel/vessel_sce_pat_roi_rem.rds")
+tumour.pat.roi <-readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/Tumour/Tumour_sce_pat_roi_rem.rds")
+```
+
+```{r merge objects, eval=FALSE, echo=F}
+colnames(colData(tcell.pat.roi))
+colnames(colData(fibro.pat.roi))
+colnames(colData(immune.pat.roi))
+
+colnames(colData(vessel.pat.roi))
+colnames(colData(tumour.pat.roi))
+
+fibro.pat.roi$cell_subtype <- fibro.pat.roi$cell_type
+all.pat.roi <-cbind(tcell.pat.roi,fibro.pat.roi,immune.pat.roi)
+
+vessel.pat.roi$cell_category <-"Vessel"
+vessel.pat.roi$cell_type <- vessel.pat.roi$vessel_type
+vessel.pat.roi$cell_subtype <- vessel.pat.roi$vessel_type
+vessel.pat.roi$vessel_type <-NULL
+vessel.pat.roi$rp_vessel_all_k50 <-NULL
+vessel.pat.roi$rp_vessel_all_k50.1 <-NULL
+reducedDim(vessel.pat.roi)<-NULL
+
+tumour.pat.roi$cell_category <-"Tumour"
+tumour.pat.roi$cell_type <- tumour.pat.roi$tumour_type
+tumour.pat.roi$cell_subtype <- tumour.pat.roi$tumour_type
+tumour.pat.roi$tumour_type <-NULL
+
+all.pat.roi <-cbind(tcell.pat.roi,fibro.pat.roi,immune.pat.roi,vessel.pat.roi,tumour.pat.roi)
+saveRDS(all.pat.roi, file=file.path(wd, "sce_objects", "merge_by_category", "all_merge_Tumour_Immune_Tcell_CAF_Vessel_pat-roi-rem.rds"))
+
+all.pat.roi<-readRDS(file=file.path(wd, "sce_objects", "merge_by_category", "all_merge_Tumour_Immune_Tcell_CAF_Vessel_pat-roi-rem.rds"))
+
+all.pat.roi$DX.name[is.na(all.pat.roi$DX.name)]<-"NA"
+all.pat.roi$cell_subtype %>% unique
+```
+
+```{r all together , eval=FALSE, echo=F}
+all.minusTumour <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/merge_minus_Tumour/all_minus_Tumour-Other_CLINICAL-DATA_FILTERED.rds")
+tumour.sce <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/Tumour/FINAL_All_Tumour_clustered.rds")
+
+colnames(colData(tumour.sce))[!colnames(colData(tumour.sce))%in% colnames(colData(all.sce))]
+colnames(colData(all.sce))[!colnames(colData(all.sce))%in% colnames(colData(tumour.sce))]
+
+tumour.sce$cell_type <- tumour.sce$tumour_type
+tumour.sce$cell_subtype <- tumour.sce$tumour_type
+tumour.sce$cell_category <-"Tumour"
+tumour.sce$tumour_type <-NULL
+
+all.sce <- cbind(all.sce, tumour.sce)
+saveRDS(all.sce, file=file.path(data_folder, "FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_minusOther.rds"))
+unique(all.sce$cell_category)
+```
+
+```{r read data}
+
+all.pat.roi<-readRDS(file=file.path(wd, "sce_objects", "merge_by_category", "all_merge_Tumour_Immune_Tcell_CAF_Vessel_pat-roi-rem.rds"))
+
+all.pat.roi$DX.name[is.na(all.pat.roi$DX.name)]<-"NA"
+```
+
+
+
+```{r add T cell subtype, echo=F}
+all.pat.roi$tcell_subtype <-all.pat.roi$cell_subtype
+all.pat.roi$tcell_subtype[all.pat.roi$cell_category=="Fibroblast"] <- all.pat.roi[, all.pat.roi$cell_category=="Fibroblast"]$cell_type
+all.pat.roi$tcell_subtype %>% unique
+
+
+all.sce$tcell_subtype <-all.sce$cell_subtype
+all.sce$tcell_subtype[all.sce$cell_category=="Fibroblast"] <- all.sce[, all.sce$cell_category=="Fibroblast"]$cell_type
+all.sce$tcell_subtype %>% unique
+```
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=8, fig.height=6, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df$X <-NULL
+  
+colData(all.pat.roi)<-as.data.frame(colData(all.pat.roi)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.pat.roi))
+#if necessary: change group_id labels
+
+dat <-left_join(dat, df, by="Patient_ID")
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+#dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+#dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+#dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+caf_groups <- colnames(df[,-1])
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met",caf_groups)
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"tcell_subtype"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = paste(j), values_from =paste(j), values_fn = list(tcell_subtype=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "Neighbour_Cluster")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(Neighbour_Cluster,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Neighbour_Cluster", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=Neighbour_Cluster))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=8, fig.height=6, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+df$X <-NULL
+  
+colData(all.pat.roi)<-as.data.frame(colData(all.pat.roi)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.pat.roi))
+#if necessary: change group_id labels
+
+colData(all.sce)<-as.data.frame(colData(all.sce)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce))
+#if necessary: change group_id labels
+
+
+dat <-left_join(dat, df, by="Patient_ID")
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+#dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+#dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+#dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+
+caf_groups <- colnames(df[,-1])
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met",caf_groups)
+
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"tcell_subtype"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = paste(j), values_from =paste(j), values_fn = list(tcell_subtype=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "Neighbour_Cluster")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(Neighbour_Cluster,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Neighbour_Cluster", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("DifferentialAbundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=Neighbour_Cluster))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+
+
+```{r set up tdat, echo=F}
+cat <- c("Immune","T cell","Fibroblast", "Vessel","Tumour") #"Fibroblast",
+unique(all.pat.roi$cell_category)
+tdat <-data.frame()
+tdat_wide <-data.frame("Patient_ID"=unique(all.pat.roi[,all.pat.roi$DX.name!="Control"&
+                                          all.pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID))
+for(i in cat){
+  dat.sce <- all.pat.roi[, all.pat.roi$cell_category== paste(i)]
+  df <-as.data.frame(table(dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                          dat.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                     dat.sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+  colnames(df)<-c("Patient_ID","Phenotype","n")
+  df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+df$Phenotype <-droplevels(df$Phenotype)
+df$Patient_ID <-droplevels(df$Patient_ID)
+
+tdat <- rbind.data.frame(df, tdat)
+df_wide <- pivot_wider(df,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+  #tdat$Type <- paste(i)
+  
+  tdat_wide <- left_join(tdat_wide,df_wide, by="Patient_ID")
+}
+
+write.csv(tdat_wide, file=file.path(wd,"sce_objects","merge_by_category","RAW_by_Category_tcell_subtype.csv"))
+```
+
+Define clinical data
+```{r clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+#clinical.data <-as.data.frame(colData(immune.sce))
+
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+clinical.data$TmaBlock <- clinical.data$TMA.x
+clinical.data$TMA <- clinical.data$TMA.y
+clinical.data$TMA.x<-NULL
+clinical.data$TMA.y<-NULL
+
+clinical.data$DX.name[is.na(clinical.data$DX.name)]<-"NA"
+
+```
+
+
+```{r optimal number of clusters proportions ,out.width="50%",echo=FALSE,fig.width=6, fig.height=4, warning=FALSE, message=FALSE}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+
+cat <- c("Immune","T cell","Fibroblast","Vessel","Tumour") #"Fibroblast",
+
+tdat <-data.frame()
+tdat_wide <-data.frame("Patient_ID"=unique(all.pat.roi[,all.pat.roi$DX.name!="Control"&
+                                          all.pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID))
+for(i in cat){
+  dat.sce <- all.pat.roi[, all.pat.roi$cell_category== paste(i)]
+  df <-as.data.frame(table(dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                          dat.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                     dat.sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+  colnames(df)<-c("Patient_ID","Phenotype","n")
+  df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+df$Phenotype <-droplevels(df$Phenotype)
+df$Patient_ID <-droplevels(df$Patient_ID)
+
+tdat <- rbind.data.frame(df, tdat)
+df_wide <- pivot_wider(df,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+  #tdat$Type <- paste(i)
+  
+  tdat_wide <- left_join(tdat_wide,df_wide)
+  tdat_wide <- tdat_wide[complete.cases(tdat_wide), ]
+
+}
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+ 
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=30)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=30)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=30)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 20, B = 100)
+fviz_gap_stat(gap_stat)
+```
+
+Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r calculate celltype proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+
+cat <- c("Immune","T cell","Fibroblast","Vessel","Tumour") #"Fibroblast",
+
+tdat <-data.frame()
+tdat_wide <-data.frame("Patient_ID"=unique(all.pat.roi[,all.pat.roi$DX.name!="Control"&
+                                          all.pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID))
+for(i in cat){
+  dat.sce <- all.pat.roi[, all.pat.roi$cell_category== paste(i)]
+  df <-as.data.frame(table(dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                          dat.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                     dat.sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+  colnames(df)<-c("Patient_ID","Phenotype","n")
+  df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+df$Phenotype <-droplevels(df$Phenotype)
+df$Patient_ID <-droplevels(df$Patient_ID)
+
+tdat <- rbind.data.frame(df, tdat)
+df_wide <- pivot_wider(df,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+  #tdat$Type <- paste(i)
+  
+  tdat_wide <- left_join(tdat_wide,df_wide)
+  #tdat_wide$sum <-rowSums(tdat_wide[2:25])
+  tdat_wide <- tdat_wide[complete.cases(tdat_wide), ]
+}
+tdat <- tdat[tdat$Patient_ID %in% tdat_wide$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+
+#tdat <-pivot_longer(tdat_wide,cols=colnames(tdat_wide[2:25]), names_to="Phenotype", values_to="freq" )
+
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+head(tdat_wide)
+```
+
+**Patient based metaclusters split by metacluster**
+```{r Barplot cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 20) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_Cell-category_proportions.pdf")), width=6, height=6)
+```
+
+```{r merge hc with cell category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 20))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-merge(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- merge(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+  #theme(legend.position = "none")
+```
+
+
+```{r heatmap metacluster cell category expression proportions, fig.width=12, fig.height=12, echo=F, eval=F}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "freq",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+pheatmap(scale(t(tdat_ct_w_m)))
+
+pheatmap(t(tdat_ct_w_m))
+```
+
+## All proportions
+```{r proportions high low all, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Proportions
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_proportion.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G") #
+plot_list <- list()
+
+
+ac_sqc <- clinical.data[clinical.data$DX.name=="Adenocarcinoma"|clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$freq~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Proportion_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+
+## Adenocarcinoma Proportions
+```{r Adenocarcinoma proportions strat high low, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Proportions split
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Adenocarcinoma.csv"))
+
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_proportion_Adenocarcinoma.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+pat_ac <- clinical.data[clinical.data$DX.name=="Adenocarcinoma",]$Patient_ID
+
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_ac,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G") #,"Density_G"
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_ac,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$freq~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Proportion_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+## Squamous cell carcinoma Proportions
+```{r Squamous cell carcinoma prop, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Proportions
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Squamous cell carcinoma.csv"))
+
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_proportion_Squamous cell carcinoma.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+pat_sqc <- clinical.data[clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G") #,"Density_G"
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$freq~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Proportion_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+
+```{r  Correlations proportions per image CAF types,message=FALSE,warning=FALSE,echo=FALSE,eval=F, fig.width=6, fig.height=6}
+
+cat <- c("Immune","T cell","Fibroblast", "Vessel","Tumour") #"Fibroblast",
+unique(all.pat.roi$cell_category)
+tdat <-data.frame()
+tdat_wide <-data.frame("Patient_ID"=unique(all.pat.roi[,all.pat.roi$DX.name!="Control"&
+                                          all.pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.pat.roi$DX.name=="Squamous cell carcinoma"]$Patient_ID))
+for(i in cat){
+  dat.sce <- all.pat.roi[, all.pat.roi$cell_category== paste(i)]
+  df <-as.data.frame(table(dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                          dat.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           dat.sce[,dat.sce$DX.name!="Control"&
+                                          dat.sce$DX.name=="Adenocarcinoma"|
+                                     dat.sce$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+  colnames(df)<-c("Patient_ID","Phenotype","n")
+  df <-df %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+df[is.na(df)] <- 0
+df$Phenotype <-droplevels(df$Phenotype)
+df$Patient_ID <-droplevels(df$Patient_ID)
+
+tdat <- rbind.data.frame(df, tdat)
+df_wide <- pivot_wider(df,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+  #tdat$Type <- paste(i)
+  
+  tdat_wide <- left_join(tdat_wide,df_wide, by="Patient_ID")
+}
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated proportions",
+        mar=c(0,0,3,0))
+```
+
+## Density
+
+```{r Calculate densities Fibro category, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(all.pat.roi[,all.pat.roi$Patient_ID!="Control"&
+                                          all.pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.pat.roi[,all.pat.roi$Patient_ID!="Control"&
+                                          all.pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.pat.roi$DX.name=="Squamous cell carcinoma"]$tcell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.pat.roi$RoiID,
+                                "Area"=all.pat.roi$Area_px_Core,
+                                "Patient_ID"=all.pat.roi$Patient_ID,
+                                "DX.name"=all.pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+
+  #remove super high density outliers (total density >10000)  
+ # tdat <-tdat[ !tdat$Patient_ID%in%surv_excl,]
+
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#keep only complete cases
+tdat_wide <- tdat_wide[complete.cases(tdat_wide), ]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+
+#tdat_wide <-tdat_wide[ !tdat_wide$Patient_ID%in%surv_excl,]
+
+```
+
+```{r merge hc with cell category density, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 10))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-merge(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- merge(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+  #theme(legend.position = "none")
+```
+
+```{r heatmap metacluster cell category expression density, fig.width=12, fig.height=12, echo=F}
+tdat_ct
+tdat_ct_w <-tdat_ct %>% pivot_wider(id_cols = "metacluster",names_from = "Phenotype", values_from = "density",values_fn = mean)
+tdat_ct_w_m <- as.matrix(tdat_ct_w[,-1])
+rownames(tdat_ct_w_m) <-tdat_ct_w$metacluster
+
+pheatmap(scale(t(tdat_ct_w_m)))
+
+pheatmap(t(tdat_ct_w_m))
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_igv()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+
+## Adenocarcinoma Density
+```{r Adenocarcinoma density strat, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_density_split_Adenocarcinoma.csv"))
+
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density_split_Adenocarcinoma.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+pat_ac <- clinical.data[clinical.data$DX.name=="Adenocarcinoma",]$Patient_ID
+
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_ac,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Density_G") #,"Density_G"
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_ac,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$density~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+## Squamous cell carcinoma Density
+```{r Squamous cell carcinoma Density, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_density_split_Squamous cell carcinoma.csv"))
+
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density_split_Squamous cell carcinoma.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+pat_sqc <- clinical.data[clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Density_G") #,"Density_G"
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$density~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+## All Density
+```{r density all strat high low, fig.width=15, fig.height=15, message=F, warning=F}
+#here
+#Density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+
+ac_sqc <- clinical.data[clinical.data$DX.name=="Adenocarcinoma"|clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Density_G") #,"Density_G"
+plot_list <- list()
+
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$density~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+
+
+```{r}
+#proportions
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Squamous cell carcinoma.csv"))
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion_Adenocarcinoma.csv"))
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+```
+
+
+## Adenocarcinoma Density
+```{r Adenocarcinoma density strat, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_density_split_Adenocarcinoma.csv"))
+
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density_split_Adenocarcinoma.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+pat_ac <- clinical.data[clinical.data$DX.name=="Adenocarcinoma",]$Patient_ID
+
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_ac,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Density_G") #,"Density_G"
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_ac,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$density~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+## Squamous cell carcinoma Density
+```{r Squamous cell carcinoma Density, fig.width=15, fig.height=15, message=F, warning=F, echo=F}
+#Density
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_density_split_Squamous cell carcinoma.csv"))
+
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density_split_Squamous cell carcinoma.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+pat_sqc <- clinical.data[clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Density_G") #,"Density_G"
+plot_list <- list()
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% pat_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$density~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
+
+## All Density
+```{r density all strat high low, fig.width=15, fig.height=15, message=F, warning=F}
+#here
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+#Density
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_mid_low_density.csv"))
+
+
+df$X <-NULL
+
+hl_m <- left_join(tdat, df, by="Patient_ID")
+
+ac_sqc <- clinical.data[clinical.data$DX.name=="Adenocarcinoma"|clinical.data$DX.name=="Squamous cell carcinoma",]$Patient_ID
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+categories <- c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Density_G") #,"Density_G"
+plot_list <- list()
+
+
+for (i in (categories)) {
+hl_m <- left_join(tdat, df, by="Patient_ID")
+hl_m <- hl_m[hl_m$Patient_ID %in% ac_sqc,]
+
+  for (j in unique(hl_m$Phenotype)){
+    df_l<-hl_m
+    df_l[[i]] <-as.factor(df_l[[i]])
+    df_l <-df_l %>% drop_na(i)
+    df_l$i <-as.factor(df_l[[i]])
+    df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+    df_l <-subset(df_l, Phenotype==j)
+
+    if (length(unique(df_l[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- df_l%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   stat.test$y.position <-sqrt(stat.test$y.position)
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_tableau()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+    else {
+if(length(unique(df_l[[i]])) == 2){
+    
+      df_l<-hl_m
+      df_l[[i]] <-as.factor(df_l[[i]])
+      df_l <-df_l %>% drop_na(i)
+      df_l$Phenotype <-droplevels(df_l$Phenotype)
+
+      df_l <-subset(df_l, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- df_l %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(df_l$density~df_l[[i]], paired=F)$p.value)
+      df_l <- merge(df_l, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      df_l$p.wt <- paste0('p=',round(df_l$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(df_l, aes(x= .data[[i]], y = sqrt(density), colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+        scale_color_tableau()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),
+              legend.position = "none")
+    }  
+ 
+    }
+   }  
+#print(categories)
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=6) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("tcell_category_Density_Wo-NeoAdj",i,".pdf")))
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=6)
+ # dev.off()
+}
+```
diff --git a/Analysis_tumour.Rmd b/Analysis_tumour.Rmd
new file mode 100644
index 0000000..126e770
--- /dev/null
+++ b/Analysis_tumour.Rmd
@@ -0,0 +1,2215 @@
+---
+title: "R Notebook - Analysis Tumour cells"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r  import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+
+
+```{r  Set wd and load data final vessel, message=F, warning=F, echo=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+
+#RAW
+#all.tumour <-readRDS(file=file.path(data_folder, "TUMOUR_CLINICAL-DATA_FILTERED.rds"))
+
+data_folder <-(file.path(wd,"sce_objects","Tumour"))
+
+#saveRDS(all.tumour, file=file.path(data_folder, "FINAL_All_Tumour_clustered.rds"))
+
+#workingfile
+all.tumour <- readRDS(file=file.path(data_folder, "FINAL_All_Tumour_clustered.rds"))
+
+all.tumour$DX.name[is.na(all.tumour$DX.name)]<-"NA"
+
+```
+
+Tumour marker
+```{r  Define final tumour markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.tumour)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+
+tumour.marker <-c("Carbonic Anhydrase IX","Pan Cytokeratin + Keratin Epithelial")
+print(tumour.marker)
+```
+
+```{r  load clinical data, echo=F, warnings=F, message=F}
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+clinical.data$TMA <-clinical.data$TMA.x
+clinical.data$TMA.x <-NULL
+clinical.data$TMA.y <-NULL
+clinical.data$X.1 <-NULL
+clinical.data$X <-NULL
+head(clinical.data)
+
+unique(clinical.data$DX.name)
+clinical.data$DX.name[clinical.data$Patient_ID=="Control"] <-"Control"
+table(clinical.data$DX.name)
+clinical.data$Patient_ID %>% unique() %>% length() #1071 PATIENTS IN TOTAL
+
+area <- read.csv(file=file.path(wd,"clinical_data", "area.csv"))
+area$X <- NULL
+area$Tma_ac <- area$TMA_ImageID
+area$TMA_ImageID <-NULL
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+```
+
+Table with Immune cell numbers including tumour removing undefined
+```{r  Table with tumour cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All tumour together
+tbl <- as.data.frame(table(all.tumour[,all.tumour$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Tumour cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+#print(tbl)
+
+#summary(tbl$`Tumour cell number overall`) #3-6259
+#tbl[tbl$`Tumour cell number overall` <=100,] #31 patients have less than 100 tumour=3% but 10% of median/ 89 patients have less than 300 tumour
+
+#Lowest 10% of all patients' Tumour cell numbers
+#tbl[order(tbl$`Tumour cell number overall`),]$`Tumour cell number overall`[1:100] #2-502
+
+
+#Highest 10% of all patients' Tumour cell numbers
+#tbl[order(-tbl$`Tumour cell number overall`),]$`Tumour cell number overall`[1:100] #5133-9247
+
+#tbl[tbl$`Tumour cell number overall` <=500,] #102 patients have less than 100 tumour=10%. 77 patients have less than 15 tumour
+
+all_Tumour_pat <- tbl[tbl$`Tumour cell number overall` <=50,]$`Patient ID`
+
+length(unique(all.tumour$Patient_ID))
+```
+
+Remove ROIs
+```{r  Table with tumour cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(all.tumour[,all.tumour$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per ROI overall including tumour-excluding undefined.csv")))
+#print(tbl)
+#summary(tbl$`Immune cell number overall`) #1-5075
+#tbl[tbl$`Immune cell number overall` <=200,] #93 images have less than 50 Immune cells (10% of median)
+#Lowest 10% of all patients' Immune cell numbers
+#tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1-195
+
+
+#Highest 10% of all patients' Immune cell numbers
+#tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1385-5075
+#per image cut at 50 Immune cells per image equals lowest 5% -> ensures that there's at least 50 Immune cells per patient
+
+all_Tumour_roi <- tbl[tbl$`Immune cell number overall` <=200,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+all.tumour_pat <- all.tumour[,all.tumour$Patient_ID!="Control"&
+                               !all.tumour$Patient_ID%in%all_Tumour_pat]
+length(unique(all.tumour_pat$Patient_ID)) #954
+
+#Roi removal
+all.tumour_roi <- all.tumour[,all.tumour$Patient_ID!="Control"&
+                               !all.tumour$RoiID%in%all_Tumour_roi]
+length(unique(all.tumour_roi$Patient_ID)) #987
+
+#Patient & Roi removal
+all.tumour_pat.roi <- all.tumour[,all.tumour$Patient_ID!="Control"&
+                               !all.tumour$Patient_ID%in%all_Tumour_pat&
+                               !all.tumour$RoiID%in%all_Tumour_roi]
+length(unique(all.tumour_pat.roi$Patient_ID)) #954
+
+```
+
+```{r  Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","Tumour"))
+
+all.tumour_pat.roi$DX.name[is.na(all.tumour_pat.roi$DX.name)]<-"NA"
+all.tumour_roi$DX.name[is.na(all.tumour_roi$DX.name)]<-"NA"
+all.tumour_pat$DX.name[is.na(all.tumour_pat$DX.name)]<-"NA"
+
+saveRDS(all.tumour_pat.roi,file=file.path(data_folder, paste("Tumour_sce_pat_roi_rem.rds",sep="")))
+saveRDS(all.tumour_roi,file=file.path(data_folder, paste("Tumour_sce_roi_rem.rds",sep="")))
+saveRDS(all.tumour_pat,file=file.path(data_folder, paste("Tumour_sce_pat_rem.rds",sep="")))
+
+all.tumour_pat.roi <-readRDS(file=file.path(data_folder, paste("Tumour_sce_pat_roi_rem.rds",sep="")))
+all.tumour_roi <-readRDS(file=file.path(data_folder, paste("Tumour_sce_roi_rem.rds",sep="")))
+all.tumour_pat <-readRDS(file=file.path(data_folder, paste("Tumour_sce_pat_rem.rds",sep="")))
+
+
+```
+
+
+##Analysis
+#Proportions tumour type
+
+## **Analysis**
+In the following, only tumours that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **tumour type**
+
+## **Proportions**
+Optimal number of patient metaclusters tumour type proportions
+```{r  optimal number of clusters for tumour type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( all.tumour_pat[, all.tumour_pat$Patient_ID!="Control"&
+                                           all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                           all.tumour_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.tumour_pat[, all.tumour_pat$Patient_ID!="Control"&
+                                           all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                           all.tumour_pat$DX.name=="Squamous cell carcinoma"]$tumour_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  tumour type proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( all.tumour_pat[, all.tumour_pat$Patient_ID!="Control"&
+                                           all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                           all.tumour_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.tumour_pat[, all.tumour_pat$Patient_ID!="Control"&
+                                           all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                           all.tumour_pat$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot tumour type proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tumour-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 4) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_tumour_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat tumour type proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 4))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_Tumour-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - tumour type
+T cell categories (normal and hypoxic) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data tumour type proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$normal_G <- ifelse(surv_dat$normal >summary(surv_dat$normal)[3],"normal high","normal low")
+surv_dat$hypoxic_G <- ifelse(surv_dat$hypoxic >summary(surv_dat$hypoxic)[3],"hypoxic high","hypoxic low")
+surv <-c("hypoxic_G","normal_G", "metacluster")
+```
+
+
+#CoxPH for tumour type proportions corrected for Stage, Grade and M
+```{r  Coxph tumour type proportions corrected, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   normal + hypoxic + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `normal+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival tumour type proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival tumour type proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("hypoxic_G","normal_G", "metacluster")
+
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$normal_G <- ifelse(dat$normal >summary(dat$normal)[3],"normal high","normal low")
+dat$hypoxic_G <- ifelse(dat$hypoxic >summary(dat$hypoxic)[3],"hypoxic high","hypoxic low")
+   
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (normal vs hypoxic) classified as high / low as well as the proportion (continuous).   
+```{r  coxph tumour type proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("normal_G","hypoxic_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("normal_G","hypoxic_G","metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tumour-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival tumour type proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[5]] <-"normal high"
+surv_dat$normal_G[surv_dat$normal <summary(surv_dat$normal)[2]] <-"normal low"
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[2]& surv_dat$normal<summary(surv_dat$normal)[5]] <-"normal medium"
+
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[5]] <-"hypoxic high"
+surv_dat$hypoxic_G[surv_dat$hypoxic <summary(surv_dat$hypoxic)[2]] <-"hypoxic low"
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[2]& surv_dat$hypoxic<summary(surv_dat$hypoxic)[5]] <-"hypoxic medium"
+
+surv <-c("hypoxic_G","normal_G", "metacluster")
+```
+
+## Survival tumour type proportions high mid low not split
+```{r  survival tumour type proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by Tumour Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival tumour type proportions high mid low split by tumour type
+```{r  survival tumour type proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by Tumour Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$normal_G[dat$normal >summary(dat$normal)[5]] <-"normal high"
+dat$normal_G[dat$normal <summary(dat$normal)[2]] <-"normal low"
+dat$normal_G[dat$normal >summary(dat$normal)[2]& dat$normal<summary(dat$normal)[5]] <-"normal medium"
+
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[5]] <-"hypoxic high"
+dat$hypoxic_G[dat$hypoxic <summary(dat$hypoxic)[2]] <-"hypoxic low"
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[2]& dat$hypoxic<summary(dat$hypoxic)[5]] <-"hypoxic medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and T cell categories (normal vs hypoxic) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph tumour type proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("normal_G","hypoxic_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("normal_G","hypoxic_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tumour-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of tumour type proportions with clinical parameters
+```{r  Correlation of tumour type proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+all.tumour_pat$NeoAdj <- ifelse(all.tumour_pat$Chemo==1 |all.tumour_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.tumour_pat[,all.tumour_pat$Patient_ID!="Control"&
+                                          all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           all.tumour_pat[,all.tumour_pat$Patient_ID!="Control"&
+                                          all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$normal,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for T cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (Tumour Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon tumour type ~ clinical data excluding neoadjuvant therapy
+```{r  KW W tumour type proportions WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.tumour_pat[,all.tumour_pat$Patient_ID!="Control"&
+                                          all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$tumour_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_Tumour-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r  KW W tumour type proportions with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.tumour_pat[,all.tumour_pat$Patient_ID!="Control"&
+                                          all.tumour_pat$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$tumour_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=2) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tumour-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **tumour type**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of tumour type densities patients metaclusters:
+```{r  tumour type density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.tumour_pat.roi$RoiID,
+                                "Area"=all.tumour_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.tumour_pat.roi$Patient_ID,
+                                "DX.name"=all.tumour_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities tumour type, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.tumour_pat.roi$RoiID,
+                                "Area"=all.tumour_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.tumour_pat.roi$Patient_ID,
+                                "DX.name"=all.tumour_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by tumour type together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot tumour type densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 4) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tumour-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r tumour type densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=4))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over tumour type densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat tumour type densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- surv_dat$normal+surv_dat$hypoxic
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$normal_G <- ifelse(surv_dat$normal >summary(surv_dat$normal)[3],"normal high","normal low")
+surv_dat$hypoxic_G <- ifelse(surv_dat$hypoxic >summary(surv_dat$hypoxic)[3],"hypoxic high","hypoxic low")
+```
+
+#CoxPH for tumour type density corrected for Stage, Grade and M
+```{r  Coxph tumour type density corrected, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   normal + hypoxic + Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `normal+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by tumour type (AC and SQC together)
+```{r  survival tumour type densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("hypoxic_G","normal_G","metacluster", "Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by tumour type (AC and SQC individually)
+```{r  survival tumour type densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$normal_G <- ifelse(dat$normal >summary(dat$normal)[3],"normal high","normal low")
+dat$hypoxic_G <- ifelse(dat$hypoxic >summary(dat$hypoxic)[3],"hypoxic high","hypoxic low")   
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (normal vs hypoxic) classified as high / low as well as the densities (continuous).   
+```{r  coxph tumour type densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G","normal_G","hypoxic_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("normal_G","hypoxic_G","Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tumour-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival tumour type densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat tumour type densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- surv_dat$normal+surv_dat$hypoxic
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[5]] <-"normal high"
+surv_dat$normal_G[surv_dat$normal <summary(surv_dat$normal)[2]] <-"normal low"
+surv_dat$normal_G[surv_dat$normal >summary(surv_dat$normal)[2]& surv_dat$normal<summary(surv_dat$normal)[5]] <-"normal medium"
+
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[5]] <-"hypoxic high"
+surv_dat$hypoxic_G[surv_dat$hypoxic <summary(surv_dat$hypoxic)[2]] <-"hypoxic low"
+surv_dat$hypoxic_G[surv_dat$hypoxic >summary(surv_dat$hypoxic)[2]& surv_dat$hypoxic<summary(surv_dat$hypoxic)[5]] <-"hypoxic medium"
+
+surv <-c("normal_G","hypoxic_G","Density_G","metacluster")
+```
+
+## tumour type densities split by tumour type
+```{r  survival tumour type densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("normal_G","hypoxic_G","Density_G","metacluster")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## tumour type densities split by tumour type
+```{r  survival tumour type densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("normal_G","hypoxic_G","Density_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$normal_G[dat$normal >summary(dat$normal)[5]] <-"normal high"
+dat$normal_G[dat$normal <summary(dat$normal)[2]] <-"normal low"
+dat$normal_G[dat$normal >summary(dat$normal)[2]& dat$normal<summary(dat$normal)[5]] <-"normal medium"
+
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[5]] <-"hypoxic high"
+dat$hypoxic_G[dat$hypoxic <summary(dat$hypoxic)[2]] <-"hypoxic low"
+dat$hypoxic_G[dat$hypoxic >summary(dat$hypoxic)[2]& dat$hypoxic<summary(dat$hypoxic)[5]] <-"hypoxic medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-tumour_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and T cell categories (normal vs hypoxic) classified as high mid low as well as the densities (continuous).   
+```{r coxph tumour type densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G","normal_G","hypoxic_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("normal_G","hypoxic_G","Density_G","metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_Tumour-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W tumour type density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.tumour_pat.roi$NeoAdj <- ifelse(all.tumour_pat.roi$Chemo==1 |all.tumour_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.tumour_pat.roi$RoiID,
+                                "Area"=all.tumour_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.tumour_pat.roi$Patient_ID,
+                                "DX.name"=all.tumour_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$normal + tdat_wide$hypoxic
+tdat <-tdat_wide %>% pivot_longer(cols=c("normal","hypoxic","total_density"), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_Tumour-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox tumour type density W neo,warning=F,message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.tumour_pat.roi$NeoAdj <- ifelse(all.tumour_pat.roi$Chemo==1 |all.tumour_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.tumour_pat.roi$RoiID,
+                                "Area"=all.tumour_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.tumour_pat.roi$Patient_ID,
+                                "DX.name"=all.tumour_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$normal + tdat_wide$hypoxic
+tdat <-tdat_wide %>% pivot_longer(cols=c("normal","hypoxic","total_density"), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_Tumour-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation tumour type densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+all.tumour_pat.roi$NeoAdj <- ifelse(all.tumour_pat.roi$Chemo==1 |all.tumour_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+all.tumour_pat.roi$NeoAdj <- ifelse(all.tumour_pat.roi$Chemo==1 |all.tumour_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.tumour_pat.roi[,all.tumour_pat.roi$Patient_ID!="Control"&
+                                          all.tumour_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.tumour_pat.roi$DX.name=="Squamous cell carcinoma"]$tumour_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.tumour_pat.roi$RoiID,
+                                "Area"=all.tumour_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.tumour_pat.roi$Patient_ID,
+                                "DX.name"=all.tumour_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$normal + tdat_wide$hypoxic
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$normal,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$hypoxic,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
diff --git a/Anaysis_vessel.Rmd b/Anaysis_vessel.Rmd
new file mode 100644
index 0000000..dcc7aac
--- /dev/null
+++ b/Anaysis_vessel.Rmd
@@ -0,0 +1,2240 @@
+---
+title: "R Notebook - Analysis vessel cells"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r  import libraries, echo=F, warnings=F, message=F}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+library(factoextra)
+library(cluster)
+library(dendextend)
+library(ggthemes)
+library(ggpubr)
+library(dplyr)
+#library(tidyverse)
+library(RColorBrewer)
+library(pals)
+library(qwraps2)
+library(table1)
+library(SingleCellExperiment)
+library(tidyr)
+library(scater)
+library(ggridges)
+library(viridis)
+library(viridisLite)
+library(ggplot2)
+library(data.table)
+library(CATALYST)
+library(gridExtra)
+library(Rphenograph)
+library(ComplexHeatmap)
+library(CATALYST)
+library(scales)
+library(survival)
+library(broom)
+library(pheatmap)
+library(FlowSOM)
+library(Seurat)
+library(Rphenoannoy)
+library(dplyr)
+library(data.table)
+library(ggthemes)
+library(diffcyt)
+library(edgeR)
+library(rstatix)
+library(dendextend)
+library(ggdendro)
+library(dendextend)
+library(FactoMineR)
+library(factoextra)
+library(survminer)
+library(corrplot)
+library(rstatix)
+library(graphics)
+library(cowplot)
+library(cluster)
+library(glmnet)
+library(fastDummies)
+library(ggsci)
+library(FactoMineR)
+library(factoextra)
+set.seed(101100)
+```
+
+
+```{r, Set wd and load data final vessel, message=F, warning=F, echo=F}
+#set working directory 
+wd <-dirname(getwd())
+
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+
+#RAW
+#all.vessel<- readRDS(file=file.path(data_folder, "VESSEL_CLINICAL-DATA_FILTERED.rds"))
+
+#workingfile
+data_folder <-(file.path(wd,"sce_objects","stroma","vessel"))
+all.vessel <-readRDS(file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+#all.vessel <-readRDS(file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+#saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+#saveRDS(all.vessel, file=file.path(data_folder, "FINAL_VESSEL_clinical-data_workingfile.rds"))
+
+all.vessel$DX.name[is.na(all.vessel$DX.name)]<-"NA"
+
+```
+
+vessel marker
+```{r  Define final vessel markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.vessel)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+
+vessel.marker <-c("Carbonic Anhydrase IX","Pan Cytokeratin + Keratin Epithelial")
+print(vessel.marker)
+```
+
+```{r  load clinical data, echo=F, warnings=F, message=F}
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+clinical.data$TMA <-clinical.data$TMA.x
+clinical.data$TMA.x <-NULL
+clinical.data$TMA.y <-NULL
+clinical.data$X.1 <-NULL
+clinical.data$X <-NULL
+head(clinical.data)
+
+unique(clinical.data$DX.name)
+clinical.data$DX.name[clinical.data$Patient_ID=="Control"] <-"Control"
+table(clinical.data$DX.name)
+clinical.data$Patient_ID %>% unique() %>% length() #1071 PATIENTS IN TOTAL
+
+area <- read.csv(file=file.path(wd,"clinical_data", "area.csv"))
+area$X <- NULL
+area$Tma_ac <- area$TMA_ImageID
+area$TMA_ImageID <-NULL
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+```
+
+Table with Immune cell numbers including vessel removing undefined
+```{r  Table with vessel cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All vessel together
+tbl <- as.data.frame(table(all.vessel[,all.vessel$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "vessel cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+#print(tbl)
+
+#summary(tbl$`vessel cell number overall`) #3-6259
+#tbl[tbl$`vessel cell number overall` <=100,] #31 patients have less than 100 vessel=3% but 10% of median/ 89 patients have less than 300 vessel
+
+#Lowest 10% of all patients' vessel cell numbers
+#tbl[order(tbl$`vessel cell number overall`),]$`vessel cell number overall`[1:100] #2-502
+
+
+#Highest 10% of all patients' vessel cell numbers
+#tbl[order(-tbl$`vessel cell number overall`),]$`vessel cell number overall`[1:100] #5133-9247
+
+#tbl[tbl$`vessel cell number overall` <=500,] #102 patients have less than 100 vessel=10%. 77 patients have less than 15 vessel
+
+all_vessel_pat <- tbl[tbl$`vessel cell number overall` <=15,]$`Patient ID`
+
+length(unique(all.vessel$Patient_ID))
+```
+
+Remove ROIs
+```{r  Table with vessel cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(all.vessel[,all.vessel$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Immune cell number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per ROI overall including vessel-excluding undefined.csv")))
+#print(tbl)
+#summary(tbl$`Immune cell number overall`) #1-5075
+#tbl[tbl$`Immune cell number overall` <=200,] #93 images have less than 50 Immune cells (10% of median)
+#Lowest 10% of all patients' Immune cell numbers
+#tbl[order(tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1-195
+
+
+#Highest 10% of all patients' Immune cell numbers
+#tbl[order(-tbl$`Immune cell number overall`),]$`Immune cell number overall`[1:200] #1385-5075
+#per image cut at 50 Immune cells per image equals lowest 5% -> ensures that there's at least 50 Immune cells per patient
+
+all_vessel_roi <- tbl[tbl$`Immune cell number overall` <=9,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+all.vessel_pat <- all.vessel[,all.vessel$Patient_ID!="Control"&
+                               !all.vessel$Patient_ID%in%all_vessel_pat]
+length(unique(all.vessel_pat$Patient_ID)) #954
+
+#Roi removal
+all.vessel_roi <- all.vessel[,all.vessel$Patient_ID!="Control"&
+                               !all.vessel$RoiID%in%all_vessel_roi]
+length(unique(all.vessel_roi$Patient_ID)) #987
+
+#Patient & Roi removal
+all.vessel_pat.roi <- all.vessel[,all.vessel$Patient_ID!="Control"&
+                               !all.vessel$Patient_ID%in%all_vessel_pat&
+                               !all.vessel$RoiID%in%all_vessel_roi]
+length(unique(all.vessel_pat.roi$Patient_ID)) #954
+
+```
+
+```{r  Save SCE patient and roi removed, include=FALSE, echo=FALSE,warning=FALSE, message=FALSE, eval=FALSE}
+data_folder <-(file.path(wd,"sce_objects","stroma","vessel"))
+
+all.vessel_pat.roi$DX.name[is.na(all.vessel_pat.roi$DX.name)]<-"NA"
+all.vessel_roi$DX.name[is.na(all.vessel_roi$DX.name)]<-"NA"
+all.vessel_pat$DX.name[is.na(all.vessel_pat$DX.name)]<-"NA"
+
+saveRDS(all.vessel_pat.roi,file=file.path(data_folder, paste("vessel_sce_pat_roi_rem.rds",sep="")))
+saveRDS(all.vessel_roi,file=file.path(data_folder, paste("vessel_sce_roi_rem.rds",sep="")))
+saveRDS(all.vessel_pat,file=file.path(data_folder, paste("vessel_sce_pat_rem.rds",sep="")))
+
+all.vessel_pat.roi <-readRDS(file=file.path(data_folder, paste("vessel_sce_pat_roi_rem.rds",sep="")))
+all.vessel_roi <-readRDS(file=file.path(data_folder, paste("vessel_sce_roi_rem.rds",sep="")))
+all.vessel_pat <-readRDS(file=file.path(data_folder, paste("vessel_sce_pat_rem.rds",sep="")))
+
+
+```
+
+
+##Analysis
+#Proportions vessel type
+
+## **Analysis**
+In the following, only vessels that were identified as adenocarcinomas or squamous cell carcinomas by pathologists will be used.
+
+## **vessel type**
+
+## **Proportions**
+Optimal number of patient metaclusters vessel type proportions
+```{r  optimal number of clusters for vessel type proportion clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Proportions
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+tdat <-as.data.frame(table( all.vessel_pat[, all.vessel_pat$Patient_ID!="Control"&
+                                           all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                           all.vessel_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.vessel_pat[, all.vessel_pat$Patient_ID!="Control"&
+                                           all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                           all.vessel_pat$DX.name=="Squamous cell carcinoma"]$vessel_type))
+
+
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+#tdat_m <-tdat_m[,1:2]
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+sbg <-cutree(hc, k=2)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=15)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=15)
+
+#install.packages("cluster")
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+#res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  vessel type proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( all.vessel_pat[, all.vessel_pat$Patient_ID!="Control"&
+                                           all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                           all.vessel_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.vessel_pat[, all.vessel_pat$Patient_ID!="Control"&
+                                           all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                           all.vessel_pat$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot vessel type proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()
+#plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_vessel-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 4) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_vessel_type_proportions.pdf")), width=6, height=6)
+```
+
+```{r  merge hc metacluster into t_dat vessel type proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k= 4))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+
+p <-ggplot(tdat_ct,aes(y=freq,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+#ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_split_metacluster_vessel-category_proportions.pdf")))
+plot(p)
+```
+
+
+## **Survival analysis** - vessel type
+T cell categories (Blood and Lymphatic) are grouped into "high" and "low" using the proportion's median as a cut-off.    
+- high > median  
+- low < median  
+Patients who received neoadjuvant treatment are excluded from survival analysis.  
+```{r  survival data vessel type proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Blood_G <- ifelse(surv_dat$Blood >summary(surv_dat$Blood)[3],"Blood high","Blood low")
+surv_dat$Lymphatic_G <- ifelse(surv_dat$Lymphatic >summary(surv_dat$Lymphatic)[3],"Lymphatic high","Lymphatic low")
+surv_dat$HEV_G <- ifelse(surv_dat$HEV >summary(surv_dat$HEV)[3],"HEV high","HEV low")
+
+surv <-c("Lymphatic_G","Blood_G","HEV_G", "metacluster")
+```
+
+
+#CoxPH for vessel type proportions corrected for Stage, Grade and M
+```{r  Coxph vessel type proportions corrected, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Blood + Lymphatic +HEV+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `Blood+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`Lymphatic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by vessel type (AC and SQC together)
+```{r  survival vessel type proportions not split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+
+
+#not split by vessel Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  facet_wrap(~strata)+theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Split by vessel type (AC and SQC individually)
+```{r  survival vessel type proportions split high low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#OS survival
+surv <-c("Lymphatic_G","Blood_G","HEV_G", "metacluster")
+
+#split by vessel Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Blood_G <- ifelse(dat$Blood >summary(dat$Blood)[3],"Blood high","Blood low")
+dat$Lymphatic_G <- ifelse(dat$Lymphatic >summary(dat$Lymphatic)[3],"Lymphatic high","Lymphatic low")
+   
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+   #abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (Blood vs Lymphatic) classified as high / low as well as the proportion (continuous).   
+```{r  coxph vessel type proportions, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Blood_G","Lymphatic_G","HEV_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("Blood_G","Lymphatic_G","HEV_G","metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_vessel-Category_Proportions_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Cell groups categorised into "high", "mid", "low".  
+- high > 3rd quantile  
+- mid >1st quantile and < 3rd quantile  
+- low < 1st quantile  
+```{r  survival vessel type proportions classified as high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat$Blood_G[surv_dat$Blood >summary(surv_dat$Blood)[5]] <-"Blood high"
+surv_dat$Blood_G[surv_dat$Blood <summary(surv_dat$Blood)[2]] <-"Blood low"
+surv_dat$Blood_G[surv_dat$Blood >summary(surv_dat$Blood)[2]& surv_dat$Blood<summary(surv_dat$Blood)[5]] <-"Blood medium"
+
+surv_dat$Lymphatic_G[surv_dat$Lymphatic >summary(surv_dat$Lymphatic)[5]] <-"Lymphatic high"
+surv_dat$Lymphatic_G[surv_dat$Lymphatic <summary(surv_dat$Lymphatic)[2]] <-"Lymphatic low"
+surv_dat$Lymphatic_G[surv_dat$Lymphatic >summary(surv_dat$Lymphatic)[2]& surv_dat$Lymphatic<summary(surv_dat$Lymphatic)[5]] <-"Lymphatic medium"
+
+surv_dat$HEV_G[surv_dat$HEV >summary(surv_dat$HEV)[5]] <-"HEV high"
+surv_dat$HEV_G[surv_dat$HEV <summary(surv_dat$HEV)[2]] <-"HEV low"
+surv_dat$HEV_G[surv_dat$HEV >summary(surv_dat$HEV)[2]& surv_dat$HEV<summary(surv_dat$HEV)[5]] <-"HEV medium"
+
+surv <-c("Lymphatic_G","Blood_G","HEV_G", "metacluster")
+```
+
+## Survival vessel type proportions high mid low not split
+```{r  survival vessel type proportions not split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#not split by vessel Type (AC/SQC)
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+  # symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+```
+
+## Survival vessel type proportions high mid low split by vessel type
+```{r  survival vessel type proportions split high mid low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#split by vessel Type (AC/SQC)
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Blood_G[dat$Blood >summary(dat$Blood)[5]] <-"Blood high"
+dat$Blood_G[dat$Blood <summary(dat$Blood)[2]] <-"Blood low"
+dat$Blood_G[dat$Blood >summary(dat$Blood)[2]& dat$Blood<summary(dat$Blood)[5]] <-"Blood medium"
+
+dat$Lymphatic_G[dat$Lymphatic >summary(dat$Lymphatic)[5]] <-"Lymphatic high"
+dat$Lymphatic_G[dat$Lymphatic <summary(dat$Lymphatic)[2]] <-"Lymphatic low"
+dat$Lymphatic_G[dat$Lymphatic >summary(dat$Lymphatic)[2]& dat$Lymphatic<summary(dat$Lymphatic)[5]] <-"Lymphatic medium"
+  
+dat$HEV_G[dat$HEV >summary(dat$HEV)[5]] <-"HEV high"
+dat$HEV_G[dat$HEV <summary(dat$HEV)[2]] <-"HEV low"
+dat$HEV_G[dat$HEV >summary(dat$HEV)[2]& dat$HEV<summary(dat$HEV)[5]] <-"HEV medium"
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = surv_dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+ #  symbols = c("****", "***", "**", "*", "+", "."),
+  # abbr.colnames = FALSE, na = " "))
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)   
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,2000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+including clinical parameters, patient metacluster and T cell categories (Blood vs Lymphatic) classified as high mid low as well as the proportion (continuous).   
+```{r  coxph vessel type proportions high mid low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Blood_G","Lymphatic_G","HEV_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("Blood_G","Lymphatic_G","HEV_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_vessel-Category_Proportions_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+#Correlation of vessel type proportions with clinical parameters
+```{r  Correlation of vessel type proportions and clinical parameters, message=FALSE, warning=FALSE, echo=FALSE}
+all.vessel_pat$NeoAdj <- ifelse(all.vessel_pat$Chemo==1 |all.vessel_pat$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.vessel_pat[,all.vessel_pat$Patient_ID!="Control"&
+                                          all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                           all.vessel_pat[,all.vessel_pat$Patient_ID!="Control"&
+                                          all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+c.param <- c("Gender","T.new","N","M.new","Dist.Met","LN.Met","Stage")
+for(i in c.param){
+  print(i)
+print(cor.test(tdat_wide_as$Blood,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$Lymphatic,tdat_wide_as[[i]], method="spearman", exact=F))
+print(cor.test(tdat_wide_as$HEV,tdat_wide_as[[i]], method="spearman", exact=F))
+  }
+```
+
+
+In group comparison (Wilcoxon for 2 groups or KW for >2 groups) for T cell categories over clinical variables:  
+  - T.new (TNM)
+  - N (TNM)
+  - M.new (TNM)
+  - Relapse
+  - Grade
+  - Stage
+  - Gender
+  - Typ (vessel Type)
+  - Dist.Met (Distant metastases yes/no)
+  - LN.Met (Lymph node metastases yes/no)
+  
+## Kruskal-Wallis / Wilcoxon vessel type ~ clinical data excluding neoadjuvant therapy
+```{r  KW W vessel type proportions WO neo, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.vessel_pat[,all.vessel_pat$Patient_ID!="Control"&
+                                          all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat$DX.name=="Squamous cell carcinoma"]))
+
+#remove neoadj patients
+df <-df[!df$Patient_ID%in% neoadj_pat$Patient_ID,]
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender") 
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$vessel_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+  #pdf(file=file.path(plot_folder, paste0("KW_W_vessel-Category_Proportions_WOneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+## Neoadjuvantly treated patients are included, the same categories are used plus:  
+- NeoAdj (Neoadjuvant therapy yes/no)
+- Chemo (Neoadjuvant therapy = chemotherapy)
+- Radio (Neoadjuvant therapy = radiotherapy)
+```{r  KW W vessel type proportions with NEO, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+df <- as.data.frame(colData(all.vessel_pat[,all.vessel_pat$Patient_ID!="Control"&
+                                          all.vessel_pat$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat$DX.name=="Squamous cell carcinoma"]))
+
+categories <- c("Dist.Met","LN.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+
+plot_list <- list()
+
+for (i in (categories)) {
+
+    t <- table(df$vessel_type, df$Patient_ID)
+    t <- prop.table(t, margin=2)
+    t <- as.data.frame(t)
+    colnames(t) <- c("Phenotype", "Patient_ID","freq")
+    t<-merge(t, clinical.data, by.x="Patient_ID", by.y="Patient_ID")
+    tdat <-t
+    tdat$i <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+  
+  for (j in unique(t$Phenotype)){
+    tdat<-t
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(freq~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+      geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=4, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Proportion")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)
+    }
+    
+    else {
+if(is.element(i,colnames(clinical.data)) == TRUE){
+    
+      tdat<-t
+      tdat[[i]] <-as.factor(tdat[[i]])
+      tdat <-tdat %>% drop_na(i)
+      tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+      tdat <-subset(tdat, Phenotype==j)
+      detach("package:dplyr", unload=TRUE)
+      library(dplyr)
+      
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$freq~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = freq, colour=.data[[i]]))+
+        geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=4, strip.position="top")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(x=paste(i), y="Proportion", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+   }  
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_vessel-Category_Proportions_Wneo",i,".pdf")), width=10, height=5)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=length(unique(t$Phenotype)))
+ # dev.off()
+}
+```
+
+
+
+## **vessel type**
+## **Densities**
+Generally, densities are first calculated per tissue area. For patients with more than 1 image, the average density is calculated subsequently.  
+Optimal number of vessel type densities patients metaclusters:
+```{r  vessel type density optimal number of clusters, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=4}
+#Calculate Densities
+tdat <-as.data.frame(table(all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.vessel_pat.roi$RoiID,
+                                "Area"=all.vessel_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.vessel_pat.roi$Patient_ID,
+                                "DX.name"=all.vessel_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+#tdat_wide <-tdat_wide[!tdat_wide$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+
+#devtools::install_github("kassambara/factoextra")
+library(FactoMineR)
+library(factoextra)
+sbg <-cutree(hc, k=8)
+fviz_cluster(list(data = tdat_wide[,-1], cluster = sbg))
+
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "wss", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "gap_stat", k.max=20)
+fviz_nbclust(tdat_wide[,-1], FUN = hcut, method = "silhouette", k.max=20)
+
+#install.packages("cluster")
+library(cluster)
+gap_stat <- clusGap(tdat_wide[,-1], FUN = hcut, nstart = 25, K.max = 10, B = 100)
+fviz_gap_stat(gap_stat)
+
+# Compute distance matrix
+res.dist <- dist(tdat_wide[,-1], method = "euclidean")
+```
+
+```{r  Calculate densities vessel type, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table(all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.vessel_pat.roi$RoiID,
+                                "Area"=all.vessel_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.vessel_pat.roi$Patient_ID,
+                                "DX.name"=all.vessel_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+  
+  
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+```
+
+## Barplot showing absolute density per patietn coloured by vessel type together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot vessel type densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+#barplot total density
+p <-ggplot(tdat,aes(y=sqrt(density),x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 4) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+#ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_vessel-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+```
+Add patient metacluster to data and plot clusters separately for better identification
+```{r vessel type densities merge hc, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+
+#cut dendrogram, add metacluster info to tdat
+ct <-as.data.frame(cutree(hc, k=4))
+
+colnames(ct)<-"metacluster"
+ct$metacluster <-as.factor(ct$metacluster)
+ct$Patient_ID <- rownames(ct)
+
+tdat_wide_ct <-left_join(tdat_wide, ct, by="Patient_ID")
+tdat_wide_ct <- left_join(tdat_wide_ct, clinical.data, by="Patient_ID")
+table(tdat_wide_ct$metacluster)
+
+#split barlot by metacluster
+tdat_ct <-merge(tdat, ct, by="Patient_ID")
+ggplot(tdat_ct,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau()+
+    theme(legend.position="bottom")+
+  facet_grid(~ metacluster, scales = "free", space = "free")
+```
+
+## Survival analysis over vessel type densities high low.  
+- High > median
+- Low < median
+```{r  surv_dat vessel type densities high low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- surv_dat$Blood+surv_dat$Lymphatic+surv_dat$HEV
+
+#high low
+surv_dat$Density_G <- ifelse(surv_dat$total_density >summary(surv_dat$total_density)[3],"Density high","Density low")
+surv_dat$Blood_G <- ifelse(surv_dat$Blood >summary(surv_dat$Blood)[3],"Blood high","Blood low")
+surv_dat$Lymphatic_G <- ifelse(surv_dat$Lymphatic >summary(surv_dat$Lymphatic)[3],"Lymphatic high","Lymphatic low")
+surv_dat$HEV_G <- ifelse(surv_dat$HEV >summary(surv_dat$HEV)[3],"HEV high","HEV low")
+
+```
+
+#CoxPH for vessel type density corrected for Stage, Grade and M
+```{r  Coxph vessel type density corrected, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   Blood + Lymphatic +HEV+ Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `Blood+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`Lymphatic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+
+p <-ggplot(td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+#
+plot(p)
+```
+
+## Not split by vessel type (AC and SQC together)
+```{r  survival vessel type densities high low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+#surv <-c("metacluster")
+surv <-c("Lymphatic_G","Blood_G","HEV_G","metacluster", "Density_G")
+library(survminer)
+
+for (i in surv){
+  #for (k in unique(surv_dat$DX.name)){
+    #dat <-surv_dat[surv_dat$DX.name==k,]
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+#km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## Split by vessel type (AC and SQC individually)
+```{r  survival vessel type densities high low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Split
+library(survminer)
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G <- ifelse(dat$total_density >summary(dat$total_density)[3],"Density high","Density low")
+dat$Blood_G <- ifelse(dat$Blood >summary(dat$Blood)[3],"Blood high","Blood low")
+dat$Lymphatic_G <- ifelse(dat$Lymphatic >summary(dat$Lymphatic)[3],"Lymphatic high","Lymphatic low")   
+dat$HEV_G <- ifelse(dat$HEV >summary(dat$HEV)[3],"HEV high","HEV low")   
+
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and T cell categories (Blood vs Lymphatic) classified as high / low as well as the densities (continuous).   
+```{r  coxph vessel type densities high-low, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=6, fig.width=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G","Blood_G","Lymphatic_G","HEV_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("Blood_G","Lymphatic_G","HEV_G","Density_G","metacluster",colnames(clinical.data),contains("censor"))) #)),"AG" #,NT,"Area","TMA_ImageID"
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_vessel-Category_Density_high-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Survival vessel type densities high-mid-low     
+- high    > 3rd quantile   
+- mid     > 1st quantile < 3rd quantile   
+- low     < 1st quantile   
+```{r  surv_dat vessel type densities high mid low, message=FALSE, warning=FALSE, echo=FALSE}
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+surv_dat$total_density <- surv_dat$Blood+surv_dat$Lymphatic+surv_dat$HEV
+
+#High Med Low
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[5]] <-"Density high"
+surv_dat$Density_G[surv_dat$total_density <summary(surv_dat$total_density)[2]] <-"Density low"
+surv_dat$Density_G[surv_dat$total_density >summary(surv_dat$total_density)[2]& surv_dat$total_density<summary(surv_dat$total_density)[5]] <-"Density medium"
+
+surv_dat$Blood_G[surv_dat$Blood >summary(surv_dat$Blood)[5]] <-"Blood high"
+surv_dat$Blood_G[surv_dat$Blood <summary(surv_dat$Blood)[2]] <-"Blood low"
+surv_dat$Blood_G[surv_dat$Blood >summary(surv_dat$Blood)[2]& surv_dat$Blood<summary(surv_dat$Blood)[5]] <-"Blood medium"
+
+surv_dat$Lymphatic_G[surv_dat$Lymphatic >summary(surv_dat$Lymphatic)[5]] <-"Lymphatic high"
+surv_dat$Lymphatic_G[surv_dat$Lymphatic <summary(surv_dat$Lymphatic)[2]] <-"Lymphatic low"
+surv_dat$Lymphatic_G[surv_dat$Lymphatic >summary(surv_dat$Lymphatic)[2]& surv_dat$Lymphatic<summary(surv_dat$Lymphatic)[5]] <-"Lymphatic medium"
+
+surv_dat$HEV_G[surv_dat$HEV >summary(surv_dat$HEV)[5]] <-"HEV high"
+surv_dat$HEV_G[surv_dat$HEV <summary(surv_dat$HEV)[2]] <-"HEV low"
+surv_dat$HEV_G[surv_dat$HEV >summary(surv_dat$HEV)[2]& surv_dat$HEV<summary(surv_dat$HEV)[5]] <-"HEV medium"
+
+surv <-c("Blood_G","Lymphatic_G","HEV_G","Density_G","metacluster")
+```
+
+## vessel type densities split by vessel type
+```{r  survival vessel type densities high-mid-low not split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+#Not split
+surv <-c("Blood_G","Lymphatic_G","HEV_G","Density_G","metacluster")
+library(survminer)
+
+for (i in surv){
+  dat <-surv_dat
+  k="AC & SCC together"
+#OS
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+#print(p)
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+
+#disease free survival
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+ #  abbr.colnames = FALSE, na = " "))
+km_trt_fit
+}
+```
+
+## vessel type densities split by vessel type
+```{r  survival vessel type densities high-mid-low split, message=FALSE, warning=FALSE, echo=FALSE,out.width="50%", fig.height=4, fig.width=6}
+surv <-c("Blood_G","Lymphatic_G","HEV_G","Density_G","metacluster")
+
+for (i in surv){
+  for (k in unique(surv_dat$DX.name)){
+    dat <-surv_dat[surv_dat$DX.name==k,]
+dat$Density_G[dat$total_density >summary(dat$total_density)[5]] <-"Density high"
+dat$Density_G[dat$total_density <summary(dat$total_density)[2]] <-"Density low"
+dat$Density_G[dat$total_density >summary(dat$total_density)[2]& dat$total_density<summary(dat$total_density)[5]] <-"Density medium"
+
+dat$Blood_G[dat$Blood >summary(dat$Blood)[5]] <-"Blood high"
+dat$Blood_G[dat$Blood <summary(dat$Blood)[2]] <-"Blood low"
+dat$Blood_G[dat$Blood >summary(dat$Blood)[2]& dat$Blood<summary(dat$Blood)[5]] <-"Blood medium"
+
+dat$Lymphatic_G[dat$Lymphatic >summary(dat$Lymphatic)[5]] <-"Lymphatic high"
+dat$Lymphatic_G[dat$Lymphatic <summary(dat$Lymphatic)[2]] <-"Lymphatic low"
+dat$Lymphatic_G[dat$Lymphatic >summary(dat$Lymphatic)[2]& dat$Lymphatic<summary(dat$Lymphatic)[5]] <-"Lymphatic medium"
+  
+dat$HEV_G[dat$HEV >summary(dat$HEV)[5]] <-"HEV high"
+dat$HEV_G[dat$HEV <summary(dat$HEV)[2]] <-"HEV low"
+dat$HEV_G[dat$HEV >summary(dat$HEV)[2]& dat$HEV<summary(dat$HEV)[5]] <-"HEV medium"
+  
+#OS
+pw<-pairwise_survdiff(Surv(OS, censoringOS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(OS, censoringOS) ~ dat[[i]], data=dat)  
+km_trt_fit
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = F,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("OS over",i,"per",k))
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-med-low_OS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#km_trt_fit
+#print(p)
+
+#disease free survival
+pw<-pairwise_survdiff(Surv(DFS, censoringDFS) ~ metacluster,data = dat)
+#print(pw)
+#print(symnum(pw$p.value, cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
+#   symbols = c("****", "***", "**", "*", "+", "."),
+#   abbr.colnames = FALSE, na = " "))
+
+km_trt_fit <- survfit(Surv(DFS, censoringDFS) ~ dat[[i]], data=dat)      
+km_trt_fit
+
+p <-ggsurvplot(km_trt_fit,
+               data=dat,
+               conf.int = T,
+               pval = T,
+               #conf.int.style = "step",
+               surv.median.line = "hv",
+               xlim=c(0,5000))+ # Specify median survival,
+               #legend.title = paste(i),
+               #legend.labs=c("Grade 1","Grade 2","Grade 3"),
+               #legend.labs=c(paste(i,sort(unique(surv_dat[[i]][!is.na(surv_dat[[i]])])))))+
+  labs(title=paste("DFS over",i,"per",k))#+
+ # facet_grid(.~surv_dat[[i]])
+print(p)
+
+#pdf(file=file.path(plot_folder, paste0("Survival_Density-vessel_type_hi-med-low_DFS over",i,"_per_",k,".pdf")))
+#print(p,newpage=FALSE)
+#dev.off()
+
+p <-p$plot+
+  theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  #facet_wrap(~strata)+
+  theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank())
+#print(p)
+#km_trt_fit
+  }
+}
+```
+
+
+
+## Lasso-regressed cox-ph model   
+including clinical parameters, patient metacluster and T cell categories (Blood vs Lymphatic) classified as high mid low as well as the densities (continuous).   
+```{r coxph vessel type densities high-mid-low, message=FALSE, warning=FALSE, echo=FALSE, out.width="50%", fig.width=6, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c("Density_G","Blood_G","Lymphatic_G","HEV_G","Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c("Blood_G","Lymphatic_G","HEV_G","Density_G","metacluster",colnames(clinical.data),contains("censor")))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()
+p
+#ggsave(file=file.path(plot_folder, paste("CoxPH_vessel-Category_Density_high-mid-low_Lasso.pdf")), width=10, height=10, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables without neoadjuvant treated patients.
+```{r  KW W vessel type density WO neo,warning=FALSE, message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.vessel_pat.roi$NeoAdj <- ifelse(all.vessel_pat.roi$Chemo==1 |all.vessel_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.vessel_pat.roi$RoiID,
+                                "Area"=all.vessel_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.vessel_pat.roi$Patient_ID,
+                                "DX.name"=all.vessel_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$Blood + tdat_wide$Lymphatic + tdat_wide$HEV
+tdat <-tdat_wide %>% pivot_longer(cols=c("Blood","Lymphatic","total_density"), names_to = "Phenotype",values_to = "density")
+
+#remove
+tdat <-tdat[!tdat$Patient_ID%in% neoadj_pat$Patient_ID,]
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+ # pdf(file=file.path(plot_folder, paste0("KW_W_vessel-Category_Densities_WOneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+## Kruskal Wallis / Wilcoxon comparison between groups of clincial variables with neoadjuvant treated patients.
+```{r  KW Wilcox vessel type density W neo,warning=F,message=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#define variables to be tested here
+categories <- c( "Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender")
+
+all.vessel_pat.roi$NeoAdj <- ifelse(all.vessel_pat.roi$Chemo==1 |all.vessel_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.vessel_pat.roi$RoiID,
+                                "Area"=all.vessel_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.vessel_pat.roi$Patient_ID,
+                                "DX.name"=all.vessel_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$Blood + tdat_wide$Lymphatic + tdat_wide$HEV
+tdat <-tdat_wide %>% pivot_longer(cols=c("Blood","Lymphatic","total_density"), names_to = "Phenotype",values_to = "density")
+
+tdat_c <-merge(tdat, clinical.data, by="Patient_ID")
+categories <- c("Dist.Met", "T.new","M.new","N","Relapse","DX.name","Stage","Grade","Gender","Chemo","Radio","NeoAdj")
+plot_list <- list()
+for (i in (categories)) {
+  for (j in unique(tdat_c$Phenotype)){
+    tdat<-tdat_c
+    tdat[[i]] <-as.factor(tdat[[i]])
+    tdat <-tdat %>% drop_na(i)
+    tdat$i <-as.factor(tdat[[i]])
+    tdat$Phenotype <-as.factor(tdat$Phenotype)
+    tdat$Phenotype <-droplevels(tdat$Phenotype)
+
+    tdat <-subset(tdat, Phenotype==j)
+
+    if (length(unique(tdat[[i]])) > 2)
+      {
+    #calculate p values, add position for plot
+    stat.test <- tdat%>%
+     group_by(Phenotype) %>%
+     dunn_test(density~i,p.adjust.method = "bonferroni")
+    stat.test<-stat.test %>% select(-.y., -statistic)
+    stat.test <- stat.test %>% add_xy_position(x = i)
+   
+    #create plot list
+    plot_list[[j]]<-
+      ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+              geom_violin(width=0.5) +
+    geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+      #geom_boxplot()+
+      geom_point()+
+      facet_wrap(~Phenotype, scales="free", ncol=3, strip.position="top")+
+      scale_color_viridis_d()+
+      theme_bw()+
+      theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+      labs( y="Mean patient density")+
+      theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")+
+      stat_pvalue_manual(stat.test, label = "p.adj.signif", tip.length = 0, hide.ns = T)+
+      theme(legend.position = "none")
+    }
+    
+     else{
+      
+        tdat<-tdat_c
+        tdat[[i]] <-as.factor(tdat[[i]])
+        tdat <-tdat %>% drop_na(i)
+        tdat$Phenotype <-factor(tdat$Phenotype)
+        tdat$Phenotype <-droplevels(tdat$Phenotype)
+        tdat <-tdat %>% drop_na(i)
+
+        tdat <-subset(tdat, Phenotype==j)
+          detach("package:dplyr", unload=TRUE)
+          library(dplyr)
+      pvalues <- tdat %>% 
+      group_by(Phenotype) %>% 
+      summarise(p=wilcox.test(tdat$density~tdat[[i]], paired=F)$p.value)
+      tdat <- merge(tdat, pvalues, by.x = "Phenotype", by.y ="Phenotype", all.x = TRUE)
+      tdat$p.wt <- paste0('p=',round(tdat$p, digits=3))
+    
+      #Plot list, all js together over i
+      plot_list[[j]]<-
+        ggplot(tdat, aes(x= .data[[i]], y = density, colour=.data[[i]]))+
+        geom_violin(width=0.5) +
+        geom_boxplot(width=0.3, color="grey", alpha=0.2)+
+        #geom_boxplot()+
+        geom_point()+
+        facet_wrap(~Phenotype+p.wt, scales="free", ncol=3, strip.position="top")+
+        #ggtitle("paired")+
+        theme(axis.title.x=element_text("Phenotype"))+
+      
+        scale_color_viridis_d()+
+        theme_bw()+
+        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+        labs(y="Mean patient density", fill="Area_Phenotype")+
+        theme(axis.text.x = element_text(angle=45, hjust=1), axis.ticks.x = element_blank(),legend.position = "none")
+    }  
+ 
+    }
+
+  #plot
+  gridExtra::grid.arrange(grobs = plot_list, ncol=3) #, ncol=round(length(unique(t$Phenotype)))
+  
+  #save out in individual pdfs for each variable
+#  pdf(file=file.path(plot_folder, paste0("KW_W_vessel-Category_Densities_Wneo",i,".pdf")), width=12, height=6)
+ # gridExtra::grid.arrange(grobs = plot_list, ncol=3)
+#  dev.off()
+}
+```
+
+```{r  Correlation vessel type densities with clinical parameters, eval=F, echo=T, message=FALSE, warning=FALSE, echo=FALSE}
+all.vessel_pat.roi$NeoAdj <- ifelse(all.vessel_pat.roi$Chemo==1 |all.vessel_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+all.vessel_pat.roi$NeoAdj <- ifelse(all.vessel_pat.roi$Chemo==1 |all.vessel_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.vessel_pat.roi[,all.vessel_pat.roi$Patient_ID!="Control"&
+                                          all.vessel_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.vessel_pat.roi$DX.name=="Squamous cell carcinoma"]$vessel_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.vessel_pat.roi$RoiID,
+                                "Area"=all.vessel_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.vessel_pat.roi$Patient_ID,
+                                "DX.name"=all.vessel_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Sample) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+tdat_wide$total_density <- tdat_wide$Blood + tdat_wide$Lymphatic+tdat_wide$HEV
+
+tdat_wide <-merge(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_as <-tdat_wide[tdat_wide$DX.name=="Adenocarcinoma" | tdat_wide$DX.name=="Squamous cell carcinoma",]
+tdat_wide_as$LN.Met <-ifelse(tdat_wide_as$LN.Met=="LN Metastases", 1,0)
+tdat_wide_as$Dist.Met <-ifelse(tdat_wide_as$Dist.Met=="Dist. Metastases", 1,0)
+
+for(i in c.param){
+  print(i)
+  print(cor.test(tdat_wide_as$Blood,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$Lymphatic,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$HEV,tdat_wide_as[[i]], method="spearman", exact=F))
+  print(cor.test(tdat_wide_as$total_density,tdat_wide_as[[i]], method="spearman", exact=F))
+}
+```
diff --git a/Cytomapper_Sanity_Checks.Rmd b/Cytomapper_Sanity_Checks.Rmd
new file mode 100644
index 0000000..324b33b
--- /dev/null
+++ b/Cytomapper_Sanity_Checks.Rmd
@@ -0,0 +1,255 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+```{r}
+library(cytomapper)
+data(pancreasSCE)
+data(pancreasImages)
+data(pancreasMasks)
+
+plotPixels(image = pancreasImages, colour_by = c("H3", "CD99", "CDH"))
+
+plotCells(mask = pancreasMasks, object = pancreasSCE,
+            cell_id = "CellNb", img_id = "ImageNb", colour_by = "CD99",
+            outline_by = "CellType")
+
+plotCells(mask = pancreasMasks, object = pancreasSCE,
+            cell_id = "CellNb", img_id = "ImageNb", 
+            colour_by = "CellType")
+```
+
+```{r, fig.width=50, fig.height=50}
+wd <- dirname(getwd())
+wd <-"/mnt/lena_processed2/NSCLC_results"
+path.to.images <-file.path(wd,"cytomapper")
+
+all_masks <- loadImages(path.to.images, pattern = "_mask.tiff")
+all_masks <- loadImages(path.to.images, pattern = "2020115_LC_NSCLC_TMA_86_") 
+mcols(all_masks)$ImageNb <- c("1", "2","5")
+head(unique(as.numeric(all_masks[[1]])))
+all_masks <- scaleImages(all_masks, 2^16-1)
+head(unique(as.numeric(all_masks[[1]])))
+
+
+all_masks_2 <- loadImages(path.to.images, "2020121_LC_NSCLC_TMA_87_A_s0_a1_ac_ilastik_s2_Probabilitiescells_mask.tiff")
+all_masks_2 <- scaleImages(all_masks_2, 2^16-1)
+
+head(unique(as.numeric(all_masks_2[[1]])))
+
+unique(all.cells[, all.cells$RoiID=="86_A_A1,2"]$ImageID)
+
+tma86 <-all.cells[, all.cells$RoiID=="86_A_A1,1"|all.cells$RoiID=="86_A_A1,2"|all.cells$RoiID=="86_A_A1,5"]
+
+tma86 <-sce_86_A[,sce_86_A$acID==1|sce_86_A$acID==2|sce_86_A$acID==5]
+tma86_o <-sce_86A[, sce_86A$ROI==1|sce_86A$ROI==2|sce_86A$ROI==5]
+tma86_o$ImageNb <- tma86_o$ROI
+
+tma87 <-all.cells[, all.cells$RoiID=="87_A_A1,1"]
+
+tma87 <-all.cells[, all.cells$TMA=="87A"]
+
+all.final
+tma86$ImageNb <- tma86$acID
+
+tma87$ImageNb <- tma87$ImageID
+
+mcols(all_masks_2)$ImageNb <- c("1")
+plotCells(mask = all_masks, object = tma86,
+            cell_id = "CellNumber", img_id = "ImageNb",
+            outline_by = "cell_category")
+
+plotCells(all_masks, object = tma86_o,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = c("Pan Cytokeratin + Keratin Epithelial","SMA"),
+            exprs_values = "c_counts_asinh_scaled")
+
+#rownames(colData(tma86)) <- paste(colData(tma86)$TMA, colData(tma86)$ImageID, colData(tma86)$CellNumber, sep="_")
+plotCells(all_masks, object = tma86,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = c("Pan Cytokeratin + Keratin Epithelial","SMA"),
+            exprs_values = "c_counts_asinh_scaled")
+
+plotCells(all_masks, object = tma86,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "CellNumber")
+
+plotCells(all_masks_2, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = c("Pan Cytokeratin + Keratin Epithelial","SMA"),
+            exprs_values = "c_counts_asinh_scaled")
+
+path.to.images <-file.path(wd,"mask")
+path.to.images <-file.path(wd,"cytomapper")
+
+all_masks <- loadImages(path.to.images, pattern = "2020121_LC_NSCLC_TMA_87_A") 
+#mcols(all_masks)$ImageNb <- c("1", "2","3")
+#mcols(all_masks)$ImageNb <- c("138")
+mcols(all_masks)$ImageNb <- c("1" ,"10","100","102", "104","105","106","107","108","109", "11","110","111","112","114","115","116","117","118","119","12","121","122" ,"123", "124" ,"125", "126", "127", "128" ,"129" ,"13" , "130" ,"131" ,"132" ,"133" ,"134" ,"135" ,"137", "138", "14" , "15" , "16" , "17" , "18" , "19"  ,"2"   ,"20"  ,"21"  ,"22"  ,"23" , "24" , "25" , "26" , "27" , "28"  ,"29" , "3"  , "30"  ,"31"  ,"32"  ,"33"  ,"34"  ,"36",  "37" , "38" , "39" , "4" ,  "40"  ,"41" , "42" , "43"  ,"44"  ,"45"  ,"46"  ,"47"  ,"48" , "5"  , "50" , "51" , "52" , "53",  "54"  ,"55"  ,"56" , "57"  ,"58" , "59" , "6"   ,"60" , "61" , "63" , "64"  ,"65" , "66" , "67"  ,"68" , "69" , "7" ,  "70" , "71",  "72" , "73" ,"74"  ,"75" , "76" , "77",  "78" , "79" , "8"  , "80"  ,"81"  ,"82"  ,"83" , "85",  "86" , "87"  ,"88" , "89" , "9"   ,"90"  ,"91" , "92" , "93" ,"94"  ,"95"  ,"96" , "97"  ,"98"  ,"99" )
+#mcols(all_masks)$ImageNb <- c(ac_sub$ImageNb)
+head(unique(as.numeric(all_masks[[1]])))
+all_masks <- scaleImages(all_masks, 2^16-1)
+
+tma87 <-sce_87_A[,sce_87_A$acID==1|sce_87_A$acID==2|sce_87_A$acID==3]
+tma87 <-sce_87_A[,sce_87_A$acID==138]
+tma87 <- all.cells[, all.cells$TMA=="87A"]
+
+tma87 <- all.sce_pat.roi[, all.sce_pat.roi$TMA=="87A"]
+
+tma87 <- all.category[, all.category$TMA=="87A"]
+tma87 <-tma87A
+tma87$ImageNb <- tma87$acID
+
+#tma87 <-sce_87_A
+tma87$ImageNb <- tma87$acID
+pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","panCK_SMA expression_87A_allCells_Categorised.pdf"), width=20, height=20)
+p1<-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = c("Pan Cytokeratin + Keratin Epithelial","SMA"),
+            exprs_values = "c_counts_asinh_scaled", return_plot = TRUE)
+dev.off()
+
+pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","87A_allCells_Categorised.pdf"), width=20, height=20)
+p2 <-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "cell_category",
+            colour = list(cell_category = c("Tumour"="red","Immune"="blue","T cell"="blue","vessel"="yellow", "Fibroblast"="green", "Other"="pink")), return_plot = TRUE)#, "Other"="white"
+dev.off()
+
+
+tma87$mclust <-factor(tma87$mclust)
+plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "mclust",
+            colour = list(mclust = c("1" = "green","2"="red")))
+
+plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = c("Pan Cytokeratin + Keratin Epithelial","SMA"),
+            exprs_values = "c_counts_asinh_scaled")
+
+library(cowplot)
+p3 <-plot_grid(ggdraw(p1$plot, clip = "on"), ggdraw(p2$plot))
+file3 <- tempfile("sce87A_tumour-nontumour_panCK_SMA-plots",fileext = ".png")
+save_plot(file3, p3, ncol=2, base_width = 10)
+
+unique(sce_87_A$acID)
+
+all_masks
+names(all_masks)
+ac_info <- str_split(names(all_masks), '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+
+ac_sub <- as.data.frame(ac_info) %>%
+            separate( V8,
+              into = c("TMA", "ImageNb"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$ImageNb
+as.data.frame(ac_info)
+
+mcols(all_masks)$ImageNb %in% unique(tma87$ImageNb)
+
+
+tma87$Distance <- tma87$Compartment
+tma87$Distance[tma87$Compartment <(0) & tma87$Compartment >(-30)] <- "-10 - 0"
+tma87$Distance[tma87$Compartment <=(-30) & tma87$Compartment >(-60)] <- "-10 - -20"
+tma87$Distance[tma87$Compartment <=(-60) & tma87$Compartment >(-90)] <- "-20 - -30"
+tma87$Distance[tma87$Compartment <=(-90) & tma87$Compartment >(-120)] <- "-30 - -40"
+tma87$Distance[tma87$Compartment <=(-120) & tma87$Compartment >(-150)] <- "-40 - -50"
+tma87$Distance[tma87$Compartment <=(-150) ] <- "< -50"
+tma87$Distance[tma87$Compartment >=0 ] <- ">0"
+tma87$Distance %>% unique
+tma87$Distance <- as.factor(tma87$Distance)
+
+plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "Distance",
+            colour = list(Distance = c("-10 - 0"="red","-10 - -20"="orange","-20 - -30"="yellow","-30 - -40"="green", "-40 - -50"="blue", "< -50"="white",">0"="grey")), return_plot = TRUE)#, "Other"="white"
+```
+
+```{r,fig.width=50, fig.height=50}
+pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","FINAL_tumour_vsNONtumour_87A.pdf"), width=20, height=20)
+p2 <-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "cat",
+            colour = list(cat = c("non_tumour" = "green","tumour"="red")), return_plot = TRUE)
+dev.off()
+
+pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","FINAL_tumour_vsNONtumour_87A_cell-category.pdf"), width=20, height=20)
+
+p3<-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "cell_category",
+            colour = list(cell_category = c("stroma" = "green","tumour"="red", "immune"="blue","undefined"="yellow")),return_plot = TRUE)
+dev.off()
+
+pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","Distance_to_tumour_87A.pdf"), width=20, height=20)
+
+tma87$Distance <- tma87$Compartment
+tma87$Distance[tma87$Compartment <(0) & tma87$Compartment >(-30)] <- "-10 - 0"
+tma87$Distance[tma87$Compartment <=(-30) & tma87$Compartment >(-60)] <- "-10 - -20"
+tma87$Distance[tma87$Compartment <=(-60) & tma87$Compartment >(-90)] <- "-20 - -30"
+tma87$Distance[tma87$Compartment <=(-90) & tma87$Compartment >(-120)] <- "-30 - -40"
+tma87$Distance[tma87$Compartment <=(-120) & tma87$Compartment >(-150)] <- "-40 - -50"
+tma87$Distance[tma87$Compartment <=(-150) ] <- "< -50"
+tma87$Distance[tma87$Compartment >=0 ] <- ">0"
+tma87$Distance %>% unique
+tma87$Distance <- as.factor(tma87$Distance)
+
+p <-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "Distance",
+            colour = list(Distance = c("-10 - 0"="red","-10 - -20"="orange","-20 - -30"="yellow","-30 - -40"="green", "-40 - -50"="blue", "< -50"="purple",">0"="grey")), return_plot = TRUE)#, "Other"="white"
+dev.off()
+```
+```{r, fig.width=50, fig.height=50}
+path.to.images <-file.path(wd,"mask")
+all_masks <- loadImages(path.to.images, pattern = "2020121_LC_NSCLC_TMA_87_A") 
+mcols(all_masks)$ImageNb <- c("1" ,"10","100","102", "104","105","106","107","108","109", "11","110","111","112","114","115","116","117","118","119","12","121","122" ,"123", "124" ,"125", "126", "127", "128" ,"129" ,"13" , "130" ,"131" ,"132" ,"133" ,"134" ,"135" ,"137", "138", "14" , "15" , "16" , "17" , "18" , "19"  ,"2"   ,"20"  ,"21"  ,"22"  ,"23" , "24" , "25" , "26" , "27" , "28"  ,"29" , "3"  , "30"  ,"31"  ,"32"  ,"33"  ,"34"  ,"36",  "37" , "38" , "39" , "4" ,  "40"  ,"41" , "42" , "43"  ,"44"  ,"45"  ,"46"  ,"47"  ,"48" , "5"  , "50" , "51" , "52" , "53",  "54"  ,"55"  ,"56" , "57"  ,"58" , "59" , "6"   ,"60" , "61" , "63" , "64"  ,"65" , "66" , "67"  ,"68" , "69" , "7" ,  "70" , "71",  "72" , "73" ,"74"  ,"75" , "76" , "77",  "78" , "79" , "8"  , "80"  ,"81"  ,"82"  ,"83" , "85",  "86" , "87"  ,"88" , "89" , "9"   ,"90"  ,"91" , "92" , "93" ,"94"  ,"95"  ,"96" , "97"  ,"98"  ,"99" )
+#mcols(all_masks)$ImageNb <- c(ac_sub$ImageNb)
+head(unique(as.numeric(all_masks[[1]])))
+all_masks <- scaleImages(all_masks, 2^16-1)
+
+tma87 <- all.filtered[, all.filtered$TMA=="87A"]
+tma87 <- all.pat.roi[, all.pat.roi$TMA=="87A"]
+tma87 <- all.sce[, all.sce$TMA=="87A"]
+
+tma87$ImageNb <- tma87$acID
+
+
+#pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","panCK_SMA expression_87A_allCells_Categorised.pdf"), width=20, height=20)
+p1<-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = c("Pan Cytokeratin + Keratin Epithelial","SMA"),
+            exprs_values = "c_counts_asinh_scaled", return_plot = TRUE)
+#dev.off()
+
+#pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","87A_allCells_Categorised.pdf"), width=20, height=20)
+p2 <-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "cell_category",
+            colour = list(cell_category = c("Tumour"="red","Immune"="blue","T cell"="blue","vessel"="yellow", "Fibroblast"="green")), return_plot = TRUE)
+#dev.off()
+
+pdf(file=file.path("/mnt/lena_processed2/NSCLC_NEW/plots_cytomapper","87A_tumour_stroma_Masks.pdf"), width=20, height=20)
+
+p3 <-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "Mask",
+            colour = list(Mask = c("tumour"="red","stroma"="green")), return_plot = TRUE)
+dev.off()
+
+```
+
+```{r, fig.width=50, fig.height=50}
+tma87$Mask <- ifelse(tma87$Compartment > 0, "tumour","stroma")
+p3 <-plotCells(all_masks, object = tma87,
+            img_id = "ImageNb", cell_id = "CellNumber",
+            colour_by = "Mask",
+            colour = list(Mask = c("tumour"="red","stroma"="green")), return_plot = TRUE)
+```
+
diff --git a/Plots.Rmd b/Plots.Rmd
new file mode 100644
index 0000000..0d5f998
--- /dev/null
+++ b/Plots.Rmd
@@ -0,0 +1,2035 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+
+library(dplyr)
+library(tidyr)
+library(mclust)
+library(ggplot2)
+library(RColorBrewer)
+library(scater)
+library(Rphenoannoy)
+set.seed(101100)
+```
+
+```{r}
+wd <- dirname(getwd())
+
+#Set working directory and folder structure
+data_folder <-(file.path(wd,"sce_objects","final objects with categories","FINAL"))
+
+#set plot folder for results
+plot_folder <-(file.path(wd,"plots"))
+
+```
+
+Define clinical data
+```{r clinical data, message=FALSE, warning=FALSE, echo=FALSE}
+clinical.data <- read.csv(file=file.path(wd,"sce_objects","clinical_data", "clinical_data_ROI_ac_combined_CORRECT.csv"))
+
+clinical.data = clinical.data[!duplicated(clinical.data$Patient_ID),]
+clinical.data$X <-NULL
+clinical.data$X.1 <-NULL
+head(clinical.data)
+```
+
+```{r}
+all.sce
+sce <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_minusOther.rds")
+
+other.sce <- readRDS("~/data/lena_processed2/NSCLC_NEW/sce_objects/final objects with categories/FINAL/OTHER_CLINICAL-DATA_FILTERED.rds")
+
+colnames(colData(sce))[!colnames(colData(sce))%in% colnames(colData(other.sce))]
+colnames(colData(other.sce))[!colnames(colData(other.sce))%in% colnames(colData(sce))]
+colnames(colData(other.sce))[!colnames(colData(sce))%in% colnames(colData(other.sce))]
+colnames(colData(other.sce))[!colnames(colData(other.sce))%in% colnames(colData(sce))]
+colnames(colData(sce))[!colnames(colData(sce))%in% colnames(colData(other.sce))]
+
+
+all.sce <- cbind(sce, other.sce)
+saveRDS(all.sce, file=file.path(data_folder, "FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_Other_new.rds"))
+all.sce <- readRDS(file=file.path(data_folder, "FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_Other_new.rds"))
+table(all.sce$cell_category)
+
+df <- colData(all.sce) %>% data.frame
+df <-df[!duplicated(df[,"RoiID"]),]
+df_pat <-df %>% count(Patient_ID)
+
+table(df_pat$n) %>% prop.table()*100
+df_pat[df_pat$n ==2,]
+df_pat[df_pat$n ==91,]
+
+all.sce$tumour_stroma <- ifelse(all.sce$Compartment <0, "Stroma","Tumour")
+df_t <-table(all.sce$cell_category, all.sce$tumour_stroma)
+write.csv(df_t,file=file.path(plot_folder,"table_cellcategory_tumour-stroma-mask.csv"))
+
+df_t <-table(all.sce$cell_category, all.sce$tumour_stroma)%>% prop.table(margin=1)*100
+write.csv(df_t,file=file.path(plot_folder,"table_cellcategory_tumour-stroma-mask_percent.csv"))
+
+
+df <- colData(all.sce) %>% data.frame
+df$Distance <- df$Compartment
+df$Distance[df$Compartment <= (-150)] <-"(-150)"
+df$Distance[df$Compartment > (-150) & df$Compartment <= (-120)] <-"-150 to -120"
+df$Distance[df$Compartment > (-120) & df$Compartment <= (-90)] <-"-120 to -90"
+df$Distance[df$Compartment > (-90) & df$Compartment <= (-60)] <-"-90 to -60"
+df$Distance[df$Compartment > (-60) & df$Compartment <= (-10)] <-"-60 to 10"
+df$Distance[df$Compartment > (-10) & df$Compartment <= (0)] <-"-10 to -0"
+df$Distance[df$Compartment > (0)] <-"(>0)"
+df_t <-table(df$Distance, df$cell_category)
+write.csv(df_t,file=file.path(plot_folder,"table_cellcategory_bins10px.csv"))
+```
+
+
+```{r, Define stroma markers, echo=F, warning=F, message=FALSE}
+all.marker <-rownames(all.sce)
+#all.marker <-rownames(all.sce.sub)
+
+bad.marker <- c("Iridium_191","Iridium_193","Cadherin-6","Histone H3") 
+good.marker <- all.marker[!all.marker %in% bad.marker]
+print(good.marker)
+```
+
+
+```{r, echo=F, warning=F, message=FALSE}
+unique(all.sce$cell_subtype)
+all.sce$cell_subtype[all.sce$cell_subtype=="other"] <-"Other"
+
+all.sce$cell_subtype[all.sce$cell_subtype=="iCAF_CD248"|all.sce$cell_subtype=="iCAF_CD34"] <-"iCAF"
+all.sce$cell_subtype[all.sce$cell_subtype=="tpCAF_CD10"|all.sce$cell_subtype=="tpCAF_CD73"] <-"tpCAF"
+all.sce$cell_subtype[all.sce$cell_subtype=="mCAF_MMP11"|all.sce$cell_subtype=="mCAF_Col_Cdh"] <-"mCAF"
+all.sce$cell_subtype[all.sce$cell_subtype=="other"] <-"Other"
+
+all.sce$cell_category %>% unique
+all.sce$cell_subtype %>% unique
+all.sce$immune_category
+
+all.sce$immune_category <-all.sce$cell_category
+all.sce$immune_category[all.sce$cell_category=="T cell"] <-"Immune"
+all.sce$immune_category %>% unique
+```
+
+
+```{r, subset stroma}
+#split cells by ImageNumber
+n_cells_dr <- 100
+cs <- split(seq_len(ncol(all.sce)), all.sce$Tma_ac)
+length(unique(all.sce$Tma_ac))
+#sample 'n_cells_dr' per ImageNumber
+cs <- unlist(lapply(cs, function(u) 
+  sample(u, min(n_cells_dr, length(u)))))
+#sub.test <- sce[,cs]
+all.sce.sub <- all.sce[,cs]
+
+#calculate percentage of cells subsetting results in
+p<-dim(assay(all.sce.sub))[2]/dim(assay(all.sce))[2]*100
+
+#results in % of all cells
+print(paste("Subsetting results in a total number of",dim(assay(
+  all.sce.sub))[2],"cells, which corresponds to",round(p,digits=2), "% of all cells of the original sce." ))
+
+saveRDS(all.sce.sub, file=file.path(data_folder, paste("all_merged_filtered_inclOther_SUB.rds")))
+```
+
+```{r, load subset, message=FALSE, warning=FALSE, echo=FALSE}
+all.sce.sub <-readRDS( file=file.path(data_folder, paste("all_merged_filtered_inclOther_SUB.rds")))
+#test <- readRDS(file=file.path(data_folder, paste("all_clustered_SUB.rds")))
+```
+
+```{r}
+all.sce.sub$cell_category %>% unique
+all.sce.sub$cell_subtype %>% unique
+all.sce.sub$cell_subtype[all.sce.sub$cell_subtype=="other"] <- "Other"
+
+
+```
+
+```{r, calculate umap, warning=F, message=F, echo=F, eval=FALSE}
+p <-c(10,50,100)
+p=50
+for(i in p){
+all.sce.sub <- runUMAP(all.sce.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("UMAP_p", i),
+                 #use_dimred="PCA_20",
+                 n_neighbors = i)
+saveRDS(all.sce.sub, file=file.path(data_folder, paste("all_merged_filtered_inclOther_SUB.rds")))
+
+}
+saveRDS(all.sce.sub, file=file.path(data_folder, paste("all_merged_filtered_inclOther_SUB.rds")))
+
+
+for(i in p){
+all.sce.sub <- runTSNE(all.sce.sub,
+                 exprs_values = "c_counts_asinh",
+                 name = paste0("tSNE_p", i),
+                 #use_dimred="PCA_20",
+                 perplexity = i)
+saveRDS(all.sce.sub, file=file.path(data_folder, paste("all_merged_filtered_inclOther_SUB.rds")))
+}
+```
+
+
+**UMAP with good markers**
+```{r,plot umap tumour marker, fig.width=16, fig.height=10, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.sce.sub)$`UMAP_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.sce.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=8)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+ggsave(filename=file.path(plot_folder, paste("NEW_UMAP_good_marker_2.png",sep="")), plot=p, width=12, height=10)
+
+```
+**tsne with good markers**
+```{r,plot tsne good marker, fig.width=16, fig.height=12, echo=F, message=FALSE,warning=FALSE}
+dat <-as.data.frame(reducedDims(all.sce.sub)$`tSNE_p50`)
+dat$cell <- rownames(dat)
+dat.counts <-as.data.frame(t((assay(all.sce.sub,"c_counts_asinh_scaled"))))
+dat.counts$cell <- rownames(dat.counts)
+dat.all <-merge(dat, dat.counts, by.x="cell", by.y="cell")
+
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) %in% good.marker], names_to = "target", values_to = "counts")
+dat.all.long <-pivot_longer(dat.all, cols = colnames(dat.all)[colnames(dat.all) == "CD20"], names_to = "target", values_to = "counts")
+
+
+p <-dat.all.long %>%
+  ggplot(aes(x=V1, y=V2, color=counts))+
+  facet_wrap(~target, scales = "free", ncol = 6)+
+  geom_point(alpha=0.5, size=0.2)+
+  scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts')+
+  ggtitle('')+
+  theme(strip.background = element_blank(),
+        axis.line=element_blank(),
+        axis.text.x=element_blank(),
+        axis.text.y=element_blank(),
+        axis.ticks=element_blank(),
+        axis.title.x=element_blank(),
+        axis.title.y=element_blank(),
+        panel.background=element_blank(),
+        panel.border=element_blank(),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        plot.background=element_blank(),
+         strip.text = element_text(size=10)) 
+
+##ggsave(filename=file.path(plot_folder, paste("sub_fibro_fibro-Marker_UMAP_p50_NEW.png",sep="")), plot=p, width=16, height=10)
+plot(p)
+#ggsave(filename=file.path(plot_folder, paste("NEW_tSNE_all-clustered_good-markers_6col.png",sep="")), plot=p, width=16, height=16)
+ggsave(filename=file.path(plot_folder, paste("NEW_tSNE_all-clustered_good-markers_6col_scale.pdf",sep="")), plot=p, width=16, height=16)
+
+```
+
+```{r plot types on umap}
+cluster <- "cell_category"
+plotReducedDim(all.sce.sub, "UMAP_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2", title="Cell Category") #,text_colour ="red"
+
+all.sce.sub$immune_category <-all.sce.sub$cell_category
+all.sce.sub$immune_category[all.sce.sub$cell_category=="T cell"] <-"Immune"
+
+cluster <- "immune_category"
+plotReducedDim(all.sce.sub, "UMAP_p50", colour_by=paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2")+scale_color_tableau()
+
+cluster <- "cell_type"
+plotReducedDim(all.sce.sub, "UMAP_p50", colour_by=paste(cluster), point_size=0.5,)+labs(x="UMAP-1", y="UMAP-2")+scale_color_tableau()
+
+```
+
+```{r plot types on tsne}
+cluster <- "cell_category"
+plotReducedDim(all.sce.sub, "tSNE_p50", colour_by=paste(cluster), text_by = paste(cluster), point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2", title="Cell Category")+scale_fill_tableau("Tableau 20") #,text_colour ="red"
+
+cluster <- "cell_type"
+plotReducedDim(all.sce.sub, "tSNE_p50", colour_by=paste(cluster), point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2")+scale_color_igv()
+
+all.sce.sub$immune_category <-all.sce.sub$cell_category
+all.sce.sub$immune_category[all.sce.sub$cell_category=="T cell"] <-"Immune"
+
+#save this
+cluster <- "immune_category"
+p <-plotReducedDim(all.sce.sub, "tSNE_p50", colour_by=paste(cluster), point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2")+scale_color_tableau()
+p <-p+scale_color_manual(values=c("#4E79A7","#E15759","#59A14F","#76B7B2","#F28E2B"))
+ggsave(filename=file.path(plot_folder, paste("NEW_tSNE_all-clustered_cell_category.png",sep="")), plot=p, width=9, height=6)
+
+library(pals)
+cluster <- "cell_subtype"
+p <-plotReducedDim(all.sce.sub, "tSNE_p50", colour_by=paste(cluster), point_size=0.5,)+labs(x="tSNE-1", y="tSNE-2")+scale_color_tableau()+scale_color_manual(values=as.vector(glasbey(31)))
+plot(p)
+ggsave(filename=file.path(plot_folder, paste("tSNE_all-clustered_cell_subtype.png",sep="")), plot=p, width=9, height=6)
+```
+
+```{r plot hm, fig.width=15, fig.height=20}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+agg_sce <-aggregateAcrossCells(all.sce, ids=all.sce$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+  #plot Heatmap from scater
+ scater::plotHeatmap(agg_sce,
+              #features = fibro.marker,
+              features=good.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              #symmetric = FALSE,
+              zlim=c(-0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              color_columns_by = c("cell_subtype","immune_category" ))
+  #plot Heatmap from scater
+  scater::plotHeatmap(agg_sce,
+              features = good.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = cluster,
+              width=15, height=15,
+              filename=file.path(plot_folder, paste0("HM_Rpheno_all",cluster,"_RPA.pdf")))
+  
+```
+
+
+```{r, fig.width=15, fig.height=15}
+library(circlize)
+col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
+col_fun = colorRamp2(c(seq(from=0, to=1, by=0.1)), c("#313695" ,"#4575B4" ,"#74ADD1" ,"#ABD9E9", "#E0F3F8", "#FFFFBF", "#FEE090", "#FDAE61", "#F46D43" ,"#D73027", "#A50026"))
+
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+lgd = Legend(col_fun = col_fun, title = "foo")
+rdylbu
+agg_sce <-aggregateAcrossCells(all.sce, ids=all.sce$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+all.sce$DX.name[is.na(all.sce$DX.name)] <- "NA"
+
+agg_sce <-aggregateAcrossCells(all.sce[, all.sce$DX.name=="Adenocarcinoma"|
+                                         all.sce$DX.name=="Squamous cell carcinoma"], ids=all.sce$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+agg_sce <- agg_sce[rownames(agg_sce) %in% good.marker,]
+
+hm.genes <- rownames(agg_sce)
+
+hm.val <- assay(agg_sce,"c_counts_asinh_scaled")[hm.genes,]
+
+#Type
+n.cells <-as.data.frame(table(all.sce[, all.sce$DX.name=="Adenocarcinoma"|
+                                         all.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype,
+                              all.sce[, all.sce$DX.name=="Adenocarcinoma"|
+                                         all.sce$DX.name=="Squamous cell carcinoma"]$DX.name))
+n.cells <-n.cells %>% pivot_wider(names_from = Var2, values_from="Freq")
+names(n.cells)[names(n.cells) == 'Var1'] <- 'Celltype'
+n.cells
+n.cells$Frequency <- rowSums(n.cells[2:3])
+#colnames(n.cells) <-c("ids","Frequency")
+df <-data.frame("Cellcategory"=all.sce$immune_category, "Celltype"=all.sce$cell_subtype) %>% unique
+n.cells <-left_join(n.cells, df, by="Celltype")
+col.pal = list(Celltype = as.vector(glasbey(30)), Cellcategory= palette("Tableau 10")[1:5])
+
+names(col.pal$Cellcategory) <- unique(all.sce$immune_category)
+names(col.pal$Celltype) <- unique(all.sce$cell_subtype)
+
+#no cluster colour
+col.pal <-list(Type= as.vector(glasbey(30)))
+
+names(col.pal$Type) <- unique(all.sce$immune_category)
+names(col.pal$Cluster) <- unique(all.sce$cell_subtype)
+
+col.pal
+
+#cell type
+p<-Heatmap(hm.val, name="Cluster", col=col_fun,clustering_method_rows = "ward.D2",clustering_method_columns = "ward.D2",top_annotation = HeatmapAnnotation(Frequency=anno_barplot(n.cells$Frequency), col=col.pal,Type=n.cells$Type, Cluster=n.cells$ids)) #,col=col.pal
+draw(p)
+
+p<-Heatmap(hm.val, name=" ",
+           col=col_fun,
+           clustering_method_rows = "ward.D2",
+           clustering_method_columns = "ward.D2",
+           top_annotation = HeatmapAnnotation(Cellnumber = anno_barplot(cbind(sqrt(n.cells$Adenocarcinoma), sqrt(n.cells$`Squamous cell carcinoma`)),
+                                                                 gp = gpar(fill = c("black","grey"), col = c("black","grey")),
+                                                                 height = unit(2, "cm")),
+                                              col=col.pal,
+                                              Cellcategory=n.cells$Cellcategory,
+                                              Celltype=n.cells$Celltype)) #,col=col.pal
+draw(p)
+
+#Save complex heatmap as pdf
+pdf(file=file.path(plot_folder, paste("C_HM_all-cells-freq_sqrt_2cmbar.pdf")), width=15, height=15)
+draw(p)
+dev.off()
+```
+
+
+#clean
+```{r, fig.width=15, fig.height=15}
+library(circlize)
+col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
+col_fun = colorRamp2(c(seq(from=0, to=1, by=0.1)), c("#313695" ,"#4575B4" ,"#74ADD1" ,"#ABD9E9", "#E0F3F8", "#FFFFBF", "#FEE090", "#FDAE61", "#F46D43" ,"#D73027", "#A50026"))
+
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+lgd = Legend(col_fun = col_fun, title = "foo")
+rdylbu
+agg_sce <-aggregateAcrossCells(all.sce, ids=all.sce$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+all.sce$DX.name[is.na(all.sce$DX.name)] <- "NA"
+
+agg_sce <-aggregateAcrossCells(all.sce[, all.sce$DX.name=="Adenocarcinoma"|
+                                         all.sce$DX.name=="Squamous cell carcinoma"], ids=all.sce$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+agg_sce <- agg_sce[rownames(agg_sce) %in% good.marker,]
+
+hm.genes <- rownames(agg_sce)
+
+hm.val <- assay(agg_sce,"c_counts_asinh_scaled")[hm.genes,]
+
+#Type
+n.cells <-as.data.frame(table(all.sce[, all.sce$DX.name=="Adenocarcinoma"|
+                                         all.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype,
+                              all.sce[, all.sce$DX.name=="Adenocarcinoma"|
+                                         all.sce$DX.name=="Squamous cell carcinoma"]$DX.name))
+n.cells <-n.cells %>% pivot_wider(names_from = Var2, values_from="Freq")
+names(n.cells)[names(n.cells) == 'Var1'] <- 'Celltype'
+n.cells
+n.cells$Frequency <- rowSums(n.cells[2:3])
+#colnames(n.cells) <-c("ids","Frequency")
+df <-data.frame("Cellcategory"=all.sce$immune_category, "Celltype"=all.sce$cell_subtype) %>% unique
+n.cells <-left_join(n.cells, df, by="Celltype")
+col.pal = list(Celltype = as.vector(glasbey(30)), Cellcategory= palette("Tableau 10")[1:5])
+
+names(col.pal$Cellcategory) <- unique(all.sce$immune_category)
+names(col.pal$Celltype) <- unique(all.sce$cell_subtype)
+
+#no cluster colour
+col.pal <-list(Type= as.vector(glasbey(30)))
+
+names(col.pal$Type) <- unique(all.sce$immune_category)
+names(col.pal$Cluster) <- unique(all.sce$cell_subtype)
+
+col.pal
+
+#cell type
+p<-Heatmap(hm.val, name="Cluster", col=col_fun,clustering_method_rows = "ward.D2",clustering_method_columns = "ward.D2",top_annotation = HeatmapAnnotation(Frequency=anno_barplot(n.cells$Frequency), col=col.pal,Type=n.cells$Type, Cluster=n.cells$ids)) #,col=col.pal
+draw(p)
+
+p<-Heatmap(hm.val, name=" ",
+           col=col_fun,
+           clustering_method_rows = "ward.D2",
+           clustering_method_columns = "ward.D2",
+           top_annotation = HeatmapAnnotation(Cellnumber = anno_barplot(cbind(sqrt(n.cells$Adenocarcinoma), sqrt(n.cells$`Squamous cell carcinoma`)),
+                                                                 gp = gpar(fill = c("black","grey"), col = c("black","grey")),
+                                                                 height = unit(2, "cm")),
+                                              col=col.pal,
+                                              Cellcategory=n.cells$Cellcategory,
+                                              Celltype=n.cells$Celltype)) #,col=col.pal
+draw(p)
+
+#Save complex heatmap as pdf
+#pdf(file=file.path(plot_folder, paste("C_HM_all-cells-freq_sqrt_2cmbar.pdf")), width=15, height=15)
+#draw(p)
+#dev.off()
+```
+#Immune cells
+
+```{r plots, fig.width=15, fig.height=10}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+immune.marker <-c("Myeloperoxidase (MPO)" ,"HLA-DR","CD20","CD68","Indoleamine 2- 3-dioxygenase (IDO)","CD3" ,"TCF1/TCF7" ,"FOXP3","CD45RA + CD45R0","CD8a","CD4"  ,"CD15" ,"Ki-67","CD279 (PD-1)")
+
+
+all.sce$immune_category <-all.sce$cell_category
+all.sce$immune_category[all.sce$cell_category=="T cell"] <-"Immune"
+
+agg_sce <-aggregateAcrossCells(all.sce[,all.sce$immune_category =="Immune"], ids=all.sce[,all.sce$immune_category =="Immune"]$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+  #plot Heatmap from scater
+  scater::plotHeatmap(agg_sce,
+              features = immune.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = cluster)
+  #plot Heatmap from scater
+  scater::plotHeatmap(agg_sce,
+              features = immune.marker,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = cluster,
+              width=10, height=10,
+              filename=file.path(plot_folder, paste0("HM_Immune_inclTcells",cluster,"_RPA.pdf")))
+  
+```
+
+
+#CAFs
+
+```{r, fig.width=15, fig.height=10}
+hmcol<-rev(brewer.pal(11,"RdBu"))
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+
+fibro.marker.cluster <-c("SMA","FAP", "Cadherin-11", "Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 #"VCAM1",
+                 "Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73",
+                 #"MMP9",
+                 "CD10","Vimentin","CD248 / Endosialin",
+                 #"LYVE-1",
+                 "PDGFR-b","CD34","CXCL12","CCL21","Ki-67",
+                 #"Caveolin-1",
+                 "CD146","PNAd")
+
+all.sce$immune_category <-all.sce$cell_category
+all.sce$immune_category[all.sce$cell_category=="T cell"] <-"Immune"
+
+agg_sce <-aggregateAcrossCells(all.sce[,all.sce$cell_category =="Fibroblast"], ids=all.sce[,all.sce$cell_category =="Fibroblast"]$cell_type, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+  #plot Heatmap from scater
+  scater::plotHeatmap(agg_sce,
+              features = fibro.marker.cluster,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = cluster)
+  
+  
+  #plot Heatmap from scater
+  scater::plotHeatmap(agg_sce,
+              features = fibro.marker.cluster,
+              exprs_values = "c_counts_asinh_scaled",
+              symmetric = FALSE,
+             # zlim=c(0,1),
+              color=rdylbu,
+              sortrowss=FALSE,
+              show_colnames=TRUE,
+              colour_columns_by = cluster,
+              width=10, height=10,
+              filename=file.path(plot_folder, paste0("HM_CAF_RPA.pdf")))
+  
+```
+#CAF c_HM
+```{r, fig.width=15, fig.height=15}
+library(circlize)
+col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
+col_fun = colorRamp2(c(seq(from=0, to=1, by=0.1)), c("#313695" ,"#4575B4" ,"#74ADD1" ,"#ABD9E9", "#E0F3F8", "#FFFFBF", "#FEE090", "#FDAE61", "#F46D43" ,"#D73027", "#A50026"))
+category20 <- c("#1F77B4","#FF7F0E","#2CA02C","#D62728","#9467BD","#8C564B","#E377C2","#7F7F7FFF","#BCBD22","#17BECF","#AEC7E8")
+
+fibro.marker.cluster <-c("SMA","FAP", "Cadherin-11", "Carbonic Anhydrase IX","Collagen I + Fibronectin",
+                 #"VCAM1",
+                 "Indoleamine 2- 3-dioxygenase (IDO)","Podoplanin","MMP11","CD73",
+                 #"MMP9",
+                 "CD10","Vimentin","CD248 / Endosialin",
+                 #"LYVE-1",
+                 "PDGFR-b","CD34","Ki-67",
+                 #"Caveolin-1",
+                 "CD146")
+
+rdylbu <-rev(brewer.pal(11,"RdYlBu"))
+lgd = Legend(col_fun = col_fun, title = "foo")
+rdylbu
+
+all.sce$DX.name[is.na(all.sce$DX.name)] <- "NA"
+
+fibro.sce <- all.sce[,all.sce$cell_category =="Fibroblast"]
+
+agg_sce <-aggregateAcrossCells(fibro.sce, ids=fibro.sce$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+
+agg_sce <-aggregateAcrossCells(fibro.sce[, fibro.sce$DX.name=="Adenocarcinoma"|
+                                         fibro.sce$DX.name=="Squamous cell carcinoma"], ids=fibro.sce[, fibro.sce$DX.name=="Adenocarcinoma"|
+                                         fibro.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype, average=TRUE, use_exprs_values="c_counts_asinh_scaled" )
+
+agg_sce <- agg_sce[rownames(agg_sce) %in% fibro.marker.cluster,]
+
+
+agg_sce$cell_subtype <- factor(agg_sce$cell_subtype, levels = c("Collagen_CAF", "dCAF", "hypoxic_CAF","hypoxic_tpCAF","iCAF","IDO_CAF","mCAF","PDPN_CAF","SMA_CAF","tpCAF","vCAF"))
+fibro.sce$cell_subtype <- factor(fibro.sce$cell_subtype, levels = c("Collagen_CAF", "dCAF", "hypoxic_CAF","hypoxic_tpCAF","iCAF","IDO_CAF","mCAF","PDPN_CAF","SMA_CAF","tpCAF","vCAF"))
+
+#df_t$Celltype <- factor(df_t$Celltype, levels = c("Other", "Immune", "Stroma","Tumour"))
+
+hm.genes <- rownames(agg_sce)
+#hm.genes <- fibro.marker.cluster
+hm.val <- assay(agg_sce,"c_counts_asinh_scaled")[hm.genes,]
+
+#Type
+n.cells <-as.data.frame(table(fibro.sce[, fibro.sce$DX.name=="Adenocarcinoma"|
+                                         fibro.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype,
+                              fibro.sce[, fibro.sce$DX.name=="Adenocarcinoma"|
+                                         fibro.sce$DX.name=="Squamous cell carcinoma"]$DX.name))
+n.cells <-n.cells %>% pivot_wider(names_from = Var2, values_from="Freq")
+names(n.cells)[names(n.cells) == 'Var1'] <- 'Celltype'
+n.cells
+n.cells$Frequency <- rowSums(n.cells[2:3])
+#colnames(n.cells) <-c("ids","Frequency")
+
+df <-data.frame("Cellcategory"=fibro.sce$immune_category, "Celltype"=fibro.sce$cell_subtype) %>% unique
+n.cells <-left_join(n.cells, df, by="Celltype")
+col.pal = list(Celltype = as.vector(category20), Cellcategory= palette("Tableau 10")[1])
+
+names(col.pal$Cellcategory) <- unique(fibro.sce$immune_category)
+names(col.pal$Celltype) <- unique(fibro.sce$cell_subtype)
+names(col.pal$Celltype) <- levels(fibro.sce$cell_subtype)
+
+#no cluster colour
+#col.pal <-list(Type= as.vector(category20)(11))
+
+#names(col.pal$Type) <- unique(fibro.sce$immune_category)
+#names(col.pal$Cluster) <- unique(fibro.sce$cell_subtype)
+
+#col.pal
+
+#cell type
+p<-Heatmap(hm.val, name="Cluster", col=col_fun,clustering_method_rows = "ward.D2",clustering_method_columns = "ward.D2",top_annotation = HeatmapAnnotation(Frequency=anno_barplot(n.cells$Frequency), col=col.pal,Type=n.cells$Type, Cluster=n.cells$ids)) #,col=col.pal
+draw(p)
+
+p<-Heatmap(hm.val, name=" ",
+           col=col_fun,
+           clustering_method_rows = "ward.D2",
+           clustering_method_columns = "ward.D2",
+           top_annotation = HeatmapAnnotation(Cellnumber = anno_barplot(cbind(sqrt(n.cells$Adenocarcinoma), sqrt(n.cells$`Squamous cell carcinoma`)),
+                                                                 gp = gpar(fill = c("black","grey"), col = c("black","grey")),
+                                                                 height = unit(2, "cm")),
+                                              col=col.pal,
+                                              #Cellcategory=n.cells$Cellcategory,
+                                              Celltype=n.cells$Celltype)) #,col=col.pal
+draw(p)
+
+#Save complex heatmap as pdf
+pdf(file=file.path(plot_folder, paste("C_HM_CAF-freq_sqrt_2cmbar.pdf")), width=8, height=8)
+draw(p)
+dev.off()
+```
+
+#CAF proportions per patient including clinical data and patient stratification into high and low
+```{r, fig.width=12, fig.height=6}
+category20 <- c("#1F77B4","#FF7F0E","#2CA02C","#D62728","#9467BD","#8C564B","#E377C2","#7F7F7FFF","#BCBD22","#17BECF","#AEC7E8")
+
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+
+df$X <- NULL
+cols <- df %>% select(-Patient_ID) %>% colnames
+df[cols] <- lapply(df[cols], factor)
+ clinical.subset <- clinical.data %>% select(Patient_ID, Grade, Gender, Stage, DX.name, Dist.Met, LN.Met, NeoAdj, DX.name, Relapse)
+ 
+ 
+clinical.subset$DX.name[is.na(clinical.subset$DX.name)] <- "NA"
+clinical.subset$Grade[is.na(clinical.subset$Grade)] <- "NA"
+clinical.subset$Gender[is.na(clinical.subset$Gender)] <- "NA"
+clinical.subset$Dist.Met[is.na(clinical.subset$Dist.Met)] <- "NA"
+clinical.subset$LN.Met[is.na(clinical.subset$LN.Met)] <- "NA"
+clinical.subset$NeoAdj[is.na(clinical.subset$NeoAdj)] <- "NA"
+clinical.subset$Relapse[is.na(clinical.subset$Relapse)] <- "NA"
+
+
+clinical.subset <- clinical.subset[clinical.subset$DX.name=="Adenocarcinoma"|clinical.subset$DX.name=="Squamous cell carcinoma",]
+clinical.subset$Grade <- factor(clinical.subset$Grade)
+clinical.subset$Gender <- factor(clinical.subset$Gender)
+clinical.subset$Stage <- factor(clinical.subset$Stage)
+clinical.subset$DX.name <- factor(clinical.subset$DX.name)
+clinical.subset$Dist.Met <- factor(clinical.subset$Dist.Met)
+clinical.subset$LN.Met <- factor(clinical.subset$LN.Met)
+clinical.subset$NeoAdj <- factor(clinical.subset$NeoAdj)
+clinical.subset$Relapse <- factor(clinical.subset$Relapse)
+clinical.subset$DX.name <- factor(clinical.subset$DX.name)
+clinical.subset <-left_join(clinical.subset, df, by="Patient_ID")
+ 
+levels(clinical.subset$Gender) <- c("male","female","NA")
+levels(clinical.subset$Grade) <- c("Grade 1","Grade 2","Grade 3","NA")
+levels(clinical.subset$Stage) <- c("Stage 1","Stage 2","Stage 3","Stage 4","Stage 5","Stage 6","Stage 7","NA")
+
+tdat <-as.data.frame(table( fibro.sce[, fibro.sce$Patient_ID!="Control"&
+                                           fibro.sce$DX.name=="Adenocarcinoma"|
+                                           fibro.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            fibro.sce[, fibro.sce$Patient_ID!="Control"&
+                                           fibro.sce$DX.name=="Adenocarcinoma"|
+                                           fibro.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide <- data.frame(tdat_wide)
+
+ 
+tdat_wide_c <- left_join(tdat_wide, clinical.subset, by = "Patient_ID")
+tdat_wide_c$DX.name[is.na(tdat_wide_c$DX.name)] <- "NA"
+tdat_wide_c$Grade[is.na(tdat_wide_c$Grade)] <- "NA"
+tdat_wide_c$Gender[is.na(tdat_wide_c$Gender)] <- "NA"
+tdat_wide_c$Dist.Met[is.na(tdat_wide_c$Dist.Met)] <- "NA"
+tdat_wide_c$LN.Met[is.na(tdat_wide_c$LN.Met)] <- "NA"
+tdat_wide_c$NeoAdj[is.na(tdat_wide_c$NeoAdj)] <- "NA"
+tdat_wide_c$Relapse[is.na(tdat_wide_c$Relapse)] <- "NA"
+
+tdat_wide_c$Patient_ID <- factor(tdat_wide_c$Patient_ID, levels = hc$labels[hc$order])
+tdat_wide_c <- tdat_wide_c %>% as.data.frame
+rownames(tdat_wide_c) <-tdat_wide_c$Patient_ID
+
+library(wesanderson)
+row_dend = as.dendrogram(hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2"))
+
+h1 <-Heatmap(as.matrix(tdat_wide_c$Patient_ID),
+             rect_gp = gpar(type = "none"),
+             cluster_rows=row_dend,
+             show_heatmap_legend = FALSE,
+        left_annotation = rowAnnotation(Proportions=anno_barplot(tdat_wide_c[2:12], 
+                                                                 gp=gpar(fill=category20, 
+                                                                         col=category20),
+                                                                 bar_width = 1, 
+                                                                 width = unit(6, "cm"), 
+                                                                 height = unit(6, "cm")),
+                    Grade=tdat_wide_c$Grade,
+                    Gender=tdat_wide_c$Gender,
+                    Dist.Met=tdat_wide_c$Dist.Met,
+                    LN.Met=tdat_wide_c$LN.Met,
+                    Type=tdat_wide_c$DX.name,
+                    NeoAdj=tdat_wide_c$NeoAdj,
+                    Relapse=tdat_wide_c$Relapse,
+                   # mCAF_G= tdat_wide_c$mCAF_G,
+                    tpCAF_G= tdat_wide_c$tpCAF_G,
+                    iCAF_G= tdat_wide_c$iCAF_G,
+                    #SMA_CAF_G= tdat_wide_c$SMA_CAF_G,
+                    PDPN_CAF_G= tdat_wide_c$PDPN_CAF_G,
+                    IDO_CAF_G= tdat_wide_c$IDO_CAF_G,
+                    hypoxic_CAF_G= tdat_wide_c$hypoxic_CAF_G,
+                    vCAF_G = tdat_wide_c$vCAF_G,
+                    dCAF_G = tdat_wide_c$dCAF_G,
+                    hypoxic_tpCAF_G= tdat_wide_c$hypoxic_tpCAF_G,
+        col = list(Grade=c("Grade 1"=palette("Tableau 10")[6],
+                           "Grade 2"=palette("Tableau 10")[7],
+                           "Grade 3"=palette("Tableau 10")[8],
+                           "NA"="black"),
+               Gender = c("male"=brewer.pal(n = 8, name = "Dark2")[1],
+                           "female"=brewer.pal(n = 8, name = "Dark2")[4],
+                           "NA"="black"),
+               Dist.Met = c("Dist. Metastases"=brewer.pal(n = 8, name = "Dark2")[2],
+                           "No Dist. Metastases"=brewer.pal(n = 8, name = "Dark2")[3],
+                           "NA"="black"),
+               LN.Met = c("LN Metastases"=brewer.pal(n = 8, name = "Dark2")[5],
+                           "No LN Metastases"=brewer.pal(n = 8, name = "Dark2")[8],
+                           "NA"="black"),
+               Type = c("Squamous cell carcinoma"="black",
+                           "Adenocarcinoma"="grey"),
+               NeoAdj = c("NeoAdjuvantTherapy"=brewer.pal(n = 8, name = "Dark2")[7],
+                          "NoNeoAdjuvantTherapy"=brewer.pal(n = 8, name = "Dark2")[6],
+                          "NA"= "black"),
+                Relapse = c("0"=pal_jco()(10)[4],
+                          "1"=pal_jco()(10)[5],
+                          "NA"= "black"),
+              # mCAF_G = c("mCAF high"="#046C9A",
+               #              "mCAF low"="#ECCBAE"),
+               tpCAF_G = c("tpCAF high"="#046C9A",
+                             "tpCAF low"="#ECCBAE"),
+               #SMA_CAF_G = c("SMA_CAF high"="#046C9A",
+               #              "SMA_CAF low"="#ECCBAE"),
+               iCAF_G = c("iCAF high"="#046C9A",
+                             "iCAF low"="#ECCBAE"),
+               PDPN_CAF_G = c("PDPN_CAF high"="#046C9A",
+                             "PDPN_CAF low"="#ECCBAE"),
+               IDO_CAF_G = c("IDO_CAF high"="#046C9A",
+                             "IDO_CAF low"="#ECCBAE"),
+               vCAF_G = c("vCAF high"="#ECCBAE",
+                             "vCAF low"="#046C9A"),
+               dCAF_G = c("dCAF high"="#ECCBAE",
+                             "dCAF low"="#046C9A"),
+               hypoxic_CAF_G = c("hypoxic_CAF high"="#ECCBAE",
+                             "hypoxic_CAF low"="#046C9A"),
+               hypoxic_tpCAF_G = c("hypoxic_tpCAF high"="#ECCBAE",
+                             "hypoxic_tpCAF low"="#046C9A")
+               
+               )
+                    
+                    ))
+draw(h1,heatmap_legend_side = "right")
+
+pdf(file=file.path(plot_folder, paste("All_patients_CAF_info_right_legend.pdf")), width=7, height=10)
+draw(h1,heatmap_legend_side = "right")
+
+dev.off()
+   
+
+pdf(file=file.path(plot_folder, paste("All_patients_info_CAF-strat_reduced.pdf")), width=12, height=6)
+draw(h1,heatmap_legend_side = "right")
+
+dev.off()
+
+
+
+
+
+
+
+df_t <-prop.table(table( 
+                            all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$category_cell)) %>% data.frame
+colnames(df_t) <-c("Celltype","Freq")
+df_t$Celltype <- factor(df_t$Celltype, levels = c("Other", "Immune", "Stroma","Tumour"))
+ # levels(df_t$Var1) <- c("Tumour","Stroma","Immune","Other")
+p <-df_t%>%ggplot(aes(y=Freq, fill=Celltype, x=""))+geom_bar(position="fill", stat="identity")+scale_fill_tableau()+theme_void()
+plot(p)
+ggsave(p, file=file.path(plot_folder, "Barplot_proportions_tumour-immune-stroma-other.pdf"), width=2, height=8)
+```
+
+#da
+```{r  Differential Abundance Analysis Types over Metastasis clean, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+
+colData(fibro.sce)<-as.data.frame(colData(fibro.sce)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(fibro.sce))
+#if necessary: change group_id labels
+
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+
+
+subtype <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met","Relapse")
+
+#define the naming subtype aka Celltype
+
+j <-"cell_subtype"
+
+plot_list <- list()
+for (i in subtype) {
+  for(k in unique(dat$DX.name)){    #uncomment this if you want to split your loop e.g. by tumour type
+    dat.l <-subset(dat, DX.name==k) #uncomment this if you want to split your loop e.g. by tumour type
+
+dat.l <-dat
+#k="both tumour types" # comment this if you want to split by e.g. tumour type
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = "cell_subtype", values_from ="cell_subtype", values_fn = list(cell_subtype=length),names_prefix = "")
+
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "FibroTypes")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(FibroTypes,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  labs(x="Celltype", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10))
+#plot(p)
+
+
+  } #uncomment this if you want to split by e.g. tumour type
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save individual pdf plots for each variable
+  #pdf(file=file.path(plot_folder, paste0("DA_Celltype_n_over_",i,".pdf")))
+  #gridExtra::grid.arrange(grobs = plot_list)
+  #dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=FibroTypes))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+#end CAFs
+
+```{r}
+library("scales")
+
+palettes <- ggthemes_data[["tableau"]][["color-palettes"]][["regular"]]
+for (palname in names(palettes)) {
+  pal <- tableau_color_pal(palname)
+  max_n <- attr(pal, "max_n")
+  show_col(pal(max_n))
+  title(main = palname)
+}
+show_col(pal_d3("category20")(20))
+category20 <- c("#1f77b4","#ff7f0e","#2ca02c","#d62728","#9467bd","#8c564b","#e377c2","7f7f7f","#bcbd22","#17becf","#aec7e8")
+```
+
+
+#proportinos per patient
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+
+all.sce$category_cell <-all.sce$immune_category
+all.sce$category_cell[all.sce$cell_category=="vessel"] <-"Stroma"
+all.sce$category_cell[all.sce$cell_category=="Fibroblast"] <-"Stroma"
+
+all.sce$immune_category <-all.sce$cell_category
+all.sce$immune_category[all.sce$cell_category=="T cell"] <-"Immune"
+all.sce$DX.name[is.na(all.sce$DX.name)] <- "NA"
+
+tdat <-as.data.frame(table( all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$category_cell))
+
+tdat <-as.data.frame(table( all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$immune_category))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+
+row_dend = as.dendrogram(hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2"))
+
+sum(tdat_wide[,-1])
+```
+
+```{r, fig.width=12, fig.height=6}
+var <- "CT4"
+plot <- "immune_frequencies_clinical"
+
+m <- match(order.histo,clin$SaNr)
+fq_df <- cbind(data.table("sample" = order.histo),clin[m,sel.clin])
+fq_df$sample <- factor(fq_df$sample,levels = order.histo)
+
+
+df <-reshape2::melt(fq_df, id.vars =c("sample"), variable.name = "clinical_features", value.name = "value")
+length(unique(df$sample))
+
+
+p<- ggplot(df, aes(y=Patient_ID, x=clinical_features, fill= value))+
+  geom_tile(color="white")+
+     # scale_fill_manual(values = color_clusters_3)+
+  scale_fill_tableau("Tableau 20")+
+   theme_bw()+ theme(
+    panel.grid = element_blank(), 
+    legend.key.size = unit(4, "mm"),
+    axis.text = element_text(color = "black"),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    axis.title.x = element_blank(),
+    axis.ticks.x = element_blank())
+p
+
+file_nm <- sprintf("%s__%s_%s.pdf", pp_prefix, var, plot)
+ ggsave(p, filename = file.path(vsrs_folder, file_nm), height = 6, width = 12,dpi=300)
+ 
+ #####################################################################
+ clinical.subset <- clinical.data %>% select(Patient_ID, Grade, Gender, Stage, DX.name, Dist.Met, LN.Met)
+ clinical.subset <- clinical.subset[clinical.subset$DX.name=="Adenocarcinoma"|clinical.subset$DX.name=="Squamous cell carcinoma",]
+ clinical.subset$Grade <- factor(clinical.subset$Grade)
+ clinical.subset$Gender <- factor(clinical.subset$Gender)
+ clinical.subset$Stage <- factor(clinical.subset$Stage)
+ clinical.subset$DX.name <- factor(clinical.subset$DX.name)
+ clinical.subset$Dist.Met <- factor(clinical.subset$Dist.Met)
+ clinical.subset$LN.Met <- factor(clinical.subset$LN.Met)
+ levels(clinical.subset$Gender) <- c("male","female")
+  levels(clinical.subset$Grade) <- c("Grade 1","Grade 2","Grade 3")
+ levels(clinical.subset$Stage) <- c("Stage 1","Stage 2","Stage 3","Stage 4","Stage 5","Stage 6","Stage 7")
+ df <-reshape2::melt(clinical.subset, id.vars =c("Patient_ID"), variable.name = "clinical_features", value.name = "value")
+levels(clinical.subset$Stage)
+
+ha = HeatmapAnnotation(foo = anno_simple(1:10, height = unit(2, "cm")))
+ha = HeatmapAnnotation(Grade=anno_simple(clinical.data$Grade))
+ha = rowAnnotation(Grade=anno_simple(clinical.data$Grade))
+
+plot(ha)
+tdat_wide_c <- left_join(tdat_wide, clinical.data, by="Patient_ID")
+tdat_wide_c$Patient_ID <- factor(tdat_wide_c$Patient_ID, levels = hc$labels[hc$order])
+tdat_wide_c <- tdat_wide_c %>% as.data.frame
+rownames(tdat_wide_c) <-tdat_wide_c$Patient_ID
+hm <- rowAnnotation(Proportions=anno_barplot(tdat_wide_c[2:5], gp=gpar(fill=2:5, col=2:5),bar_width = 1, width = unit(12, "cm"), height = unit(6, "cm")),
+                    Grade=anno_simple(tdat_wide_c$Grade),
+                    Gender=anno_simple(tdat_wide_c$Gender),
+                    Dist.Met=anno_simple(tdat_wide_c$Dist.Met),
+                    LN.Met=anno_simple(tdat_wide_c$LN.Met),
+                    Type=anno_simple(tdat_wide_c$DX.name),
+                    NeoAdj=anno_simple(tdat_wide_c$NeoAdj),
+                    Relapse=anno_simple(tdat_wide_c$Relapse))
+plot(hm)
+
+
+hc$labels[hc$order]
+h1 <-Heatmap(as.matrix(tdat_wide_c$Patient_ID), rect_gp = gpar(type = "none"),cluster_rows=row_dend,show_heatmap_legend = FALSE,
+        left_annotation = rowAnnotation(Proportions=anno_barplot(tdat_wide_c[2:5], gp=gpar(fill=2:5, col=2:5),bar_width = 1, width = unit(12, "cm"), height = unit(6, "cm")),
+                    Grade=anno_simple(tdat_wide_c$Grade),
+                    Gender=anno_simple(tdat_wide_c$Gender),
+                    Dist.Met=anno_simple(tdat_wide_c$Dist.Met),
+                    LN.Met=anno_simple(tdat_wide_c$LN.Met),
+                    Type=anno_simple(tdat_wide_c$DX.name),
+                    NeoAdj=anno_simple(tdat_wide_c$NeoAdj),
+                    Relapse=anno_simple(tdat_wide_c$Relapse)
+                    
+                    ))
+draw(h1)
+
+library(wesanderson)
+#here
+col_c <- c("#4E79A7","#E15759","#59A14F","#76B7B2","#F28E2B")
+h1 <-Heatmap(as.matrix(tdat_wide_c$Patient_ID), rect_gp = gpar(type = "none"),cluster_rows=row_dend,show_heatmap_legend = FALSE,
+        left_annotation = rowAnnotation(Proportions=anno_barplot(tdat_wide_c[2:6], gp=gpar(fill=col_c[1:5], col=col_c[1:5]),bar_width = 1, width = unit(6, "cm"), height = unit(6, "cm")),
+                    Grade=tdat_wide_c$Grade,
+                    Gender=tdat_wide_c$Gender,
+                    Dist.Met=tdat_wide_c$Dist.Met,
+                    LN.Met=tdat_wide_c$LN.Met,
+                    Type=tdat_wide_c$DX.name,
+                    NeoAdj=tdat_wide_c$NeoAdj,
+                    Relapse=tdat_wide_c$Relapse,
+        col = list(Grade=c("1"=palette("Tableau 10")[6],
+                           "2"=palette("Tableau 10")[7],
+                           "3"=palette("Tableau 10")[8],
+                           "NA"="black"),
+               Gender = c("1"=brewer.pal(n = 8, name = "Dark2")[1],
+                           "2"=brewer.pal(n = 8, name = "Dark2")[4],
+                           "NA"="black"),
+               Dist.Met = c("Dist. Metastases"=brewer.pal(n = 8, name = "Dark2")[2],
+                           "No Dist. Metastases"=brewer.pal(n = 8, name = "Dark2")[3],
+                           "NA"="black"),
+               LN.Met = c("LN Metastases"=brewer.pal(n = 8, name = "Dark2")[5],
+                           "No LN Metastases"=brewer.pal(n = 8, name = "Dark2")[8],
+                           "NA"="black"),
+               Type = c("Squamous cell carcinoma"="black",
+                           "Adenocarcinoma"="grey",
+                           "NA"="black"),
+               NeoAdj = c("NeoAdjuvantTherapy"=brewer.pal(n = 8, name = "Dark2")[7],
+                          "NoNeoAdjuvantTherapy"=brewer.pal(n = 8, name = "Dark2")[6],
+                          "NA"= "black"),
+                Relapse = c("0"=pal_jco()(10)[4],
+                          "1"=pal_jco()(10)[5],
+                          "NA"= "black")
+               )
+                    
+                    ))
+draw(h1,heatmap_legend_side = "right")
+
+pdf(file=file.path(plot_folder, paste("New_All_patients_info_bottom_legend.pdf")), width=7, height=10)
+draw(h1,heatmap_legend_side = "bottom")
+
+dev.off()
+   
+
+pdf(file=file.path(plot_folder, paste("NEW_All_patients_info.pdf")), width=8, height=6)
+draw(h1,heatmap_legend_side = "right")
+
+dev.off()
+
+
+tdat_wide_c$DX.name[is.na(tdat_wide_c$DX.name)] <- "NA"
+tdat_wide_c$Grade[is.na(tdat_wide_c$Grade)] <- "NA"
+tdat_wide_c$Gender[is.na(tdat_wide_c$Gender)] <- "NA"
+tdat_wide_c$Dist.Met[is.na(tdat_wide_c$Dist.Met)] <- "NA"
+tdat_wide_c$LN.Met[is.na(tdat_wide_c$LN.Met)] <- "NA"
+tdat_wide_c$NeoAdj[is.na(tdat_wide_c$NeoAdj)] <- "NA"
+tdat_wide_c$Relapse[is.na(tdat_wide_c$Relapse)] <- "NA"
+
+
+
+
+
+df_t <-prop.table(table( 
+                            all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$immune_category)) %>% data.frame
+colnames(df_t) <-c("Celltype","Freq")
+df_t$Celltype <- factor(df_t$Celltype, levels = c("Fibroblast", "vessel", "Immune","Tumour","Other"))
+df_t$n <- df_t$Freq*100
+
+
+#here
+p <-df_t%>%ggplot(aes(y=Freq, fill=Celltype, x=""))+geom_bar(position="fill", stat="identity")+scale_fill_tableau()+theme_void()+scale_color_manual(values=c("#4E79A7","#E15759","#59A14F","#76B7B2","#F28E2B"))
+plot(p)
+ggsave(p, file=file.path(plot_folder, "NEW_Barplot_proportions_tumour-immune-stroma-other.pdf"), width=2, height=8)
+```
+
+Patient based metaclusters split by metacluster
+```{r  Barplot Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=12}
+#add clinical data bars
+tdat <-left_join(tdat,clinical.data, by="Patient_ID")
+clinical.data$Grade <- factor(clinical.data$Grade)
+clinical.data$Patient_ID <- factor(clinical.data$Patient_ID, levels = hc$labels[hc$order])
+
+clinical.data[clinical.data$Patient_ID %in% tdat$Patient_ID,]%>% ggplot(aes(y=Patient_ID, fill=Grade))+geom_bar()
+#Barplot proportions ordered by
+p <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 10")
+plot(p)
+ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_All_cells_Category_sito_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =15) %>%  as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+p2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+plt <-p1+
+  p+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+  p2
+
+plot(plt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_cell_type_proportions.pdf")), width=6, height=6)
+```
+
+
+#CAF plots correlation plots upper and lower proportions and densities
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_pat.roi$NeoAdj <- ifelse(fibro.sce_pat.roi$Chemo==1 |fibro.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_pat.roi[,fibro.sce_pat.roi$Patient_ID!="Control"&
+                                          fibro.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=fibro.sce_pat.roi$RoiID,
+                                "Area"=fibro.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=fibro.sce_pat.roi$Patient_ID,
+                                "DX.name"=fibro.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of CAF type densities",
+        mar=c(0,0,3,0))
+
+col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
+
+pdf(file=file.path(plot_folder, "CAF_density_corrplot_upper.pdf"), width=6, height=6)
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         type="upper",
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+        #col=rev(brewer.pal(8,"RdBu")),
+        col=rev(COL2('RdBu', 8)),
+        method="square",
+         pch.col = "white",
+         #type="upper",
+       #  title = "Correlation of CAF type densities",
+        mar=c(0,0,3,0))
+dev.off()
+
+#col=wes_palette("Zissou1", 100, type = "continuous"),
+```
+
+```{r  Correlations proportions per image CAF types,message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+fibro.sce_roi$NeoAdj <- ifelse(fibro.sce_roi$Chemo==1 |fibro.sce_roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(fibro.sce_roi[,fibro.sce_roi$DX.name!="Control"&
+                                          fibro.sce_roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           fibro.sce_roi[,fibro.sce_roi$DX.name!="Control"&
+                                          fibro.sce_roi$DX.name=="Adenocarcinoma"|
+                                          fibro.sce_roi$DX.name=="Squamous cell carcinoma"]$cell_type))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+tdat <-tdat %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$RoiID <-droplevels(tdat$RoiID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+#tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of CAF type proportions",
+        mar=c(0,0,3,0))
+
+pdf(file=file.path(plot_folder, "CAF_proportion_corrplot.pdf"), width=6, height=6)
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         type="lower",
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+        #col=rev(brewer.pal(8,"RdBu")),
+        col=rev(COL2('RdBu', 8)),
+        method="square",
+         pch.col = "white",
+         #type="upper",
+        # title = "Correlation of CAF type proportions",
+        mar=c(0,0,3,0))
+dev.off()
+```
+
+#all cells
+
+```{r  Table with immune cell numbers per patient, echo=FALSE, message=FALSE, warning=FALSE}
+#All Fibros together
+tbl <- as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control" ]$Patient_ID))
+colnames(tbl) <-c("Patient ID", "Fibro number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Immune cell numbers per patient overall including T-excluding undefined cells.csv")))
+#print(tbl)
+
+summary(tbl$`Fibro number overall`) #3-6259
+tbl[tbl$`Fibro number overall` <=100,] #43 patients have less than 100 F
+
+#Lowest 10% of all patients' Fibro numbers
+tbl[order(tbl$`Fibro number overall`),]$`Fibro number overall`[1:10] #1-23
+
+
+#Highest 10% of all patients' Fibro numbers
+tbl[order(-tbl$`Fibro number overall`),]$`Fibro number overall`[1:100] #760-2378
+
+tbl[tbl$`Fibro number overall` <=2000,] #43 patients have less than 100 Fibros=10%. 77 patients have less than 15 Fibros
+
+all_pat <- tbl[tbl$`Fibro number overall` <=2000,]$`Patient ID`
+
+length(unique(all.sce$Patient_ID))
+```
+
+Remove ROIs
+```{r  Table with Immune cell numbers per image, echo=FALSE, message=FALSE, warning=FALSE}
+#All Immune cells together
+tbl <- as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control"]$RoiID))
+colnames(tbl) <-c("ROI ID", "Fibro number overall")
+#write.csv(tbl, file=file.path(plot_folder, paste("Fibro numbers per ROI overall including Fibros-excluding undefined.csv")))
+#print(tbl)
+summary(tbl$`Fibro number overall`) #1-5075
+tbl[tbl$`Fibro number overall` <=1000,] #93 images have less than 50 Fibros (10% of median)
+#Lowest 10% of all patients' Fibro numbers
+tbl[order(tbl$`Fibro number overall`),]$`Fibro number overall`[1:200] #1-110
+
+#Highest 10% of all patients' Fibro numbers
+tbl[order(-tbl$`Fibro number overall`),]$`Fibro number overall`[1:200] #1385-5075
+#per image cut at 50 Fibros per image equals lowest 5% -> ensures that there's at least 50 Fibros per patient
+
+all_roi <- tbl[tbl$`Fibro number overall` <=1000,]$`ROI ID`
+```
+
+
+Remove patients and roi
+```{r  patient roi and pat_roi removal, echo=FALSE, message=FALSE, warning=FALSE}
+#Patient removal
+all.sce_pat <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$Patient_ID%in%all_pat]
+length(unique(all.sce_pat$Patient_ID)) #1025
+
+#Roi removal
+all.sce_roi <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$RoiID%in%all_roi]
+length(unique(all.sce_roi$Patient_ID)) #1039
+
+#Patient & Roi removal
+all.sce_pat.roi <- all.sce[,all.sce$Patient_ID!="Control"&
+                               !all.sce$Patient_ID%in%all_pat&
+                               !all.sce$RoiID%in%all_roi]
+length(unique(all.sce_pat.roi$Patient_ID)) #1025
+```
+
+```{r, save RDS for all pat roi removed}
+saveRDS(all.sce_pat, file=file.path(data_folder, "FINAL_all-sce_inc_all_pat.rds"))
+saveRDS(all.sce_roi,file=file.path(data_folder, "FINAL_all-sce_inc_all_roi.rds"))
+saveRDS(all.sce_pat.roi,file=file.path(data_folder, "FINAL_all-sce_inc_all_pat-roi.rds"))
+
+#readRDS(file=file.path(data_folder, "FINAL_All_Clustered_FILTERED_inc_tumour_vessel_immune_CAF_Other.rds"))
+```
+
+
+```{r  Correlations T cell type densities amongst eachother, message=FALSE, warning=FALSE, echo=FALSE, fig.width=6, fig.height=6}
+all.sce_pat.roi$NeoAdj <- ifelse(all.sce_pat.roi$Chemo==1 |all.sce_pat.roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_pat.roi[,all.sce_pat.roi$Patient_ID!="Control"&
+                                          all.sce_pat.roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_pat.roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce_pat.roi$RoiID,
+                                "Area"=all.sce_pat.roi$Area_px_Core,
+                                "Patient_ID"=all.sce_pat.roi$Patient_ID,
+                                "DX.name"=all.sce_pat.roi$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+tdat <-tdat %>% 
+  group_by(RoiID,Phenotype) %>%
+  summarise(across(-c(Patient_ID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+
+neoadj_roi <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(RoiID) 
+
+tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_roi$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+#test <- immune.pheno.sub %>% select(contains("Ratio"))
+#test$Ratio_tCAF_vCAF <-NULL
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+
+#plot
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlation of CAF type densities",
+        mar=c(0,0,3,0))
+library(circlize)
+col_fun = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red"))
+
+pdf(file=file.path(plot_folder, "All cellSUB_density_corrplot_upper.pdf"), width=6, height=6)
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         type="upper",
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+        #col=rev(brewer.pal(8,"RdBu")),
+        col=rev(COL2('RdBu', 8)),
+        method="square",
+         pch.col = "white",
+         #type="upper",
+       #  title = "Correlation of CAF type densities",
+        mar=c(0,0,3,0))
+dev.off()
+
+#col=wes_palette("Zissou1", 100, type = "continuous"),
+```
+
+```{r  Correlations proportions per image CAF types,message=FALSE,warning=FALSE,echo=FALSE, fig.width=6, fig.height=6}
+all.sce_roi$NeoAdj <- ifelse(all.sce_roi$Chemo==1 |all.sce_roi$Radio==1,"NeoAdjuvantTherapy","NoNeoAdjuvantTherapy")
+tdat <-as.data.frame(table(all.sce_roi[,all.sce_roi$DX.name!="Control"&
+                                          all.sce_roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_roi$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce_roi[,all.sce_roi$DX.name!="Control"&
+                                          all.sce_roi$DX.name=="Adenocarcinoma"|
+                                          all.sce_roi$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+tdat <- tdat[tdat$Phenotype!="normal"&tdat$Phenotype!="hypoxic",]
+tdat <-tdat %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$RoiID <-droplevels(tdat$RoiID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("RoiID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+#tdat_wide <-tdat_wide[!tdat_wide$RoiID%in% neoadj_pat$RoiID,]
+rownames(tdat_wide)<-tdat_wide$RoiID
+
+library(corrplot)
+cor(tdat_wide[,-1], method="pearson")
+#corrplot(cor(tdat_wide[,-1], method="pearson"), col=rev(brewer.pal(10,"RdBu")))
+
+sig <- cor.mtest(tdat_wide[,-1], conf.level = .95) #is this test right?
+
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+         col=rev(brewer.pal(10,"RdBu")),
+         pch.col = "white",
+         #type="upper",
+         title = "Correlated ratios of All celltype proportions",
+        mar=c(0,0,3,0))
+
+pdf(file=file.path(plot_folder, "All cellSUB_proportion_corrplot.pdf"), width=6, height=6)
+corrplot(cor(tdat_wide[,-1], method="pearson"),
+         type="lower",
+         tl.cex = .7,
+         number.cex = .7,
+         order = "hclust",
+         p.mat = sig$p,
+         insig = "label_sig",
+         tl.col = "black",
+         addrect = 3,
+         tl.srt = 90,
+         sig.level = c(.001, .01, .05),
+         pch.cex = .9,
+        #col=rev(brewer.pal(8,"RdBu")),
+        col=rev(COL2('RdBu', 8)),
+        method="square",
+         pch.col = "white",
+         #type="upper",
+        # title = "Correlation of CAF type proportions",
+        mar=c(0,0,3,0))
+dev.off()
+```
+
+
+
+#abundance testing caf types 
+
+
+#Differential abundance analysis loop   
+```{r  Differential Abundance Analysis Types over Metastasis, fig.width=6, fig.height=4, warning=F, message=F,echo=F, out.width="50%"}
+#Patient Number must be numeric
+library(tidyr)
+#groupId must be factor
+#t <-as.data.frame(table(fibro[,fibro$CAF_Type!="undefined"]$Patient_ID,factor(fibro[,fibro$CAF_Type!="undefined"]$CAF_Type)))
+#  colnames(t)<-c("Patient_ID","Phenotype","n")
+ # t$Phenotype <-factor(t$Phenotype)
+df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+#df <- read.csv(file=file.path(wd,"patient_strat","CAF_strat_hi_lo_density.csv"))
+
+df$X <-NULL
+  
+colData(all.sce_pat.roi)<-as.data.frame(colData(all.sce_pat.roi)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce_pat.roi))
+
+colData(all.sce)<-as.data.frame(colData(all.sce)) %>% 
+  group_by(Patient_ID) %>% mutate(NR=cur_group_id()) %>% DataFrame()
+dat <-as_tibble(colData(all.sce))
+#if necessary: change group_id labels
+
+dat <-left_join(dat, df, by="Patient_ID")
+#clinical.data$NR <-c(1:length(unique(clinical.data$Patient_ID)))
+#dat <-merge(t,clinical.data,by="Patient_ID")
+dat <-dat[dat$DX.name=="Adenocarcinoma" | dat$DX.name=="Squamous cell carcinoma",]
+#dat$LN.Met <- ifelse(dat$N ==0, "No LN Metastases", "LN Metastases")
+#dat$Dist.Met <- ifelse(dat$M.new ==0, "No Dist. Metastases", "Dist. Metastases")
+#dat$NeoAdj <- ifelse(dat$Radio==1 |dat$Chemo==1, "NeoAdjuvantTherapy", "NoNeoAdjuvantTherapy")
+
+#Only for grade
+#dat <-dat[dat$Grade=="2" | dat$Grade=="3",]
+#category <-"Grade"
+#dat<-dat %>% drop_na(Grade)
+
+#proportions
+caf_groups <- colnames(df[,-1])
+#density
+#caf_groups <- colnames(df[,-13])
+category <-c("Gender","Relapse","Chemo","Dist.Met","LN.Met",caf_groups)
+category <- caf_groups
+#group_id <-c("M.new")
+#dat$group_id <-ifelse(dat$M.new ==0, 0,1)
+#dat[[group_id]] <-as.factor(dat[[group_id]])
+
+#define the naming category aka Phenotype
+
+#dat<-dat %>% drop_na(paste(group_id))
+j <-"cell_subtype"
+
+plot_list <- list()
+for (i in category) {
+  #for(k in unique(dat$DX.name)){
+   # dat.l <-subset(dat, DX.name==k)
+
+dat.l <-dat
+k="both tumour types"
+dat.l$group_id <-as.factor(dat.l[[i]])
+dat.l<-dat.l %>% drop_na(paste(i))
+
+test <- dat.l %>% select(c("NR",j,"group_id"))
+#colnames(test) <-c("NR", "Phenotype","group_id")
+
+test_wide <- pivot_wider(test,id_cols=c("NR","group_id"),names_from = paste(j), values_from =paste(j), values_fn = list(cell_subtype=length),names_prefix = "")
+#names(test_wide)[names(test_wide) == "FibroType_1"] <- "FibroType_1 / tCAF"
+#names(test_wide)[names(test_wide) == "FibroType_2"] <- "FibroType_2 / vCAF"
+test_wide
+test_wide[is.na(test_wide)] <- 0
+
+design <- createDesignMatrix(
+  test_wide[,c("NR","group_id")], cols_design = c( "NR","group_id")
+)
+
+contrast <- createContrast(c(rep(0, ncol(design)-1),1))
+data.frame(parameters = colnames(design), contrast)
+
+test_wide_norm <- test_wide %>% select(-c("NR","group_id"))
+#Normalize
+norm_factors <- calcNormFactors(t(test_wide_norm), method = "TMM")
+y <- DGEList(t(test_wide_norm), norm.factors = norm_factors)
+
+#y <- DGEList(t(both[,-c("patient_id","group_id")]))
+y <- estimateDisp(y,design)
+
+fit <- glmFit(y, design)
+lrt <- glmLRT(fit, contrast = contrast)
+top <- edgeR::topTags(lrt, n = Inf, adjust.method = "BH", 
+        sort.by = "none")
+t.op <-as.data.frame(top)
+t.op <- tibble::rownames_to_column(t.op, "Neighbour_Cluster")
+
+plot_list[[k]] <-
+ggplot(t.op, aes(reorder(Neighbour_Cluster,logFC), logFC)) +
+  geom_col(aes(fill=PValue<0.05)) +
+  coord_flip() +
+  scale_fill_tableau()+
+  scale_fill_jco()+
+  labs(x="Neighbour_Cluster", y="logFC",
+       title=paste("Phenotypes over",i,k))+
+  theme_bw()+
+  theme(strip.background = element_blank(),
+        panel.background=element_rect(fill='white', colour = 'black'),
+        panel.grid.major=element_blank(),
+        panel.grid.minor=element_blank(),
+        axis.text.y = element_text(size=10),
+        legend.position = "none")
+#plot(p)
+
+# reference = Metastasis
+#ggsave(plot=p, file=file.path(plot_folder, paste("Differential_Abundance_Analysis_Fibro_n_over_",i,".pdf")))
+  #}
+  gridExtra::grid.arrange(grobs = plot_list)
+  
+    #save out in individual pdfs for each variable
+  pdf(file=file.path(plot_folder, paste0("DA_CAF-strat_n_cellSUBtypes_PROP_",i,".pdf")), width=6, height=6)
+  gridExtra::grid.arrange(grobs = plot_list)
+  dev.off()
+}
+
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=Neighbour_Cluster))+geom_point()+scale_color_tableau()
+t.op %>% ggplot(aes(x=logCPM, y=logFC, color=PValue<0.05))+geom_point()+scale_color_tableau()
+```
+
+
+
+
+#all cell types bar plot proportions and density
+
+## Patients are ordered by hierarchical clustering (euclidian, wards.D2)
+```{r  Fibro category proportions ordered by hierarchical clustering, message=FALSE, warning=FALSE, echo=FALSE}
+tdat <-as.data.frame(table( all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$Patient_ID,
+                            all.sce[, all.sce$Patient_ID!="Control"&
+                                           all.sce$DX.name=="Adenocarcinoma"|
+                                           all.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("Patient_ID","Phenotype","n")
+tdat <-tdat %>% subset(Phenotype!="normal"&Phenotype!="hypoxic")
+tdat <-tdat %>%
+    dplyr::group_by(Patient_ID) %>%
+    dplyr::mutate(freq = n / sum(n))
+tdat[is.na(tdat)] <- 0
+tdat$Phenotype <-droplevels(tdat$Phenotype)
+tdat$Patient_ID <-droplevels(tdat$Patient_ID)
+
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="freq",names_prefix = "")
+
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+hc <- hclust(dist(tdat_m[,-1], method="euclidian"),"ward.D2")
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = hc$labels[hc$order])
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = hc$labels[hc$order])
+
+tdat_wide
+```
+
+```{r  survival data Fibro category proportions high-low, message=FALSE, warning=FALSE, echo=FALSE}
+tdat_wide_ct <- left_join(tdat_wide, clinical.data, by="Patient_ID")
+
+neoadj_pat <- clinical.data %>% filter(NeoAdj == "NeoAdjuvantTherapy") %>% select(Patient_ID) 
+surv_dat <-as.data.table(tdat_wide_ct)
+surv_dat<- surv_dat[is.na(surv_dat$OS) !=T,]
+
+# Censoring for OS (1 = event (death due to disease), 0 = censored)
+surv_dat[,censoringOS := 0]
+surv_dat[Ev.O == 1,censoringOS := 1]
+
+# Censoring for disease free survival (1 = event (recurrence), 0 = censored)
+surv_dat[ ,censoringDFS := 0]
+surv_dat[ OS > DFS, censoringDFS := 1]
+surv_dat$censoringDFS <-as.numeric(as.character(surv_dat$censoringDFS))
+
+surv_dat <-surv_dat[surv_dat$DX.name=="Adenocarcinoma"|surv_dat$DX.name=="Squamous cell carcinoma",]
+surv_dat <-surv_dat[!surv_dat$Patient_ID%in% neoadj_pat,]
+
+#high low by median
+surv_dat$Collagen_CAF_G <- ifelse(surv_dat$Collagen_CAF >summary(surv_dat$Collagen_CAF)[3],"Collagen_CAF high","Collagen_CAF low")
+surv_dat$hypoxic_CAF_G <- ifelse(surv_dat$hypoxic_CAF >summary(surv_dat$hypoxic_CAF)[3],"hypoxic_CAF high","hypoxic_CAF low")
+surv_dat$mCAF_G <- ifelse(surv_dat$mCAF >summary(surv_dat$mCAF)[3],"mCAF high","mCAF low")
+surv_dat$SMA_CAF_G <- ifelse(surv_dat$SMA_CAF >summary(surv_dat$SMA_CAF)[3],"SMA_CAF high","SMA_CAF low")
+surv_dat$tpCAF_G <- ifelse(surv_dat$tpCAF >summary(surv_dat$tpCAF)[3],"tpCAF high","tpCAF low")
+surv_dat$iCAF_G <- ifelse(surv_dat$iCAF >summary(surv_dat$iCAF)[3],"iCAF high","iCAF low")
+surv_dat$vCAF_G <- ifelse(surv_dat$vCAF >summary(surv_dat$vCAF)[3],"vCAF high","vCAF low")
+surv_dat$dCAF_G <- ifelse(surv_dat$dCAF >summary(surv_dat$dCAF)[3],"dCAF high","dCAF low")
+surv_dat$hypoxic_tpCAF_G <- ifelse(surv_dat$hypoxic_tpCAF >summary(surv_dat$hypoxic_tpCAF)[3],"hypoxic_tpCAF high","hypoxic_tpCAF low")
+surv_dat$IDO_CAF_G <- ifelse(surv_dat$IDO_CAF >summary(surv_dat$IDO_CAF)[3],"IDO_CAF high","IDO_CAF low")
+surv_dat$PDPN_CAF_G <- ifelse(surv_dat$PDPN_CAF >summary(surv_dat$PDPN_CAF)[3],"PDPN_CAF high","PDPN_CAF low")
+
+surv_dat$Neutrophil_G <- ifelse(surv_dat$Neutrophil >summary(surv_dat$Neutrophil)[3],"Neutrophil high","Neutrophil low")
+surv_dat$Bcell_G <- ifelse(surv_dat$Bcell >summary(surv_dat$Bcell)[3],"B cell high","B cell low")
+surv_dat$Myeloid_G <- ifelse(surv_dat$Myeloid >summary(surv_dat$Myeloid)[3],"Myeloid high","Myeloid low")
+surv_dat$CD4_G <- ifelse(surv_dat$CD4 >summary(surv_dat$CD4)[3],"CD4 high","CD4 low")
+surv_dat$CD4_Treg_G <- ifelse(surv_dat$CD4_Treg >summary(surv_dat$CD4_Treg)[3],"CD4 Treg high","CD4 Treg low")
+surv_dat$CD4_IDO_G <- ifelse(surv_dat$IDO_CD4 >summary(surv_dat$IDO_CD4)[3],"CD4 IDO high","CD4 IDO low")
+surv_dat$CD4_PD1_G <- ifelse(surv_dat$PD1_CD4 >summary(surv_dat$PD1_CD4)[3],"CD4 PD1 high","CD4 PD1 low")
+surv_dat$CD4_TCF_G <- ifelse(surv_dat$`TCF1/7_CD4` >summary(surv_dat$`TCF1/7_CD4`)[3],"CD4 TCF1/7 high","CD4 TCF1/7 low")
+surv_dat$CD4_ki67_G <- ifelse(surv_dat$ki67_CD4 >summary(surv_dat$ki67_CD4)[3],"dividing CD4 high","dividing CD4 low")
+
+surv_dat$CD8_IDO_G <- ifelse(surv_dat$IDO_CD8 >summary(surv_dat$IDO_CD8)[3],"CD8 IDO high","CD8 IDO low")
+surv_dat$CD8_TCF_G <- ifelse(surv_dat$`TCF1/7_CD8` >summary(surv_dat$`TCF1/7_CD8`)[3],"CD8 TCF1/7 high","CD8 TCF1/7 low")
+surv_dat$CD8_G <- ifelse(surv_dat$CD8 >summary(surv_dat$CD8)[3],"CD8 high","CD8 low")
+surv_dat$CD8_ki67_G <- ifelse(surv_dat$ki67_CD8 >summary(surv_dat$ki67_CD8)[3],"dividing CD8 high","dividing CD8 low")
+
+#caf_strat <-surv_dat %>% select(contains(c("Patient_ID","_G")))
+#write.csv(caf_strat, file=file.path(wd,"patient_strat","CAF_strat_hi_low_proportion.csv"))
+
+surv <-c("PDPN_CAF_G","IDO_CAF_G","hypoxic_tpCAF_G","dCAF_G","vCAF_G","iCAF_G","tpCAF_G","SMA_CAF_G","mCAF_G","hypoxic_CAF_G","Collagen_CAF_G","Bcell_G","Neutrophil_G","Myeloid_G","CD8_G","CD4_G","CD4_Treg_G","CD4_IDO_G","CD4_PD1_G","CD4_TCF_G","CD4_ki67_G", "CD8_IDO_G","CD8_TCF_G","CD8_ki67_G")
+```
+
+
+#CoxPH for Fibro category proportions corrected for Stage, Grade and M
+```{r  Coxph Fibro category proportions, echo=F, message=FALSE, warning=FALSE}
+surv_dat
+surv_dat<-surv_dat[!surv_dat$OS == 0, ]
+time <- surv_dat$OS
+status <- surv_dat$censoringOS
+surv_dat$metacluster <-as.factor(surv_dat$metacluster)
+surv_dat$Grade <-as.numeric(surv_dat$Grade)
+rownames(surv_dat) <- surv_dat$Patient_ID
+#res.cox <- coxph(Surv(time, status) ~ metacluster, data = surv_dat)
+#res.cox
+
+
+res.cox <- coxph(Surv(time, status) ~   PDPN_CAF + IDO_CAF+hypoxic_tpCAF+dCAF+vCAF+iCAF+tpCAF+mCAF+hypoxic_CAF+SMA_CAF+Collagen_CAF+Bcell+Blood+CD4+CD4_Treg+CD8+HEV+IDO_CD4+IDO_CD8+ki67_CD4+ki67_CD8+Lymphatic+Myeloid+Neutrophil+PD1_CD4+`TCF1/7_CD4`+`TCF1/7_CD8`+Other+Stage+Grade+M.new, data =  surv_dat)
+
+#res.cox <- coxph(Surv(time, status) ~ `CD4+`+tpCAF+iCAF+mCAF+dCAF+vCAF+`SMA+ CAF`+`IDO+ CAF`+`dividing tpCAF`+`hypoxic CAF`+Grade, data =  surv_dat)
+summary(res.cox)
+
+
+td_res = tidy(res.cox, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = log(estimate) + log(std.error))
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = log(estimate) - log(std.error))
+td_res<-td_res[!td_res$std.error == 0, ]
+td_res <-td_res[td_res$term !="Grade"&
+                  td_res$term !="Stage"&
+                  td_res$term !="M.new",]
+library(ggsci)
+p <-ggplot(td_res, aes(y = term, x = log(estimate),color =p.value < 0.05, shape=estimate>1)) +
+geom_point(size = 4) +
+scale_shape_manual(values = c(16, 17))+
+geom_errorbar(aes(xmax = conf.high, xmin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+#xlim(0,2)+
+geom_vline(xintercept = 0,color = 'black')+
+theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+#scale_colour_tableau()
+  scale_color_jco()+
+  theme(axis.text.x = element_text(angle=45, hjust=1))
+#
+plot(p)
+#ggsave(plot=p, file=file.path(plot_folder, "CoxPH_All_cells_prop_SD.pdf"), width=9, height=3)
+```
+
+#lasso regressed coxph for all cells
+## Lasso-regressed cox-ph model  
+Including clinical parameters, patient metacluster and Fibro categories (CD4 vs CD8) classified as high / low as well as the proportion (continuous).   
+```{r  coxph Fibro subtype proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=8, fig.height=6}
+surv_dat$metacluster <-as.numeric(surv_dat$metacluster)
+fd_test <- fastDummies::dummy_cols(surv_dat, select_columns = c(paste(surv),"Typ","Grade","T.new","N","M.new","Stage","Smok","Gender", "metacluster")) #
+
+fd_test<-fd_test[!fd_test$OS == 0, ]
+time <- fd_test$OS
+status <- fd_test$censoringOS
+
+rownames(fd_test) <- fd_test$Patient_ID
+
+fd_test <-fd_test %>%select(-c(paste(surv),"metacluster",colnames(clinical.data),contains("censor"))) #))
+
+fd_test[is.na(fd_test)] <- 0
+fd_test_matrix <-data.matrix(fd_test)
+
+#determine lambda
+cv.fit <- cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+fit <- glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+#plot(cv.fit)
+#cv.fit$lambda.min
+
+#loop to determine optimal lambda for lasso regression
+MSEs <- NULL
+for (i in 1:100){
+                 cv <-  cv.glmnet(fd_test_matrix, Surv(time, status), family = "cox", maxit = 1000)
+                 MSEs <- cbind(MSEs, cv$cvm)
+             }
+  rownames(MSEs) <- cv$lambda
+  lambda.min <- as.numeric(names(which.min(rowMeans(MSEs))))
+
+#coef(fit, s = lambda.min)
+Coefficients <- coef(fit, s = lambda.min)
+Active.Index <- which(Coefficients != 0)
+Active.Coefficients <- Coefficients[Active.Index]
+#Coefficients
+
+#correrct
+
+
+df <-Coefficients %>% as.data.frame 
+df$Coefficients <- rownames(df)
+colnames(df) <- c("Predictor","Coefficient")
+df <-df[df$Predictor !=0,]
+
+na_td_res <- df[-grep("NA",df$Coefficient),]
+
+p <-ggplot(na_td_res, aes(x = reorder(Coefficient,Predictor), y = Predictor, color=Predictor >0)) +
+geom_point(size = 4) +
+#geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+#scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 0,color = 'black')+
+theme_bw()+
+  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
+  geom_segment(aes(y = 0, 
+                   x = Coefficient, 
+                   yend = Predictor, 
+                   xend = Coefficient), 
+               color = "black") +
+scale_colour_jco()+
+  ylim(-15,5)
+p
+########
+active_coefficients <- Coefficients[,1] != 0
+x <- fd_test_matrix[,active_coefficients]
+x <- data.frame(x)
+
+vct <-colnames(x)
+coxdf<-coxph(Surv(time,status)~.,data=x)
+
+td_res = tidy(coxdf, exponentiate = TRUE)
+td_res = td_res %>% arrange(estimate) %>% mutate(term = factor(term,levels = term))
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.high = estimate + std.error)
+
+td_res<-td_res %>%
+  rowwise() %>%
+  dplyr::mutate(conf.low = estimate - std.error)
+td_res<-td_res[!td_res$std.error == 0, ]
+na_td_res <- td_res[-grep("NA",td_res$term),]
+
+
+p <-ggplot(na_td_res, aes(x = term, y = estimate,color =p.value < 0.05)) +
+geom_point(size = 4) +
+geom_errorbar(aes(ymax = conf.high, ymin = conf.low))+
+scale_colour_viridis_d("p value < 0.05")+
+coord_flip()+
+geom_hline(yintercept = 1,color = 'black')+
+theme_bw()+
+scale_colour_tableau()+
+  xlim(-10,10)
+plot(p)
+ggsave(file=file.path(plot_folder, paste("CoxPH_Lasso_allcelltypes_prop.pdf")), width=8, height=6, plot=p)
+
+ggforest(coxdf, data =  (fd_test %>% select(-contains("NA"))))
+
+
+#pdf(file=file.path(plot_folder,paste("COX_selectedFactors_glmnet.pdf")))
+#ggforest(coxdf, data = fd_test)
+#dev.off()
+```
+
+
+```{r  Barplot Fibro category proportions, message=FALSE, warning=FALSE, echo=FALSE, fig.width=12, fig.height=6}
+#Barplot proportions ordered by
+pp <-ggplot(tdat,aes(x=freq, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+scale_fill_manual(values=as.vector(glasbey(31)))
+#plot(p)
+##ggsave(plot=p, file=file.path(plot_folder, paste("Barplot_Tcell-Category_over_FILTEREDpatients_ordered-hc.pdf")), width=6, height=6)
+
+#dendrogram coloured by metacluster
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k =9) %>%  as.ggdend()
+pp1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()+
+  coord_flip()+
+  scale_y_reverse()
+
+#n
+pp2<-ggplot(tdat,aes(x=n, y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+scale_fill_manual(values=as.vector(glasbey(31)))+theme(
+        axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())
+#pplt <-pp1+
+ # pp+theme(legend.position = "none", axis.text.y=element_blank(),
+   #     axis.ticks.y=element_blank(),
+    #    axis.title.y=element_blank())+
+ # pp2
+#  p
+
+plot(pplt)
+#ggsave(plot=plt, file=file.path(plot_folder, paste("Barplot_n_hc_CAF_type_proportions.pdf")), width=12, height=6)
+```
+
+```{r,}
+tdat <-as.data.frame(table(all.sce[,all.sce$Patient_ID!="Control"&
+                                          all.sce$DX.name=="Adenocarcinoma"|
+                                          all.sce$DX.name=="Squamous cell carcinoma"]$RoiID,
+                           all.sce[,all.sce$Patient_ID!="Control"&
+                                          all.sce$DX.name=="Adenocarcinoma"|
+                                          all.sce$DX.name=="Squamous cell carcinoma"]$cell_subtype))
+colnames(tdat)<-c("RoiID","Phenotype","n")
+
+da <-dplyr::distinct(data.frame("RoiID"=all.sce$RoiID,
+                                "Area"=all.sce$Area_px_Core,
+                                "Patient_ID"=all.sce$Patient_ID,
+                                "DX.name"=all.sce$DX.name))
+
+  tdat <-merge(tdat,da,by="RoiID")
+  tdat <-tdat[tdat$DX.name=="Adenocarcinoma" | tdat$DX.name=="Squamous cell carcinoma",]
+  tdat <-tdat %>%
+    #dplyr::count(Type, Area,Patient_ID) %>%
+    dplyr::group_by(RoiID) %>%
+    dplyr::mutate(density = n / Area*1000000)
+  range(tdat$density)
+
+  tdat <-tdat %>% 
+  group_by(Patient_ID,Phenotype) %>%
+  summarise(across(-c(RoiID,n,Area,DX.name), sum, na.rm = TRUE))
+  range(tdat$density)
+  #t <-merge(t, clinical.data, by="Patient_ID")
+
+  #remove super high density outliers (total density >10000)  
+ # tdat <-tdat[ !tdat$Patient_ID%in%surv_excl,]
+
+#CAF type distribution over all patients
+tdat_wide <- pivot_wider(tdat,id_cols=c("Patient_ID"),names_from = "Phenotype", values_from ="density",names_prefix = "")
+tdat_m <-as.matrix(tdat_wide)
+rownames(tdat_m)<-tdat_wide$Patient_ID
+
+
+#hc <- hclust(dist(tdat_m[,-1]),"ward.D2")
+#order Patients after clustering results
+tdat_wide$Patient_ID <- factor(tdat_wide$Patient_ID, levels = labels(hc))
+tdat$Patient_ID <- factor(tdat$Patient_ID, levels = labels(hc))
+
+#tdat_wide <-tdat_wide[ !tdat_wide$Patient_ID%in%surv_excl,]
+
+```
+
+## Barplot showing absolute density per patietn coloured by Fibro category together with hierarchical clustering tree coloured by patient metaclusters.
+```{r Barolot Fibro category densities, fig.width=12, fig.height=6, message=FALSE, warning=FALSE, echo=FALSE}
+#barolot (sqrt of density)
+p <-ggplot(tdat,aes(x=sqrt(density),y=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+
+    theme(legend.position="bottom")+scale_fill_manual(values=as.vector(glasbey(31)))
+
+#barplot total density
+#p <-ggplot(tdat,aes(y=density,x=Patient_ID, fill=Phenotype))+geom_bar(stat="identity")+scale_fill_tableau("Tableau 20")+scale_fill_manual(values=as.vector(glasbey(31)))+
+ #   theme(legend.position="bottom")#+theme(legend.position = "none")
+
+
+#coloured dendrogram
+ggdend <-hc%>% as.dendrogram() %>% dendextend::set("branches_k_color", k = 8) %>% as.ggdend()
+p1 <-ggplot(ggdend, labels = F)+scale_x_continuous(expand = c(0, 0))+
+  theme_void()
+pg <-plot_grid(p1,p, align="v", ncol=1)
+pg
+##ggsave(plot=pg, file=file.path(plot_folder, paste("Barplot_Tcell-Category-Densities_HC_ordered_dendro.pdf")), width=12, height=6)
+
+pplt <-#pp1+
+  pp+theme(legend.position = "none", axis.text.y=element_blank(),
+        axis.ticks.y=element_blank(),
+        axis.title.y=element_blank())+
+ # pp2
+  p+theme(axis.ticks.y=element_blank(),
+        axis.title.y=element_blank(),
+        axis.text.y=element_blank()
+        )
+plot(pplt)
+ggsave(plot=pplt, file=file.path(plot_folder, "BARchart_all_cells_prop-and-density.pdf"), width=6, height=8)
+```
diff --git a/Read_LC_NSCLC_TMA_175_A.Rmd b/Read_LC_NSCLC_TMA_175_A.Rmd
new file mode 100644
index 0000000..6bdded6
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_175_A.Rmd
@@ -0,0 +1,272 @@
+---
+title: "Read in CP data, NSCLC TMA 175_A"
+author: "Lena Cords"
+output:
+  html_document:
+    df_print: paged
+  html_notebook: default
+  pdf_document: default
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_A",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_A",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_175_A <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_175_A)<-rownames(panel)
+colnames(sce_175_A)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_175_A) <-cell_meta
+rowData(sce_175_A) <-panel
+metadata(sce_175_A)<-list(graph=g)
+
+sce_175_A$CellID <- paste(sce_175_A$TmaID,sce_175_A$TmaBlock, sce_175_A$acID, sce_175_A$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_175_A, file=file.path(results_folder, paste("sce_175_A_2022.rds")))
+```
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_175_A, "c_counts") <- t(apply(assay(sce_175_A, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_175_A, "c_counts_scaled") <- t(apply(assay(sce_175_A, "c_counts"),
+                                           1, fun.scale))
+assay(sce_175_A, "c_counts_scaled")[assay(sce_175_A, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_175_A, "c_counts_asinh") <- asinh((assay(sce_175_A,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_175_A, "c_counts_asinh_scaled") <- t(apply(assay(sce_175_A, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_175_A, "c_counts_asinh_scaled")[assay(sce_175_A, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_175_A,file=file.path(results_folder, paste("sce_175_A_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_A",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("175",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"175A"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_175A.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_175_A))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_175_A)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_175_A$ImageNumber, sce_175_A$CellNumber))
+
+colData(sce_175_A) <- cur_DF
+rownames(colData(sce_175_A)) <-sce_175_A$CellID
+head(colData(sce_175_A))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_175_A,file=file.path(results_folder, paste("sce_175_A_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_175_B.Rmd b/Read_LC_NSCLC_TMA_175_B.Rmd
new file mode 100644
index 0000000..7893166
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_175_B.Rmd
@@ -0,0 +1,271 @@
+---
+title: "Read in CP data, NSCLC TMA 175_B"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_B",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_B",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_175_B <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_175_B)<-rownames(panel)
+colnames(sce_175_B)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_175_B) <-cell_meta
+rowData(sce_175_B) <-panel
+metadata(sce_175_B)<-list(graph=g)
+
+sce_175_B$CellID <- paste(sce_175_B$TmaID,sce_175_B$TmaBlock, sce_175_B$acID, sce_175_B$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_175_B, file=file.path(results_folder, paste("sce_175_B_2022.rds")))
+```
+
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_175_B, "c_counts") <- t(apply(assay(sce_175_B, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_175_B, "c_counts_scaled") <- t(apply(assay(sce_175_B, "c_counts"),
+                                           1, fun.scale))
+assay(sce_175_B, "c_counts_scaled")[assay(sce_175_B, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_175_B, "c_counts_asinh") <- asinh((assay(sce_175_B,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_175_B, "c_counts_asinh_scaled") <- t(apply(assay(sce_175_B, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_175_B, "c_counts_asinh_scaled")[assay(sce_175_B, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_175_B,file=file.path(results_folder, paste("sce_175_B_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_B",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-"B"
+ac_sub$ROI <-paste(ac_sub$TMA, ac_sub$ROI_xy, sep="")
+
+ac_sub$RoiID <- paste("175",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"175B"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_175B.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_175_B))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_175_B)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_175_B$ImageNumber, sce_175_B$CellNumber))
+
+colData(sce_175_B) <- cur_DF
+rownames(colData(sce_175_B)) <-sce_175_B$CellID
+head(colData(sce_175_B))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_175_B,file=file.path(results_folder, paste("sce_175_B_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_175_C.Rmd b/Read_LC_NSCLC_TMA_175_C.Rmd
new file mode 100644
index 0000000..40856fc
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_175_C.Rmd
@@ -0,0 +1,272 @@
+---
+title: "Read in CP data, NSCLC TMA 175_C"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_C",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_C",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_175_C <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_175_C)<-rownames(panel)
+colnames(sce_175_C)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_175_C) <-cell_meta
+rowData(sce_175_C) <-panel
+metadata(sce_175_C)<-list(graph=g)
+
+sce_175_C$CellID <- paste(sce_175_C$TmaID,sce_175_C$TmaBlock, sce_175_C$acID, sce_175_C$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_175_C, file=file.path(results_folder, paste("sce_175_C_2022.rds")))
+sce_175_C
+```
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_175_C, "c_counts") <- t(apply(assay(sce_175_C, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_175_C, "c_counts_scaled") <- t(apply(assay(sce_175_C, "c_counts"),
+                                           1, fun.scale))
+assay(sce_175_C, "c_counts_scaled")[assay(sce_175_C, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_175_C, "c_counts_asinh") <- asinh((assay(sce_175_C,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_175_C, "c_counts_asinh_scaled") <- t(apply(assay(sce_175_C, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_175_C, "c_counts_asinh_scaled")[assay(sce_175_C, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_175_C,file=file.path(results_folder, paste("sce_175_C_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_175_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("175",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+
+ac_sub$TMA <-"175C"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_175C.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_175_C))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_175_C)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_175_C$ImageNumber, sce_175_C$CellNumber))
+
+colData(sce_175_C) <- cur_DF
+rownames(colData(sce_175_C)) <-sce_175_C$CellID
+head(colData(sce_175_C))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_175_C,file=file.path(results_folder, paste("sce_175_C_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_176A.Rmd b/Read_LC_NSCLC_TMA_176A.Rmd
new file mode 100644
index 0000000..4f8e7be
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_176A.Rmd
@@ -0,0 +1,276 @@
+---
+title: "Read in CP data, NSCLC TMA 176_A"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_A",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_A",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+cell_meta$TmaBlock <- "A"
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_176_A <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_176_A)<-rownames(panel)
+colnames(sce_176_A)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_176_A) <-cell_meta
+rowData(sce_176_A) <-panel
+metadata(sce_176_A)<-list(graph=g)
+
+sce_176_A$CellID <- paste(sce_176_A$TmaID,sce_176_A$TmaBlock, sce_176_A$acID, sce_176_A$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_176_A, file=file.path(results_folder, paste("sce_176_A_2022.rds")))
+sce_176_A
+```
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_176_A, "c_counts") <- t(apply(assay(sce_176_A, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_176_A, "c_counts_scaled") <- t(apply(assay(sce_176_A, "c_counts"),
+                                           1, fun.scale))
+assay(sce_176_A, "c_counts_scaled")[assay(sce_176_A, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_176_A, "c_counts_asinh") <- asinh((assay(sce_176_A,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_176_A, "c_counts_asinh_scaled") <- t(apply(assay(sce_176_A, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_176_A, "c_counts_asinh_scaled")[assay(sce_176_A, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_176_A,file=file.path(results_folder, paste("sce_176_A_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_A",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("176",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"176A"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_176A.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_176_A))
+sce_176_A$TmaBlock <-"A"
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_176_A)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_176_A$ImageNumber, sce_176_A$CellNumber))
+
+colData(sce_176_A) <- cur_DF
+rownames(colData(sce_176_A)) <-sce_176_A$CellID
+head(colData(sce_176_A))
+
+ci_o <-sce_176_A$CellID
+
+sce_176_A$CellID <-paste(sce_176_A$TMA,sce_176_A$TmaBlock, sce_176_A$acID, sce_176_A$CellNumber, sep="_")
+rownames(colData(sce_176_A)) <-sce_176_A$CellID
+
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_176_A,file=file.path(results_folder, paste("sce_176_A_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_176B.Rmd b/Read_LC_NSCLC_TMA_176B.Rmd
new file mode 100644
index 0000000..bac2d69
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_176B.Rmd
@@ -0,0 +1,268 @@
+---
+title: "Read in CP data, NSCLC TMA 176_B"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_B",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_B",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_176_B <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_176_B)<-rownames(panel)
+colnames(sce_176_B)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_176_B) <-cell_meta
+rowData(sce_176_B) <-panel
+metadata(sce_176_B)<-list(graph=g)
+
+sce_176_B$CellID <- paste(sce_176_B$TmaID,sce_176_B$TmaBlock, sce_176_B$acID, sce_176_B$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_176_B, file=file.path(results_folder, paste("sce_176_B_2022.rds")))
+sce_176_B
+```
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_176_B, "c_counts") <- t(apply(assay(sce_176_B, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_176_B, "c_counts_scaled") <- t(apply(assay(sce_176_B, "c_counts"),
+                                           1, fun.scale))
+assay(sce_176_B, "c_counts_scaled")[assay(sce_176_B, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_176_B, "c_counts_asinh") <- asinh((assay(sce_176_B,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_176_B, "c_counts_asinh_scaled") <- t(apply(assay(sce_176_B, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_176_B, "c_counts_asinh_scaled")[assay(sce_176_B, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_176_B,file=file.path(results_folder, paste("sce_176_B_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_B",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("176",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"176B"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_176B.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_176_B))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_176_B)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_176_B$ImageNumber, sce_176_B$CellNumber))
+
+colData(sce_176_B) <- cur_DF
+rownames(colData(sce_176_B)) <-sce_176_B$CellID
+head(colData(sce_176_B))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_176_B,file=file.path(results_folder, paste("sce_176_B_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_176C.Rmd b/Read_LC_NSCLC_TMA_176C.Rmd
new file mode 100644
index 0000000..ed023e9
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_176C.Rmd
@@ -0,0 +1,271 @@
+---
+title: "Read in CP data, NSCLC TMA 176_C"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_C",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_C",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_176_C <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_176_C)<-rownames(panel)
+colnames(sce_176_C)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_176_C) <-cell_meta
+rowData(sce_176_C) <-panel
+metadata(sce_176_C)<-list(graph=g)
+
+sce_176_C$CellID <- paste(sce_176_C$TmaID,sce_176_C$TmaBlock, sce_176_C$acID, sce_176_C$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_176_C, file=file.path(results_folder, paste("sce_176_C_2022.rds")))
+sce_176_C
+
+sce_176_C <-readRDS(file=file.path(results_folder, paste("sce_176_C_2022.rds")))
+```
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_176_C, "c_counts") <- t(apply(assay(sce_176_C, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_176_C, "c_counts_scaled") <- t(apply(assay(sce_176_C, "c_counts"),
+                                           1, fun.scale))
+assay(sce_176_C, "c_counts_scaled")[assay(sce_176_C, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_176_C, "c_counts_asinh") <- asinh((assay(sce_176_C,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_176_C, "c_counts_asinh_scaled") <- t(apply(assay(sce_176_C, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_176_C, "c_counts_asinh_scaled")[assay(sce_176_C, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_176_C,file=file.path(results_folder, paste("sce_176_C_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_176_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("176",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"176C"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_176C.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_176_C))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_176_C)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_176_C$ImageNumber, sce_176_C$CellNumber))
+
+colData(sce_176_C) <- cur_DF
+rownames(colData(sce_176_C)) <-sce_176_C$CellID
+head(colData(sce_176_C))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_176_C,file=file.path(results_folder, paste("sce_176_C_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_178_A.Rmd b/Read_LC_NSCLC_TMA_178_A.Rmd
new file mode 100644
index 0000000..a76a65b
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_178_A.Rmd
@@ -0,0 +1,268 @@
+---
+title: "Read in CP data, NSCLC TMA 178_A"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_A",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_A",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_178_A <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_178_A)<-rownames(panel)
+colnames(sce_178_A)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_178_A) <-cell_meta
+rowData(sce_178_A) <-panel
+metadata(sce_178_A)<-list(graph=g)
+
+sce_178_A$CellID <- paste(sce_178_A$TmaID,sce_178_A$TmaBlock, sce_178_A$acID, sce_178_A$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_178_A, file=file.path(results_folder, paste("sce_178_A_2022.rds")))
+sce_178_A
+```
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_178_A, "c_counts") <- t(apply(assay(sce_178_A, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_178_A, "c_counts_scaled") <- t(apply(assay(sce_178_A, "c_counts"),
+                                           1, fun.scale))
+assay(sce_178_A, "c_counts_scaled")[assay(sce_178_A, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_178_A, "c_counts_asinh") <- asinh((assay(sce_178_A,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_178_A, "c_counts_asinh_scaled") <- t(apply(assay(sce_178_A, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_178_A, "c_counts_asinh_scaled")[assay(sce_178_A, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_178_A,file=file.path(results_folder, paste("sce_178_A_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_A",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z]_)(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+ac_sub$ROI <-paste("A",ac_sub$ROI_xy, sep="")
+ac_sub$RoiID <- paste("178",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"178A"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_178A.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_178_A))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_178_A)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_178_A$ImageNumber, sce_178_A$CellNumber))
+
+colData(sce_178_A) <- cur_DF
+rownames(colData(sce_178_A)) <-sce_178_A$CellID
+head(colData(sce_178_A))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_178_A,file=file.path(results_folder, paste("sce_178_A_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_178_B.Rmd b/Read_LC_NSCLC_TMA_178_B.Rmd
new file mode 100644
index 0000000..6bfb372
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_178_B.Rmd
@@ -0,0 +1,268 @@
+---
+title: "Read in CP data, NSCLC TMA 178_B"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_B_2",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_B_2",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_178_B <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_178_B)<-rownames(panel)
+colnames(sce_178_B)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_178_B) <-cell_meta
+rowData(sce_178_B) <-panel
+metadata(sce_178_B)<-list(graph=g)
+
+sce_178_B$CellID <- paste(sce_178_B$TmaID,sce_178_B$TmaBlock, sce_178_B$acID, sce_178_B$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_178_B, file=file.path(results_folder, paste("sce_178_B_2022.rds")))
+sce_178_B
+```
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_178_B, "c_counts") <- t(apply(assay(sce_178_B, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_178_B, "c_counts_scaled") <- t(apply(assay(sce_178_B, "c_counts"),
+                                           1, fun.scale))
+assay(sce_178_B, "c_counts_scaled")[assay(sce_178_B, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_178_B, "c_counts_asinh") <- asinh((assay(sce_178_B,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_178_B, "c_counts_asinh_scaled") <- t(apply(assay(sce_178_B, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_178_B, "c_counts_asinh_scaled")[assay(sce_178_B, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_178_B,file=file.path(results_folder, paste("sce_178_B_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_B_2",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z]_)(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+ac_sub$RoiID <- paste("178_B",ac_sub$ROI_xy, sep="")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"178B"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_178B.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_178_B))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_178_B)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_178_B$ImageNumber, sce_178_B$CellNumber))
+
+colData(sce_178_B) <- cur_DF
+rownames(colData(sce_178_B)) <-sce_178_B$CellID
+head(colData(sce_178_B))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_178_B,file=file.path(results_folder, paste("sce_178_B_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_178_C.Rmd b/Read_LC_NSCLC_TMA_178_C.Rmd
new file mode 100644
index 0000000..7aa30d7
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_178_C.Rmd
@@ -0,0 +1,269 @@
+---
+title: "Read in CP data, NSCLC TMA 178_C"
+author: "Lena Cords"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_C",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_C",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_178_C <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_178_C)<-rownames(panel)
+colnames(sce_178_C)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_178_C) <-cell_meta
+rowData(sce_178_C) <-panel
+metadata(sce_178_C)<-list(graph=g)
+sce_178_C$CellID <- paste(sce_178_C$TmaID,sce_178_C$TmaBlock, sce_178_C$acID, sce_178_C$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_178_C, file=file.path(results_folder, paste("sce_178_C_2022.rds")))
+sce_178_C
+```
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_178_C, "c_counts") <- t(apply(assay(sce_178_C, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_178_C, "c_counts_scaled") <- t(apply(assay(sce_178_C, "c_counts"),
+                                           1, fun.scale))
+assay(sce_178_C, "c_counts_scaled")[assay(sce_178_C, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_178_C, "c_counts_asinh") <- asinh((assay(sce_178_C,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_178_C, "c_counts_asinh_scaled") <- t(apply(assay(sce_178_C, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_178_C, "c_counts_asinh_scaled")[assay(sce_178_C, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_178_C,file=file.path(results_folder, paste("sce_178_C_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z]_)(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+ac_sub$RoiID <- paste("178",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+ac_sub
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"178C"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_178C.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_178_C))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_178_C)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_178_C$ImageNumber, sce_178_C$CellNumber))
+
+colData(sce_178_C) <- cur_DF
+rownames(colData(sce_178_C)) <-sce_178_C$CellID
+head(colData(sce_178_C))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_178_C,file=file.path(results_folder, paste("sce_178_C_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_86_A.Rmd b/Read_LC_NSCLC_TMA_86_A.Rmd
new file mode 100644
index 0000000..8b3a6fb
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_86_A.Rmd
@@ -0,0 +1,263 @@
+---
+title: "Read in CP data, NSCLC TMA"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_A",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_A",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+#range(assay(all.cells[, all.cells$TMA=="86_A"], "counts"))
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+head(rownames(cell_meta))
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_86A <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_86A)<-rownames(panel)
+colnames(sce_86A)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_86A) <-cell_meta
+rowData(sce_86A) <-panel
+metadata(sce_86A)<-list(graph=g)
+
+sce_86A$CellID <- paste(sce_86A$TmaID,sce_86A$TmaBlock, sce_86A$acID, sce_86A$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_86A, file=file.path(results_folder, paste("sce_86_A_2022.rds")))
+```
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_86_A, "c_counts") <- t(apply(assay(sce_86_A, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_86_A, "c_counts_scaled") <- t(apply(assay(sce_86_A, "c_counts"),
+                                           1, fun.scale))
+assay(sce_86_A, "c_counts_scaled")[assay(sce_86_A, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_86_A, "c_counts_asinh") <- asinh((assay(sce_86_A,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_86_A, "c_counts_asinh_scaled") <- t(apply(assay(sce_86_A, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_86_A, "c_counts_asinh_scaled")[assay(sce_86_A, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_86_A,file=file.path(results_folder, paste("sce_86_A_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_A",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+ac_sub$ROI[duplicated(ac_sub$ROI)]
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("86",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+length(unique(ac_sub$RoiID))
+length(unique(ac_sub$acID))
+ac_sub$TMA <-"86A"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_86A.csv"))
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_86_A))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_86_A)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_86_A$ImageNumber, sce_86_A$CellNumber))
+
+colData(sce_86_A) <- cur_DF
+rownames(colData(sce_86_A)) <-sce_86_A$CellID
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_86A.csv"))
+saveRDS(sce_86_A,file=file.path(results_folder, paste("sce_86_A_counts_clinical-data_RAW",".rds",sep="")))
+```
+
diff --git a/Read_LC_NSCLC_TMA_86_B.Rmd b/Read_LC_NSCLC_TMA_86_B.Rmd
new file mode 100644
index 0000000..ef44693
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_86_B.Rmd
@@ -0,0 +1,267 @@
+---
+title: "Read in CP data, NSCLC TMA 86_B"
+author: "Lena Cords"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_B",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_B",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_86_B <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_86_B)<-rownames(panel)
+colnames(sce_86_B)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_86_B) <-cell_meta
+rowData(sce_86_B) <-panel
+metadata(sce_86_B)<-list(graph=g)
+sce_86_B$CellID <- paste(sce_86_B$TmaID,sce_86_B$TmaBlock, sce_86_B$acID, sce_86_B$CellNumber, sep='_')
+#save SCE object
+saveRDS(sce_86_B, file=file.path(results_folder, paste("sce_86_B_2022.rds")))
+sce_86_B
+```
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_86_B, "c_counts") <- t(apply(assay(sce_86_B, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_86_B, "c_counts_scaled") <- t(apply(assay(sce_86_B, "c_counts"),
+                                           1, fun.scale))
+assay(sce_86_B, "c_counts_scaled")[assay(sce_86_B, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_86_B, "c_counts_asinh") <- asinh((assay(sce_86_B,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_86_B, "c_counts_asinh_scaled") <- t(apply(assay(sce_86_B, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_86_B, "c_counts_asinh_scaled")[assay(sce_86_B, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_86_B,file=file.path(results_folder, paste("sce_86_B_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_B",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+ac_sub$ROI[duplicated(ac_sub$ROI)]
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("86",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+
+length(unique(ac_sub$acID))
+length(unique(ac_sub$RoiID))
+
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"86B"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_86B.csv"))
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_86_B))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_86_B)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_86_B$ImageNumber, sce_86_B$CellNumber))
+
+colData(sce_86_B) <- cur_DF
+rownames(colData(sce_86_B)) <-sce_86_B$CellID
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_86_B,file=file.path(results_folder, paste("sce_86_B_counts_clinical-data_RAW",".rds",sep="")))
+```
diff --git a/Read_LC_NSCLC_TMA_86_C.Rmd b/Read_LC_NSCLC_TMA_86_C.Rmd
new file mode 100644
index 0000000..22b24f8
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_86_C.Rmd
@@ -0,0 +1,274 @@
+---
+title: "Read in CP data, NSCLC TMA 86_C"
+author: "Lena Cords"
+output:
+  html_document:
+    df_print: paged
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_C",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_C",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_86_C <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_86_C)<-rownames(panel)
+colnames(sce_86_C)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_86_C) <-cell_meta
+rowData(sce_86_C) <-panel
+metadata(sce_86_C)<-list(graph=g)
+
+sce_86_C$CellID <- paste(sce_86_C$TmaID,sce_86_C$TmaBlock, sce_86_C$acID, sce_86_C$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_86_C, file=file.path(results_folder, paste("sce_86_C_2022.rds")))
+sce_86_C
+```
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_86_C, "c_counts") <- t(apply(assay(sce_86_C, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_86_C, "c_counts_scaled") <- t(apply(assay(sce_86_C, "c_counts"),
+                                           1, fun.scale))
+assay(sce_86_C, "c_counts_scaled")[assay(sce_86_C, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_86_C, "c_counts_asinh") <- asinh((assay(sce_86_C,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_86_C, "c_counts_asinh_scaled") <- t(apply(assay(sce_86_C, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_86_C, "c_counts_asinh_scaled")[assay(sce_86_C, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_86_C,file=file.path(results_folder, paste("sce_86_C_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_86_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+ac_sub$ROI[duplicated(ac_sub$ROI)]
+ac_sub$ROI[ac_sub$acID==24] <-"C2,10"
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("86",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+
+ac_sub$TMA <-"86C"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_86C.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_86_C))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_86_C)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_86_C$ImageNumber, sce_86_C$CellNumber))
+
+colData(sce_86_C) <- cur_DF
+rownames(colData(sce_86_C)) <-sce_86_C$CellID
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_86_C,file=file.path(results_folder, paste("sce_86_C_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_87_A.Rmd b/Read_LC_NSCLC_TMA_87_A.Rmd
new file mode 100644
index 0000000..2a7af16
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_87_A.Rmd
@@ -0,0 +1,296 @@
+---
+title: "Read in CP data, NSCLC TMA 87_A"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_A",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_A",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_87_A <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_87_A)<-rownames(panel)
+colnames(sce_87_A)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_87_A) <-cell_meta
+rowData(sce_87_A) <-panel
+metadata(sce_87_A)<-list(graph=g)
+
+sce_87_A$CellID <- paste(sce_87_A$TmaID,sce_87_A$TmaBlock, sce_87_A$acID, sce_87_A$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_87_A, file=file.path(results_folder, paste("sce_87_A_2022.rds")))
+```
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_87_A, "c_counts") <- t(apply(assay(sce_87_A, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_87_A, "c_counts_scaled") <- t(apply(assay(sce_87_A, "c_counts"),
+                                           1, fun.scale))
+assay(sce_87_A, "c_counts_scaled")[assay(sce_87_A, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_87_A, "c_counts_asinh") <- asinh((assay(sce_87_A,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_87_A, "c_counts_asinh_scaled") <- t(apply(assay(sce_87_A, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_87_A, "c_counts_asinh_scaled")[assay(sce_87_A, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_87_A,file=file.path(results_folder, paste("sce_87_A_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_A",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+ac_sub$ROI[duplicated(ac_sub$ROI)]
+ac_sub$ROI[ac_sub$acID==138] <-"A3,15"
+ac_sub$ROI[ac_sub$acID==100] <-"A6,15"
+
+ac_sub <-ac_sub[ac_sub$acID!=49,] 
+
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+#ac_sub$TmaImage_ID <- paste("87",ac_sub$TMA,ac_sub$acID, sep="_")
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("87",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+length(unique(ac_sub$acID))
+length(unique(ac_sub$RoiID))
+
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"87A"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_87A.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_87_A))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+length(unique(ac_clinical$RoiID))
+
+#cur_DF <- as_tibble(colData(sce_87_A)) %>% left_join(ac_sub, by = "acID") %>% DataFrame()
+
+cur_DF <- as_tibble(colData(sce_87_A)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_87_A$ImageNumber, sce_87_A$CellNumber))
+
+colData(sce_87_A) <- cur_DF
+rownames(colData(sce_87_A)) <-sce_87_A$CellID
+
+
+cur_DF <- as_tibble(colData(sce_87_A)) %>% left_join(clinical.data, by = c("TmaID","TmaBlock","ROI_xy")) %>% DataFrame()
+
+
+head(colData(sce_87_A))
+head(clinical.data)
+
+clinical.data$TmaID <-clinical.data$TMA
+clinical.data$TMA <-NULL
+clinical.data <- clinical.data %>%
+            separate(ROI, 
+              into = c("TmaBlock", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_87_A,file=file.path(results_folder, paste("sce_87_A_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_87_B.Rmd b/Read_LC_NSCLC_TMA_87_B.Rmd
new file mode 100644
index 0000000..466b33a
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_87_B.Rmd
@@ -0,0 +1,268 @@
+---
+title: "Read in CP data, NSCLC TMA 87_B"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_B",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_B",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_87_B <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_87_B)<-rownames(panel)
+colnames(sce_87_B)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_87_B) <-cell_meta
+rowData(sce_87_B) <-panel
+metadata(sce_87_B)<-list(graph=g)
+
+sce_87_B$CellID <- paste(sce_87_B$TmaID,sce_87_B$TmaBlock, sce_87_B$acID, sce_87_B$CellNumber, sep='_')
+#save SCE object
+saveRDS(sce_87_B, file=file.path(results_folder, paste("sce_87_B_2022.rds")))
+```
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_87_B, "c_counts") <- t(apply(assay(sce_87_B, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_87_B, "c_counts_scaled") <- t(apply(assay(sce_87_B, "c_counts"),
+                                           1, fun.scale))
+assay(sce_87_B, "c_counts_scaled")[assay(sce_87_B, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_87_B, "c_counts_asinh") <- asinh((assay(sce_87_B,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_87_B, "c_counts_asinh_scaled") <- t(apply(assay(sce_87_B, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_87_B, "c_counts_asinh_scaled")[assay(sce_87_B, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_87_B,file=file.path(results_folder, paste("sce_87_B_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_B",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("87",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+length(unique(ac_sub$RoiID))
+length(unique(ac_sub$acID))
+
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"87B"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_87B.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_87_B))
+
+ac_clinical_B <- left_join(ac_sub, clinical.data, by="RoiID")
+length(unique(ac_clinical$RoiID))
+cur_DF <- as_tibble(colData(sce_87_B)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_87_B$ImageNumber, sce_87_B$CellNumber))
+
+colData(sce_87_B) <- cur_DF
+rownames(colData(sce_87_B)) <-sce_87_B$CellID
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_87_B,file=file.path(results_folder, paste("sce_87_B_counts_clinical-data_RAW",".rds",sep="")))
+```
diff --git a/Read_LC_NSCLC_TMA_87_C.Rmd b/Read_LC_NSCLC_TMA_87_C.Rmd
new file mode 100644
index 0000000..5feefe1
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_87_C.Rmd
@@ -0,0 +1,271 @@
+---
+title: "Read in CP data, NSCLC TMA 87_C"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_C",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_C",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_87_C <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_87_C)<-rownames(panel)
+colnames(sce_87_C)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_87_C) <-cell_meta
+rowData(sce_87_C) <-panel
+metadata(sce_87_C)<-list(graph=g)
+
+sce_87_C$CellID <- paste(sce_87_C$TmaID,sce_87_C$TmaBlock, sce_87_C$acID, sce_87_C$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_87_C, file=file.path(results_folder, paste("sce_87_C_2022.rds")))
+```
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_87_C, "c_counts") <- t(apply(assay(sce_87_C, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_87_C, "c_counts_scaled") <- t(apply(assay(sce_87_C, "c_counts"),
+                                           1, fun.scale))
+assay(sce_87_C, "c_counts_scaled")[assay(sce_87_C, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_87_C, "c_counts_asinh") <- asinh((assay(sce_87_C,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_87_C, "c_counts_asinh_scaled") <- t(apply(assay(sce_87_C, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_87_C, "c_counts_asinh_scaled")[assay(sce_87_C, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_87_C,file=file.path(results_folder, paste("sce_87_C_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_87_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("87",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+length(unique(ac_sub$RoiID))
+
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"87C"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_87C.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_87_C))
+
+ac_clinical_C <- left_join(ac_sub, clinical.data, by="RoiID")
+length(unique(ac_clinical$RoiID))
+cur_DF <- as_tibble(colData(sce_87_C)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_87_C$ImageNumber, sce_87_C$CellNumber))
+
+colData(sce_87_C) <- cur_DF
+rownames(colData(sce_87_C)) <-sce_87_C$CellID
+
+head(colData(sce_87_C))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_87_C,file=file.path(results_folder, paste("sce_87_C_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_88_A.Rmd b/Read_LC_NSCLC_TMA_88_A.Rmd
new file mode 100644
index 0000000..2e0c6ff
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_88_A.Rmd
@@ -0,0 +1,285 @@
+---
+title: "Read in CP data, NSCLC TMA 88_A"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_A",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_A",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_88_A <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_88_A)<-rownames(panel)
+colnames(sce_88_A)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_88_A) <-cell_meta
+rowData(sce_88_A) <-panel
+metadata(sce_88_A)<-list(graph=g)
+
+sce_88_A$CellID <- paste(sce_88_A$TmaID,sce_88_A$TmaBlock, sce_88_A$acID, sce_88_A$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_88_A, file=file.path(results_folder, paste("sce_88_A_2022.rds")))
+
+#remove acIDs 1-4 (tria ROIs 1-4) =248 cells. All cells=339006 , with 4 images removed: 338758
+sce_88_A <-sce_88_A[, sce_88_A$acID!=1&sce_88_A$acID!=2&sce_88_A$acID!=3&sce_88_A$acID!=4]
+saveRDS(sce_88_A, file=file.path(results_folder, paste("sce_88_A_2022_ac1-4rm.rds")))
+
+```
+
+
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_88_A, "c_counts") <- t(apply(assay(sce_88_A, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_88_A, "c_counts_scaled") <- t(apply(assay(sce_88_A, "c_counts"),
+                                           1, fun.scale))
+assay(sce_88_A, "c_counts_scaled")[assay(sce_88_A, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_88_A, "c_counts_asinh") <- asinh((assay(sce_88_A,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_88_A, "c_counts_asinh_scaled") <- t(apply(assay(sce_88_A, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_88_A, "c_counts_asinh_scaled")[assay(sce_88_A, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_88_A,file=file.path(results_folder, paste("sce_88_A_counts_RAW_ac1-4rm",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_A",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub$ROI[duplicated(ac_sub$ROI)]
+
+ac_sub$ROI[ac_sub$acID==133] <-"A4,2"
+ac_sub$ROI[ac_sub$acID==134] <-"A5,2"
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("88",ac_sub$ROI, sep="_")
+#ac_sub$ROI <-NULL
+head(ac_sub)
+
+#remove empty ROIs
+ac_sub <- ac_sub[-c(1, 2, 3, 4), ]
+
+
+length(unique(ac_sub$acID))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"88A"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+ac_sub$ROI <-NULL
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_88A.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_88_A))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_88_A)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_88_A$ImageNumber, sce_88_A$CellNumber))
+
+colData(sce_88_A) <- cur_DF
+rownames(colData(sce_88_A)) <-sce_88_A$CellID
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_88_A,file=file.path(results_folder, paste("sce_88_A_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_88_B.Rmd b/Read_LC_NSCLC_TMA_88_B.Rmd
new file mode 100644
index 0000000..a42ad02
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_88_B.Rmd
@@ -0,0 +1,277 @@
+---
+title: "Read in CP data, NSCLC TMA 88_B"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_B",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_B",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_88_B <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_88_B)<-rownames(panel)
+colnames(sce_88_B)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_88_B) <-cell_meta
+rowData(sce_88_B) <-panel
+metadata(sce_88_B)<-list(graph=g)
+
+sce_88_B$CellID <- paste(sce_88_B$TmaID,sce_88_B$TmaBlock, sce_88_B$acID, sce_88_B$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_88_B, file=file.path(results_folder, paste("sce_88_B_2022.rds")))
+```
+
+
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_88_B, "c_counts") <- t(apply(assay(sce_88_B, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_88_B, "c_counts_scaled") <- t(apply(assay(sce_88_B, "c_counts"),
+                                           1, fun.scale))
+assay(sce_88_B, "c_counts_scaled")[assay(sce_88_B, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_88_B, "c_counts_asinh") <- asinh((assay(sce_88_B,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_88_B, "c_counts_asinh_scaled") <- t(apply(assay(sce_88_B, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_88_B, "c_counts_asinh_scaled")[assay(sce_88_B, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_88_B,file=file.path(results_folder, paste("sce_88_B_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_B",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+
+ac_sub$ROI[duplicated(ac_sub$ROI)]
+
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("88",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+length(unique(ac_sub$acID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"88B"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_88B.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_88_B))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_88_B)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_88_B$ImageNumber, sce_88_B$CellNumber))
+
+colData(sce_88_B) <- cur_DF
+rownames(colData(sce_88_B)) <-sce_88_B$CellID
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_88_B,file=file.path(results_folder, paste("sce_88_B_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/Read_LC_NSCLC_TMA_88_C.Rmd b/Read_LC_NSCLC_TMA_88_C.Rmd
new file mode 100644
index 0000000..66ce451
--- /dev/null
+++ b/Read_LC_NSCLC_TMA_88_C.Rmd
@@ -0,0 +1,272 @@
+---
+title: "Read in CP data, NSCLC TMA 88_C"
+author: "Lena Cords"
+output: html_notebook
+---
+
+```{r, import libraries}
+library(igraph)
+library(SingleCellExperiment)
+library(S4Vectors)
+library(stringr)
+library(DT)
+```
+
+
+
+
+```{r, Set wd}
+#set working directory 
+wd <- dirname(dirname(dirname(getwd())))
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_C",'cpinp')
+panel_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_C",'cpout')
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","RAW"))
+```
+
+
+```{r, Data import}
+#Read in: Cells, Image, Object and Panel
+cells <- read.csv(file=file.path(data_folder,paste('Cells.csv')))
+image <- read.csv(file=file.path(data_folder, paste("Image.csv")))
+object_relationship <- read.csv(file=file.path(data_folder, paste("distance20_Object relationships.csv")))
+panel <- read.csv(file=file.path(panel_folder, paste('panel.csv')))
+```
+
+```{r, What has been measured for each cell}
+#number of cells
+nrow(cells)
+unique(sub("_c[0-9]*$", "", colnames(cells)))
+
+#Select only the measurements you want to work on, based on the measurements displayed above Intensity_MeanIntensityCellsCorr_ColdStack
+counts <- cells[, grepl("Intensity_MeanIntensityCorrected_FullStackFiltered", colnames(cells))]
+```
+
+```{r Scale counts according to CP scaling factor}
+#when using 16bit data this should be 2^16 =65536
+scaling_factor <- unique(image$Scaling_FullStack)
+#Scale up the counts according to scaling factor from CellProfiler
+counts <- counts * scaling_factor
+
+#make sure the counts have actually been multiplied by the scaling factor
+range(counts)
+```
+
+
+```{r Add cell metadata as S4 Object for the SCE}
+#the S4Vector is needed for the SCE and a dataframe that stores the metadata per category
+cell_meta <- DataFrame(CellNumber = cells$ObjectNumber,
+                      Center_X=cells$Location_Center_X,
+                      Center_Y=cells$Location_Center_Y,
+                      Area = cells$AreaShape_Area,
+                      MajorAxisLength=cells$AreaShape_MajorAxisLength,
+                      MinorAxisLength=cells$AreaShape_MinorAxisLength,
+                      Compartment=cells$Intensity_MeanIntensity_TumourMask_Distance_B100)
+cell_meta$ImageNumber <- cells$ImageNumber
+
+#Create DataFrame for image meta data. Then match image and cell metadata by ImageNumber
+
+image_meta <- DataFrame(Area_Description = image$Metadata_description)
+
+
+#run this to get the image number for the meta data
+image_meta$ImageNumber <- image$ImageNumber
+
+
+#merge cell meta and image metadata by Image Number here
+  #cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+cell_meta <- merge(cell_meta, image_meta, by="ImageNumber")
+```
+
+
+```{r, Add cell metadata as S4 Object for the SCE from file name}
+#get info such as Batch, sample, ROI from original .mcd filename and add it to the cell metadata (colDat)
+
+#this is a good overview to get the different parts of metadata from the file name
+ac_info <- str_split(image$FileName_FullStack, '_', simplify=TRUE)
+image$Metadata_Description
+head(ac_info)
+cell_meta$BatchID <- ac_info[cell_meta$ImageNumber,1]
+cell_meta$Panel <- ac_info[cell_meta$ImageNumber,3]
+cell_meta$TmaID <- ac_info[cell_meta$ImageNumber,5]
+cell_meta$TmaBlock <- ac_info[cell_meta$ImageNumber,6]
+#cell_meta$ROI <- image$Metadata_roiid[cell_meta$ImageNumber]
+cell_meta$acID <- image$Metadata_acid[cell_meta$ImageNumber]
+#colnames(cells)
+```
+
+
+
+
+```{r Set rownames for SCE}
+#Should usually be cell id. However, in the cells.csv file the object number is counted per imgage, so to get unique cell IDs, they must be establishes by combining ObjectNumber, ImageNumber/ROINumber and SampleID
+
+rownames(cell_meta) <-paste(cell_meta$TmaID,cell_meta$TmaBlock, cell_meta$acID, cell_meta$CellNumber, sep='_')
+```
+
+
+```{r, Match channel names with metal names in exact order}
+library(DT)
+#loads panel
+DT::datatable(panel)
+
+#exactl order of channels can be extracted from the _full.csv files
+#header = FALSE is important here as otherwise it'll take the first metal as a header
+channel_metal <- read.csv(file.path(data_folder, paste("full_channelmeta.csv")), header=FALSE)
+
+#this reorders the panel according to the correct metal lsit
+panel <- panel[match(channel_metal[,1], panel$Metal.Tag),]
+#panel <- as.data.table(panel)
+#use Target from panel as the correct channel name to be displayed
+rownames(panel) <-panel$Clean_Target
+
+#channels in cell file are not ordered correctly (starting with 1,11,12 ... 2,21,22 ... 3,31,32...), hence we need to reorder them
+channelNumber <- as.numeric(sub("^.*_c", "", colnames(counts)))
+
+# Order counts based on channel number
+counts <- counts[,order(channelNumber, decreasing = FALSE)] 
+range(counts)
+```
+
+```{r build igraph object}
+library(igraph)
+# Construct neighbour data.frame
+# First in the ImageNumber_ObjectNumber format
+cur_df <- data.frame(CellID_1 = paste0(object_relationship$First.Image.Number, 
+                                       "_", object_relationship$First.Object.Number),
+                     CellID_2 = paste0(object_relationship$Second.Image.Number, 
+                                       "_", object_relationship$Second.Object.Number))
+# Create simple cell IDs
+cellID <- paste0(cell_meta$ImageNumber, "_", cell_meta$CellNumber)
+
+# Change cell IDs
+cur_df$CellID_1 <- rownames(cell_meta)[match(cur_df$CellID_1, cellID)]
+cur_df$CellID_2 <- rownames(cell_meta)[match(cur_df$CellID_2, cellID)]
+
+# Build graph
+g <- graph_from_data_frame(cur_df)
+g
+```
+
+
+```{r Create SingleCellExperiment}
+library(SingleCellExperiment)
+
+sce_88_C <-SingleCellExperiment(assays = list(counts=t(counts)))
+
+#marker name as row name cellID as colname
+rownames(sce_88_C)<-rownames(panel)
+colnames(sce_88_C)<-rownames(cell_meta)
+
+#store metadata in colDat and everything accordingly
+
+colData(sce_88_C) <-cell_meta
+rowData(sce_88_C) <-panel
+metadata(sce_88_C)<-list(graph=g)
+
+sce_88_C$CellID <- paste(sce_88_C$TmaID,sce_88_C$TmaBlock, sce_88_C$acID, sce_88_C$CellNumber, sep='_')
+
+#save SCE object
+saveRDS(sce_88_C, file=file.path(results_folder, paste("sce_88_C_2022.rds")))
+```
+
+
+
+#add counts and clinical data
+
+```{r, add transformed counts}
+censor_val <-0.999
+censor_dat <- function(x, quant = 0.999){
+  q = stats::quantile(x, quant)
+  x[x>q] = q
+  return(x)
+}
+fun.censor <- function(x) censor_dat(x, censor_val)
+fun.scale <- function(x) y <- x/max(x)
+
+#censored counts
+assay(sce_88_C, "c_counts") <- t(apply(assay(sce_88_C, "counts"), 1, fun.censor))
+
+#censored counts scaled 0-1
+assay(sce_88_C, "c_counts_scaled") <- t(apply(assay(sce_88_C, "c_counts"),
+                                           1, fun.scale))
+assay(sce_88_C, "c_counts_scaled")[assay(sce_88_C, "c_counts_scaled") < 0] <- 0
+
+
+
+#censored counts asinh scaled
+cofactor <- 1
+assay(sce_88_C, "c_counts_asinh") <- asinh((assay(sce_88_C,"c_counts"))/cofactor)
+
+#censored counts_asinh_scaled
+assay(sce_88_C, "c_counts_asinh_scaled") <- t(apply(assay(sce_88_C, "c_counts_asinh"),
+                                           1, fun.scale))
+assay(sce_88_C, "c_counts_asinh_scaled")[assay(sce_88_C, "c_counts_asinh") < 0] <- 0
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts"))
+
+saveRDS(sce_88_C,file=file.path(results_folder, paste("sce_88_C_counts_RAW",".rds",sep="")))
+```
+
+#Merge ROI xy Data into sce
+```{r, load clinical and meta data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_88_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+ac_meta <- read.csv(file=file.path(data_folder, paste("acquisition_metadata.csv")))
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+```
+
+```{r, Add Roi coordinates}
+ac_sub <- data.frame(ac_meta$description,ac_meta$id)
+colnames(ac_sub) <- c("ROI","acID")
+
+ac_sub <- ac_sub %>%
+            separate(ROI, 
+              into = c("TMA", "ROI_xy"), 
+              sep = "(?<=[A-Za-z])(?=[0-9])",
+              remove=F
+            )
+ac_sub$TMA <-NULL
+
+ac_sub$RoiID <- paste("88",ac_sub$ROI, sep="_")
+ac_sub$ROI <-NULL
+head(ac_sub)
+
+length(unique(ac_sub$ROI_xy))
+length(unique(ac_sub$RoiID))
+ac_sub$ROI_xy[duplicated(ac_sub$ROI_xy)]
+
+ac_sub$TMA <-"88C"
+ac_sub$Tma_ac <-paste(ac_sub$TMA, ac_sub$acID, sep="_")
+
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+write.csv(ac_sub, file=file.path(results_folder, "ac_sub_88C.csv"))
+
+```
+
+```{r, add clinical data and RoiID}
+#change ROI to Image ID
+#names(colData(sce))[names(colData(sce)) == 'ROI'] <- 'ImageID'
+head(colData(sce_88_C))
+
+ac_clinical <- left_join(ac_sub, clinical.data, by="RoiID")
+
+cur_DF <- as_tibble(colData(sce_88_C)) %>% left_join(ac_clinical, by = "acID") %>% DataFrame()
+all.equal(paste(cur_DF$ImageNumber, cur_DF$CellNumber), paste(sce_88_C$ImageNumber, sce_88_C$CellNumber))
+
+colData(sce_88_C) <- cur_DF
+rownames(colData(sce_88_C)) <-sce_88_C$CellID
+head(colData(sce_88_C))
+```
+
+```{r, save sce with counts and clinical data to folder counts_clinical data}
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","counts_clinical data"))
+
+saveRDS(sce_88_C,file=file.path(results_folder, paste("sce_88_C_counts_clinical-data_RAW",".rds",sep="")))
+```
\ No newline at end of file
diff --git a/combine_ROI_clinical-data.Rmd b/combine_ROI_clinical-data.Rmd
new file mode 100644
index 0000000..a8dc71c
--- /dev/null
+++ b/combine_ROI_clinical-data.Rmd
@@ -0,0 +1,106 @@
+---
+title: "R Notebook"
+output: html_notebook
+---
+
+```{r}
+wd <- "/mnt"
+results_folder <-(file.path(wd,"lena_processed2","NSCLC_NEW","sce_objects","clinical_data"))
+```
+
+```{r}
+ac_86A <-read.csv(file=file.path(results_folder, "ac_sub_86A.csv"))
+ac_86B <-read.csv(file=file.path(results_folder, "ac_sub_86B.csv"))
+ac_86C <-read.csv(file=file.path(results_folder, "ac_sub_86C.csv"))
+
+ac_87A <-read.csv(file=file.path(results_folder, "ac_sub_87A.csv"))
+ac_87B <-read.csv(file=file.path(results_folder, "ac_sub_87B.csv"))
+ac_87C <-read.csv(file=file.path(results_folder, "ac_sub_87C.csv"))
+
+ac_88A <-read.csv(file=file.path(results_folder, "ac_sub_88A.csv"))
+ac_88B <-read.csv(file=file.path(results_folder, "ac_sub_88B.csv"))
+ac_88C <-read.csv(file=file.path(results_folder, "ac_sub_88C.csv"))
+
+ac_175A <-read.csv(file=file.path(results_folder, "ac_sub_175A.csv"))
+ac_175B <-read.csv(file=file.path(results_folder, "ac_sub_175B.csv"))
+ac_175C <-read.csv(file=file.path(results_folder, "ac_sub_175C.csv"))
+
+ac_176A <-read.csv(file=file.path(results_folder, "ac_sub_176A.csv"))
+ac_176B <-read.csv(file=file.path(results_folder, "ac_sub_176B.csv"))
+ac_176C <-read.csv(file=file.path(results_folder, "ac_sub_176C.csv"))
+
+ac_178A <-read.csv(file=file.path(results_folder, "ac_sub_178A.csv"))
+ac_178B <-read.csv(file=file.path(results_folder, "ac_sub_178B.csv"))
+ac_178C <-read.csv(file=file.path(results_folder, "ac_sub_178C.csv"))
+ac_178C$RoiID <-paste("178_C",ac_178C$ROI_xy,sep="")
+clinical.data
+
+```
+
+```{r}
+ac_all <- rbind(ac_175A,ac_175B,ac_175C,
+                ac_176A,ac_176B,ac_176C,
+                ac_178A,ac_178B,ac_178C,
+                ac_86A,ac_86B,ac_86C,
+                ac_87A,ac_87B,ac_87C,
+                ac_88A,ac_88B,ac_88C)
+length(unique(ac_all$Tma_ac))
+ac_all$Tma_ac[duplicated(ac_all$Tma_ac)]
+ac_all$RoiID[duplicated(ac_all$RoiID)] #"88_A4,2" "88_A5,2" which had to be acquired twice
+#length(unique(all.cells.sub$Tma_ac))
+length(unique(ac_all$Tma_ac))
+length(unique(ac_all$RoiID))
+
+unique(all.cells.sub$Tma_ac) %in%ac_all$Tma_ac
+
+unique(all.cells.sub$Tma_ac)[!unique(all.cells.sub$Tma_ac)%in%ac_all$Tma_ac]
+unique(ac_all$Tma_ac)[!unique(ac_all$Tma_ac)%in%all.cells.sub$Tma_ac] #"176C_2" "86B_24" are empty acquisitions
+
+
+unique(all.cells.sub$Tma_ac)%>% length()
+unique(ac_all$Tma_ac) %>% length() #--> remove "176C_2" "86B_24" then 2072 == all.cells.sub minus 88A_1, 88A2, 88A3, 88A4
+unique(ac_all$RoiID) %>% length()
+#images 88A_1, 88A_2, 88A_3 and 88A_4 to be removed. Equas 248 cells
+ac_all$RoiID %>% unique()%>% length()
+
+ac_all <-ac_all[ac_all$Tma_ac!="176C_2"&ac_all$Tma_ac!="86B_24",]
+```
+
+```{r, read in clinical data}
+wd <-"/mnt"
+data_folder <- file.path(wd,"lena_processed","NSCLC_TMA","ImcSegmentationPipeline","analysis","LC_NSCLC_TMA_178_C",'cpinp')
+cd_folder <- file.path(wd,"lena_processed2","NSCLC_NEW","clinical_data")
+
+
+clinical.data <-read.csv(file=file.path(cd_folder, paste("clinical_data.csv")))
+clinical.data$X <-NULL
+clinical.data$TmaBlock <-NULL
+unique(clinical.data$test)
+clinical.data$test <-NULL
+```
+
+```{r merge clinical data and ac_all}
+ac_clinical <-left_join(ac_all, clinical.data, by="RoiID")
+
+ac_clinical$Patient_ID[!ac_clinical$Patient_ID %in% unique(clinical.data$Patient_ID)]
+
+ac_all$RoiID[!ac_all$RoiID %in% unique(clinical.data$RoiID)]
+
+ac_clinical$Patient_ID[is.na(ac_clinical$Patient_ID)] <-"Control"
+clinical.data$Patient_ID[is.na(clinical.data$Patient_ID)] <-"Control"
+
+length(unique(clinical.data$Patient_ID))
+
+length(unique(ac_clinical$Patient_ID))
+length(unique(ac_clinical$RoiID))
+length(unique(clinical.data$RoiID))
+
+length(unique(ac_all$RoiID))
+
+table(ac_clinical$DX.name)
+write.csv(ac_clinical, file=file.path(results_folder, "clinical_data_ROI_ac_combined_CORRECT.csv"))
+ac_clinical <-read.csv(file=file.path(results_folder, "clinical_data_ROI_ac_combined.csv"))
+
+ac_clinical$Tma_ac %>% unique() %>% length()
+```
+