From 9b0bbbfe7610d96ca1fa7bdb6e67c253dc394754 Mon Sep 17 00:00:00 2001
From: EnesSefaAyar <enessefaayar@gmail.com>
Date: Fri, 23 Feb 2024 12:26:08 +0100
Subject: [PATCH] peptide data updated with recently shared file by author.

---
 R/data.R                          | 25 +++++++----------
 inst/scripts/make-data_khan2023.R | 45 +++++++++++++------------------
 2 files changed, 29 insertions(+), 41 deletions(-)

diff --git a/R/data.R b/R/data.R
index 4a0e003..6d80d81 100644
--- a/R/data.R
+++ b/R/data.R
@@ -2431,14 +2431,14 @@
 ##' single-cell runs. Both table are then combined in a single
 ##' [QFeatures] object using the [scp::readSCP] function.
 ##' 
-##' The peptide data were generated from the SCoPE2 R script, 
-##' `EMTTGFB_singleCellProcessing.R`). The data were formated 
-##' to a [SingleCellExperiment] object and the sample metadata
-##' were matched to the column names (mapping is retrieved
-##' after running the SCoPE2 R script) and stored in the `colData`.
-##' The object is then added to the [QFeatures] object and the rows
-##' of the peptide data are linked to the rows of the PSM data based
-##' on the peptide sequence information through an `AssayLink` object.
+##' The peptide data were taken from the same google drive folder
+##' (`EpiToMesen.TGFB.nPoP_trial1_pepByCellMatrix_NSThreshDART_medIntCrNorm.txt`).
+##' The data were formatted to a [SingleCellExperiment] object and the sample
+##' metadata were matched to the column names (mapping is retrieved
+##' after running the SCoPE2 R script, `EMTTGFB_singleCellProcessing.R`) and
+##' stored in the `colData`. The object is then added to the [QFeatures] object 
+##' and the rows of the PSM data are linked to the rows of the peptide data
+##' based on the peptide sequence information through an `AssayLink` object.
 ##' 
 ##' The imputed protein data were taken from the same google drive folder
 ##' (`EpiToMesen.TGFB.nPoP_trial1_ProtByCellMatrix_NSThreshDART_medIntCrNorm_imputedNotBC.csv`).
@@ -2451,13 +2451,8 @@
 ##'
 ##' The unimputed protein data were taken from the same google drive folder
 ##' (`EpiToMesen.TGFB.nPoP_trial1_ProtByCellMatrix_NSThreshDART_medIntCrNorm_unimputed.csv`).
-##' The data were formated to a [SingleCellExperiment] object and the sample
-##' metadata were matched to the column names (mapping is retrieved
-##' after running the SCoPE2 R script, `EMTTGFB_singleCellProcessing.R`) and
-##' stored in the `colData`. The object is then added to the [QFeatures] object
-##' and the rows of the peptide data are linked to the rows of the protein data
-##' based on the protein sequence information through an `AssayLink` object.
-##'
+##' The data were formatted and added in the same way as the imputed data.
+##' 
 ##' @source
 ##' The data were downloaded from the
 ##' [Slavov Lab](https://scp.slavovlab.net/Khan_et_al_2023) website via a
diff --git a/inst/scripts/make-data_khan2023.R b/inst/scripts/make-data_khan2023.R
index c8417a6..629a961 100644
--- a/inst/scripts/make-data_khan2023.R
+++ b/inst/scripts/make-data_khan2023.R
@@ -104,15 +104,13 @@ idMap <- read.csv(paste0(root, "cellIDToChannel.csv"), row.names = 1)
 
 ####---- Add the peptide data ----####
 
-## The `peptides.csv` and `peptides_rowData.csv` files were generated using the
-## `EMTTGFB_singleCellProcessing.R` script from 
-## https://github.com/SlavovLab/EMT_TGFB_2023/tree/main.
-## `peptides.csv`: contains peptides x cells before the aggregation.
-## `peptides_rowData.csv`: contains rowData of peptides.
-
-read.csv(paste0(root, "peptides.csv")) %>%
-  rename(peptide = X) %>%
-  readSingleCellExperiment(ecol = 2:422, fnames = "peptide") ->
+## Peptide quantity matrix downloaded from:  
+## https://drive.google.com/drive/folders/1zCsRKWNQuAz5msxx0DfjDrIe6pUjqQmj
+
+peps <- read.delim(paste0(root, "EpiToMesen.TGFB.nPoP_trial1_pepByCellMatrix_NSThreshDART_medIntCrNorm.txt"))
+peps %>%
+  rename(peptide = pep) %>%
+  readSingleCellExperiment(ecol = 1:421, fnames = "peptide") ->
   peptides
 
 colnames(peptides) <- idMap$Channel[match(colnames(peptides), idMap$cellID)]
@@ -120,30 +118,25 @@ colData(peptides) <- DataFrame(annot[colnames(peptides), ])
 
 khan2023 <- addAssay(khan2023, peptides, name = "peptides")
 
+## Set the rowData (peptide and protein columns) of the peptides assay
+rowData(khan2023[["peptides"]]) <- DataFrame(peptide = peps$pep, 
+                                             protein = peps$prot)
+
 ## First find which PSM assays were included
-sel <- sapply(grep("eSK", names(khan2023), value = TRUE), function(name) {
-  x <- khan2023[[name]]
-  ## Does the current PSM data have at least 1 colname in common with pep?
-  inColnames <- any(colnames(x) %in% colnames(peptides))
-  ## Does the current PSM data have at least 1 peptide sequence in common with pep?
-  inSequence <- any(rowData(x)$peptide %in% rowData(peptides)$peptide)
-  return(inColnames && inSequence) ## The PSM assay must fulfill both conditions
+sel <- sapply(grep("eSK", names(khan2023), value = TRUE), 
+              function(name) {
+                x <- khan2023[[name]]
+                ## Does the current PSM data have at least 1 colname in common with pep?
+                inColnames <- any(colnames(x) %in% colnames(peptides))
+                ## Does the current PSM data have at least 1 peptide sequence in common with pep?
+                inSequence <- any(rowData(x)$peptide %in% rowData(peptides)$peptide)
+                return(inColnames && inSequence) ## The PSM assay must fulfill both conditions
 })
 
 ## Add an AssayLink that bridges the PSM assays and the peptide assay
 khan2023 <- addAssayLink(khan2023, from = which(sel), to = "peptides", 
                              varFrom = rep("peptide", sum(sel)), varTo = "peptide")
 
-## Include rowData to peptides assay
-read.csv(paste0(root, "peptides_rowData.csv"), row.names = 1) %>%
-  select(pep, prot) %>%
-  mutate(peptide = pep, protein = prot, pep = NULL, prot = NULL) %>%
-  unique() %>%
-  DataFrame() ->
-  pepRow
-  
-rowData(khan2023[["peptides"]]) <- pepRow
-
 ####---- Add the protein data ----####
 
 ## Imputed and un-imputed protein quantity matrices downloaded from: