forked from serrano-pozo-lab/glia-ihc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
partition-state.Rmd
169 lines (120 loc) · 4.54 KB
/
partition-state.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
---
title: "Partition by State"
description: |
This R script partitions the data into training, test, and validation sets using stratified random sampling by State (i.e., homeostatic vs. intermediate vs. reactive).
author:
- first_name: "Ayush"
last_name: "Noori"
url: https://www.github.com/ayushnoori
affiliation: Massachusetts General Hospital
affiliation_url: https://www.serranopozolab.org
orcid_id: 0000-0003-1420-1236
output:
distill::distill_article:
toc: true
---
```{r setup, include = FALSE}
# set the default chunk option eval = FALSE, so that code chunks in this
# document are not executed when it is knit
knitr::opts_chunk$set(eval = FALSE)
```
# Dependencies
Load requisite packages and define directories. Note that this script uses my personal utilities package `brainstorm`, which can be downloaded via `devtools::install_github("ayushnoori/brainstorm")`.
```{r load-packages, message=FALSE, warning=FALSE}
# data manipulation
library(data.table)
library(purrr)
library(magrittr)
# fast file system operations
library(fs)
# partition data
library(caret)
# utility functions
library(brainstorm)
```
Note that directories are relative to the R project path.
```{r define-directories}
# set directories
ddir <- file.path("Data", "3 - ROIs")                        # input ROI crops
pdir <- file.path("Data", "5 - State Partition")             # partitioned TIFFs
dir1 <- file.path("Results", "CNN", "1.2 - State Partition") # partition table
dir4 <- file.path("Results", "4 - Spectral Clustering")      # clustering data

# create file structure; set_names() names each vector by its own values so
# that lists produced by mapping over them are named too
celltypes <- c("Astrocyte", "Microglia", "Vessel") %>% purrr::set_names()
grp <- c("Train", "Test", "Validation") %>% purrr::set_names()
pheno <- c("CTRL", "AD") %>% purrr::set_names()
state <- c("Homeostatic", "Intermediate", "Reactive") %>% purrr::set_names()

# one output directory per cell type x partition x state combination; keep the
# grid columns as character (expand.grid() defaults to factors) so that
# file.path() receives plain strings instead of relying on factor coercion
dirs <- pmap_chr(
  expand.grid(pdir, celltypes, grp, state, stringsAsFactors = FALSE),
  file.path
)

# remove prior directories/files if they exist, then recreate the directory
# empty so that every run starts from a clean partition
check_dir <- function(fname) {
  if (fs::dir_exists(fname)) {
    fs::dir_delete(fname)
  }
  fs::dir_create(fname)
}
walk(dirs, check_dir)
```
# Retrieve ROI Paths
Write a function to retrieve ROI paths.
```{r retrieve-paths}
# List the TIFF files stored in each "<celltype> ROIs" subdirectory of a crop.
#
# fname: path to a crop directory.
# Returns a list with one element per entry of `celltypes`, each a character
# vector of full paths to the files whose names end in ".tif".
retrieve_paths <- function(fname) {
  map(celltypes, function(celltype) {
    roi_dir <- file.path(fname, paste(celltype, "ROIs"))
    list.files(roi_dir, pattern = "\\.tif$", full.names = TRUE)
  })
}
```
Then, map the function over the list of crops.
```{r get-paths}
# crop directories found inside each phenotype folder of the ROI directory
crops <- list.files(file.path(ddir, pheno), full.names = TRUE)

# for every crop, the TIFF file paths of each cell type
tiffs <- map(crops, retrieve_paths)

# pool the paths across crops, yielding one flat character vector per cell type
tiffs <- map(celltypes, function(celltype) {
  unlist(map(tiffs, celltype), use.names = FALSE)
})
```
# Partition ROIs
Define a function to partition ROIs into training (approximately 60%), test (approximately 20%), and validation (approximately 20%) sets, stratified by State and Condition.
```{r partition-rois}
# Partition the ROIs of one cell type into training, test, and validation
# sets, stratified by State and Condition, and copy each TIFF file into its
# partition folder.
#
# flist: character vector of TIFF file paths; Condition and Sample are parsed
#        from the 3rd and 4th "/"-separated components of each path.
# lab:   cell type label, used in messages and as the Group output folder.
# sc:    data.table with (at least) the columns ID and State.
#
# Returns the data.table of ROIs with the parsed metadata, State, Partition,
# and Output path. As a side effect, files are copied below `pdir`.
partition_rois <- function(flist, lab, sc) {

  # construct data table
  dat <- data.table(Path = flist)
  message("\n", toupper(lab), " ANALYSIS:")
  message("Total TIFF Files: ", nrow(dat))
  message("Total ROI Measurements: ", nrow(sc))

  # parse metadata from file path; the ID is the file name without the
  # condition prefix and without the (escaped, end-anchored) ".tif" extension
  dat <- dat %>%
    .[, Name := basename(Path)] %>%
    .[, Group := lab] %>%
    .[, Condition := map_chr(strsplit(Path, "/"), 3)] %>%
    .[, Sample := Path %>% strsplit("/") %>% map_chr(4) %>%
        strsplit("_") %>% map_chr(1)] %>%
    .[, Batch := ifelse(Sample %in% c("1190", "1301", "2148", "2157",
                                      "2191", "2207"), 1, 2)] %>%
    .[, ID := gsub("(AD_|CTRL_|\\.tif$)", "", Name)] %>%
    merge(sc[, .(ID, State)], by = "ID", all = TRUE)

  # the full outer join keeps TIFF files without a State and measurements
  # without a TIFF file; neither can be stratified or copied, so report and
  # drop them instead of failing later during the file copy
  unmatched <- dat[, is.na(Path) | is.na(State)]
  if (any(unmatched)) {
    warning(sum(unmatched), " ", lab, " ROIs lack either a TIFF file or a ",
            "State and were excluded from the partition.", call. = FALSE)
    dat <- dat[!unmatched]
  }

  # partition into training (60%), then split the remainder evenly into test
  # and validation sets; strata are the State x Condition combinations
  train_lab <- dat[createDataPartition(paste(State, Condition, sep = "_"),
                                       p = 0.6, list = FALSE), Name]
  test_lab <- dat[!Name %in% train_lab] %>%
    .[createDataPartition(paste(State, Condition, sep = "_"),
                          p = 0.5, list = FALSE), Name]

  # create partition variable (updated by reference); anything not drawn for
  # training or test stays in the validation set
  dat %>%
    .[, Partition := "Validation"] %>%
    .[Name %in% train_lab, Partition := "Train"] %>%
    .[Name %in% test_lab, Partition := "Test"]

  # construct output path
  dat[, Output := file.path(pdir, Group, Partition, State, Name)]

  # copy TIFF files to appropriate output folder
  pwalk(dat[, .(Path, Output)], ~fs::file_copy(.x, .y))

  # print results
  cat(paste("\n", lab, "ROIs:\n"))
  walk(dat[, .(Condition, Partition, Sample)], ~print(summary(factor(.x))))

  # return data table
  dat
}
```
Map the function over the TIFF file paths of each cell type.
```{r apply-partition}
# read spectral clustering data; indexed by cell type name below
all_sc <- readRDS(file.path(dir4, "Z-Score Data.rds"))

# partition ROIs of each cell type (.y is the cell type name of each element)
all <- imap(tiffs, ~partition_rois(.x, .y, all_sc[[.y]]))

# save partition result; create the results directory first so that the run
# does not fail at the final step after all files have been copied
fs::dir_create(dir1)
saveRDS(all, file.path(dir1, "ROI Partition by State.rds"))
```