partition-state.Rmd

---
title: "Partition by State"
description: |
 This R script partitions the data into training, test, and validation sets using stratified random sampling by State (i.e., homeostatic vs. intermediate vs. reactive) and Condition (i.e., CTRL vs. AD).
author:
  - first_name: "Ayush"
    last_name: "Noori"
    url: https://www.github.com/ayushnoori
    affiliation: Massachusetts General Hospital
    affiliation_url: https://www.serranopozolab.org
    orcid_id: 0000-0003-1420-1236
output:
  distill::distill_article:
    toc: true
---

```{r setup, include = FALSE}
# set the default chunk option eval = FALSE, so chunks are not evaluated when
# the document is knit unless a chunk overrides it
knitr::opts_chunk$set(eval = FALSE)
```

# Dependencies

Load requisite packages and define directories. Note that this script uses my personal utilities package `brainstorm`, which can be installed via `devtools::install_github("ayushnoori/brainstorm")`.

```{r load-packages, message=FALSE, warning=FALSE}

# data manipulation
library(data.table)
library(purrr)
library(magrittr)

# fast file system operations
library(fs)

# partition data
library(caret)

# utility functions
library(brainstorm)

```

Note that directories are relative to the R project path.

```{r define-directores}

# set directories
ddir = file.path("Data", "3 - ROIs")                        # input: ROI TIFFs
pdir = file.path("Data", "5 - State Partition")             # output: partitioned TIFFs
dir1 = file.path("Results", "CNN", "1.2 - State Partition") # output: partition table
dir4 = file.path("Results", "4 - Spectral Clustering")      # input: clustering data

# labels used to build the output tree; each vector is named by its own values
# so that purrr::map() over it returns a list named by label
celltypes = c("Astrocyte", "Microglia", "Vessel") %>% purrr::set_names()
grp = c("Train", "Test", "Validation") %>% purrr::set_names()
pheno = c("CTRL", "AD") %>% purrr::set_names()
state = c("Homeostatic", "Intermediate", "Reactive") %>% purrr::set_names()

# create file structure: one folder per cell type x partition x state, i.e.,
# <pdir>/<celltype>/<grp>/<state>; stringsAsFactors = FALSE keeps the grid
# columns as character so that file.path() is handed strings, not factors
dirs = pmap_chr(
  expand.grid(pdir, celltypes, grp, state, stringsAsFactors = FALSE),
  file.path
)

# start from a clean slate: delete each output directory if it is already
# present (together with its contents), then create it anew
check_dir = function(fname) {
  if (fs::dir_exists(fname)) {
    fs::dir_delete(fname)
  }
  fs::dir_create(fname)
}
walk(dirs, check_dir)

```

# Retrieve ROI Paths

Write function to retrieve ROI paths.

```{r retrieve-paths}

# For a single crop directory `fname`, list the TIFF files stored in each of
# its "<celltype> ROIs" subdirectories. Returns a list with one element per
# entry of `celltypes`, each a character vector of full file paths.
retrieve_paths = function(fname) {

  # list the TIFF files belonging to one cell type
  list_tiffs = function(ct) {
    roi_dir = file.path(fname, paste(ct, "ROIs"))
    list.files(roi_dir, pattern = "\\.tif$", full.names = TRUE)
  }

  map(celltypes, list_tiffs)

}

```

Then, map function over crop list.

```{r get-paths}

# crop directories found under each condition folder of the input directory
crops = list.files(file.path(ddir, pheno), full.names = TRUE)

# per-crop TIFF file paths, each split by cell type
tiffs = map(crops, retrieve_paths)

# pool the paths across crops, giving one character vector per cell type
tiffs = map(celltypes, function(ct) unlist(map(tiffs, ct), use.names = FALSE))

```

# Partition ROIs

Define function to partition ROIs into training, test, and validation sets.

```{r partition-rois}

# Partition the ROIs of one cell type into training (60%), test (20%), and
# validation (20%) sets, stratified by State x Condition, and copy each TIFF
# file to <pdir>/<Group>/<Partition>/<State>/<Name>.
#
# flist  character vector of TIFF file paths; the 3rd and 4th "/"-separated
#        components are read as the condition and the crop folder, which
#        matches paths built from `ddir` in this script, i.e.,
#        Data/3 - ROIs/<Condition>/<crop>/<celltype> ROIs/<file>.tif
# lab    cell type label, stored in the Group column and used in the output path
# sc     data.table with (at least) the columns ID and State
#
# Returns the data.table of ROIs with their metadata, Partition, and Output path.
# Note: createDataPartition() samples at random, so call set.seed() beforehand
# if a reproducible split is needed.
partition_rois = function(flist, lab, sc) {

  # samples assigned to batch 1; every other sample is assigned to batch 2
  batch1 = c("1190", "1301", "2148", "2157", "2191", "2207")

  # construct data table
  dat = data.table(Path = flist)
  message("\n", toupper(lab), " ANALYSIS:")
  message("Total TIFF Files: ", nrow(dat))
  message("Total ROI Measurements: ", nrow(sc))

  # parse metadata from file path
  path_parts = strsplit(flist, "/")
  dat[, Name := basename(Path)]
  dat[, Group := lab]
  dat[, Condition := map_chr(path_parts, 3)]
  # sample = leading "_"-separated token of the crop folder name
  dat[, Sample := map_chr(strsplit(map_chr(path_parts, 4), "_"), 1)]
  dat[, Batch := ifelse(Sample %in% batch1, 1, 2)]
  # strip the condition prefix and the literal ".tif" extension; the dot is
  # escaped and anchored so that only the file extension is removed
  dat[, ID := gsub("(AD_|CTRL_|\\.tif$)", "", Name)]

  # attach state labels
  dat = merge(dat, sc[, .(ID, State)], by = "ID", all = TRUE)

  # an ROI without a TIFF file cannot be copied, and an ROI without a State has
  # no destination folder; report and drop these rows before partitioning
  # rather than failing part-way through the copy
  keep = dat[, !is.na(Path) & !is.na(State)]
  if (!all(keep)) {
    warning(sum(!keep), " ", lab, " ROI(s) lack either a TIFF file or a ",
            "State label and were excluded from the partition.", call. = FALSE)
    dat = dat[keep]
  }

  # partition into test, training, and validation sets: 60% training, then
  # half of the remainder as test; whatever is left is validation
  train_idx = createDataPartition(dat[, paste(State, Condition, sep = "_")],
                                  p = 0.6, list = FALSE)
  train_lab = dat[as.vector(train_idx), Name]
  rest = dat[!Name %in% train_lab]
  test_idx = createDataPartition(rest[, paste(State, Condition, sep = "_")],
                                 p = 0.5, list = FALSE)
  test_lab = rest[as.vector(test_idx), Name]

  # create partition variable
  dat[, Partition := "Validation"]
  dat[Name %in% train_lab, Partition := "Train"]
  dat[Name %in% test_lab, Partition := "Test"]

  # construct output path
  dat[, Output := file.path(pdir, Group, Partition, State, Name)]

  # copy TIFF files to appropriate output folder (file_copy is vectorized)
  fs::file_copy(dat$Path, dat$Output)

  # print results
  cat(paste("\n", lab, "ROIs:\n"))
  walk(dat[, .(Condition, Partition, Sample)], ~print(summary(factor(.x))))

  # return data table
  return(dat)

}

```

Map function over TIFF file paths.

```{r apply-partition}

# read spectral clustering data; indexed by cell type name below
all_sc = readRDS(file.path(dir4, "Z-Score Data.rds"))

# partition ROIs: for each cell type, pass its TIFF paths (.x), its name (.y),
# and the matching element of the clustering data
all = imap(tiffs, ~partition_rois(.x, .y, all_sc[[.y]]))

# save partition result; the results folder is not created anywhere else in
# this script, so make sure it exists first (no-op if it is already present)
fs::dir_create(dir1)
saveRDS(all, file.path(dir1, "ROI Partition by State.rds"))

```