-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04-clustering.R
136 lines (113 loc) · 3.84 KB
/
04-clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
## Read in SingleCellExperiment RDS object that has been normalized and has both PCA and UMAP embeddings.
## This script performs graph based clustering, by default Louvain,
## and outputs a SCE with the cluster assignments stored in the colData.
# Command line usage:
# Rscript --vanilla 04-clustering.R \
# --sce "results/Gawad_processed_data/SCPCS000245/SCPCL000343_miQC_downstream_processed_normalized_reduced_sce.rds" \
# --seed 2021 \
# --cluster_type "louvain" \
# --nearest_neighbors 10
## Set up -------------------------------------------------------------
library(optparse)
## Command line arguments/options
option_list <- list(
optparse::make_option(
c("-i", "--sce"),
type = "character",
default = NULL,
help = "path to normalized SCE",
),
optparse::make_option(
c("-s", "--seed"),
type = "integer",
default = 2021,
help = "seed integer",
metavar = "integer"
),
optparse::make_option(
c("-c", "--cluster_type"),
type = "character",
default = "louvain",
help = "Method used for clustering. Can be either louvain or walktrap.",
),
optparse::make_option(
c("-n", "--nearest_neighbors"),
type = "integer",
default = 10,
help = "Number of nearest neighbors to include during graph construction."
),
optparse::make_option(
c("-o", "--output_filepath"),
type = "character",
default = NULL,
help = "path to output RDS file containing SCE with cluster assignments added"
),
optparse::make_option(
c("--project_root"),
type = "character",
help = "the path to the root directory for the R project and where the `utils` folder lives."
)
)
# Read the arguments passed
opt <- parse_args(OptionParser(option_list = option_list))
# if project root is not provided use here::here()
if(is.null(opt$project_root)){
project_root <- here::here()
} else {
project_root <- opt$project_root
}
# Source in set up function
source(file.path(project_root, "utils", "setup-functions.R"))
# Check R version
check_r_version()
# Set up renv
setup_renv(project_filepath = project_root)
# source in clustering functions
source(file.path(project_root, "utils", "clustering-functions.R"))
## Load libraries
suppressPackageStartupMessages({
library(magrittr)
library(bluster)
library(SingleCellExperiment)
})
## Set the seed
set.seed(opt$seed)
# Check that the input file exists
if (!file.exists(opt$sce)){
stop(paste(opt$sce, "does not exist."))
}
# Check that clustering type is valid
if (!opt$cluster_type %in% c("louvain", "walktrap")) {
stop("--cluster_type (-c) must be either louvain or walktrap.")
}
# Check that `nearest_neighbors` is an integer
if (opt$nearest_neighbors %% 1 != 0){
stop("The --nearest_neighbors (-n) argument value must be an integer.")
}
# make sure that output file is provided and ends in rds
if (!is.null(opt$output_filepath)){
if(!(stringr::str_ends(opt$output_filepath, ".rds"))){
stop("output file name must end in .rds")
}
} else {
stop("--output_filepath (-o) must be provided.")
}
#### Read in data and check formatting -----------------------------------------
# Read in normalized sce object
sce <- readr::read_rds(opt$sce)
# check that data contains an SCE with PCA results
if(is(sce,"SingleCellExperiment")){
if(!"PCA" %in% reducedDimNames(sce)) {
stop("PCA results are not found in the provided SingleCellExperiment object.")
}
} else {
stop("file specified by --sce (-s) must contain a SingleCellExperiment object.")
}
#### Perform clustering --------------------------------------------------------
# perform clustering
sce <- graph_clustering(normalized_sce = sce,
nearest_neighbors_min = opt$nearest_neighbors,
nearest_neighbors_max = opt$nearest_neighbors,
cluster_type = opt$cluster_type)
# write output file
readr::write_rds(sce, opt$output_filepath)