-
Notifications
You must be signed in to change notification settings - Fork 78
/
Copy pathrliger.R
executable file
·6625 lines (6264 loc) · 260 KB
/
rliger.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' @importFrom Matrix colSums rowSums t
#' @importFrom grDevices dev.off pdf
NULL
#' The LIGER Class
#'
#' The liger object is created from two or more single cell datasets. To construct a
#' liger object, the user needs to provide at least two expression (or another
#' single-cell modality) matrices. The class provides functions for data
#' preprocessing, integrative analysis, and visualization.
#'
#' The key slots used in the liger object are described below.
#'
#' @slot raw.data List of raw data matrices, one per experiment/dataset (genes by cells)
#' @slot norm.data List of normalized matrices (genes by cells)
#' @slot scale.data List of scaled matrices (cells by genes)
#' @slot sample.data List of sampled matrices (gene by cells)
#' @slot h5file.info List of HDF5-related information for each input dataset. Paths to raw data, indices,
#' indptr, barcodes, genes and the pipeline through which the HDF5 file is formated (10X, AnnData, etc),
#' type of sampled data (raw, normalized or scaled).
#' @slot cell.data Dataframe of cell attributes across all datasets (nrows equal to total number
#' cells across all datasets)
#' @slot var.genes Subset of informative genes shared across datasets to be used in matrix
#' factorization
#' @slot H Cell loading factors (one matrix per dataset, dimensions cells by k)
#' @slot H.norm Normalized cell loading factors (cells across all datasets combined into single
#' matrix)
#' @slot W Shared gene loading factors (k by genes)
#' @slot V Dataset-specific gene loading factors (one matrix per dataset, dimensions k by genes)
#' @slot A Matrices used for online learning (XH)
#' @slot B Matrices used for online learning (HTH)
#' @slot tsne.coords Matrix of 2D coordinates obtained from running t-SNE on H.norm or H matrices
#' @slot alignment.clusters Initial joint cluster assignments from shared factor alignment
#' @slot clusters Joint cluster assignments for cells
#' @slot snf List of values associated with shared nearest factor matrix for use in clustering and
#' alignment (out.summary contains edge weight information between cell combinations)
#' @slot agg.data Data aggregated within clusters
#' @slot parameters List of parameters used throughout analysis
#' @slot version Version of package used to create object
#'
#' @name liger-class
#' @rdname liger-class
#' @aliases liger-class
#' @exportClass liger
#' @importFrom Rcpp evalCpp
#' @useDynLib rliger
liger <- methods::setClass(
"liger",
slots = c(
raw.data = "list",
norm.data = "list",
scale.data = "list",
sample.data = "list",
h5file.info = "list",
cell.data = "data.frame",
var.genes = "vector",
H = "list",
H.norm = "matrix",
W = "matrix",
V = "list",
A = "list",
B = "list",
tsne.coords = "matrix",
alignment.clusters = 'factor',
clusters= "factor",
agg.data = "list",
parameters = "list",
snf = 'list',
version = 'ANY'
)
)
#' show method for liger
#'
#' @param object liger object
#' @name show
#' @aliases show,liger-method
#' @docType methods
#' @rdname show-methods
setMethod(
f = "show",
signature = "liger",
definition = function(object) {
cat(
"An object of class",
class(object),
"\nwith",
length([email protected]),
"datasets and\n",
nrow([email protected]),
"total cells."
)
invisible(x = NULL)
}
)
#######################################################################################
#### Data Preprocessing
#' Read 10X alignment data (including V3)
#'
#' This function generates a sparse matrix (genes x cells) from the data generated by 10X's
#' cellranger count pipeline. It can process V2 and V3 data together, producing either a single
#' merged matrix or list of matrices. Also handles multiple data types produced by 10X V3 (Gene
#' Expression, Antibody Capture, CRISPR, CUSTOM).
#'
#' @param sample.dirs List of directories containing either matrix.mtx(.gz) file along with genes.tsv,
#' (features.tsv), and barcodes.tsv, or outer level 10X output directory (containing outs directory).
#' @param sample.names Vector of names to use for samples (corresponding to sample.dirs)
#' @param merge Whether to merge all matrices of the same data type across samples or leave as list
#' of matrices (default TRUE).
#' @param num.cells Optional limit on number of cells returned for each sample (only for Gene
#' Expression data). Retains the cells with the highest numbers of transcripts (default NULL).
#' @param min.umis Minimum UMI threshold for cells (default 0).
#' @param use.filtered Whether to use 10X's filtered data (as opposed to raw). Only relevant for
#' sample.dirs containing 10X outs directory (default FALSE).
#' @param reference For 10X V<3, specify which reference directory to use if sample.dir is outer
#' level 10X directory (only necessary if more than one reference used for sequencing).
#' (default NULL)
#' @param data.type Indicates the protocol of the input data. If not specified, input data will be
#' considered scRNA-seq data (default 'rna', alternatives: 'atac').
#' @param verbose Print messages (TRUE by default)
#'
#' @return List of merged matrices across data types (returns sparse matrix if only one data type
#' detected), or nested list of matrices organized by sample if merge=F.
#'
#' @import Matrix
#' @importFrom utils read.delim read.table
#'
#' @export
#' @examples
#' \dontrun{
#' # 10X output directory V2 -- contains outs/raw_gene_bc_matrices/<reference>/...
#' sample.dir1 <- "path/to/outer/dir1"
#' # 10X output directory V3 -- for two data types, Gene Expression and CUSTOM
#' sample.dir2 <- "path/to/outer/dir2"
#' dges1 <- read10X(list(sample.dir1, sample.dir2), c("sample1", "sample2"), min.umis = 50)
#' ligerex <- createLiger(expr = dges1[["Gene Expression"]], custom = dges1[["CUSTOM"]])
#' }
read10X <- function(sample.dirs, sample.names, merge = TRUE, num.cells = NULL, min.umis = 0,
use.filtered = FALSE, reference = NULL, data.type = "rna", verbose = TRUE) {
datalist <- list()
datatypes <- c("Gene Expression")
if (length(num.cells) == 1) {
num.cells <- rep(num.cells, length(sample.dirs))
}
for (i in seq_along(sample.dirs)) {
print(paste0("Processing sample ", sample.names[i]))
sample.dir <- sample.dirs[[i]]
inner1 <- paste0(sample.dir, "/outs")
if (dir.exists(inner1)) {
sample.dir <- inner1
is_v3 <- dir.exists(paste0(sample.dir, "/filtered_feature_bc_matrix"))
matrix.prefix <- ifelse(use.filtered, "filtered", "raw")
if (is_v3) {
sample.dir <- paste0(sample.dir, "/", matrix.prefix, "_feature_bc_matrix")
} else {
if (is.null(reference)) {
references <- list.dirs(paste0(sample.dir, "/raw_gene_bc_matrices"),
full.names = FALSE,
recursive = FALSE
)
if (length(references) > 1) {
stop("Multiple reference genomes found. Please specify a single one.")
} else {
reference <- references[1]
}
}
sample.dir <- paste0(sample.dir, "/", matrix.prefix, "_gene_bc_matrices/", reference)
}
} else {
is_v3 <- file.exists(paste0(sample.dir, "/features.tsv.gz"))
}
suffix <- ifelse(is_v3, ".gz", "")
if (data.type == "rna") {
features.file <- ifelse(is_v3, paste0(sample.dir, "/features.tsv.gz"),
paste0(sample.dir, "/genes.tsv")
)
} else if (data.type == "atac") {
features.file <- ifelse(is_v3, paste0(sample.dir, "/peaks.bed.gz"),
paste0(sample.dir, "/peaks.bed")
)
}
matrix.file <- paste0(sample.dir, "/matrix.mtx", suffix)
barcodes.file <- paste0(sample.dir, "/barcodes.tsv", suffix)
rawdata <- readMM(matrix.file)
# convert to dgc matrix
if (class(rawdata)[1] == "dgTMatrix") {
rawdata <- as(rawdata, "CsparseMatrix")
}
# filter for UMIs first to increase speed
umi.pass <- which(colSums(rawdata) > min.umis)
if (length(umi.pass) == 0) {
message("No cells pass UMI cutoff. Please lower it.")
}
rawdata <- rawdata[, umi.pass, drop = FALSE]
barcodes <- readLines(barcodes.file)[umi.pass]
# Remove -1 tag from barcodes
if (all(grepl(barcodes, pattern = "\\-1$"))) {
barcodes <- as.vector(sapply(barcodes, function(x) {
strsplit(x, "-")[[1]][1]
}))
}
if (data.type == "rna") {
features <- read.delim(features.file, header = FALSE, stringsAsFactors = FALSE)
rownames(rawdata) <- make.unique(features[, 2])
} else if (data.type == "atac") {
features <- read.table(features.file, header = FALSE)
features <- paste0(features[, 1], ":", features[, 2], "-", features[, 3])
rownames(rawdata) <- features
}
# since some genes are only differentiated by ENSMBL
colnames(rawdata) <- barcodes
# split based on 10X datatype -- V3 has Gene Expression, Antibody Capture, CRISPR, CUSTOM
# V2 has only Gene Expression by default and just two columns
if (is.null(ncol(features))) {
samplelist <- list(rawdata)
names(samplelist) <- c("Chromatin Accessibility")
} else if (ncol(features) < 3) {
samplelist <- list(rawdata)
names(samplelist) <- c("Gene Expression")
} else {
sam.datatypes <- features[, 3]
sam.datatypes.unique <- unique(sam.datatypes)
# keep track of all unique datatypes
datatypes <- union(datatypes, unique(sam.datatypes))
samplelist <- lapply(sam.datatypes.unique, function(x) {
rawdata[which(sam.datatypes == x), ]
})
names(samplelist) <- sam.datatypes.unique
}
# num.cells filter only for gene expression data
if (!is.null(num.cells)) {
if (names(samplelist) == "Gene Expression" | names(samplelist) == "Chromatin Accessibility") {
data_label <- names(samplelist)
cs <- colSums(samplelist[[data_label]])
limit <- ncol(samplelist[[data_label]])
if (num.cells[i] > limit) {
if (verbose) {
message("You selected more cells than are in matrix ", i,
". Returning all ", limit, " cells.")
}
num.cells[i] <- limit
}
samplelist[[data_label]] <- samplelist[[data_label]][, order(cs, decreasing = TRUE)
[1:num.cells[i]]]
}
# cs <- colSums(samplelist[["Gene Expression"]])
# limit <- ncol(samplelist[["Gene Expression"]])
# if (num.cells[i] > limit) {
# print(paste0(
# "You selected more cells than are in matrix ", i,
# ". Returning all ", limit, " cells."
# ))
# num.cells[i] <- limit
# }
# samplelist[["Gene Expression"]] <- samplelist[["Gene Expression"]][, order(cs, decreasing = TRUE)
# [1:num.cells[i]]]
}
datalist[[i]] <- samplelist
}
if (merge) {
if (verbose) {
message("Merging samples")
}
return_dges <- lapply(datatypes, function(x) {
mergelist <- lapply(datalist, function(d) {
d[[x]]
})
mergelist <- mergelist[!sapply(mergelist, is.null)]
sample.names.x <- sample.names[!sapply(mergelist, is.null)]
MergeSparseDataAll(mergelist, sample.names)
})
names(return_dges) <- datatypes
# if only one type of data present
if (length(return_dges) == 1) {
if (verbose){
message("Returning ", datatypes, " data matrix")
}
return(return_dges[[1]])
}
return(return_dges)
} else {
names(datalist) <- sample.names
return(datalist)
}
}
#' Merge hdf5 files
#'
#' This function merges hdf5 files generated from different libraries (cell ranger by default)
#' before they are preprocessed through Liger pipeline.
#'
#' @param file.list List of path to hdf5 files.
#' @param library.names Vector of library names (corresponding to file.list)
#' @param new.filename String of new hdf5 file name after merging (default new.h5).
#' @param format.type string of HDF5 format (10X CellRanger by default).
#' @param data.name Path to the data values stored in HDF5 file.
#' @param indices.name Path to the indices of data points stored in HDF5 file.
#' @param indptr.name Path to the pointers stored in HDF5 file.
#' @param genes.name Path to the gene names stored in HDF5 file.
#' @param barcodes.name Path to the barcodes stored in HDF5 file.
#'
#' @return Directly generates newly merged hdf5 file.
#'
#' @import hdf5r
#'
#' @export
#' @examples
#' \dontrun{
#' # For instance, we want to merge two datasets saved in HDF5 files (10X CellRanger)
#' # paths to datasets: "library1.h5","library2.h5"
#' # dataset names: "lib1", "lib2"
#' # name for output HDF5 file: "merged.h5"
#' mergeH5(list("library1.h5","library2.h5"), c("lib1","lib2"), "merged.h5")
#' }
mergeH5 <- function(file.list,
library.names,
new.filename,
format.type = "10X",
data.name = NULL,
indices.name = NULL,
indptr.name = NULL,
genes.name = NULL,
barcodes.name = NULL){
h5_merged = hdf5r::H5File$new(paste0(new.filename,".h5"), mode = "w")
h5_merged$create_group("matrix")
h5_merged$create_group("matrix/features")
num_data_prev = 0
num_indptr_prev = 0
num_cells_prev = 0
last_inptr = 0
for (i in 1:length(file.list)){
h5file = hdf5r::H5File$new(file.list[[i]], mode = "r")
if (format.type == "10X"){
data = h5file[["matrix/data"]][]
indices = h5file[["matrix/indices"]][]
indptr = h5file[["matrix/indptr"]][]
barcodes = paste0(library.names[i], "_", h5file[["matrix/barcodes"]][])
genes = h5file[["matrix/features/name"]][]
} else if (format.type == "AnnData"){
data = h5file[["raw.X/data"]][]
indices = h5file[["raw.X/indices"]][]
indptr = h5file[["raw.X/indptr"]][]
barcodes = paste0(library.names[i], "_", h5file[["obs"]][]$cell)
genes = h5file[["raw.var"]][]$index
} else {
data = h5file[[data.name]][]
indices = h5file[[indices.name]][]
indptr = h5file[[indptr.name]][]
barcodes = paste0(library.names[i], "_", h5file[[barcodes.name]][])
genes = h5file[[genes.name]][]
}
if (i != 1) indptr = indptr[2:length(indptr)]
num_data = length(data)
num_indptr = length(indptr)
num_cells = length(barcodes)
indptr = indptr + last_inptr
last_inptr = indptr[num_indptr]
if (i == 1) {
h5_merged[["matrix/data"]] = data
h5_merged[["matrix/indices"]] = indices
h5_merged[["matrix/indptr"]] = indptr
h5_merged[["matrix/barcodes"]] = barcodes
h5_merged[["matrix/features/name"]] = genes
} else {
h5_merged[["matrix/data"]][(num_data_prev + 1):(num_data_prev + num_data)] = data
h5_merged[["matrix/indices"]][(num_data_prev + 1):(num_data_prev + num_data)] = indices
h5_merged[["matrix/indptr"]][(num_indptr_prev + 1):(num_indptr_prev + num_indptr)] = indptr
h5_merged[["matrix/barcodes"]][(num_cells_prev + 1):(num_cells_prev + num_cells)] = barcodes
}
num_data_prev = num_data_prev + num_data
num_indptr_prev = num_indptr_prev + num_indptr
num_cells_prev = num_cells_prev + num_cells
h5file$close_all()
}
h5_merged$close_all()
}
#' Restore links (to hdf5 files) for reloaded online Liger object
#'
#' When loading the saved online Liger object in a new R session, the links to hdf5 files may be corrupted. This functions enables
#' the restoration of those links so that new analyses can be carried out.
#'
#' @param object \code{liger} object.
#' @param file.path List of paths to hdf5 files.
#'
#' @return \code{liger} object with restored links.
#'
#' @import hdf5r
#'
#' @export
#' @examples
#' \dontrun{
#' # We want to restore the ligerex (liger object based on HDF5 files)
#' # It has broken connections to HDF5 files
#' # Call the following function and provide the paths to the correspoinding files
#' ligerex = restoreOnlineLiger(ligerex, file.path = list("path1/library1.h5", "path2/library2.h5"))
#' }
restoreOnlineLiger <- function(object, file.path = NULL) {
if (is.null(file.path) & is.null([email protected][[1]][["file.path"]])) { # file path is not provided by file.path param or liger object
stop('File path information is not stored in the liger object. Please provide a list of file paths through file.path parameter.')
}
if (!is.null(file.path)) { # if new file path is provided, update liger object h5file.info
for (i in 1:length([email protected])) {
[email protected][[i]][["file.path"]] = file.path[[i]]
}
}
# restore access to corresponding h5 files
[email protected] = lapply([email protected], function(x) hdf5r::H5File$new(x[["file.path"]], mode="r+"))
[email protected] = lapply([email protected], function(x) x[["norm.data"]])
[email protected] = lapply([email protected], function(x) x[["scale.data"]])
for (i in 1:length([email protected])){
if ([email protected][[i]][["format.type"]] == "10X"){
barcodes.name = "matrix/barcodes"
barcodes = [email protected][[i]][[barcodes.name]][]
num_cells = [email protected][[i]][[barcodes.name]]$dims
data.name = "matrix/data"
indices.name = "matrix/indices"
indptr.name = "matrix/indptr"
genes.name = "matrix/features/name"
} else if (format.type.list[i] == "AnnData"){
barcodes.name = "obs"
barcodes = [email protected][[i]][[barcodes.name]][]$cell
num_cells = length([email protected][[i]][[barcodes.name]][]$cell)
data.name = "raw.X/data"
indices.name = "raw.X/indices"
indptr.name = "raw.X/indptr"
genes.name = "raw.var"
} else {
barcodes = [email protected][[i]][[barcodes.name]][]
num_cells = length([email protected][[i]][[barcodes.name]][])
data.name = data.name
indices.name = indices.name
indptr.name = indptr.name
}
[email protected][[i]][["data"]] = [email protected][[i]][[data.name]]
[email protected][[i]][["indices"]] = [email protected][[i]][[indices.name]]
[email protected][[i]][["indptr"]] = [email protected][[i]][[indptr.name]]
[email protected][[i]][["barcodes"]] = [email protected][[i]][[barcodes.name]]
[email protected][[i]][["genes"]] = [email protected][[i]][[genes.name]]
}
return(object)
}
#' Create a liger object.
#'
#' This function initializes a liger object with the raw data passed in. It requires a list of
#' expression (or another single-cell modality) matrices (gene by cell) for at least two datasets.
#' By default, it converts all passed data into sparse matrices (dgCMatrix) to reduce object size.
#' It initializes cell.data with nUMI and nGene calculated for every cell.
#'
#' @param raw.data List of expression matrices (gene by cell). Should be named by dataset.
#' @param take.gene.union Whether to fill out raw.data matrices with union of genes across all
#' datasets (filling in 0 for missing data) (requires make.sparse = TRUE) (default FALSE).
#' @param remove.missing Whether to remove cells not expressing any measured genes, and genes not
#' expressed in any cells (if take.gene.union = TRUE, removes only genes not expressed in any
#' dataset) (default TRUE).
#' @param format.type HDF5 format (10X CellRanger by default).
#' @param data.name Path to the data values stored in HDF5 file.
#' @param indices.name Path to the indices of data points stored in HDF5 file.
#' @param indptr.name Path to the pointers stored in HDF5 file.
#' @param genes.name Path to the gene names stored in HDF5 file.
#' @param barcodes.name Path to the barcodes stored in HDF5 file.
#' @param verbose Print messages (TRUE by default)
#'
#' @return \code{liger} object with raw.data slot set.
#'
#' @import Matrix
#' @import hdf5r
#'
#' @export
#' @examples
#' # Demonstration using matrices with randomly generated numbers
#' Y <- matrix(runif(5000,0,2), 10,500)
#' Z <- matrix(runif(5000,0,2), 10,500)
#' ligerex <- createLiger(list(y_set = Y, z_set = Z))
createLiger <- function(raw.data,
take.gene.union = FALSE,
remove.missing = TRUE,
format.type = "10X",
data.name = NULL,
indices.name = NULL,
indptr.name = NULL,
genes.name = NULL,
barcodes.name = NULL,
verbose = TRUE) {
if (class(raw.data[[1]])[1] == "character") { #HDF5 filenames instead of in-memory matrices
object <- methods::new(Class = "liger", raw.data = raw.data,
version = packageVersion("rliger"))
object@V = rep(list(NULL), length(raw.data))
object@H = rep(list(NULL), length(raw.data))
cell.data = list()
if (length(format.type) == 1) format.type.list = rep(format.type, length(raw.data))
for (i in 1:length(raw.data)){
file.h5 = hdf5r::H5File$new(raw.data[[i]], mode="r+")
[email protected][[i]] = file.h5
if (format.type.list[i] == "10X"){
barcodes.name = "matrix/barcodes"
barcodes = file.h5[[barcodes.name]][]
num_cells = file.h5[[barcodes.name]]$dims
data.name = "matrix/data"
indices.name = "matrix/indices"
indptr.name = "matrix/indptr"
genes.name = "matrix/features/name"
} else if (format.type.list[i] == "AnnData"){
barcodes.name = "obs"
barcodes = file.h5[[barcodes.name]][]$cell
num_cells = length(file.h5[[barcodes.name]][]$cell)
data.name = "raw.X/data"
indices.name = "raw.X/indices"
indptr.name = "raw.X/indptr"
genes.name = "raw.var"
} else {
barcodes = file.h5[[barcodes.name]][]
num_cells = length(file.h5[[barcodes.name]][])
data.name = data.name
indices.name = indices.name
indptr.name = indptr.name
}
[email protected][[i]] = list(data = file.h5[[data.name]],
indices = file.h5[[indices.name]],
indptr = file.h5[[indptr.name]],
barcodes = file.h5[[barcodes.name]],
genes = file.h5[[genes.name]],
format.type = format.type.list[i],
sample.data.type = NULL,
file.path = raw.data[[i]])
if (file.h5$exists("norm.data")){
[email protected][[i]] = file.h5[["norm.data"]]
names([email protected])[[i]] = names([email protected])[[i]]
}
if (file.h5$exists("scale.data")){
[email protected][[i]] = file.h5[["scale.data"]]
names([email protected])[[i]] = names([email protected])[[i]]
}
if (file.h5$exists("cell.data")){
cell.data[[i]] = data.frame(dataset = file.h5[["cell.data"]][]$dataset,
nUMI = file.h5[["cell.data"]][]$nUMI,
nGene = file.h5[["cell.data"]][]$nGene)
rownames(cell.data[[i]]) = file.h5[["cell.data"]][]$barcode
} else {
dataset = rep(names([email protected])[i], num_cells)
cell.data[[i]] = data.frame(dataset)
rownames(cell.data[[i]]) = barcodes
}
}
if (is.null(names([email protected]))){
names([email protected]) <- as.character(paste0("data",1:length([email protected])))
}
[email protected] = Reduce(rbind, cell.data)
names(object@H) <- names(object@V) <- names([email protected]) <- names([email protected])
return(object)
}
raw.data <- lapply(raw.data, function(x) {
if (class(x)[1] == "dgTMatrix" | class(x)[1] == 'dgCMatrix') {
mat <- as(x, 'CsparseMatrix')
# Check if dimnames exist
if (is.null(x@Dimnames[[1]])) {
stop('Raw data must have both row (gene) and column (cell) names.')
}
mat@Dimnames <- x@Dimnames
return(mat)
} else {
as(as.matrix(x), 'CsparseMatrix')
}
})
if (length(Reduce(intersect, lapply(raw.data, colnames))) > 0 & length(raw.data) > 1) {
stop('At least one cell name is repeated across datasets; please make sure all cell names
are unique.')
}
if (take.gene.union) {
merged.data <- MergeSparseDataAll(raw.data)
if (remove.missing) {
missing_genes <- which(rowSums(merged.data) == 0)
if (length(missing_genes) > 0) {
if (verbose) {
message("Removing ", length(missing_genes),
" genes not expressed in any cells across merged datasets.")
}
if (length(missing_genes) < 25) {
if (verbose) {
message(rownames(merged.data)[missing_genes])
}
}
merged.data <- merged.data[-missing_genes, ]
}
}
raw.data <- lapply(raw.data, function(x) {
merged.data[, colnames(x)]
})
}
object <- methods::new(
Class = "liger",
raw.data = raw.data,
version = packageVersion("rliger")
)
# remove missing cells
if (remove.missing) {
object <- removeMissingObs(object, use.cols = TRUE, verbose = verbose)
# remove missing genes if not already merged
if (!take.gene.union) {
object <- removeMissingObs(object, use.cols = FALSE, verbose = verbose)
}
}
# Initialize cell.data for object with nUMI, nGene, and dataset
nUMI <- unlist(lapply([email protected], function(x) {
colSums(x)
}), use.names = FALSE)
nGene <- unlist(lapply([email protected], function(x) {
colSums(x > 0)
}), use.names = FALSE)
dataset <- unlist(lapply(seq_along([email protected]), function(i) {
rep(names([email protected])[i], ncol([email protected][[i]]))
}), use.names = FALSE)
[email protected] <- data.frame(nUMI, nGene, dataset)
rownames([email protected]) <- unlist(lapply([email protected], function(x) {
colnames(x)
}), use.names = FALSE)
return(object)
}
#create new dataset, first deleting existing record if dataset already exists
safe_h5_create = function(object, idx, dataset_name, dims, mode="double", chunk_size = dims)
{
if ([email protected][[idx]]$exists(dataset_name)) {
[email protected][[idx]]$create_dataset(name = dataset_name,dims = dims,dtype = mode, chunk_dims = chunk_size)
} else {
if ([email protected][[idx]]$exists("scale.data")) {
if ([email protected][[idx]][["scale.data"]]$dims[1] < length([email protected])){
extendDataSet([email protected][[idx]][["scale.data"]], c(length([email protected]), [email protected][[idx]][["scale.data"]]$dims[2]))
}
} else if ([email protected][[idx]]$exists("gene_vars")) {
if ([email protected][[idx]][["gene_vars"]]$dims[1] < length([email protected])){
extendDataSet([email protected][[idx]][["gene_vars"]], length([email protected]))
}
}
}
}
#' Normalize raw datasets to column sums
#'
#' This function normalizes data to account for total gene expression across a cell.
#'
#' @param object \code{liger} object.
#' @param chunk size of chunks in hdf5 file. (default 1000)
#' @param format.type string of HDF5 format (10X CellRanger by default).
#' @param remove.missing Whether to remove cells not expressing any measured genes, and genes not
#' expressed in any cells (if take.gene.union = TRUE, removes only genes not expressed in any
#' dataset) (default TRUE).
#' @param verbose Print progress bar/messages (TRUE by default)
#'
#' @return \code{liger} object with norm.data slot set.
#'
#' @import hdf5r
#'
#' @export
#' @examples
#' # Demonstration using matrices with randomly generated numbers
#' Y <- matrix(runif(5000,0,2), 10,500)
#' Z <- matrix(runif(5000,0,2), 10,500)
#' ligerex <- createLiger(list(y_set = Y, z_set = Z))
#' ligerex <- normalize(ligerex)
normalize <- function(object,
chunk = 1000,
format.type = "10X",
remove.missing = TRUE,
verbose = TRUE) {
if (class([email protected][[1]])[1] == "H5File") {
hdf5_files = names([email protected])
nUMI = c()
nGene = c()
for (i in 1:length(hdf5_files))
{
if (verbose) {
message(hdf5_files[i])
}
chunk_size = chunk
#fname = hdf5_files[[i]]
num_entries = [email protected][[i]][["data"]]$dims
num_cells = [email protected][[i]][["barcodes"]]$dims
num_genes = [email protected][[i]][["genes"]]$dims
prev_end_col = 1
prev_end_data = 1
prev_end_ind = 0
gene_sum_sq = rep(0,num_genes)
gene_means = rep(0,num_genes)
#file.h5$close_all()
safe_h5_create(object = object, idx = i, dataset_name = "/norm.data", dims = num_entries, mode = h5types$double, chunk_size = chunk_size)
safe_h5_create(object = object, idx = i, dataset_name = "/cell_sums", dims = num_cells, mode = h5types$int, chunk_size = chunk_size)
#file.h5 = H5File$new(fname, mode="r+")
num_chunks = ceiling(num_cells/chunk_size)
if (verbose) {
pb = txtProgressBar(0,num_chunks,style = 3)
}
ind = 0
while(prev_end_col < num_cells)
{
ind = ind + 1
if (num_cells - prev_end_col < chunk_size)
{
chunk_size = num_cells - prev_end_col + 1
}
start_inds = [email protected][[i]][["indptr"]][prev_end_col:(prev_end_col+chunk_size)]
row_inds = [email protected][[i]][["indices"]][(prev_end_ind+1):(tail(start_inds, 1))]
counts = [email protected][[i]][["data"]][(prev_end_ind+1):(tail(start_inds, 1))]
raw.data = sparseMatrix(i=row_inds[1:length(counts)]+1,p=start_inds[1:(chunk_size+1)]-prev_end_ind,x=counts,dims=c(num_genes,chunk_size))
nUMI = c(nUMI, colSums(raw.data))
nGene = c(nGene, colSums(raw.data > 0))
norm.data = Matrix.column_norm(raw.data)
[email protected][[i]][["norm.data"]][(prev_end_ind+1):(tail(start_inds, 1))] = norm.data@x
[email protected][[i]][["cell_sums"]][prev_end_col:(prev_end_col+chunk_size-1)] = Matrix::colSums(raw.data)
#h5write(norm.data,file=fname,name="/norm.data",index=list(prev_end_ind:tail(start_inds, 1)))
#h5write(colSums(raw.data),file=fname,name="/cell_sums",index=list(prev_end_col:(prev_end_col+chunk_size)))
prev_end_col = prev_end_col + chunk_size
prev_end_data = prev_end_data + length(norm.data@x)
prev_end_ind = tail(start_inds, 1)
# calculate row sum and sum of squares using normalized data
row_sums = Matrix::rowSums(norm.data)
gene_sum_sq = gene_sum_sq + rowSums(norm.data*norm.data)
gene_means = gene_means + row_sums
if (verbose) {
setTxtProgressBar(pb,ind)
}
}
if (verbose) {
setTxtProgressBar(pb,num_chunks)
cat("\n")
}
gene_means = gene_means / num_cells
safe_h5_create(object = object, idx = i, dataset_name = "gene_means", dims=num_genes, mode=h5types$double)
safe_h5_create(object = object, idx = i, dataset_name = "gene_sum_sq", dims=num_genes, mode=h5types$double)
[email protected][[i]][["gene_means"]][1:length(gene_means)] = gene_means
[email protected][[i]][["gene_sum_sq"]][1:length(gene_sum_sq)] = gene_sum_sq
[email protected][[i]] = [email protected][[i]][["norm.data"]]
rm(row_sums)
rm(raw.data)
}
[email protected]$nUMI = nUMI
[email protected]$nGene = nGene
for (i in 1:length([email protected])){
if ([email protected][[i]]$exists("cell.data")) {
cell.data.i$barcode = rownames(cell.data.i)
[email protected][[i]][["cell.data"]] = cell.data.i
}
}
names([email protected]) = names([email protected])
} else {
if (remove.missing) {
object <- removeMissingObs(object, slot.use = "raw.data", use.cols = TRUE)
}
if (class([email protected][[1]])[1] == "dgTMatrix" |
class([email protected][[1]])[1] == "dgCMatrix") {
[email protected] <- lapply([email protected], Matrix.column_norm)
} else {
[email protected] <- lapply([email protected], function(x) {
sweep(x, 2, colSums(x), "/")
})
}
}
return(object)
}
#' Calculate variance of gene expression across cells in an online fashion
#'
#' This function calculates the variance of gene expression values across cells for hdf5 files.
#'
#' @param object \code{liger} object. The input raw.data should be a list of hdf5 files.
#' Should call normalize and selectGenes before calling.
#' @param chunk size of chunks in hdf5 file. (default 1000)
#' @param verbose Print progress bar/messages (TRUE by default)
#'
#' @return \code{liger} object with scale.data slot set.
#'
#' @import hdf5r
calcGeneVars = function (object, chunk = 1000, verbose = TRUE)
{
hdf5_files = names([email protected])
for (i in 1:length(hdf5_files)) {
if (verbose) {
message(hdf5_files[i])
}
chunk_size = chunk
num_cells = [email protected][[i]][["barcodes"]]$dims
num_genes = [email protected][[i]][["genes"]]$dims
num_entries = [email protected][[i]][["data"]]$dims
prev_end_col = 1
prev_end_data = 1
prev_end_ind = 0
gene_vars = rep(0,num_genes)
gene_means = [email protected][[i]][["gene_means"]][]
gene_num_pos = rep(0,num_genes)
num_chunks = ceiling(num_cells/chunk_size)
if (verbose) {
pb = txtProgressBar(0, num_chunks, style = 3)
}
ind = 0
while (prev_end_col < num_cells) {
ind = ind + 1
if (num_cells - prev_end_col < chunk_size) {
chunk_size = num_cells - prev_end_col + 1
}
start_inds = [email protected][[i]][["indptr"]][prev_end_col:(prev_end_col+chunk_size)]
row_inds = [email protected][[i]][["indices"]][(prev_end_ind+1):(tail(start_inds, 1))]
counts = [email protected][[i]][(prev_end_ind+1):(tail(start_inds, 1))]
norm.data = sparseMatrix(i=row_inds[1:length(counts)]+1,p=start_inds[1:(chunk_size+1)]-prev_end_ind,x=counts,dims=c(num_genes,chunk_size))
num_read = length(counts)
prev_end_col = prev_end_col + chunk_size
prev_end_data = prev_end_data + num_read
prev_end_ind = tail(start_inds, 1)
gene_vars = gene_vars + sumSquaredDeviations(norm.data,gene_means)
if (verbose) {
setTxtProgressBar(pb, ind)
}
}
if (verbose) {
setTxtProgressBar(pb, num_chunks)
cat("\n")
}
gene_vars = gene_vars/(num_cells - 1)
safe_h5_create(object = object, idx = i, dataset_name = "/gene_vars", dims = num_genes, mode = h5types$double)
[email protected][[i]][["gene_vars"]][1:num_genes] = gene_vars
}
return(object)
}
#' Select a subset of informative genes
#'
#' This function identifies highly variable genes from each dataset and combines these gene sets
#' (either by union or intersection) for use in downstream analysis. Assuming that gene
#' expression approximately follows a Poisson distribution, this function identifies genes with
#' gene expression variance above a given variance threshold (relative to mean gene expression).
#' It also provides a log plot of gene variance vs gene expression (with a line indicating expected
#' expression across genes and cells). Selected genes are plotted in green.
#'
#' @param object \code{liger} object. Should have already called normalize.
#' @param var.thresh Variance threshold. Main threshold used to identify variable genes. Genes with
#' expression variance greater than threshold (relative to mean) are selected.
#' (higher threshold -> fewer selected genes). Accepts single value or vector with separate
#' var.thresh for each dataset. (default 0.1)
#' @param alpha.thresh Alpha threshold. Controls upper bound for expected mean gene expression
#' (lower threshold -> higher upper bound). (default 0.99)
#' @param num.genes Number of genes to find for each dataset. Optimises the value of var.thresh
#' for each dataset to get this number of genes. Accepts single value or vector with same length
#' as number of datasets (optional, default=NULL).
#' @param tol Tolerance to use for optimization if num.genes values passed in (default 0.0001).
#' @param datasets.use List of datasets to include for discovery of highly variable genes.
#' (default 1:length([email protected]))
#' @param combine How to combine variable genes across experiments. Either "union" or "intersection".
#' (default "union")
#' @param capitalize Capitalize gene names to match homologous genes (ie. across species)
#' (default FALSE)
#' @param do.plot Display log plot of gene variance vs. gene expression for each dataset.
#' Selected genes are plotted in green. (default FALSE)
#' @param cex.use Point size for plot.
#' @param chunk size of chunks in hdf5 file. (default 1000)
#'
#' @return \code{liger} object with var.genes slot set.
#'
#' @import hdf5r
#' @importFrom stats optimize
#' @importFrom graphics abline plot points title
#' @importFrom stats qnorm
#'
#' @export
#' @examples
#' \dontrun{
#' # Given datasets Y and Z
#' ligerex <- createLiger(list(y_set = Y, z_set = Z))
#' ligerex <- normalize(ligerex)
#' # use default selectGenes settings (var.thresh = 0.1)
#' ligerex <- selectGenes(ligerex)
#' # select a smaller subset of genes
#' ligerex <- selectGenes(ligerex, var.thresh = 0.3)
#' }
selectGenes <- function(object, var.thresh = 0.1, alpha.thresh = 0.99, num.genes = NULL,
tol = 0.0001, datasets.use = 1:length([email protected]), combine = "union",
capitalize = FALSE, do.plot = FALSE, cex.use = 0.3, chunk=1000)
{
if (class([email protected][[1]])[1] == "H5File") {
if ([email protected][[1]]$exists("gene_vars")) {
object = calcGeneVars(object,chunk)
}
hdf5_files = names([email protected])
if (length(var.thresh) == 1) {
var.thresh <- rep(var.thresh, length(hdf5_files))
}
genes.use <- c()
for (i in datasets.use) {
if ([email protected][[i]][["format.type"]] == "AnnData"){
genes = [email protected][[i]][["genes"]][]$index
} else {
genes = [email protected][[i]][["genes"]][]
}
if (capitalize) {
genes = toupper(genes)
}
trx_per_cell = [email protected][[i]][["cell_sums"]][]
gene_expr_mean = [email protected][[i]][["gene_means"]][]
gene_expr_var = [email protected][[i]][["gene_vars"]][]
names(gene_expr_mean) <- names(gene_expr_var) <- genes # assign gene names
nolan_constant <- mean((1/trx_per_cell))
alphathresh.corrected <- alpha.thresh/length(genes)
genemeanupper <- gene_expr_mean + qnorm(1 - alphathresh.corrected/2) *
sqrt(gene_expr_mean * nolan_constant/length(trx_per_cell))
genes.new <- names(gene_expr_var)[which(gene_expr_var/nolan_constant >
genemeanupper & log10(gene_expr_var) > log10(gene_expr_mean) +
(log10(nolan_constant) + var.thresh[i]))]
if (do.plot) {
plot(log10(gene_expr_mean), log10(gene_expr_var),
cex = cex.use, xlab = "Gene Expression Mean (log10)",
ylab = "Gene Expression Variance (log10)")
points(log10(gene_expr_mean[genes.new]), log10(gene_expr_var[genes.new]),
cex = cex.use, col = "green")
abline(log10(nolan_constant), 1, col = "purple")
legend("bottomright", paste0("Selected genes: ",
length(genes.new)), pch = 20, col = "green")
title(main = hdf5_files[i])
}
if (combine == "union") {
genes.use <- union(genes.use, genes.new)
}
if (combine == "intersection") {
if (length(genes.use) == 0) {
genes.use <- genes.new
}
genes.use <- intersect(genes.use, genes.new)
}
}
for (i in 1:length(hdf5_files)) {
if ([email protected][[i]][["format.type"]] == "AnnData"){
genes = [email protected][[i]][["genes"]][]$index
} else {
genes = [email protected][[i]][["genes"]][]
}
genes.use <- genes.use[genes.use %in% genes]
}
if (length(genes.use) == 0) {
warning("No genes were selected; lower var.thresh values or choose 'union' for combine parameter",
immediate. = TRUE)
}
[email protected] = genes.use
} else {
# Expand if only single var.thresh passed
if (length(var.thresh) == 1) {
var.thresh <- rep(var.thresh, length([email protected]))
}
if (length(num.genes) == 1) {
num.genes <- rep(num.genes, length([email protected]))
}
if (!identical(intersect(datasets.use, 1:length([email protected])),datasets.use)) {
datasets.use = intersect(datasets.use, 1:length([email protected]))
}
genes.use <- c()
for (i in datasets.use) {
if (capitalize) {