############################################################
#
# DESeq2 organization of R files
#
# core ........... most of the statistical code (example call below)
# fitNbinomGLMs .. three functions for fitting NB GLMs
# methods ........ the S4 methods (estimateSizeFactors, etc.)
# AllClasses ..... class definitions and object constructors
# AllGenerics .... the generics defined in DESeq2
# results ........ results() function and helpers
# plots .......... all plotting functions
# lfcShrink ...... log2 fold change shrinkage
# helper ......... unmix, collapseReplicates, fpkm, fpm, DESeqParallel
# expanded ....... helpers for dealing with expanded model matrices
# wrappers ....... the R wrappers for the C++ functions (mine)
# RcppExports .... the R wrappers for the C++ functions (auto)
#
# rlogTransformation ... rlog
# varianceStabilizingTransformation ... VST
#
# general outline of the internal function calls.
# note: not all of these functions are exported.
#
# DESeq
# |- estimateSizeFactors
# |- estimateSizeFactorsForMatrix
# |- estimateDispersions
# |- estimateDispersionsGeneEst
# |- fitNbinomGLMs
# |- fitBeta (C++)
# |- fitDisp (C++)
# |- estimateDispersionsFit
# |- estimateDispersionsMAP
# |- estimateDispersionPriorVar
# |- fitDisp (C++)
# |- nbinomWaldTest
# |- fitGLMsWithPrior
# |- fitNbinomGLMs
# |- fitBeta (C++)
# |- estimateBetaPriorVar
# |- fitNbinomGLMs
# |- fitBeta (C++)
#
############################################################
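# A minimal sketch of the call sequence outlined above, run step by step from a
# user session rather than through DESeq(). It uses only exported functions
# documented in this file and is wrapped in if (FALSE) so that it is never
# evaluated when the package code itself is sourced.
if (FALSE) {
  library(DESeq2)
  dds <- makeExampleDESeqDataSet(n=100, m=6)
  # the three steps that DESeq() chains together:
  dds <- estimateSizeFactors(dds)   # -> estimateSizeFactorsForMatrix
  dds <- estimateDispersions(dds)   # -> GeneEst, Fit, MAP (fitDisp in C++)
  dds <- nbinomWaldTest(dds)        # -> fitNbinomGLMs (fitBeta in C++)
  res <- results(dds)
}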
#' DESeq2 package for differential analysis of count data
#'
#' The DESeq2 package is designed for normalization,
#' visualization, and differential analysis of high-dimensional
#' count data. It makes use of empirical Bayes techniques
#' to estimate priors for log fold change and dispersion, and
#' to calculate posterior estimates for these quantities.
#'
#' The main functions are:
#'
#' \itemize{
#' \item \code{\link{DESeqDataSet}} - build the dataset, see tximeta & tximport packages for preparing input
#' \item \code{\link{DESeq}} - perform differential analysis
#' \item \code{\link{results}} - build a results table
#' \item \code{\link{lfcShrink}} - estimate shrunken LFC (posterior estimates) using the apeglm & ashr packages
#' \item \code{\link{vst}} - apply variance stabilizing transformation, e.g. for PCA or sample clustering
#' \item Plots, e.g.: \code{\link{plotPCA}}, \code{\link{plotMA}}, \code{\link{plotCounts}}
#' }
#'
#' For detailed information on usage, see the package vignette, by typing
#' \code{vignette("DESeq2")}, or the workflow linked to on the first page
#' of the vignette.
#'
#' All software-related questions should be posted to the Bioconductor Support Site:
#'
#' \url{https://support.bioconductor.org}
#'
#' The code can be viewed at the GitHub repository,
#' which also lists the contributor code of conduct:
#'
#' \url{https://github.com/mikelove/DESeq2}
#'
#' @references
#'
#' Love, M.I., Huber, W., Anders, S. (2014)
#' Moderated estimation of fold change and dispersion
#' for RNA-seq data with DESeq2. Genome Biology, 15:550.
#' \url{https://doi.org/10.1186/s13059-014-0550-8}
#'
#' @author Michael Love, Wolfgang Huber, Simon Anders
#'
#' @docType package
#' @name DESeq2-package
#' @aliases DESeq2-package
#' @keywords package
NULL
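# A short sketch of the transformation and plotting functions listed in the
# package overview above (vst and plotPCA are exported by DESeq2). A couple
# thousand simulated genes are used so that vst() has plenty of rows for its
# default gene subsetting. Guarded by if (FALSE) so it is not run at load time.
if (FALSE) {
  library(DESeq2)
  dds <- makeExampleDESeqDataSet(n=2000, m=8, betaSD=1)
  vsd <- vst(dds)                      # variance stabilizing transformation
  plotPCA(vsd, intgroup="condition")   # PCA of samples on the VST scale
}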
#' Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution
#'
#' This function performs a default analysis through the steps:
#' \enumerate{
#' \item estimation of size factors: \code{\link{estimateSizeFactors}}
#' \item estimation of dispersion: \code{\link{estimateDispersions}}
#' \item Negative Binomial GLM fitting and Wald statistics: \code{\link{nbinomWaldTest}}
#' }
#' For complete details on each step, see the manual pages of the respective
#' functions. After the \code{DESeq} function returns a DESeqDataSet object,
#' results tables (log2 fold changes and p-values) can be generated
#' using the \code{\link{results}} function.
#' Shrunken LFC can then be generated using the \code{\link{lfcShrink}} function.
#' All support questions should be posted to the Bioconductor
#' support site: \url{http://support.bioconductor.org}.
#'
#' The differential expression analysis uses a generalized linear model of the form:
#'
#' \deqn{ K_{ij} \sim \textrm{NB}( \mu_{ij}, \alpha_i) }{ K_ij ~ NB(mu_ij, alpha_i) }
#' \deqn{ \mu_{ij} = s_j q_{ij} }{ mu_ij = s_j q_ij }
#' \deqn{ \log_2(q_{ij}) = x_{j.} \beta_i }{ log2(q_ij) = x_j. beta_i }
#'
#' where counts \eqn{K_{ij}}{K_ij} for gene i, sample j are modeled using
#' a Negative Binomial distribution with fitted mean \eqn{\mu_{ij}}{mu_ij}
#' and a gene-specific dispersion parameter \eqn{\alpha_i}{alpha_i}.
#' The fitted mean is composed of a sample-specific size factor
#' \eqn{s_j}{s_j} and a parameter \eqn{q_{ij}}{q_ij} proportional to the
#' expected true concentration of fragments for sample j.
#' The coefficients \eqn{\beta_i}{beta_i} give the log2 fold changes for gene i for each
#' column of the model matrix \eqn{X}{X}.
#' The sample-specific size factors can be replaced by
#' gene-specific normalization factors for each sample using
#' \code{\link{normalizationFactors}}.
#'
#' For details on the fitting of the log2 fold changes and calculation of p-values,
#' see \code{\link{nbinomWaldTest}} if using \code{test="Wald"},
#' or \code{\link{nbinomLRT}} if using \code{test="LRT"}.
#'
#' Experiments without replicates do not allow for estimation of the dispersion
#' of counts around the expected value for each group, which is critical for
#' differential expression analysis. Analysis without replicates was deprecated
#' in v1.20 and is no longer supported since v1.22.
#'
#' The argument \code{minReplicatesForReplace} is used to decide which samples
#' are eligible for automatic replacement in the case of extreme Cook's distance.
#' By default, \code{DESeq} will replace outliers if the Cook's distance is
#' large for a sample which has 7 or more replicates (including itself).
#' Outlier replacement is turned off entirely for \code{fitType="glmGamPoi"}.
#' This replacement is performed by the \code{\link{replaceOutliers}}
#' function. This default behavior helps to prevent filtering genes
#' based on Cook's distance when there are many degrees of freedom.
#' See \code{\link{results}} for more information about filtering using
#' Cook's distance, and the 'Dealing with outliers' section of the vignette.
#' Unlike the behavior of \code{\link{replaceOutliers}}, here original counts are
#' kept in the matrix returned by \code{\link{counts}}, original Cook's
#' distances are kept in \code{assays(dds)[["cooks"]]}, and the replacement
#' counts used for fitting are kept in \code{assays(dds)[["replaceCounts"]]}.
#'
#' Note that if a log2 fold change prior is used (betaPrior=TRUE)
#' then expanded model matrices will be used in fitting. These are
#' described in \code{\link{nbinomWaldTest}} and in the vignette. The
#' \code{contrast} argument of \code{\link{results}} should be used for
#' generating results tables.
#'
#' @return a \code{\link{DESeqDataSet}} object with results stored as
#' metadata columns. These results should be accessed by calling the \code{\link{results}}
#' function. By default this will return the log2 fold changes and p-values for the last
#' variable in the design formula. See \code{\link{results}} for how to access results
#' for other variables.
#'
#' @param object a DESeqDataSet object, see the constructor functions
#' \code{\link{DESeqDataSet}},
#' \code{\link{DESeqDataSetFromMatrix}},
#' \code{\link{DESeqDataSetFromHTSeqCount}}.
#' @param test either "Wald" or "LRT", which will then use either
#' Wald significance tests (defined by \code{\link{nbinomWaldTest}}),
#' or the likelihood ratio test on the difference in deviance between a
#' full and reduced model formula (defined by \code{\link{nbinomLRT}})
#' @param fitType either "parametric", "local", "mean", or "glmGamPoi"
#' for the type of fitting of dispersions to the mean intensity.
#' See \code{\link{estimateDispersions}} for description.
#' @param sfType either "ratio", "poscounts", or "iterate"
#' for the type of size factor estimation. See
#' \code{\link{estimateSizeFactors}} for description.
#' @param betaPrior whether or not to put a zero-mean normal prior on
#' the non-intercept coefficients.
#' See \code{\link{nbinomWaldTest}} for description of the calculation
#' of the beta prior. In versions \code{>=1.16}, the default is set
#' to \code{FALSE}, and shrunken LFCs are obtained afterwards using
#' \code{\link{lfcShrink}}.
#' @param full for \code{test="LRT"}, the full model formula,
#' which is restricted to the formula in \code{design(object)}.
#' alternatively, it can be a model matrix constructed by the user.
#' advanced use: specifying a model matrix for full and \code{test="Wald"}
#' is possible if \code{betaPrior=FALSE}
#' @param reduced for \code{test="LRT"}, a reduced formula to compare against,
#' i.e., the full formula with the term(s) of interest removed.
#' alternatively, it can be a model matrix constructed by the user
#' @param quiet whether to print messages at each step
#' @param minReplicatesForReplace the minimum number of replicates required
#' in order to use \code{\link{replaceOutliers}} on a
#' sample. If there are samples with so many replicates, the model will
#' be refit after replacing these outliers, flagged by Cook's distance.
#' Set to \code{Inf} in order to never replace outliers.
#' It is set to \code{Inf} for \code{fitType="glmGamPoi"}.
#' @param modelMatrixType either "standard" or "expanded", which describe
#' how the model matrix, X of the GLM formula, is formed.
#' "standard" is as created by \code{model.matrix} using the
#' design formula. "expanded" includes an indicator variable for each
#' level of factors in addition to an intercept. for more information
#' see the Description of \code{\link{nbinomWaldTest}}.
#' betaPrior must be set to TRUE in order for expanded model matrices
#' to be fit.
#' @param useT logical, passed to \code{\link{nbinomWaldTest}}, default is FALSE,
#' where Wald statistics are assumed to follow a standard Normal
#' @param minmu lower bound on the estimated count for fitting gene-wise dispersion
#' and for use with \code{nbinomWaldTest} and \code{nbinomLRT}.
#' If \code{fitType="glmGamPoi"}, then 1e-6 will be used
#' (as this fitType is optimized for single cell data, where a lower
#' minmu is recommended), otherwise the default value
#' as evaluated on bulk datasets is 0.5
#' @param parallel if FALSE, no parallelization. if TRUE, parallel
#' execution using \code{BiocParallel}, see next argument \code{BPPARAM}.
#' Two notes on running in parallel using \code{BiocParallel}:
#' 1) it is recommended to filter out genes where all samples have
#' low counts before running DESeq2 in parallel: this improves efficiency,
#' as otherwise you will be sending data to child processes for genes that
#' have little power for detection of differences and would likely be
#' removed by independent filtering anyway;
#' 2) it may be
#' advantageous to remove large, unneeded objects from your current
#' R environment before calling \code{DESeq},
#' as it is possible that R's internal garbage collection
#' will copy these objects while running on worker nodes.
#' @param BPPARAM an optional parameter object passed internally
#' to \code{\link{bplapply}} when \code{parallel=TRUE}.
#' If not specified, the parameters last registered with
#' \code{\link{register}} will be used.
#'
#' @author Michael Love
#'
#' @references
#'
#' Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology, 15:550. \url{https://doi.org/10.1186/s13059-014-0550-8}
#'
#' For \code{fitType="glmGamPoi"}:
#'
#' Ahlmann-Eltze, C., Huber, W. (2020) glmGamPoi: Fitting Gamma-Poisson Generalized Linear Models on Single Cell Count Data. Bioinformatics. \url{https://doi.org/10.1093/bioinformatics/btaa1009}
#'
#' @import BiocGenerics BiocParallel S4Vectors IRanges GenomicRanges SummarizedExperiment Biobase Rcpp methods
#'
#' @importFrom locfit locfit
#' @importFrom graphics axis hist plot points
#' @importFrom stats Gamma as.formula coefficients df dnbinom dnorm formula glm loess lowess model.matrix optim p.adjust pchisq pnorm prcomp predict pt qf qnorm rchisq relevel rnbinom rnorm runif splinefun terms terms.formula approx
#' @importFrom utils read.table read.csv askYesNo menu
#' @importFrom stats4 summary
#' @importFrom matrixStats rowVars
#' @importFrom MatrixGenerics rowMeans rowSums colMeans colSums
#'
#' @useDynLib DESeq2
#'
#' @seealso \code{\link{results}}, \code{\link{lfcShrink}}, \code{\link{nbinomWaldTest}}, \code{\link{nbinomLRT}}
#'
#' @examples
#'
#' # see vignette for suggestions on generating
#' # count tables from RNA-Seq data
#' cnts <- matrix(rnbinom(n=1000, mu=100, size=1/0.5), ncol=10)
#' cond <- factor(rep(1:2, each=5))
#'
#' # object construction
#' dds <- DESeqDataSetFromMatrix(cnts, DataFrame(cond), ~ cond)
#'
#' # standard analysis
#' dds <- DESeq(dds)
#' res <- results(dds)
#'
#' # moderated log2 fold changes
#' resultsNames(dds)
#' resLFC <- lfcShrink(dds, coef=2, type="apeglm")
#'
#' # an alternate analysis: likelihood ratio test
#' ddsLRT <- DESeq(dds, test="LRT", reduced= ~ 1)
#' resLRT <- results(ddsLRT)
#'
#' @export
DESeq <- function(object, test=c("Wald","LRT"),
fitType=c("parametric","local","mean", "glmGamPoi"),
sfType=c("ratio","poscounts","iterate"),
betaPrior,
full=design(object), reduced, quiet=FALSE,
minReplicatesForReplace=7, modelMatrixType,
useT=FALSE, minmu=if (fitType=="glmGamPoi") 1e-6 else 0.5,
parallel=FALSE, BPPARAM=bpparam()) {
# check arguments
stopifnot(is(object, "DESeqDataSet"))
test <- match.arg(test, choices=c("Wald","LRT"))
fitType <- match.arg(fitType, choices=c("parametric","local","mean","glmGamPoi"))
dispersionEstimator <- if (fitType == "glmGamPoi") {
"glmGamPoi"
} else {
"DESeq2"
}
# turn off outlier replacement for glmGamPoi
if (fitType == "glmGamPoi") {
minReplicatesForReplace <- Inf
if (parallel) {
warning("parallelization of DESeq() is not implemented for fitType='glmGamPoi'")
}
}
sfType <- match.arg(sfType, choices=c("ratio","poscounts","iterate"))
# more check arguments
stopifnot(is.logical(quiet))
stopifnot(is.numeric(minReplicatesForReplace))
stopifnot(is.logical(parallel))
modelAsFormula <- !is.matrix(full) & is(design(object), "formula")
if (missing(betaPrior)) {
betaPrior <- FALSE
} else {
stopifnot(is.logical(betaPrior))
}
# get rid of any NA in the mcols(mcols(object))
object <- sanitizeRowRanges(object)
if (test == "LRT") {
if (missing(reduced)) {
stop("likelihood ratio test requires a 'reduced' design, see ?DESeq")
}
if (betaPrior) {
stop("test='LRT' does not support use of LFC shrinkage, use betaPrior=FALSE")
}
if (!missing(modelMatrixType) && modelMatrixType=="expanded") {
stop("test='LRT' does not support use of expanded model matrix")
}
if (is.matrix(full) | is.matrix(reduced)) {
if (!(is.matrix(full) & is.matrix(reduced))) {
stop("if one of 'full' and 'reduced' is a matrix, the other must be also a matrix")
}
}
if (modelAsFormula) {
checkLRT(full, reduced)
} else {
checkFullRank(full)
checkFullRank(reduced)
if (ncol(full) <= ncol(reduced)) {
stop("the number of columns of 'full' should be more than the number of columns of 'reduced'")
}
}
}
if (test == "Wald" & !missing(reduced)) {
stop("'reduced' ignored when test='Wald'")
}
if (dispersionEstimator == "glmGamPoi" && test == "Wald") {
warning("glmGamPoi dispersion estimator should be used in combination with a LRT and not a Wald test.",
call. = FALSE)
}
# integer check
if (fitType != "glmGamPoi") {
if ( !is.integer( counts(object) ) ) {
stop("the count data is not in integer mode and fitType is not 'glmGamPoi'")
}
}
if (modelAsFormula) {
# run some tests common to DESeq, nbinomWaldTest, nbinomLRT
designAndArgChecker(object, betaPrior)
if (design(object) == formula(~1)) {
warning("the design is ~ 1 (just an intercept). is this intended?")
}
if (full != design(object)) {
stop("'full' specified as formula should equal design(object)")
}
modelMatrix <- NULL
} else {
# model not as formula, so DESeq() is using supplied model matrix
if (!quiet) message("using supplied model matrix")
if (betaPrior == TRUE) {
stop("betaPrior=TRUE is not supported for user-provided model matrices")
}
checkFullRank(full)
# this will be used for dispersion estimation and testing
modelMatrix <- full
}
attr(object, "betaPrior") <- betaPrior
stopifnot(length(parallel) == 1 & is.logical(parallel))
if (!is.null(sizeFactors(object)) || !is.null(normalizationFactors(object))) {
if (!quiet) {
if (!is.null(normalizationFactors(object))) {
message("using pre-existing normalization factors")
} else {
message("using pre-existing size factors")
}
}
} else {
if (!quiet) message("estimating size factors")
object <- estimateSizeFactors(object, type=sfType, quiet=quiet)
}
if (!parallel) {
if (!quiet) message("estimating dispersions")
object <- estimateDispersions(object, fitType=fitType, quiet=quiet, modelMatrix=modelMatrix, minmu=minmu)
if (!quiet) message("fitting model and testing")
if (test == "Wald") {
object <- nbinomWaldTest(object, betaPrior=betaPrior, quiet=quiet,
modelMatrix=modelMatrix,
modelMatrixType=modelMatrixType,
useT=useT,
minmu=minmu)
} else if (test == "LRT") {
object <- nbinomLRT(object, full=full,
reduced=reduced, quiet=quiet,
minmu=minmu,
type = dispersionEstimator)
}
} else if (parallel) {
if (!missing(modelMatrixType)) {
if (betaPrior) stopifnot(modelMatrixType=="expanded")
}
object <- DESeqParallel(object, test=test, fitType=fitType,
betaPrior=betaPrior, full=full, reduced=reduced,
quiet=quiet, modelMatrix=modelMatrix,
useT=useT, minmu=minmu,
BPPARAM=BPPARAM)
}
# if there are sufficient replicates, then pass through to refitting function
sufficientReps <- any(nOrMoreInCell(attr(object,"modelMatrix"),minReplicatesForReplace))
if (sufficientReps) {
object <- refitWithoutOutliers(object, test=test, betaPrior=betaPrior,
full=full, reduced=reduced, quiet=quiet,
minReplicatesForReplace=minReplicatesForReplace,
modelMatrix=modelMatrix,
modelMatrixType=modelMatrixType)
}
# stash the package version (again, also in construction)
metadata(object)[["version"]] <- packageVersion("DESeq2")
object
}
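# A minimal sketch of the generative model described in the Details of ?DESeq
# above: K_ij ~ NB(mu_ij, alpha_i) with mu_ij = s_j * q_ij and
# log2(q_ij) = x_j. beta_i. It uses only base R, with rnbinom parameterized by
# mu and size = 1/alpha (the same parameterization used in
# makeExampleDESeqDataSet below), and is guarded by if (FALSE).
if (FALSE) {
  set.seed(1)
  m <- 6                                                   # samples
  x <- cbind(intercept=1, condition=rep(0:1, each=m/2))    # model matrix X
  beta <- c(5, 1)       # log2 intercept and log2 fold change for one gene
  s <- rep(1, m)        # size factors s_j
  alpha <- 0.1          # gene-wise dispersion alpha_i
  q <- 2^drop(x %*% beta)                  # log2(q_ij) = x_j. beta_i
  mu <- s * q                              # mu_ij = s_j * q_ij
  K <- rnbinom(m, mu=mu, size=1/alpha)     # K_ij ~ NB(mu_ij, alpha_i)
}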
#' Make a simulated DESeqDataSet
#'
#' Constructs a simulated dataset of Negative Binomial data from
#' two conditions. By default, there are no fold changes between
#' the two conditions, but this can be adjusted with the \code{betaSD} argument.
#'
#' @param n number of rows
#' @param m number of columns
#' @param betaSD the standard deviation for non-intercept betas, i.e. beta ~ N(0,betaSD)
#' @param interceptMean the mean of the intercept betas (log2 scale)
#' @param interceptSD the standard deviation of the intercept betas (log2 scale)
#' @param dispMeanRel a function specifying the relationship of the dispersions on
#' \code{2^trueIntercept}
#' @param sizeFactors multiplicative factors for each sample
#'
#' @return a \code{\link{DESeqDataSet}} with true dispersion,
#' intercept and beta values in the metadata columns. Note that the true
#' betas are provided on the log2 scale.
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet()
#' dds
#'
#' @export
makeExampleDESeqDataSet <- function(n=1000,m=12,betaSD=0,interceptMean=4,interceptSD=2,
dispMeanRel=function(x) 4/x + .1,sizeFactors=rep(1,m)) {
beta <- cbind(rnorm(n,interceptMean,interceptSD),rnorm(n,0,betaSD))
dispersion <- dispMeanRel(2^(beta[,1]))
colData <- DataFrame(condition=factor(rep(c("A","B"),times=c(ceiling(m/2),floor(m/2)))))
x <- if (m > 1) {
stats::model.matrix.default(~ colData$condition)
} else {
cbind(rep(1,m),rep(0,m))
}
mu <- t(2^(x %*% t(beta)) * sizeFactors)
countData <- matrix(rnbinom(m*n, mu=mu, size=1/dispersion), ncol=m)
mode(countData) <- "integer"
colnames(countData) <- paste("sample",1:m,sep="")
rowRanges <- GRanges("1",IRanges(start=(1:n - 1) * 100 + 1,width=100))
names(rowRanges) <- paste0("gene",1:n)
# set environment to global environment,
# to avoid the formula carrying with it all the objects
# here including 'object' itself.
design <- if (m > 1) {
as.formula("~ condition", env=.GlobalEnv)
} else {
as.formula("~ 1", env=.GlobalEnv)
}
object <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = design,
rowRanges = rowRanges)
trueVals <- DataFrame(trueIntercept = beta[,1],
trueBeta = beta[,2],
trueDisp = dispersion)
mcols(trueVals) <- DataFrame(type=rep("input",ncol(trueVals)),
description=c("simulated intercept values",
"simulated beta values",
"simulated dispersion values"))
mcols(object) <- cbind(mcols(object),trueVals)
return(object)
}
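# A small sketch comparing estimated log2 fold changes to the simulated truth
# that makeExampleDESeqDataSet() stores in mcols() (trueBeta, on the log2
# scale, as documented above). Guarded by if (FALSE) so it is not evaluated
# when the package code is sourced.
if (FALSE) {
  dds <- makeExampleDESeqDataSet(n=1000, m=12, betaSD=1)
  dds <- DESeq(dds)
  res <- results(dds)
  # correlation of estimated vs simulated log2 fold changes
  cor(res$log2FoldChange, mcols(dds)$trueBeta, use="complete.obs")
}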
#' Low-level function to estimate size factors with robust regression.
#'
#' Given a matrix or data frame of count data, this function estimates the size
#' factors as follows: Each column is divided by the geometric means of the
#' rows. The median (or, if requested, another location estimator) of these
#' ratios (skipping the genes with a geometric mean of zero) is used as the size
#' factor for this column. Typically, one will not call this function directly, but use
#' \code{\link{estimateSizeFactors}}.
#'
#' @param counts a matrix or data frame of counts, i.e., non-negative integer
#' values
#' @param locfunc a function to compute a location for a sample. By default, the
#' median is used. However, especially for low counts, the
#' \code{\link[genefilter]{shorth}} function from genefilter may give better results.
#' @param geoMeans by default this is not provided, and the
#' geometric means of the counts are calculated within the function.
#' A vector of geometric means from another count matrix can be provided
#' for a "frozen" size factor calculation
#' @param controlGenes optional, numeric or logical index vector specifying those genes to
#' use for size factor estimation (e.g. housekeeping or spike-in genes)
#' @param type standard median ratio (\code{"ratio"}) or where the
#' geometric mean is only calculated over positive counts per row
#' (\code{"poscounts"})
#' @return a vector with the estimated size factors, one element per column
#' @author Simon Anders
#' @seealso \code{\link{estimateSizeFactors}}
#' @examples
#'
#' dds <- makeExampleDESeqDataSet()
#' estimateSizeFactorsForMatrix(counts(dds))
#' geoMeans <- exp(rowMeans(log(counts(dds))))
#' estimateSizeFactorsForMatrix(counts(dds),geoMeans=geoMeans)
#'
#' @export
estimateSizeFactorsForMatrix <- function(counts, locfunc=stats::median,
geoMeans, controlGenes,
type=c("ratio","poscounts")) {
type <- match.arg(type, c("ratio","poscounts"))
if (missing(geoMeans)) {
incomingGeoMeans <- FALSE
if (type == "ratio") {
loggeomeans <- MatrixGenerics::rowMeans(log(counts))
} else if (type == "poscounts") {
lc <- log(counts)
lc[!is.finite(lc)] <- 0
loggeomeans <- MatrixGenerics::rowMeans(lc)
allZero <- MatrixGenerics::rowSums(counts) == 0
loggeomeans[allZero] <- -Inf
}
} else {
incomingGeoMeans <- TRUE
if (length(geoMeans) != nrow(counts)) {
stop("geoMeans should be as long as the number of rows of counts")
}
loggeomeans <- log(geoMeans)
}
if (all(is.infinite(loggeomeans))) {
stop("every gene contains at least one zero, cannot compute log geometric means")
}
sf <- if (missing(controlGenes)) {
apply(counts, 2, function(cnts) {
exp(locfunc((log(cnts) - loggeomeans)[is.finite(loggeomeans) & cnts > 0]))
})
} else {
if ( !( is.numeric(controlGenes) | is.logical(controlGenes) ) ) {
stop("controlGenes should be either a numeric or logical vector")
}
loggeomeansSub <- loggeomeans[controlGenes]
apply(counts[controlGenes,,drop=FALSE], 2, function(cnts) {
exp(locfunc((log(cnts) - loggeomeansSub)[is.finite(loggeomeansSub) & cnts > 0]))
})
}
if (incomingGeoMeans) {
# stabilize size factors to have geometric mean of 1
sf <- sf/exp(mean(log(sf)))
}
sf
}
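# A manual median-of-ratios calculation, following the description above and
# the body of estimateSizeFactorsForMatrix(): divide each column by the
# per-gene geometric mean, then take the median of the finite, positive-count
# ratios. Shown as a sketch for a plain count matrix and guarded by if (FALSE).
if (FALSE) {
  cnts <- matrix(rnbinom(600, mu=100, size=2), ncol=6)
  loggeomeans <- rowMeans(log(cnts))
  manual_sf <- apply(cnts, 2, function(x)
    exp(median((log(x) - loggeomeans)[is.finite(loggeomeans) & x > 0])))
  # should agree with the exported low-level function above:
  all.equal(manual_sf, estimateSizeFactorsForMatrix(cnts))
}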
#' Low-level functions to fit dispersion estimates
#'
#' Normal users should instead use \code{\link{estimateDispersions}}.
#' These low-level functions are called by \code{\link{estimateDispersions}},
#' but are exported and documented for non-standard usage.
#' For instance, it is possible to replace fitted values with a custom fit and continue
#' with the maximum a posteriori dispersion estimation, as demonstrated in the
#' examples below.
#'
#' @param object a DESeqDataSet
#' @param fitType either "parametric", "local", "mean", or "glmGamPoi"
#' for the type of fitting of dispersions to the mean intensity.
#' See \code{\link{estimateDispersions}} for description.
#' @param outlierSD the number of standard deviations of log
#' gene-wise estimates above the prior mean (fitted value),
#' above which dispersion estimates will be labelled
#' outliers. Outliers will keep their original value and
#' not be shrunk using the prior.
#' @param dispPriorVar the variance of the normal prior on the log dispersions.
#' If not supplied, this is calculated as the difference between
#' the mean squared residuals of gene-wise estimates to the
#' fitted dispersion and the expected sampling variance
#' of the log dispersion
#' @param minDisp small value for the minimum dispersion, to allow
#' for calculations in log scale; one order of magnitude above this value is used
#' as a test for inclusion in mean-dispersion fitting
#' @param kappa_0 control parameter used in setting the initial proposal
#' in backtracking search, higher kappa_0 results in larger steps
#' @param dispTol control parameter to test for convergence of log dispersion,
#' stop when increase in log posterior is less than dispTol
#' @param maxit control parameter: maximum number of iterations to allow for convergence
#' @param useCR whether to use Cox-Reid correction
#' @param weightThreshold threshold for subsetting the design matrix and GLM weights
#' for calculating the Cox-Reid correction
#' @param quiet whether to print messages at each step
#' @param modelMatrix for advanced use only,
#' a substitute model matrix for gene-wise and MAP dispersion estimation
#' @param niter number of times to iterate between estimation of means and
#' estimation of dispersion
#' @param linearMu estimate the expected counts matrix using a linear model,
#' default is NULL, in which case a linear model is used if the
#' number of groups defined by the model matrix is equal to the number
#' of columns of the model matrix
#' @param minmu lower bound on the estimated count for fitting gene-wise dispersion
#' @param alphaInit initial guess for the dispersion estimate, alpha
#' @param type can either be "DESeq2" or "glmGamPoi". Specifies if the glmGamPoi
#' package is used to calculate the dispersion. This can be significantly faster
#' if there are many replicates with small counts.
#'
#' @return a DESeqDataSet with gene-wise, fitted, or final MAP
#' dispersion estimates in the metadata columns of the object.
#'
#' \code{estimateDispersionsPriorVar} is called inside of \code{estimateDispersionsMAP}
#' and stores the dispersion prior variance as an attribute of
#' \code{dispersionFunction(dds)}, which can be manually provided to
#' \code{estimateDispersionsMAP} for parallel execution.
#'
#' @aliases estimateDispersionsGeneEst estimateDispersionsFit estimateDispersionsMAP estimateDispersionsPriorVar
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet()
#' dds <- estimateSizeFactors(dds)
#' dds <- estimateDispersionsGeneEst(dds)
#' dds <- estimateDispersionsFit(dds)
#' dds <- estimateDispersionsMAP(dds)
#' plotDispEsts(dds)
#'
#' # after having run estimateDispersionsFit()
#' # the dispersion prior variance over all genes
#' # can be obtained like so:
#'
#' dispPriorVar <- estimateDispersionsPriorVar(dds)
#'
#' @seealso \code{\link{estimateDispersions}}
#'
#' @export
estimateDispersionsGeneEst <- function(object, minDisp=1e-8, kappa_0=1,
dispTol=1e-6, maxit=100, useCR=TRUE,
weightThreshold=1e-2,
quiet=FALSE,
modelMatrix=NULL, niter=1, linearMu=NULL,
minmu=if (type=="glmGamPoi") 1e-6 else 0.5,
alphaInit=NULL,
type = c("DESeq2", "glmGamPoi")) {
type <- match.arg(type, c("DESeq2", "glmGamPoi"))
if (!is.null(mcols(object)$dispGeneEst)) {
if (!quiet) message("found already estimated gene-wise dispersions, removing these")
removeCols <- c("dispGeneEst","dispGeneIter")
mcols(object) <- mcols(object)[,!names(mcols(object)) %in% removeCols,drop=FALSE]
}
stopifnot(length(minDisp) == 1)
stopifnot(length(kappa_0) == 1)
stopifnot(length(dispTol) == 1)
stopifnot(length(maxit) == 1)
if (log(minDisp/10) <= -30) {
stop("for computational stability, log(minDisp/10) should be above -30")
}
# in case the class of the mcols(mcols(object)) are not character
object <- sanitizeRowRanges(object)
if (is.null(modelMatrix)) {
modelMatrix <- getModelMatrix(object)
}
checkFullRank(modelMatrix)
if (nrow(modelMatrix) == ncol(modelMatrix)) {
stop("the number of samples and the number of model coefficients are equal,
i.e., there are no replicates to estimate the dispersion.
use an alternate design formula")
}
object <- getBaseMeansAndVariances(object)
# use weights if they are present in assays(object)
# (we need this already to decide about linear mu fitting)
attr(object, "weightsOK") <- NULL # reset any information
wlist <- getAndCheckWeights(object, modelMatrix, weightThreshold=weightThreshold)
object <- wlist$object
weights <- wlist$weights
# don't let weights go below 1e-6
weights <- pmax(weights, 1e-6)
useWeights <- wlist$useWeights
# only continue on the rows with non-zero row mean
objectNZ <- object[!mcols(object)$allZero,,drop=FALSE]
weights <- weights[!mcols(object)$allZero,,drop=FALSE]
if (is.null(alphaInit)) {
# this rough dispersion estimate (alpha_hat)
# is for estimating mu
# and for the initial starting point for line search
roughDisp <- roughDispEstimate(y = counts(objectNZ,normalized=TRUE),
x = modelMatrix)
momentsDisp <- momentsDispEstimate(objectNZ)
alpha_hat <- pmin(roughDisp, momentsDisp)
} else {
if (length(alphaInit) == 1) {
alpha_hat <- rep(alphaInit, nrow(objectNZ))
} else {
stopifnot(length(alphaInit) == nrow(objectNZ))
alpha_hat <- alphaInit
}
}
# bound the rough estimated alpha between minDisp and maxDisp for numeric stability
maxDisp <- max(10, ncol(object))
alpha_hat <- alpha_hat_new <- alpha_init <- pmin(pmax(minDisp, alpha_hat), maxDisp)
stopifnot(length(niter) == 1 & niter > 0)
# use a linear model to estimate the expected counts
# if the number of groups according to the model matrix
# is equal to the number of columns
if (is.null(linearMu)) {
modelMatrixGroups <- modelMatrixGroups(modelMatrix)
linearMu <- nlevels(modelMatrixGroups) == ncol(modelMatrix)
# also check for weights (then can't do linear mu)
if (useWeights) {
linearMu <- FALSE
}
}
# below, iterate between mean and dispersion estimation (niter) times
fitidx <- rep(TRUE,nrow(objectNZ))
mu <- matrix(0, nrow=nrow(objectNZ), ncol=ncol(objectNZ))
dispIter <- numeric(nrow(objectNZ))
# bound the estimated count by 'minmu'
# this helps make the fitting more robust,
# because 1/mu occurs in the weights for the NB GLM
for (iter in seq_len(niter)) {
# coding note: fitMu is over the non-zero rows and then subset by 'fitidx'
if (!linearMu) {
fit <- fitNbinomGLMs(objectNZ[fitidx,,drop=FALSE],
alpha_hat=alpha_hat[fitidx],
modelMatrix=modelMatrix, type=type)
fitMu <- fit$mu
} else {
fitMu <- linearModelMuNormalized(objectNZ[fitidx,,drop=FALSE],
modelMatrix)
}
fitMu[fitMu < minmu] <- minmu
mu[fitidx,] <- fitMu
# use of kappa_0 in backtracking search
# initial proposal = log(alpha) + kappa_0 * deriv. of log lik. w.r.t. log(alpha)
# use log(minDisp/10) to stop if dispersions going to -infinity
if (type == "DESeq2") {
dispRes <- fitDispWrapper(ySEXP = counts(objectNZ)[fitidx,,drop=FALSE],
xSEXP = modelMatrix,
mu_hatSEXP = fitMu,
log_alphaSEXP = log(alpha_hat)[fitidx],
log_alpha_prior_meanSEXP = log(alpha_hat)[fitidx],
log_alpha_prior_sigmasqSEXP = 1, min_log_alphaSEXP = log(minDisp/10),
kappa_0SEXP = kappa_0, tolSEXP = dispTol,
maxitSEXP = maxit, usePriorSEXP = FALSE,
weightsSEXP = weights,
useWeightsSEXP = useWeights,
weightThresholdSEXP = weightThreshold,
useCRSEXP = useCR)
dispIter[fitidx] <- dispRes$iter
alpha_hat_new[fitidx] <- pmin(exp(dispRes$log_alpha), maxDisp)
last_lp <- dispRes$last_lp
initial_lp <- dispRes$initial_lp
# only rerun those rows which moved
} else if (type == "glmGamPoi") {
if (!requireNamespace("glmGamPoi", quietly=TRUE)) {
stop("type='glmGamPoi' requires installing the Bioconductor package 'glmGamPoi'")
}
if (!quiet) message("using 'glmGamPoi' as fitType. If used in published research, please cite:
Ahlmann-Eltze, C., Huber, W. (2020) glmGamPoi: Fitting Gamma-Poisson
Generalized Linear Models on Single Cell Count Data. Bioinformatics.
https://doi.org/10.1093/bioinformatics/btaa1009")
Counts <- counts(objectNZ)[fitidx,,drop=FALSE]
initial_lp <- vapply(seq_len(nrow(fitMu)), function(idx) {
sum(dnbinom(x = Counts[idx,],
mu = fitMu[idx,],
size = 1 / alpha_hat[fitidx][idx],
log = TRUE))
}, FUN.VALUE = 0.0)
dispersion_fits <- glmGamPoi::overdispersion_mle(
Counts, mean = fitMu,
model_matrix = modelMatrix, verbose = ! quiet
)
dispIter[fitidx] <- dispersion_fits$iterations
alpha_hat_new[fitidx] <- pmin(dispersion_fits$estimate, maxDisp)
last_lp <- vapply(seq_len(nrow(fitMu)), function(idx) {
sum(dnbinom(x = Counts[idx,],
mu = fitMu[idx,],
size = 1 / alpha_hat_new[fitidx][idx],
log = TRUE))
}, FUN.VALUE = 0.0)
}
fitidx <- abs(log(alpha_hat_new) - log(alpha_hat)) > .05
fitidx[is.na(fitidx)] <- FALSE
alpha_hat <- alpha_hat_new
if (sum(fitidx) == 0) break
}
# don't accept moves if the log posterior did not
# increase by more than one millionth,
# and set the small estimates to the minimum dispersion
dispGeneEst <- alpha_hat
if (niter == 1) {
noIncrease <- last_lp < initial_lp + abs(initial_lp)/1e6
dispGeneEst[which(noIncrease)] <- alpha_init[which(noIncrease)]
}
# didn't reach the maximum and iterated more than once
dispGeneEstConv <- dispIter < maxit & !(dispIter == 1)
# if lacking convergence from fitDisp() (C++)...
refitDisp <- !dispGeneEstConv & dispGeneEst > minDisp*10
if (sum(refitDisp) > 0) {
dispGrid <- fitDispGridWrapper(y = counts(objectNZ)[refitDisp,,drop=FALSE],
x = modelMatrix,
mu = mu[refitDisp,,drop=FALSE],
logAlphaPriorMean = rep(0,sum(refitDisp)),
logAlphaPriorSigmaSq = 1, usePrior = FALSE,
weightsSEXP = weights[refitDisp,,drop=FALSE],
useWeightsSEXP = useWeights,
weightThresholdSEXP = weightThreshold,
useCRSEXP = useCR)
dispGeneEst[refitDisp] <- dispGrid
}
dispGeneEst <- pmin(pmax(dispGeneEst, minDisp), maxDisp)
dispDataFrame <- buildDataFrameWithNARows(list(dispGeneEst=dispGeneEst,
dispGeneIter=dispIter),
mcols(object)$allZero)
mcols(dispDataFrame) <- DataFrame(type=rep("intermediate",ncol(dispDataFrame)),
description=c("gene-wise estimates of dispersion",
"number of iterations for gene-wise"))
mcols(object) <- cbind(mcols(object), dispDataFrame)
assays(object, withDimnames=FALSE)[["mu"]] <- buildMatrixWithNARows(mu, mcols(object)$allZero)
return(object)
}
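# A brief sketch of non-default use of estimateDispersionsGeneEst(): supplying
# a single starting value via alphaInit and iterating twice between mean and
# dispersion estimation via niter (both documented above), then inspecting the
# per-gene estimates and iteration counts the function stores in mcols().
# Guarded by if (FALSE) so it does not run at package load.
if (FALSE) {
  dds <- makeExampleDESeqDataSet(n=200, m=8)
  dds <- estimateSizeFactors(dds)
  dds <- estimateDispersionsGeneEst(dds, alphaInit=0.1, niter=2)
  summary(mcols(dds)$dispGeneEst)   # gene-wise dispersion estimates
  table(mcols(dds)$dispGeneIter)    # iterations used by the line search
}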
#' @rdname estimateDispersionsGeneEst
#' @export
estimateDispersionsFit <- function(object,fitType=c("parametric","local","mean", "glmGamPoi"),
minDisp=1e-8, quiet=FALSE) {
if (is.null(mcols(object)$allZero)) {
object <- getBaseMeansAndVariances(object)
}
objectNZ <- object[!mcols(object)$allZero,,drop=FALSE]
useForFit <- mcols(objectNZ)$dispGeneEst > 100*minDisp
if (sum(useForFit) == 0) {
stop("all gene-wise dispersion estimates are within 2 orders of magnitude
from the minimum value, and so the standard curve fitting techniques will not work.
One can instead use the gene-wise estimates as final estimates:
dds <- estimateDispersionsGeneEst(dds)
dispersions(dds) <- mcols(dds)$dispGeneEst
...then continue with testing using nbinomWaldTest or nbinomLRT")
}
fitType <- match.arg(fitType, choices=c("parametric","local","mean", "glmGamPoi"))
stopifnot(length(fitType)==1)
stopifnot(length(minDisp)==1)
if (fitType == "parametric") {
trial <- try(dispFunction <- parametricDispersionFit(mcols(objectNZ)$baseMean[useForFit],
mcols(objectNZ)$dispGeneEst[useForFit]),
silent=TRUE)
if (inherits(trial,"try-error")) {
message("-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.")
fitType <- "local"
}
}
if (fitType == "local") {
dispFunction <- localDispersionFit(means = mcols(objectNZ)$baseMean[useForFit],
disps = mcols(objectNZ)$dispGeneEst[useForFit],
minDisp = minDisp)
}
if (fitType == "mean") {
useForMean <- mcols(objectNZ)$dispGeneEst > 10*minDisp
meanDisp <- mean(mcols(objectNZ)$dispGeneEst[useForMean],na.rm=TRUE,trim=0.001)
dispFunction <- function(means) meanDisp
attr( dispFunction, "mean" ) <- meanDisp
}
if (fitType == "glmGamPoi") {
if (!requireNamespace("glmGamPoi", quietly=TRUE)) {
stop("type='glmGamPoi' requires installing the Bioconductor package 'glmGamPoi'")
}
base_means <- mcols(objectNZ)$baseMean[useForFit]
median_fit <- glmGamPoi::loc_median_fit(base_means,
mcols(objectNZ)$dispGeneEst[useForFit])
get_closest_index <- function(x, vec){
iv <- findInterval(x, vec)
dist_left <- x - vec[ifelse(iv == 0, NA, iv)]
dist_right <- vec[iv + 1] - x
ifelse(! is.na(dist_left) & (is.na(dist_right) | dist_left < dist_right), iv, iv + 1)
}
sorted_bm <- sort(base_means)
ordered_medians <- median_fit[order(base_means)]
dispFunction <- function(means){
indices <- get_closest_index(means, sorted_bm)
ordered_medians[indices]
}
}
if (!(fitType %in% c("parametric","local","mean", "glmGamPoi"))) {
stop("unknown fitType")
}
# store the dispersion function and attributes
attr( dispFunction, "fitType" ) <- fitType
if (quiet) {
suppressMessages({ dispersionFunction(object) <- dispFunction })
} else {
dispersionFunction(object) <- dispFunction
}
return(object)
}
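# The fallback described in the error message of estimateDispersionsFit()
# above, spelled out as a sketch: when the fitted trend cannot be estimated,
# the gene-wise estimates can be used directly as final dispersions before
# testing, via the exported dispersions<- setter. Guarded by if (FALSE).
if (FALSE) {
  dds <- makeExampleDESeqDataSet(n=200, m=6)
  dds <- estimateSizeFactors(dds)
  dds <- estimateDispersionsGeneEst(dds)
  dispersions(dds) <- mcols(dds)$dispGeneEst
  dds <- nbinomWaldTest(dds)
  res <- results(dds)
}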
#' @rdname estimateDispersionsGeneEst
#' @export
estimateDispersionsMAP <- function(object, outlierSD=2, dispPriorVar,
minDisp=1e-8, kappa_0=1, dispTol=1e-6,
maxit=100, useCR=TRUE,
weightThreshold=1e-2,
modelMatrix=NULL,
type = c("DESeq2", "glmGamPoi"),
quiet=FALSE) {
stopifnot(length(outlierSD)==1)
stopifnot(length(minDisp)==1)
stopifnot(length(kappa_0)==1)
stopifnot(length(dispTol)==1)
stopifnot(length(maxit)==1)
type <- match.arg(type, c("DESeq2", "glmGamPoi"))
if (is.null(mcols(object)$allZero)) {
object <- getBaseMeansAndVariances(object)
}
if (!is.null(mcols(object)$dispersion)) {
if (!quiet) message("found already estimated dispersions, removing these")
removeCols <- c("dispersion","dispOutlier","dispMAP","dispIter","dispConv")
mcols(object) <- mcols(object)[,!names(mcols(object)) %in% removeCols,drop=FALSE]
}
if (is.null(modelMatrix)) {
modelMatrix <- getModelMatrix(object)
}
# fill in the calculated dispersion prior variance
if (missing(dispPriorVar)) {
# if no gene-wise estimates above minimum
if (sum(mcols(object)$dispGeneEst >= minDisp*100,na.rm=TRUE) == 0) {
warning(paste0("all genes have dispersion estimates < ",minDisp*10,
", returning disp = ",minDisp*10))
resultsList <- list(dispersion = rep(minDisp*10, sum(!mcols(object)$allZero)))
dispDataFrame <- buildDataFrameWithNARows(resultsList, mcols(object)$allZero)
mcols(dispDataFrame) <- DataFrame(type="intermediate",
description="final estimates of dispersion")
mcols(object) <- cbind(mcols(object), dispDataFrame)
dispFn <- dispersionFunction(object)
attr( dispFn, "dispPriorVar" ) <- 0.25
# store dispFn without doing estimation of variance
object@dispersionFunction <- dispFn
return(object)
}
dispPriorVar <- estimateDispersionsPriorVar(object, modelMatrix=modelMatrix)
dispFn <- dispersionFunction(object)
attr( dispFn, "dispPriorVar" ) <- dispPriorVar
# store dispFn without doing estimation of variance
object@dispersionFunction <- dispFn
} else {
dispFn <- dispersionFunction(object)
attr( dispFn, "dispPriorVar" ) <- dispPriorVar