diff --git a/.Rbuildignore b/.Rbuildignore index 7442342ca..875e4cd2b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,5 @@ +^renv$ +^renv\.lock$ ^.*\.Rproj$ ^\.Rproj\.user$ ^.*\.old$ diff --git a/.github/workflows/R_CMD_check.yaml b/.github/workflows/R_CMD_check.yaml index e81e333b5..b7b64ab33 100644 --- a/.github/workflows/R_CMD_check.yaml +++ b/.github/workflows/R_CMD_check.yaml @@ -5,6 +5,9 @@ on: - develop pull_request: +env: + GITHUB_PAT: ${{ secrets.PAT }} + jobs: r-cmd-check: @@ -13,21 +16,26 @@ jobs: name: R CMD check container: image: satijalab/seurat:develop - runs-on: self-hosted + runs-on: [ self-hosted ] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Remove vignettes dir run: rm -rf 'vignettes/' shell: bash + - name: Install additional dependencies + run: | + Rscript -e "remotes::install_github('mojaveazure/seurat-object', ref = 'feat/CalN_generic')" + Rscript -e "remotes::install_github('bnprks/BPCells')" + - name: Check run: devtools::check(args = "--no-manual", error_on = "warning", check_dir = "check", force_suggests = FALSE) shell: Rscript {0} - - name: Upload check results - if: failure() - uses: actions/upload-artifact@master - with: - name: results - path: check +# - name: Upload check results +# if: failure() +# uses: actions/upload-artifact@master +# with: +# name: results +# path: check diff --git a/.github/workflows/integration_checks.yaml b/.github/workflows/integration_checks.yaml new file mode 100644 index 000000000..030761436 --- /dev/null +++ b/.github/workflows/integration_checks.yaml @@ -0,0 +1,15 @@ +name: Integration Checks + +on: + push: + branches: + - develop + pull_request: + branches: + - develop + +jobs: + check-package: + runs-on: ubuntu-latest + steps: + - run: echo "This is a placeholder workflow." diff --git a/DESCRIPTION b/DESCRIPTION index d8317cf63..f3a885a91 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,22 +1,27 @@ Package: Seurat -Version: 4.3.0.9002 -Date: 2023-02-01 +Version: 5.1.0.9006 +Date: 2024-09-29 Title: Tools for Single Cell Genomics Description: A toolkit for quality control, analysis, and exploration of single cell RNA sequencing data. 'Seurat' aims to enable users to identify and interpret sources of heterogeneity from single cell transcriptomic measurements, and to integrate diverse types of single cell data. See Satija R, Farrell J, Gennert D, et al (2015) , Macosko E, Basu A, Satija R, et al (2015) , Stuart T, Butler A, et al (2019) , and Hao, Hao, et al (2020) for more details. Authors@R: c( person(given = "Andrew", family = "Butler", email = "abutler@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0003-3608-0463")), person(given = "Saket", family = "Choudhary", email = "schoudhary@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-5202-7633")), + person(given = 'David', family = 'Collins', email = 'dcollins@nygenome.org', role = 'ctb', comment = c(ORCID = '0000-0001-9243-7821')), person(given = "Charlotte", family = "Darby", email = "cdarby@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0003-2195-5300")), person(given = "Jeff", family = "Farrell", email = "jfarrell@g.harvard.edu", role = "ctb"), + person(given = "Isabella", family = "Grabski", email = "igrabski@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0002-0616-5469")), person(given = "Christoph", family = "Hafemeister", email = "chafemeister@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-6365-8254")), person(given = "Yuhan", family = "Hao", email = "yhao@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0002-1810-0822")), person(given = "Austin", family = "Hartman", email = "ahartman@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-7278-1852")), - person(given = "Paul", family = "Hoffman", email = "seurat@nygenome.org", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7693-8957")), + person(given = "Paul", family = "Hoffman", email = "hoff0792@umn.edu", role = "ctb", comment = c(ORCID = "0000-0002-7693-8957")), person(given = "Jaison", family = "Jain", email = "jjain@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0002-9478-5018")), + person(given = "Longda", family = "Jiang", email = "ljiang@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0003-4964-6497")), person(given = "Madeline", family = "Kowalski", email = "mkowalski@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0002-5655-7620")), + person(given = "Skylar", family = "Li", email = "sli@nygenome.org", role = "ctb"), + person(given = "Gesmira", family = "Molla", email = 'gmolla@nygenome.org', role = 'ctb', comment = c(ORCID = '0000-0002-8628-5056')), person(given = "Efthymia", family = "Papalexi", email = "epapalexi@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-5898-694X")), person(given = "Patrick", family = "Roelli", email = "proelli@nygenome.org", role = "ctb"), - person(given = "Rahul", family = "Satija", email = "rsatija@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-9448-8833")), + person(given = "Rahul", family = "Satija", email = "seurat@nygenome.org", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9448-8833")), person(given = "Karthik", family = "Shekhar", email = "kshekhar@berkeley.edu", role = "ctb"), person(given = "Avi", family = "Srivastava", email = "asrivastava@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0001-9798-2079")), person(given = "Tim", family = "Stuart", email = "tstuart@nygenome.org", role = "ctb", comment = c(ORCID = "0000-0002-3044-0897")), @@ -26,15 +31,19 @@ Authors@R: c( ) URL: https://satijalab.org/seurat, https://github.com/satijalab/seurat BugReports: https://github.com/satijalab/seurat/issues +Additional_repositories: https://satijalab.r-universe.dev, https://bnprks.r-universe.dev Depends: R (>= 4.0.0), - methods + methods, + SeuratObject (>= 5.0.2) Imports: cluster, cowplot, + fastDummies, fitdistrplus, future, future.apply, + generics (>= 0.1.3), ggplot2 (>= 3.3.0), ggrepel, ggridges, @@ -48,6 +57,7 @@ Imports: jsonlite, KernSmooth, leiden (>= 0.3.1), + lifecycle, lmtest, MASS, Matrix (>= 1.5-0), @@ -58,18 +68,20 @@ Imports: plotly (>= 4.9.0), png, progressr, + purrr, RANN, RColorBrewer, Rcpp (>= 1.0.7), RcppAnnoy (>= 0.0.18), + RcppHNSW, reticulate, rlang, ROCR, + RSpectra, Rtsne, scales, - scattermore (>= 0.7), - sctransform (>= 0.3.5), - SeuratObject (>= 4.1.3), + scattermore (>= 1.2), + sctransform (>= 0.4.1), shiny, spatstat.explore, spatstat.geom, @@ -77,7 +89,7 @@ Imports: tibble, tools, utils, - uwot (>= 0.1.14) + uwot (>= 0.1.10) LinkingTo: Rcpp (>= 0.11.0), RcppEigen, RcppProgress License: MIT + file LICENSE LazyData: true @@ -92,16 +104,22 @@ Collate: 'differential_expression.R' 'dimensional_reduction.R' 'integration.R' + 'zzz.R' + 'integration5.R' 'mixscape.R' 'objects.R' 'preprocessing.R' + 'preprocessing5.R' + 'roxygen.R' + 'sketching.R' 'tree.R' 'utilities.R' - 'zzz.R' -RoxygenNote: 7.2.2 +RoxygenNote: 7.3.2 Encoding: UTF-8 Suggests: ape, + arrow, + BPCells, rsvd, testthat, hdf5r, @@ -125,4 +143,7 @@ Suggests: mixtools, ggrastr, data.table, - R.utils + R.utils, + presto, + DelayedArray, + harmony diff --git a/NAMESPACE b/NAMESPACE index 4984dc2be..dea8dcb78 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,19 +5,25 @@ S3method("SCTResults<-",SCTModel) S3method("[",SlideSeq) S3method("[",VisiumV1) S3method("levels<-",SCTAssay) +S3method(.CalcN,IterableMatrix) S3method(AnnotateAnchors,IntegrationAnchorSet) S3method(AnnotateAnchors,TransferAnchorSet) S3method(AnnotateAnchors,default) +S3method(Cells,SCTAssay) S3method(Cells,SCTModel) S3method(Cells,STARmap) S3method(Cells,SlideSeq) S3method(Cells,VisiumV1) +S3method(Features,SCTAssay) +S3method(Features,SCTModel) +S3method(FetchData,VisiumV1) S3method(FindClusters,Seurat) S3method(FindClusters,default) S3method(FindMarkers,Assay) S3method(FindMarkers,DimReduc) S3method(FindMarkers,SCTAssay) S3method(FindMarkers,Seurat) +S3method(FindMarkers,StdAssay) S3method(FindMarkers,default) S3method(FindNeighbors,Assay) S3method(FindNeighbors,Seurat) @@ -25,14 +31,19 @@ S3method(FindNeighbors,default) S3method(FindNeighbors,dist) S3method(FindSpatiallyVariableFeatures,Assay) S3method(FindSpatiallyVariableFeatures,Seurat) +S3method(FindSpatiallyVariableFeatures,StdAssay) S3method(FindSpatiallyVariableFeatures,default) S3method(FindVariableFeatures,Assay) S3method(FindVariableFeatures,SCTAssay) S3method(FindVariableFeatures,Seurat) +S3method(FindVariableFeatures,StdAssay) +S3method(FindVariableFeatures,V3Matrix) S3method(FindVariableFeatures,default) S3method(FoldChange,Assay) S3method(FoldChange,DimReduc) +S3method(FoldChange,SCTAssay) S3method(FoldChange,Seurat) +S3method(FoldChange,StdAssay) S3method(FoldChange,default) S3method(GetAssay,Seurat) S3method(GetImage,STARmap) @@ -41,17 +52,37 @@ S3method(GetImage,VisiumV1) S3method(GetTissueCoordinates,STARmap) S3method(GetTissueCoordinates,SlideSeq) S3method(GetTissueCoordinates,VisiumV1) +S3method(GetTissueCoordinates,VisiumV2) S3method(HVFInfo,SCTAssay) S3method(IntegrateEmbeddings,IntegrationAnchorSet) S3method(IntegrateEmbeddings,TransferAnchorSet) +S3method(LeverageScore,Assay) +S3method(LeverageScore,Seurat) +S3method(LeverageScore,StdAssay) +S3method(LeverageScore,default) +S3method(LogNormalize,IterableMatrix) +S3method(LogNormalize,V3Matrix) +S3method(LogNormalize,data.frame) +S3method(LogNormalize,default) S3method(MappingScore,AnchorSet) S3method(MappingScore,default) S3method(NormalizeData,Assay) S3method(NormalizeData,Seurat) +S3method(NormalizeData,StdAssay) +S3method(NormalizeData,V3Matrix) S3method(NormalizeData,default) +S3method(ProjectCellEmbeddings,Assay) +S3method(ProjectCellEmbeddings,IterableMatrix) +S3method(ProjectCellEmbeddings,SCTAssay) +S3method(ProjectCellEmbeddings,Seurat) +S3method(ProjectCellEmbeddings,StdAssay) +S3method(ProjectCellEmbeddings,default) S3method(ProjectUMAP,DimReduc) S3method(ProjectUMAP,Seurat) S3method(ProjectUMAP,default) +S3method(PseudobulkExpression,Assay) +S3method(PseudobulkExpression,Seurat) +S3method(PseudobulkExpression,StdAssay) S3method(Radius,STARmap) S3method(Radius,SlideSeq) S3method(Radius,VisiumV1) @@ -61,6 +92,8 @@ S3method(RenameCells,SlideSeq) S3method(RenameCells,VisiumV1) S3method(RunCCA,Seurat) S3method(RunCCA,default) +S3method(RunGraphLaplacian,Seurat) +S3method(RunGraphLaplacian,default) S3method(RunICA,Assay) S3method(RunICA,Seurat) S3method(RunICA,default) @@ -69,11 +102,15 @@ S3method(RunLDA,Seurat) S3method(RunLDA,default) S3method(RunPCA,Assay) S3method(RunPCA,Seurat) +S3method(RunPCA,Seurat5) +S3method(RunPCA,StdAssay) S3method(RunPCA,default) S3method(RunSLSI,Assay) S3method(RunSLSI,Seurat) +S3method(RunSLSI,StdAssay) S3method(RunSLSI,default) S3method(RunSPCA,Assay) +S3method(RunSPCA,Assay5) S3method(RunSPCA,Seurat) S3method(RunSPCA,default) S3method(RunTSNE,DimReduc) @@ -87,22 +124,39 @@ S3method(RunUMAP,default) S3method(SCTResults,SCTAssay) S3method(SCTResults,SCTModel) S3method(SCTResults,Seurat) +S3method(SCTransform,Assay) +S3method(SCTransform,IterableMatrix) +S3method(SCTransform,Seurat) +S3method(SCTransform,StdAssay) +S3method(SCTransform,default) S3method(ScaleData,Assay) +S3method(ScaleData,IterableMatrix) S3method(ScaleData,Seurat) +S3method(ScaleData,StdAssay) S3method(ScaleData,default) S3method(ScaleFactors,VisiumV1) +S3method(ScaleFactors,VisiumV2) S3method(ScoreJackStraw,DimReduc) S3method(ScoreJackStraw,JackStrawData) S3method(ScoreJackStraw,Seurat) +S3method(VST,IterableMatrix) +S3method(VST,default) +S3method(VST,dgCMatrix) +S3method(VST,matrix) +S3method(VariableFeatures,SCTAssay) +S3method(VariableFeatures,SCTModel) S3method(as.CellDataSet,Seurat) S3method(as.Seurat,CellDataSet) S3method(as.Seurat,SingleCellExperiment) S3method(as.SingleCellExperiment,Seurat) S3method(as.data.frame,Matrix) S3method(as.sparse,H5Group) +S3method(as.sparse,IterableMatrix) +S3method(components,SCTAssay) S3method(dim,STARmap) S3method(dim,SlideSeq) S3method(dim,VisiumV1) +S3method(dim,VisiumV2) S3method(fortify,Centroids) S3method(fortify,Molecules) S3method(fortify,Segmentation) @@ -140,7 +194,10 @@ export(BarcodeInflectionsPlot) export(BlackAndWhite) export(BlueAndRed) export(BoldTitle) +export(BridgeCellsRepresentation) export(BuildClusterTree) +export(BuildNicheAssay) +export(CCAIntegration) export(CalcPerturbSig) export(CalculateBarcodeInflections) export(CaseMatch) @@ -155,7 +212,9 @@ export(CollapseSpeciesExpressionMatrix) export(ColorDimSplit) export(CombinePlots) export(Command) +export(CountSketch) export(CreateAssayObject) +export(CreateCategoryMatrix) export(CreateDimReducObject) export(CreateSCTAssayObject) export(CreateSeuratObject) @@ -176,13 +235,17 @@ export(Embeddings) export(ExpMean) export(ExpSD) export(ExpVar) +export(FastRPCAIntegration) export(FastRowScale) export(FeatureLocator) export(FeaturePlot) export(FeatureScatter) export(FetchData) +export(FetchResiduals) export(FilterSlideSeq) export(FindAllMarkers) +export(FindBridgeIntegrationAnchors) +export(FindBridgeTransferAnchors) export(FindClusters) export(FindConservedMarkers) export(FindIntegrationAnchors) @@ -195,6 +258,7 @@ export(FindTransferAnchors) export(FindVariableFeatures) export(FoldChange) export(FontSize) +export(GaussianSketch) export(GeneSymbolThesarus) export(GetAssay) export(GetAssayData) @@ -208,6 +272,7 @@ export(GroupCorrelationPlot) export(HTODemux) export(HTOHeatmap) export(HVFInfo) +export(HarmonyIntegration) export(HoverLocator) export(IFeaturePlot) export(ISpatialDimPlot) @@ -220,21 +285,25 @@ export(Index) export(Indices) export(IntegrateData) export(IntegrateEmbeddings) +export(IntegrateLayers) export(Intensity) export(IsGlobal) export(JS) export(JackStraw) export(JackStrawPlot) +export(JointPCAIntegration) export(Key) export(L2CCA) export(L2Dim) export(LabelClusters) export(LabelPoints) +export(LeverageScore) export(LinkedDimPlot) export(LinkedFeaturePlot) export(Load10X_Spatial) export(LoadAkoya) export(LoadAnnoyIndex) +export(LoadCurioSeeker) export(LoadHuBMAPCODEX) export(LoadNanostring) export(LoadSTARmap) @@ -256,6 +325,7 @@ export(MixingMetric) export(MixscapeHeatmap) export(MixscapeLDA) export(NNPlot) +export(NNtoGraph) export(Neighbors) export(NoAxes) export(NoGrid) @@ -274,14 +344,24 @@ export(PredictAssay) export(PrepLDA) export(PrepSCTFindMarkers) export(PrepSCTIntegration) +export(PrepareBridgeReference) export(Project) +export(ProjectCellEmbeddings) +export(ProjectData) export(ProjectDim) +export(ProjectDimReduc) +export(ProjectIntegration) export(ProjectUMAP) +export(PseudobulkExpression) export(PurpleAndYellow) +export(RPCAIntegration) export(Radius) export(Read10X) +export(Read10X_Coordinates) export(Read10X_Image) +export(Read10X_ScaleFactors) export(Read10X_h5) +export(Read10X_probe_metadata) export(ReadAkoya) export(ReadMtx) export(ReadNanostring) @@ -302,6 +382,7 @@ export(RidgePlot) export(RotatedAxis) export(RowMergeSparseMatrices) export(RunCCA) +export(RunGraphLaplacian) export(RunICA) export(RunLDA) export(RunMarkVario) @@ -321,6 +402,8 @@ export(ScaleData) export(ScaleFactors) export(ScoreJackStraw) export(SelectIntegrationFeatures) +export(SelectIntegrationFeatures5) +export(SelectSCTIntegrationFeatures) export(SetAssayData) export(SetIdent) export(SetIntegrationData) @@ -334,6 +417,7 @@ export(SingleImageMap) export(SingleImagePlot) export(SingleRasterMap) export(SingleSpatialPlot) +export(SketchData) export(SpatialDimPlot) export(SpatialFeaturePlot) export(SpatialPlot) @@ -349,10 +433,13 @@ export(TopCells) export(TopFeatures) export(TopNeighbors) export(TransferData) +export(TransferSketchLabels) export(UMAPPlot) +export(UnSketchEmbeddings) export(UpdateSCTAssays) export(UpdateSeuratObject) export(UpdateSymbolList) +export(VST) export(VariableFeaturePlot) export(VariableFeatures) export(VizDimLoadings) @@ -365,9 +452,11 @@ export(as.Neighbor) export(as.Seurat) export(as.SingleCellExperiment) export(as.sparse) +export(components) export(scalefactors) exportClasses(AnchorSet) exportClasses(Assay) +exportClasses(BridgeReferenceSet) exportClasses(DimReduc) exportClasses(Graph) exportClasses(IntegrationAnchorSet) @@ -380,9 +469,11 @@ exportClasses(SeuratCommand) exportClasses(SpatialImage) exportClasses(TransferAnchorSet) exportClasses(VisiumV1) +exportClasses(VisiumV2) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(SeuratObject,Assay) importClassesFrom(SeuratObject,DimReduc) +importClassesFrom(SeuratObject,FOV) importClassesFrom(SeuratObject,Graph) importClassesFrom(SeuratObject,JackStrawData) importClassesFrom(SeuratObject,Neighbor) @@ -390,6 +481,7 @@ importClassesFrom(SeuratObject,Seurat) importClassesFrom(SeuratObject,SeuratCommand) importClassesFrom(SeuratObject,SpatialImage) importFrom(KernSmooth,bkde) +importFrom(MASS,ginv) importFrom(MASS,glm.nb) importFrom(MASS,lda) importFrom(Matrix,Matrix) @@ -397,41 +489,57 @@ importFrom(Matrix,as.matrix) importFrom(Matrix,colMeans) importFrom(Matrix,colSums) importFrom(Matrix,crossprod) +importFrom(Matrix,diag) +importFrom(Matrix,qrR) importFrom(Matrix,readMM) importFrom(Matrix,rowMeans) importFrom(Matrix,rowSums) importFrom(Matrix,sparse.model.matrix) importFrom(Matrix,sparseMatrix) +importFrom(Matrix,summary) importFrom(Matrix,t) importFrom(RANN,nn2) importFrom(RColorBrewer,brewer.pal) importFrom(RColorBrewer,brewer.pal.info) importFrom(ROCR,performance) importFrom(ROCR,prediction) +importFrom(RSpectra,eigs_sym) importFrom(Rcpp,evalCpp) importFrom(RcppAnnoy,AnnoyAngular) importFrom(RcppAnnoy,AnnoyEuclidean) importFrom(RcppAnnoy,AnnoyHamming) importFrom(RcppAnnoy,AnnoyManhattan) +importFrom(RcppHNSW,hnsw_build) +importFrom(RcppHNSW,hnsw_search) importFrom(Rtsne,Rtsne) importFrom(SeuratObject,"%!NA%") importFrom(SeuratObject,"%NA%") importFrom(SeuratObject,"%iff%") importFrom(SeuratObject,"%||%") importFrom(SeuratObject,"DefaultAssay<-") +importFrom(SeuratObject,"DefaultLayer<-") importFrom(SeuratObject,"Idents<-") importFrom(SeuratObject,"Index<-") importFrom(SeuratObject,"JS<-") importFrom(SeuratObject,"Key<-") +importFrom(SeuratObject,"LayerData<-") importFrom(SeuratObject,"Loadings<-") importFrom(SeuratObject,"Misc<-") importFrom(SeuratObject,"Project<-") importFrom(SeuratObject,"Tool<-") importFrom(SeuratObject,"VariableFeatures<-") +importFrom(SeuratObject,.CalcN) +importFrom(SeuratObject,.CheckFmargin) +importFrom(SeuratObject,.FilterObjects) +importFrom(SeuratObject,.IsFutureSeurat) +importFrom(SeuratObject,.MARGIN) +importFrom(SeuratObject,.PropagateList) +importFrom(SeuratObject,.SparseSlots) importFrom(SeuratObject,AddMetaData) importFrom(SeuratObject,Assays) importFrom(SeuratObject,AttachDeps) importFrom(SeuratObject,Boundaries) +importFrom(SeuratObject,CastAssay) importFrom(SeuratObject,Cells) importFrom(SeuratObject,CellsByIdentities) importFrom(SeuratObject,Command) @@ -439,14 +547,17 @@ importFrom(SeuratObject,CreateAssayObject) importFrom(SeuratObject,CreateCentroids) importFrom(SeuratObject,CreateDimReducObject) importFrom(SeuratObject,CreateFOV) +importFrom(SeuratObject,CreateMolecules) importFrom(SeuratObject,CreateSegmentation) importFrom(SeuratObject,CreateSeuratObject) importFrom(SeuratObject,DefaultAssay) importFrom(SeuratObject,DefaultBoundary) importFrom(SeuratObject,DefaultDimReduc) importFrom(SeuratObject,DefaultFOV) +importFrom(SeuratObject,DefaultLayer) importFrom(SeuratObject,Distances) importFrom(SeuratObject,Embeddings) +importFrom(SeuratObject,EmptyDF) importFrom(SeuratObject,Features) importFrom(SeuratObject,FetchData) importFrom(SeuratObject,GetAssayData) @@ -458,9 +569,13 @@ importFrom(SeuratObject,Images) importFrom(SeuratObject,Index) importFrom(SeuratObject,Indices) importFrom(SeuratObject,IsGlobal) +importFrom(SeuratObject,IsSparse) importFrom(SeuratObject,JS) +importFrom(SeuratObject,JoinLayers) importFrom(SeuratObject,Key) importFrom(SeuratObject,Keys) +importFrom(SeuratObject,LayerData) +importFrom(SeuratObject,Layers) importFrom(SeuratObject,Loadings) importFrom(SeuratObject,LogSeuratCommand) importFrom(SeuratObject,Misc) @@ -471,6 +586,7 @@ importFrom(SeuratObject,PackageCheck) importFrom(SeuratObject,Project) importFrom(SeuratObject,Radius) importFrom(SeuratObject,Reductions) +importFrom(SeuratObject,RenameAssays) importFrom(SeuratObject,RenameCells) importFrom(SeuratObject,RenameIdents) importFrom(SeuratObject,ReorderIdent) @@ -478,11 +594,14 @@ importFrom(SeuratObject,RowMergeSparseMatrices) importFrom(SeuratObject,SVFInfo) importFrom(SeuratObject,SetAssayData) importFrom(SeuratObject,SetIdent) +importFrom(SeuratObject,SparseEmptyMatrix) importFrom(SeuratObject,SpatiallyVariableFeatures) importFrom(SeuratObject,StashIdent) importFrom(SeuratObject,Stdev) +importFrom(SeuratObject,StitchMatrix) importFrom(SeuratObject,Tool) importFrom(SeuratObject,UpdateSeuratObject) +importFrom(SeuratObject,UpdateSlots) importFrom(SeuratObject,VariableFeatures) importFrom(SeuratObject,WhichCells) importFrom(SeuratObject,as.Graph) @@ -493,11 +612,13 @@ importFrom(cluster,clara) importFrom(cowplot,get_legend) importFrom(cowplot,plot_grid) importFrom(cowplot,theme_cowplot) +importFrom(fastDummies,dummy_cols) importFrom(fitdistrplus,fitdist) importFrom(future,nbrOfWorkers) importFrom(future,plan) importFrom(future.apply,future_lapply) importFrom(future.apply,future_sapply) +importFrom(generics,components) importFrom(ggplot2,Geom) importFrom(ggplot2,GeomPolygon) importFrom(ggplot2,GeomViolin) @@ -634,6 +755,11 @@ importFrom(irlba,irlba) importFrom(jsonlite,fromJSON) importFrom(jsonlite,read_json) importFrom(leiden,leiden) +importFrom(lifecycle,deprecate_soft) +importFrom(lifecycle,deprecate_stop) +importFrom(lifecycle,deprecate_warn) +importFrom(lifecycle,deprecated) +importFrom(lifecycle,is_present) importFrom(lmtest,lrtest) importFrom(matrixStats,rowAnyNAs) importFrom(matrixStats,rowMeans2) @@ -670,14 +796,31 @@ importFrom(plotly,plot_ly) importFrom(plotly,raster2uri) importFrom(png,readPNG) importFrom(progressr,progressor) +importFrom(purrr,imap) importFrom(reticulate,import) importFrom(reticulate,py_module_available) importFrom(reticulate,py_set_seed) +importFrom(rlang,"!!!") importFrom(rlang,"!!") +importFrom(rlang,abort) +importFrom(rlang,arg_match) +importFrom(rlang,arg_match0) importFrom(rlang,as_label) -importFrom(rlang,invoke) +importFrom(rlang,as_name) +importFrom(rlang,caller_env) +importFrom(rlang,check_installed) +importFrom(rlang,enquo) +importFrom(rlang,exec) +importFrom(rlang,inform) +importFrom(rlang,is_integerish) importFrom(rlang,is_na) +importFrom(rlang,is_quosure) +importFrom(rlang,is_scalar_character) +importFrom(rlang,is_scalar_integerish) +importFrom(rlang,quo_get_env) +importFrom(rlang,quo_get_expr) importFrom(rlang,sym) +importFrom(rlang,warn) importFrom(scales,brewer_pal) importFrom(scales,hue_pal) importFrom(scales,rescale) @@ -755,6 +898,7 @@ importFrom(utils,globalVariables) importFrom(utils,head) importFrom(utils,isS3method) importFrom(utils,isS3stdGeneric) +importFrom(utils,lsf.str) importFrom(utils,methods) importFrom(utils,packageVersion) importFrom(utils,read.csv) diff --git a/NEWS.md b/NEWS.md index 75ed99993..2ad02f413 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,131 @@ # Unreleased + ## Changes -- Fix bug in `as.Seurat.SingleCellExperiment()` ([#6692](https://github.com/satijalab/seurat/issues/6692)) +- Surfaced more fine-grained control over what parts of a Xenium experiment are loaded in `LoadXenium` +- Added ability to load Xenium nucleus segmentation masks +- Updated `LoadXenium` to also read some run metadata (run start time, preservation method, panel used, organism, tissue type, instrument software version and stain kit used) into `misc` slot +- Updated `ReadXenium` to load cell_feature_matrix.h5 when present in favor of the MEX format files +- Added ability to read Xenium `segmentation_method` directly into `meta.data` +- Updated `ReadXenium` to load .parquet files using `arrow` instead of .csv.gz files to support XOA 3.0 +- Updated `RunSLSI` to support `BPCells` matrices +- Fixed `LoadXenium` to accommodate datasets without "Blank Codeword" or "Unassigned Codeword" matrices +- Fixed `ReadXenium` to properly parse multiple molecular outputs at once ([#8265](https://github.com/satijalab/seurat/issues/8265)) +- Fixed `RunPCA` to avoid converting `BPCells` matrices into dense matrices - significantly reduces the function's memory usage when running on `BPCells` matrices +- Added `features` parameter to `LeverageScore` and `SketchData` +- Updated `SketchData`'s `ncells` parameter to accept integer vector + +# Seurat 5.1.0 (2024-05-08) + +## Changes +- Updated interactive `SpatialDimPlot`s to display spatial coordinates +- Updated `BuildClusterTree` to `PseudobulkExpression(..., method = "aggregate")` instead of `AverageExpression` +- Fixed `SpatialPlot` to properly handle images with shared cells +- Added `cluster.name` parameter to `BuildNicheAssay` +- Added `VisiumV2` class, inherits `SeuratObject::FOV`, returned by `Load10X_Spatial` +- Updated `Load10X_Spatial` to support loading Visium HD datasets - added `bin.size` parameter +- Updated `Read10X_Coordinates` to handle new `tissue_coordinates.parquet` files +- Added `shape` parameter to `SpatialPlot` and wrappers: `SpatialFeaturePlot` and `SpatialDimPlot` +- Added `image.scale` parameter to `SpatialPlot` and related functions: `SpatialFeaturePlot`/`ISpatialFeaturePlot` and `SpatialDimPlot`/`ISpatialDimPlot` +- Restored `image.name` parameter to `Read10X_Image` +- Added `scale` parameter to `Radius.VisiumV1` +- Fixed calculation of `spot.radius` attribute for `VisiumV1` instance returned by `Read10X_Image` — replace `scale.factors$fiducial` with `scale.factors$spot` +- Added `Read10X_Coordinates` and `Read10X_ScaleFactors` +- Fixed `SpatialPlot` to properly handle mutli-assay `Seurat` instances + +# Seurat 5.0.3 (2024-03-18) + +## Changes +- Fixed `PercentAbove` to discount null values ([#8412](https://github.com/satijalab/seurat/issues/8412)) +- Added `log` parameter to `FeatureScatter` +- Fixed handling of `clip.range` for `SCTransform` when `ncells` is less than the size of the passed dataset + +# Seurat 5.0.2 (2024-02-28) + +## Changes + +- Fixed `AverageExpression` to handle `features` vector when `layer="scale.data"` and `return.seurat=TRUE` +- Added `fc.slot` parameter to `FindMarkers` +- Fixed `SCTransform` to handle `vars.to.regress` ([#8148](https://github.com/satijalab/seurat/issues/8148)) and ([#8349](https://github.com/satijalab/seurat/issues/8349)) +- Fixed `SCTransform` to handle fetching residuals ([#8185](https://github.com/satijalab/seurat/issues/8185)) + +# Seurat 5.0.1 (2023-11-16) + +## Changes + +- Fixed `SCTransform.StdAssay` to pass extra arguments to `sctransform::vst()`. Fixes [#875](https://github.com/satijalab/seurat/issues/7998) +- Fixed PercentageFeatureSet Layer calling [(#8009)](https://github.com/satijalab/seurat/issues/8009) +- Fixed cell highlighting [(#7914)](https://github.com/satijalab/seurat/pull/7914) +- Updated marker sorting to be by p-value with ties broken by absolute difference in percent expression +- Fixed issue with replicated barcodes in MappingScore [(#7922)](https://github.com/satijalab/seurat/issues/7922) +- Improved `PseudobulkExpression` by adding 'g' to cell names that started with numeric values +- Improved `PseudobulkExpression` by adding each variable specified in `group.by` as columns in the object metadata when `return.seurat=TRUE` +- Fixed `DimPlot` and `FeatureScatter` which were breaking when using the `split.by` argument with a variable that contained NAs + +# Seurat 5.0.0 (2023-10-25) + +## Added +- Add `BridgeCellsRepresentation` to construct a dictionary representation for each unimodal dataset. +- Add `BuildNicheAssay` to construct a new assay where each feature is a cell label. The values represent the sum of a particular cell label neighboring a given cell. +- Add `CalcDispersion` to calculate the dispersion of features. +- Add `CCAIntegration` to perform Seurat-CCA Integration. +- Add `CountSketch` to generate a CountSketch random matrix. +- Add `CreateCategoryMatrix` to create a one-hot matrix for a given label. +- Add `DISP` to find variable features based on dispersion. +- Add `FastRPCAIntegration` as a convenience wrapper function around the following three functions that are often run together when performing integration. +- Add `FetchResiduals_reference` as a temporary function to get residuals from the reference. +- Add `FetchResiduals` to call sctransform::get_residuals. +- Add `FetchResidualSCTModel` to calculate Pearson residuals of features not in the scale.data. +- Add `FindBridgeAnchor` to find bridge anchors between two unimodal datasets. +- Add `FindBridgeIntegrationAnchors` to find a set of anchors for integration between unimodal query and the other unimodal reference using a pre-computed BridgeReferenceSet. +- Add `FindBridgeTransferAnchors` to find a set of anchors for label transfer between unimodal query and the other unimodal reference using a pre-computed BridgeReferenceSet. +- Add `GaussianSketch` to perform Gaussian sketching. +- Add `HarmonyIntegration` to perform Harmony integration. +- Add `IntegrateLayers` to integrate layers in an assay object. +- Add `JointPCAIntegration` to perform Seurat-Joint PCA Integration. +- Add `LeverageScore` to compute the leverage scores for a given object. +- Add `LoadCurioSeeker` to load Curio Seeker data. +- Add `MVP` to find variable features based on mean.var.plot. +- Add `NNtoGraph` to convert the Neighbor class to an asymmetrical Graph class. +- Add `PrepareBridgeReference` to preprocess the multi-omic bridge and unimodal reference datasets into an extended reference. +- Add `ProjectCellEmbeddings` to project query data onto the reference dimensional reduction. +- Add `ProjectData` to project high-dimensional single-cell RNA expression data from a full dataset onto the lower-dimensional embedding of the sketch of the dataset. +- Add `ProjectDimReduc` to project query data to reference dimensional reduction. +- Add `ProjectIntegration` to integrate embeddings from the integrated sketched.assay. +- Add `PseudobulkExpression` to normalize the count data present in a given assay. +- Add `Read10X_probe_metadata` to read the probe metadata from a 10x Genomics probe barcode matrix file in HDF5 format. +- Add `RPCAIntegration` to perform Seurat-RPCA Integration. +- Add `RunGraphLaplacian` to run a graph Laplacian dimensionality reduction. +- Add `SelectIntegrationFeatures5` to select integration features for v5 assays. +- Add `SelectSCTIntegrationFeatures` to select SCT integration features. +- Add `SketchData` to use sketching methods to downsample high-dimensional single-cell RNA expression data for help with scalability for large datasets. +- Add `TransferSketchLabels` to transfer cell type labels from a sketched dataset to a full dataset based on the similarities in the lower-dimensional space. +- Add `UnSketchEmbeddings` to transfer embeddings from sketched cells to the full data. +- Add `VST` to apply a variance stabilizing transformation for selection of variable features. + +## Changes +- Change `FindTransferAnchors` so that anchor filtering is not performed by default +- Change `merge` so that layers will be added to a single Seurat object instead of combining raw count matrices +- Deprecate `slot` parameter in favor of `layers` in accessor and set methods + +# Seurat 4.4.0 (2023-09-27) + +## Added +- Add parallelization support with speed improvements for `PrepSCTFindMarkers` +- Fix bug in `LoadNanostring`([#7566](https://github.com/satijalab/seurat/pull/7566)) +## Changes +- Fix bug in `as.Seurat.SingleCellExperiment()` ([#6692](https://github.com/satijalab/seurat/issues/6692)) +- Support for Visium probe information introduced in Spaceranger 2.1 ([#7141](https://github.com/satijalab/seurat/pull/7141)) +- Add `LoadCurioSeeker` to load sequencing-based spatial datasets generated using the Curio Seeker +- Fix fold change calculation for assays ([#7095](https://github.com/satijalab/seurat/issues/7095)) +- Fix `pt.size` bug when rasterization is set to true ([#7379](https://github.com/satijalab/seurat/issues/7379)) +- Fix `FoldChange` and `FindMarkers` to support all normalization approaches ([#7115](https://github.com/satijalab/seurat/pull/7115),[#7110](https://github.com/satijalab/seurat/issues/7110),[#7095](https://github.com/satijalab/seurat/issues/7095),[#6976](https://github.com/satijalab/seurat/issues/6976),[#6654](https://github.com/satijalab/seurat/issues/6654),[#6701](https://github.com/satijalab/seurat/issues/6701),[#6773](https://github.com/satijalab/seurat/issues/6773), [#7107](https://github.com/satijalab/seurat/issues/7107)) +- Fix for handling newer ParseBio formats in `ReadParseBio` ([#7565](https://github.com/satijalab/seurat/pull/7565)) +- Fix for handling rasterization by default ([#7842](https://github.com/satijalab/seurat/pull/7842)) +- Fix bug in `ReadMtx()` to add back missing parameters +- Fix `SCTransform()` for V5 assays to retain gene attributes ([#7557](https://github.com/satijalab/seurat/issues/7557)) +- Fix `LeverageScore()` for objects with few features ([#7650](https://github.com/satijalab/seurat/issues/7650) + # Seurat 4.3.0 (2022-11-18) ## Added diff --git a/R/clustering.R b/R/clustering.R index 786a18fef..c20719b92 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -403,6 +403,7 @@ FindClusters.default <- function( #' @importFrom methods is #' #' @param graph.name Name of graph to use for the clustering algorithm +#' @param cluster.name Name of output clusters #' #' @rdname FindClusters #' @export @@ -412,6 +413,7 @@ FindClusters.default <- function( FindClusters.Seurat <- function( object, graph.name = NULL, + cluster.name = NULL, modularity.fxn = 1, initial.membership = NULL, node.sizes = NULL, @@ -452,9 +454,18 @@ FindClusters.Seurat <- function( verbose = verbose, ... ) - colnames(x = clustering.results) <- paste0(graph.name, "_", colnames(x = clustering.results)) - object <- AddMetaData(object = object, metadata = clustering.results) - Idents(object = object) <- colnames(x = clustering.results)[ncol(x = clustering.results)] + cluster.name <- cluster.name %||% + paste( + graph.name, + names(x = clustering.results), + sep = '_' + ) + names(x = clustering.results) <- cluster.name + # object <- AddMetaData(object = object, metadata = clustering.results) + # Idents(object = object) <- colnames(x = clustering.results)[ncol(x = clustering.results)] + idents.use <- names(x = clustering.results)[ncol(x = clustering.results)] + object[[]] <- clustering.results + Idents(object = object, replace = TRUE) <- object[[idents.use, drop = TRUE]] levels <- levels(x = object) levels <- tryCatch( expr = as.numeric(x = levels), @@ -496,7 +507,6 @@ FindClusters.Seurat <- function( #' @param nn.eps Error bound when performing nearest neighbor seach using RANN; #' default of 0.0 implies exact nearest neighbor search #' @param verbose Whether or not to print output to the console -#' @param force.recalc Force recalculation of (S)NN. #' @param l2.norm Take L2Norm of the data #' @param cache.index Include cached index in returned Neighbor object #' (only relevant if return.neighbor = TRUE) @@ -524,7 +534,6 @@ FindNeighbors.default <- function( annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, l2.norm = FALSE, cache.index = FALSE, index = NULL, @@ -634,7 +643,6 @@ FindNeighbors.Assay <- function( annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, l2.norm = FALSE, cache.index = FALSE, ... @@ -652,7 +660,6 @@ FindNeighbors.Assay <- function( annoy.metric = annoy.metric, nn.eps = nn.eps, verbose = verbose, - force.recalc = force.recalc, l2.norm = l2.norm, return.neighbor = return.neighbor, cache.index = cache.index, @@ -677,7 +684,6 @@ FindNeighbors.dist <- function( annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, l2.norm = FALSE, cache.index = FALSE, ... @@ -694,7 +700,6 @@ FindNeighbors.dist <- function( n.trees = n.trees, annoy.metric = annoy.metric, verbose = verbose, - force.recalc = force.recalc, l2.norm = l2.norm, return.neighbor = return.neighbor, cache.index = cache.index, @@ -739,7 +744,6 @@ FindNeighbors.Seurat <- function( annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, do.plot = FALSE, graph.name = NULL, l2.norm = FALSE, @@ -764,7 +768,6 @@ FindNeighbors.Seurat <- function( annoy.metric = annoy.metric, nn.eps = nn.eps, verbose = verbose, - force.recalc = force.recalc, l2.norm = l2.norm, return.neighbor = return.neighbor, cache.index = cache.index, @@ -772,9 +775,8 @@ FindNeighbors.Seurat <- function( ) } else { assay <- assay %||% DefaultAssay(object = object) - data.use <- GetAssay(object = object, assay = assay) neighbor.graphs <- FindNeighbors( - object = data.use, + object = object[[assay]], features = features, k.param = k.param, compute.SNN = compute.SNN, @@ -784,7 +786,6 @@ FindNeighbors.Seurat <- function( annoy.metric = annoy.metric, nn.eps = nn.eps, verbose = verbose, - force.recalc = force.recalc, l2.norm = l2.norm, return.neighbor = return.neighbor, cache.index = cache.index, @@ -1608,6 +1609,10 @@ NNHelper <- function(data, query = data, k, method, cache.index = FALSE, ...) { args <- args[intersect(x = names(x = args), y = names(x = formals(fun = AnnoyNN)))] do.call(what = 'AnnoyNN', args = args) }, + "hnsw" = { + args <- args[intersect(x = names(x = args), y = names(x = formals(fun = HnswNN)))] + do.call(what = 'HnswNN', args = args) + }, stop("Invalid method. Please choose one of 'rann', 'annoy'") ) ) diff --git a/R/convenience.R b/R/convenience.R index ff0e91900..6786fa03e 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -9,7 +9,7 @@ NULL #' @param fov Name to store FOV as #' @param assay Name to store expression matrix as -#' @inheritDotParams ReadAkoya +#' @param ... Ignored #' #' @return \code{LoadAkoya}: A \code{\link[SeuratObject]{Seurat}} object #' @@ -172,41 +172,110 @@ LoadVizgen <- function(data.dir, fov, assay = 'Vizgen', z = 3L) { #' @return \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object #' -#' @param data.dir Path to folder containing Nanostring SMI outputs +#' @param data.dir Path to folder containing Xenium outputs #' @param fov FOV name #' @param assay Assay name +#' @param mols.qv.threshold Remove transcript molecules with +#' a QV less than this threshold. QV >= 20 is the standard threshold +#' used to construct the cell x gene count matrix. +#' @param cell.centroids Whether or not to load cell centroids +#' @param molecule.coordinates Whether or not to load molecule pixel coordinates +#' @param segmentations One of "cell", "nucleus" or NULL (to load either cell +#' segmentations, nucleus segmentations or neither) +#' @param flip.xy Whether or not to flip the x/y coordinates of the Xenium outputs +#' to match what is displayed in Xenium Explorer, or fit on your screen better. #' #' @importFrom SeuratObject Cells CreateCentroids CreateFOV -#' CreateSegmentation CreateSeuratObject +#' CreateSegmentation CreateSeuratObject CreateMolecules #' #' @export #' #' @rdname ReadXenium #' -LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { +LoadXenium <- function( + data.dir, + fov = 'fov', + assay = 'Xenium', + mols.qv.threshold = 20, + cell.centroids = TRUE, + molecule.coordinates = TRUE, + segmentations = NULL, + flip.xy = FALSE +) { + if(!is.null(segmentations) && !(segmentations %in% c('nucleus', 'cell'))) { + stop('segmentations must be NULL or one of "nucleus", "cell"') + } + + if(!cell.centroids && is.null(segmentations)) { + stop( + "Must load either centroids or cell/nucleus segmentations" + ) + } + data <- ReadXenium( data.dir = data.dir, - type = c("centroids", "segmentations"), + type = c("centroids", "segmentations", "nucleus_segmentations")[ + c(cell.centroids, isTRUE(segmentations == 'cell'), isTRUE(segmentations == 'nucleus')) + ], + outs = c("segmentation_method", "matrix", "microns")[ + c(cell.centroids || isTRUE(segmentations != 'nucleus'), TRUE, molecule.coordinates && (cell.centroids || !is.null(segmentations))) + ], + mols.qv.threshold = mols.qv.threshold, + flip.xy = flip.xy ) - segmentations.data <- list( - "centroids" = CreateCentroids(data$centroids), - "segmentation" = CreateSegmentation(data$segmentations) - ) - coords <- CreateFOV( - coords = segmentations.data, - type = c("segmentation", "centroids"), - molecules = data$microns, - assay = assay + segmentations <- intersect(c("segmentations", "nucleus_segmentations"), names(data)) + + segmentations.data <- Filter(Negate(is.null), list( + centroids = if(is.null(data$centroids)) { + NULL + } else { + CreateCentroids(data$centroids) + }, + segmentations = if(length(segmentations) > 0) { + CreateSegmentation( + data[[segmentations]] + ) + } else { + NULL + } + )) + + coords <- if(length(segmentations.data) > 0) { + CreateFOV( + segmentations.data, + assay = assay, + molecules = if(is.null(data$microns)) { + NULL + } else { + CreateMolecules(data$microns) + } + ) + } else { + NULL + } + + slot.map <- c( + `Blank Codeword` = 'BlankCodeword', + `Unassigned Codeword` = 'BlankCodeword', + `Negative Control Codeword` = 'ControlCodeword', + `Negative Control Probe` = 'ControlProbe', + `Genomic Control` = 'GenomicControl' ) xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) - if("Blank Codeword" %in% names(data$matrix)) - xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Blank Codeword"]]) - else - xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Unassigned Codeword"]]) - xenium.obj[["ControlCodeword"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Codeword"]]) - xenium.obj[["ControlProbe"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Probe"]]) + + if(!is.null(data$metadata)) { + Misc(xenium.obj, 'run_metadata') <- data$metadata + } + + if(!is.null(data$segmentation_method)) { + xenium.obj <- AddMetaData(xenium.obj, data$segmentation_method) + } + + for(name in intersect(names(slot.map), names(data$matrix))) { + xenium.obj[[slot.map[name]]] <- CreateAssayObject(counts = data$matrix[[name]]) + } xenium.obj[[fov]] <- coords return(xenium.obj) @@ -258,7 +327,9 @@ SpatialDimPlot <- function( pt.size.factor = 1.6, alpha = c(1, 1), image.alpha = 1, - stroke = 0.25, + image.scale = "lowres", + shape = 21, + stroke = NA, label.box = TRUE, interactive = FALSE, information = NULL @@ -281,6 +352,8 @@ SpatialDimPlot <- function( pt.size.factor = pt.size.factor, alpha = alpha, image.alpha = image.alpha, + image.scale = image.scale, + shape = shape, stroke = stroke, label.box = label.box, interactive = interactive, @@ -307,7 +380,9 @@ SpatialFeaturePlot <- function( pt.size.factor = 1.6, alpha = c(1, 1), image.alpha = 1, - stroke = 0.25, + image.scale = "lowres", + shape = 21, + stroke = NA, interactive = FALSE, information = NULL ) { @@ -325,6 +400,8 @@ SpatialFeaturePlot <- function( pt.size.factor = pt.size.factor, alpha = alpha, image.alpha = image.alpha, + image.scale = image.scale, + shape = shape, stroke = stroke, interactive = interactive, information = information @@ -391,7 +468,8 @@ SpecificDimPlot <- function(object, ...) { #' @export #' ReadParseBio <- function(data.dir, ...) { - mtx <- file.path(data.dir, "DGE.mtx") + file.dir <- list.files(path = data.dir, pattern = ".mtx") + mtx <- file.path(data.dir, file.dir) cells <- file.path(data.dir, "cell_metadata.csv") features <- file.path(data.dir, "all_genes.csv") return(ReadMtx( diff --git a/R/differential_expression.R b/R/differential_expression.R index c96e3fea4..515491545 100644 --- a/R/differential_expression.R +++ b/R/differential_expression.R @@ -46,10 +46,10 @@ FindAllMarkers <- function( object, assay = NULL, features = NULL, - logfc.threshold = 0.25, + logfc.threshold = 0.1, test.use = 'wilcox', slot = 'data', - min.pct = 0.1, + min.pct = 0.01, min.diff.pct = -Inf, node = NULL, verbose = TRUE, @@ -164,7 +164,7 @@ FindAllMarkers <- function( subset = (myAUC > return.thresh | myAUC < (1 - return.thresh)) ) } else if (is.null(x = node) || test.use %in% c('bimod', 't')) { - gde <- gde[order(gde$p_val, -gde[, 2]), ] + gde <- gde[order(gde$p_val, -abs(gde$pct.1-gde$pct.2)), ] gde <- subset(x = gde, subset = p_val < return.thresh) } if (nrow(x = gde) > 0) { @@ -409,19 +409,25 @@ FindConservedMarkers <- function( # Methods for Seurat-defined generics #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#' @param slot Slot to pull data from; note that if \code{test.use} is +#' "negbinom", "poisson", or "DESeq2", \code{slot} will be set to "counts" #' @param cells.1 Vector of cell names belonging to group 1 #' @param cells.2 Vector of cell names belonging to group 2 -#' @param counts Count matrix if using scale.data for DE tests. This is used for -#' computing pct.1 and pct.2 and for filtering features based on fraction -#' expressing #' @param features Genes to test. Default is to use all genes +#' @param slot Slot to pull data from; note that if \code{test.use} is +#' "negbinom", "poisson", or "DESeq2", \code{slot} will be set to "counts" #' @param logfc.threshold Limit testing to genes which show, on average, at least -#' X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +#' X-fold difference (log-scale) between the two groups of cells. Default is 0.1 #' Increasing logfc.threshold speeds up the function, but can miss weaker signals. +#' If the \code{slot} parameter is "scale.data" no filtering is performed. #' @param test.use Denotes which test to use. Available options are: #' \itemize{ #' \item{"wilcox"} : Identifies differentially expressed genes between two -#' groups of cells using a Wilcoxon Rank Sum test (default) +#' groups of cells using a Wilcoxon Rank Sum test (default); will use a fast +#' implementation by Presto if installed +#' \item{"wilcox_limma"} : Identifies differentially expressed genes between two +#' groups of cells using the limma implementation of the Wilcoxon Rank Sum test; +#' set this option to reproduce results from Seurat v4 #' \item{"bimod"} : Likelihood-ratio test for single cell gene expression, #' (McDavid et al., Bioinformatics, 2013) #' \item{"roc"} : Identifies 'markers' of gene expression using ROC analysis. @@ -460,11 +466,11 @@ FindConservedMarkers <- function( #' } #' @param min.pct only test genes that are detected in a minimum fraction of #' min.pct cells in either of the two populations. Meant to speed up the function -#' by not testing genes that are very infrequently expressed. Default is 0.1 +#' by not testing genes that are very infrequently expressed. Default is 0.01 #' @param min.diff.pct only test genes that show a minimum difference in the #' fraction of detection between the two groups. Set to -Inf by default -#' @param only.pos Only return positive markers (FALSE by default) #' @param verbose Print a progress bar once expression testing begins +#' @param only.pos Only return positive markers (FALSE by default) #' @param max.cells.per.ident Down sample each identity class to a max number. #' Default is no downsampling. Not activated by default (set to Inf) #' @param random.seed Random seed for downsampling @@ -473,10 +479,9 @@ FindConservedMarkers <- function( #' @param min.cells.feature Minimum number of cells expressing the feature in at least one #' of the two groups, currently only used for poisson and negative binomial tests #' @param min.cells.group Minimum number of cells in one of the groups -#' @param pseudocount.use Pseudocount to add to averaged expression values when -#' calculating logFC. 1 by default. #' @param fc.results data.frame from FoldChange -#' @param densify Convert the sparse matrix to a dense form before running the DE test. This can provide speedups but might require higher memory; default is FALSE +#' @param densify Convert the sparse matrix to a dense form before running the +#' DE test. This can provide speedups but might require higher memory; default is FALSE #' #' #' @importFrom Matrix rowMeans @@ -490,13 +495,12 @@ FindConservedMarkers <- function( FindMarkers.default <- function( object, slot = "data", - counts = numeric(), cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, - test.use = 'wilcox', - min.pct = 0.1, + logfc.threshold = 0.1, + test.use = "wilcox", + min.pct = 0.01, min.diff.pct = -Inf, verbose = TRUE, only.pos = FALSE, @@ -505,12 +509,10 @@ FindMarkers.default <- function( latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, - pseudocount.use = 1, fc.results = NULL, densify = FALSE, ... ) { - pseudocount.use <- pseudocount.use %||% 1 ValidateCellGroups( object = object, cells.1 = cells.1, @@ -524,11 +526,6 @@ FindMarkers.default <- function( min.diff.pct <- -Inf logfc.threshold <- 0 } - data <- switch( - EXPR = slot, - 'scale.data' = counts, - object - ) # feature selection (based on percentages) alpha.min <- pmax(fc.results$pct.1, fc.results$pct.2) names(x = alpha.min) <- rownames(x = fc.results) @@ -545,6 +542,7 @@ FindMarkers.default <- function( warning("No features pass min.diff.pct threshold; returning empty data.frame") return(fc.results[features, ]) } + # feature selection (based on logFC) if (slot != "scale.data") { total.diff <- fc.results[, 1] #first column is logFC @@ -560,6 +558,7 @@ FindMarkers.default <- function( return(fc.results[features, ]) } } + # subsample cell groups if they are too large if (max.cells.per.ident < Inf) { set.seed(seed = random.seed) @@ -573,18 +572,35 @@ FindMarkers.default <- function( latent.vars <- latent.vars[c(cells.1, cells.2), , drop = FALSE] } } - de.results <- PerformDE( - object = object, - cells.1 = cells.1, - cells.2 = cells.2, - features = features, - test.use = test.use, - verbose = verbose, - min.cells.feature = min.cells.feature, - latent.vars = latent.vars, - densify = densify, - ... - ) + if (inherits(x = object, what = "IterableMatrix")){ + if(test.use != "wilcox"){ + stop("Differential expression with BPCells currently only supports the 'wilcox' method.", + " Please rerun with test.use = 'wilcox'") + } + data.use <- object[features, c(cells.1, cells.2), drop = FALSE] + groups <- c(rep("foreground", length(cells.1)), rep("background", length(cells.2))) + de.results <- suppressMessages( + BPCells::marker_features(data.use, group = groups, method = "wilcoxon") + ) + de.results <- subset(de.results, de.results$foreground == "foreground") + de.results <- data.frame(feature = de.results$feature, + p_val = de.results$p_val_raw) + rownames(de.results) <- de.results$feature + de.results$feature <- NULL + } else { + de.results <- PerformDE( + object = object, + cells.1 = cells.1, + cells.2 = cells.2, + features = features, + test.use = test.use, + verbose = verbose, + min.cells.feature = min.cells.feature, + latent.vars = latent.vars, + densify = densify, + ... + ) + } de.results <- cbind(de.results, fc.results[rownames(x = de.results), , drop = FALSE]) if (only.pos) { de.results <- de.results[de.results[, 2] > 0, , drop = FALSE] @@ -592,7 +608,7 @@ FindMarkers.default <- function( if (test.use %in% DEmethods_nocorrect()) { de.results <- de.results[order(-de.results$power, -de.results[, 1]), ] } else { - de.results <- de.results[order(de.results$p_val, -de.results[, 1]), ] + de.results <- de.results[order(de.results$p_val, -abs(de.results$pct.1-de.results$pct.2)), ] de.results$p_val_adj = p.adjust( p = de.results$p_val, method = "bonferroni", @@ -602,8 +618,27 @@ FindMarkers.default <- function( return(de.results) } + +#' @param fc.slot Slot used to calculate fold-change - will also affect the +#' default for \code{mean.fxn}, see below for more details. +#' @param pseudocount.use Pseudocount to add to averaged expression values when +#' calculating logFC. 1 by default. #' @param norm.method Normalization method for fold change calculation when #' \code{slot} is \dQuote{\code{data}} +#' @param mean.fxn Function to use for fold change or average difference calculation. +#' The default depends on the the value of \code{fc.slot}: +#' \itemize{ +#' \item{"counts"} : difference in the log of the mean counts, with pseudocount. +#' \item{"data"} : difference in the log of the average exponentiated data, with pseudocount. +#' This adjusts for differences in sequencing depth between cells, and assumes that "data" +#' has been log-normalized. +#' \item{"scale.data"} : difference in the means of scale.data. +#' } +#' @param fc.name Name of the fold change, average difference, or custom function column +#' in the output data.frame. If NULL, the fold change column will be named +#' according to the logarithm base (eg, "avg_log2FC"), or if using the scale.data +#' slot "avg_diff". +#' @param base The base with respect to which logarithms are computed. #' #' @rdname FindMarkers #' @concept differential_expression @@ -616,40 +651,27 @@ FindMarkers.Assay <- function( cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, - test.use = 'wilcox', - min.pct = 0.1, - min.diff.pct = -Inf, - verbose = TRUE, - only.pos = FALSE, - max.cells.per.ident = Inf, - random.seed = 1, - latent.vars = NULL, - min.cells.feature = 3, - min.cells.group = 3, + test.use = "wilcox", + fc.slot = "data", pseudocount.use = 1, + norm.method = NULL, mean.fxn = NULL, fc.name = NULL, base = 2, - densify = FALSE, - norm.method = NULL, ... ) { - pseudocount.use <- pseudocount.use %||% 1 data.slot <- ifelse( test = test.use %in% DEmethods_counts(), - yes = 'counts', + yes = "counts", no = slot ) + if (length(x = Layers(object = object, search = slot)) > 1) { + stop(slot, " layers are not joined. Please run JoinLayers") + } data.use <- GetAssayData(object = object, slot = data.slot) - counts <- switch( - EXPR = data.slot, - 'scale.data' = GetAssayData(object = object, slot = "counts"), - numeric() - ) fc.results <- FoldChange( object = object, - slot = data.slot, + slot = fc.slot, cells.1 = cells.1, cells.2 = cells.2, features = features, @@ -661,31 +683,23 @@ FindMarkers.Assay <- function( ) de.results <- FindMarkers( object = data.use, - slot = data.slot, - counts = counts, cells.1 = cells.1, cells.2 = cells.2, features = features, - logfc.threshold = logfc.threshold, test.use = test.use, - min.pct = min.pct, - min.diff.pct = min.diff.pct, - verbose = verbose, - only.pos = only.pos, - max.cells.per.ident = max.cells.per.ident, - random.seed = random.seed, - latent.vars = latent.vars, - min.cells.feature = min.cells.feature, - min.cells.group = min.cells.group, - pseudocount.use = pseudocount.use, fc.results = fc.results, - densify = densify, ... ) return(de.results) } -#' @param recorrect_umi Recalculate corrected UMI counts using minimum of the median UMIs when performing DE using multiple SCT objects; default is TRUE +#' @method FindMarkers StdAssay +#' @export +#' +FindMarkers.StdAssay <- FindMarkers.Assay + +#' @param recorrect_umi Recalculate corrected UMI counts using minimum of the +#' median UMIs when performing DE using multiple SCT objects; default is TRUE #' #' @rdname FindMarkers #' @concept differential_expression @@ -694,35 +708,31 @@ FindMarkers.Assay <- function( #' FindMarkers.SCTAssay <- function( object, - slot = "data", cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, - test.use = 'wilcox', - min.pct = 0.1, - min.diff.pct = -Inf, - verbose = TRUE, - only.pos = FALSE, - max.cells.per.ident = Inf, - random.seed = 1, - latent.vars = NULL, - min.cells.feature = 3, - min.cells.group = 3, + test.use = "wilcox", pseudocount.use = 1, + slot = "data", + fc.slot = "data", mean.fxn = NULL, fc.name = NULL, base = 2, - densify = FALSE, recorrect_umi = TRUE, ... ) { - pseudocount.use <- pseudocount.use %||% 1 data.slot <- ifelse( test = test.use %in% DEmethods_counts(), - yes = 'counts', + yes = "counts", no = slot ) + if (test.use %in% DEmethods_counts()){ + # set slot to counts + if (slot !="counts") { + message(paste0("Setting slot to counts for ", test.use, " (counts based test: ")) + slot <- "counts" + } + } if (recorrect_umi && length(x = levels(x = object)) > 1) { cell_attributes <- SCTResults(object = object, slot = "cell.attributes") observed_median_umis <- lapply( @@ -747,20 +757,21 @@ FindMarkers.SCTAssay <- function( } data.use <- GetAssayData(object = object, slot = data.slot) - counts <- switch( - EXPR = data.slot, - 'scale.data' = GetAssayData(object = object, slot = "counts"), - numeric() - ) - if (is.null(x = mean.fxn)){ - mean.fxn <- function(x) { - return(log(x = rowMeans(x = expm1(x = x)) + pseudocount.use, base = base)) - - } + # Default assumes the input is log1p(corrected counts) + default.mean.fxn <- function(x) { + return(log(x = (rowSums(x = expm1(x = x)) + pseudocount.use)/NCOL(x), base = base)) } + mean.fxn <- mean.fxn %||% switch( + EXPR = fc.slot, + "counts" = function(x) { + return(log(x = (rowSums(x = x) + pseudocount.use)/NCOL(x), base = base)) + }, + "scale.data" = rowMeans, + default.mean.fxn + ) fc.results <- FoldChange( object = object, - slot = data.slot, + slot = fc.slot, cells.1 = cells.1, cells.2 = cells.2, features = features, @@ -768,28 +779,14 @@ FindMarkers.SCTAssay <- function( mean.fxn = mean.fxn, fc.name = fc.name, base = base - ) + ) de.results <- FindMarkers( object = data.use, - slot = data.slot, - counts = counts, cells.1 = cells.1, cells.2 = cells.2, features = features, - logfc.threshold = logfc.threshold, test.use = test.use, - min.pct = min.pct, - min.diff.pct = min.diff.pct, - verbose = verbose, - only.pos = only.pos, - max.cells.per.ident = max.cells.per.ident, - random.seed = random.seed, - latent.vars = latent.vars, - min.cells.feature = min.cells.feature, - min.cells.group = min.cells.group, - pseudocount.use = pseudocount.use, fc.results = fc.results, - densify = densify, ... ) return(de.results) @@ -805,11 +802,11 @@ FindMarkers.SCTAssay <- function( FindMarkers.DimReduc <- function( object, cells.1 = NULL, - cells.2 = NULL, + cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, + logfc.threshold = 0.1, test.use = "wilcox", - min.pct = 0.1, + min.pct = 0.01, min.diff.pct = -Inf, verbose = TRUE, only.pos = FALSE, @@ -818,14 +815,12 @@ FindMarkers.DimReduc <- function( latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, - pseudocount.use = 1, + densify = FALSE, mean.fxn = rowMeans, fc.name = NULL, - densify = FALSE, ... ) { - pseudocount.use <- pseudocount.use %||% 1 if (test.use %in% DEmethods_counts()) { stop("The following tests cannot be used for differential expression on a reduction as they assume a count model: ", paste(DEmethods_counts(), collapse=", ")) @@ -888,7 +883,7 @@ FindMarkers.DimReduc <- function( de.results$p_val_adj = p.adjust( p = de.results$p_val, method = "bonferroni", - n = nrow(x = object) + n = ncol(x = object) ) } return(de.results) @@ -900,19 +895,13 @@ FindMarkers.DimReduc <- function( #' @param ident.2 A second identity class for comparison; if \code{NULL}, #' use all other cells for comparison; if an object of class \code{phylo} or #' 'clustertree' is passed to \code{ident.1}, must pass a node to find markers for -#' @param reduction Reduction to use in differential expression testing - will test for DE on cell embeddings -#' @param group.by Regroup cells into a different identity class prior to performing differential expression (see example) -#' @param subset.ident Subset a particular identity class prior to regrouping. Only relevant if group.by is set (see example) +#' @param group.by Regroup cells into a different identity class prior to +#' performing differential expression (see example) +#' @param subset.ident Subset a particular identity class prior to regrouping. +#' Only relevant if group.by is set (see example) #' @param assay Assay to use in differential expression testing -#' @param slot Slot to pull data from; note that if \code{test.use} is "negbinom", "poisson", or "DESeq2", -#' \code{slot} will be set to "counts" -#' @param mean.fxn Function to use for fold change or average difference calculation. -#' If NULL, the appropriate function will be chose according to the slot used -#' @param fc.name Name of the fold change, average difference, or custom function column -#' in the output data.frame. If NULL, the fold change column will be named -#' according to the logarithm base (eg, "avg_log2FC"), or if using the scale.data -#' slot "avg_diff". -#' @param base The base with respect to which logarithms are computed. +#' @param reduction Reduction to use in differential expression testing - will +#' test for DE on cell embeddings #' #' @rdname FindMarkers #' @concept differential_expression @@ -923,27 +912,11 @@ FindMarkers.Seurat <- function( object, ident.1 = NULL, ident.2 = NULL, + latent.vars = NULL, group.by = NULL, subset.ident = NULL, assay = NULL, - slot = 'data', reduction = NULL, - features = NULL, - logfc.threshold = 0.25, - test.use = "wilcox", - min.pct = 0.1, - min.diff.pct = -Inf, - verbose = TRUE, - only.pos = FALSE, - max.cells.per.ident = Inf, - random.seed = 1, - latent.vars = NULL, - min.cells.feature = 3, - min.cells.group = 3, - mean.fxn = NULL, - fc.name = NULL, - base = 2, - densify = FALSE, ... ) { if (!is.null(x = group.by)) { @@ -958,6 +931,7 @@ FindMarkers.Seurat <- function( if (length(x = ident.1) == 0) { stop("At least 1 ident must be specified in `ident.1`") } + # select which data to use if (is.null(x = reduction)) { assay <- assay %||% DefaultAssay(object = object) @@ -967,12 +941,26 @@ FindMarkers.Seurat <- function( data.use <- object[[reduction]] cellnames.use <- rownames(x = data.use) } + cells <- IdentsToCells( object = object, ident.1 = ident.1, ident.2 = ident.2, cellnames.use = cellnames.use ) + cells <- sapply( + X = cells, + FUN = intersect, + y = cellnames.use, + simplify = FALSE, + USE.NAMES = TRUE + ) + if (!all(vapply(X = cells, FUN = length, FUN.VALUE = integer(length = 1L)))) { + abort( + message = "Cells in one or both identity groups are not present in the data requested" + ) + } + # fetch latent.vars if (!is.null(x = latent.vars)) { latent.vars <- FetchData( @@ -981,6 +969,7 @@ FindMarkers.Seurat <- function( cells = c(cells$cells.1, cells$cells.2) ) } + # check normalization method norm.command <- paste0("NormalizeData.", assay) norm.method <- if (norm.command %in% Command(object = object) && is.null(x = reduction)) { @@ -998,31 +987,17 @@ FindMarkers.Seurat <- function( ) } else { NULL - } + } + de.results <- FindMarkers( object = data.use, - slot = slot, + latent.vars = latent.vars, cells.1 = cells$cells.1, cells.2 = cells$cells.2, - features = features, - logfc.threshold = logfc.threshold, - test.use = test.use, - min.pct = min.pct, - min.diff.pct = min.diff.pct, - verbose = verbose, - only.pos = only.pos, - max.cells.per.ident = max.cells.per.ident, - random.seed = random.seed, - latent.vars = latent.vars, - min.cells.feature = min.cells.feature, - min.cells.group = min.cells.group, - mean.fxn = mean.fxn, - base = base, - fc.name = fc.name, - densify = densify, norm.method = norm.method, ... ) + return(de.results) } @@ -1070,6 +1045,7 @@ FoldChange.default <- function( #' when \code{slot} is \dQuote{\code{data}} #' #' @importFrom Matrix rowMeans +#' @importFrom Matrix rowSums #' @rdname FoldChange #' @concept differential_expression #' @export @@ -1086,22 +1062,99 @@ FoldChange.Assay <- function( base = 2, norm.method = NULL, ... +) { + data <- GetAssayData(object = object, slot = slot) + # By default run as if LogNormalize is done + log1pdata.mean.fxn <- function(x) { + # return(log(x = rowMeans(x = expm1(x = x)) + pseudocount.use, base = base)) + return(log(x = (rowSums(x = expm1(x = x)) + pseudocount.use)/NCOL(x), base = base)) + } + scaledata.mean.fxn <- rowMeans + counts.mean.fxn <- function(x) { + # return(log(x = rowMeans(x = x) + pseudocount.use, base = base)) + return(log(x = (rowSums(x = x) + pseudocount.use)/NCOL(x), base = base)) + } + if (!is.null(x = norm.method)) { + # For anything apart from log normalization set to rowMeans + if (norm.method!="LogNormalize") { + new.mean.fxn <- counts.mean.fxn + } else { + new.mean.fxn <- counts.mean.fxn + if (slot == "data") { + new.mean.fxn <- log1pdata.mean.fxn + } else if (slot == "scale.data") { + new.mean.fxn <- scaledata.mean.fxn + } + } + } else { + # If no normalization method is passed use slots to decide mean function + new.mean.fxn <- switch( + EXPR = slot, + 'data' = log1pdata.mean.fxn, + 'scale.data' = scaledata.mean.fxn, + 'counts' = counts.mean.fxn, + log1pdata.mean.fxn + ) + } + mean.fxn <- mean.fxn %||% new.mean.fxn + # Omit the decimal value of e from the column name if base == exp(1) + base.text <- ifelse( + test = base == exp(1), + yes = "", + no = base + ) + fc.name <- fc.name %||% ifelse( + test = slot == "scale.data", + yes = "avg_diff", + no = paste0("avg_log", base.text, "FC") + ) + FoldChange( + object = data, + cells.1 = cells.1, + cells.2 = cells.2, + features = features, + mean.fxn = mean.fxn, + fc.name = fc.name + ) +} + +#' @method FoldChange StdAssay +#' @export +#' +FoldChange.StdAssay <- FoldChange.Assay + +#' @importFrom Matrix rowMeans +#' @importFrom Matrix rowSums +#' @rdname FoldChange +#' @concept differential_expression +#' @export +#' @method FoldChange SCTAssay +FoldChange.SCTAssay <- function( + object, + cells.1, + cells.2, + features = NULL, + slot = "data", + pseudocount.use = 1, + fc.name = NULL, + mean.fxn = NULL, + base = 2, + ... ) { pseudocount.use <- pseudocount.use %||% 1 data <- GetAssayData(object = object, slot = slot) default.mean.fxn <- function(x) { - return(log(x = rowMeans(x = x) + pseudocount.use, base = base)) + # return(log(x = rowMeans(x = expm1(x = x)) + pseudocount.use, base = base)) + return(log(x = (rowSums(x = expm1(x = x)) + pseudocount.use)/NCOL(x), base = base)) } mean.fxn <- mean.fxn %||% switch( EXPR = slot, - 'data' = switch( - EXPR = norm.method %||% '', - 'LogNormalize' = function(x) { - return(log(x = rowMeans(x = expm1(x = x)) + pseudocount.use, base = base)) - }, - default.mean.fxn - ), + 'data' = default.mean.fxn, 'scale.data' = rowMeans, + 'counts' = function(x) { + # return(log(x = rowMeans(x = x) + pseudocount.use, base = base)) + return(log(x = (rowSums(x = x) + pseudocount.use)/NCOL(x), base = base)) + }, default.mean.fxn ) # Omit the decimal value of e from the column name if base == exp(1) @@ -1125,6 +1178,7 @@ FoldChange.Assay <- function( ) } + #' @importFrom Matrix rowMeans #' @rdname FoldChange #' @concept differential_expression @@ -1141,7 +1195,6 @@ FoldChange.DimReduc <- function( mean.fxn = NULL, ... ) { - pseudocount.use <- pseudocount.use %||% 1 mean.fxn <- mean.fxn %||% rowMeans fc.name <- fc.name %||% "avg_diff" data <- t(x = Embeddings(object = object)) @@ -1189,7 +1242,7 @@ FoldChange.Seurat <- function( slot = 'data', reduction = NULL, features = NULL, - pseudocount.use = NULL, + pseudocount.use = 1, mean.fxn = NULL, base = 2, fc.name = NULL, @@ -1219,6 +1272,24 @@ FoldChange.Seurat <- function( ident.2 = ident.2, cellnames.use = cellnames.use ) + # check normalization method + norm.command <- paste0("NormalizeData.", assay) + norm.method <- if (norm.command %in% Command(object = object) && is.null(x = reduction)) { + Command( + object = object, + command = norm.command, + value = "normalization.method" + ) + } else if (length(x = intersect(x = c("FindIntegrationAnchors", "FindTransferAnchors"), y = Command(object = object)))) { + command <- intersect(x = c("FindIntegrationAnchors", "FindTransferAnchors"), y = Command(object = object))[1] + Command( + object = object, + command = command, + value = "normalization.method" + ) + } else { + NULL + } fc.results <- FoldChange( object = data.use, cells.1 = cells$cells.1, @@ -1228,7 +1299,8 @@ FoldChange.Seurat <- function( pseudocount.use = pseudocount.use, mean.fxn = mean.fxn, base = base, - fc.name = fc.name + fc.name = fc.name, + norm.method = norm.method ) return(fc.results) } @@ -1305,7 +1377,7 @@ DEmethods_latent <- function() { # returns tests that require CheckDots DEmethods_checkdots <- function() { - c('wilcox', 'MAST', 'DESeq2') + c('wilcox', 'wilcox_limma', 'MAST', 'DESeq2') } # returns tests that do not use Bonferroni correction on the DE results @@ -1944,6 +2016,14 @@ PerformDE <- function( verbose = verbose, ... ), + 'wilcox_limma' = WilcoxDETest( + data.use = data.use, + cells.1 = cells.1, + cells.2 = cells.2, + verbose = verbose, + limma = TRUE, + ... + ), 'bimod' = DiffExpTest( data.use = data.use, cells.1 = cells.1, @@ -2019,16 +2099,23 @@ PerformDE <- function( #' @param assay Assay name where for SCT objects are stored; Default is 'SCT' #' @param verbose Print messages and progress #' @importFrom Matrix Matrix +#' @importFrom SeuratObject SparseEmptyMatrix +#' @importFrom pbapply pblapply +#' @importFrom future.apply future_lapply +#' @importFrom future nbrOfWorkers #' @importFrom sctransform correct_counts +#' @importFrom SeuratObject JoinLayers #' #' @return Returns a Seurat object with recorrected counts and data in the SCT assay. #' @export #' #' @concept differential_expression +#' @template section-progressr +#' @template section-future #' @examples #' data("pbmc_small") -#' pbmc_small1 <- SCTransform(object = pbmc_small, variable.features.n = 20) -#' pbmc_small2 <- SCTransform(object = pbmc_small, variable.features.n = 20) +#' pbmc_small1 <- SCTransform(object = pbmc_small, variable.features.n = 20, vst.flavor="v1") +#' pbmc_small2 <- SCTransform(object = pbmc_small, variable.features.n = 20, vst.flavor="v1") #' pbmc_merged <- merge(x = pbmc_small1, y = pbmc_small2) #' pbmc_merged <- PrepSCTFindMarkers(object = pbmc_merged) #' markers <- FindMarkers( @@ -2047,6 +2134,11 @@ PerformDE <- function( #' ) #' PrepSCTFindMarkers <- function(object, assay = "SCT", verbose = TRUE) { + if (verbose && nbrOfWorkers() == 1) { + my.lapply <- pblapply + } else { + my.lapply <- future_lapply + } if (length(x = levels(x = object[[assay]])) == 1) { if (verbose) { message("Only one SCT model is stored - skipping recalculating corrected counts") @@ -2071,8 +2163,8 @@ PrepSCTFindMarkers <- function(object, assay = "SCT", verbose = TRUE) { } model_median_umis <- SCTResults(object = object[[assay]], slot = "median_umi") - min_median_umi <- min(unlist(x = observed_median_umis)) - if (all(unlist(x = model_median_umis) == min_median_umi)){ + min_median_umi <- min(unlist(x = observed_median_umis), na.rm = TRUE) + if (all(unlist(x = model_median_umis) > min_median_umi)){ if (verbose){ message("Minimum UMI unchanged. Skipping re-correction.") } @@ -2095,6 +2187,12 @@ PrepSCTFindMarkers <- function(object, assay = "SCT", verbose = TRUE) { paste(umi.assay, collapse = ", ") ) } + umi.layers <- Layers(object = object, assay = umi.assay, search = 'counts') + if (length(x = umi.layers) > 1) { + object[[umi.assay]] <- JoinLayers( + object = object[[umi.assay]], + layers = "counts", new = "counts") + } raw_umi <- GetAssayData(object = object, assay = umi.assay, slot = "counts") corrected_counts <- Matrix( nrow = nrow(x = raw_umi), @@ -2113,27 +2211,43 @@ PrepSCTFindMarkers <- function(object, assay = "SCT", verbose = TRUE) { set_median_umi <- rep(min_median_umi, length(levels(x = object[[assay]]))) names(set_median_umi) <- levels(x = object[[assay]]) set_median_umi <- as.list(set_median_umi) + all_genes <- rownames(x = object[[assay]]) # correct counts - for (model_name in levels(x = object[[assay]])) { + my.correct_counts <- function(model_name){ model_genes <- rownames(x = model_pars_fit[[model_name]]) - x <- list( - model_str = model_str[[model_name]], - arguments = arguments[[model_name]], - model_pars_fit = as.matrix(x = model_pars_fit[[model_name]]), - cell_attr = cell_attr[[model_name]] - ) - cells <- rownames(x = cell_attr[[model_name]]) - umi <- raw_umi[model_genes, cells] + x <- list( + model_str = model_str[[model_name]], + arguments = arguments[[model_name]], + model_pars_fit = as.matrix(x = model_pars_fit[[model_name]]), + cell_attr = cell_attr[[model_name]] + ) + cells <- rownames(x = cell_attr[[model_name]]) + umi <- raw_umi[all_genes, cells] - umi_corrected <- correct_counts( - x = x, - umi = umi, - verbosity = 0, - scale_factor = min_median_umi - ) - corrected_counts[rownames(umi_corrected), colnames(umi_corrected)] <- umi_corrected + umi_corrected <- correct_counts( + x = x, + umi = umi, + verbosity = 0, + scale_factor = min_median_umi + ) + missing_features <- setdiff(x = all_genes, y = rownames(x = umi_corrected)) + corrected_counts.list <- NULL + gc(verbose = FALSE) + empty <- SparseEmptyMatrix(nrow = length(x = missing_features), ncol = ncol(x = umi_corrected)) + rownames(x = empty) <- missing_features + colnames(x = umi_corrected) <- colnames(x = umi_corrected) + + umi_corrected <- rbind(umi_corrected, empty)[all_genes,] + + return(umi_corrected) } - corrected_data <- log1p(corrected_counts) + corrected_counts.list <- my.lapply(X = levels(x = object[[assay]]), + FUN = my.correct_counts) + names(x = corrected_counts.list) <- levels(x = object[[assay]]) + + corrected_counts <- do.call(what = MergeSparseMatrices, args = corrected_counts.list) + corrected_counts <- as.sparse(x = corrected_counts) + corrected_data <- log1p(x = corrected_counts) suppressWarnings({object <- SetAssayData(object = object, assay = assay, slot = "counts", @@ -2143,10 +2257,22 @@ PrepSCTFindMarkers <- function(object, assay = "SCT", verbose = TRUE) { slot = "data", new.data = corrected_data)}) SCTResults(object = object[[assay]], slot = "median_umi") <- set_median_umi - return(object) } +PrepSCTFindMarkers.V5 <- function(object, assay = "SCT", umi.assay = "RNA", layer = "counts", verbose = TRUE) { + layers <- Layers(object = object[[umi.assay]], search = layer) + dataset.names <- gsub(pattern = paste0(layer, "."), replacement = "", x = layers) + for (i in seq_along(along.with = layers)) { + l <- layers[i] + counts <- LayerData( + object = object[[umi.assay]], + layer = l + ) + } + cells.grid <- DelayedArray::colAutoGrid(x = counts, ncol = min(length(Cells(object)), ncol(counts))) +} + # given a UMI count matrix, estimate NB theta parameter for each gene # and use fit of relationship with mean to assign regularized theta to each gene # @@ -2251,14 +2377,18 @@ ValidateCellGroups <- function( # Differential expression using Wilcoxon Rank Sum # # Identifies differentially expressed genes between two groups of cells using -# a Wilcoxon Rank Sum test. Makes use of limma::rankSumTestWithCorrelation for a +# a Wilcoxon Rank Sum test. Makes use of presto::wilcoxauc for a more efficient +# implementation of the wilcoxon test. If presto is not installed, or if limma +# is requested, makes use of limma::rankSumTestWithCorrelation for a # more efficient implementation of the wilcoxon test. Thanks to Yunshun Chen and -# Gordon Smyth for suggesting the limma implementation. +# Gordon Smyth for suggesting the limma implementation. If limma is also not installed, +# uses wilcox.test. # # @param data.use Data matrix to test # @param cells.1 Group 1 cells # @param cells.2 Group 2 cells # @param verbose Print a progress bar +# @param limma If limma should be used for testing; default is FALSE # @param ... Extra parameters passed to wilcox.test # # @return Returns a p-value ranked matrix of putative differentially expressed @@ -2282,6 +2412,7 @@ WilcoxDETest <- function( cells.1, cells.2, verbose = TRUE, + limma = FALSE, ... ) { data.use <- data.use[, c(cells.1, cells.2), drop = FALSE] @@ -2296,40 +2427,59 @@ WilcoxDETest <- function( yes = FALSE, no = TRUE ) + presto.check <- PackageCheck("presto", error = FALSE) limma.check <- PackageCheck("limma", error = FALSE) - if (limma.check[1] && overflow.check) { - p_val <- my.sapply( - X = 1:nrow(x = data.use), - FUN = function(x) { - return(min(2 * min(limma::rankSumTestWithCorrelation(index = j, statistics = data.use[x, ])), 1)) - } - ) + group.info <- data.frame(row.names = c(cells.1, cells.2)) + group.info[cells.1, "group"] <- "Group1" + group.info[cells.2, "group"] <- "Group2" + group.info[, "group"] <- factor(x = group.info[, "group"]) + if (presto.check[1] && (!limma)) { + data.use <- data.use[, rownames(group.info), drop = FALSE] + res <- presto::wilcoxauc(X = data.use, y = group.info[, "group"]) + res <- res[1:(nrow(x = res)/2),] + p_val <- res$pval } else { - if (getOption('Seurat.limma.wilcox.msg', TRUE) && overflow.check) { + if (getOption('Seurat.presto.wilcox.msg', TRUE) && (!limma)) { message( - "For a more efficient implementation of the Wilcoxon Rank Sum Test,", - "\n(default method for FindMarkers) please install the limma package", + "For a (much!) faster implementation of the Wilcoxon Rank Sum Test,", + "\n(default method for FindMarkers) please install the presto package", "\n--------------------------------------------", - "\ninstall.packages('BiocManager')", - "\nBiocManager::install('limma')", + "\ninstall.packages('devtools')", + "\ndevtools::install_github('immunogenomics/presto')", "\n--------------------------------------------", - "\nAfter installation of limma, Seurat will automatically use the more ", + "\nAfter installation of presto, Seurat will automatically use the more ", "\nefficient implementation (no further action necessary).", "\nThis message will be shown once per session" ) - options(Seurat.limma.wilcox.msg = FALSE) + options(Seurat.presto.wilcox.msg = FALSE) } - group.info <- data.frame(row.names = c(cells.1, cells.2)) - group.info[cells.1, "group"] <- "Group1" - group.info[cells.2, "group"] <- "Group2" - group.info[, "group"] <- factor(x = group.info[, "group"]) - data.use <- data.use[, rownames(x = group.info), drop = FALSE] - p_val <- my.sapply( - X = 1:nrow(x = data.use), - FUN = function(x) { - return(wilcox.test(data.use[x, ] ~ group.info[, "group"], ...)$p.value) + if (limma.check[1] && overflow.check) { + p_val <- my.sapply( + X = 1:nrow(x = data.use), + FUN = function(x) { + return(min(2 * min(limma::rankSumTestWithCorrelation(index = j, statistics = data.use[x, ])), 1)) + } + ) + } else { + if (limma && overflow.check) { + stop( + "To use the limma implementation of the Wilcoxon Rank Sum Test, + please install the limma package: + -------------------------------------------- + install.packages('BiocManager') + BiocManager::install('limma') + --------------------------------------------" + ) + } else { + data.use <- data.use[, rownames(x = group.info), drop = FALSE] + p_val <- my.sapply( + X = 1:nrow(x = data.use), + FUN = function(x) { + return(wilcox.test(data.use[x, ] ~ group.info[, "group"], ...)$p.value) + } + ) } - ) + } } return(data.frame(p_val, row.names = rownames(x = data.use))) } diff --git a/R/dimensional_reduction.R b/R/dimensional_reduction.R index 0c78ace99..4450054fe 100644 --- a/R/dimensional_reduction.R +++ b/R/dimensional_reduction.R @@ -572,6 +572,8 @@ RunCCA.Seurat <- function( verbose = TRUE, ... ) { + op <- options(Seurat.object.assay.version = "v3", Seurat.object.assay.calcn = FALSE) + on.exit(expr = options(op), add = TRUE) assay1 <- assay1 %||% DefaultAssay(object = object1) assay2 <- assay2 %||% DefaultAssay(object = object2) if (assay1 != assay2) { @@ -649,7 +651,8 @@ RunCCA.Seurat <- function( warning("Some cells removed after object merge due to minimum feature count cutoff") } combined.scale <- cbind(data1,data2) - combined.object <- SetAssayData(object = combined.object,new.data = combined.scale, slot = "scale.data") + combined.object <- SetAssayData(object = combined.object, new.data = combined.scale, slot = "scale.data") + ## combined.object@assays$ToIntegrate@scale.data <- combined.scale if (renormalize) { combined.object <- NormalizeData( object = combined.object, @@ -800,9 +803,8 @@ RunICA.Seurat <- function( ... ) { assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) reduction.data <- RunICA( - object = assay.data, + object = object[[assay]], assay = assay, features = features, nics = nics, @@ -861,10 +863,22 @@ RunPCA.default <- function( if (!is.null(x = seed.use)) { set.seed(seed = seed.use) } + if (inherits(x = object, what = 'matrix')) { + RowVar.function <- RowVar + } else if (inherits(x = object, what = 'dgCMatrix')) { + RowVar.function <- RowVarSparse + } else if (inherits(x = object, what = 'IterableMatrix')) { + RowVar.function <- function(x) { + return(BPCells::matrix_stats( + matrix = x, + row_stats = 'variance' + )$row_stats['variance',]) + } + } if (rev.pca) { npcs <- min(npcs, ncol(x = object) - 1) pca.results <- irlba(A = object, nv = npcs, ...) - total.variance <- sum(RowVar(x = t(x = object))) + total.variance <- sum(RowVar.function(x = t(x = object))) sdev <- pca.results$d/sqrt(max(1, nrow(x = object) - 1)) if (weight.by.var) { feature.loadings <- pca.results$u %*% diag(pca.results$d) @@ -874,7 +888,7 @@ RunPCA.default <- function( cell.embeddings <- pca.results$v } else { - total.variance <- sum(RowVar(x = object)) + total.variance <- sum(RowVar.function(x = object)) if (approx) { npcs <- min(npcs, nrow(x = object) - 1) pca.results <- irlba(A = t(x = object), nv = npcs, ...) @@ -966,6 +980,45 @@ RunPCA.Assay <- function( return(reduction.data) } +#' @method RunPCA StdAssay +#' @export +#' +RunPCA.StdAssay <- function( + object, + assay = NULL, + features = NULL, + layer = 'scale.data', + npcs = 50, + rev.pca = FALSE, + weight.by.var = TRUE, + verbose = TRUE, + ndims.print = 1:5, + nfeatures.print = 30, + reduction.key = "PC_", + seed.use = 42, + ... +) { + data.use <- PrepDR5( + object = object, + features = features, + layer = layer, + verbose = verbose + ) + return(RunPCA( + object = data.use, + assay = assay, + npcs = npcs, + rev.pca = rev.pca, + weight.by.var = weight.by.var, + verbose = verbose, + ndims.print = ndims.print, + nfeatures.print = nfeatures.print, + reduction.key = reduction.key, + seed.use = seed.use, + ... + )) +} + #' @param reduction.name dimensional reduction name, pca by default #' #' @rdname RunPCA @@ -989,9 +1042,8 @@ RunPCA.Seurat <- function( ... ) { assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) reduction.data <- RunPCA( - object = assay.data, + object = object[[assay]], assay = assay, features = features, npcs = npcs, @@ -1009,19 +1061,59 @@ RunPCA.Seurat <- function( return(object) } +#' @method RunPCA Seurat5 +#' @export +#' +RunPCA.Seurat5 <- function( + object, + assay = NULL, + features = NULL, + npcs = 50, + rev.pca = FALSE, + weight.by.var = TRUE, + verbose = TRUE, + ndims.print = 1:5, + nfeatures.print = 30, + reduction.name = "pca", + reduction.key = "PC_", + seed.use = 42, + ... +) { + assay <- assay %||% DefaultAssay(object = object) + reduction.data <- RunPCA( + object = object[[assay]], + assay = assay, + features = features, + npcs = npcs, + rev.pca = rev.pca, + weight.by.var = weight.by.var, + verbose = verbose, + ndims.print = ndims.print, + nfeatures.print = nfeatures.print, + reduction.key = reduction.key, + seed.use = seed.use, + ... + ) + object[[reduction.name]] <- reduction.data + # object <- LogSeuratCommand(object = object) + return(object) +} + #' @param assay Name of assay that that t-SNE is being run on #' @param seed.use Random seed for the t-SNE. If NULL, does not set the seed #' @param tsne.method Select the method to use to compute the tSNE. Available #' methods are: #' \itemize{ -#' \item{Rtsne: }{Use the Rtsne package Barnes-Hut implementation of tSNE (default)} -# \item{tsne: }{standard tsne - not recommended for large datasets} -#' \item{FIt-SNE: }{Use the FFT-accelerated Interpolation-based t-SNE. Based on -#' Kluger Lab code found here: https://github.com/KlugerLab/FIt-SNE} +#' \item \dQuote{\code{Rtsne}}: Use the Rtsne package Barnes-Hut +#' implementation of tSNE (default) +#' \item \dQuote{\code{FIt-SNE}}: Use the FFT-accelerated Interpolation-based +#' t-SNE. Based on Kluger Lab code found here: +#' \url{https://github.com/KlugerLab/FIt-SNE} #' } #' @param dim.embed The dimensional space of the resulting tSNE embedding #' (default is 2). For example, set to 3 for a 3d tSNE -#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. tSNE_ by default +#' @param reduction.key dimensional reduction key, specifies the string before +#' the number for the dimension names. \dQuote{\code{tSNE_}} by default #' #' @importFrom Rtsne Rtsne #' @@ -1389,9 +1481,12 @@ RunUMAP.default <- function( call. = FALSE ) } + if (!"num_precomputed_nns" %in% names(x = model)) { + model$num_precomputed_nns <- 0 + } if (is.list(x = object)) { if (ncol(object$idx) != model$n_neighbors) { - warning("Number of neighbors between query and reference ", + warning("Number of neighbors between query and reference ", "is not equal to the number of neighbors within reference") model$n_neighbors <- ncol(object$idx) } @@ -1695,13 +1790,19 @@ RunUMAP.Seurat <- function( dens.var.shift = 0.1, verbose = TRUE, reduction.name = 'umap', - reduction.key = 'UMAP_', + reduction.key = NULL, ... ) { CheckDots(...) if (sum(c(is.null(x = dims), is.null(x = features), is.null(x = graph))) < 2) { stop("Please specify only one of the following arguments: dims, features, or graph") } + if (sum(!is.null(x = dims), !is.null(x = nn.name), + !is.null(x = graph), !is.null(x = features)) != 1) { + stop("Only one parameter among 'dims', 'nn.name', 'graph', or 'features' ", + "should be used at a time to run UMAP") + } + if (!is.null(x = features)) { data.use <- as.matrix(x = t(x = GetAssayData(object = object, slot = slot, assay = assay)[features, , drop = FALSE])) if (ncol(x = data.use) < n.components) { @@ -1779,7 +1880,7 @@ RunUMAP.Seurat <- function( dens.lambda = dens.lambda, dens.frac = dens.frac, dens.var.shift = dens.var.shift, - reduction.key = reduction.key, + reduction.key = reduction.key %||% Key(object = reduction.name, quiet = TRUE), verbose = verbose ) object <- LogSeuratCommand(object = object) @@ -1926,6 +2027,11 @@ CheckFeatures <- function( if (inherits(x = data.use, what = 'dgCMatrix')) { features.var <- SparseRowVar(mat = data.use[features, ], display_progress = F) } + else if (inherits(x = data.use, what = "IterableMatrix")) { + bp.stats <- BPCells::matrix_stats(matrix = data.use, + row_stats = "variance") + features.var <- bp.stats$row_stats["variance",][features] + } else { features.var <- RowVar(x = data.use[features, ]) } @@ -2314,6 +2420,55 @@ PrepDR <- function( return(data.use) } +PrepDR5 <- function(object, features = NULL, layer = 'scale.data', verbose = TRUE) { + layer <- layer[1L] + olayer <- layer + layer <- Layers(object = object, search = layer) + if (is.null(layer)) { + abort(paste0("No layer matching pattern '", olayer, "' not found. Please run ScaleData and retry")) + } + data.use <- LayerData(object = object, layer = layer) + features <- features %||% VariableFeatures(object = object) + if (!length(x = features)) { + stop("No variable features, run FindVariableFeatures() or provide a vector of features", call. = FALSE) + } + if (is(data.use, "IterableMatrix")) { + features.var <- BPCells::matrix_stats(matrix=data.use, row_stats="variance")$row_stats["variance",] + } else { + features.var <- apply(X = data.use, MARGIN = 1L, FUN = var) + } + features.keep <- features[features.var > 0] + if (!length(x = features.keep)) { + stop("None of the requested features have any variance", call. = FALSE) + } else if (length(x = features.keep) < length(x = features)) { + exclude <- setdiff(x = features, y = features.keep) + if (isTRUE(x = verbose)) { + warning( + "The following ", + length(x = exclude), + " features requested have zero variance; running reduction without them: ", + paste(exclude, collapse = ', '), + call. = FALSE, + immediate. = TRUE + ) + } + } + features <- features.keep + features <- features[!is.na(x = features)] + features.use <- features[features %in% rownames(data.use)] + if(!isTRUE(all.equal(features, features.use))) { + missing_features <- setdiff(features, features.use) + if(length(missing_features) > 0) { + warning_message <- paste("The following features were not available: ", + paste(missing_features, collapse = ", "), + ".", sep = "") + warning(warning_message, immediate. = TRUE) + } + } + data.use <- data.use[features.use, ] + return(data.use) +} + #' @param assay Name of Assay SPCA is being run on #' @param npcs Total Number of SPCs to compute and store (50 by default) #' @param verbose Print the top genes associated with high/low loadings for @@ -2401,6 +2556,46 @@ RunSPCA.Assay <- function( return(reduction.data) } +#' @param features Features to compute SPCA on. If features=NULL, SPCA will be run +#' using the variable features for the Assay. +#' @param layer Layer to run SPCA on +#' +#' @rdname RunSPCA +#' @concept dimensional_reduction +#' @export +#' @method RunSPCA Assay5 +#' +RunSPCA.Assay5 <- function( + object, + assay = NULL, + features = NULL, + npcs = 50, + reduction.key = "SPC_", + graph = NULL, + verbose = TRUE, + seed.use = 42, + layer = 'scale.data', + ... +) { + data.use <- PrepDR5( + object = object, + features = features, + layer = layer, + verbose = verbose + ) + reduction.data <- RunSPCA( + object = data.use, + assay = assay, + npcs = npcs, + reduction.key = reduction.key, + graph = graph, + verbose = verbose, + seed.use = seed.use, + ... + ) + return(reduction.data) +} + #' @param reduction.name dimensional reduction name, spca by default #' @rdname RunSPCA #' @concept dimensional_reduction @@ -2420,14 +2615,13 @@ RunSPCA.Seurat <- function( ... ) { assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) if (is.null(x = graph)) { stop("Graph is not provided") } else if (is.character(x = graph)) { graph <- object[[graph]] } reduction.data <- RunSPCA( - object = assay.data, + object = object[[assay]], assay = assay, features = features, npcs = npcs, @@ -2475,7 +2669,12 @@ RunSLSI.default <- function( if (verbose) { message("Smoothing peaks matrix") } - object.smooth <- t(x = graph) %*% (t(x = object) %*% object) %*% graph + if (inherits(x = object, what = 'IterableMatrix')) { + t_object <- t(BPCells::transpose_storage_order(matrix = object)) + object.smooth <- t(x = graph) %*% (t_object %*% object) %*% graph + } else { + object.smooth <- t(x = graph) %*% (t(x = object) %*% object) %*% graph + } if (verbose) { message("Performing eigendecomposition") } @@ -2541,6 +2740,48 @@ RunSLSI.Assay <- function( return(reduction.data) } + +#' @param features Features to compute SLSI on. If features=NULL, SLSI will be run +#' using the variable features for the Assay5. +#' @param layer Layer to run SLSI on +#' +#' @rdname RunSLSI +#' @concept dimensional_reduction +#' @export +#' @method RunSLSI StdAssay +#' +RunSLSI.StdAssay <- function( + object, + assay = NULL, + features = NULL, + n = 50, + reduction.key = "SLSI_", + graph = NULL, + layer = "data", + verbose = TRUE, + seed.use = 42, + ...) { + data.use <- PrepDR5( + object = object, + features = features, + layer = layer, + verbose = verbose + ) + reduction.data <- RunSLSI( + object = data.use, + assay = assay, + npcs = npcs, + reduction.key = reduction.key, + graph = graph, + verbose = verbose, + seed.use = seed.use, + ... + ) + return(reduction.data) +} + + + #' @param reduction.name dimensional reduction name #' @rdname RunSLSI #' @concept dimensional_reduction diff --git a/R/generics.R b/R/generics.R index 0b6120813..2e64f7ce8 100644 --- a/R/generics.R +++ b/R/generics.R @@ -116,6 +116,7 @@ FindClusters <- function(object, ...) { #' @export #' #' @examples +#' \dontrun{ #' data("pbmc_small") #' # Find markers for cluster 2 #' markers <- FindMarkers(object = pbmc_small, ident.1 = 2) @@ -133,7 +134,8 @@ FindClusters <- function(object, ...) { #' markers <- FindMarkers(object = pbmc_small, ident.1 = 'clustertree', ident.2 = 5) #' head(x = markers) #' } -#' +#' } +#' #' @rdname FindMarkers #' @export FindMarkers #' @@ -230,9 +232,11 @@ FindSpatiallyVariableFeatures <- function(object, ...) { #' Otherwise, log2 fold change is returned with column named "avg_log2_FC". #' #' @examples +#' \dontrun{ #' data("pbmc_small") #' FoldChange(pbmc_small, ident.1 = 1) -#' +#' } +#' #' @param object A Seurat object #' @param ... Arguments passed to other methods #' @rdname FoldChange @@ -302,6 +306,56 @@ IntegrateEmbeddings <- function(anchorset, ...) { UseMethod(generic = "IntegrateEmbeddings", object = anchorset) } +#' Leverage Score Calculation +#' +#' This function computes the leverage scores for a given object +#' It uses the concept of sketching and random projections. The function provides an approximation +#' to the leverage scores using a scalable method suitable for large matrices. +#' +#' @param object A matrix-like object +#' @param ... Arguments passed to other methods +#' +#' @references Clarkson, K. L. & Woodruff, D. P. +#' Low-rank approximation and regression in input sparsity time. +#' JACM 63, 1–45 (2017). \url{https://dl.acm.org/doi/10.1145/3019134}; +#' +#' @export +#' +LeverageScore <- function(object, ...) { + UseMethod(generic = 'LeverageScore', object = object) +} + +#' Normalize Raw Data +#' +#' @param data Matrix with the raw count data +#' @param scale.factor Scale the data; default is \code{1e4} +#' @param margin Margin to normalize over +#' @param verbose Print progress +#' +#' @return A matrix with the normalized and log-transformed data +#' +#' @template param-dotsm +#' +#' @export +#' @concept preprocessing +#' +#' @examples +#' mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5) +#' mat +#' mat_norm <- LogNormalize(data = mat) +#' mat_norm +#' +LogNormalize <- function( + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... +) { + UseMethod(generic = 'LogNormalize', object = data) +} + + #' Metric for evaluating mapping success #' #' This metric was designed to help identify query cells that aren't well @@ -340,6 +394,34 @@ NormalizeData <- function(object, ...) { UseMethod(generic = 'NormalizeData', object = object) } +#' Project query data to the reference dimensional reduction +#' +#' +#' @param query An object for query cells +#' @param reference An object for reference cells +#' @param query.assay Assay name for query object +#' @param reference.assay Assay name for reference object +#' @param reduction Name of dimensional reduction from reference object +#' @param dims Dimensions used for reference dimensional reduction +#' @param scale Determine if scale query data based on reference data variance +#' @param verbose Print progress +#' @param feature.mean Mean of features in reference +#' @param feature.sd Standard variance of features in reference +#' +#' @return A matrix with projected cell embeddings +#' +#' @rdname ProjectCellEmbeddings +#' @export ProjectCellEmbeddings +#' +#' @keywords internal +#' +ProjectCellEmbeddings <- function( + query, + ... +) { + UseMethod(generic = 'ProjectCellEmbeddings', object = query) +} + #' Project query into UMAP coordinates of a reference #' #' This function will take a query dataset and project it into the coordinates @@ -360,6 +442,23 @@ ProjectUMAP <- function(query, ...) { UseMethod(generic = "ProjectUMAP", object = query) } +#' Pseudobulk Expression +#' +#' Normalize the count data present in a given assay. +#' +#' @param object An assay +#' @param ... Arguments passed to other methods +#' +#' @return Returns object after normalization +#' +#' @rdname PseudobulkExpression +#' @export PseudobulkExpression +#' @concept utilities +#' +PseudobulkExpression <- function(object, ...) { + UseMethod(generic = "PseudobulkExpression", object = object) +} + #' Perform Canonical Correlation Analysis #' #' Runs a canonical correlation analysis using a diagonal implementation of CCA. @@ -374,6 +473,7 @@ ProjectUMAP <- function(query, ...) { #' @seealso \code{\link{merge.Seurat}} #' #' @examples +#' \dontrun{ #' data("pbmc_small") #' pbmc_small #' # As CCA requires two datasets, we will split our test object into two just for this example @@ -384,7 +484,8 @@ ProjectUMAP <- function(query, ...) { #' pbmc_cca <- RunCCA(object1 = pbmc1, object2 = pbmc2) #' # Print results #' print(x = pbmc_cca[["cca"]]) -#' +#' } +#' #' @rdname RunCCA #' @export RunCCA #' @@ -392,6 +493,29 @@ RunCCA <- function(object1, object2, ...) { UseMethod(generic = 'RunCCA', object = object1) } + +#' Run Graph Laplacian Eigendecomposition +#' +#' Run a graph laplacian dimensionality reduction. It is used as a low +#' dimensional representation for a cell-cell graph. The input graph +#' should be symmetric +#' +#' @param object A Seurat object +#' @param ... Arguments passed to +#' \code{\link[RSpectra:eigs_sym]{RSpectra::eigs_sym}} +#' +#' @return Returns Seurat object with the Graph laplacian eigenvector +#' calculation stored in the reductions slot +#' +#' @rdname RunGraphLaplacian +#' @export RunGraphLaplacian +#' + +RunGraphLaplacian <- function(object, ...) { + UseMethod(generic = 'RunGraphLaplacian', object = object) +} + + #' Run Independent Component Analysis on gene expression #' #' Run fastica algorithm from the ica package for ICA dimensionality reduction. @@ -604,6 +728,17 @@ ScoreJackStraw <- function(object, ...) { UseMethod(generic = 'ScoreJackStraw', object = object) } +#' Perform sctransform-based normalization +#' @param object An object +#' @param ... Arguments passed to other methods (not used) +#' +#' @rdname SCTransform +#' @export SCTransform +#' +SCTransform <- function(object, ...) { + UseMethod(generic = 'SCTransform', object = object) +} + #' Get SCT results from an Assay #' #' Pull the \code{\link{SCTResults}} information from an \code{\link{SCTAssay}} @@ -628,3 +763,46 @@ SCTResults <- function(object, ...) { "SCTResults<-" <- function(object, ..., value) { UseMethod(generic = 'SCTResults<-', object = object) } + +#' Variance Stabilizing Transformation +#' +#' Apply variance stabilizing transformation for selection of variable features +#' +#' @inheritParams stats::loess +#' @param data A matrix-like object +#' @param margin Unused +#' @param nselect Number of of features to select +#' @param clip Upper bound for values post-standardization; defaults to the +#' square root of the number of cells +#' @param verbose ... +#' +#' @template param-dotsm +#' +#' @return A data frame with the following columns: +#' \itemize{ +#' \item \dQuote{\code{mean}}: ... +#' \item \dQuote{\code{variance}}: ... +#' \item \dQuote{\code{variance.expected}}: ... +#' \item \dQuote{\code{variance.standardized}}: ... +#' \item \dQuote{\code{variable}}: \code{TRUE} if the feature selected as +#' variable, otherwise \code{FALSE} +#' \item \dQuote{\code{rank}}: If the feature is selected as variable, then how +#' it compares to other variable features with lower ranks as more variable; +#' otherwise, \code{NA} +#' } +#' +#' @rdname VST +#' @export VST +#' +#' @keywords internal +#' +VST <- function( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + ... +) { + UseMethod(generic = 'VST', object = data) +} diff --git a/R/integration.R b/R/integration.R index 6fceae087..e9dd3c3ca 100644 --- a/R/integration.R +++ b/R/integration.R @@ -71,6 +71,7 @@ NULL #' \itemize{ #' \item{cca: Canonical correlation analysis} #' \item{rpca: Reciprocal PCA} +#' \item{jpca: Joint PCA} #' \item{rlsi: Reciprocal LSI} #' } #' @param l2.norm Perform L2 normalization on the CCA cell embeddings after @@ -136,7 +137,7 @@ FindIntegrationAnchors <- function( scale = TRUE, normalization.method = c("LogNormalize", "SCT"), sct.clip.range = NULL, - reduction = c("cca", "rpca", "rlsi"), + reduction = c("cca", "rpca", "jpca", "rlsi"), l2.norm = TRUE, dims = 1:30, k.anchor = 5, @@ -261,7 +262,11 @@ FindIntegrationAnchors <- function( # if using pca or lsi, only need to compute the internal neighborhood structure once # for each dataset internal.neighbors <- list() - if (nn.reduction %in% c("pca", "lsi")) { + if (nn.reduction %in% c("pca", "lsi","jpca")) { + if (nn.reduction == 'jpca') { + nn.reduction <- 'joint.pca' + reduction <- 'joint.pca' + } k.filter <- NA if (verbose) { message("Computing within dataset neighborhoods") @@ -401,10 +406,29 @@ FindIntegrationAnchors <- function( } object.pair }, + 'joint.pca' = { + object.pair <- merge(x = object.1, y = object.2) + reduction.2 <- "joint.pca" + object.pair[['joint.pca']] <- CreateDimReducObject( + embeddings = rbind(Embeddings(object.1[['joint.pca']]), + Embeddings(object.2[['joint.pca']])), + loadings = Loadings(object.1[['joint.pca']]), + key = 'Joint_', + assay = 'ToIntegrate') + if (l2.norm) { + object.pair <- L2Dim(object = object.pair, + reduction = 'joint.pca', + new.dr = 'joint.pca.l2', + new.key = 'Jl2_' + ) + reduction <- paste0(reduction, ".l2") + reduction.2 <- paste0(reduction.2, ".l2") + } + object.pair + }, stop("Invalid reduction parameter. Please choose either cca, rpca, or rlsi") ) internal.neighbors <- internal.neighbors[c(i, j)] - anchors <- FindAnchors( object.pair = object.pair, assay = c("ToIntegrate", "ToIntegrate"), @@ -688,6 +712,7 @@ ReciprocalProject <- function( #' #' @export #' @importFrom methods slot slot<- +#' @importFrom SeuratObject JoinLayers RenameAssays #' @concept integration #' @examples #' \dontrun{ @@ -736,7 +761,7 @@ FindTransferAnchors <- function( l2.norm = TRUE, dims = 1:30, k.anchor = 5, - k.filter = 200, + k.filter = NA, k.score = 30, max.features = 200, nn.method = "annoy", @@ -746,6 +771,8 @@ FindTransferAnchors <- function( mapping.score.k = NULL, verbose = TRUE ) { + op <- options(Seurat.object.assay.calcn = FALSE) + on.exit(expr = options(op), add = TRUE) # input validation ValidateParams_FindTransferAnchors( reference = reference, @@ -778,64 +805,70 @@ FindTransferAnchors <- function( reduction.2 <- character() feature.mean <- NULL reference.reduction.init <- reference.reduction - if (normalization.method == "SCT") { - # ensure all residuals required are computed - query <- suppressWarnings(expr = GetResidual(object = query, assay = query.assay, features = features, verbose = FALSE)) + if (inherits(x = reference[[reference.assay]], what = 'Assay5')) { + if (length(Layers(reference, search = "data")) > 1) { + reference[[reference.assay]] <- JoinLayers( + reference[[reference.assay]], + layers = "data", new = "data") + } + } + if (normalization.method == "SCT") { if (is.null(x = reference.reduction)) { - reference <- suppressWarnings(expr = GetResidual(object = reference, assay = reference.assay, features = features, verbose = FALSE)) + reference <- suppressWarnings(expr = GetResidual( + object = reference, + assay = reference.assay, + features = features, + verbose = FALSE + )) features <- intersect( x = features, - y = intersect( - x = rownames(x = GetAssayData(object = query[[query.assay]], slot = "scale.data")), - y = rownames(x = GetAssayData(object = reference[[reference.assay]], slot = "scale.data")) - ) + y = rownames(reference[[reference.assay]]$scale.data) ) - reference[[reference.assay]] <- as( - object = CreateAssayObject( - data = GetAssayData(object = reference[[reference.assay]], slot = "scale.data")[features, ]), - Class = "SCTAssay" - ) - reference <- SetAssayData( - object = reference, - slot = "scale.data", - assay = reference.assay, - new.data = as.matrix(x = GetAssayData(object = reference[[reference.assay]], slot = "data")) - ) - } - query[[query.assay]] <- as( - object = CreateAssayObject( - data = GetAssayData(object = query[[query.assay]], slot = "scale.data")[features, ]), - Class = "SCTAssay" - ) - query <- SetAssayData( + VariableFeatures(reference) <- features + } + if (IsSCT(assay = query[[query.assay]])) { + query <- suppressWarnings(expr = GetResidual( + object = query, + assay = query.assay, + features = features, + verbose = FALSE + )) + } + } + # Rename query assay w same name as reference assay + if (query.assay != reference.assay) { + suppressWarnings(expr = query <- RenameAssays( object = query, - slot = "scale.data", - assay = query.assay, - new.data = as.matrix(x = GetAssayData(object = query[[query.assay]], slot = "data")) - ) - feature.mean <- "SCT" + assay.name = query.assay, + new.assay.name = reference.assay, + verbose = FALSE + )) + DefaultAssay(query) <- reference.assay } # only keep necessary info from objects + suppressWarnings( query <- DietSeurat( object = query, - assays = query.assay, + assays = reference.assay, dimreducs = reference.reduction, features = features, scale.data = TRUE ) + ) # check assay in the reference.reduction if (!is.null(reference.reduction) && slot(object = reference[[reference.reduction]], name = "assay.used") != reference.assay) { warnings("reference assay is diffrent from the assay.used in", reference.reduction) slot(object = reference[[reference.reduction]], name = "assay.used") <- reference.assay } - - reference <- DietSeurat( - object = reference, - assays = reference.assay, - dimreducs = reference.reduction, - features = features, - scale.data = TRUE + suppressWarnings( + reference <- DietSeurat( + object = reference, + assays = reference.assay, + dimreducs = reference.reduction, + features = features, + scale.data = TRUE + ) ) # append query and reference to cell names - mainly to avoid name conflicts query <- RenameCells( @@ -852,7 +885,10 @@ FindTransferAnchors <- function( if (is.null(x = reference.reduction)) { reference.reduction <- "pca" if (verbose) { - message("Performing PCA on the provided query using ", length(x = features), " features as input.") + message( + "Performing PCA on the provided query using ", + length(x = features), + " features as input.") } if (normalization.method == "LogNormalize") { query <- ScaleData( @@ -877,7 +913,9 @@ FindTransferAnchors <- function( query = reference, scale = scale, dims = dims, - verbose = verbose + feature.mean = feature.mean, + verbose = verbose, + normalization.method = normalization.method ) orig.embeddings <- Embeddings(object = query[[reference.reduction]])[, dims] orig.loadings <- Loadings(object = query[[reference.reduction]]) @@ -898,15 +936,23 @@ FindTransferAnchors <- function( approx = approx.pca ) } + if (paste0("nCount_", query.assay) %in% colnames(query[[]])) { + query_nCount_UMI <- query[[]][, paste0("nCount_", query.assay)] + names(x = query_nCount_UMI) <- colnames(x = query) + } else { + query_nCount_UMI <- NULL + } projected.pca <- ProjectCellEmbeddings( - reference = reference, - reduction = reference.reduction, - query = query, - scale = scale, - dims = dims, - feature.mean = feature.mean, - verbose = verbose - ) + reference = reference, + reduction = reference.reduction, + normalization.method = normalization.method, + query = query, + scale = scale, + dims = dims, + nCount_UMI = query_nCount_UMI, + feature.mean = feature.mean, + verbose = verbose + ) orig.embeddings <- Embeddings(object = reference[[reference.reduction]])[, dims] orig.loadings <- Loadings(object = reference[[reference.reduction]]) } @@ -915,12 +961,29 @@ FindTransferAnchors <- function( key = "ProjectPC_", assay = reference.assay ) - combined.ob <- suppressWarnings(expr = merge( - x = DietSeurat(object = reference, counts = FALSE), - y = DietSeurat(object = query, counts = FALSE), - )) - combined.ob[["pcaproject"]] <- combined.pca + # combined.ob <- suppressWarnings(expr = merge( + # x = DietSeurat(object = reference, counts = FALSE), + # y = DietSeurat(object = query, counts = FALSE), + # )) + ref.diet <- DietSeurat(object = reference, counts = FALSE) + query.diet <- DietSeurat(object = query, counts = FALSE) + + counts.list <- list(reference = LayerData(ref.diet[[reference.assay]], layer = "data")) + query.data.list <- list() + for (i in Layers(object = query.diet[[reference.assay]], search = "data")) { + data.layer.name <- gsub(pattern = "data.", replacement = "", x = i) + counts.list[[data.layer.name]] <- LayerData(object = query[[reference.assay]], layer = i) + } + combined.ob <- CreateSeuratObject(counts = counts.list, assay = reference.assay) + for (i in Layers(object = combined.ob[[reference.assay]], search = "counts")){ + data.layer.name <- gsub(pattern = "counts.", replacement = "data.", x = i) # replace counts. to data. + layer.data <- LayerData(object = combined.ob, layer = i) + LayerData(object = combined.ob, layer = data.layer.name) <- layer.data # set layer data + } colnames(x = orig.loadings) <- paste0("ProjectPC_", 1:ncol(x = orig.loadings)) + + combined.ob[["pcaproject"]] <- combined.pca + Loadings(object = combined.ob[["pcaproject"]], projected = FALSE) <- orig.loadings[, dims] Loadings(object = combined.ob[["pcaproject"]]) <- orig.loadings[, dims] } # Use reciprocal PCA projection in anchor finding @@ -985,13 +1048,13 @@ FindTransferAnchors <- function( reduction = reference.reduction, query = query, scale = scale, + normalization.method = normalization.method, dims = dims, feature.mean = feature.mean, verbose = verbose ) orig.embeddings <- Embeddings(object = reference[[reference.reduction]])[, dims] orig.loadings <- Loadings(object = reference[[reference.reduction]]) - combined.pca <- CreateDimReducObject( embeddings = as.matrix(x = rbind(orig.embeddings, projected.pca)), key = "ProjectPC_", @@ -1051,7 +1114,7 @@ FindTransferAnchors <- function( } else { projected.lsi <- ProjectSVD( reduction = reference[[reference.reduction]], - data = GetAssayData(object = query, assay = query.assay, slot = "data"), + data = GetAssayData(object = query, assay = reference.assay, slot = "data"), mode = "lsi", do.center = FALSE, do.scale = FALSE, @@ -1089,7 +1152,7 @@ FindTransferAnchors <- function( } k.nn <- max(k.score, k.anchor) query.neighbors <- NNHelper( - data = Embeddings(object = combined.ob[[reduction]])[Cells(x = query), ], + data = Embeddings(object = combined.ob[[reduction]])[colnames(x = query), ], k = max(mapping.score.k, k.nn + 1), method = nn.method, n.trees = n.trees, @@ -1107,11 +1170,21 @@ FindTransferAnchors <- function( } if (!is.null(x = reference.neighbors)) { precomputed.neighbors[["ref.neighbors"]] <- reference[[reference.neighbors]] - nn.idx1 <- Index(object = reference[[reference.neighbors]]) + } else { + precomputed.neighbors[["ref.neighbors"]] <- NNHelper( + data = Embeddings(combined.ob[[reduction]])[ + colnames(x = reference), + 1:length(x = dims) + ], + k = max(k.score, k.anchor) + 1, + method = nn.method, + cache.index = TRUE + ) } + nn.idx1 <- Index(object = precomputed.neighbors[["ref.neighbors"]]) anchors <- FindAnchors( object.pair = combined.ob, - assay = c(reference.assay, query.assay), + assay = c(reference.assay, reference.assay), slot = "data", cells1 = colnames(x = reference), cells2 = colnames(x = query), @@ -1132,7 +1205,7 @@ FindTransferAnchors <- function( verbose = verbose ) reductions <- slot(object = combined.ob, name = "reductions") - for (i in unique(x = c(reference.assay, query.assay))) { + for (i in unique(x = c(reference.assay))) { dummy.assay <- paste0(i, "DUMMY") suppressWarnings( expr = combined.ob[[dummy.assay]] <- CreateDummyAssay(assay = combined.ob[[i]]) @@ -1354,6 +1427,7 @@ IntegrateData <- function( anchors <- slot(object = anchorset, name = 'anchors') ref <- object.list[reference.datasets] features <- features %||% slot(object = anchorset, name = "anchor.features") + unintegrated <- suppressWarnings(expr = merge( x = object.list[[1]], y = object.list[2:length(x = object.list)] @@ -1381,6 +1455,7 @@ IntegrateData <- function( verbose = verbose ) } + print(i) model.list[[i]] <- slot(object = object.list[[i]][[assay]], name = "SCTModel.list") object.list[[i]][[assay]] <- suppressWarnings(expr = CreateSCTAssayObject( data = GetAssayData( @@ -1438,7 +1513,6 @@ IntegrateData <- function( reference.model <- model.list[[which(reference.model)]] } } - if (length(x = reference.datasets) == length(x = object.list)) { if (normalization.method == "SCT") { reference.integrated[[new.assay.name]] <- CreateSCTAssayObject( @@ -1557,6 +1631,10 @@ IntegrateEmbeddings.IntegrationAnchorSet <- function( reference.datasets <- slot(object = anchorset, name = 'reference.objects') object.list <- slot(object = anchorset, name = 'object.list') anchors <- slot(object = anchorset, name = 'anchors') + reductions <- reductions %||% slot( + object = anchorset, + name = 'weight.reduction' + ) ValidateParams_IntegrateEmbeddings_IntegrationAnchors( anchorset = anchorset, object.list = object.list, @@ -1574,10 +1652,11 @@ IntegrateEmbeddings.IntegrationAnchorSet <- function( intdr.assay <- DefaultAssay(object = reductions) int.assay <- DefaultAssay(object = object.list[[1]]) dims.names <- paste0("drtointegrate-", dims.to.integrate) - cell.names.map <- Cells(x = unintegrated) + # cell.names.map <- Cells(x = unintegrated) + cell.names.map <- colnames(x = unintegrated) names(x = cell.names.map) <- make.unique(names = unname(obj = do.call( what = c, - args = lapply(X = object.list, FUN = Cells))) + args = lapply(X = object.list, FUN = colnames))) ) for (i in 1:length(x = object.list)) { embeddings <- t(x = Embeddings(object = reductions)[cell.names.map[Cells(x = object.list[[i]])], dims.to.integrate]) @@ -1593,6 +1672,8 @@ IntegrateEmbeddings.IntegrationAnchorSet <- function( } slot(object = anchorset, name = "object.list") <- object.list new.reduction.name.safe <- gsub(pattern = "_", replacement = "", x = new.reduction.name) + new.reduction.name.safe <- gsub(pattern = "[.]", replacement = "", x = new.reduction.name.safe) + reference.integrated <- PairwiseIntegrateReference( anchorset = anchorset, new.assay.name = new.reduction.name.safe, @@ -1609,8 +1690,11 @@ IntegrateEmbeddings.IntegrationAnchorSet <- function( ) if (length(x = reference.datasets) == length(x = object.list)) { reference.dr <- CreateDimReducObject( - embeddings = as.matrix(x = t(GetAssayData(reference.integrated[[new.reduction.name.safe]]))), + embeddings = as.matrix(x = t(GetAssayData( + object = reference.integrated[[new.reduction.name.safe]] + ))), assay = intdr.assay, + loadings = Loadings(object = reductions), key = paste0(new.reduction.name.safe, "_") ) DefaultAssay(object = reference.integrated) <- int.assay @@ -1624,8 +1708,7 @@ IntegrateEmbeddings.IntegrationAnchorSet <- function( reference.integrated[[active.assay]] <- CreateAssayObject( data = GetAssayData( object = reference.integrated[[new.reduction.name.safe]], - slot = 'data', - check.matrix = FALSE + slot = 'data' ) ) DefaultAssay(object = reference.integrated) <- active.assay @@ -1646,23 +1729,38 @@ IntegrateEmbeddings.IntegrationAnchorSet <- function( preserve.order = preserve.order, verbose = verbose ) - unintegrated[[new.reduction.name]] <- CreateDimReducObject( + suppressWarnings(expr = unintegrated[[new.reduction.name]] <- CreateDimReducObject( embeddings = as.matrix(x = t(x = integrated.data)), assay = intdr.assay, + loadings = Loadings(object = reductions), key = paste0(new.reduction.name.safe, "_") - ) + )) unintegrated <- SetIntegrationData( object = unintegrated, integration.name = "Integration", slot = "anchors", new.data = anchors ) + if (!is.null(x = Tool(object = reference.integrated, slot = "Integration"))) { + sample.tree <- GetIntegrationData( + object = reference.integrated, + integration.name = "Integration", + slot = "sample.tree" + ) + } + unintegrated <- SetIntegrationData( + object = unintegrated, + integration.name = "Integration", + slot = "sample.tree", + new.data = sample.tree + ) unintegrated[["FindIntegrationAnchors"]] <- slot(object = anchorset, name = "command") suppressWarnings(unintegrated <- LogSeuratCommand(object = unintegrated)) return(unintegrated) } #' @param reference Reference object used in anchorset construction #' @param query Query object used in anchorset construction +#' @param query.assay Name of the Assay to use from query #' @param reuse.weights.matrix Can be used in conjunction with the store.weights #' parameter in TransferData to reuse a precomputed weights matrix. #' @@ -1675,6 +1773,7 @@ IntegrateEmbeddings.TransferAnchorSet <- function( anchorset, reference, query, + query.assay = NULL, new.reduction.name = "integrated_dr", reductions = 'pcaproject', dims.to.integrate = NULL, @@ -1690,11 +1789,13 @@ IntegrateEmbeddings.TransferAnchorSet <- function( combined.object <- slot(object = anchorset, name = 'object.list')[[1]] anchors <- slot(object = anchorset, name = 'anchors') weights.matrix <- NULL + query.assay <- query.assay %||% DefaultAssay(query) ValidateParams_IntegrateEmbeddings_TransferAnchors( anchorset = anchorset, combined.object = combined.object, reference = reference, query = query, + query.assay = query.assay, reductions = reductions, dims.to.integrate = dims.to.integrate, k.weight = k.weight, @@ -1747,7 +1848,7 @@ IntegrateEmbeddings.TransferAnchorSet <- function( ) integrated.embeddings <- as.matrix(x = integrated.embeddings) query[[new.reduction.name]] <- CreateDimReducObject( - embeddings = t(x = integrated.embeddings[, Cells(x = query)]), + embeddings = t(x = integrated.embeddings[, Cells(x = query[[query.assay]])]), assay = DefaultAssay(object = query[[reductions[1]]]), key = paste0(new.reduction.name.safe, "_") ) @@ -1759,6 +1860,207 @@ IntegrateEmbeddings.TransferAnchorSet <- function( return(query) } + +#' Integrate embeddings from the integrated sketched.assay +#' +#' The main steps of this procedure are outlined below. For a more detailed +#' description of the methodology, please see Hao, et al Biorxiv 2022: +#' \doi{10.1101/2022.02.24.481684} +#' +#' First learn a atom dictionary representation to reconstruct each cell. +#' Then, using this dictionary representation, +#' reconstruct the embeddings of each cell from the integrated atoms. +#' +#' @param object A Seurat object with all cells for one dataset +#' @param sketched.assay Assay name for sketched-cell expression (default is 'sketch') +#' @param assay Assay name for original expression (default is 'RNA') +#' @param features Features used for atomic sketch integration +#' @param reduction Dimensional reduction name for batch-corrected embeddings +#' in the sketched object (default is 'integrated_dr') +#' @param method Methods to construct sketch-cell representation +#' for all cells (default is 'sketch'). Can be one of: +#' \itemize{ +#' \item \dQuote{\code{sketch}}: Use random sketched data slot +#' \item \dQuote{\code{data}}: Use data slot +#' } +#' @param ratio Sketch ratio of data slot when \code{dictionary.method} is set +#' to \dQuote{\code{sketch}}; defaults to 0.8 +#' @param reduction.name Name to save new reduction as; defaults to +#' \code{paste0(reduction, '.orig')} +#' @param reduction.key Key for new dimensional reduction; defaults to creating +#' one from \code{reduction.name} +#' @param layers Names of layers for correction. +#' @param sketched.layers Names of sketched layers, defaults to all +#' layers of \dQuote{\code{object[[assay]]}} +#' @param seed A positive integer. The seed for the random number generator, defaults to 123. +#' @param verbose Print progress and message +#' +#' @return Returns a Seurat object with an integrated dimensional reduction +#' +#' @importFrom MASS ginv +#' @importFrom Matrix t +#' +#' @export +#' @concept integration +#' +ProjectIntegration <- function( + object, + sketched.assay = 'sketch', # DefaultAssay(object) + assay = 'RNA', + reduction = 'integrated_dr', # harmony; rerun UMAP on this + features = NULL, # VF from object[[atom.assay]] + layers = 'data', + reduction.name = NULL, + reduction.key = NULL, + method = c('sketch', 'data'), + ratio = 0.8, + sketched.layers = NULL, + seed = 123, + verbose = TRUE +) { + + layers <- Layers(object = object[[assay]], search = layers) + # Check input and output dimensional reductions + sketched.layers <- sketched.layers %||% layers + reduction <- match.arg(arg = reduction, choices = Reductions(object = object)) + reduction.name <- reduction.name %||% paste0(reduction, '.full') + reduction.key <- reduction.key %||% Key(object = reduction.name, quiet = TRUE) + if (reduction.name %in% Reductions(object = object)) { + warning( + "'", + reduction.name, + "' already exists, overwriting", + call. = FALSE, + immediate. = TRUE + ) + } + # Check the method being used + method <- method[1L] + method <- match.arg(arg = method) + # Check our layers + sketched.assay <- match.arg(arg = sketched.assay, choices = Assays(object = object)) + assay <- match.arg(arg = assay, choices = Assays(object = object)) + layer.full <- layers + layers <- layers %||% intersect( + x = DefaultLayer(object[[sketched.assay]]), + y = Layers(object[[assay]]) + ) + if (is.null(x = layer.full)) { + sketched.assay.missing <- setdiff(x = layers, DefaultLayer(object = object[[sketched.assay]])) + if (length(x = sketched.assay.missing) == length(x = layers)) { + stop("None of the requested layers are present in the sketched.assay") + } else if (length(x = sketched.assay.missing)) { + warning( + length(x = sketched.assay.missing), + " layers missing from the sketched.assay", + call. = FALSE, + immediate. = TRUE + ) + layers <- intersect(x = layers, y = DefaultLayer(object = object[[sketched.assay]])) + } + } + # check layers + layers.missing <- setdiff(layers, Layers(object = object[[assay]])) + if (length(x = layers.missing)) { + stop('layer ', layers.missing[1L], ' are not present in ', assay, " assay") + } + # check features + features <- features %||% VariableFeatures(object = object[[sketched.assay]]) + # TODO: see if we can handle missing features with `union` + features.atom <- Reduce( + f = intersect, + x = lapply( + X = sketched.layers, + FUN = function(lyr) { + return(Features(x = object[[sketched.assay]], layer = lyr)) + } + ) + ) + features <- intersect(x = features, y = features.atom) + if (length(x = features) == 0) { + stop('Features are not found. Please check VariableFeatures(object[[sketched.assay]]) ', + 'or set features in ProjectIntegration') + } + ncells <- c( + 0, + sapply( + X = layers, + FUN = function(lyr) { + return(length(x = Cells(x = object[[assay]], layer = lyr))) + } + ) + ) + if (length(sketched.layers) == 1) { + sketched.layers <- rep(sketched.layers, length(layers)) + } + sketch.matrix <- switch( + EXPR = method, + data = { + R = as.sparse( + x = diag( + x = length( + x = features) + ) + ) + R + }, + sketch = { + R <- FeatureSketch(features = features, + ratio = ratio, + seed = seed + ) + R + } + ) + emb.list <- list() + cells.list <- list() + for (i in seq_along(along.with = layers)) { + if (length(unique(sketched.layers)) == length(layers)) { + cells.sketch <- Cells(x = object[[sketched.assay]], layer = sketched.layers[i]) + } else if (length(unique(sketched.layers)) == 1) { + cells.sketch <- intersect(Cells(x = object[[sketched.assay]][[sketched.layers[[1]]]]), + Cells(object[[assay]][[layers[i] ]] )) + } + if (isTRUE(x = verbose)) { + message( + length(x = cells.sketch), + ' atomic cells identified in the sketched.assay' + ) + message("Correcting embeddings") + } + emb <- UnSketchEmbeddings( + atom.data = LayerData( + object = object[[sketched.assay]], + layer = layers[i], + features = features + ), + atom.cells = cells.sketch, + orig.data = LayerData( + object = object[[assay]], + layer = layers[i], + features = features + ), + embeddings = Embeddings(object = object[[reduction]]), + sketch.matrix = sketch.matrix) + emb.list[[i]] <- emb + cells.list[[i]] <- colnames(x = emb) + } + emb.all <- t(x = matrix( + data = unlist(emb.list), + nrow = ncol(x = object[[reduction]]), + ncol = length(unlist(cells.list)))) + rownames(emb.all) <- unlist(cells.list) + emb.all <- emb.all[colnames(object[[assay]]), ] + object[[reduction.name]] <- CreateDimReducObject( + embeddings = emb.all, + loadings = Loadings(object = object[[reduction]]), + key = reduction.key, + assay = assay + ) + CheckGC() + return(object) +} + #' Calculate the local structure preservation metric #' #' Calculates a metric that describes how well the local structure of each group @@ -1869,14 +2171,15 @@ LocalStruct <- function( #' @inheritParams IntegrateEmbeddings #' @inheritParams TransferData #' @inheritParams ProjectUMAP +#' @param store.weights Determine if the weight and anchor matrices are stored. #' @param transferdata.args A named list of additional arguments to #' \code{\link{TransferData}} #' @param integrateembeddings.args A named list of additional arguments to #' \code{\link{IntegrateEmbeddings}} #' @param projectumap.args A named list of additional arguments to #' \code{\link{ProjectUMAP}} -#' @return Returns a modified query Seurat object containing: #' +#' @return Returns a modified query Seurat object containing:#' #' \itemize{ #' \item{New Assays corresponding to the features transferred and/or their #' corresponding prediction scores from \code{\link{TransferData}}} @@ -1885,7 +2188,7 @@ LocalStruct <- function( #' reference UMAP using \code{\link{ProjectUMAP}}} #' } #' -#' @importFrom rlang invoke +#' @importFrom rlang exec #' #' @export #' @concept integration @@ -1899,15 +2202,22 @@ MapQuery <- function( reference.reduction = NULL, reference.dims = NULL, query.dims = NULL, + store.weights = FALSE, reduction.model = NULL, transferdata.args = list(), integrateembeddings.args = list(), projectumap.args = list(), verbose = TRUE ) { - + transfer.reduction <- slot(object = anchorset, name = "command")$reduction + if (DefaultAssay(anchorset@object.list[[1]]) %in% Assays(reference)) { + DefaultAssay(reference) <- DefaultAssay(anchorset@object.list[[1]]) + } else { + stop('The assay used to create the anchorset does not match any', + 'of the assays in the reference object.') + } # determine anchor type - if (grepl(pattern = "pca", x = slot(object = anchorset, name = "command")$reduction)) { + if (grepl(pattern = "pca", x = transfer.reduction)) { anchor.reduction <- "pcaproject" # check if the anchorset can be used for mapping if (is.null(x = slot(object = anchorset, name = "command")$reference.reduction)) { @@ -1915,7 +2225,7 @@ MapQuery <- function( 'FindTransferAnchors, so the resulting AnchorSet object cannot be used ', 'in the MapQuery function.') } - } else if (grepl(pattern = "cca", x = slot(object = anchorset, name = "command")$reduction)) { + } else if (grepl(pattern = "cca", x = transfer.reduction)) { anchor.reduction <- "cca" ref.cca.embedding <- Embeddings( slot(object = anchorset, name = "object.list")[[1]][["cca"]] @@ -1945,19 +2255,40 @@ MapQuery <- function( ) reference.reduction <- new.reduction.name <- "cca" reference.dims <- query.dims <- 1:ncol(x = ref.cca.embedding) - } else if (grepl(pattern = "lsi", x = slot(object = anchorset, name = "command")$reduction)) { + } else if (grepl(pattern = "lsi", x = transfer.reduction)) { anchor.reduction <- "lsiproject" - } else { + } else if (grepl(pattern = "direct", x = transfer.reduction)) { + anchor.reduction <- paste0( + slot(object = anchorset, + name = "command")$bridge.assay.name, + ".reduc" + ) + ref.reduction.emb <- Embeddings( + object = + slot( + object = anchorset, + name = "object.list" + )[[1]][[anchor.reduction]])[ + slot(object = anchorset, name = "reference.cells"),] + rownames(ref.reduction.emb) <- gsub( + pattern = "_reference", + replacement = "", + x = rownames(ref.reduction.emb) + ) + reference[[anchor.reduction]] <- CreateDimReducObject( + embeddings = ref.reduction.emb, + key = "L_", + assay = DefaultAssay(reference) + ) + } + else { stop("unkown type of anchors") } - - reference.reduction <- reference.reduction %||% slot(object = anchorset, name = "command")$reference.reduction %||% anchor.reduction new.reduction.name <- new.reduction.name %||% paste0("ref.", reference.reduction) - # checking TransferData parameters td.badargs <- names(x = transferdata.args)[!names(x = transferdata.args) %in% names(x = formals(fun = TransferData))] if (length(x = td.badargs) > 0) { @@ -1975,39 +2306,43 @@ MapQuery <- function( integrateembeddings.args <- integrateembeddings.args[names(x = integrateembeddings.args) %in% names(x = formals(fun = IntegrateEmbeddings.TransferAnchorSet))] integrateembeddings.args$reductions <- integrateembeddings.args$reductions %||% anchor.reduction integrateembeddings.args$weight.reduction <- integrateembeddings.args$weight.reduction %||% anchor.reduction - slot(object = query, name = "tools")$TransferData <- NULL reuse.weights.matrix <- FALSE - if (!is.null(x = refdata)) { - query <- invoke( - .fn = TransferData, - .args = c(list( - anchorset = anchorset, - reference = reference, - query = query, - refdata = refdata, - store.weights = TRUE, - verbose = verbose - ), transferdata.args - ) - ) - if (transferdata.args$weight.reduction == integrateembeddings.args$weight.reduction) { - reuse.weights.matrix <- TRUE - } - } - if (anchor.reduction != "cca"){ - query <- invoke( - .fn = IntegrateEmbeddings, - .args = c(list( - anchorset = anchorset, - reference = reference, - query = query, - new.reduction.name = new.reduction.name, - reuse.weights.matrix = reuse.weights.matrix, - verbose = verbose + td.allarguments <- c(list(anchorset = anchorset, + reference = reference, query = query, refdata = refdata, + store.weights = TRUE, only.weights = is.null(x = refdata), + verbose = verbose), transferdata.args) + query <- exec("TransferData",!!!td.allarguments) + if (inherits(x = transferdata.args$weight.reduction , "character") && + transferdata.args$weight.reduction == integrateembeddings.args$weight.reduction) { + reuse.weights.matrix <- TRUE + } + if (anchor.reduction != "cca") { + ie.allarguments <- c(list( + anchorset = anchorset, + reference = reference, + query = query, + new.reduction.name = new.reduction.name, + reuse.weights.matrix = reuse.weights.matrix, + verbose = verbose ), integrateembeddings.args - ) ) + query <- exec("IntegrateEmbeddings",!!!ie.allarguments) + Misc( + object = query[[new.reduction.name]], + slot = 'ref.dims' + ) <- slot(object = anchorset, name = "command")$dims + } + slot(object = query, name = "tools")$MapQuery <- NULL + if (store.weights) { + slot(object = query, name = "tools")$MapQuery <- slot( + object = query, + name = "tools" + )$TransferData + slot(object = query, name = "tools")$MapQuery$anchor <- slot( + object = anchorset, + name = "anchors" + ) } slot(object = query, name = "tools")$TransferData <- NULL if (!is.null(x = reduction.model)) { @@ -2018,20 +2353,18 @@ MapQuery <- function( query.dims <- reference.dims } ref_nn.num <- Misc(object = reference[[reduction.model]], slot = "model")$n_neighbors - query <- invoke( - .fn = ProjectUMAP, - .args = c(list( - query = query, - query.reduction = new.reduction.name, - query.dims = query.dims, - reference = reference, - reference.dims = reference.dims, - reference.reduction = reference.reduction, - reduction.model = reduction.model, - k.param = ref_nn.num - ), projectumap.args - ) + pu.allarguments <- c(list( + query = query, + query.reduction = new.reduction.name, + query.dims = query.dims, + reference = reference, + reference.dims = reference.dims, + reference.reduction = reference.reduction, + reduction.model = reduction.model, + k.param = ref_nn.num + ), projectumap.args ) + query <- exec("ProjectUMAP",!!!pu.allarguments) } return(query) } @@ -2289,19 +2622,19 @@ MappingScore.AnchorSet <- function( combined.object <- slot(object = anchors, name = "object.list")[[1]] combined.object <- RenameCells( object = combined.object, - new.names = unname(obj = sapply( + new.names = unname(obj = make.unique(sapply( X = Cells(x = combined.object), FUN = RemoveLastField - )) + ))) ) - query.cells <- sapply( + query.cells <- make.unique(sapply( X = slot(object = anchors, name = "query.cells"), FUN = RemoveLastField - ) - ref.cells <- sapply( + )) + ref.cells <- make.unique(sapply( X = slot(object = anchors, name = "reference.cells"), FUN = RemoveLastField - ) + )) query.embeddings <- Embeddings(object = subset( x = combined.object[["pcaproject.l2"]], cells = query.cells @@ -2682,6 +3015,140 @@ SelectIntegrationFeatures <- function( return(features) } +.FeatureRank <- function(features, flist, ranks = FALSE) { + franks <- vapply( + X = features, + FUN = function(x) { + return(median(x = unlist(x = lapply( + X = flist, + FUN = function(fl) { + if (x %in% fl) { + return(which(x = x == fl)) + } + return(NULL) + } + )))) + }, + FUN.VALUE = numeric(length = 1L) + ) + franks <- sort(x = franks) + if (!isTRUE(x = ranks)) { + franks <- names(x = franks) + } + return(franks) +} + +#' Select integration features +#' +#' @param object Seurat object +#' @param nfeatures Number of features to return for integration +#' @param assay Name of assay to use for integration feature selection +#' @param method Which method to pull. For \code{HVFInfo} and +#' \code{VariableFeatures}, choose one from one of the +#' following: +#' \itemize{ +#' \item \dQuote{vst} +#' \item \dQuote{sctransform} or \dQuote{sct} +#' \item \dQuote{mean.var.plot}, \dQuote{dispersion}, \dQuote{mvp}, or +#' \dQuote{disp} +#' } +#' @param layers Name of layers to use for integration feature selection +#' @param verbose Print messages +#' @param ... Arguments passed on to \code{method} +#' +#' @export +#' @concept integration +#' +SelectIntegrationFeatures5 <- function( + object, + nfeatures = 2000, + assay = NULL, + method = NULL, + layers = NULL, + verbose = TRUE, + ... +) { + assay <- assay %||% DefaultAssay(object = object) + layers <- Layers(object = object[[assay]], search = layers) + var.features <- VariableFeatures( + object = object, + assay = assay, + nfeatures = nfeatures, + method = method, + layer = layers, + simplify = TRUE + ) + return(var.features) +} + +#' Select SCT integration features +#' +#' @param object Seurat object +#' @param nfeatures Number of features to return for integration +#' @param assay Name of assay to use for integration feature selection +#' @param verbose Print messages +#' @param ... Arguments passed on to \code{method} +#' +#' @export +#' @concept integration +#' +SelectSCTIntegrationFeatures <- function( + object, + nfeatures = 3000, + assay = NULL, + verbose = TRUE, + ... +) { + assay <- assay %||% DefaultAssay(object = object) + if (!inherits(x = object[[assay]], what = 'SCTAssay')) { + abort(message = "'assay' must be an SCTAssay") + } + models <- levels(x = object[[assay]]) + vf.list <- VariableFeatures( + object = object[[assay]], + layer = models, + nfeatures = nfeatures, + simplify = FALSE + ) + var.features <- sort( + x = table(unlist(x = vf.list, use.names = FALSE)), + decreasing = TRUE + ) + for (i in 1:length(x = models)) { + vst_out <- SCTModel_to_vst(SCTModel = slot(object = object[[assay]], name = "SCTModel.list")[[models[[i]]]]) + var.features <- var.features[names(x = var.features) %in% rownames(x = vst_out$gene_attr)] + } + tie.val <- var.features[min(nfeatures, length(x = var.features))] + features <- names(x = var.features[which(x = var.features > tie.val)]) + if (length(x = features) > 0) { + feature.ranks <- sapply(X = features, FUN = function(x) { + ranks <- sapply(X = vf.list, FUN = function(vf) { + if (x %in% vf) { + return(which(x = x == vf)) + } + return(NULL) + }) + median(x = unlist(x = ranks)) + }) + features <- names(x = sort(x = feature.ranks)) + } + features.tie <- var.features[which(x = var.features == tie.val)] + tie.ranks <- sapply(X = names(x = features.tie), FUN = function(x) { + ranks <- sapply(X = vf.list, FUN = function(vf) { + if (x %in% vf) { + return(which(x = x == vf)) + } + return(NULL) + }) + median(x = unlist(x = ranks)) + }) + features <- c( + features, + names(x = head(x = sort(x = tie.ranks), nfeatures - length(x = features))) + ) + return(features) +} + #' Transfer data #' #' Transfer categorical or continuous data across single-cell datasets. For @@ -2744,6 +3211,7 @@ SelectIntegrationFeatures <- function( #' } #' @param reference Reference object from which to pull data to transfer #' @param query Query object into which the data will be transferred. +#' @param query.assay Name of the Assay to use from query #' @param weight.reduction Dimensional reduction to use for the weighting #' anchors. Options are: #' \itemize{ @@ -2770,6 +3238,7 @@ SelectIntegrationFeatures <- function( #' or "counts" #' @param prediction.assay Return an \code{Assay} object with the prediction #' scores for each class stored in the \code{data} slot. +#' @param only.weights Only return weights matrix #' @param store.weights Optionally store the weights matrix used for predictions #' in the returned query object. #' @@ -2790,8 +3259,12 @@ SelectIntegrationFeatures <- function( #' @references Stuart T, Butler A, et al. Comprehensive Integration of #' Single-Cell Data. Cell. 2019;177:1888-1902 \doi{10.1016/j.cell.2019.05.031} #' +#' @importFrom Matrix t +#' #' @export +#' #' @concept integration +#' #' @examples #' \dontrun{ #' # to install the SeuratData package see https://github.com/satijalab/seurat-data @@ -2824,6 +3297,7 @@ TransferData <- function( refdata, reference = NULL, query = NULL, + query.assay = NULL, weight.reduction = 'pcaproject', l2.norm = FALSE, dims = NULL, @@ -2834,12 +3308,16 @@ TransferData <- function( verbose = TRUE, slot = "data", prediction.assay = FALSE, + only.weights = FALSE, store.weights = TRUE ) { combined.ob <- slot(object = anchorset, name = "object.list")[[1]] anchors <- slot(object = anchorset, name = "anchors") reference.cells <- slot(object = anchorset, name = "reference.cells") query.cells <- slot(object = anchorset, name = "query.cells") + if (!is.null(query)) { + query.assay <- query.assay %||% DefaultAssay(query) + } label.transfer <- list() ValidateParams_TransferData( anchorset = anchorset, @@ -2850,6 +3328,7 @@ TransferData <- function( refdata = refdata, reference = reference, query = query, + query.assay = query.assay, weight.reduction = weight.reduction, l2.norm = l2.norm, dims = dims, @@ -2858,6 +3337,7 @@ TransferData <- function( eps = eps, n.trees = n.trees, verbose = verbose, + only.weights = only.weights, slot = slot, prediction.assay = prediction.assay, label.transfer = label.transfer @@ -2866,8 +3346,10 @@ TransferData <- function( if (verbose) { message("Running PCA on query dataset") } + features <- slot(object = anchorset, name = "anchor.features") query.ob <- query + DefaultAssay(query.ob) <- query.assay query.ob <- ScaleData(object = query.ob, features = features, verbose = FALSE) query.ob <- RunPCA(object = query.ob, npcs = max(dims), features = features, verbose = FALSE) query.pca <- Embeddings(query.ob[['pca']]) @@ -2913,6 +3395,7 @@ TransferData <- function( } weight.reduction <- combined.ob[[weight.reduction]] } + dims <- dims %||% seq_len(length.out = ncol(x = weight.reduction)) if (max(dims) > ncol(x = weight.reduction)) { stop("dims is larger than the number of available dimensions in ", "weight.reduction (", ncol(x = weight.reduction), ").", call. = FALSE) @@ -2948,6 +3431,14 @@ TransferData <- function( integration.name = "integrated", slot = 'weights' ) + if (only.weights) { + if (is.null(x = query)) { + return(weights) + } else { + slot(object = query, name = "tools")[["TransferData"]] <- list(weights.matrix = weights) + return(query) + } + } anchors <- as.data.frame(x = anchors) query.cells <- unname(obj = sapply( X = query.cells, @@ -2965,8 +3456,12 @@ TransferData <- function( anchors$id1 <- refdata[[rd]][anchors[, "cell1"]] reference.ids <- factor(x = anchors$id1, levels = unique(x = refdata[[rd]])) possible.ids <- levels(x = reference.ids) - prediction.mat <- matrix(nrow = nrow(x = anchors), ncol = length(x = possible.ids), data = 0) - for(i in 1:length(x = possible.ids)) { + prediction.mat <- matrix( + nrow = nrow(x = anchors), + ncol = length(x = possible.ids), + data = 0 + ) + for (i in 1:length(x = possible.ids)) { prediction.mat[which(reference.ids == possible.ids[i]), i] = 1 } if (verbose) { @@ -2975,10 +3470,42 @@ TransferData <- function( prediction.scores <- t(x = weights) %*% prediction.mat colnames(x = prediction.scores) <- possible.ids rownames(x = prediction.scores) <- query.cells + if ("bridge.sets" %in% names(anchorset@weight.reduction@misc)) { + bridge.weight <- anchorset@weight.reduction@misc$bridge.sets + bridge.prediction.matrix <- as.sparse( + x = dummy_cols( + refdata[[rd]][ bridge.weight$bridge.ref_anchor ] + )[, -1] + ) + colnames(bridge.prediction.matrix) <- gsub( + pattern = ".data_", + replacement = "", + x = colnames(bridge.prediction.matrix) + ) + extra.id <- setdiff(possible.ids, colnames(bridge.prediction.matrix)) + if (length(extra.id) > 0) { + extra.prediction <- as.sparse(x = matrix( + data = 0, + nrow = nrow(bridge.prediction.matrix), + ncol = length(extra.id) + )) + colnames(extra.prediction) <- extra.id + bridge.prediction.matrix <- cbind( + bridge.prediction.matrix, + extra.prediction + ) + } + bridge.prediction.matrix <- bridge.prediction.matrix[,possible.ids, drop = FALSE] + bridge.prediction.scores <- t(bridge.weight$query.weights) %*% + (t(bridge.weight$bridge.weights) %*% + bridge.prediction.matrix)[bridge.weight$query.ref_anchor,] + prediction.scores <- (prediction.scores + bridge.prediction.scores)/2 + prediction.scores <- as.matrix(x = prediction.scores) + } prediction.ids <- possible.ids[apply(X = prediction.scores, MARGIN = 1, FUN = which.max)] prediction.ids <- as.character(prediction.ids) prediction.max <- apply(X = prediction.scores, MARGIN = 1, FUN = max) - if (is.null(x = query)){ + if (is.null(x = query)) { prediction.scores <- cbind(prediction.scores, max = prediction.max) } predictions <- data.frame( @@ -2991,7 +3518,7 @@ TransferData <- function( predictions <- CreateAssayObject( data = t(x = as.matrix(x = prediction.scores)), check.matrix = FALSE ) - Key(object = predictions) <- paste0("predictionscore", rd.name, "_") + Key(object = predictions) <- Key(paste0("predictionscore", rd.name), quiet = TRUE) } if (is.null(x = query)) { transfer.results[[rd]] <- predictions @@ -3018,7 +3545,7 @@ TransferData <- function( } else if (slot == "data") { new.assay <- CreateAssayObject(data = new.data, check.matrix = FALSE) } - Key(object = new.assay) <- paste0(rd.name, "_") + Key(object = new.assay) <- Key(rd.name, quiet = TRUE) if (is.null(x = query)) { transfer.results[[rd]] <- new.assay } else { @@ -3049,6 +3576,7 @@ TransferData <- function( } } + #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Methods for Seurat-defined generics #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -3411,7 +3939,9 @@ FilterAnchors <- function( return(object) } -FindAnchors <- function( + + +FindAnchors_v3 <- function( object.pair, assay, slot, @@ -3473,6 +4003,7 @@ FindAnchors <- function( max.features = max.features, projected = projected ) + if(length(top.features) == 2){ top.features <- intersect(top.features[[1]], top.features[[2]]) } else{ @@ -3509,6 +4040,137 @@ FindAnchors <- function( return(anchors) } + +FindAnchors_v5 <- function( + object.pair, + assay, + slot, + cells1, + cells2, + internal.neighbors, + reduction, + reduction.2 = character(), + nn.reduction = reduction, + dims = 1:10, + k.anchor = 5, + k.filter = 200, + k.score = 30, + max.features = 200, + nn.method = "annoy", + n.trees = 50, + nn.idx1 = NULL, + nn.idx2 = NULL, + eps = 0, + projected = FALSE, + verbose = TRUE +) { + ref.assay <- assay[1] + query.assay <- assay[2] + reference.layers <- Layers(object.pair[[ref.assay]], search = 'data')[1] + query.layers <- setdiff(Layers(object.pair[[query.assay]], search = 'data'), reference.layers) + anchor.list <- list() + for (i in seq_along(query.layers)) { + cells2.i <- Cells( + x = object.pair[[query.assay]], + layer = query.layers[i] + ) + object.pair.i <- subset( + x = object.pair, + cells = c(cells1, cells2.i) + ) + object.pair.i <- JoinLayers(object.pair.i) + anchor.list[[i]] <- FindAnchors_v3( + object.pair = object.pair.i, + assay = assay, + slot = slot, + cells1 = cells1, + cells2 = cells2.i, + internal.neighbors = internal.neighbors, + reduction = reduction, + reduction.2 = reduction.2, + nn.reduction = nn.reduction, + dims = dims, + k.anchor = k.anchor, + k.filter = k.filter, + k.score = k.score, + max.features = max.features, + nn.method = nn.method, + n.trees = n.trees, + nn.idx1 = nn.idx1, + nn.idx2 = nn.idx2, + eps = eps, + projected = projected, + verbose = verbose + ) + anchor.list[[i]][,2] <- match(x = cells2.i, table = cells2)[anchor.list[[i]][,2]] + anchor.list[[i]] <- t(anchor.list[[i]]) + } + anchors <- t(x = matrix( + data = unlist(x = anchor.list), + nrow = 3, + ncol = sum( + sapply(X = anchor.list, FUN = function(x) ncol(x)) + ) + ) + ) + colnames(anchors) <- c('cell1', 'cell2', 'score') + return(anchors) +} + +FindAnchors <- function( + object.pair, + assay, + slot, + cells1, + cells2, + internal.neighbors, + reduction, + reduction.2 = character(), + nn.reduction = reduction, + dims = 1:10, + k.anchor = 5, + k.filter = 200, + k.score = 30, + max.features = 200, + nn.method = "annoy", + n.trees = 50, + nn.idx1 = NULL, + nn.idx2 = NULL, + eps = 0, + projected = FALSE, + verbose = TRUE +) { + if (inherits(x = object.pair[[assay[1]]], what = 'Assay')) { + FindAnchors.function <- FindAnchors_v3 + } else if (inherits(x = object.pair[[assay[1]]], what = 'Assay5')) { + FindAnchors.function <- FindAnchors_v5 + } + anchors <- FindAnchors.function( + object.pair = object.pair, + assay = assay, + slot = slot, + cells1 = cells1, + cells2 = cells2, + internal.neighbors = internal.neighbors, + reduction = reduction, + reduction.2 = reduction.2, + nn.reduction = nn.reduction, + dims = dims, + k.anchor = k.anchor, + k.filter = k.filter, + k.score = k.score, + max.features = max.features, + nn.method = nn.method, + n.trees = n.trees, + nn.idx1 = nn.idx1, + nn.idx2 = nn.idx2, + eps = eps, + projected = projected, + verbose = verbose + ) + return(anchors) +} + # Find Anchor pairs # FindAnchorPairs <- function( @@ -3581,6 +4243,9 @@ FindIntegrationMatrix <- function( neighbors <- GetIntegrationData(object = object, integration.name = integration.name, slot = 'neighbors') nn.cells1 <- neighbors$cells1 nn.cells2 <- neighbors$cells2 + if (inherits(x = object[[assay[1]]], what = 'Assay5')) { + object <- JoinLayers(object) + } anchors <- GetIntegrationData( object = object, integration.name = integration.name, @@ -3669,8 +4334,10 @@ FindNN <- function( method = nn.method, n.trees = n.trees, eps = eps, + cache.index = TRUE, index = nn.idx1 ) + nn.idx1 <- Index(object = nnaa) } if (!is.null(x = internal.neighbors[[2]])) { nnbb <- internal.neighbors[[2]] @@ -3682,8 +4349,9 @@ FindNN <- function( method = nn.method, n.trees = n.trees, eps = eps, - index = nn.idx1 + cache.index = TRUE ) + nn.idx2 <- Index(object = nnbb) } if (length(x = reduction.2) > 0) { nnab <- NNHelper( @@ -3693,8 +4361,9 @@ FindNN <- function( method = nn.method, n.trees = n.trees, eps = eps, - index = nn.idx2 + index = if (reduction.2 == nn.reduction) nn.idx2 else NULL ) + nnba <- NNHelper( data = Embeddings(object = object[[reduction]])[cells1, nn.dims], query = Embeddings(object = object[[reduction]])[cells2, nn.dims], @@ -3702,7 +4371,7 @@ FindNN <- function( method = nn.method, n.trees = n.trees, eps = eps, - index = nn.idx1 + index = if (reduction == nn.reduction) nn.idx1 else NULL ) } else { dim.data.opposite <- Embeddings(object = object[[reduction]])[ ,dims] @@ -3715,7 +4384,7 @@ FindNN <- function( method = nn.method, n.trees = n.trees, eps = eps, - index = nn.idx2 + index = if (reduction == nn.reduction) nn.idx2 else NULL ) nnba <- NNHelper( data = dims.cells1.opposite, @@ -3724,7 +4393,7 @@ FindNN <- function( method = nn.method, n.trees = n.trees, eps = eps, - index = nn.idx1 + index = if (reduction == nn.reduction) nn.idx1 else NULL ) } object <- SetIntegrationData( @@ -3777,6 +4446,9 @@ FindWeights <- function( to.keep <- !duplicated(x = anchors.cells1) anchors.cells1 <- anchors.cells1[to.keep] anchors.cells2 <- anchors.cells2[to.keep] + if (length(anchors.cells1) < k || length(anchors.cells2) < k) { + stop("Number of anchor cells is less than k.weight. Consider lowering k.weight to less than ", min(length(anchors.cells1), length(anchors.cells2)), " or increase k.anchor.") + } if (is.null(x = features)) { data.use <- Embeddings(object = reduction)[nn.cells1, dims] data.use.query <- Embeddings(object = reduction)[nn.cells2, dims] @@ -3802,6 +4474,9 @@ FindWeights <- function( ) } else { anchors.cells2 <- unique(x = nn.cells2[anchors[, "cell2"]]) + if (length(anchors.cells2) < k) { + stop("Number of anchor cells is less than k.weight. Consider lowering k.weight to less than ", length(anchors.cells2), " or increase k.anchor.") + } if (is.null(x = features)) { data.use <- Embeddings(reduction)[nn.cells2, dims] } else { @@ -3836,7 +4511,6 @@ FindWeights <- function( sd = sd.weight, display_progress = verbose ) - object <- SetIntegrationData( object = object, integration.name = integration.name, @@ -3847,6 +4521,34 @@ FindWeights <- function( } +# Find weight matrix between query and reference cells from a neighbor object +# +# +FindWeightsNN <- function( + nn.obj, + query.cells, + reference.cells, + verbose = TRUE +) { + distances <- Distances(object = nn.obj) + distances <- 1 - (distances / distances[, ncol(x = distances)]) + cell.index <- Indices(object = nn.obj) + weights <- FindWeightsC( + cells2 = 0:(length(query.cells) - 1), + distances = as.matrix(x = distances), + anchor_cells2 = reference.cells, + integration_matrix_rownames = reference.cells, + cell_index = cell.index, + anchor_score = rep(1, length(reference.cells)), + min_dist = 0, + sd = 1, + display_progress = verbose + ) + colnames(weights) <- query.cells + return(weights) +} + + # Work out the anchor cell offsets for given set of cells in anchor list # # @param anchors A dataframe of anchors, from AnchorSet object @@ -4211,7 +4913,8 @@ PairwiseIntegrateReference <- function( scale.data = matrix(), var.features = vector(), meta.features = data.frame(row.names = rownames(x = integrated.data)), - misc = NULL + misc = NULL, + key = paste0(new.assay.name, "_") ) unintegrated[[new.assay.name]] <- new.assay # "unintegrated" now contains the integrated assay @@ -4281,111 +4984,394 @@ ParseRow <- function(clustering, i){ return(unlist(datasets)) } -# Rescale query with mean and sd from reference, or known mean and SD -# -# @param reference A reference object -# @param query A query object -# @param features Features to scale -# @param scale Scale data (divide by SD) -# @return Returns a matrix containing the scaled query data -RescaleQuery <- function( - reference, + +#' @rdname ProjectCellEmbeddings +#' @method ProjectCellEmbeddings Seurat +#' @export +#' +#' +ProjectCellEmbeddings.Seurat <- function( query, - reference.assay = NULL, + reference, query.assay = NULL, - features = NULL, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + normalization.method = c("LogNormalize", "SCT"), + scale = TRUE, + verbose = TRUE, + nCount_UMI = NULL, feature.mean = NULL, feature.sd = NULL, - scale = TRUE + ... ) { - reference.assay <- reference.assay %||% DefaultAssay(object = reference) + if (verbose) { + message("Projecting cell embeddings") + } + normalization.method <- match.arg(arg = normalization.method) query.assay <- query.assay %||% DefaultAssay(object = query) - features <- features %||% intersect( - rownames(x = reference[[reference.assay]]), - rownames(x = query[[query.assay]]) - ) - reference.data <- GetAssayData( - object = reference, - assay = reference.assay, - slot = "data")[features, ] - query.data <- GetAssayData( - object = query, - assay = query.assay, - slot = "data")[features, ] - if (is.null(x = feature.mean)) { - feature.mean <- rowMeans(x = reference.data) - if (scale) { - feature.sd <- sqrt( - x = SparseRowVar2( - mat = as.sparse(x = reference.data), - mu = feature.mean, - display_progress = FALSE - ) - ) - feature.sd[is.na(x = feature.sd)] <- 1 - } else { - feature.sd <- rep(x = 1, nrow( reference.data)) + reference.assay <- reference.assay %||% DefaultAssay(object = reference) + if (normalization.method == 'SCT') { + if (!IsSCT(assay = reference[[reference.assay]])) { + stop('reference in ', reference.assay, ' assay does not have a SCT model' ) + } + reference.model.num <- length(slot(object = reference[[reference.assay]], name = "SCTModel.list")) + if (reference.model.num > 1) { + stop("Given reference assay (", reference.assay, ") has ", reference.model.num , + " reference sct models. Please provide a reference assay with a ", + " single reference sct model.", call. = FALSE) + } else if (reference.model.num == 0) { + stop("Given reference assay (", reference.assay, + ") doesn't contain a reference SCT model.") } - feature.mean[is.na(x = feature.mean)] <- 1 - } - proj.data <- GetAssayData( - object = query, - assay = query.assay, - slot = "data" - )[features, ] - store.names <- dimnames(x = proj.data) - if (is.numeric(x = feature.mean) && feature.mean[[1]] != "SCT") { - proj.data <- FastSparseRowScaleWithKnownStats( - mat = as.sparse(x = proj.data), - mu = feature.mean, - sigma = feature.sd, - display_progress = FALSE - ) } - dimnames(x = proj.data) <- store.names - return(proj.data) + proj.pca <- ProjectCellEmbeddings( + query = query[[query.assay]], + reference = reference, + reference.assay = reference.assay, + reduction = reduction, + dims = dims, + scale = scale, + normalization.method = normalization.method, + verbose = verbose, + nCount_UMI = nCount_UMI, + feature.mean = feature.mean, + feature.sd = feature.sd + ) + return(proj.pca) } -ProjectCellEmbeddings <- function( - reference, +#' @rdname ProjectCellEmbeddings +#' @method ProjectCellEmbeddings Assay +#' @export +#' +ProjectCellEmbeddings.Assay <- function( query, - reduction = "pca", + reference, reference.assay = NULL, - query.assay = NULL, + reduction = "pca", dims = 1:50, scale = TRUE, + normalization.method = NULL, verbose = TRUE, + nCount_UMI = NULL, feature.mean = NULL, - feature.sd = NULL + feature.sd = NULL, + ... ) { - if (verbose) { - message("Projecting cell embeddings") - } - reference.assay <- reference.assay %||% DefaultAssay(object = reference) - query.assay <- query.assay %||% DefaultAssay(object = query) - features <- rownames(x = Loadings(object = reference[[reduction]])) - features <- intersect(x = features, y = rownames(x = query[[query.assay]])) - proj.data <- RescaleQuery( + features <- Reduce( + f = intersect, + x = list( + rownames(x = Loadings(object = reference[[reduction]])), + rownames(x = reference[[reference.assay]]), + rownames(x = query) + ) + ) + if (normalization.method == 'SCT') { + slot <- 'counts' + } else { + slot <- 'data' + } + proj.pca <- ProjectCellEmbeddings( + query = GetAssayData( + object = query, + slot = slot), reference = reference, - query = query, - features = features, + reference.assay = reference.assay, + reduction = reduction, + dims = dims, scale = scale, + normalization.method = normalization.method, + verbose = verbose, + features = features, + nCount_UMI = nCount_UMI, feature.mean = feature.mean, feature.sd = feature.sd ) + return(proj.pca) +} + +#' @rdname ProjectCellEmbeddings +#' @method ProjectCellEmbeddings SCTAssay +#' @export +#' +ProjectCellEmbeddings.SCTAssay <- function( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) { + if (normalization.method != 'SCT') { + warning('Query data is SCT normalized, but normalization.method is set to LogNormalize') + } + features <- Reduce( + f = intersect, + x = list( + rownames(x = Loadings(object = reference[[reduction]])), + rownames(x = reference[[reference.assay]]), + rownames(x = query$scale.data) + ) + ) + query.data <- GetAssayData( + object = query, + slot = "scale.data")[features,] ref.feature.loadings <- Loadings(object = reference[[reduction]])[features, dims] - proj.pca <- t(crossprod(x = ref.feature.loadings, y = proj.data)) + proj.pca <- t(crossprod(x = ref.feature.loadings, y = query.data)) return(proj.pca) } -# Project new data onto SVD (LSI or PCA) -# -# A = U∑V SVD -# U' = VA'/∑ LSI projection -# -# Note that because in LSI we don't multiply by ∑ to get the embeddings (it's just U), -# we need to divide by ∑ in the projection to get the equivalent. Therefore need -# the singular values, which (in Signac RunLSI) we store in the DimReduc misc slot. +#' @rdname ProjectCellEmbeddings +#' @method ProjectCellEmbeddings StdAssay +#' @export +#' +ProjectCellEmbeddings.StdAssay <- function( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) { + reference.assay <- reference.assay %||% DefaultAssay(object = reference) + features <- Reduce( + f = intersect, + x = list( + rownames(x = Loadings(object = reference[[reduction]])), + rownames(x = reference[[reference.assay]]) + ) + ) + if (normalization.method == 'SCT') { + layers.set <- Layers(object = query, search = 'counts') + } else { + layers.set <- Layers(object = query, search = 'data') + } + proj.pca.list <- list() + cell.list <- list() + for (i in seq_along(layers.set)) { + proj.pca.list[[i]] <- t(ProjectCellEmbeddings( + query = LayerData(object = query, layer = layers.set[i]), + reference = reference, + reference.assay = reference.assay, + reduction = reduction, + dims = dims, + scale = scale, + normalization.method = normalization.method, + verbose = verbose, + features = features, + nCount_UMI = nCount_UMI[Cells(x = query, layer = layers.set[i])], + feature.mean = feature.mean, + feature.sd = feature.sd + )) + cell.list[[i]] <- colnames(proj.pca.list[[i]]) + } + proj.pca <- matrix( + data = unlist(proj.pca.list), + nrow = nrow(proj.pca.list[[1]]), + ncol = ncol(query) + ) + rownames(proj.pca) <- rownames(proj.pca.list[[1]]) + colnames(proj.pca) <- unlist(cell.list) + proj.pca <- t(proj.pca) + proj.pca <- proj.pca[colnames(query),] + return(proj.pca) +} + +#' @rdname ProjectCellEmbeddings +#' @method ProjectCellEmbeddings default +#' @export +#' +ProjectCellEmbeddings.default <- function( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + features = NULL, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +){ + features <- features %||% rownames(x = Loadings(object = reference[[reduction]])) +if (normalization.method == 'SCT') { + reference.SCT.model <- slot(object = reference[[reference.assay]], name = "SCTModel.list")[[1]] + query <- FetchResiduals_reference( + object = query, + reference.SCT.model = reference.SCT.model, + features = features, + nCount_UMI = nCount_UMI) +} else { + query <- query[features,] + if(inherits(x = reference[[reference.assay]], what = "Assay5")){ + reference.data.list <- c() + for (i in Layers(object = reference[[reference.assay]], search = "data")) { + reference.data.list[[i]] <- LayerData( + object = reference[[reference.assay]], + layer = i + )[features, ] + } + reference.data <- do.call(cbind, reference.data.list) + } else { + reference.data <- GetAssayData( + object = reference, + assay = reference.assay, + slot = "data")[features, ] + } + if (is.null(x = feature.mean)) { + if (inherits(x = reference.data, what = 'dgCMatrix')) { + feature.mean <- RowMeanSparse(mat = reference.data) + } else if (inherits(x = reference.data, what = "IterableMatrix")) { + bp.stats <- BPCells::matrix_stats(matrix = reference.data, + row_stats = "variance") + feature.mean <- bp.stats$row_stats["mean",] + } else { + feature.mean <- rowMeans2(x = reference.data) + } + if (scale) { + if (inherits(x = reference.data, what = "IterableMatrix")) { + feature.sd <- sqrt(bp.stats$row_stats["variance",]) + } else { + feature.sd <- sqrt(x = RowVarSparse(mat = as.sparse(reference.data))) + } + feature.sd[is.na(x = feature.sd)] <- 1 + feature.sd[feature.sd==0] <- 1 + } else { + feature.sd <- rep(x = 1, nrow(x = reference.data)) + } + feature.mean[is.na(x = feature.mean)] <- 1 + } + store.names <- dimnames(x = query) + if (is.numeric(x = feature.mean)) { + query <- FastSparseRowScaleWithKnownStats( + mat = as.sparse(x = query), + mu = feature.mean, + sigma = feature.sd, + display_progress = FALSE + ) + } + dimnames(x = query) <- store.names +} + ref.feature.loadings <- Loadings(object = reference[[reduction]])[features, dims] + proj.pca <- t(crossprod(x = ref.feature.loadings, y = query)) + return(proj.pca) +} + +#' @rdname ProjectCellEmbeddings +#' @method ProjectCellEmbeddings IterableMatrix +#' @export +#' +#' +ProjectCellEmbeddings.IterableMatrix <- function( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + features = features, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + block.size = 10000, + ... +) { + features <- features %||% rownames(x = Loadings(object = reference[[reduction]])) + features <- intersect(x = features, y = rownames(query)) + if (normalization.method == 'SCT') { + reference.SCT.model <- slot(object = reference[[reference.assay]], name = "SCTModel.list")[[1]] + cells.grid <- split( + x = 1:ncol(query), + f = ceiling(seq_along(along.with = 1:ncol(query)) / block.size)) + proj.list <- list() + for (i in seq_along(along.with = cells.grid)) { + query.i <- FetchResiduals_reference( + object = as.sparse(query[,cells.grid[[i]]]), + reference.SCT.model = reference.SCT.model, + features = features, + nCount_UMI = nCount_UMI[colnames(query)[cells.grid[[i]]]]) + proj.list[[i]] <- t(Loadings(object = reference[[reduction]])[features, dims]) %*% query.i + } + proj.pca <- t(matrix( + data = unlist(x = proj.list), + nrow = length(x = dims), + ncol = ncol(x = query), + dimnames = list( + colnames(x = Embeddings(object = reference[[reduction]]))[dims], + colnames(x = query)) + )) + } else { + query <- query[features,] + reference.data.list <- c() + for (i in Layers(object = reference[[reference.assay]], + search = "data")) { + reference.data.list[[i]] <- LayerData(object = reference[[reference.assay]], + layer = i)[features, ] + } + reference.data <- do.call(cbind, reference.data.list) + if (is.null(x = feature.mean)) { + if (inherits(x = reference.data, what = 'dgCMatrix')) { + feature.mean <- RowMeanSparse(mat = reference.data) + } else if (inherits(x = reference.data, what = "IterableMatrix")) { + bp.stats <- BPCells::matrix_stats( + matrix = reference.data, + row_stats = "variance") + feature.mean <- bp.stats$row_stats["mean",] + } else { + feature.mean <- rowMeans(mat = reference.data) + } + if (scale) { + if (inherits(x = reference.data, what = "IterableMatrix")) { + feature.sd <- sqrt(x = bp.stats$row_stats["variance",]) + } else { + feature.sd <- sqrt( + x = RowVarSparse( + mat = as.sparse(x = reference.data) + ) + ) + } + feature.sd[is.na(x = feature.sd)] <- 1 + feature.sd[feature.sd==0] <- 1 + } else { + feature.sd <- rep(x = 1, nrow(x = reference.data)) + } + feature.mean[is.na(x = feature.mean)] <- 1 + } + query.scale <- BPCells::min_by_row(mat = query, vals = 10 * feature.sd + feature.mean) + query.scale <- (query.scale - feature.mean) / feature.sd + proj.pca <- t(query.scale) %*% Loadings(object = reference[[reduction]])[features,dims] + rownames(x = proj.pca) <- colnames(x = query) + colnames(x = proj.pca) <- colnames(x = Embeddings(object = reference[[reduction]]))[dims] + } + return(proj.pca) +} + +# Project new data onto SVD (LSI or PCA) +# +# A = U∑V SVD +# U' = VA'/∑ LSI projection +# +# Note that because in LSI we don't multiply by ∑ to get the embeddings (it's just U), +# we need to divide by ∑ in the projection to get the equivalent. Therefore need +# the singular values, which (in Signac RunLSI) we store in the DimReduc misc slot. # # @param reduction A \code{DimReduc} object containing the SVD dimension # reduction. Assumes original irlba output is stored in the misc slot of the dimreduc. @@ -4426,7 +5412,7 @@ ProjectSVD <- function( if (verbose) { message("Projecting new data onto SVD") } - projected.u <- as.matrix(x = crossprod(x = vt, y = data)) + projected.u <- as.matrix(t(vt) %*% data) if (mode == "lsi") { components <- slot(object = reduction, name = 'misc') sigma <- components$d @@ -4813,12 +5799,15 @@ TransformDataMatrix <- function( new.expression <- new.expression[, colnames(object)] new.assay <- new( Class = 'Assay', + # key = paste0(new.assay.name,"_"), counts = new(Class = "dgCMatrix"), data = new.expression, scale.data = matrix(), var.features = vector(), meta.features = data.frame(row.names = rownames(x = new.expression)), - misc = NULL + misc = NULL, + # key = paste0(new.assay.name, "_") + key = Key(object = new.assay.name, quiet = TRUE) ) object[[new.assay.name]] <- new.assay return(object) @@ -4886,6 +5875,12 @@ ValidateParams_FindTransferAnchors <- function( if (reduction == "lsiproject") { ModifyParam(param = "k.filter", value = NA) } + # commented out to enable filter anchors for v5 assay + # if (inherits(x = reference[[reference.assay]], what = 'Assay5') || + # inherits(x = query[[query.assay]], what = 'Assay5')) { + # # current filter anchors not support for v5 assay + # ModifyParam(param = "k.filter", value = NA) + # } if (!is.na(x = k.filter) && k.filter > ncol(x = query)) { warning("k.filter is larger than the number of cells present in the query.\n", "Continuing without anchor filtering.", @@ -4906,6 +5901,10 @@ ValidateParams_FindTransferAnchors <- function( stop("The project.query workflow is not compatible with reduction = 'cca'", call. = FALSE) } + if (normalization.method == "SCT" && isTRUE(x = project.query) && !IsSCT(query[[query.assay]])) { + stop("In the project.query workflow, normalization is SCT, but query is not SCT normalized", + call. = FALSE) + } if (IsSCT(assay = query[[query.assay]]) && IsSCT(assay = reference[[reference.assay]]) && normalization.method != "SCT") { warning("Both reference and query assays have been processed with SCTransform.", @@ -4930,6 +5929,8 @@ ValidateParams_FindTransferAnchors <- function( ModifyParam(param = "recompute.residuals", value = recompute.residuals) } if (recompute.residuals) { + # recompute.residuals only happens in ProjectCellEmbeddings, so k.filter set to NA. + ModifyParam(param = "k.filter", value = NA) reference.model.num <- length(x = slot(object = reference[[reference.assay]], name = "SCTModel.list")) if (reference.model.num > 1) { stop("Given reference assay (", reference.assay, ") has ", reference.model.num , @@ -4968,14 +5969,21 @@ ValidateParams_FindTransferAnchors <- function( "you can set recompute.residuals to FALSE", call. = FALSE) } } - query <- SCTransform( - object = query, - reference.SCT.model = slot(object = reference[[reference.assay]], name = "SCTModel.list")[[1]], - residual.features = features, - assay = query.umi.assay, - new.assay.name = new.sct.assay, - verbose = FALSE - ) + if (reduction %in% c('cca', 'rpca')) { + query <- SCTransform( + object = query, + reference.SCT.model = slot(object = reference[[reference.assay]], name = "SCTModel.list")[[1]], + residual.features = features, + assay = query.umi.assay, + new.assay.name = new.sct.assay, + verbose = FALSE + ) + } else { + new.sct.assay <- query.umi.assay + } + + + DefaultAssay(query) <- new.sct.assay ModifyParam(param = "query.assay", value = new.sct.assay) ModifyParam(param = "query", value = query) ModifyParam(param = "reference", value = reference) @@ -4989,13 +5997,35 @@ ValidateParams_FindTransferAnchors <- function( "SCTransform. Please either run SCTransform or set normalization.method = 'LogNormalize'.", call. = FALSE) } + # Make data slot if DNE + if (inherits(x = query[[query.assay]], what = "Assay5")){ + if (is.null( + tryCatch(expr = Layers(object = query[[query.assay]], search = 'data'), + error = function (e) return(NULL)) + ) + ) { + LayerData( + object = query[[query.assay]], + layer = "data") <- sparseMatrix( + i = 1, + j = 1, + x = 1, + dims = c(nrow(x = query[[query.assay]]), + ncol(x = query[[query.assay]]) + ) + ) + ModifyParam(param = "query", value = query) + } + } # features must be in both reference and query - feature.slot <- ifelse(test = normalization.method == "SCT", yes = "scale.data", no = "data") query.assay.check <- query.assay reference.assay.check <- reference.assay - ref.features <- rownames(x = GetAssayData(object = reference[[reference.assay.check]], slot = feature.slot)) - query.features <- rownames(x = GetAssayData(object = query[[query.assay.check]], slot = feature.slot)) + ref.features <- rownames(x = reference[[reference.assay.check]]) + query.features <- rownames(x = query[[query.assay.check]]) if (normalization.method == "SCT") { + if (IsSCT(query[[query.assay.check]])) { + query.features <- rownames(x = query[[query.assay.check]]$scale.data) + } query.model.features <- rownames(x = Misc(object = query[[query.assay]])$vst.out$gene_attr) query.features <- unique(c(query.features, query.model.features)) ref.model.features <- rownames(x = Misc(object = reference[[reference.assay]])$vst.out$gene_attr) @@ -5097,6 +6127,7 @@ ValidateParams_TransferData <- function( query.cells, reference, query, + query.assay, refdata, weight.reduction, l2.norm, @@ -5107,90 +6138,103 @@ ValidateParams_TransferData <- function( n.trees, verbose, slot, + only.weights, prediction.assay, label.transfer ) { - if (!inherits(x = refdata, what = "list")) { - refdata <- list(id = refdata) - } - for (i in 1:length(x = refdata)) { - if (inherits(x = refdata[[i]], what = c("character", "factor"))) { - # check is it's in the reference object - if (length(x = refdata[[i]]) == 1) { - if (is.null(x = reference)) { - warning("If providing a single string to refdata element number ", i, - ", please provide the reference object. Skipping element ", i, - ".", call. = FALSE, immediate. = TRUE) + ## check refdata + if (is.null(refdata)) { + if (!only.weights) { + stop("refdata is NULL and only.weights is FALSE") + } + } else { + if (!inherits(x = refdata, what = "list")) { + refdata <- list(id = refdata) + } + for (i in 1:length(x = refdata)) { + if (inherits(x = refdata[[i]], what = c("character", "factor"))) { + # check is it's in the reference object + if (length(x = refdata[[i]]) == 1) { + if (is.null(x = reference)) { + warning("If providing a single string to refdata element number ", i, + ", please provide the reference object. Skipping element ", i, + ".", call. = FALSE, immediate. = TRUE) + refdata[[i]] <- FALSE + next + } + if (refdata[[i]] %in% Assays(object = reference)) { + refdata[[i]] <- GetAssayData(object = reference, assay = refdata[[i]]) + colnames(x = refdata[[i]]) <- paste0(colnames(x = refdata[[i]]), "_reference") + label.transfer[[i]] <- FALSE + next + } else if (refdata[[i]] %in% colnames(x = reference[[]])) { + refdata[[i]] <- reference[[refdata[[i]]]][, 1] + } else { + warning("Element number ", i, " provided to refdata does not exist in ", + "the provided reference object.", call. = FALSE, immediate. = TRUE) + refdata[[i]] <- FALSE + next + } + } else if (length(x = refdata[[i]]) != length(x = reference.cells)) { + warning("Please provide a vector that is the same length as the number ", + "of reference cells used in anchor finding.\n", + "Length of vector provided: ", length(x = refdata[[i]]), "\n", + "Length of vector required: ", length(x = reference.cells), + "\nSkipping element ", i, ".", call. = FALSE, immediate. = TRUE) refdata[[i]] <- FALSE - next } - if (refdata[[i]] %in% Assays(object = reference)) { - refdata[[i]] <- GetAssayData(object = reference, assay = refdata[[i]]) - colnames(x = refdata[[i]]) <- paste0(colnames(x = refdata[[i]]), "_reference") - label.transfer[[i]] <- FALSE - next - } else if (refdata[[i]] %in% colnames(x = reference[[]])) { - refdata[[i]] <- reference[[refdata[[i]]]][, 1] - } else { - warning("Element number ", i, " provided to refdata does not exist in ", - "the provided reference object.", call. = FALSE, immediate. = TRUE) + label.transfer[[i]] <- TRUE + } else if (inherits(x = refdata[[i]], what = c("dgCMatrix", "matrix"))) { + if (ncol(x = refdata[[i]]) != length(x = reference.cells)) { + warning("Please provide a matrix that has the same number of columns as ", + "the number of reference cells used in anchor finding.\n", + "Number of columns in provided matrix : ", ncol(x = refdata[[i]]), "\n", + "Number of columns required : ", length(x = reference.cells), + "\nSkipping element ", i, ".", call. = FALSE, immediate. = TRUE) refdata[[i]] <- FALSE - next + } else { + colnames(x = refdata[[i]]) <- paste0(colnames(x = refdata[[i]]), "_reference") + if (any(!colnames(x = refdata[[i]]) == reference.cells)) { + if (any(!colnames(x = refdata[[i]]) %in% reference.cells) || any(!reference.cells %in% colnames(x = refdata[[i]]))) { + warning("Some (or all) of the column names of the provided refdata ", + "don't match the reference cells used in anchor finding ", + "\nSkipping element", i, ".", call. = FALSE, immediate. = TRUE) + refdata[[i]] <- FALSE + } else { + refdata[[i]] <- refdata[[i]][, reference.cells] + } + } + } + if (!slot %in% c("counts", "data")) { + stop("Please specify slot as either 'counts' or 'data'.") } - } else if (length(x = refdata[[i]]) != length(x = reference.cells)) { - warning("Please provide a vector that is the same length as the number ", - "of reference cells used in anchor finding.\n", - "Length of vector provided: ", length(x = refdata[[i]]), "\n", - "Length of vector required: ", length(x = reference.cells), - "\nSkipping element ", i, ".", call. = FALSE, immediate. = TRUE) + label.transfer[[i]] <- FALSE + } else { + warning("Please provide either a vector (character or factor) for label ", + "transfer or a matrix for feature transfer. \nType provided: ", + class(x = refdata[[i]])) refdata[[i]] <- FALSE } - label.transfer[[i]] <- TRUE - } else if (inherits(x = refdata[[i]], what = c("dgCMatrix", "matrix"))) { - if (ncol(x = refdata[[i]]) != length(x = reference.cells)) { - warning("Please provide a matrix that has the same number of columns as ", - "the number of reference cells used in anchor finding.\n", - "Number of columns in provided matrix : ", ncol(x = refdata[[i]]), "\n", - "Number of columns required : ", length(x = reference.cells), - "\nSkipping element ", i, ".", call. = FALSE, immediate. = TRUE) - refdata[[i]] <- FALSE - } else { - colnames(x = refdata[[i]]) <- paste0(colnames(x = refdata[[i]]), "_reference") - if (any(!colnames(x = refdata[[i]]) == reference.cells)) { - if (any(!colnames(x = refdata[[i]]) %in% reference.cells) || any(!reference.cells %in% colnames(x = refdata[[i]]))) { - warning("Some (or all) of the column names of the provided refdata ", - "don't match the reference cells used in anchor finding ", - "\nSkipping element", i, ".", call. = FALSE, immediate. = TRUE) - refdata[[i]] <- FALSE - } else { - refdata[[i]] <- refdata[[i]][, reference.cells] - } + if (names(x = refdata)[i] == "") { + possible.names <- make.unique(names = c(names(x = refdata), paste0("e", i))) + names(x = refdata)[i] <- possible.names[length(x = possible.names)] + if (verbose) { + message("refdata element ", i, " is not named. Setting name as ", names(x = refdata)[i]) } } - if (!slot %in% c("counts", "data")) { - stop("Please specify slot as either 'counts' or 'data'.") - } - label.transfer[[i]] <- FALSE - } else { - warning("Please provide either a vector (character or factor) for label ", - "transfer or a matrix for feature transfer. \nType provided: ", - class(x = refdata[[i]])) - refdata[[i]] <- FALSE - } - if (names(x = refdata)[i] == "") { - possible.names <- make.unique(names = c(names(x = refdata), paste0("e", i))) - names(x = refdata)[i] <- possible.names[length(x = possible.names)] - if (verbose) { - message("refdata element ", i, " is not named. Setting name as ", names(x = refdata)[i]) - } } + ModifyParam(param = "label.transfer", value = label.transfer) + if (all(unlist(x = lapply(X = refdata, FUN = isFALSE)))) { + stop("None of the provided refdata elements are valid.", call. = FALSE) + } + ModifyParam(param = "refdata", value = refdata) } - ModifyParam(param = "label.transfer", value = label.transfer) - if (all(unlist(x = lapply(X = refdata, FUN = isFALSE)))) { - stop("None of the provided refdata elements are valid.", call. = FALSE) - } - ModifyParam(param = "refdata", value = refdata) - valid.weight.reduction <- c("pcaproject", "pca", "cca", "rpca.ref","lsiproject", "lsi") + + + + + object.reduction <- Reductions(object = slot(object = anchorset, name = "object.list")[[1]]) + valid.weight.reduction <- c("pcaproject", "pca", "cca", "rpca.ref","lsiproject", "lsi", object.reduction) if (!inherits(x = weight.reduction, "DimReduc")) { if (!weight.reduction %in% valid.weight.reduction) { stop("Please provide one of ", paste(valid.weight.reduction, collapse = ", "), " or a custom DimReduc to ", @@ -5216,15 +6260,15 @@ ValidateParams_TransferData <- function( ncol(x = weight.reduction), ").", call. = FALSE) } } else { - if (is.null(x = dims)) { - ModifyParam(param = "dims", value = 1:length(x = slot(object = anchorset, name = "command")$dims)) + if (is.null(x = dims) && !is.null(x = slot(object = anchorset, name = "command")$dims)) { + ModifyParam(param = "dims", value = 1:length(x = slot(object = anchorset, name = "command")$dims)) } } if (!is.null(x = query)) { if (!isTRUE(x = all.equal( target = gsub(pattern = "_query", replacement = "", x = query.cells), - current = Cells(x = query), + current = colnames(x = query[[query.assay]]), check.attributes = FALSE) )) { stop("Query object provided contains a different set of cells from the ", @@ -5300,7 +6344,9 @@ ValidateParams_IntegrateEmbeddings_IntegrationAnchors <- function( for (i in 1:nobs) { if (!isTRUE(all.equal( target = Cells(x = weight.reduction[[i]]), - current = Cells(x = object.list[[i]]))) + current = Cells(x = object.list[[i]]), + check.attributes = FALSE + )) ) { stop("Cell names in the provided weight.reduction ", i, " don't ", "match with the cell names in object ", i, ".", call. = FALSE) @@ -5334,6 +6380,7 @@ ValidateParams_IntegrateEmbeddings_TransferAnchors <- function( combined.object , reference, query, + query.assay, reductions, dims.to.integrate, k.weight, @@ -5348,13 +6395,13 @@ ValidateParams_IntegrateEmbeddings_TransferAnchors <- function( } reference.cells <- slot(object = anchorset, name = "reference.cells") reference.cells <- gsub(pattern = "_reference", replacement = "", x = reference.cells) - if (!isTRUE(x = all.equal(target = reference.cells, current = as.character(x = Cells(x = reference))))) { + if (!isTRUE(x = all.equal(target = reference.cells, current = Cells(x = reference), check.attributes = FALSE))) { stop("The set of cells used as a reference in the AnchorSet does not match ", "the set of cells provided in the reference object.") } query.cells <- slot(object = anchorset, name = "query.cells") query.cells <- gsub(pattern = "_query", replacement = "", x = query.cells) - if (!isTRUE(x = all.equal(target = query.cells, current = Cells(x = query), check.attributes = FALSE))) { + if (!isTRUE(x = all.equal(target = query.cells, current = colnames(x = query[[query.assay]]), check.attributes = FALSE))) { stop("The set of cells used as a query in the AnchorSet does not match ", "the set of cells provided in the query object.") } @@ -5371,7 +6418,7 @@ ValidateParams_IntegrateEmbeddings_TransferAnchors <- function( reference[[reductions]] <- CreateDimReducObject(embeddings = reference.embeddings, assay = DefaultAssay(object = reference)) ModifyParam(param = "reference", value = reference) query <- RenameCells(object = query, new.names = paste0(Cells(x = query), "_query")) - query.embeddings <- Embeddings(object = combined.object[[reductions]])[Cells(x = query), ] + query.embeddings <- Embeddings(object = combined.object[[reductions]])[Cells(x = query[[query.assay]]), ] query[[reductions]] <- CreateDimReducObject(embeddings = query.embeddings, assay = DefaultAssay(object = query)) ModifyParam(param = "query", value = query) ModifyParam(param = "reductions", value = c(reductions, reductions)) @@ -5438,7 +6485,8 @@ ValidateParams_IntegrateEmbeddings_TransferAnchors <- function( weight.reduction <- RenameCells(object = weight.reduction, new.names = paste0(Cells(x = weight.reduction), "_query")) if (!isTRUE(all.equal( target = Cells(x = weight.reduction), - current = Cells(x = query) + current = Cells(x = query), + check.attributes = FALSE ))) { stop("Cell names in the provided weight.reduction don't ", "match with the cell names in the query object.", call. = FALSE) @@ -5447,3 +6495,1480 @@ ValidateParams_IntegrateEmbeddings_TransferAnchors <- function( } } } + + +#' Convert Neighbor class to an asymmetrical Graph class +#' +#' +#' @param nn.object A neighbor class object +#' @param col.cells Cells names of the neighbors, cell names in nn.object is used by default +#' @param weighted Determine if use distance in the Graph +#' +#' @return Returns a Graph object +#' +#' @importFrom Matrix sparseMatrix +#' +#' @export +#' @concept integration +#' +NNtoGraph <- function( + nn.object, + col.cells = NULL, + weighted = FALSE +) { + select_nn <- Indices(object = nn.object) + col.cells <- col.cells %||% Cells(x = nn.object) + ncol.nn <- length(x = col.cells) + k.nn <- ncol(x = select_nn) + j <- as.numeric(x = t(x = select_nn)) + i <- ((1:length(x = j)) - 1) %/% k.nn + 1 + if (weighted) { + select_nn_dist <- Distances(object = nn.object) + dist.element <- as.numeric(x = t(x = select_nn_dist)) + nn.matrix <- sparseMatrix( + i = i, + j = j, + x = dist.element, + dims = c(nrow(x = select_nn), ncol.nn) + ) + } else { + nn.matrix <- sparseMatrix( + i = i, + j = j, + x = 1, + dims = c(nrow(x = select_nn), ncol.nn) + ) + } + rownames(x = nn.matrix) <- Cells(x = nn.object) + colnames(x = nn.matrix) <- col.cells + nn.matrix <- as.Graph(x = nn.matrix) + return(nn.matrix) +} + + +# Find Anchor directly from assay +# +# +# @return Returns a TranserAnchor or Integration set +FindAssayAnchor <- function( + object.list, + reference = NULL, + anchor.type = c("Transfer", "Integration"), + assay = "Bridge", + slot = "data", + reduction = NULL, + k.anchor = 20, + k.score = 50, + verbose = TRUE +) { + anchor.type <- match.arg(arg = anchor.type) + reduction.name <- reduction %||% paste0(assay, ".reduc") + if ( is.null(x = reduction) || !reduction %in% Reductions(object.list[[1]])) { + object.list <- lapply(object.list, function(x) { + if (is.null(reduction)) { + x[[reduction.name]] <- CreateDimReducObject( + embeddings = t(GetAssayData( + object = x, + slot = slot, + assay = assay + )), + key = "L_", + assay = assay + ) + } + DefaultAssay(x) <- assay + x <- DietSeurat(x, assays = assay, dimreducs = reduction.name) + return(x) + } + ) +} + object.both <- merge(object.list[[1]], object.list[[2]], merge.dr = reduction.name) + objects.ncell <- sapply(X = object.list, FUN = function(x) dim(x = x)[2]) + offsets <- as.vector(x = cumsum(x = c(0, objects.ncell)))[1:length(x = object.list)] + if (verbose) { + message("Finding ", anchor.type," anchors from assay ", assay) + } + anchors <- FindAnchors(object.pair = object.both, + assay = c(DefaultAssay(object.both), DefaultAssay(object.both)), + slot = 'data', + cells1 = colnames(object.list[[1]]), + cells2 = colnames(object.list[[2]]), + internal.neighbors = NULL, + reduction = reduction.name, + k.anchor = k.anchor, + k.score = k.score, + dims = 1:ncol(object.both[[reduction.name]]), + k.filter = NA, + verbose = verbose + ) + inte.anchors <- anchors + inte.anchors[, 1] <- inte.anchors[, 1] + offsets[1] + inte.anchors[, 2] <- inte.anchors[, 2] + offsets[2] + # determine all anchors + inte.anchors <- rbind(inte.anchors, inte.anchors[, c(2, 1, 3)]) + inte.anchors <- AddDatasetID( + anchor.df = inte.anchors, + offsets = offsets, + obj.lengths = objects.ncell + ) + command <- LogSeuratCommand(object = object.list[[1]], return.command = TRUE) + anchor.features <- rownames(object.both) + if (anchor.type == "Integration") { + anchor.set <- new(Class = "IntegrationAnchorSet", + object.list = object.list, + reference.objects = reference %||% seq_along(object.list), + anchors = inte.anchors, + weight.reduction = object.both[[reduction.name]], + offsets = offsets, + anchor.features = anchor.features, + command = command + ) + } else if (anchor.type == "Transfer") { + reference.index <- reference + reference <- object.list[[reference.index]] + query <- object.list[[setdiff(c(1,2), reference.index)]] + query <- RenameCells( + object = query, + new.names = paste0(Cells(x = query), "_", "query") + ) + reference <- RenameCells( + object = reference, + new.names = paste0(Cells(x = reference), "_", "reference") + ) + combined.ob <- suppressWarnings(expr = merge( + x = reference, + y = query, + merge.dr = reduction.name + )) + anchor.set <- new( + Class = "TransferAnchorSet", + object.list = list(combined.ob), + reference.cells = colnames(x = reference), + query.cells = colnames(x = query), + anchors = anchors, + anchor.features = anchor.features, + command = command + ) + } + return(anchor.set) +} + + +#' Construct a dictionary representation for each unimodal dataset +#' +#' +#' @param object.list A list of Seurat objects +#' @param bridge.object A multi-omic bridge Seurat which is used as the basis to +#' represent unimodal datasets +#' @param object.reduction A list of dimensional reductions from object.list used +#' to be reconstructed by bridge.object +#' @param bridge.reduction A list of dimensional reductions from bridge.object used +#' to reconstruct object.reduction +#' @param laplacian.reduction Name of bridge graph laplacian dimensional reduction +#' @param laplacian.dims Dimensions used for bridge graph laplacian dimensional reduction +#' @param bridge.assay.name Assay name used for bridge object reconstruction value (default is 'Bridge') +#' @param return.all.assays Whether to return all assays in the object.list. +#' Only bridge assay is returned by default. +#' @param l2.norm Whether to l2 normalize the dictionary representation +#' @param verbose Print messages and progress +#' +#' @importFrom MASS ginv +#' @return Returns a object list in which each object has a bridge cell derived assay +#' +#' @export +#' @concept integration +#' +BridgeCellsRepresentation <- function(object.list, + bridge.object, + object.reduction, + bridge.reduction, + laplacian.reduction = 'lap', + laplacian.dims = 1:50, + bridge.assay.name = "Bridge", + return.all.assays = FALSE, + l2.norm = TRUE, + verbose = TRUE +) { + my.lapply <- ifelse( + test = verbose && nbrOfWorkers() == 1, + yes = pblapply, + no = future_lapply + ) + if (verbose) { + message("Constructing Bridge-cells representation") + } + single.object = FALSE + if (length(x = object.list) == 1 & + inherits(x = object.list, what = 'Seurat') + ) { + object.list <- list(object.list) + single.object = TRUE + } + dims.list <- list() + for (i in 1:length(object.reduction)) { + ref.dims <- list( + object= Misc(object.list[[i]][[object.reduction[[i]]]], slot = 'ref.dims'), + bridge = Misc( bridge.object[[bridge.reduction[[i]]]], slot = 'ref.dims') + ) + all.dims <- list( + object = 1:ncol(object.list[[i]][[object.reduction[[i]]]]), + bridge = 1:ncol( bridge.object[[bridge.reduction[[i]] ]]) + ) + projected.dims.index <- which(sapply(ref.dims, function(x) !is.null(x))) + if (length(projected.dims.index) == 0) { + warning('No reference dims found in the dimensional reduction,', + ' all dims in the dimensional reduction will be used.') + if (all.dims[[1]] == all.dims[[2]]) { + dims.list[[i]] <- all.dims + } else { + stop( 'The number of dimensions in the object.list ', + object.reduction[[i]], + ' (', length(all.dims[[1]]), ') ', + ' and the number of dimensions in the bridge object ', + bridge.reduction[[i]], + ' (', length(all.dims[[2]]), ') ', + ' is different.') + } + } else { + reference.dims.index <- setdiff(c(1:2), projected.dims.index) + dims.list[[i]] <- list() + dims.list[[i]][[reference.dims.index]] <- ref.dims[[projected.dims.index ]] + dims.list[[i]][[projected.dims.index]] <- all.dims[[projected.dims.index]] + names(dims.list[[i]]) <- c('object', 'bridge') + } + } + object.list <- my.lapply( + X = 1:length(x = object.list), + FUN = function(x) { + SA.inv <- ginv( + X = Embeddings( + object = bridge.object, + reduction = bridge.reduction[[x]] + )[ ,dims.list[[x]]$bridge] + ) + if (!is.null(laplacian.reduction)) { + lap.vector <- Embeddings(bridge.object[[laplacian.reduction]])[,laplacian.dims] + X <- Embeddings( + object = object.list[[x]], + reduction = object.reduction[[x]] + )[, dims.list[[x]]$object] %*% (SA.inv %*% lap.vector) + } else { + X <- Embeddings( + object = object.list[[x]], + reduction = object.reduction[[x]] + )[, dims.list[[x]]$object] %*% SA.inv + colnames(X) <- Cells(bridge.object) + } + if (l2.norm) { + X <- L2Norm(mat = X, MARGIN = 1) + } + colnames(x = X) <- paste0('bridge_', colnames(x = X)) + suppressWarnings( + object.list[[x]][[bridge.assay.name]] <- CreateAssayObject(data = t(X)) + ) + object.list[[x]][[bridge.assay.name]]@misc$SA.inv <- SA.inv + DefaultAssay(object.list[[x]]) <- bridge.assay.name + VariableFeatures(object = object.list[[x]]) <- rownames(object.list[[x]]) + return (object.list[[x]]) + } + ) + if (!return.all.assays) { + object.list <- my.lapply( + X = object.list, + FUN = function(x) { + x <- DietSeurat(object = x, assays = bridge.assay.name, scale.data = TRUE) + return(x) + } + ) + } + if (single.object) { + object.list <- object.list[[1]] + } + return(object.list) +} + +#' Find bridge anchors between two unimodal datasets +#' +#' First, bridge object is used to reconstruct two single-modality profiles and +#' then project those cells into bridage graph laplacian space. +#' Next, find a set of anchors between two single-modality objects. These +#' anchors can later be used to integrate embeddings or transfer data from the reference to +#' query object using the \code{\link{MapQuery}} object. +#' +#' \itemize{ +#' \item{ Bridge cells reconstruction +#' } +#' \item{ Find anchors between objects. It can be either IntegrationAnchors or TransferAnchor. +#' } +#' } +#' +#' @inheritParams BridgeCellsRepresentation +#' @param anchor.type The type of anchors. Can +#' be one of: +#' \itemize{ +#' \item{Integration: Generate IntegrationAnchors for integration} +#' \item{Transfer: Generate TransferAnchors for transfering data} +#' } +#' @param reference A vector specifying the object/s to be used as a reference +#' during integration or transfer data. +#' @param reduction Dimensional reduction to perform when finding anchors. Can +#' be one of: +#' \itemize{ +#' \item{cca: Canonical correlation analysis} +#' \item{direct: Use assay data as a dimensional reduction} +#' } +#' @param reference.bridge.stored If refernece has stored the bridge dictionary representation +#' @param k.anchor How many neighbors (k) to use when picking anchors +#' @param k.score How many neighbors (k) to use when scoring anchors +#' @param verbose Print messages and progress +#' @param ... Additional parameters passed to \code{FindIntegrationAnchors} or +#' \code{FindTransferAnchors} +#' +#' +#' @return Returns an \code{\link{AnchorSet}} object that can be used as input to +#' \code{\link{IntegrateEmbeddings}}.or \code{\link{MapQuery}} +#' +#' @keywords internal +#' +FindBridgeAnchor <- function(object.list, + bridge.object, + object.reduction, + bridge.reduction, + anchor.type = c("Transfer", "Integration"), + reference = NULL, + laplacian.reduction = "lap", + laplacian.dims = 1:50, + reduction = c("direct", "cca"), + bridge.assay.name = "Bridge", + reference.bridge.stored = FALSE, + k.anchor = 20, + k.score = 50, + verbose = TRUE, + ... + ) { + anchor.type <- match.arg(arg = anchor.type) + reduction <- match.arg(arg = reduction) + if (!is.null(laplacian.reduction)) { + bridge.method <- "bridge graph" + } else { + bridge.method <- "bridge cells" + } + if (verbose) { + message("Finding ", anchor.type," anchors") + switch( + EXPR = bridge.method, + "bridge graph" = { + message('Transform cells to bridge graph laplacian space') + }, + "bridge cells" = { + message('Transform cells to bridge cells space') + } + ) + } + reference <- reference %||% c(1) + query <- setdiff(c(1,2), reference) + if (anchor.type == "Transfer") { + stored.bridge.weights <- FALSE + # check weight matrix + if (is.null(bridge.object@tools$MapQuery)) { + warning("No weights stored between reference and bridge obejcts.", + "Please set store.weights to TRUE in MapQuery") + } else if (is.null(object.list[[query]]@tools$MapQuery)) { + warning("No weights stored between query and bridge obejcts.", + "Please set store.weights to TRUE in MapQuery") + } else { + stored.bridge.weights <- TRUE + } + } + if (reference.bridge.stored) { + object.list[[query]] <- BridgeCellsRepresentation( + object.list = object.list[[query]] , + bridge.object = bridge.object, + object.reduction = object.reduction[[query]] , + bridge.reduction = bridge.reduction[[query]] , + bridge.assay.name = bridge.assay.name, + laplacian.reduction = laplacian.reduction, + laplacian.dims = laplacian.dims, + verbose = verbose + ) + } else { + object.list <- BridgeCellsRepresentation( + object.list = object.list , + bridge.object = bridge.object, + object.reduction = object.reduction, + bridge.reduction = bridge.reduction, + bridge.assay.name = bridge.assay.name, + laplacian.reduction = laplacian.reduction, + laplacian.dims = laplacian.dims, + verbose = verbose + ) + } + if (reduction == "direct") { + anchor <- FindAssayAnchor( + object.list = object.list , + reference = reference, + slot = "data", + anchor.type = anchor.type, + assay = bridge.assay.name, + k.anchor = k.anchor, + k.score = k.score, + verbose = verbose + ) + } else if (reduction == "cca") { + # set data slot to scale.data slot + object.list <- lapply( + X = object.list, + FUN = function(x) { + x <- SetAssayData( + object = x, + slot = "scale.data", + new.data = as.matrix( + x = GetAssayData(object = x, slot = "data") + )) + return(x) + } + ) + anchor <- switch(EXPR = anchor.type, + "Integration" = { + anchor <- FindIntegrationAnchors( + object.list = object.list, + k.filter = NA, + reference = reference, + reduction = "cca", + scale = FALSE, + k.anchor = k.anchor, + k.score = k.score, + verbose = verbose, + ...) + object.merge <- merge(x = object.list[[1]], + y = object.list[2:length(object.list)] + ) + slot( + object = anchor, + name = "weight.reduction" + ) <- CreateDimReducObject( + embeddings = t(GetAssayData( + object = object.merge, + slot = 'data' + )), + key = "L_", + assay = bridge.assay.name + ) + anchor + }, + "Transfer" = { + anchor <- FindTransferAnchors( + reference = object.list[[reference]], + query = object.list[[query]], + reduction = "cca", + scale = FALSE, + k.filter = NA, + k.anchor = k.anchor, + k.score = k.score, + verbose = verbose, + ... + ) + } + ) + } + if (anchor.type == "Transfer") { + if (stored.bridge.weights) { + slot( object = anchor,name = "weight.reduction" + )@misc$bridge.sets <- list( + bridge.weights = slot(object = bridge.object, + name = "tools" + )$MapQuery_PrepareBridgeReference$weights.matrix, + bridge.ref_anchor = slot(object = bridge.object, + name = "tools" + )$MapQuery_PrepareBridgeReference$anchor[,1], + query.weights = slot(object = object.list[[query]], + name = "tools" + )$MapQuery$weights.matrix, + query.ref_anchor = slot(object = object.list[[query]], + name = "tools" + )$MapQuery$anchor[,1] + ) + } + } + slot(object = anchor, name = "command") <- LogSeuratCommand( + object = object.list[[1]], + return.command = TRUE + ) + return(anchor) +} + + +# Helper function to transfer labels based on neighbors object +# @param nn.object the query neighbors object +# @param reference.object the reference seurat object +# @param group.by A vector of variables to group cells by +# @param weight.matrix A reference x query cell weight matrix +# @return Returns a list for predicted labels, prediction score and matrix +#' @importFrom Matrix sparseMatrix +#' @importFrom fastDummies dummy_cols +#' @importFrom Matrix rowMeans t +#' +TransferLablesNN <- function( + nn.object = NULL, + weight.matrix = NULL, + reference.labels +){ + reference.labels.matrix <- CreateCategoryMatrix(labels = as.character(reference.labels)) + if (!is.null(x = weight.matrix) & !is.null(x = nn.object)) { + warning('both nn.object and weight matrix are set. Only weight matrix is used for label transfer') + } + if (is.null(x = weight.matrix)) { + select_nn <- Indices(nn.object) + k.nn <- ncol(select_nn) + j <- as.numeric(x = t(x = select_nn )) + i <- ((1:length(x = j)) - 1) %/% k.nn + 1 + nn.matrix <- sparseMatrix( + i = i, + j = j, + x = 1, + dims = c(nrow(select_nn), nrow(reference.labels.matrix)) + ) + rownames(nn.matrix) <- Cells(nn.object) + } else if (nrow(weight.matrix) == nrow(reference.labels.matrix)) { + nn.matrix <- t(weight.matrix) + k.nn <- 1 + } else if (ncol(weight.matrix) == nrow(reference.labels.matrix)) { + nn.matrix <- weight.matrix + k.nn <- 1 + } else { + stop('wrong weights matrix input') + } + query.label.mat <- nn.matrix %*% reference.labels.matrix + query.label.mat <- query.label.mat/k.nn + prediction.max <- apply(X = query.label.mat, MARGIN = 1, FUN = which.max) + + query.label <- colnames(x = query.label.mat)[prediction.max] + query.label.score <- apply(X = query.label.mat, MARGIN = 1, FUN = max) + names(query.label) <- names(query.label.score) <- rownames(query.label.mat) + if (is.factor(reference.labels)) { + levels(query.label) <- levels(reference.labels) + } + output.list <- list(labels = query.label, + scores = query.label.score, + prediction.mat = query.label.mat + ) + return(output.list) +} + +# transfer continuous value based on neighbors +# +TransferExpressionNN<- function( + nn.object, + reference.object, + var.name = NULL +) { + nn.matrix <- NNtoGraph(nn.object = nn.object, + col.cells = Cells(reference.object) + ) + reference.exp.matrix <- FetchData(object = reference.object, vars = var.name) + # remove NA + reference.exp.matrix <- reference.exp.matrix[complete.cases(reference.exp.matrix), ,drop= F] + nn.matrix <- nn.matrix[, rownames(reference.exp.matrix)] + + # remove NO neighbor query + nn.sum <- RowSumSparse(mat = nn.matrix) + nn.matrix <- nn.matrix[nn.sum > 2, ] + nn.sum <- nn.sum[nn.sum>2] + + # transfer data + reference.exp.matrix <- as.matrix(reference.exp.matrix) + query.exp.mat <- nn.matrix %*% reference.exp.matrix + query.exp.mat <- sweep(x = query.exp.mat, MARGIN = 1, STATS = nn.sum, FUN = "/") + + # set output for all query cells + query.exp.all <- data.frame(row.names = Cells(nn.object)) + query.exp.all[rownames(query.exp.mat),1] <- query.exp.mat[,1] + colnames(query.exp.all) <- var.name + return(query.exp.all) +} + + +#' @param reduction.name dimensional reduction name, lap by default +#' @param graph The name of graph +#' @rdname RunGraphLaplacian +#' @concept dimensional_reduction +#' @export +#' @method RunGraphLaplacian Seurat +#' +RunGraphLaplacian.Seurat <- function( + object, + graph, + reduction.name = "lap", + reduction.key ="LAP_", + n = 50, + verbose = TRUE, + ... +) { + lap_dir <- RunGraphLaplacian(object = object[[graph]], + n = n, + reduction.key = reduction.key , + verbose = verbose, + ... + ) + object[[reduction.name]] <- lap_dir + return(object) +} + + + +#' @param n Total Number of Eigenvectors to compute and store (50 by default) +#' @param reduction.key dimensional reduction key, specifies the string before +#' the number for the dimension names. LAP by default +#' @param verbose Print message and process +#' @param ... Arguments passed to eigs_sym +#' +#' +#' @concept dimensional_reduction +#' @rdname RunGraphLaplacian +#' @export +#' +#' @importFrom Matrix diag t rowSums +#' @importFrom RSpectra eigs_sym +RunGraphLaplacian.default <- function(object, + n = 50, + reduction.key ="LAP_", + verbose = TRUE, + ... +) { + if (!all( + slot(object = t(x = object), name = "x") == slot(object = object, name = "x") + )) { + stop("Input graph is not symmetric") + } + if (verbose) { + message("Generating normalized laplacian graph") + } + D_half <- sqrt(x = rowSums(x = object)) + L <- -1 * (t(object / D_half) / D_half) + diag(L) <- 1 + diag(L) + if (verbose) { + message("Performing eigendecomposition of the normalized laplacian graph") + } + L_eigen <- eigs_sym(L, k = n + 1, which = "SM", ...) + #delete the first eigen vector + new_order <- n:1 + lap_output <- list(eigen_vector = Re(L_eigen$vectors[, new_order]), + eigen_value = L_eigen$values[new_order] + ) + rownames(lap_output$eigen_vector) <- colnames(object) + colnames(lap_output$eigen_vector) <- paste0(reduction.key, 1:n ) + lap_dir <- CreateDimReducObject(embeddings = lap_output$eigen_vector, + key = reduction.key, + assay = DefaultAssay(object), + stdev = lap_output$eigen_value + ) + return(lap_dir) +} + + +# Check if the var.name already existed in the meta.data +# +CheckMetaVarName <- function(object, var.name) { + if (var.name %in% colnames(x = object[[]])) { + var.name.exist <- var.name + var.name <- rev( + x = make.unique( + names = c(colnames(object[[]]), var.name.exist) + ) + )[1] + warning(var.name.exist, " is already existed in the meta.data. ", + var.name, " will store leverage score value") + } + return(var.name) +} + + + +# Run hnsw to find neighbors +# +# @param data Data to build the index with +# @param query A set of data to be queried against data +# @param metric Distance metric; can be one of "euclidean", "cosine", "manhattan", +# "hamming" +# @param k Number of neighbors +# @param ef_construction A larger value means a better quality index, but increases build time. +# @param ef Higher values lead to improved recall at the expense of longer search time. +# @param n_threads Maximum number of threads to use. +# @param index optional index object, will be recomputed if not provided +#' @importFrom RcppHNSW hnsw_build hnsw_search +# +HnswNN <- function(data, + query = data, + metric = "euclidean", + k, + ef_construction = 200, + ef = 10, + index = NULL, + n_threads = 0 +) { + idx <- index %||% hnsw_build( + X = data, + distance = metric, + ef = ef_construction, + n_threads = n_threads + ) + nn <- hnsw_search( + X = query, + ann = idx, + k = k, + ef = ef, + n_threads = n_threads + ) + names(nn) <- c("nn.idx", "nn.dists") + nn$idx <- idx + nn$alg.info <- list(metric = metric, ndim = ncol(x = data)) + return(nn) +} + + +# Calculate reference index from the integrated object +# +IntegrationReferenceIndex <- function(object) { + if (is.null(object@tools$Integration@sample.tree)) { + reference.index <- object@commands$FindIntegrationAnchors$reference + if (length(x = reference.index) > 1) { + stop('the number of the reference is bigger than 1') + } + } else { + reference.index <- SampleIntegrationOrder(tree = object@tools$Integration@sample.tree)[1] + } + return(reference.index) +} + + +# Calculate mean and sd +# +SparseMeanSd <- function(object, + assay = NULL, + slot = 'data', + features = NULL, + eps = 1e-8 +){ + assay <- assay%||% DefaultAssay(object) + features <- features %||% rownames(object[[assay]]) + assay <- assay %||% DefaultAssay(object = object) + mat <- GetAssayData(object = object[[assay]], slot = slot)[features,] + if (class(mat)[1] !='dgCMatrix'){ + stop('Matrix is not sparse') + } + mat.mean <- RowMeanSparse(mat) + mat.sd <- sqrt(RowVarSparse(mat)) + names(mat.mean) <- names(mat.sd) <- rownames(mat) + mat.sd <- MinMax(data = mat.sd, min = eps, max = max(mat.sd)) + output <- list(mean = mat.mean, sd = mat.sd) + return(output) +} + + + +# Run PCA on sparse matrix +# +#' @importFrom Matrix t +#' @importFrom rlang exec +#' @importFrom irlba irlba +# +# +RunPCA_Sparse <- function( + object, + features = NULL, + reduction.key = "PCsp_", + reduction.name = "pca.sparse", + npcs = 50, + do.scale = TRUE, + verbose = TRUE +) { + features <- features %||% VariableFeatures(object) + data <- GetAssayData(object = object, slot = "data")[features,] + n <- npcs + args <- list(A = t(data), nv = n) + args$center <- RowMeanSparse(data) + feature.var <- RowVarSparse(data) + args$totalvar <- sum(feature.var) + if (do.scale) { + args$scale <- sqrt(feature.var) + args$scale <- MinMax(args$scale, min = 1e-8, max = max(args$scale)) + } else { + args$scale <- FALSE + } + if (verbose) { + message("Running PCA") + } + pca.irlba <- exec(.fn = irlba, !!!args) + sdev <- pca.irlba$d/sqrt(max(1, ncol(data) - 1)) + feture.loadings <- pca.irlba$v + rownames(feture.loadings) <- rownames(data) + embeddings <- sweep(x = pca.irlba$u, MARGIN = 2, STATS = pca.irlba$d, FUN = "*") + rownames(embeddings) <- colnames(data) + colnames(feture.loadings) <- colnames(embeddings) <- paste0(reduction.key, 1:npcs) + object[[reduction.name]] <- CreateDimReducObject( + embeddings = embeddings, + loadings = feture.loadings, + stdev = sdev, + key = reduction.key, + assay = DefaultAssay(object), + misc = list(d = pca.irlba$d) + ) + return(object) +} + +# Smoothing labels based on the clusters +# @param labels the original labels +# @param clusters the clusters that are used to smooth labels +# +SmoothLabels <- function(labels, clusters) { + cluster.set <- unique(clusters) + smooth.labels <- labels + for (c in cluster.set) { + cell.c <- which(clusters == c) + smooth.labels[cell.c] <- names(sort(table(labels[cell.c]), decreasing = T)[1]) + } + return(smooth.labels) +} + + + +#' Project query data to reference dimensional reduction +#' +#' @param query Query object +#' @param reference Reference object +#' @param mode Projection mode name for projection +#' \itemize{ +#' \item{pcaproject: PCA projection} +#' \item{lsiproject: LSI projection} +#' } +#' @param reference.reduction Name of dimensional reduction in the reference object +#' @param combine Determine if query and reference objects are combined +#' @param query.assay Assay used for query object +#' @param reference.assay Assay used for reference object +#' @param features Features used for projection +#' @param do.scale Determine if scale expression matrix in the pcaproject mode +#' @param reduction.name dimensional reduction name, reference.reduction is used by default +#' @param reduction.key dimensional reduction key, the key in reference.reduction +#' is used by default +#' @param verbose Print progress and message +#' +#' @return Returns a query-only or query-reference combined seurat object +#' +#' @export +#' @concept integration +#' +ProjectDimReduc <- function(query, + reference, + mode = c('pcaproject', 'lsiproject'), + reference.reduction, + combine = FALSE, + query.assay = NULL, + reference.assay = NULL, + features = NULL, + do.scale = TRUE, + reduction.name = NULL, + reduction.key= NULL, + verbose = TRUE +) { + query.assay <- query.assay %||% DefaultAssay(object = query) + reference.assay <- reference.assay %||% DefaultAssay(object = reference) + DefaultAssay(object = query) <- query.assay + DefaultAssay(object = reference) <- reference.assay + reduction.name <- reduction.name %||% reference.reduction + reduction.key <- reduction.key %||% Key(object = reference[[reference.reduction]]) + if (reduction.name %in% Reductions(object = query)) { + warning(reduction.name, + ' already exists in the query object. It will be overwritten.' + ) + } + features <- features %||% rownames(x = Loadings(object = reference[[reference.reduction]])) + features <- intersect(x = features, y = rownames(x = query)) + if (mode == 'lsiproject') { + if (verbose) { + message('LSI projection to ', reference.reduction) + } + projected.embeddings <- ProjectSVD( + reduction = reference[[reference.reduction]], + data = GetAssayData(object = query, assay = query.assay, slot = "data"), + mode = "lsi", + do.center = FALSE, + do.scale = FALSE, + features = features, + use.original.stats = FALSE, + verbose = verbose + ) + } else if (mode == 'pcaproject') { + if (inherits(query[[query.assay]], what = 'SCTAssay')) { + if (verbose) { + message('PCA projection to ', reference.reduction, ' in SCT assay') + } + query <- suppressWarnings( + expr = GetResidual(object = query, + assay = query.assay, + features = features, + verbose = FALSE) + ) + query.mat <- GetAssayData(object = query, slot = 'scale.data')[features,] + + projected.embeddings <- t( + crossprod(x = Loadings( + object = reference[[reference.reduction]])[features, ], + y = query.mat + ) + ) + } else { + if (verbose) { + message('PCA projection to ', reference.reduction) + } + projected.embeddings <- ProjectCellEmbeddings( + reference = reference, + reduction = reference.reduction, + query = query, + scale = do.scale, + dims = 1:ncol(reference[[reference.reduction]]), + feature.mean = NULL, + verbose = verbose + ) + } + } + query[[reduction.name]] <- CreateDimReducObject( + embeddings = projected.embeddings, + loadings = Loadings(reference[[reference.reduction]])[features,], + assay = query.assay, + key = reduction.key, + misc = Misc(reference[[reference.reduction]]) + ) + if (combine) { + query <- DietSeurat(object = query, + dimreducs = reduction.name, + features = features, + assays = query.assay + ) + reference <- DietSeurat(object = reference, + dimreducs = reference.reduction, + features = features, + assays = reference.assay) + suppressWarnings( + combine.obj <- merge(query, reference, + merge.dr = c(reduction.name, reference.reduction) + ) + ) + Idents(combine.obj) <- c(rep(x = 'query', times = ncol(query)), + rep(x = 'reference', times = ncol(reference)) + ) + return(combine.obj) + } else { + return(query) + } +} + + +#' Prepare the bridge and reference datasets +#' +#' Preprocess the multi-omic bridge and unimodal reference datasets into +#' an extended reference. +#' This function performs the following three steps: +#' 1. Performs within-modality harmonization between bridge and reference +#' 2. Performs dimensional reduction on the SNN graph of bridge datasets via +#' Laplacian Eigendecomposition +#' 3. Constructs a bridge dictionary representation for unimodal reference cells +#' +#' @param reference A reference Seurat object +#' @param bridge A multi-omic bridge Seurat object +#' @param reference.reduction Name of dimensional reduction of the reference object (default is 'pca') +#' @param reference.dims Number of dimensions used for the reference.reduction (default is 50) +#' @param normalization.method Name of normalization method used: LogNormalize +#' or SCT +#' @param reference.assay Assay name for reference (default is \code{\link{DefaultAssay}}) +#' @param bridge.ref.assay Assay name for bridge used for reference mapping. RNA by default +#' @param bridge.query.assay Assay name for bridge used for query mapping. ATAC by default +#' @param supervised.reduction Type of supervised dimensional reduction to be performed +#' for integrating the bridge and query. +#' #' Options are: +#' \itemize{ +#' \item{slsi: Perform supervised LSI as the dimensional reduction for +#' the bridge-query integration} +#' \item{spca: Perform supervised PCA as the dimensional reduction for +#' the bridge-query integration} +#' \item{NULL: no supervised dimensional reduction will be calculated. +#' bridge.query.reduction is used for the bridge-query integration} +#' } +#' @param bridge.query.reduction Name of dimensions used for the bridge-query harmonization. +#' 'bridge.query.reduction' and 'supervised.reduction' cannot be NULL together. +#' @param bridge.query.features Features used for bridge query dimensional reduction +#' (default is NULL which uses VariableFeatures from the bridge object) +#' @param laplacian.reduction.name Name of dimensional reduction name of graph laplacian eigenspace (default is 'lap') +#' @param laplacian.reduction.key Dimensional reduction key (default is 'lap_') +#' @param laplacian.reduction.dims Number of dimensions used for graph laplacian eigenspace (default is 50) +#' @param verbose Print progress and message (default is TRUE) +#' +#' @return Returns a \code{BridgeReferenceSet} that can be used as input to +#' \code{\link{FindBridgeTransferAnchors}}. +#' The parameters used are stored in the \code{BridgeReferenceSet} as well +#' +#' @export +#' @concept integration +#' +PrepareBridgeReference <- function ( + reference, + bridge, + reference.reduction = 'pca', + reference.dims = 1:50, + normalization.method = c('SCT', 'LogNormalize'), + reference.assay = NULL, + bridge.ref.assay = 'RNA', + bridge.query.assay = 'ATAC', + supervised.reduction = c('slsi', 'spca', NULL), + bridge.query.reduction = NULL, + bridge.query.features = NULL, + laplacian.reduction.name = 'lap', + laplacian.reduction.key = 'lap_', + laplacian.reduction.dims = 1:50, + verbose = TRUE +) { + ## checking + if (!is.null(supervised.reduction)) { + supervised.reduction <- match.arg(arg = supervised.reduction) + } + if (!is.null(x = bridge.query.reduction) & !is.null(x = supervised.reduction)) { + stop('bridge.query.reduction and supervised.reduction can only set one.', + 'If you want to set bridge.query.reduction, supervised.reduction should set to NULL') + } + if (is.null(x = bridge.query.reduction) & is.null(x = supervised.reduction)) { + stop('Both bridge.query.reduction and supervised.reduction are NULL. One of them needs to be set') + } + bridge.query.features <- bridge.query.features %||% + VariableFeatures(object = bridge[[bridge.query.assay]]) + if (length(x = bridge.query.features) == 0) { + stop('bridge object ', bridge.query.assay, + ' assay has no variable genes and bridge.query.features has no input') + } + # modality harmonization + reference.assay <- reference.assay %||% DefaultAssay(reference) + DefaultAssay(reference) <- reference.assay + DefaultAssay(bridge) <- bridge.ref.assay + ref.anchor <- FindTransferAnchors( + reference = reference, + reference.reduction = reference.reduction, + normalization.method = normalization.method, + dims = reference.dims, + query = bridge, + recompute.residuals = TRUE, + features = rownames(reference[[reference.reduction]]@feature.loadings), + k.filter = NA, + verbose = verbose + ) + bridge <- MapQuery(anchorset = ref.anchor, + reference = reference, + query = bridge, + store.weights = TRUE, + verbose = verbose + ) + bridge.ref.reduction <- paste0('ref.', reference.reduction) + bridge <- FindNeighbors(object = bridge, + reduction = bridge.ref.reduction, + dims = 1:ncol(x = bridge[[bridge.ref.reduction]]), + return.neighbor = FALSE, + graph.name = c('bridge.ref.nn', 'bridge.ref.snn'), + prune.SNN = 0) + bridge <- RunGraphLaplacian(object = bridge, + graph = "bridge.ref.snn", + reduction.name = laplacian.reduction.name, + reduction.key = laplacian.reduction.key, + verbose = verbose) + DefaultAssay(object = bridge) <- bridge.query.assay + if (!is.null(supervised.reduction)) { + bridge <- switch(EXPR = supervised.reduction, + 'slsi' = { + bridge.reduc <- RunSLSI(object = bridge, + features = VariableFeatures(bridge), + graph = 'bridge.ref.nn', + assay = bridge.query.assay + ) + bridge.reduc + }, + 'spca' = { + bridge.reduc <- RunSPCA(object = bridge, + features = VariableFeatures(bridge), + graph = 'bridge.ref.snn', + assay = bridge.query.assay + ) + bridge.reduc + } + ) + } + # bridge representation + reference.bridge <- BridgeCellsRepresentation( + object.list = reference, + bridge.object = bridge, + object.reduction = c(reference.reduction), + bridge.reduction = c(bridge.ref.reduction), + laplacian.reduction = laplacian.reduction.name, + laplacian.dims = laplacian.reduction.dims + ) + reference[['Bridge']] <- reference.bridge[['Bridge']] + reference <- merge(x = reference, y = bridge, merge.dr = NA) + reference@tools$MapQuery_PrepareBridgeReference <- bridge@tools$MapQuery + command <- LogSeuratCommand(object = reference, return.command = TRUE) + slot(object = command, name = "params")$bridge.query.features <- NULL + command.name <- slot(object = command, name = "name") + reference[[command.name]] <- command + return(reference) +} + + +#' Find bridge anchors between query and extended bridge-reference +#' +#' Find a set of anchors between unimodal query and the other unimodal reference +#' using a pre-computed \code{\link{BridgeReferenceSet}}. +#' This function performs three steps: +#' 1. Harmonize the bridge and query cells in the bridge query reduction space +#' 2. Construct the bridge dictionary representations for query cells +#' 3. Find a set of anchors between query and reference in the bridge graph laplacian eigenspace +#' These anchors can later be used to integrate embeddings or transfer data from the reference to +#' query object using the \code{\link{MapQuery}} object. + +#' @param extended.reference BridgeReferenceSet object generated from +#' \code{\link{PrepareBridgeReference}} +#' @param query A query Seurat object +#' @param query.assay Assay name for query-bridge integration +#' @param scale Determine if scale the query data for projection +#' @param dims Number of dimensions for query-bridge integration +#' @param reduction Dimensional reduction to perform when finding anchors. +#' Options are: +#' \itemize{ +#' \item{pcaproject: Project the PCA from the bridge onto the query. We +#' recommend using PCA when bridge and query datasets are from scRNA-seq} +#' \item{lsiproject: Project the LSI from the bridge onto the query. We +#' recommend using LSI when bridge and query datasets are from scATAC-seq or scCUT&TAG data. +#' This requires that LSI or supervised LSI has been computed for the bridge dataset, and the +#' same features (eg, peaks or genome bins) are present in both the bridge +#' and query. +#' } +#' } +#' @param bridge.reduction Dimensional reduction to perform when finding anchors. Can +#' be one of: +#' \itemize{ +#' \item{cca: Canonical correlation analysis} +#' \item{direct: Use assay data as a dimensional reduction} +#' } +#' @param verbose Print messages and progress +#' +#' @return Returns an \code{AnchorSet} object that can be used as input to +#' \code{\link{TransferData}}, \code{\link{IntegrateEmbeddings}} and +#' \code{\link{MapQuery}}. +#' +#' @export +#' @concept integration +#' +FindBridgeTransferAnchors <- function( + extended.reference, + query, + query.assay = NULL, + dims = 1:30, + scale = FALSE, + reduction = c('lsiproject', 'pcaproject'), + bridge.reduction = c('direct', 'cca'), + verbose = TRUE +) { + bridge.reduction <- match.arg(arg = bridge.reduction) + reduction <- match.arg(arg = reduction) + query.assay <- query.assay %||% DefaultAssay(query) + DefaultAssay(query) <- query.assay + command.name <- grep(pattern = 'PrepareBridgeReference', + x = names(slot(object = extended.reference, name = 'commands')), + value = TRUE) + params <- Command(object = extended.reference, command = command.name) + bridge.query.assay <- params$bridge.query.assay + bridge.query.reduction <- params$bridge.query.reduction %||% params$supervised.reduction + reference.reduction <- params$reference.reduction + bridge.ref.reduction <- paste0('ref.', reference.reduction) + DefaultAssay(extended.reference) <- bridge.query.assay + extended.reference.bridge <- DietSeurat( + object = extended.reference, + assays = bridge.query.assay, + dimreducs = c(bridge.ref.reduction, bridge.query.reduction, params$laplacian.reduction.name) + ) + query.anchor <- FindTransferAnchors( + reference = extended.reference.bridge, + reference.reduction = bridge.query.reduction, + dims = dims, + query = query, + reduction = reduction, + scale = scale, + features = rownames(Loadings(extended.reference[[bridge.query.reduction]])), + k.filter = NA, + verbose = verbose + ) + + query <- MapQuery(anchorset = query.anchor, + reference = extended.reference.bridge, + query = query, + store.weights = TRUE + ) + DefaultAssay(extended.reference) <- 'Bridge' + bridge_anchor <- FindBridgeAnchor( + object.list = list(DietSeurat(object = extended.reference, assays = 'Bridge'), query), + bridge.object = extended.reference.bridge, + object.reduction = c(reference.reduction, paste0('ref.', bridge.query.reduction)), + bridge.reduction = c(bridge.ref.reduction, bridge.query.reduction), + anchor.type = "Transfer", + reduction = bridge.reduction, + reference.bridge.stored = TRUE, + verbose = verbose + ) + return(bridge_anchor) +} + + + +#' Find integration bridge anchors between query and extended bridge-reference +#' +#' Find a set of anchors between unimodal query and the other unimodal reference +#' using a pre-computed \code{\link{BridgeReferenceSet}}. +#' These integration anchors can later be used to integrate query and reference +#' using the \code{\link{IntegrateEmbeddings}} object. +#' +#' @inheritParams FindBridgeTransferAnchors +#' @param integration.reduction Dimensional reduction to perform when finding anchors +#' between query and reference. +#' Options are: +#' \itemize{ +#' \item{direct: find anchors directly on the bridge representation space} +#' \item{cca: perform cca on the on the bridge representation space and then find anchors +#' } +#' } +#' +#' @return Returns an \code{AnchorSet} object that can be used as input to +#' \code{\link{IntegrateEmbeddings}}. +#' +#' @export +#' @concept integration +#' +FindBridgeIntegrationAnchors <- function( + extended.reference, + query, + query.assay = NULL, + dims = 1:30, + scale = FALSE, + reduction = c('lsiproject', 'pcaproject'), + integration.reduction = c('direct', 'cca'), + verbose = TRUE +) { + reduction <- match.arg(arg = reduction) + integration.reduction <- match.arg(arg = integration.reduction) + query.assay <- query.assay %||% DefaultAssay(query) + DefaultAssay(query) <- query.assay + command.name <- grep(pattern = 'PrepareBridgeReference', + x = names(slot(object = extended.reference, name = 'commands')), + value = TRUE) + params <- Command(object = extended.reference, command = command.name) + bridge.query.assay <- params$bridge.query.assay + bridge.query.reduction <- params$bridge.query.reduction %||% params$supervised.reduction + reference.reduction <- params$reference.reduction + bridge.ref.reduction <- paste0( 'ref.', params$bridge.ref.reduction) + DefaultAssay(extended.reference) <- bridge.query.assay + + extended.reference.bridge <- DietSeurat( + object = extended.reference, + assays = bridge.query.assay, + dimreducs = c(bridge.query.reduction, bridge.ref.reduction, params$laplacian.reduction.name) + ) + + query.anchor <- FindTransferAnchors( + reference = extended.reference.bridge, + reference.reduction = bridge.query.reduction, + dims = dims, + query = query, + reduction = reduction, + scale = scale, + features = rownames(Loadings(extended.reference.bridge[[bridge.query.reduction]])), + k.filter = NA, + verbose = verbose + ) + query <- MapQuery(anchorset = query.anchor, + reference = extended.reference.bridge, + query = query, + store.weights = TRUE + ) + DefaultAssay(extended.reference) <- 'Bridge' + bridge_anchor <- FindBridgeAnchor( + object.list = list(DietSeurat(object = extended.reference, assays = 'Bridge'), query), + bridge.object = extended.reference.bridge, + reduction = integration.reduction, + object.reduction = c(reference.reduction, paste0('ref.', bridge.query.reduction)), + bridge.reduction = c(bridge.ref.reduction, bridge.query.reduction), + anchor.type = "Integration", + reference.bridge.stored = TRUE, + verbose = verbose + ) + return(bridge_anchor) +} + + +#' Perform integration on the joint PCA cell embeddings. +#' +#' This is a convenience wrapper function around the following three functions +#' that are often run together when perform integration. +#' #' \code{\link{FindIntegrationAnchors}}, \code{\link{RunPCA}}, +#' \code{\link{IntegrateEmbeddings}}. +#' +#' @inheritParams FindIntegrationAnchors +#' @param new.reduction.name Name of integrated dimensional reduction +#' @param npcs Total Number of PCs to compute and store (50 by default) +#' @param findintegrationanchors.args A named list of additional arguments to +#' \code{\link{FindIntegrationAnchors}} +#' @param verbose Print messages and progress +#' +#' @importFrom rlang exec +#' @return Returns a Seurat object with integrated dimensional reduction +#' @export +#' @concept integration +#' +FastRPCAIntegration <- function( + object.list, + reference = NULL, + anchor.features = 2000, + k.anchor = 20, + dims = 1:30, + scale = TRUE, + normalization.method = c("LogNormalize", "SCT"), + new.reduction.name = 'integrated_dr', + npcs = 50, + findintegrationanchors.args = list(), + verbose = TRUE +) { + npcs <- max(npcs, dims) + my.lapply <- ifelse( + test = verbose && nbrOfWorkers() == 1, + yes = pblapply, + no = future_lapply + ) + reduction <- 'rpca' + if (is.numeric(x = anchor.features)) { + anchor.features <- SelectIntegrationFeatures( + object.list = object.list, + nfeatures = anchor.features, + verbose = FALSE + ) + } + if (normalization.method == 'SCT') { + scale <- FALSE + object.list <- PrepSCTIntegration(object.list = object.list, + anchor.features = anchor.features + ) + } + + if (verbose) { + message('Performing PCA for each object') + } + object.list <- my.lapply(X = object.list, + FUN = function(x) { + if (normalization.method != 'SCT') { + x <- ScaleData(x, features = anchor.features, do.scale = scale, verbose = FALSE) + } + x <- RunPCA(x, features = anchor.features, verbose = FALSE, npcs = npcs) + return(x) + } + ) + fia.allarguments <- c(list( + object.list = object.list, + reference = reference, + anchor.features = anchor.features, + reduction = reduction, + normalization.method = normalization.method, + scale = scale, + k.anchor = k.anchor, + dims = dims, + verbose = verbose + ), findintegrationanchors.args + ) + anchor <- exec("FindIntegrationAnchors",!!!fia.allarguments) + object_merged <- merge(x = object.list[[1]], + y = object.list[2:length(object.list)] + + ) + + anchor.feature <- slot(object = anchor, name = 'anchor.features') + if (normalization.method != 'SCT') { + object_merged <- ScaleData(object = object_merged, + features = anchor.feature, + do.scale = scale, + verbose = FALSE + ) + } + object_merged <- RunPCA(object_merged, + features = anchor.feature, + verbose = FALSE, + npcs = npcs + + ) + + temp <- object_merged[["pca"]] + object_merged <- IntegrateEmbeddings( + anchorset = anchor, + reductions = object_merged[['pca']], + new.reduction.name = new.reduction.name, + verbose = verbose) + object_merged[['pca']] <- temp + VariableFeatures(object = object_merged) <- anchor.feature + return(object_merged) + +} + + +#' Transfer embeddings from sketched cells to the full data +#' +#' @param atom.data Atom data +#' @param atom.cells Atom cells +#' @param orig.data Original data +#' @param embeddings Embeddings of atom cells +#' @param sketch.matrix Sketch matrix +#' +#' @importFrom MASS ginv +#' @importFrom Matrix t +#' +#' @export +#' @concept integration +#' +UnSketchEmbeddings <- function( + atom.data, + atom.cells = NULL, + orig.data, + embeddings, + sketch.matrix = NULL +) { + if(!all(rownames(atom.data) == rownames(orig.data))) { + stop('features in atom.data and orig.data are not identical') + } else { + features = rownames(atom.data) + } + atom.cells <- atom.cells %||% colnames(x = atom.data) + if (inherits(x = orig.data, what = 'DelayedMatrix') ) { + stop("PseudobulkExpression does not support DelayedMatrix objects") + } else if(inherits(x = orig.data, what = 'IterableMatrix')) { + matrix.prod.function <- crossprod_BPCells + } else { + matrix.prod.function <- crossprod + } + sketch.matrix <- sketch.matrix %||% as.sparse(diag(length(features))) + atom.data <- atom.data[, atom.cells] + embeddings <- embeddings[atom.cells,] + exp.mat <- as.matrix(x = t(x = atom.data) %*% sketch.matrix) + sketch.transform <- ginv(X = exp.mat) %*% embeddings + emb <- matrix.prod.function( + x = as.matrix(sketch.matrix %*% sketch.transform), + y = orig.data + ) + emb <- as.matrix(x = emb) + return(emb) +} + +FeatureSketch <- function(features, ratio = 0.8, seed = 123) { + sketch.R <- t(x = CountSketch( + nsketch = round(x = ratio * length(x = features)), + ncells = length(x = features), + seed = seed) + ) + return(sketch.R) +} diff --git a/R/integration5.R b/R/integration5.R new file mode 100644 index 000000000..c0b7034d7 --- /dev/null +++ b/R/integration5.R @@ -0,0 +1,723 @@ +#' @include zzz.R +#' @include generics.R +#' +NULL + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Functions +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#' Harmony Integration +#' +#' @param object An \code{\link[SeuratObject]{Assay5}} object +# @param assay Name of \code{object} in the containing \code{Seurat} object +#' @param orig A \link[SeuratObject:DimReduc]{dimensional reduction} to correct +#' @param features Ignored +#' @param scale.layer Ignored +#' @param new.reduction Name of new integrated dimensional reduction +#' @param layers Ignored +#' @param key Key for Harmony dimensional reduction +#' @param npcs If doing PCA on input matrix, number of PCs to compute +#' @param theta Diversity clustering penalty parameter +#' @param lambda Ridge regression penalty parameter +#' @param sigma Width of soft kmeans clusters +#' @param nclust Number of clusters in model +#' @param tau Protection against overclustering small datasets with large ones +#' @param block.size What proportion of cells to update during clustering +#' @param max.iter.harmony Maximum number of rounds to run Harmony +#' @param max.iter.cluster Maximum number of rounds to run clustering at each round of Harmony +#' @param epsilon.cluster Convergence tolerance for clustering round of Harmony +#' @param epsilon.harmony Convergence tolerance for Harmony +#' @param verbose Whether to print progress messages. TRUE to print, FALSE to suppress +#' @param ... Ignored +#' +#' @return ... +#' +#' @note This function requires the +#' \href{https://cran.r-project.org/package=harmony}{\pkg{harmony}} package +#' to be installed +#' +# @templateVar pkg harmony +# @template note-reqdpkg +#' +#' @examples +#' \dontrun{ +#' # Preprocessing +#' obj <- SeuratData::LoadData("pbmcsca") +#' obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +#' obj <- NormalizeData(obj) +#' obj <- FindVariableFeatures(obj) +#' obj <- ScaleData(obj) +#' obj <- RunPCA(obj) +#' +#' # After preprocessing, we integrate layers with added parameters specific to Harmony: +#' obj <- IntegrateLayers(object = obj, method = HarmonyIntegration, orig.reduction = "pca", +#' new.reduction = 'harmony', verbose = FALSE) +#' +#' # Modifying Parameters +#' # We can also add arguments specific to Harmony such as theta, to give more diverse clusters +#' obj <- IntegrateLayers(object = obj, method = HarmonyIntegration, orig.reduction = "pca", +#' new.reduction = 'harmony', verbose = FALSE, theta = 3) +#' # Integrating SCTransformed data +#' obj <- SCTransform(object = obj) +#' obj <- IntegrateLayers(object = obj, method = HarmonyIntegration, +#' orig.reduction = "pca", new.reduction = 'harmony', +#' assay = "SCT", verbose = FALSE) +#' } +#' +#' +#' @export +#' +#' @concept integration +#' +#' @seealso \code{\link[harmony:HarmonyMatrix]{harmony::HarmonyMatrix}()} +#' +HarmonyIntegration <- function( + object, + orig, + features = NULL, + scale.layer = 'scale.data', + new.reduction = 'harmony', + layers = NULL, + npcs = 50L, + key = 'harmony_', + theta = NULL, + lambda = NULL, + sigma = 0.1, + nclust = NULL, + tau = 0, + block.size = 0.05, + max.iter.harmony = 10L, + max.iter.cluster = 20L, + epsilon.cluster = 1e-05, + epsilon.harmony = 1e-04, + verbose = TRUE, + ... +) { + check_installed( + pkg = "harmony", + reason = "for running integration with Harmony" + ) + if (!inherits(x = object, what = c('StdAssay', 'SCTAssay'))) { + abort(message = "'object' must be a v5 or SCT assay") + } else if (!inherits(x = orig, what = 'DimReduc')) { + abort(message = "'orig' must be a dimensional reduction") + } + # # Run joint PCA + # features <- features %||% Features(x = object, layer = scale.layer) + # pca <- RunPCA( + # object = object, + # assay = assay, + # features = features, + # layer = scale.layer, + # npcs = npcs, + # verbose = verbose + # ) + #create grouping variables + groups <- CreateIntegrationGroups(object, layers = layers, scale.layer = scale.layer) + # Run Harmony + harmony.embed <- harmony::HarmonyMatrix( + data_mat = Embeddings(object = orig), + meta_data = groups, + vars_use = 'group', + do_pca = FALSE, + npcs = 0L, + theta = theta, + lambda = lambda, + sigma = sigma, + nclust = nclust, + tau = tau, + block.size = block.size, + max.iter.harmony = max.iter.harmony, + max.iter.cluster = max.iter.cluster, + epsilon.cluster = epsilon.cluster, + epsilon.harmony = epsilon.harmony, + return_object = FALSE, + verbose = verbose + ) + rownames(x = harmony.embed) <- Cells(x = orig) + # TODO add feature loadings from PCA + dr <- suppressWarnings(expr = CreateDimReducObject( + embeddings = harmony.embed, + key = key, + # assay = assay + assay = DefaultAssay(object = orig) + )) + output.list <- list(dr) + names(output.list) <- c(new.reduction) + return(output.list) +} + +attr(x = HarmonyIntegration, which = 'Seurat.method') <- 'integration' + +#' Seurat-CCA Integration +#' +#' @inheritParams RPCAIntegration +#' @export +#' @concept integration +#' +#' @examples +#' \dontrun{ +#' # Preprocessing +#' obj <- SeuratData::LoadData("pbmcsca") +#' obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +#' obj <- NormalizeData(obj) +#' obj <- FindVariableFeatures(obj) +#' obj <- ScaleData(obj) +#' obj <- RunPCA(obj) +#' +#' # After preprocessing, we integrate layers. +#' obj <- IntegrateLayers(object = obj, method = CCAIntegration, +#' orig.reduction = "pca", new.reduction = "integrated.cca", +#' verbose = FALSE) +#' +#' # Modifying parameters +#' # We can also specify parameters such as `k.anchor` to increase the strength of integration +#' obj <- IntegrateLayers(object = obj, method = CCAIntegration, +#' orig.reduction = "pca", new.reduction = "integrated.cca", +#' k.anchor = 20, verbose = FALSE) +#' +#' # Integrating SCTransformed data +#' obj <- SCTransform(object = obj) +#' obj <- IntegrateLayers(object = obj, method = CCAIntegration, +#' orig.reduction = "pca", new.reduction = "integrated.cca", +#' assay = "SCT", verbose = FALSE) +#' } +#' +CCAIntegration <- function( + object = NULL, + assay = NULL, + layers = NULL, + orig = NULL, + new.reduction = 'integrated.dr', + reference = NULL, + features = NULL, + normalization.method = c("LogNormalize", "SCT"), + dims = 1:30, + k.filter = NA, + scale.layer = 'scale.data', + dims.to.integrate = NULL, + k.weight = 100, + weight.reduction = NULL, + sd.weight = 1, + sample.tree = NULL, + preserve.order = FALSE, + verbose = TRUE, + ... +) { + op <- options(Seurat.object.assay.version = "v3", Seurat.object.assay.calcn = FALSE) + on.exit(expr = options(op), add = TRUE) + normalization.method <- match.arg(arg = normalization.method) + features <- features %||% SelectIntegrationFeatures5(object = object) + assay <- assay %||% 'RNA' + layers <- layers %||% Layers(object, search = 'data') + if (normalization.method == 'SCT') { + #create grouping variables + groups <- CreateIntegrationGroups(object, layers = layers, scale.layer = scale.layer) + object.sct <- CreateSeuratObject(counts = object, assay = 'SCT') + object.sct$split <- groups[,1] + object.list <- SplitObject(object = object.sct,split.by = 'split') + object.list <- PrepSCTIntegration(object.list, anchor.features = features) + } else { + object.list <- list() + for (i in seq_along(along.with = layers)) { + if (inherits(x = object[layers[i]], what = "IterableMatrix")) { + warning("Converting BPCells matrix to dgCMatrix for integration ", + "as on-disk CCA Integration is not currently supported", call. = FALSE, immediate. = TRUE) + counts <- as(object = object[layers[i]][features, ], + Class = "dgCMatrix") + } + else { + counts <- object[layers[i]][features, ] + } + object.list[[i]] <- CreateSeuratObject(counts = counts) + if (inherits(x = object[scale.layer], what = "IterableMatrix")) { + scale.data.layer <- as.matrix(object[scale.layer][features, + Cells(object.list[[i]])]) + object.list[[i]][["RNA"]]$scale.data <- scale.data.layer + } + else { + object.list[[i]][["RNA"]]$scale.data <- object[scale.layer][features, + Cells(object.list[[i]])] + } + object.list[[i]][['RNA']]$counts <- NULL + } + } + + anchor <- FindIntegrationAnchors(object.list = object.list, + anchor.features = features, + scale = FALSE, + reduction = 'cca', + normalization.method = normalization.method, + dims = dims, + k.filter = k.filter, + reference = reference, + verbose = verbose, + ... + ) + suppressWarnings({ + anchor@object.list <- lapply(anchor@object.list, function(x) { + x <- DietSeurat(x, features = features[1:2]) + return(x) + }) + }, classes = "dimWarning") + object_merged <- IntegrateEmbeddings(anchorset = anchor, + reductions = orig, + new.reduction.name = new.reduction, + dims.to.integrate = dims.to.integrate, + k.weight = k.weight, + weight.reduction = weight.reduction, + sd.weight = sd.weight, + sample.tree = sample.tree, + preserve.order = preserve.order, + verbose = verbose + ) + output.list <- list(object_merged[[new.reduction]]) + names(output.list) <- c(new.reduction) + return(output.list) +} + +attr(x = CCAIntegration, which = 'Seurat.method') <- 'integration' + +#' Seurat-RPCA Integration +#' +#' @param object A \code{Seurat} object +#' @param assay Name of \code{Assay} in the \code{Seurat} object +#' @param layers Names of layers in \code{assay} +#' @param orig A \link[SeuratObject:DimReduc]{dimensional reduction} to correct +#' @param new.reduction Name of new integrated dimensional reduction +#' @param reference A reference \code{Seurat} object +#' @param features A vector of features to use for integration +#' @param normalization.method Name of normalization method used: LogNormalize +#' or SCT +#' @param dims Dimensions of dimensional reduction to use for integration +#' @param k.filter Number of anchors to filter +#' @param scale.layer Name of scaled layer in \code{Assay} +#' @param verbose Print progress +#' @param ... Additional arguments passed to \code{FindIntegrationAnchors} +#' +#' @examples +#' \dontrun{ +#' # Preprocessing +#' obj <- SeuratData::LoadData("pbmcsca") +#' obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +#' obj <- NormalizeData(obj) +#' obj <- FindVariableFeatures(obj) +#' obj <- ScaleData(obj) +#' obj <- RunPCA(obj) +#' +#' # After preprocessing, we run integration +#' obj <- IntegrateLayers(object = obj, method = RPCAIntegration, +#' orig.reduction = "pca", new.reduction = 'integrated.rpca', +#' verbose = FALSE) +#' +#' # Reference-based Integration +#' # Here, we use the first layer as a reference for integraion +#' # Thus, we only identify anchors between the reference and the rest of the datasets, +#' # saving computational resources +#' obj <- IntegrateLayers(object = obj, method = RPCAIntegration, +#' orig.reduction = "pca", new.reduction = 'integrated.rpca', +#' reference = 1, verbose = FALSE) +#' +#' # Modifying parameters +#' # We can also specify parameters such as `k.anchor` to increase the strength of +#' # integration +#' obj <- IntegrateLayers(object = obj, method = RPCAIntegration, +#' orig.reduction = "pca", new.reduction = 'integrated.rpca', +#' k.anchor = 20, verbose = FALSE) +#' +#' # Integrating SCTransformed data +#' obj <- SCTransform(object = obj) +#' obj <- IntegrateLayers(object = obj, method = RPCAIntegration, +#' orig.reduction = "pca", new.reduction = 'integrated.rpca', +#' assay = "SCT", verbose = FALSE) +#' } +#' + +#' @inheritParams FindIntegrationAnchors +#' @inheritParams IntegrateEmbeddings +#' @param ... Arguments passed on to \code{FindIntegrationAnchors} +#' @export +#' +#' @concept utilities +#' +RPCAIntegration <- function( + object = NULL, + assay = NULL, + layers = NULL, + orig = NULL, + new.reduction = 'integrated.dr', + reference = NULL, + features = NULL, + normalization.method = c("LogNormalize", "SCT"), + dims = 1:30, + k.filter = NA, + scale.layer = 'scale.data', + dims.to.integrate = NULL, + k.weight = 100, + weight.reduction = NULL, + sd.weight = 1, + sample.tree = NULL, + preserve.order = FALSE, + verbose = TRUE, + ... +) { + op <- options(Seurat.object.assay.version = "v3", Seurat.object.assay.calcn = FALSE) + on.exit(expr = options(op), add = TRUE) + normalization.method <- match.arg(arg = normalization.method) + features <- features %||% SelectIntegrationFeatures5(object = object) + assay <- assay %||% 'RNA' + layers <- layers %||% Layers(object = object, search = 'data') + #check that there enough cells present + ncells <- sapply(X = layers, FUN = function(x) {ncell <- dim(object[x])[2] + return(ncell) }) + if (min(ncells) < max(dims)) { + abort(message = "At least one layer has fewer cells than dimensions specified, please lower 'dims' accordingly.") + } + if (normalization.method == 'SCT') { + #create grouping variables + groups <- CreateIntegrationGroups(object, layers = layers, scale.layer = scale.layer) + object.sct <- CreateSeuratObject(counts = object, assay = 'SCT') + object.sct$split <- groups[,1] + object.list <- SplitObject(object = object.sct, split.by = 'split') + object.list <- PrepSCTIntegration(object.list = object.list, anchor.features = features) + object.list <- lapply(X = object.list, FUN = function(x) { + x <- RunPCA(object = x, features = features, verbose = FALSE, npcs = max(dims)) + return(x) + } + ) + } else { + object.list <- list() + for (i in seq_along(along.with = layers)) { + object.list[[i]] <- suppressMessages(suppressWarnings( + CreateSeuratObject(counts = NULL, data = object[layers[i]][features,]) + )) + VariableFeatures(object = object.list[[i]]) <- features + object.list[[i]] <- suppressWarnings(ScaleData(object = object.list[[i]], verbose = FALSE)) + object.list[[i]] <- RunPCA(object = object.list[[i]], verbose = FALSE, npcs=max(dims)) + suppressWarnings(object.list[[i]][['RNA']]$counts <- NULL) + } + } + anchor <- FindIntegrationAnchors(object.list = object.list, + anchor.features = features, + scale = FALSE, + reduction = 'rpca', + normalization.method = normalization.method, + dims = dims, + k.filter = k.filter, + reference = reference, + verbose = verbose, + ... + ) + slot(object = anchor, name = "object.list") <- lapply( + X = slot( + object = anchor, + name = "object.list"), + FUN = function(x) { + suppressWarnings(expr = x <- DietSeurat(x, features = features[1:2])) + return(x) + }) + object_merged <- IntegrateEmbeddings(anchorset = anchor, + reductions = orig, + new.reduction.name = new.reduction, + dims.to.integrate = dims.to.integrate, + k.weight = k.weight, + weight.reduction = weight.reduction, + sd.weight = sd.weight, + sample.tree = sample.tree, + preserve.order = preserve.order, + verbose = verbose + ) + + output.list <- list(object_merged[[new.reduction]]) + names(output.list) <- c(new.reduction) + return(output.list) +} + +attr(x = RPCAIntegration, which = 'Seurat.method') <- 'integration' + +#' Seurat-Joint PCA Integration +#' +#' @inheritParams RPCAIntegration +#' @inheritParams FindIntegrationAnchors +#' @inheritParams IntegrateEmbeddings +#' @param ... Arguments passed on to \code{FindIntegrationAnchors} +#' @export +#' @concept integration +#' +JointPCAIntegration <- function( + object = NULL, + assay = NULL, + layers = NULL, + orig = NULL, + new.reduction = 'integrated.dr', + reference = NULL, + features = NULL, + normalization.method = c("LogNormalize", "SCT"), + dims = 1:30, + k.anchor = 20, + scale.layer = 'scale.data', + dims.to.integrate = NULL, + k.weight = 100, + weight.reduction = NULL, + sd.weight = 1, + sample.tree = NULL, + preserve.order = FALSE, + verbose = TRUE, + ... +) { + op <- options(Seurat.object.assay.version = "v3", Seurat.object.assay.calcn = FALSE) + on.exit(expr = options(op), add = TRUE) + normalization.method <- match.arg(arg = normalization.method) + features <- features %||% SelectIntegrationFeatures5(object = object) + features.diet <- features[1:2] + assay <- assay %||% DefaultAssay(object) + layers <- layers %||% Layers(object, search = 'data') + + if (normalization.method == 'SCT') { + #create grouping variables + groups <- CreateIntegrationGroups(object, layers = layers, scale.layer = scale.layer) + object.sct <- CreateSeuratObject(counts = object, assay = 'SCT') + object.sct <- DietSeurat(object = object.sct, features = features.diet) + object.sct[['joint.pca']] <- CreateDimReducObject( + embeddings = Embeddings(object = orig), + assay = 'SCT', + loadings = Loadings(orig), + key = 'J_' + ) + object.sct$split <- groups[,1] + object.list <- SplitObject(object = object.sct,split.by = 'split') + object.list <- PrepSCTIntegration(object.list, anchor.features = features.diet) + object.list <- lapply(object.list, function(x) { + x[['SCT']]@SCTModel.list <- list() + return(x) + }) + } else { + object.list <- list() + for (i in seq_along(along.with = layers)) { + object.list[[i]] <- CreateSeuratObject(counts = object[layers[i]][features.diet, ] ) + object.list[[i]][['RNA']]$counts <- NULL + object.list[[i]][['joint.pca']] <- CreateDimReducObject( + embeddings = Embeddings(object = orig)[Cells(object.list[[i]]),], + assay = 'RNA', + loadings = Loadings(orig), + key = 'J_' + ) + } + } + + anchor <- FindIntegrationAnchors(object.list = object.list, + anchor.features = features.diet, + scale = FALSE, + reduction = 'jpca', + normalization.method = normalization.method, + dims = dims, + k.anchor = k.anchor, + k.filter = NA, + reference = reference, + verbose = verbose, + ... + ) + object_merged <- IntegrateEmbeddings(anchorset = anchor, + reductions = orig, + new.reduction.name = new.reduction, + dims.to.integrate = dims.to.integrate, + k.weight = k.weight, + weight.reduction = weight.reduction, + sd.weight = sd.weight, + sample.tree = sample.tree, + preserve.order = preserve.order, + verbose = verbose + ) + output.list <- list(object_merged[[new.reduction]]) + names(output.list) <- c(new.reduction) + return(output.list) +} + +attr(x = JointPCAIntegration, which = 'Seurat.method') <- 'integration' + +#' Integrate Layers +#' +#' @param object A \code{\link[SeuratObject]{Seurat}} object +#' @param method Integration method function +#' @param orig.reduction Name of dimensional reduction for correction +#' @param assay Name of assay for integration +#' @param features A vector of features to use for integration +#' @param layers Names of normalized layers in \code{assay} +#' @param scale.layer Name(s) of scaled layer(s) in \code{assay} +#' @param ... Arguments passed on to \code{method} +#' +#' @return \code{object} with integration data added to it +#' +#' @section Integration Method Functions: +#' The following integration method functions are available: +#' \Sexpr[stage=render,results=rd]{Seurat:::.rd_methods("integration")} +#' +#' @export +#' +#' @concept integration +#' +#' @seealso \link[Seurat:writing-integration]{Writing integration method functions} +#' +IntegrateLayers <- function( + object, + method, + orig.reduction = 'pca', + assay = NULL, + features = NULL, + layers = NULL, + scale.layer = 'scale.data', + ... +) { + # Get the integration method + if (is_quosure(x = method)) { + method <- eval( + expr = quo_get_expr(quo = method), + envir = quo_get_env(quo = method) + ) + } + if (is.character(x = method)) { + method <- get(x = method) + } + if (!is.function(x = method)) { + abort(message = "'method' must be a function for integrating layers") + } + # Check our assay + assay <- assay %||% DefaultAssay(object = object) + if (inherits(x = object[[assay]], what = 'SCTAssay')) { + layers <- 'data' + scale.layer <- 'scale.data' + features <- features %||% SelectSCTIntegrationFeatures( + object = object, + assay = assay + ) + } else if (inherits(x = object[[assay]], what = 'StdAssay')) { + layers <- Layers(object = object, assay = assay, search = layers %||% 'data') + scale.layer <- Layers(object = object, search = scale.layer) + features <- features %||% VariableFeatures( + object = object, + assay = assay, + nfeatures = 2000L + ) + } else { + abort(message = "'assay' must be a v5 or SCT assay") + } + if (!is.null(scale.layer)) { + features <- intersect( + x = features, + y = Features(x = object, assay = assay, layer = scale.layer) + ) + } + if (!length(x = features)) { + abort(message = "None of the features provided are found in this assay") + } + if (!is.null(orig.reduction)) { + # Check our dimensional reduction + orig.reduction <- orig.reduction %||% DefaultDimReduc(object = object, assay = assay) + if (!orig.reduction %in% Reductions(object = object)) { + abort(message = paste(sQuote(x = orig.reduction), 'is not a dimensional reduction')) + } + obj.orig <- object[[orig.reduction]] + if (is.null(x = DefaultAssay(object = obj.orig))) { + DefaultAssay(object = obj.orig) <- assay + } + } + # Run the integration method + value <- method( + object = object[[assay]], + assay = assay, + orig = obj.orig, + layers = layers, + scale.layer = scale.layer, + features = features, + ... + ) + for (i in names(x = value)) { + object[[i]] <- value[[i]] + } + return(object) +} + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Methods for Seurat-defined generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Methods for Seurat-defined generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Internal +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Creates data.frame with cell group assignments for integration +# uses SCT models if SCTAssay and layers otherwise +CreateIntegrationGroups <- function(object, layers, scale.layer) { + groups <- if (inherits(x = object, what = 'SCTAssay')) { + df <- SeuratObject::EmptyDF(n = ncol(x = object)) + row.names(x = df) <- colnames(x = object) + for (model in levels(x = object)) { + cc <- Cells(x = object, layer = model) + df[cc, "group"] <- model + } + df + } else if (length(x = layers) > 1L) { + cmap <- slot(object = object, name = 'cells')[, layers] + as.data.frame(x = labels( + object = cmap, + values = Cells(x = object, layer = scale.layer) + )) + } + names(x = groups) <- 'group' + return(groups) +} + +#' Writing Integration Method Functions +#' +#' Integration method functions can be written by anyone to implement any +#' integration method in Seurat. These methods should expect to take a +#' \link[SeuratObject:Assay5]{v5 assay} as input and return a named list of +#' objects that can be added back to a \code{Seurat} object (eg. a +#' \link[SeuratObject:DimReduc]{dimensional reduction} or cell-level meta data) +#' +#' @section Provided Parameters: +#' Every integration method function should expect the following arguments: +#' \itemize{ +#' \item \dQuote{\code{object}}: an \code{\link[SeuratObject]{Assay5}} object +# \item \dQuote{\code{assay}}: name of \code{object} in the original +# \code{\link[SeuratObject]{Seurat}} object +#' \item \dQuote{\code{orig}}: \link[SeuratObject:DimReduc]{dimensional +#' reduction} to correct +#' \item \dQuote{\code{layers}}: names of normalized layers in \code{object} +#' \item \dQuote{\code{scale.layer}}: name(s) of scaled layer(s) in +#' \code{object} +#' \item \dQuote{\code{features}}: a vector of features for integration +#' \item \dQuote{\code{groups}}: a one-column data frame with the groups for +#' each cell in \code{object}; the column name will be \dQuote{group} +#' } +#' +#' @section Method Discovery: +#' The documentation for \code{\link{IntegrateLayers}()} will automatically +#' link to integration method functions provided by packages in the +#' \code{\link[base]{search}()} space. To make an integration method function +#' discoverable by the documentation, simply add an attribute named +#' \dQuote{\code{Seurat.method}} to the function with a value of +#' \dQuote{\code{integration}} +#' \preformatted{ +#' attr(MyIntegrationFunction, which = "Seurat.method") <- "integration" +#' } +#' +#' @keywords internal +#' +#' @concept integration +#' +#' @name writing-integration +#' @rdname writing-integration +#' +#' @seealso \code{\link{IntegrateLayers}()} +#' +NULL diff --git a/R/mixscape.R b/R/mixscape.R index a003ab7ba..230419320 100644 --- a/R/mixscape.R +++ b/R/mixscape.R @@ -127,6 +127,9 @@ CalcPerturbSig <- function( #' @param balanced Option to display pathway enrichments for both negative and #' positive DE genes.If false, only positive DE gene will be displayed. #' @param max.genes Maximum number of genes to use as input to enrichR. +#' @param logfc.threshold Limit testing to genes which show, on average, at least +#' X-fold difference (log-scale) between the two groups of cells. Default is 0.25. +#' Increasing logfc.threshold speeds up the function, but can miss weaker signals. #' @param p.val.cutoff Cutoff to select DE genes. #' @param cols A list of colors to use for barplots. #' @param enrich.database Database to use from enrichR. @@ -209,8 +212,9 @@ DEenrichRPlot <- function( } if (isTRUE(x = balanced)) { - neg.markers <- all.markers[all.markers[, 2] < logfc.threshold & all.markers[, 1] < p.val.cutoff, , drop = FALSE] + neg.markers <- all.markers[all.markers[, 2] < -logfc.threshold & all.markers[, 1] < p.val.cutoff, , drop = FALSE] neg.markers.list <- rownames(x = neg.markers)[1:min(max.genes, nrow(x = neg.markers))] + Sys.sleep(1) neg.er <- enrichR::enrichr(genes = neg.markers.list, databases = enrich.database) neg.er <- do.call(what = cbind, args = neg.er) neg.er$log10pval <- -log10(x = neg.er[, paste(enrich.database, sep = ".", "P.value")]) @@ -366,7 +370,9 @@ MixscapeLDA <- function( #' @param nt.label Name of non-targeting cell class. #' @param npcs Number of principle components to use. #' @param verbose Print progress bar. -#' @inheritParams FindMarkers +#' @param logfc.threshold Limit testing to genes which show, on average, at least +#' X-fold difference (log-scale) between the two groups of cells. Default is 0.25. +#' Increasing logfc.threshold speeds up the function, but can miss weaker signals. #' @return Returns a list of the first 10 PCs from each projection. #' #' @export @@ -586,9 +592,8 @@ RunLDA.Seurat <- function( ... ) { assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) reduction.data <- RunLDA( - object = assay.data, + object = object[[assay]], assay = assay, labels = labels, features = features, @@ -628,7 +633,6 @@ RunLDA.Seurat <- function( #' Function to identify perturbed and non-perturbed gRNA expressing cells that #' accounts for multiple treatments/conditions/chemical perturbations. #' -#' @inheritParams FindMarkers #' @importFrom ggplot2 geom_density position_dodge #' @param object An object of class Seurat. #' @param assay Assay to use for mixscape classification. @@ -644,6 +648,10 @@ RunLDA.Seurat <- function( #' all are assigned NP. #' @param de.assay Assay to use when performing differential expression analysis. #' Usually RNA. +#' @param logfc.threshold Limit testing to genes which show, on average, +#' at least X-fold difference (log-scale) between the two groups of cells. +#' Default is 0.25 Increasing logfc.threshold speeds up the function, but can miss +#' weaker signals. #' @param iter.num Number of normalmixEM iterations to run if convergence does #' not occur. #' @param verbose Display messages @@ -863,6 +871,9 @@ RunMixscape <- function( #' @param max.cells.group Number of cells per identity to plot. #' @param max.genes Total number of DE genes to plot. #' @param balanced Plot an equal number of genes with both groups of cells. +#' @param logfc.threshold Limit testing to genes which show, on average, at least +#' X-fold difference (log-scale) between the two groups of cells. Default is 0.25. +#' Increasing logfc.threshold speeds up the function, but can miss weaker signals. #' @param order.by.prob Order cells on heatmap based on their mixscape knockout #' probability from highest to lowest score. #' @param group.by (Deprecated) Option to split densities based on mixscape @@ -1286,9 +1297,9 @@ ProjectVec <- function(v1, v2) { # @param ident.2 Non-targetting class or cells # @param labels metadata column with target gene classification. # @param de.assay Name of Assay DE is performed on. -# @param test.use Denotes which test to use. See all available tests on +# @param test.use Denotes which test to use. See all available tests on # FindMarkers documentation. -# @param pval.cut.off P-value cut-off for selection of significantly DE genes. +# @param pval.cutoff P-value cut-off for selection of significantly DE genes. # @param logfc.threshold Limit testing to genes which show, on average, at # least X-fold difference (log-scale) between the two groups of cells. Default # is 0.25 Increasing logfc.threshold speeds up the function, but can miss @@ -1302,7 +1313,7 @@ TopDEGenesMixscape <- function( ident.2 = NULL, labels = 'gene', de.assay = "RNA", - test.use = "LR", + test.use = "wilcox", pval.cutoff = 5e-2, logfc.threshold = 0.25, verbose = TRUE @@ -1321,7 +1332,8 @@ TopDEGenesMixscape <- function( assay = de.assay, test.use = test.use, logfc.threshold = logfc.threshold, - verbose = verbose + verbose = verbose, + min.pct = 0.1 ) de.genes <- de.genes[de.genes$p_val_adj < pval.cutoff, ] }, diff --git a/R/objects.R b/R/objects.R index f1dc39d56..7553a5d84 100644 --- a/R/objects.R +++ b/R/objects.R @@ -30,6 +30,7 @@ setOldClass(Classes = 'package_version') #' anchor score, and the index of the original dataset in the object.list for cell1 and cell2 of #' the anchor. #' @slot offsets The offsets used to enable cell look up in downstream functions +#' @slot weight.reduction The weight dimensional reduction used to calculate weight matrix #' @slot anchor.features The features used when performing anchor finding. #' @slot neighbors List containing Neighbor objects for reuse later (e.g. mapping) #' @slot command Store log of parameters that were used @@ -49,6 +50,7 @@ AnchorSet <- setClass( query.cells = "vector", anchors = "ANY", offsets = "ANY", + weight.reduction = "DimReduc", anchor.features = "ANY", neighbors = "list", command = "ANY" @@ -114,6 +116,31 @@ ModalityWeights <- setClass( ) ) + + + +#' The BridgeReferenceSet Class +#' The BridgeReferenceSet is an output from PrepareBridgeReference +#' @slot bridge The multi-omic object +#' @slot reference The Reference object only containing bridge representation assay +#' @slot params A list of parameters used in the PrepareBridgeReference +#' @slot command Store log of parameters that were used +#' +#' @name BridgeReferenceSet-class +#' @rdname BridgeReferenceSet-class +#' @concept objects +#' @exportClass BridgeReferenceSet +#' +BridgeReferenceSet <- setClass( + Class = "BridgeReferenceSet", + slots = list( + bridge = "ANY", + reference = "ANY", + params = "list", + command = "ANY" + ) +) + #' The IntegrationData Class #' #' The IntegrationData object is an intermediate storage container used internally throughout the @@ -200,9 +227,11 @@ SCTModel <- setClass( #' @concept objects #' #' @examples +#' \dontrun{ #' # SCTAssay objects are generated from SCTransform #' pbmc_small <- SCTransform(pbmc_small) #' pbmc_small[["SCT"]] +#' } #' SCTAssay <- setClass( Class = 'SCTAssay', @@ -299,6 +328,32 @@ VisiumV1 <- setClass( ) ) +#' The VisiumV2 class +#' +#' The VisiumV2 class represents spatial information from the 10X Genomics +#' Visium HD platform - it can also accomodate data from the standard +#' Visium platform +#' +#' @slot image A three-dimensional array with PNG image data, see +#' \code{\link[png]{readPNG}} for more details +#' @slot scale.factors An object of class \code{\link{scalefactors}}; see +#' \code{\link{scalefactors}} for more information +#' +#' @importClassesFrom SeuratObject FOV +#' @name VisiumV2-class +#' +#' @concept objects +#' @concept spatial +#' @exportClass VisiumV2 +VisiumV2 <- setClass( + Class = "VisiumV2", + contains = "FOV", + slots = list( + image = "array", + scale.factors = "scalefactors" + ) +) + setClass(Class = 'SliceImage', contains = 'VisiumV1') #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -319,6 +374,8 @@ setClass(Class = 'SliceImage', contains = 'VisiumV1') #' CellsByImage(object = object, images = "slice1") #' } #' +#' @keywords internal +#' CellsByImage <- function(object, images = NULL, unlist = FALSE) { images <- images %||% Images(object = object) cells <- sapply( @@ -441,86 +498,163 @@ CreateSCTAssayObject <- function( #' Slim down a Seurat object #' -#' Keep only certain aspects of the Seurat object. Can be useful in functions that utilize merge as -#' it reduces the amount of data in the merge. +#' Keep only certain aspects of the Seurat object. Can be useful in functions +#' that utilize merge as it reduces the amount of data in the merge #' -#' @param object Seurat object -#' @param counts Preserve the count matrices for the assays specified -#' @param data Preserve the data slot for the assays specified -#' @param scale.data Preserve the scale.data slot for the assays specified +#' @param object A \code{\link[SeuratObject]{Seurat}} object +#' @param layers A vector or named list of layers to keep #' @param features Only keep a subset of features, defaults to all features #' @param assays Only keep a subset of assays specified here -#' @param dimreducs Only keep a subset of DimReducs specified here (if NULL, -#' remove all DimReducs) -#' @param graphs Only keep a subset of Graphs specified here (if NULL, remove -#' all Graphs) +#' @param dimreducs Only keep a subset of DimReducs specified here (if +#' \code{NULL}, remove all DimReducs) +#' @param graphs Only keep a subset of Graphs specified here (if \code{NULL}, +#' remove all Graphs) #' @param misc Preserve the \code{misc} slot; default is \code{TRUE} +#' @param counts Preserve the count matrices for the assays specified +#' @param data Preserve the data matrices for the assays specified +#' @param scale.data Preserve the scale data matrices for the assays specified +#' @param ... Ignored +#' +#' @return \code{object} with only the sub-object specified retained +#' +#' @importFrom SeuratObject .FilterObjects .PropagateList Assays +#' Layers UpdateSlots #' #' @export +#' #' @concept objects #' DietSeurat <- function( object, - counts = TRUE, - data = TRUE, - scale.data = FALSE, + layers = NULL, features = NULL, assays = NULL, dimreducs = NULL, graphs = NULL, - misc = TRUE + misc = TRUE, + counts = deprecated(), + data = deprecated(), + scale.data = deprecated(), + ... ) { + CheckDots(...) + dep.args <- c(counts = counts, data = data, scale.data = scale.data) + for (lyr in names(x = dep.args)) { + if (is_present(arg = dep.args[[lyr]])) { + if (is.null(x = layers)) { + layers <- unique(x = unlist(x = lapply( + X = Assays(object = object), + FUN = function(x) { + return(Layers(object = object[[x]])) + } + ))) + } + deprecate_soft( + when = '5.0.0', + what = paste0('DietSeurat(', lyr, ' = )'), + with = 'DietSeurat(layers = )' + ) + layers <- if (isTRUE(x = dep.args[[lyr]])) { + c(layers, lyr) + } else { + Filter(f = function(x) x != lyr, x = layers) + } + } + } object <- UpdateSlots(object = object) - assays <- assays %||% FilterObjects(object = object, classes.keep = "Assay") - assays <- assays[assays %in% FilterObjects(object = object, classes.keep = 'Assay')] - if (length(x = assays) == 0) { - stop("No assays provided were found in the Seurat object") + assays <- assays %||% Assays(object = object) + assays <- intersect(x = assays, y = Assays(object = object)) + if (!length(x = assays)) { + abort(message = "No assays provided were found in the Seurat object") } if (!DefaultAssay(object = object) %in% assays) { - stop("The default assay is slated to be removed, please change the default assay") + abort( + message = "The default assay is slated to be removed, please change the default assay" + ) } - if (!counts && !data) { - stop("Either one or both of 'counts' and 'data' must be kept") + layers <- layers %||% assays + layers <- .PropagateList(x = layers, names = assays) + for (assay in names(x = layers)) { + layers[[assay]] <- tryCatch( + expr = Layers(object = object[[assay]], search = layers[[assay]]), + error = function(...) { + return(character(length = 0L)) + } + ) } - for (assay in FilterObjects(object = object, classes.keep = 'Assay')) { + layers <- Filter(f = length, x = layers) + if (!length(x = layers)) { + abort(message = "None of the requested layers found") + } + for (assay in Assays(object = object)) { if (!(assay %in% assays)) { object[[assay]] <- NULL - } else { - if (!is.null(x = features)) { - features.assay <- intersect(x = features, y = rownames(x = object[[assay]])) - if (length(x = features.assay) == 0) { - if (assay == DefaultAssay(object = object)) { - stop("The default assay is slated to be removed, please change the default assay") - } else { - warning("No features found in assay '", assay, "', removing...") - object[[assay]] <- NULL - } - } else { - object[[assay]] <- subset(x = object[[assay]], features = features.assay) - } - } - if (!counts) { - slot(object = object[[assay]], name = 'counts') <- new(Class = 'matrix') + next + } + layers.rm <- setdiff( + x = Layers(object = object[[assay]]), + y = layers[[assay]] + ) + if (length(x = layers.rm)) { + if (inherits(x = object[[assay]], what = 'Assay') && all(c('counts', 'data') %in% layers.rm)) { + abort(message = "Cannot remove both 'counts' and 'data' from v3 Assays") } - if (!data) { - stop('data = FALSE currently not supported') + for (lyr in layers.rm) { + suppressWarnings(object <- tryCatch(expr = { + object[[assay]][[lyr]] <- NULL + object + }, error = function(e) { + if (lyr == "data"){ + object[[assay]][[lyr]] <- sparseMatrix(i = 1, j = 1, x = 1, + dims = dim(object[[assay]][[lyr]]), + dimnames = dimnames(object[[assay]][[lyr]])) + } else{ + slot(object = object[[assay]], name = lyr) <- new(Class = "dgCMatrix") + } + message("Converting layer ", lyr, " in assay ", + assay, " to empty dgCMatrix") + object + })) } - if (!scale.data) { - slot(object = object[[assay]], name = 'scale.data') <- new(Class = 'matrix') + } + if (!is.null(x = features)) { + features.assay <- intersect( + x = features, + y = rownames(x = object[[assay]]) + ) + if (!length(x = features.assay)) { + warn(message = paste0( + 'No features found in assay ', + sQuote(x = assay), + ', removing...' + )) + object[[assay]] <- NULL + next } + suppressWarnings(object[[assay]] <- subset(x = object[[assay]], features = features.assay)) } } # remove misc when desired if (!isTRUE(x = misc)) { slot(object = object, name = "misc") <- list() } - # remove unspecified DimReducs and Graphs - all.objects <- FilterObjects(object = object, classes.keep = c('DimReduc', 'Graph')) + all.objects <- .FilterObjects( + object = object, + classes.keep = c('DimReduc', 'Graph') + ) objects.to.remove <- all.objects[!all.objects %in% c(dimreducs, graphs)] for (ob in objects.to.remove) { object[[ob]] <- NULL } + cells.keep <- list() + for (assay in Assays(object = object)) { + cells.keep[[assay]] <- colnames(x = object[[assay]] ) + } + cells.keep <- intersect(colnames(x = object), unlist(cells.keep)) + if (length(cells.keep) <- ncol(x = object)) { + object <- subset(object, cells = cells.keep) + } return(object) } @@ -1180,6 +1314,7 @@ as.Seurat.SingleCellExperiment <- function( #' @concept objects #' @export #' @method as.SingleCellExperiment Seurat +#' @importFrom SeuratObject .FilterObjects #' as.SingleCellExperiment.Seurat <- function(x, assay = NULL, ...) { CheckDots(...) @@ -1242,7 +1377,7 @@ as.SingleCellExperiment.Seurat <- function(x, assay = NULL, ...) { ) } } - for (dr in FilterObjects(object = x, classes.keep = "DimReduc")) { + for (dr in .FilterObjects(object = x, classes.keep = "DimReduc")) { assay.used <- DefaultAssay(object = x[[dr]]) swap.exp <- assay.used %in% SingleCellExperiment::altExpNames(x = sce) & assay.used != orig.exp.name if (swap.exp) { @@ -1301,6 +1436,15 @@ as.sparse.H5Group <- function(x, ...) { )) } + +#' @method as.sparse IterableMatrix +#' @export +#' +as.sparse.IterableMatrix <- function(x, ...) { + return(as(object = x, Class = 'dgCMatrix')) +} + + #' Get Cell Names #' #' @inheritParams SeuratObject::Cells @@ -1314,6 +1458,17 @@ Cells.SCTModel <- function(x, ...) { return(rownames(x = slot(object = x, name = "cell.attributes"))) } +#' @method Cells SCTAssay +#' @export +#' +Cells.SCTAssay <- function(x, layer = NA, ...) { + layer <- layer %||% levels(x = x)[1L] + if (rlang::is_na(x = layer)) { + return(colnames(x = x)) + } + return(Cells(x = components(object = x, model = layer))) +} + #' @rdname Cells #' @concept objects #' @concept spatial @@ -1345,6 +1500,30 @@ Cells.VisiumV1 <- function(x, ...) { return(rownames(x = GetTissueCoordinates(object = x, scale = NULL))) } +#' @importFrom SeuratObject DefaultLayer Layers +#' +#' @method Features SCTAssay +#' @export +#' +Features.SCTAssay <- function(x, layer = NA, ...) { + layer <- layer %||% DefaultLayer(object = x) + if (rlang::is_na(x = layer)) { + return(rownames(x = x)) + } + layer <- rlang::arg_match( + arg = layer, values = c(Layers(object = x), levels(x = x))) + if (layer %in% levels(x = x)) { + return(Features(x = components(object = x, model = layer))) + } + return(NextMethod()) +} + +#' @method Features SCTModel +#' @export +#' +Features.SCTModel <- function(x, ...) { + return(rownames(x = SCTResults(object = x, slot = 'feature.attributes'))) +} #' @param assay Assay to get #' @@ -1360,7 +1539,8 @@ Cells.VisiumV1 <- function(x, ...) { GetAssay.Seurat <- function(object, assay = NULL, ...) { CheckDots(...) assay <- assay %||% DefaultAssay(object = object) - object.assays <- FilterObjects(object = object, classes.keep = 'Assay') + object.assays <- FilterObjects( + object = object, classes.keep = c('Assay', 'Assay5')) if (!assay %in% object.assays) { stop(paste0( assay, @@ -1371,7 +1551,6 @@ GetAssay.Seurat <- function(object, assay = NULL, ...) { return(slot(object = object, name = 'assays')[[assay]]) } - #' Get Image Data #' #' @inheritParams SeuratObject::GetImage @@ -1424,7 +1603,9 @@ GetImage.VisiumV1 <- function( ... ) { mode <- match.arg(arg = mode) + image <- slot(object = object, name = 'image') + image <- switch( EXPR = mode, 'grob' = rasterGrob( @@ -1434,11 +1615,9 @@ GetImage.VisiumV1 <- function( ), 'raster' = as.raster(x = image), 'plotly' = list( - source = raster2uri(r = GetImage(object = object, mode = 'raster')), + source = raster2uri(GetImage(object, mode = 'raster')), xref = 'x', yref = 'y', - # x = -7, - # y = -7, sizex = ncol(x = object), sizey = nrow(x = object), sizing = 'stretch', @@ -1451,6 +1630,8 @@ GetImage.VisiumV1 <- function( return(image) } +GetImage.VisiumV2 <- GetImage.VisiumV1 + #' Get Tissue Coordinates #' #' @inheritParams SeuratObject::GetTissueCoordinates @@ -1505,8 +1686,10 @@ GetTissueCoordinates.VisiumV1 <- function( ) { cols <- cols %||% colnames(x = slot(object = object, name = 'coordinates')) if (!is.null(x = scale)) { - coordinates <- slot(object = object, name = 'coordinates')[, c('imagerow', 'imagecol')] - scale <- match.arg(arg = scale, choices = c('spot', 'fiducial', 'hires', 'lowres')) + coordinates <- slot( + object = object, name = 'coordinates')[, c('imagerow', 'imagecol')] + scale <- match.arg( + arg = scale, choices = c('spot', 'fiducial', 'hires', 'lowres')) scale.use <- ScaleFactors(object = object)[[scale]] coordinates <- coordinates * scale.use } else { @@ -1515,32 +1698,64 @@ GetTissueCoordinates.VisiumV1 <- function( return(coordinates) } + +#' @rdname GetTissueCoordinates +#' @method GetTissueCoordinates VisiumV2 +#' @export +#' +GetTissueCoordinates.VisiumV2 <- function( + object, + scale = NULL, + ... +) { + # make call to GetTissueCoordiantes.FOV + coordinates <- NextMethod(object, ...) + # do some cleanup of the resulting data.frame to make it play nice + # with `SpatialPlot` - namely set rownames and re-order the columns so + # that the actual position values appear first + rownames(coordinates) <- coordinates[["cell"]] + coordinates <- coordinates[, c("x", "y", "cell")] + + if (!is.null(scale)) { + # scale the coordinates by the specified factor + scale <- match.arg(scale, choices = c("lowres", "hires")) + scale.factor <- ScaleFactors(object)[[scale]] + coordinates[, c("x", "y")] <- coordinates[, c("x", "y")] * scale.factor + } + + return (coordinates) +} + #' Get Variable Feature Information #' #' Get variable feature information from \code{\link{SCTAssay}} objects #' #' @inheritParams SeuratObject::HVFInfo +#' @param method method to determine variable features #' #' @export +#' @concept objects #' @method HVFInfo SCTAssay #' #' @seealso \code{\link[SeuratObject]{HVFInfo}} #' #' @examples +#' \dontrun{ #' # Get the HVF info directly from an SCTAssay object #' pbmc_small <- SCTransform(pbmc_small) -#' HVFInfo(pbmc_small[["SCT"]], selection.method = 'sct')[1:5, ] +#' HVFInfo(pbmc_small[["SCT"]], method = 'sct')[1:5, ] +#' } #' -HVFInfo.SCTAssay <- function(object, selection.method, status = FALSE, ...) { +HVFInfo.SCTAssay <- function(object, method, status = FALSE, ...) { CheckDots(...) disp.methods <- c('mean.var.plot', 'dispersion', 'disp') - if (tolower(x = selection.method) %in% disp.methods) { - selection.method <- 'mvp' + if (tolower(x = method) %in% disp.methods) { + method <- 'mvp' } - selection.method <- switch( - EXPR = tolower(x = selection.method), + method <- switch( + EXPR = tolower(x = method), 'sctransform' = 'sct', - selection.method + method ) vars <- c('gmean', 'variance', 'residual_variance') hvf.info <- SCTResults(object = object, slot = "feature.attributes")[,vars] @@ -1563,7 +1778,7 @@ HVFInfo.SCTAssay <- function(object, selection.method, status = FALSE, ...) { #' #' @seealso \code{\link[SeuratObject:Radius]{SeuratObject::Radius}} #' -Radius.SlideSeq <- function(object) { +Radius.SlideSeq <- function(object, ...) { return(0.005) } @@ -1573,20 +1788,41 @@ Radius.SlideSeq <- function(object) { #' @method Radius STARmap #' @export #' -Radius.STARmap <- function(object) { +Radius.STARmap <- function(object, ...) { return(NULL) } +#' +#' @param scale A factor to scale the radius by; one of: "hires", +#' "lowres", or \code{NULL} for the unscaled value. +#' #' @rdname Radius #' @concept objects #' @concept spatial #' @method Radius VisiumV1 #' @export #' -Radius.VisiumV1 <- function(object) { - return(slot(object = object, name = 'spot.radius')) +Radius.VisiumV1 <- function(object, scale = "lowres", ...) { + scale.factors <- ScaleFactors(object = object) + + spot.size <- scale.factors[["spot"]] + scale.factor <- scale.factors[[ + match.arg(scale, choices = c("hires", "lowres")) + ]] + + image.size <- max(dim(GetImage(object, resolution=scale)$raster)) + + return ((spot.size * scale.factor) / image.size) } +#' @rdname Radius +#' @concept objects +#' @concept spatial +#' @method Radius VisiumV1 +#' @export +#' +Radius.VisiumV2 <- Radius.VisiumV1 + #' @rdname RenameCells #' @export #' @concept objects @@ -1777,6 +2013,103 @@ SCTResults.Seurat <- function(object, assay = "SCT", slot, model = NULL, ...) { return(SCTResults(object = object[[assay]], slot = slot, model = model, ...)) } +#' @importFrom utils head +#' @method VariableFeatures SCTModel +#' @export +#' +VariableFeatures.SCTModel <- function(object, method = NULL, nfeatures = 3000, ...) { + if (!is_scalar_integerish(x = nfeatures) || (!is_na(x = nfeatures < 1L) && nfeatures < 1L)) { + abort(message = "'nfeatures' must be a single positive integer") + } + feature.attr <- SCTResults(object = object, slot = 'feature.attributes') + feature.variance <- feature.attr[, 'residual_variance'] + names(x = feature.variance) <- row.names(x = feature.attr) + feature.variance <- sort(x = feature.variance, decreasing = TRUE) + if (is_na(x = nfeatures)) { + return(names(x = feature.variance)) + } + return(head(x = names(x = feature.variance), n = nfeatures)) +} + +#' @importFrom utils head +#' @method VariableFeatures SCTAssay +#' @export +#' +VariableFeatures.SCTAssay <- function( + object, + method = NULL, + layer = NULL, + nfeatures = NULL, + simplify = TRUE, + use.var.features = TRUE, + ... +) { + # Is the information already in var.features? + var.features.existing <- slot(object = object, name = "var.features") + nfeatures <- nfeatures %||% length(x = var.features.existing) %||% 3000 + if (is.null(x = layer)) { + layer <- levels(x = object) + } + if (simplify == TRUE & use.var.features == TRUE & length(var.features.existing) >= nfeatures){ + return(head(x = var.features.existing, n = nfeatures)) + } + + layer <- match.arg(arg = layer, choices = levels(x = object), several.ok = TRUE) + # run variable features on each model + + vf.list <- sapply( + X = layer, + FUN = function(lyr) { + return(VariableFeatures( + object = components(object = object, model = lyr), + nfeatures = nfeatures, + ... + )) + }, + simplify = FALSE, + USE.NAMES = TRUE + ) + if (isFALSE(x = simplify)){ + return(vf.list) + } + var.features <- sort( + x = table(unlist(x = vf.list, use.names = FALSE)), + decreasing = TRUE + ) + if (length(x = var.features) == 0) { + return(NULL) + } + tie.val <- var.features[min(nfeatures, length(x = var.features))] + features <- names(x = var.features[which(x = var.features > tie.val)]) + if (length(x = features) > 0) { + feature.ranks <- sapply(X = features, FUN = function(x) { + ranks <- sapply(X = vf.list, FUN = function(vf) { + if (x %in% vf) { + return(which(x = x == vf)) + } + return(NULL) + }) + median(x = unlist(x = ranks)) + }) + features <- names(x = sort(x = feature.ranks)) + } + features.tie <- var.features[which(x = var.features == tie.val)] + tie.ranks <- sapply(X = names(x = features.tie), FUN = function(x) { + ranks <- sapply(X = vf.list, FUN = function(vf) { + if (x %in% vf) { + return(which(x = x == vf)) + } + return(NULL) + }) + median(x = unlist(x = ranks)) + }) + features <- c( + features, + names(x = head(x = sort(x = tie.ranks), nfeatures - length(x = features))) + ) + return(features) +} + #' @rdname ScaleFactors #' @method ScaleFactors VisiumV1 #' @export @@ -1787,12 +2120,31 @@ ScaleFactors.VisiumV1 <- function(object, ...) { } #' @rdname ScaleFactors -#' @method ScaleFactors VisiumV1 +#' @method ScaleFactors VisiumV2 #' @export #' @concept spatial #' -ScaleFactors.VisiumV1 <- function(object, ...) { - return(slot(object = object, name = 'scale.factors')) +ScaleFactors.VisiumV2 <- ScaleFactors.VisiumV1 + +#' @method FetchData VisiumV1 +#' @export +#' @concept spatial +#' +FetchData.VisiumV1 <- function( + object, + vars, + cells = NULL, + ... +) { + if (is.numeric(x = cells)) { + cells <- Cells(x = object)[cells] + } else if (is.null(x = cells)) { + cells <- Cells(x = object) + } + vars.unkeyed <- gsub(pattern = paste0('^', Key(object)), replacement = '', x = vars) + coords <- GetTissueCoordinates(object = object)[cells, vars.unkeyed, drop = FALSE] + colnames(x = coords) <- vars + return(coords) } #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -1814,6 +2166,14 @@ ScaleFactors.VisiumV1 <- function(object, ...) { return(subset(x = x, cells = i)) } +#' @method components SCTAssay +#' @export +#' +components.SCTAssay <- function(object, model, ...) { + model <- rlang::arg_match(arg = model, values = levels(x = object)) + return(slot(object = object, name = 'SCTModel.list')[[model]]) +} + #' @method dim SlideSeq #' @concept objects #' @export @@ -1843,6 +2203,11 @@ dim.VisiumV1 <- function(x) { return(dim(x = GetImage(object = x)$raster)) } +#' @method dim VisiumV2 +#' @concept objects +#' @export +#' +dim.VisiumV2 <- dim.VisiumV1 #' @rdname SCTAssay-class #' @name SCTAssay-class @@ -1924,20 +2289,33 @@ merge.SCTAssay <- function( ... ) { assays <- c(x, y) + if (any(sapply( + X = assays, + FUN = function(assay.i) inherits(x = assay.i, what = "Assay5") + ))) { + return(merge(x = as(x, "Assay5"), y, ...)) + } parent.call <- grep(pattern = "merge.Seurat", x = sys.calls()) if (length(x = parent.call) > 0) { # Try and fill in missing residuals if called in the context of merge.Seurat - all.features <- unique(x = unlist(x = lapply(X = assays, FUN = function(assay) { - if (inherits(x = x, what = "SCTAssay")) { + all.features <- unique( + x = unlist( + x = lapply( + X = assays, + FUN = function(assay) { + if (inherits(x = assay, what = "SCTAssay")) { return(rownames(x = GetAssayData(object = assay, slot = "scale.data"))) } - }))) + }) + ) + ) if (!is.null(all.features)) { assays <- lapply(X = 1:length(x = assays), FUN = function(assay) { if (inherits(x = assays[[assay]], what = "SCTAssay")) { parent.environ <- sys.frame(which = parent.call[1]) seurat.object <- parent.environ$objects[[assay]] - seurat.object <- suppressWarnings(expr = GetResidual(object = seurat.object, features = all.features, assay = parent.environ$assay, verbose = FALSE)) + seurat.object <- suppressWarnings(expr = GetResidual(object = seurat.object, features = all.features, + assay = parent.environ$assay, verbose = FALSE)) return(seurat.object[[parent.environ$assay]]) } return(assays[[assay]]) @@ -2025,6 +2403,8 @@ merge.SCTAssay <- function( combined.assay, SCTModel.list = model.list ) + features <- VariableFeatures(object = combined.assay) + VariableFeatures(object = combined.assay) <- features return(combined.assay) } @@ -2164,7 +2544,10 @@ subset.SCTAssay <- function(x, cells = NULL, features = NULL, ...) { attr <- SCTResults(object = x, slot = "cell.attributes", model = m) attr <- attr[intersect(x = rownames(x = attr), y = Cells(x = x)), , drop = FALSE] SCTResults(object = x, slot = "cell.attributes", model = m) <- attr - } + if (nrow(x = attr) == 0) { + slot(object = x, name = 'SCTModel.list')[[m]] <- NULL + } + } return(x) } @@ -2309,6 +2692,22 @@ setMethod( } ) +setMethod( + f = 'show', + signature = 'BridgeReferenceSet', + definition = function(object) { + cat( + 'A BridgeReferenceSet object has a bridge object with ', + ncol(slot(object = object, name = 'bridge')), + 'cells and a reference object with ', + ncol(slot(object = object, name = 'reference')), + 'cells. \n','The bridge query reduction is ', + slot(object = object, name = 'params')$bridge.query.reduction %||% + slot(object = object, name = 'params')$supervised.reduction, + "\n This can be used as input to FindBridgeTransferAnchors and FindBridgeIntegrationAnchors") + } +) + setMethod( f = 'show', signature = 'SCTModel', @@ -2406,24 +2805,6 @@ Collections <- function(object) { return(names(x = collections)) } -# Calculate nCount and nFeature -# -# @param object An Assay object -# -# @return A named list with nCount and nFeature -# -#' @importFrom Matrix colSums -# -CalcN <- function(object) { - if (IsMatrixEmpty(x = GetAssayData(object = object, slot = "counts"))) { - return(NULL) - } - return(list( - nCount = colSums(x = object, slot = 'counts'), - nFeature = colSums(x = GetAssayData(object = object, slot = 'counts') > 0) - )) -} - # Get the default image of an object # # Attempts to find all images associated with the default assay of the object. @@ -2827,6 +3208,7 @@ UpdateKey <- function(key) { # # @return \code{object} with the latest slot definitions # +#' @importFrom rlang exec !!! UpdateSlots <- function(object) { object.list <- sapply( X = slotNames(x = object), @@ -2843,7 +3225,10 @@ UpdateSlots <- function(object) { ) object.list <- Filter(f = Negate(f = is.null), x = object.list) object.list <- c('Class' = class(x = object)[1], object.list) - object <- do.call(what = 'new', args = object.list) + object <- exec( + .fn = new, + !!! object.list + ) for (x in setdiff(x = slotNames(x = object), y = names(x = object.list))) { xobj <- slot(object = object, name = x) if (is.vector(x = xobj) && !is.list(x = xobj) && length(x = xobj) == 0) { diff --git a/R/preprocessing.R b/R/preprocessing.R index cc238aaac..38a680f9e 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -249,11 +249,13 @@ HTODemux <- function( ) #average hto signals per cluster #work around so we don't average all the RNA levels which takes time - average.expression <- AverageExpression( - object = object, - assays = assay, - verbose = FALSE - )[[assay]] + average.expression <- suppressWarnings( + AverageExpression( + object = object, + assays = assay, + verbose = FALSE + )[[assay]] + ) #checking for any cluster with all zero counts for any barcode if (sum(average.expression == 0) > 0) { stop("Cells with zero counts exist as a cluster.") @@ -363,15 +365,17 @@ HTODemux <- function( #' @seealso \code{\link[sctransform]{get_residuals}} #' #' @examples +#' \dontrun{ #' data("pbmc_small") #' pbmc_small <- SCTransform(object = pbmc_small, variable.features.n = 20) #' pbmc_small <- GetResidual(object = pbmc_small, features = c('MS4A1', 'TCL1A')) +#' } #' GetResidual <- function( object, features, assay = NULL, - umi.assay = NULL, + umi.assay = "RNA", clip.range = NULL, replace.value = FALSE, na.rm = TRUE, @@ -418,23 +422,43 @@ GetResidual <- function( "This SCTAssay contains multiple SCT models. Computing residuals for cells using different models" ) } - new.residuals <- lapply( - X = sct.models, - FUN = function(x) { - GetResidualSCTModel( - object = object, - assay = assay, - SCTModel = x, - new_features = features, - replace.value = replace.value, - clip.range = clip.range, - verbose = verbose - ) - } - ) + if (!umi.assay %in% Assays(object = object) || + length(x = Layers(object = object[[umi.assay]], search = 'counts')) == 0) { + return(object) + } + if (inherits(x = object[[umi.assay]], what = 'Assay')) { + new.residuals <- lapply( + X = sct.models, + FUN = function(x) { + GetResidualSCTModel( + object = object, + assay = assay, + SCTModel = x, + new_features = features, + replace.value = replace.value, + clip.range = clip.range, + verbose = verbose + ) + } + ) + } else if (inherits(x = object[[umi.assay]], what = 'Assay5')) { + new.residuals <- lapply( + X = sct.models, + FUN = function(x) { + FetchResidualSCTModel(object = object, + assay = assay, + umi.assay = umi.assay, + SCTModel = x, + new_features = features, + replace.value = replace.value, + clip.range = clip.range, + verbose = verbose) + } + ) + } existing.data <- GetAssayData(object = object, slot = 'scale.data', assay = assay) all.features <- union(x = rownames(x = existing.data), y = features) - new.scale <- matrix( + new.scale <- matrix( data = NA, nrow = length(x = all.features), ncol = ncol(x = object), @@ -474,13 +498,13 @@ GetResidual <- function( #' and the image data in a subdirectory called \code{spatial} #' @param filename Name of H5 file containing the feature barcode matrix #' @param slice Name for the stored image of the tissue slice +#' @param bin.size Specifies the bin sizes to read in - defaults to c(16, 8) #' @param filter.matrix Only keep spots that have been determined to be over #' tissue -#' @param to.upper Converts all feature names to upper case. This can provide an -#' approximate conversion of mouse to human gene names which can be useful in an -#' explorative analysis. For cross-species comparisons, orthologous genes should -#' be identified across species and used instead. -#' @param image An object of class VisiumV1. Typically, an output from \code{\link{Read10X_Image}} +#' @param to.upper Converts all feature names to upper case. Can be useful when +#' analyses require comparisons between human and mouse gene names for example. +#' @param image \code{VisiumV1}/\code{VisiumV2} instance(s) - if a vector is +#' passed in it should be co-indexed with \code{`bin.size`} #' @param ... Arguments passed to \code{\link{Read10X_h5}} #' #' @return A \code{Seurat} object @@ -488,6 +512,7 @@ GetResidual <- function( #' @importFrom png readPNG #' @importFrom grid rasterGrob #' @importFrom jsonlite fromJSON +#' @importFrom purrr imap #' #' @export #' @concept preprocessing @@ -499,50 +524,125 @@ GetResidual <- function( #' Load10X_Spatial(data.dir = data_dir) #' } #' -Load10X_Spatial <- function( - data.dir, - filename = 'filtered_feature_bc_matrix.h5', - assay = 'Spatial', - slice = 'slice1', - filter.matrix = TRUE, - to.upper = FALSE, - image = NULL, - ... +Load10X_Spatial <- function ( + data.dir, + filename = "filtered_feature_bc_matrix.h5", + assay = "Spatial", + slice = "slice1", + bin.size = NULL, + filter.matrix = TRUE, + to.upper = FALSE, + image = NULL, + ... ) { + # if more than one directory is passed in if (length(x = data.dir) > 1) { - warning("'Load10X_Spatial' accepts only one 'data.dir'", - immediate. = TRUE) + # party on with the first value data.dir <- data.dir[1] + # but also raise a warning + warning( + paste0( + "`data.dir` expects a single value but recieved multiple - ", + "continuing using the first: '", + data.dir, + "'." + ), + immediate. = TRUE, + ) } - data <- Read10X_h5(filename = file.path(data.dir, filename), - ...) - if (to.upper) { - data <- imap(data, ~{ - rownames(.x) <- toupper(rownames(.x)) - .x - }) + # if the specified directory does not exist + if (!file.exists(data.dir)) { + # raise an error + stop(paste0("No such file or directory: ", "'", data.dir, "'")) } - if (is.list(data) & "Antibody Capture" %in% names(data)) { - matrix_gex <- data$`Gene Expression` - matrix_protein <- data$`Antibody Capture` - object <- CreateSeuratObject(counts = matrix_gex, assay = assay) - object_protein <- CreateAssayObject(counts = matrix_protein) - object[["Protein"]] <- object_protein + + # if `bin.size` is not set but `data.dir` points to a folder with binned data + if (is.null(bin.size) & file.exists(paste0(data.dir, "/binned_outputs"))) { + # point `bin.size` to the "standard" set - i.e. everything in the default + # output except the 8 um binning because it's a memory hog + bin.size <- c(16, 8) + } + # if `bin.size` is specified + if(!is.null(bin.size)) { + # convert `bin.size` to a character vector and pad values to three digits + bin.size.pretty <- paste0(sprintf("%03d", bin.size), "um") + # point `data.dirs` to the specified binnings + data.dirs <- paste0( + data.dir, + "/binned_outputs/", + "square_", + bin.size.pretty + ) + # suffix assay/slice names with each bin size + assay.names <- paste0(assay, ".", bin.size.pretty) + slice.names <- paste0(slice, ".", bin.size.pretty) + } else { + # otherwise just hold onto the top-level directory + data.dirs <- data.dir + # and keep the assay/slice names unchanged + assay.names <- assay + slice.names <- slice } - else { - object <- CreateSeuratObject(counts = data, assay = assay) + + # read in counts matrices from specified h5 files + counts.paths <- lapply(data.dirs, file.path, filename) + counts.list <- lapply(counts.paths, Read10X_h5, ...) + # maybe convert Cell identifiers to uppercase + if (to.upper) { + rownames(counts) <- lapply(rownames(counts), toupper) } - if (is.null(x = image)) { - image <- Read10X_Image(image.dir = file.path(data.dir,"spatial"), - filter.matrix = filter.matrix) + + if (is.null(image)) { + # read in the corresponding images and coordinate mappings + image.list <- mapply( + Read10X_Image, + file.path(data.dirs, "spatial"), + assay = assay.names, + slice = slice.names, + MoreArgs = list(filter.matrix = filter.matrix) + ) + } else { + # make sure any passed images are in a vector + image.list <- c(image) } - else { - if (!inherits(x = image, what = "VisiumV1")) - stop("Image must be an object of class 'VisiumV1'.") + + # check that for each counts matrix there is a corresponding image + if (length(image.list) != length(counts.list)) { + stop( + paste0( + "The number of images does not match the number of counts matrices. ", + "Ensure each spatial dataset has a corresponding image." + ) + ) } - image <- image[Cells(x = object)] - DefaultAssay(object = image) <- assay - object[[slice]] <- image + + # for each counts matrix, build a Seurat object + object.list <- mapply(CreateSeuratObject, counts.list, assay = assay.names) + # associate each counts matrix with its corresponding image + object.list <- mapply( + function( + .object, + .image, + .assay, + .slice + ) { + # align the image's identifiers with the object's + .image <- .image[Cells(.object)] + # add the image to the corresponding Seurat instance + .object[[.slice]] <- .image + return (.object) + }, + object.list, + image.list, + assay.names, + slice.names + ) + # merge the Seurat instances - each assay should have unique Cell identifiers + object <- merge( + object.list[[1]], + y = object.list[-1] + ) + return(object) } #' Add 10X Cell Types to a Seurat Object @@ -673,6 +773,39 @@ Load10X <- function(data.dir, filename = "filtered_feature_bc_matrix.h5", } } + +#' Read10x Probe Metadata +#' +#' This function reads the probe metadata from a 10x Genomics probe barcode matrix file in HDF5 format. +#' +#' @param data.dir The directory where the file is located. +#' @param filename The name of the file containing the raw probe barcode matrix in HDF5 format. The default filename is 'raw_probe_bc_matrix.h5'. +#' +#' @return Returns a data.frame containing the probe metadata. +#' +#' @export +#' @concept preprocessing +#' +Read10X_probe_metadata <- function( + data.dir, + filename = 'raw_probe_bc_matrix.h5' +) { + if (!requireNamespace('hdf5r', quietly = TRUE)) { + stop("Please install hdf5r to read HDF5 files") + } + file.path = paste0(data.dir,"/", filename) + if (!file.exists(file.path)) { + stop("File not found") + } + infile <- hdf5r::H5File$new(filename = file.path, mode = 'r') + if("matrix/features/probe_region" %in% hdf5r::list.objects(infile)) { + probe.name <- infile[['matrix/features/name']][] + probe.region<- infile[['matrix/features/probe_region']][] + meta.data <- data.frame(probe.name, probe.region) + return(meta.data) + } +} + #' Load STARmap data #' #' @param data.dir location of data directory that contains the counts matrix, @@ -744,42 +877,89 @@ LoadSTARmap <- function( return(starmap) } -#' Normalize raw data -#' -#' Normalize count data per cell and transform to log scale +#' Load Curio Seeker data #' -#' @param data Matrix with the raw count data -#' @param scale.factor Scale the data. Default is 1e4 -#' @param verbose Print progress +#' @param data.dir location of data directory that contains the counts matrix, +#' gene names, barcodes/beads, and barcodes/bead location files. +#' @param assay Name of assay to associate spatial data to #' -#' @return Returns a matrix with the normalize and log transformed data +#' @return A \code{\link{Seurat}} object #' -#' @importFrom methods as +#' @importFrom Matrix readMM #' #' @export #' @concept preprocessing #' -#' @examples -#' mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5) -#' mat -#' mat_norm <- LogNormalize(data = mat) -#' mat_norm -#' -LogNormalize <- function(data, scale.factor = 1e4, verbose = TRUE) { - if (is.data.frame(x = data)) { - data <- as.matrix(x = data) +LoadCurioSeeker <- function(data.dir, assay = "Spatial") { + # check and find input files + if (length(x = data.dir) > 1) { + warning("'LoadCurioSeeker' accepts only one 'data.dir'", + immediate. = TRUE) + data.dir <- data.dir[1] } - if (!inherits(x = data, what = 'dgCMatrix')) { - data <- as.sparse(x = data) + mtx.file <- list.files( + data.dir, + pattern = "*MoleculesPerMatchedBead.mtx", + full.names = TRUE) + if (length(x = mtx.file) > 1) { + warning("Multiple files matched the pattern '*MoleculesPerMatchedBead.mtx'", + immediate. = TRUE) + } else if (length(x = mtx.file) == 0) { + stop("No file matched the pattern '*MoleculesPerMatchedBead.mtx'", call. = FALSE) } - # call Rcpp function to normalize - if (verbose) { - cat("Performing log-normalization\n", file = stderr()) + mtx.file <- mtx.file[1] + barcodes.file <- list.files( + data.dir, + pattern = "*barcodes.tsv", + full.names = TRUE) + if (length(x = barcodes.file) > 1) { + warning("Multiple files matched the pattern '*barcodes.tsv'", + immediate. = TRUE) + } else if (length(x = barcodes.file) == 0) { + stop("No file matched the pattern '*barcodes.tsv'", call. = FALSE) } - norm.data <- LogNorm(data, scale_factor = scale.factor, display_progress = verbose) - colnames(x = norm.data) <- colnames(x = data) - rownames(x = norm.data) <- rownames(x = data) - return(norm.data) + barcodes.file <- barcodes.file[1] + genes.file <- list.files( + data.dir, + pattern = "*genes.tsv", + full.names = TRUE) + if (length(x = genes.file) > 1) { + warning("Multiple files matched the pattern '*genes.tsv'", + immediate. = TRUE) + } else if (length(x = genes.file) == 0) { + stop("No file matched the pattern '*genes.tsv'", call. = FALSE) + } + genes.file <- genes.file[1] + coordinates.file <- list.files( + data.dir, + pattern = "*MatchedBeadLocation.csv", + full.names = TRUE) + if (length(x = coordinates.file) > 1) { + warning("Multiple files matched the pattern '*MatchedBeadLocation.csv'", + immediate. = TRUE) + } else if (length(x = coordinates.file) == 0) { + stop("No file matched the pattern '*MatchedBeadLocation.csv'", call. = FALSE) + } + coordinates.file <- coordinates.file[1] + + # load counts matrix and create seurat object + mtx <- readMM(mtx.file) + mtx <- as.sparse(mtx) + barcodes <- read.csv(barcodes.file, header = FALSE) + genes <- read.csv(genes.file, header = FALSE) + colnames(mtx) <- barcodes$V1 + rownames(mtx) <- genes$V1 + object <- CreateSeuratObject(counts = mtx, assay = assay) + + # load positions of each bead and store in a SlideSeq object in images slot + coords <- read.csv(coordinates.file) + colnames(coords) <- c("cell", "x", "y") + coords$y <- -coords$y + rownames(coords) <- coords$cell + coords$cell <- NULL + image <- new(Class = 'SlideSeq', assay = assay, coordinates = coords) + object[["Slice"]] <- image + return(object) } #' Demultiplex samples based on classification method from MULTI-seq (McGinnis et al., bioRxiv 2018) @@ -1154,54 +1334,158 @@ Read10X_h5 <- function(filename, use.names = TRUE, unique.features = TRUE) { #' #' @param image.dir Path to directory with 10X Genomics visium image data; #' should include files \code{tissue_lowres_image.png}, -#' @param image.name The file name of the image. Defaults to tissue_lowres_image.png. #' \code{scalefactors_json.json} and \code{tissue_positions_list.csv} +#' @param image.name PNG file to read in +#' @param assay Name of associated assay +#' @param slice Name for the image, used to populate the instance's key #' @param filter.matrix Filter spot/feature matrix to only include spots that -#' have been determined to be over tissue. -#' @param ... Ignored for now +#' have been determined to be over tissue #' -#' @return A \code{\link{VisiumV1}} object +#' @return A \code{\link{VisiumV2}} object #' -#' @importFrom png readPNG -#' @importFrom jsonlite fromJSON -#' -#' @seealso \code{\link{VisiumV1}} \code{\link{Load10X_Spatial}} +#' @seealso \code{\link{VisiumV2}} \code{\link{Load10X_Spatial}} #' #' @export #' @concept preprocessing #' -Read10X_Image <- function(image.dir, image.name = "tissue_lowres_image.png", filter.matrix = TRUE, ...) { - image <- readPNG(source = file.path(image.dir, image.name)) - scale.factors <- fromJSON(txt = file.path(image.dir, 'scalefactors_json.json')) - tissue.positions.path <- Sys.glob(paths = file.path(image.dir, 'tissue_positions*')) - tissue.positions <- read.csv( - file = tissue.positions.path, - col.names = c('barcodes', 'tissue', 'row', 'col', 'imagerow', 'imagecol'), - header = ifelse( - test = basename(tissue.positions.path) == "tissue_positions.csv", - yes = TRUE, - no = FALSE - ), - as.is = TRUE, - row.names = 1 +Read10X_Image <- function( + image.dir, + image.name = "tissue_lowres_image.png", + assay = "Spatial", + slice = "slice1", + filter.matrix = TRUE +) { + image <- png::readPNG( + source = file.path( + image.dir, + image.name + ) + ) + + # read in the scale factors + scale.factors <- Read10X_ScaleFactors( + filename = file.path(image.dir, "scalefactors_json.json") + ) + + # read in the tissue coordinates as a data.frame + coordinates <- Read10X_Coordinates( + filename = Sys.glob(file.path(image.dir, "*tissue_positions*")), + filter.matrix + ) + # create an `sp` compatible `FOV` instance + fov <- CreateFOV( + coordinates[, c("imagerow", "imagecol")], + type = "centroids", + radius = scale.factors[["spot"]], + assay = assay, + key = Key(slice, quiet = TRUE) + ) + + # build the final `VisiumV2` - essentially just adding `image` and + # `scale.factors` to the object + visium.fov <- new( + Class = "VisiumV2", + boundaries = fov@boundaries, + molecules = fov@molecules, + assay = fov@assay, + key = fov@key, + image = image, + scale.factors = scale.factors ) + + return(visium.fov) +} + +#' Load 10X Genomics Visium Tissue Positions +#' +#' @param filename Path to a \code{tissue_positions_list.csv} file +#' @param filter.matrix Filter spot/feature matrix to only include spots that +#' have been determined to be over tissue +#' +#' @return A data.frame +#' +#' @export +#' @concept preprocessing +#' +Read10X_Coordinates <- function(filename, filter.matrix) { + # output columns names + col.names <- c("barcodes", "tissue", "row", "col", "imagerow", "imagecol") + + # if the coordinate mappings are in a parquet file + if (tools::file_ext(filename) == "parquet") { + # `arrow` must be installed to read parquet files + if (!requireNamespace("arrow", quietly = TRUE)) { + stop("Please install arrow to read parquet files") + } + + # read in coordinates and conver the resulting tibble into a data.frame + coordinates <- as.data.frame(arrow::read_parquet(filename)) + # normalize column names for consistency with other datatypes + input.col.names <- c( + "barcode", + "in_tissue", + "array_row", + "array_col", + "pxl_row_in_fullres", + "pxl_col_in_fullres" + ) + col.map <- stats::setNames(col.names, input.col.names) + colnames(coordinates) <- ifelse( + colnames(coordinates) %in% names(col.map), + col.map[colnames(coordinates)], + colnames(coordinates) + ) + + # set rownames to "barcodes" then drop the column + rownames(coordinates) <- coordinates[["barcodes"]] + coordinates[["barcodes"]] <- NULL + + } else { + # the coordinate mappings must be in a CSV - read it in + coordinates <- read.csv( + file = filename, + col.names = col.names, + header = ifelse( + # assume files calles "tissue_positions.csv" have headers, otherwise + # assume they do not (i.e. "tissue_positions_list.csv") + test = basename(filename) == "tissue_positions.csv", + yes = TRUE, + no = FALSE + ), + as.is = TRUE, + row.names = 1 + ) + } + + # the `tissue` column should contain a boolean indicating whether or not a + # spot sits on top of the the tissue sample - maybe filter spots that do not if (filter.matrix) { - tissue.positions <- tissue.positions[which(x = tissue.positions$tissue == 1), , drop = FALSE] + coordinates <- coordinates[which(coordinates$tissue == 1), , drop = FALSE] } - unnormalized.radius <- scale.factors$fiducial_diameter_fullres * scale.factors$tissue_lowres_scalef - spot.radius <- unnormalized.radius / max(dim(x = image)) - return(new( - Class = 'VisiumV1', - image = image, - scale.factors = scalefactors( - spot = scale.factors$tissue_hires_scalef, - fiducial = scale.factors$fiducial_diameter_fullres, - hires = scale.factors$tissue_hires_scalef, - scale.factors$tissue_lowres_scalef - ), - coordinates = tissue.positions, - spot.radius = spot.radius - )) + + return (coordinates) +} + +#' Load 10X Genomics Visium Scale Factors +#' +#' @param filename Path to a \code{scalefactors_json.json} file +#' +#' @return A scalefactors object +#' +#' @export +#' @concept preprocessing +#' +Read10X_ScaleFactors <- function(filename) { + raw.data <- jsonlite::fromJSON(file.path(filename)) + + scale.factors <- scalefactors( + spot = raw.data$spot_diameter_fullres, + fiducial = raw.data$fiducial_diameter_fullres, + hires = raw.data$tissue_hires_scalef, + lowres = raw.data$tissue_lowres_scalef + ) + + return (scale.factors) } #' Read and Load Akoya CODEX data @@ -1677,7 +1961,7 @@ ReadMtx <- function( feature.column, ". Try specifiying a different column.", call. = FALSE - ) + ) } else { warning( "Some features names are NA in column ", @@ -1686,7 +1970,7 @@ ReadMtx <- function( replacement.column, ".", call. = FALSE - ) + ) } feature.names[na.features, feature.column] <- feature.names[na.features, replacement.column] } @@ -1710,7 +1994,7 @@ ReadMtx <- function( no = "" ), call. = FALSE - ) + ) } if (length(x = feature.names) != nrow(x = data)) { stop( @@ -1724,7 +2008,7 @@ ReadMtx <- function( no = "" ), call. = FALSE - ) + ) } colnames(x = data) <- cell.names @@ -2010,7 +2294,7 @@ ReadNanostring <- function( tx <- subset(tx, select = -c(fov, cell_ID)) } - tx <- as.data.frame(t(x = as.matrix(x = tx[, -1, drop = FALSE]))) + tx <- as.data.frame(t(x = as.matrix(x = tx))) if (!is.na(x = genes.filter)) { ptx( message = paste("Filtering genes with pattern", genes.filter), @@ -2124,12 +2408,15 @@ ReadNanostring <- function( #' \itemize{ #' \item \dQuote{matrix}: the counts matrix #' \item \dQuote{microns}: molecule coordinates +#' \item \dQuote{segmentation_method}: cell segmentation method (for runs which +#' use multi-modal segmentation) #' } #' @param type Type of cell spatial coordinate matrices to read; choose one #' or more of: #' \itemize{ #' \item \dQuote{centroids}: cell centroids in pixel coordinate space #' \item \dQuote{segmentations}: cell segmentations in pixel coordinate space +#' \item \dQuote{nucleus_segmentations}: nucleus segmentations in pixel coordinate space #' } #' @param mols.qv.threshold Remove transcript molecules with #' a QV less than this threshold. QV >= 20 is the standard threshold @@ -2153,26 +2440,41 @@ ReadNanostring <- function( #' ReadXenium <- function( data.dir, - outs = c("matrix", "microns"), + outs = c("segmentation_method", "matrix", "microns"), type = "centroids", - mols.qv.threshold = 20 + mols.qv.threshold = 20, + flip.xy = F ) { # Argument checking type <- match.arg( arg = type, - choices = c("centroids", "segmentations"), + choices = c("centroids", "segmentations", "nucleus_segmentations"), several.ok = TRUE ) outs <- match.arg( arg = outs, - choices = c("matrix", "microns"), + choices = c("segmentation_method", "matrix", "microns"), several.ok = TRUE ) outs <- c(outs, type) has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) + has_arrow <- requireNamespace("arrow", quietly = TRUE) + has_hdf5r <- requireNamespace("hdf5r", quietly = TRUE) + + binary_to_string <- function(arrow_binary) { + if(typeof(arrow_binary) == 'list') { + unlist( + lapply( + arrow_binary, function(x) rawToChar(as.raw(strtoi(x, 16L))) + ) + ) + } else { + arrow_binary + } + } data <- sapply(outs, function(otype) { switch( @@ -2180,10 +2482,66 @@ ReadXenium <- function( 'matrix' = { pmtx <- progressor() pmtx(message = 'Reading counts matrix', class = 'sticky', amount = 0) - matrix <- suppressWarnings(Read10X(data.dir = file.path(data.dir, "cell_feature_matrix/"))) + + for(option in Filter(function(x) x$req, list( + list(filename = "cell_feature_matrix.h5", fn = Read10X_h5, req = has_hdf5r), + list(filename = "cell_feature_matrix", fn = Read10X, req = TRUE) + ))) { + matrix <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(matrix, "try-error")) { break } + } + + if(!exists('matrix') || inherits(matrix, "try-error")) { + stop("Xenium outputs were incomplete: missing cell_feature_matrix") + } + pmtx(type = "finish") matrix }, + 'segmentation_method' = { + psegs <- progressor() + psegs( + message = 'Loading cell metadata', + class = 'sticky', + amount = 0 + ) + + col.use <- c( + cell_id = 'cell', + segmentation_method = 'segmentation_method' + ) + + for(option in Filter(function(x) x$req, list( + list( + filename = "cells.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), + req = has_arrow + ), + list( + filename = "cells.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE, select = names(col.use)), + req = has_dt + ), + list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + cell_seg <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename))), silent = TRUE) + if(!inherits(cell_seg, "try-error")) { break } + } + + if(!exists('cell_seg') || inherits(cell_seg, "try-error") || length(intersect(names(col.use), colnames(cell_seg))) != 2) { + warning('cells did not contain a segmentation_method column. Skipping...', call. = FALSE, immediate. = TRUE) + NULL + } else { + cell_seg <- cell_seg[, names(col.use)] + colnames(cell_seg) <- col.use + + cell_seg$cell <- binary_to_string(cell_seg$cell) + + psegs(type = 'finish') + + data.frame(segmentation_method = cell_seg$segmentation_method, row.names = cell_seg$cell) + } + }, 'centroids' = { pcents <- progressor() pcents( @@ -2191,19 +2549,42 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - if (has_dt) { - cell_info <- as.data.frame(data.table::fread(file.path(data.dir, "cells.csv.gz"))) - } else { - cell_info <- read.csv(file.path(data.dir, "cells.csv.gz")) - } - cell_centroid_df <- data.frame( - x = cell_info$x_centroid, - y = cell_info$y_centroid, - cell = cell_info$cell_id, - stringsAsFactors = FALSE + + col.use <- c( + x_centroid = letters[24 + flip.xy], + y_centroid = letters[25 - flip.xy], + cell_id = 'cell' ) + + for(option in Filter(function(x) x$req, list( + list( + filename = "cells.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), + req = has_arrow + ), + list( + filename = "cells.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE, select = names(col.use)), + req = has_dt + ), + list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + cell_info <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(cell_info, "try-error")) { break } + } + + if(!exists('cell_info') || inherits(cell_info, "try-error")) { + stop("Xenium outputs were incomplete: missing cells") + } + + cell_info$cell_id <- binary_to_string(cell_info$cell_id) + + cell_info <- cell_info[, names(col.use)] + colnames(cell_info) <- col.use + pcents(type = 'finish') - cell_centroid_df + + cell_info }, 'segmentations' = { psegs <- progressor() @@ -2213,16 +2594,79 @@ ReadXenium <- function( amount = 0 ) - # load cell boundaries - if (has_dt) { - cell_boundaries_df <- as.data.frame(data.table::fread(file.path(data.dir, "cell_boundaries.csv.gz"))) - } else { - cell_boundaries_df <- read.csv(file.path(data.dir, "cell_boundaries.csv.gz"), stringsAsFactors = FALSE) + for(option in Filter(function(x) x$req, list( + list( + filename = "cell_boundaries.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x)), + req = has_arrow + ), + list( + filename = "cell_boundaries.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE), + req = has_dt + ), + list(filename = "cell_boundaries.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + cell_boundaries_df <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(cell_boundaries_df, "try-error")) { break } } - names(cell_boundaries_df) <- c("cell", "x", "y") + + if(!exists('cell_boundaries_df') || inherits(cell_boundaries_df, "try-error")) { + stop("Xenium outputs were incomplete: missing cell_boundaries") + } + + colnames(cell_boundaries_df) <- c( + 'cell', + letters[24 + flip.xy], + letters[25 - flip.xy] + ) + + cell_boundaries_df$cell <- binary_to_string(cell_boundaries_df$cell) + psegs(type = "finish") + cell_boundaries_df }, + 'nucleus_segmentations' = { + psegs <- progressor() + psegs( + message = 'Loading nucleus segmentations', + class = 'sticky', + amount = 0 + ) + + for(option in Filter(function(x) x$req, list( + list( + filename = "nucleus_boundaries.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x)), + req = has_arrow + ), + list( + filename = "nucleus_boundaries.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE), + req = has_dt + ), + list(filename = "nucleus_boundaries.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + nucleus_boundaries_df <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(nucleus_boundaries_df, "try-error")) { break } + } + + if(!exists('nucleus_boundaries_df') || inherits(nucleus_boundaries_df, "try-error")) { + stop("Xenium outputs were incomplete: missing nucleus_boundaries") + } + + colnames(nucleus_boundaries_df) <- c( + 'cell', + letters[24 + flip.xy], + letters[25 - flip.xy] + ) + nucleus_boundaries_df$cell <- binary_to_string(nucleus_boundaries_df$cell) + + psegs(type = "finish") + + nucleus_boundaries_df + }, 'microns' = { pmicrons <- progressor() pmicrons( @@ -2231,28 +2675,65 @@ ReadXenium <- function( amount = 0 ) - # molecules - if (has_dt) { - tx_dt <- as.data.frame(data.table::fread(file.path(data.dir, "transcripts.csv.gz"))) - transcripts <- subset(tx_dt, qv >= mols.qv.threshold) - } else { - transcripts <- read.csv(file.path(data.dir, "transcripts.csv.gz")) - transcripts <- subset(transcripts, qv >= mols.qv.threshold) + col.use = c( + x_location = letters[24+flip.xy], + y_location = letters[25-flip.xy], + feature_name = 'gene' + ) + + for(option in Filter(function(x) x$req, list( + list( + filename = "transcripts.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), + req = has_arrow + ), + list( + filename = "transcripts.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, select = names(col.use), stringsAsFactors = FALSE), + req = has_dt + ), + list(filename = "transcripts.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + transcripts <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(transcripts, "try-error")) { break } } - df <- - data.frame( - x = transcripts$x_location, - y = transcripts$y_location, - gene = transcripts$feature_name, - stringsAsFactors = FALSE - ) + if(!exists('transcripts') || inherits(transcripts, "try-error")) { + hint <- "" + if(file.exists(file.path(data.dir, "transcripts.parquet"))) { + hint <- ". Xenium outputs no longer include `transcripts.csv.gz`. Instead, please install `arrow` to read transcripts.parquet" + } + + stop(paste0("Xenium outputs were incomplete: missing transcripts", hint)) + } + + transcripts <- transcripts[, names(col.use)] + colnames(transcripts) <- col.use + + transcripts$gene <- binary_to_string(transcripts$gene) + pmicrons(type = 'finish') - df + + transcripts }, stop("Unknown Xenium input type: ", otype) ) - }, USE.NAMES = TRUE) + }, simplify = FALSE, USE.NAMES = TRUE) + + metadata <- file.path(data.dir, "experiment.xenium") + if(file.exists(metadata) && requireNamespace("jsonlite", quietly = TRUE)) { + meta <- jsonlite::read_json(metadata) + data$metadata <- meta[ + intersect( + names(meta), + c( + 'run_start_time', 'preservation_method', 'panel_name', + 'panel_organism', 'panel_tissue_type', + 'instrument_sw_version', 'segmentation_stain' + ) + ) + ] + } return(data) } @@ -3133,16 +3614,15 @@ SampleUMI <- function( #' Use regularized negative binomial regression to normalize UMI count data #' #' This function calls sctransform::vst. The sctransform package is available at -#' https://github.com/ChristophH/sctransform. +#' https://github.com/satijalab/sctransform. #' Use this function as an alternative to the NormalizeData, #' FindVariableFeatures, ScaleData workflow. Results are saved in a new assay #' (named SCT by default) with counts being (corrected) counts, data being log1p(counts), #' scale.data being pearson residuals; sctransform::vst intermediate results are saved #' in misc slot of new assay. #' -#' @param object A seurat object -#' @param assay Name of assay to pull the count data from; default is 'RNA' -#' @param new.assay.name Name for the new assay containing the normalized data +#' @param object UMI counts matrix +#' @param cell.attr A metadata with cell attributes #' @param reference.SCT.model If not NULL, compute residuals for the object #' using the provided SCT model; supports only log_umi as the latent variable. #' If residual.features are not specified, compute for the top variable.features.n @@ -3159,11 +3639,15 @@ SampleUMI <- function( #' use this residual variance cutoff; this is only used when \code{variable.features.n} #' is set to NULL; default is 1.3. Only applied if residual.features is not set. #' @param vars.to.regress Variables to regress out in a second non-regularized linear +#' @param latent.data Extra data to regress out, should be cells x latent data #' regression. For example, percent.mito. Default is NULL #' @param do.scale Whether to scale residuals to have unit variance; default is FALSE #' @param do.center Whether to center residuals to have mean zero; default is TRUE #' @param clip.range Range to clip the residuals to; default is \code{c(-sqrt(n/30), sqrt(n/30))}, #' where n is the number of cells +#' @param vst.flavor When set to 'v2' sets method = glmGamPoi_offset, n_cells=2000, +#' and exclude_poisson = TRUE which causes the model to learn theta and intercept +#' only besides excluding poisson genes from learning and regularization #' @param conserve.memory If set to TRUE the residual matrix for all genes is never #' created in full; useful for large data sets, but will take longer to run; #' this will also set return.only.var.genes to TRUE; default is FALSE @@ -3180,20 +3664,19 @@ SampleUMI <- function( #' slot of the new assay. #' #' @importFrom stats setNames +#' @importFrom Matrix colSums +#' @importFrom SeuratObject as.sparse #' @importFrom sctransform vst get_residual_var get_residuals correct_counts #' #' @seealso \code{\link[sctransform]{correct_counts}} \code{\link[sctransform]{get_residuals}} -#' @export -#' @concept preprocessing #' -#' @examples -#' data("pbmc_small") -#' SCTransform(object = pbmc_small) +#' @rdname SCTransform +#' @concept preprocessing +#' @export #' -SCTransform <- function( +SCTransform.default <- function( object, - assay = 'RNA', - new.assay.name = 'SCT', + cell.attr, reference.SCT.model = NULL, do.correct.umi = TRUE, ncells = 5000, @@ -3201,9 +3684,11 @@ SCTransform <- function( variable.features.n = 3000, variable.features.rv.th = 1.3, vars.to.regress = NULL, + latent.data = NULL, do.scale = FALSE, do.center = TRUE, - clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)), + clip.range = c(-sqrt(x = ncol(x = umi) / 30), sqrt(x = ncol(x = umi) / 30)), + vst.flavor = 'v2', conserve.memory = FALSE, return.only.var.genes = TRUE, seed.use = 1448145, @@ -3213,11 +3698,9 @@ SCTransform <- function( if (!is.null(x = seed.use)) { set.seed(seed = seed.use) } - assay <- assay %||% DefaultAssay(object = object) - assay.obj <- GetAssay(object = object, assay = assay) - umi <- GetAssayData(object = assay.obj, slot = 'counts') - cell.attr <- slot(object = object, name = 'meta.data') vst.args <- list(...) + object <- as.sparse(x = object) + umi <- object # check for batch_var in meta data if ('batch_var' %in% names(x = vst.args)) { if (!(vst.args[['batch_var']] %in% colnames(x = cell.attr))) { @@ -3268,9 +3751,18 @@ SCTransform <- function( immediate. = TRUE ) } + + if (!is.null(x = vst.flavor) && !vst.flavor %in% c("v1", "v2")){ + stop("vst.flavor can be 'v1' or 'v2'. Default is 'v2'") + } + if (!is.null(x = vst.flavor) && vst.flavor == "v1"){ + vst.flavor <- NULL + } + + vst.args[['vst.flavor']] <- vst.flavor vst.args[['umi']] <- umi vst.args[['cell_attr']] <- cell.attr - vst.args[['verbosity']] <- as.numeric(x = verbose) * 2 + vst.args[['verbosity']] <- as.numeric(x = verbose) * 1 vst.args[['return_cell_attr']] <- TRUE vst.args[['return_gene_attr']] <- TRUE vst.args[['return_corrected_umi']] <- do.correct.umi @@ -3287,7 +3779,6 @@ SCTransform <- function( } else { sct.method <- "default" } - # set vst model vst.out <- switch( EXPR = sct.method, @@ -3299,13 +3790,10 @@ SCTransform <- function( do.correct.umi <- FALSE vst.out <- reference.SCT.model clip.range <- vst.out$arguments$sct.clip.range - umi.field <- paste0("nCount_", assay) - vst.out$cell_attr <- - if (umi.field %in% colnames(x = object[[]])) { - data.frame(log_umi = log10(x = object[[umi.field, drop = T]])) - } else { - data.frame(log_umi = log10(x = CalcN(object = object[[assay]])$nCount)) - } + cell_attr <- data.frame(log_umi = log10(x = colSums(umi))) + rownames(cell_attr) <- colnames(x = umi) + vst.out$cell_attr <- cell_attr + all.features <- intersect( x = rownames(x = vst.out$gene_attr), y = rownames(x = umi) @@ -3414,7 +3902,7 @@ SCTransform <- function( vst.out$umi_corrected <- correct_counts( x = vst.out, umi = umi, - verbosity = as.numeric(x = verbose) * 2 + verbosity = as.numeric(x = verbose) * 1 ) } vst.out @@ -3425,24 +3913,7 @@ SCTransform <- function( } vst.out }) - # create output assay and put (corrected) umi counts in count slot - if (do.correct.umi & residual.type == 'pearson') { - if (verbose) { - message('Place corrected count matrix in counts slot') - } - assay.out <- CreateAssayObject(counts = vst.out$umi_corrected, check.matrix = FALSE) - vst.out$umi_corrected <- NULL - } else { - assay.out <- CreateAssayObject(counts = umi, check.matrix = FALSE) - } - # set the variable genes - VariableFeatures(object = assay.out) <- residual.features %||% top.features - # put log1p transformed counts in data - assay.out <- SetAssayData( - object = assay.out, - slot = 'data', - new.data = log1p(x = GetAssayData(object = assay.out, slot = 'counts')) - ) + scale.data <- vst.out$y # clip the residuals scale.data[scale.data < clip.range[1]] <- clip.range[1] @@ -3452,7 +3923,7 @@ SCTransform <- function( scale.data, features = NULL, vars.to.regress = vars.to.regress, - latent.data = cell.attr[, vars.to.regress, drop = FALSE], + latent.data = latent.data, model.use = 'linear', use.umi = FALSE, do.scale = do.scale, @@ -3462,21 +3933,178 @@ SCTransform <- function( min.cells.to.block = 3000, verbose = verbose ) + vst.out$y <- scale.data + vst.out$variable_features <- residual.features %||% top.features + if (!do.correct.umi) { + vst.out$umi_corrected <- umi + } + min_var <- vst.out$arguments$min_variance + return(vst.out) +} + +#' @rdname SCTransform +#' @concept preprocessing +#' @export +#' @method SCTransform Assay +#' +SCTransform.Assay <- function( + object, + cell.attr, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object) / 30), sqrt(x = ncol(x = object) / 30)), + vst.flavor = 'v2', + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... +) { + if (!is.null(x = seed.use)) { + set.seed(seed = seed.use) + } + if (!is.null(reference.SCT.model)){ + do.correct.umi <- FALSE + do.center <- FALSE + } + + umi <- GetAssayData(object = object, slot = 'counts') + vst.out <- SCTransform(object = umi, + cell.attr = cell.attr, + reference.SCT.model = reference.SCT.model, + do.correct.umi = do.correct.umi, + ncells = ncells, + residual.features = residual.features, + variable.features.n = variable.features.n, + variable.features.rv.th = variable.features.rv.th, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + do.scale = do.scale, + do.center = do.center, + clip.range = clip.range, + vst.flavor = vst.flavor, + conserve.memory = conserve.memory, + return.only.var.genes = return.only.var.genes, + seed.use = seed.use, + verbose = verbose, + ...) + residual.type <- vst.out[['residual_type']] %||% 'pearson' + sct.method <- vst.out[["sct.method"]] + # create output assay and put (corrected) umi counts in count slot + if (do.correct.umi & residual.type == 'pearson') { + if (verbose) { + message('Place corrected count matrix in counts slot') + } + assay.out <- CreateAssayObject(counts = vst.out$umi_corrected) + vst.out$umi_corrected <- NULL + } else { + # TODO: restore once check.matrix is in SeuratObject + # assay.out <- CreateAssayObject(counts = umi, check.matrix = FALSE) + assay.out <- CreateAssayObject(counts = umi) + } + # set the variable genes + VariableFeatures(object = assay.out) <- vst.out$variable_features + # put log1p transformed counts in data + assay.out <- SetAssayData( + object = assay.out, + slot = 'data', + new.data = log1p(x = GetAssayData(object = assay.out, slot = 'counts')) + ) + scale.data <- vst.out$y assay.out <- SetAssayData( object = assay.out, slot = 'scale.data', new.data = scale.data ) - # save vst output (except y) in @misc slot vst.out$y <- NULL # save clip.range into vst model vst.out$arguments$sct.clip.range <- clip.range vst.out$arguments$sct.method <- sct.method Misc(object = assay.out, slot = 'vst.out') <- vst.out assay.out <- as(object = assay.out, Class = "SCTAssay") - assay.out <- SCTAssay(assay.out, assay.orig = assay) - slot(object = slot(object = assay.out, name = "SCTModel.list")[[1]], name = "umi.assay") <- assay - object[[new.assay.name]] <- assay.out + return(assay.out) +} + +#' @param assay Name of assay to pull the count data from; default is 'RNA' +#' @param new.assay.name Name for the new assay containing the normalized data; default is 'SCT' +#' +#' @rdname SCTransform +#' @concept preprocessing +#' @export +#' @method SCTransform Seurat +#' +SCTransform.Seurat <- function( + object, + assay = "RNA", + new.assay.name = 'SCT', + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object[[assay]]) / 30), sqrt(x = ncol(x = object[[assay]]) / 30)), + vst.flavor = "v2", + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... +) { + if (!is.null(x = seed.use)) { + set.seed(seed = seed.use) + } + if (any(vars.to.regress %in% colnames(x = object[[]]))) { + vars.to.regress.subset <- vars.to.regress[vars.to.regress %in% colnames(x = object[[]])] + latent.data <- object[[vars.to.regress.subset]] + } else { + latent.data <- NULL + } + assay <- assay %||% DefaultAssay(object = object) + if (assay == "SCT") { + # if re-running SCTransform, use the RNA assay + assay <- "RNA" + warning("Running SCTransform on the RNA assay while default assay is SCT.") + } + + if (verbose){ + message("Running SCTransform on assay: ", assay) + } + cell.attr <- slot(object = object, name = 'meta.data')[colnames(object[[assay]]),] + assay.data <- SCTransform(object = object[[assay]], + cell.attr = cell.attr, + reference.SCT.model = reference.SCT.model, + do.correct.umi = do.correct.umi, + ncells = ncells, + residual.features = residual.features, + variable.features.n = variable.features.n, + variable.features.rv.th = variable.features.rv.th, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + do.scale = do.scale, + do.center = do.center, + clip.range = clip.range, + vst.flavor = vst.flavor, + conserve.memory = conserve.memory, + return.only.var.genes = return.only.var.genes, + seed.use = seed.use, + verbose = verbose, + ...) + assay.data <- SCTAssay(assay.data, assay.orig = assay) + slot(object = slot(object = assay.data, name = "SCTModel.list")[[1]], name = "umi.assay") <- assay + object[[new.assay.name]] <- assay.data + if (verbose) { message(paste("Set default assay to", new.assay.name)) } @@ -3526,19 +4154,21 @@ SubsetByBarcodeInflections <- function(object) { #' @param selection.method How to choose top variable features. Choose one of : #' \itemize{ -#' \item{vst:}{ First, fits a line to the relationship of log(variance) and -#' log(mean) using local polynomial regression (loess). Then standardizes the -#' feature values using the observed mean and expected variance (given by the -#' fitted line). Feature variance is then calculated on the standardized values -#' after clipping to a maximum (see clip.max parameter).} -#' \item{mean.var.plot (mvp):}{ First, uses a function to calculate average -#' expression (mean.function) and dispersion (dispersion.function) for each -#' feature. Next, divides features into num.bin (deafult 20) bins based on -#' their average expression, and calculates z-scores for dispersion within -#' each bin. The purpose of this is to identify variable features while -#' controlling for the strong relationship between variability and average -#' expression.} -#' \item{dispersion (disp):}{ selects the genes with the highest dispersion values} +#' \item \dQuote{\code{vst}}: First, fits a line to the relationship of +#' log(variance) and log(mean) using local polynomial regression (loess). +#' Then standardizes the feature values using the observed mean and +#' expected variance (given by the fitted line). Feature variance is then +#' calculated on the standardized values +#' after clipping to a maximum (see clip.max parameter). +#' \item \dQuote{\code{mean.var.plot}} (mvp): First, uses a function to +#' calculate average expression (mean.function) and dispersion +#' (dispersion.function) for each feature. Next, divides features into +#' \code{num.bin} (deafult 20) bins based on their average expression, +#' and calculates z-scores for dispersion within each bin. The purpose of +#' this is to identify variable features while controlling for the +#' strong relationship between variability and average expression +#' \item \dQuote{\code{dispersion}} (disp): selects the genes with the +#' highest dispersion values #' } #' @param loess.span (vst method) Loess span parameter used when fitting the #' variance-mean relationship @@ -3554,10 +4184,12 @@ SubsetByBarcodeInflections <- function(object) { #' @param binning.method Specifies how the bins should be computed. Available #' methods are: #' \itemize{ -#' \item{equal_width:}{ each bin is of equal width along the x-axis [default]} -#' \item{equal_frequency:}{ each bin contains an equal number of features (can -#' increase statistical power to detect overdispersed features at high -#' expression values, at the cost of reduced resolution along the x-axis)} +#' \item \dQuote{\code{equal_width}}: each bin is of equal width along the +#' x-axis (default) +#' \item \dQuote{\code{equal_frequency}}: each bin contains an equal number +#' of features (can increase statistical power to detect overdispersed +#' eatures at high expression values, at the cost of reduced resolution +#' along the x-axis) #' } #' @param verbose show progress bar for calculations #' @@ -3565,7 +4197,7 @@ SubsetByBarcodeInflections <- function(object) { #' @concept preprocessing #' @export #' -FindVariableFeatures.default <- function( +FindVariableFeatures.V3Matrix <- function( object, selection.method = "vst", loess.span = 0.3, @@ -3628,6 +4260,7 @@ FindVariableFeatures.default <- function( EXPR = binning.method, 'equal_width' = num.bin, 'equal_frequency' = c( + -1, quantile( x = feature.mean[feature.mean > 0], probs = seq.int(from = 0, to = 1, length.out = num.bin) @@ -3635,8 +4268,7 @@ FindVariableFeatures.default <- function( ), stop("Unknown binning method: ", binning.method) ) - data.x.bin <- cut(x = feature.mean, breaks = data.x.breaks, - include.lowest = TRUE) + data.x.bin <- cut(x = feature.mean, breaks = data.x.breaks) names(x = data.x.bin) <- names(x = feature.mean) mean.y <- tapply(X = feature.dispersion, INDEX = data.x.bin, FUN = mean) sd.y <- tapply(X = feature.dispersion, INDEX = data.x.bin, FUN = sd) @@ -3727,7 +4359,7 @@ FindVariableFeatures.Assay <- function( }, 'dispersion' = head(x = rownames(x = hvf.info), n = nfeatures), 'vst' = head(x = rownames(x = hvf.info), n = nfeatures), - stop("Unknown selection method: ", selection.method) + stop("Unkown selection method: ", selection.method) ) VariableFeatures(object = object) <- top.features vf.name <- ifelse( @@ -3782,10 +4414,10 @@ FindVariableFeatures.Seurat <- function( verbose = TRUE, ... ) { - assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) + assay <- assay[1L] %||% DefaultAssay(object = object) + assay <- match.arg(arg = assay, Assays(object = object)) assay.data <- FindVariableFeatures( - object = assay.data, + object = object[[assay]], selection.method = selection.method, loess.span = loess.span, clip.max = clip.max, @@ -3906,35 +4538,8 @@ FindSpatiallyVariableFeatures.Assay <- function( features <- features[! features %in% features.computed] } data <- GetAssayData(object = object, slot = slot) - missing.features <- which(x = ! features %in% rownames(x = data)) - if (length(x = missing.features) > 0) { - remaining.features <- length(x = features) - length(x = missing.features) - if (length(x = remaining.features) > 0) { - warning("Not all requested features are present in the requested slot (", - slot, "). Removing ", length(x = missing.features), - " missing features and continuing with ", remaining.features, - " remaining features.", immediate. = TRUE, call. = FALSE) - features <- features[features %in% rownames(x = data)] - } else { - stop("None of the requested features are present in the requested slot (", - slot, ").", call. = FALSE) - } - } - image.cells <- rownames(x = spatial.location) - data <- as.matrix(x = data[features, image.cells, drop = FALSE]) - rv <- RowVar(x = data) - rv.small <- which(x = rv < 1e-16) - rv.remove <- c() - if (length(x = rv.small) > 0) { - for (i in rv.small) { - if (var(x = data[i, ]) == 0) { - rv.remove <- c(rv.remove, i) - } - } - } - if (length(x = rv.remove) > 0) { - data <- data[-c(rv.remove), , drop = FALSE] - } + data <- as.matrix(x = data[features, ]) + data <- data[RowVar(x = data) > 0, ] if (nrow(x = data) != 0) { svf.info <- FindSpatiallyVariableFeatures( object = data, @@ -4015,18 +4620,65 @@ FindSpatiallyVariableFeatures.Seurat <- function( object <- LogSeuratCommand(object = object) } +#' @rdname LogNormalize +#' @method LogNormalize data.frame +#' @export +#' +LogNormalize.data.frame <- function( + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... +) { + return(LogNormalize( + data = as.matrix(x = data), + scale.factor = scale.factor, + verbose = verbose, + ... + )) +} + +#' @rdname LogNormalize +#' @method LogNormalize V3Matrix +#' @export +#' +LogNormalize.V3Matrix <- function( + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... +) { + # if (is.data.frame(x = data)) { + # data <- as.matrix(x = data) + # } + if (!inherits(x = data, what = 'dgCMatrix')) { + data <- as(object = data, Class = "dgCMatrix") + } + # call Rcpp function to normalize + if (verbose) { + cat("Performing log-normalization\n", file = stderr()) + } + norm.data <- LogNorm(data, scale_factor = scale.factor, display_progress = verbose) + colnames(x = norm.data) <- colnames(x = data) + rownames(x = norm.data) <- rownames(x = data) + return(norm.data) +} + #' @importFrom future.apply future_lapply #' @importFrom future nbrOfWorkers #' #' @param normalization.method Method for normalization. #' \itemize{ -#' \item{LogNormalize: }{Feature counts for each cell are divided by the total -#' counts for that cell and multiplied by the scale.factor. This is then -#' natural-log transformed using log1p.} -#' \item{CLR: }{Applies a centered log ratio transformation} -#' \item{RC: }{Relative counts. Feature counts for each cell are divided by the total -#' counts for that cell and multiplied by the scale.factor. No log-transformation is applied. -#' For counts per million (CPM) set \code{scale.factor = 1e6}} +#' \item \dQuote{\code{LogNormalize}}: Feature counts for each cell are +#' divided by the total counts for that cell and multiplied by the +#' \code{scale.factor}. This is then natural-log transformed using \code{log1p} +#' \item \dQuote{\code{CLR}}: Applies a centered log ratio transformation +#' \item \dQuote{\code{RC}}: Relative counts. Feature counts for each cell +#' are divided by the total counts for that cell and multiplied by the +#' \code{scale.factor}. No log-transformation is applied. For counts per +#' million (CPM) set \code{scale.factor = 1e6} #' } #' @param scale.factor Sets the scale factor for cell-level normalization #' @param margin If performing CLR normalization, normalize across features (1) or cells (2) @@ -4038,7 +4690,7 @@ FindSpatiallyVariableFeatures.Seurat <- function( #' @concept preprocessing #' @export #' -NormalizeData.default <- function( +NormalizeData.V3Matrix <- function( object, normalization.method = "LogNormalize", scale.factor = 1e4, @@ -4134,7 +4786,7 @@ NormalizeData.default <- function( scale.factor = scale.factor, verbose = verbose ), - stop("Unknown normalization method: ", normalization.method) + stop("Unkown normalization method: ", normalization.method) ) } return(normalized.data) @@ -4192,9 +4844,8 @@ NormalizeData.Seurat <- function( ... ) { assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) assay.data <- NormalizeData( - object = assay.data, + object = object[[assay]], normalization.method = normalization.method, scale.factor = scale.factor, verbose = verbose, @@ -4474,6 +5125,45 @@ ScaleData.default <- function( return(scaled.data) } +#' @rdname ScaleData +#' @concept preprocessing +#' @export +#' @method ScaleData IterableMatrix +#' +ScaleData.IterableMatrix <- function( + object, + features = NULL, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + ... +) { + features <- features %||% rownames(x = object) + features <- as.vector(x = intersect(x = features, y = rownames(x = object))) + object <- object[features, , drop = FALSE] + if (do.center) { + features.mean <- BPCells::matrix_stats( + matrix = object, + row_stats = 'mean')$row_stats['mean',] + } else { + features.mean <- 0 + } + if (do.scale) { + features.sd <- sqrt(BPCells::matrix_stats( + matrix = object, + row_stats = 'variance')$row_stats['variance',]) + features.sd[features.sd == 0] <- 0.01 + } else { + features.sd <- 1 + } + if (scale.max != Inf) { + object <- BPCells::min_by_row(mat = object, vals = scale.max * features.sd + features.mean) + } + scaled.data <- (object - features.mean) / features.sd +return(scaled.data) +} + + #' @rdname ScaleData #' @concept preprocessing #' @export @@ -4553,8 +5243,8 @@ ScaleData.Seurat <- function( verbose = TRUE, ... ) { - assay <- assay %||% DefaultAssay(object = object) - assay.data <- GetAssay(object = object, assay = assay) + assay <- assay[1L] %||% DefaultAssay(object = object) + assay <- match.arg(arg = assay, choices = Assays(object = object)) if (any(vars.to.regress %in% colnames(x = object[[]]))) { latent.data <- object[[vars.to.regress[vars.to.regress %in% colnames(x = object[[]])]]] } else { @@ -4564,7 +5254,8 @@ ScaleData.Seurat <- function( split.by <- object[[split.by]] } assay.data <- ScaleData( - object = assay.data, + # object = assay.data, + object = object[[assay]], features = features, vars.to.regress = vars.to.regress, latent.data = latent.data, @@ -4825,9 +5516,6 @@ ClassifyCells <- function(data, q) { # # ComputeRMetric <- function(mv, r.metric = 5) { - if (!inherits(x = mv, what = "list")) { - mv <- list(mv) - } r.metric.results <- unlist(x = lapply( X = mv, FUN = function(x) { diff --git a/R/preprocessing5.R b/R/preprocessing5.R new file mode 100644 index 000000000..a767ab24b --- /dev/null +++ b/R/preprocessing5.R @@ -0,0 +1,2026 @@ +#' @include generics.R +#' @include preprocessing.R +#' @importFrom stats loess +#' @importFrom methods slot +#' @importFrom SeuratObject .MARGIN .SparseSlots +#' @importFrom utils txtProgressBar setTxtProgressBar +#' +NULL + +hvf.methods <- list() + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Functions +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Methods for Seurat-defined generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#' @method FindVariableFeatures default +#' @export +#' +FindVariableFeatures.default <- function( + object, + method = VST, + nfeatures = 2000L, + verbose = TRUE, + selection.method = selection.method, + ... +) { + if (is_quosure(x = method)) { + method <- eval( + expr = quo_get_expr(quo = method), + envir = quo_get_env(quo = method) + ) + } + if (is.character(x = method)) { + method <- get(x = method) + } + if (!is.function(x = method)) { + stop( + "'method' must be a function for calculating highly variable features", + call. = FALSE + ) + } + var.gene.ouput <- method( + data = object, + nselect = nfeatures, + verbose = verbose, + ... + ) + rownames(x = var.gene.ouput) <- rownames(x = object) + return(var.gene.ouput) +} + + +#' @importFrom SeuratObject DefaultLayer Features Key Layers +#' +#' @method FindVariableFeatures StdAssay +#' @export +#' +FindVariableFeatures.StdAssay <- function( + object, + method = NULL, + nfeatures = 2000L, + layer = NULL, + span = 0.3, + clip = NULL, + key = NULL, + verbose = TRUE, + selection.method = 'vst', + ... +) { + if (selection.method == 'vst') { + layer <- layer%||%'counts' + method <- VST + key <- 'vst' + } else if (selection.method %in% c('mean.var.plot', 'mvp')) { + layer <- layer%||%'data' + method <- MVP + key <- 'mvp' + } else if (selection.method %in% c('dispersion', 'disp')) { + layer <- layer%||%'data' + method <- DISP + key <- 'disp' + } else if (is.null(x = method) || is.null(x = layer)){ + stop('Custome functions and layers are both required') + } else { + key <- NULL + } + layer <- Layers(object = object, search = layer) + if (is.null(x = key)) { + false <- function(...) { + return(FALSE) + } + key <- if (tryCatch(expr = is_quosure(x = method), error = false)) { + method + } else if (is.function(x = method)) { + substitute(expr = method) + } else if (is.call(x = enquo(arg = method))) { + enquo(arg = method) + } else if (is.character(x = method)) { + method + } else { + parse(text = method) + } + key <- .Abbrv(x = as_name(x = key)) + } + warn.var <- warn.rank <- TRUE + for (i in seq_along(along.with = layer)) { + if (isTRUE(x = verbose)) { + message("Finding variable features for layer ", layer[i]) + } + data <- LayerData(object = object, layer = layer[i], fast = TRUE) + hvf.function <- if (inherits(x = data, what = 'V3Matrix')) { + FindVariableFeatures.default + } else { + FindVariableFeatures + } + hvf.info <- hvf.function( + object = data, + method = method, + nfeatures = nfeatures, + span = span, + clip = clip, + verbose = verbose, + ... + ) + if (warn.var) { + if (!'variable' %in% colnames(x = hvf.info) || !is.logical(x = hvf.info$variable)) { + warning( + "No variable feature indication in HVF info for method ", + key, + ", `VariableFeatures` will not work", + call. = FALSE, + immediate. = TRUE + ) + warn.var <- FALSE + } + } else if (warn.rank && !'rank' %in% colnames(x = hvf.info)) { + warning( + "No variable feature rank in HVF info for method ", + key, + ", `VariableFeatures` will return variable features in assay order", + call. = FALSE, + immediate. = TRUE + ) + warn.rank <- FALSE + } + colnames(x = hvf.info) <- paste( + 'vf', + key, + layer[i], + colnames(x = hvf.info), + sep = '_' + ) + rownames(x = hvf.info) <- Features(x = object, layer = layer[i]) + object[["var.features"]] <- NULL + object[["var.features.rank"]] <- NULL + object[[names(x = hvf.info)]] <- NULL + object[[names(x = hvf.info)]] <- hvf.info + } + VariableFeatures(object) <- VariableFeatures(object, nfeatures=nfeatures,method = key) + return(object) +} + +#' @param layer Layer in the Assay5 to pull data from +#' @param features If provided, only compute on given features. Otherwise, +#' compute for all features. +#' @param nfeatures Number of features to mark as the top spatially variable. +#' +#' @method FindSpatiallyVariableFeatures StdAssay +#' @rdname FindSpatiallyVariableFeatures +#' @concept preprocessing +#' @concept spatial +#' @export +#' +FindSpatiallyVariableFeatures.StdAssay <- function( + object, + layer = "scale.data", + spatial.location, + selection.method = c('markvariogram', 'moransi'), + features = NULL, + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + nfeatures = nfeatures, + verbose = TRUE, + ... +) { + features <- features %||% rownames(x = object) + if (selection.method == "markvariogram" && "markvariogram" %in% names(x = Misc(object = object))) { + features.computed <- names(x = Misc(object = object, slot = "markvariogram")) + features <- features[! features %in% features.computed] + } + data <- GetAssayData(object = object, layer = layer) + data <- as.matrix(x = data[features, ]) + data <- data[RowVar(x = data) > 0, ] + if (nrow(x = data) != 0) { + svf.info <- FindSpatiallyVariableFeatures( + object = data, + spatial.location = spatial.location, + selection.method = selection.method, + r.metric = r.metric, + x.cuts = x.cuts, + y.cuts = y.cuts, + verbose = verbose, + ... + ) + } else { + svf.info <- c() + } + if (selection.method == "markvariogram") { + if ("markvariogram" %in% names(x = Misc(object = object))) { + svf.info <- c(svf.info, Misc(object = object, slot = "markvariogram")) + } + suppressWarnings(expr = Misc(object = object, slot = "markvariogram") <- svf.info) + svf.info <- ComputeRMetric(mv = svf.info, r.metric) + svf.info <- svf.info[order(svf.info[, 1]), , drop = FALSE] + } + if (selection.method == "moransi") { + colnames(x = svf.info) <- paste0("MoransI_", colnames(x = svf.info)) + svf.info <- svf.info[order(svf.info[, 2], -abs(svf.info[, 1])), , drop = FALSE] + } + var.name <- paste0(selection.method, ".spatially.variable") + var.name.rank <- paste0(var.name, ".rank") + svf.info[[var.name]] <- FALSE + svf.info[[var.name]][1:(min(nrow(x = svf.info), nfeatures))] <- TRUE + svf.info[[var.name.rank]] <- 1:nrow(x = svf.info) + object[names(x = svf.info)] <- svf.info + return(object) +} + + +#' @rdname LogNormalize +#' @method LogNormalize default +#' +#' @param margin Margin to normalize over +#' @importFrom SeuratObject .CheckFmargin +#' +#' @export +#' +LogNormalize.default <- function( + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... +) { + margin <- .CheckFmargin(fmargin = margin) + ncells <- dim(x = data)[margin] + if (isTRUE(x = verbose)) { + pb <- txtProgressBar(file = stderr(), style = 3) + } + for (i in seq_len(length.out = ncells)) { + x <- if (margin == 1L) { + data[i, ] + } else { + data[, i] + } + xnorm <- log1p(x = x / sum(x) * scale.factor) + if (margin == 1L) { + data[i, ] <- xnorm + } else { + data[, i] <- xnorm + } + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb = pb, value = i / ncells) + } + } + if (isTRUE(x = verbose)) { + close(con = pb) + } + return(data) +} + +#' @method LogNormalize IterableMatrix +#' @export +#' +LogNormalize.IterableMatrix <- function( + data, + scale.factor = 1e4, + margin = 2L, + verbose = TRUE, + ... +) { + data <- BPCells::t(BPCells::t(data) / colSums(data)) + # Log normalization + data <- log1p(data * scale.factor) + return(data) +} + +#' @importFrom SeuratObject IsSparse +#' +#' @method NormalizeData default +#' @export +#' +NormalizeData.default <- function( + object, + normalization.method = c('LogNormalize', 'CLR', 'RC'), + scale.factor = 1e4, + cmargin = 2L, + margin = 1L, + verbose = TRUE, + ... +) { + normalization.method <- normalization.method[1L] + normalization.method <- match.arg(arg = normalization.method) + # TODO: enable parallelization via future + normalized <- switch( + EXPR = normalization.method, + 'LogNormalize' = { + if (IsSparse(x = object) && .MARGIN(object = object) == cmargin) { + .SparseNormalize( + data = object, + scale.factor = scale.factor, + verbose = verbose + ) + } else { + LogNormalize( + data = object, + scale.factor = scale.factor, + margin = cmargin, + verbose = verbose, + ... + ) + } + }, + 'CLR' = { + if (inherits(x = object, what = 'dgTMatrix')) { + warning('Convert input dgTMatrix into dgCMatrix') + object <- as(object = object, Class = 'dgCMatrix') + } + if (!inherits(x = object, what = 'dgCMatrix') && + !inherits(x = object, what = 'matrix')) { + stop('CLR normalization is only supported for dense and dgCMatrix') + } + CustomNormalize( + data = object, + custom_function = function(x) { + return(log1p(x = x/(exp(x = sum(log1p(x = x[x > 0]), na.rm = TRUE)/length(x = x))))) + }, + margin = margin, + verbose = verbose + ) + }, + 'RC' = { + if (!inherits(x = object, what = 'dgCMatrix') && + !inherits(x = object, what = 'matrix')) { + stop('RC normalization is only supported for dense and dgCMatrix') + } + RelativeCounts(data = object, + scale.factor = scale.factor, + verbose = verbose) + } + ) + return(normalized) +} + +#' @importFrom SeuratObject Cells DefaultLayer DefaultLayer<- Features +#' LayerData LayerData<- +#' +#' @method NormalizeData StdAssay +#' @export +#' +NormalizeData.StdAssay <- function( + object, + normalization.method = 'LogNormalize', + scale.factor = 1e4, + margin = 1L, + layer = 'counts', + save = 'data', + verbose = TRUE, + ... +) { + olayer <- layer <- unique(x = layer) + layer <- Layers(object = object, search = layer) + if (length(x = save) != length(x = layer)) { + save <- make.unique(names = gsub( + pattern = olayer, + replacement = save, + x = layer + )) + } + for (i in seq_along(along.with = layer)) { + l <- layer[i] + if (isTRUE(x = verbose)) { + message("Normalizing layer: ", l) + } + LayerData( + object = object, + layer = save[i], + features = Features(x = object, layer = l), + cells = Cells(x = object, layer = l) + ) <- NormalizeData( + object = LayerData(object = object, layer = l, fast = NA), + normalization.method = normalization.method, + scale.factor = scale.factor, + margin = margin, + verbose = verbose, + ... + ) + } + gc(verbose = FALSE) + return(object) +} + + +#' @importFrom SeuratObject StitchMatrix +#' +#' @method ScaleData StdAssay +#' @export +#' +ScaleData.StdAssay <- function( + object, + features = NULL, + layer = 'data', + vars.to.regress = NULL, + latent.data = NULL, + by.layer = FALSE, + split.by = NULL, + model.use = 'linear', + use.umi = FALSE, + do.scale= TRUE, + do.center = TRUE, + scale.max = 10, + block.size = 1000, + min.cells.to.block = 3000, + save = 'scale.data', + verbose = TRUE, + ... +) { + use.umi <- ifelse(test = model.use != 'linear', yes = TRUE, no = use.umi) + olayer <- layer <- unique(x = layer) + layer <- Layers(object = object, search = layer) + if (is.null(layer)) { + abort(paste0("No layer matching pattern '", olayer, "' found. Please run NormalizeData and retry")) + } + if (isTRUE(x = use.umi)) { + layer <- "counts" + inform( + message = "'use.umi' is TRUE, please make sure 'layer' specifies raw counts" + ) + } + features <- features %||% VariableFeatures(object = object) + if (!length(x = features)) { + features <- Features(x = object, layer = layer) + } + if (isTRUE(x = by.layer)) { + if (length(x = save) != length(x = layer)) { + save <- make.unique(names = gsub( + pattern = olayer, + replacement = save, + x = layer + )) + } + for (i in seq_along(along.with = layer)) { + lyr <- layer[i] + if (isTRUE(x = verbose)) { + inform(message = paste("Scaling data for layer", sQuote(x = lyr))) + } + LayerData(object = object, layer = save[i], ...) <- ScaleData( + object = LayerData( + object = object, + layer = lyr, + features = features, + fast = NA + ), + features = features, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + split.by = split.by, + model.use = model.use, + use.umi = use.umi, + do.scale = do.scale, + do.center = do.center, + scale.max = scale.max, + block.size = block.size, + min.cells.to.block = min.cells.to.block, + verbose = verbose, + ... + ) + } + } else { + ldata <- if (length(x = layer) > 1L) { + StitchMatrix( + x = LayerData(object = object, layer = layer[1L], features = features), + y = lapply( + X = layer[2:length(x = layer)], + FUN = LayerData, + object = object, + features = features + ), + rowmap = slot(object = object, name = 'features')[features, layer], + colmap = slot(object = object, name = 'cells')[, layer] + ) + } else { + LayerData(object = object, layer = layer, features = features) + } + ldata <- ScaleData( + object = ldata, + features = features, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + split.by = split.by, + model.use = model.use, + use.umi = use.umi, + do.scale = do.scale, + do.center = do.center, + scale.max = scale.max, + block.size = block.size, + min.cells.to.block = min.cells.to.block, + verbose = verbose, + ... + ) + LayerData(object = object, layer = save, features = rownames(ldata)) <- ldata + } + return(object) +} + +#' @rdname VST +#' @method VST default +#' @export +#' +VST.default <- function( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + ... +) { + .NotYetImplemented() +} + +#' @rdname VST +#' @method VST IterableMatrix +#' @importFrom SeuratObject EmptyDF +#' @export +#' +VST.IterableMatrix <- function( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + verbose = TRUE, + ... +) { + nfeatures <- nrow(x = data) + hvf.info <- EmptyDF(n = nfeatures) + hvf.stats <- BPCells::matrix_stats( + matrix = data, + row_stats = 'variance')$row_stats + # Calculate feature means + hvf.info$mean <- hvf.stats['mean', ] + # Calculate feature variance + hvf.info$variance <- hvf.stats['variance', ] + hvf.info$variance.expected <- 0L + not.const <- hvf.info$variance > 0 + fit <- loess( + formula = log10(x = variance) ~ log10(x = mean), + data = hvf.info[not.const, , drop = TRUE], + span = span + ) + hvf.info$variance.expected[not.const] <- 10 ^ fit$fitted + feature.mean <- hvf.info$mean + feature.sd <- sqrt(x = hvf.info$variance.expected) + standard.max <- clip %||% sqrt(x = ncol(x = data)) + feature.mean[feature.mean == 0] <- 0.1 + data <- BPCells::min_by_row(mat = data, vals = standard.max*feature.sd + feature.mean) + data.standard <- (data - feature.mean) / feature.sd + hvf.info$variance.standardized <- BPCells::matrix_stats( + matrix = data.standard, + row_stats = 'variance' + )$row_stats['variance', ] + # Set variable features + hvf.info$variable <- FALSE + hvf.info$rank <- NA + vf <- head( + x = order(hvf.info$variance.standardized, decreasing = TRUE), + n = nselect + ) + hvf.info$variable[vf] <- TRUE + hvf.info$rank[vf] <- seq_along(along.with = vf) + rownames(x = hvf.info) <- rownames(x = data) + return(hvf.info) +} + +#' @importFrom Matrix rowMeans +#' @importFrom SeuratObject EmptyDF +#' +#' @rdname VST +#' @method VST dgCMatrix +#' @export +#' +VST.dgCMatrix <- function( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + verbose = TRUE, + ... +) { + nfeatures <- nrow(x = data) + hvf.info <- EmptyDF(n = nfeatures) + # Calculate feature means + hvf.info$mean <- Matrix::rowMeans(x = data) + # Calculate feature variance + hvf.info$variance <- SparseRowVar2( + mat = data, + mu = hvf.info$mean, + display_progress = verbose + ) + hvf.info$variance.expected <- 0L + not.const <- hvf.info$variance > 0 + fit <- loess( + formula = log10(x = variance) ~ log10(x = mean), + data = hvf.info[not.const, , drop = TRUE], + span = span + ) + hvf.info$variance.expected[not.const] <- 10 ^ fit$fitted + hvf.info$variance.standardized <- SparseRowVarStd( + mat = data, + mu = hvf.info$mean, + sd = sqrt(x = hvf.info$variance.expected), + vmax = clip %||% sqrt(x = ncol(x = data)), + display_progress = verbose + ) + # Set variable features + hvf.info$variable <- FALSE + hvf.info$rank <- NA + vf <- head( + x = order(hvf.info$variance.standardized, decreasing = TRUE), + n = nselect + ) + hvf.info$variable[vf] <- TRUE + hvf.info$rank[vf] <- seq_along(along.with = vf) + return(hvf.info) +} + +#' @rdname VST +#' @method VST matrix +#' @export +#' +VST.matrix <- function( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + ... +) { + return(VST( + data = as.sparse(x = data), + margin = margin, + nselect = nselect, + span = span, + clip = clip, + ... + )) +} + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Methods for R-defined generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Internal +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +#' Calculate dispersion of features +#' +#' @param object Data matrix +#' @param mean.function Function to calculate mean +#' @param dispersion.function Function to calculate dispersion +#' @param num.bin Number of bins to use +#' @param binning.method Method to use for binning. Options are 'equal_width' or 'equal_frequency' +#' @param verbose Display progress +#' @keywords internal +#' +CalcDispersion <- function( + object, + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + verbose = TRUE, + ... +) { + if (!inherits(x = object, what = c('dgCMatrix', 'matrix'))) { + stop('mean.var.plot and dispersion methods only \ + support dense and sparse matrix input') + } + if (inherits(x = object, what = 'matrix')) { + object <- as.sparse(x = object) + } + feature.mean <- mean.function(object, verbose) + feature.dispersion <- dispersion.function(object, verbose) + + names(x = feature.mean) <- names( + x = feature.dispersion) <- rownames(x = object) + feature.dispersion[is.na(x = feature.dispersion)] <- 0 + feature.mean[is.na(x = feature.mean)] <- 0 + data.x.breaks <- switch( + EXPR = binning.method, + 'equal_width' = num.bin, + 'equal_frequency' = c( + quantile( + x = feature.mean[feature.mean > 0], + probs = seq.int(from = 0, to = 1, length.out = num.bin) + ) + ), + stop("Unknown binning method: ", binning.method) + ) + data.x.bin <- cut(x = feature.mean, breaks = data.x.breaks, + include.lowest = TRUE) + names(x = data.x.bin) <- names(x = feature.mean) + mean.y <- tapply(X = feature.dispersion, INDEX = data.x.bin, FUN = mean) + sd.y <- tapply(X = feature.dispersion, INDEX = data.x.bin, FUN = sd) + feature.dispersion.scaled <- (feature.dispersion - mean.y[as.numeric(x = data.x.bin)]) / + sd.y[as.numeric(x = data.x.bin)] + names(x = feature.dispersion.scaled) <- names(x = feature.mean) + hvf.info <- data.frame( + feature.mean, feature.dispersion, feature.dispersion.scaled) + rownames(x = hvf.info) <- rownames(x = object) + colnames(x = hvf.info) <- paste0( + 'mvp.', c('mean', 'dispersion', 'dispersion.scaled')) + return(hvf.info) +} + + +#' @importFrom SeuratObject .CalcN +#' +CalcN <- function(object, ...) { + return(.CalcN(object, ...)) +} + +#' @method .CalcN IterableMatrix +#' @export +#' +.CalcN.IterableMatrix <- function(object, ...) { + col_stat <- BPCells::matrix_stats(matrix = object, col_stats = 'mean')$col_stats + return(list( + nCount = round(col_stat['mean', ] * nrow(object)), + nFeature = col_stat['nonzero', ] + )) +} + +#' Find variable features based on dispersion +#' +#' @param data Data matrix +#' @param nselect Number of top features to select based on dispersion values +#' @param verbose Display progress +#' @keywords internal +#' +DISP <- function( + data, + nselect = 2000L, + verbose = TRUE, + ... +) { + hvf.info <- CalcDispersion(object = data, verbose = verbose, ...) + hvf.info$variable <- FALSE + hvf.info$rank <- NA + vf <- head( + x = order(hvf.info$mvp.dispersion, decreasing = TRUE), + n = nselect + ) + hvf.info$variable[vf] <- TRUE + hvf.info$rank[vf] <- seq_along(along.with = vf) + return(hvf.info) +} + +#' @importFrom SeuratObject .CheckFmargin +#' +.FeatureVar <- function( + data, + mu, + fmargin = 1L, + standardize = FALSE, + sd = NULL, + clip = NULL, + verbose = TRUE +) { + fmargin <- .CheckFmargin(fmargin = fmargin) + ncells <- dim(x = data)[-fmargin] + nfeatures <- dim(x = data)[fmargin] + fvars <- vector(mode = 'numeric', length = nfeatures) + if (length(x = mu) != nfeatures) { + stop("Wrong number of feature means provided") + } + if (isTRUE(x = standardize)) { + clip <- clip %||% sqrt(x = ncells) + if (length(x = sd) != nfeatures) { + stop("Wrong number of standard deviations") + } + } + if (isTRUE(x = verbose)) { + msg <- 'Calculating feature variances' + if (isTRUE(x = standardize)) { + msg <- paste(msg, 'of standardized and clipped values') + } + message(msg) + pb <- txtProgressBar(style = 3, file = stderr()) + } + for (i in seq_len(length.out = nfeatures)) { + if (isTRUE(x = standardize) && sd[i] == 0) { + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb = pb, value = i / nfeatures) + } + next + } + x <- if (fmargin == 1L) { + data[i, , drop = TRUE] + } else { + data[, i, drop = TRUE] + } + x <- x - mu[i] + if (isTRUE(x = standardize)) { + x <- x / sd[i] + x[x > clip] <- clip + } + fvars[i] <- sum(x ^ 2) / (ncells - 1L) + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb = pb, value = i / nfeatures) + } + } + if (isTRUE(x = verbose)) { + close(con = pb) + } + return(fvars) +} + +.Mean <- function(data, margin = 1L) { + nout <- dim(x = data)[margin] + nobs <- dim(x = data)[-margin] + means <- vector(mode = 'numeric', length = nout) + for (i in seq_len(length.out = nout)) { + x <- if (margin == 1L) { + data[i, , drop = TRUE] + } else { + data[, i, drop = TRUE] + } + means[i] <- sum(x) / nobs + } + return(means) +} + +.SparseNormalize <- function(data, scale.factor = 1e4, verbose = TRUE) { + entryname <- .SparseSlots(x = data, type = 'entries') + p <- slot(object = data, name = .SparseSlots(x = data, type = 'pointers')) + if (p[1L] == 0) { + p <- p + 1L + } + np <- length(x = p) - 1L + if (isTRUE(x = verbose)) { + pb <- txtProgressBar(style = 3L, file = stderr()) + } + for (i in seq_len(length.out = np)) { + idx <- seq.int(from = p[i], to = p[i + 1] - 1L) + xidx <- slot(object = data, name = entryname)[idx] + slot(object = data, name = entryname)[idx] <- log1p( + x = xidx / sum(xidx) * scale.factor + ) + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb = pb, value = i / np) + } + } + if (isTRUE(x = verbose)) { + close(con = pb) + } + return(data) +} + +#' @param data A sparse matrix +#' @param mu A vector of feature means +#' @param fmargin Feature margin +#' @param standardize Standardize matrix rows prior to calculating variances +#' @param sd If standardizing, a vector of standard deviations to +#' standardize with +#' @param clip Set upper bound for standardized variances; defaults to the +#' square root of the number of cells +#' @param verbose Show progress updates +#' +#' @keywords internal +#' @importFrom SeuratObject .CheckFmargin +#' +#' @noRd +#' +.SparseFeatureVar <- function( + data, + mu, + fmargin = 1L, + standardize = FALSE, + sd = NULL, + clip = NULL, + verbose = TRUE +) { + fmargin <- .CheckFmargin(fmargin = fmargin) + if (fmargin != .MARGIN(object = data)) { + data <- t(x = data) + fmargin <- .MARGIN(object = data) + } + entryname <- .SparseSlots(x = data, type = 'entries') + p <- slot(object = data, name = .SparseSlots(x = data, type = 'pointers')) + if (p[1L] == 0) { + p <- p + 1L + } + np <- length(x = p) - 1L + ncells <- dim(x = data)[-fmargin] + fvars <- vector(mode = 'numeric', length = np) + if (length(x = mu) != np) { + stop("Wrong number of feature means provided") + } + if (isTRUE(x = standardize)) { + clip <- clip %||% sqrt(x = ncells) + if (length(x = sd) != np) { + stop("Wrong number of standard deviations provided") + } + } + if (isTRUE(x = verbose)) { + msg <- 'Calculating feature variances' + if (isTRUE(x = standardize)) { + msg <- paste(msg, 'of standardized and clipped values') + } + message(msg) + pb <- txtProgressBar(style = 3, file = stderr()) + } + for (i in seq_len(length.out = np)) { + if (isTRUE(x = standardize) && sd[i] == 0) { + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb = pb, value = i / np) + } + next + } + idx <- seq.int(from = p[i], to = p[i + 1L] - 1L) + xidx <- slot(object = data, name = entryname)[idx] - mu[i] + nzero <- ncells - length(x = xidx) + csum <- nzero * ifelse( + test = isTRUE(x = standardize), + yes = ((0 - mu[i]) / sd[i]) ^ 2, + no = mu[i] ^ 2 + ) + if (isTRUE(x = standardize)) { + xidx <- xidx / sd[i] + xidx[xidx > clip] <- clip + } + fsum <- sum(xidx ^ 2) + csum + fvars[i] <- fsum / (ncells - 1L) + if (isTRUE(x = verbose)) { + setTxtProgressBar(pb = pb, value = i / np) + } + } + if (isTRUE(x = verbose)) { + close(con = pb) + } + return(fvars) +} + +#' @importFrom SeuratObject .CheckFmargin +.SparseMean <- function(data, margin = 1L) { + margin <- .CheckFmargin(fmargin = margin) + if (margin != .MARGIN(object = data)) { + data <- t(x = data) + margin <- .MARGIN(object = data) + } + entryname <- .SparseSlots(x = data, type = 'entries') + p <- slot(object = data, name = .SparseSlots(x = data, type = 'pointers')) + if (p[1L] == 0) { + p <- p + 1L + } + np <- length(x = p) - 1L + nobs <- dim(x = data)[-margin] + means <- vector(mode = 'numeric', length = np) + for (i in seq_len(length.out = np)) { + idx <- seq.int(from = p[i], to = p[i + 1L] - 1L) + means[i] <- sum(slot(object = data, name = entryname)[idx]) / nobs + } + return(means) +} + +#' @inheritParams stats::loess +#' @param data A matrix +#' @param fmargin Feature margin +#' @param nselect Number of features to select +#' @param clip After standardization values larger than \code{clip} will be set +#' to \code{clip}; default is \code{NULL} which sets this value to the square +#' root of the number of cells +#' +#' @importFrom Matrix rowMeans +#' @importFrom SeuratObject .CheckFmargin +#' +#' @keywords internal +#' +#' @noRd +#' +.VST <- function( + data, + fmargin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + verbose = TRUE, + ... +) { + fmargin <- .CheckFmargin(fmargin = fmargin) + nfeatures <- dim(x = data)[fmargin] + # TODO: Support transposed matrices + # nfeatures <- nrow(x = data) + if (IsSparse(x = data)) { + mean.func <- .SparseMean + var.func <- .SparseFeatureVar + } else { + mean.func <- .Mean + var.func <- .FeatureVar + } + hvf.info <- SeuratObject::EmptyDF(n = nfeatures) + # hvf.info$mean <- mean.func(data = data, margin = fmargin) + hvf.info$mean <- rowMeans(x = data) + hvf.info$variance <- var.func( + data = data, + mu = hvf.info$mean, + fmargin = fmargin, + verbose = verbose + ) + hvf.info$variance.expected <- 0L + not.const <- hvf.info$variance > 0 + fit <- loess( + formula = log10(x = variance) ~ log10(x = mean), + data = hvf.info[not.const, , drop = TRUE], + span = span + ) + hvf.info$variance.expected[not.const] <- 10 ^ fit$fitted + hvf.info$variance.standardized <- var.func( + data = data, + mu = hvf.info$mean, + standardize = TRUE, + sd = sqrt(x = hvf.info$variance.expected), + clip = clip, + verbose = verbose + ) + hvf.info$variable <- FALSE + hvf.info$rank <- NA + vs <- hvf.info$variance.standardized + vs[vs == 0] <- NA + vf <- head( + x = order(hvf.info$variance.standardized, decreasing = TRUE), + n = nselect + ) + hvf.info$variable[vf] <- TRUE + hvf.info$rank[vf] <- seq_along(along.with = vf) + # colnames(x = hvf.info) <- paste0('vst.', colnames(x = hvf.info)) + return(hvf.info) +} + +# hvf.methods$vst <- VST + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# S4 Methods +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + + +################################################################################ +################################# SCTransform ################################## +################################################################################ + +#' @importFrom SeuratObject Cells as.sparse +#' +#' @method SCTransform IterableMatrix +#' @rdname SCTransform +#' @concept preprocessing +#' @export +SCTransform.IterableMatrix <- function( + object, + cell.attr, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object) / 30), sqrt(x = ncol(x = object) / 30)), + vst.flavor = 'v2', + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... +) { + if (!is.null(x = seed.use)) { + set.seed(seed = seed.use) + } + if (!is.null(reference.SCT.model)){ + do.correct.umi <- FALSE + do.center <- FALSE + } + sampled_cells <- sample.int(n = ncol(x = object), size = min(ncells, ncol(x = object))) + umi <- as.sparse(x = object[, sampled_cells]) + cell.attr <- cell.attr[colnames(x = umi),,drop=FALSE] + vst.out <- SCTransform(object = umi, + cell.attr = cell.attr, + reference.SCT.model = reference.SCT.model, + do.correct.umi = do.correct.umi, + ncells = ncells, + residual.features = residual.features, + variable.features.n = variable.features.n, + variable.features.rv.th = variable.features.rv.th, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + do.scale = do.scale, + do.center = do.center, + clip.range = clip.range, + vst.flavor = vst.flavor, + conserve.memory = conserve.memory, + return.only.var.genes = return.only.var.genes, + seed.use = seed.use, + verbose = verbose, + ...) + if (!do.correct.umi) { + vst.out$umi_corrected <- umi + } + return(vst.out) +} + + +#' @importFrom SeuratObject CreateAssayObject SetAssayData GetAssayData +CreateSCTAssay <- function(vst.out, do.correct.umi, residual.type, clip.range){ + residual.type <- vst.out[['residual_type']] %||% 'pearson' + sct.method <- vst.out[['sct.method']] + assay.out <- CreateAssayObject(counts = vst.out$umi_corrected) + + # set the variable genes + VariableFeatures(object = assay.out) <- vst.out$variable_features + # put log1p transformed counts in data + assay.out <- SetAssayData( + object = assay.out, + slot = 'data', + new.data = log1p(x = GetAssayData(object = assay.out, slot = 'counts')) + ) + scale.data <- vst.out$y + assay.out <- SetAssayData( + object = assay.out, + slot = 'scale.data', + new.data = scale.data + ) + vst.out$y <- NULL + # save clip.range into vst model + vst.out$arguments$sct.clip.range <- clip.range + vst.out$arguments$sct.method <- sct.method + Misc(object = assay.out, slot = 'vst.out') <- vst.out + assay.out <- as(object = assay.out, Class = "SCTAssay") + return (assay.out) +} + +#' @importFrom SeuratObject Cells DefaultLayer DefaultLayer<- Features +#' LayerData LayerData<- as.sparse +#' +#' @method SCTransform StdAssay +#' @export +#' +SCTransform.StdAssay <- function( + object, + layer = 'counts', + cell.attr = NULL, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object) / 30), sqrt(x = ncol(x = object) / 30)), + vst.flavor = 'v2', + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ...) { + if (!is.null(x = seed.use)) { + set.seed(seed = seed.use) + } + if (!is.null(reference.SCT.model)){ + do.correct.umi <- FALSE + do.center <- FALSE + } + olayer <- layer <- unique(x = layer) + layers <- Layers(object = object, search = layer) + dataset.names <- gsub(pattern = paste0(layer, "."), replacement = "", x = layers) + # loop over layers performing SCTransform() on individual layers + sct.assay.list <- list() + # Keep a tab of variable features per chunk + variable.feature.list <- list() + + for (dataset.index in seq_along(along.with = layers)) { + l <- layers[dataset.index] + if (isTRUE(x = verbose)) { + message("Running SCTransform on layer: ", l) + } + all_cells <- Cells(x = object, layer = l) + all_features <- Features(x = object, layer = l) + layer.data <- LayerData( + object = object, + layer = l, + features = all_features, + cells = all_cells + ) + local.reference.SCT.model <- NULL + set.seed(seed = seed.use) + do.correct.umi.chunk <- FALSE + sct.function <- if (inherits(x = layer.data, what = 'V3Matrix')) { + SCTransform.default + } else { + SCTransform + } + if (is.null(x = cell.attr) && is.null(x = reference.SCT.model)){ + calcn <- CalcN(object = layer.data) + cell.attr.layer <- data.frame(umi = calcn$nCount, + log_umi = log10(x = calcn$nCount)) + rownames(cell.attr.layer) <- colnames(x = layer.data) + } else { + cell.attr.layer <- cell.attr[colnames(x = layer.data),, drop=FALSE] + } + if (!"umi" %in% cell.attr.layer && is.null(x = reference.SCT.model)){ + calcn <- CalcN(object = layer.data) + cell.attr.tmp <- data.frame(umi = calcn$nCount) + rownames(cell.attr.tmp) <- colnames(x = layer.data) + cell.attr.layer$umi <- NA + cell.attr.layer$log_umi <- NA + cell.attr.layer[rownames(cell.attr.tmp), "umi"] <- cell.attr.tmp$umi + cell.attr.layer[rownames(cell.attr.tmp), "log_umi"] <- log10(x = cell.attr.tmp$umi) + } + + # Step 1: Learn model + vst.out <- sct.function(object = layer.data, + do.correct.umi = TRUE, + cell.attr = cell.attr.layer, + reference.SCT.model = reference.SCT.model, + ncells = ncells, + residual.features = residual.features, + variable.features.n = variable.features.n, + variable.features.rv.th = variable.features.rv.th, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + do.scale = do.scale, + do.center = do.center, + clip.range = clip.range, + vst.flavor = vst.flavor, + conserve.memory = conserve.memory, + return.only.var.genes = return.only.var.genes, + seed.use = seed.use, + verbose = verbose, + ...) + min_var <- vst.out$arguments$min_variance + residual.type <- vst.out[['residual_type']] %||% 'pearson' + assay.out <- CreateSCTAssay(vst.out = vst.out, do.correct.umi = do.correct.umi, residual.type = residual.type, + clip.range = clip.range) + + # If there is no reference model, use the model learned on subset of cells to calculate residuals + # by setting the learned model as the reference model (local.reference.SCT.model) + if (is.null(x = reference.SCT.model)) { + local.reference.SCT.model <- assay.out@SCTModel.list[[1]] + } else { + local.reference.SCT.model <- reference.SCT.model + } + variable.features <- VariableFeatures(assay.out) + + # once we have the model, just calculate residuals for all cells + # local.reference.SCT.model set to reference.model if it is non null + vst_out.reference <- SCTModel_to_vst(SCTModel = local.reference.SCT.model) + vst_out.reference$gene_attr <- local.reference.SCT.model@feature.attributes + min_var <- vst_out.reference$arguments$min_variance + if (min_var == "umi_median"){ + counts.x <- as.sparse(x = layer.data[, sample.int(n = ncol(x = layer.data), size = min(ncells, ncol(x = layer.data)) )]) + min_var <- (median(counts.x@x)/5)^2 + } + + # Step 2: Use learned model to calculate residuals in chunks + cells.vector <- 1:ncol(x = layer.data) + cells.grid <- split(x = cells.vector, f = ceiling(x = seq_along(along.with = cells.vector)/ncells)) + # Single block + residuals <- list() + corrected_counts <- list() + cell_attrs <- list() + + if (length(x = cells.grid) == 1){ + merged.assay <- assay.out + corrected_counts[[1]] <- GetAssayData(object = assay.out, slot = "data") + residuals[[1]] <- GetAssayData(object = assay.out, slot = "scale.data") + cell_attrs[[1]] <- vst_out.reference$cell_attr + sct.assay.list[[dataset.names[dataset.index]]] <- assay.out + } else { + # iterate over chunks to get residuals + for (i in seq_len(length.out = length(x = cells.grid))) { + vp <- cells.grid[[i]] + if (verbose){ + message("Getting residuals for block ", i, "(of ", length(cells.grid), ") for ", dataset.names[[dataset.index]], " dataset") + } + counts.vp <- as.sparse(x = layer.data[, vp, drop=FALSE]) + cell.attr.object <- cell.attr.layer[colnames(x = counts.vp),, drop=FALSE] + vst_out <- vst_out.reference + + vst_out$cell_attr <- cell.attr.object + vst_out$gene_attr <- vst_out$gene_attr[variable.features,,drop=FALSE] + if (return.only.var.genes){ + new_residual <- get_residuals( + vst_out = vst_out, + umi = counts.vp[variable.features,,drop=FALSE], + residual_type = "pearson", + min_variance = min_var, + res_clip_range = clip.range, + verbosity = FALSE + ) + } else { + new_residual <- get_residuals( + vst_out = vst_out, + umi = counts.vp[all_features,,drop=FALSE], + residual_type = "pearson", + min_variance = min_var, + res_clip_range = clip.range, + verbosity = FALSE + ) + } + vst_out$y <- new_residual + corrected_counts[[i]] <- correct_counts( + x = vst_out, + umi = counts.vp[all_features,,drop=FALSE], + verbosity = FALSE# as.numeric(x = verbose) * 2 + ) + residuals[[i]] <- new_residual + cell_attrs[[i]] <- cell.attr.object + } + new.residuals <- Reduce(cbind, residuals) + corrected_counts <- Reduce(cbind, corrected_counts) + cell_attrs <- Reduce(rbind, cell_attrs) + vst_out.reference$cell_attr <- cell_attrs[colnames(new.residuals),,drop=FALSE] + SCTModel.list <- PrepVSTResults(vst.res = vst_out.reference, cell.names = all_cells) + SCTModel.list <- list(model1 = SCTModel.list) + # scale data here as do.center and do.scale are set to FALSE inside + new.residuals <- ScaleData( + new.residuals, + features = NULL, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + model.use = 'linear', + use.umi = FALSE, + do.scale = do.scale, + do.center = do.center, + scale.max = Inf, + block.size = 750, + min.cells.to.block = 3000, + verbose = verbose + ) + assay.out <- CreateSCTAssayObject(counts = corrected_counts, scale.data = new.residuals, SCTModel.list = SCTModel.list) + assay.out$data <- log1p(x = corrected_counts) + VariableFeatures(assay.out) <- variable.features + # one assay per dataset + if (verbose){ + message("Finished calculating residuals for ", dataset.names[dataset.index]) + } + sct.assay.list[[dataset.names[dataset.index]]] <- assay.out + variable.feature.list[[dataset.names[dataset.index]]] <- VariableFeatures(assay.out) + } + } + # Return array by merging everythin + if (length(x = sct.assay.list) == 1){ + merged.assay <- sct.assay.list[[1]] + } else { + vf.list <- lapply(X = sct.assay.list, FUN = function(object.i) VariableFeatures(object = object.i)) + variable.features.union <- Reduce(f = union, x = vf.list) + var.features.sorted <- sort( + x = table(unlist(x = vf.list, use.names = FALSE)), + decreasing = TRUE + ) + # select top ranking features + var.features <- variable.features.union + # calculate residuals for union of features + for (layer.name in names(x = sct.assay.list)){ + vst_out <- SCTModel_to_vst(SCTModel = slot(object = sct.assay.list[[layer.name]], name = "SCTModel.list")[[1]]) + all_cells <- Cells(x = object, layer = paste0(layer, ".", layer.name)) + all_features <- Features(x = object, layer = paste0(layer, ".", layer.name)) + variable.features.target <- intersect(x = rownames(x = vst_out$model_pars_fit), y = var.features) + variable.features.target <- setdiff(x = variable.features.target, y = VariableFeatures(sct.assay.list[[layer.name]])) + if (length(x = variable.features.target )<1){ + next + } + layer.counts.tmp <- LayerData( + object = object, + layer = paste0(layer, ".", layer.name), + cells = all_cells + ) + layer.counts.tmp <- as.sparse(x = layer.counts.tmp) + vst_out$cell_attr <- vst_out$cell_attr[, c("log_umi"), drop=FALSE] + vst_out$model_pars_fit <- vst_out$model_pars_fit[variable.features.target,,drop=FALSE] + new_residual <- GetResidualsChunked( + vst_out = vst_out, + layer.counts = layer.counts.tmp, + residual_type = "pearson", + min_variance = min_var, + res_clip_range = clip.range, + verbose = FALSE + ) + old_residual <- GetAssayData(object = sct.assay.list[[layer.name]], slot = 'scale.data') + merged_residual <- rbind(old_residual, new_residual) + merged_residual <- ScaleData( + merged_residual, + features = NULL, + vars.to.regress = vars.to.regress, + latent.data = latent.data, + model.use = 'linear', + use.umi = FALSE, + do.scale = do.scale, + do.center = do.center, + scale.max = Inf, + block.size = 750, + min.cells.to.block = 3000, + verbose = verbose + ) + sct.assay.list[[layer.name]] <- SetAssayData(object = sct.assay.list[[layer.name]], slot = 'scale.data', new.data = merged_residual) + VariableFeatures(sct.assay.list[[layer.name]]) <- rownames(x = merged_residual) + } + merged.assay <- merge(x = sct.assay.list[[1]], y = sct.assay.list[2:length(sct.assay.list)]) + VariableFeatures(object = merged.assay) <- VariableFeatures(object = merged.assay, use.var.features = FALSE, nfeatures = variable.features.n) + + } + # set the names of SCTmodels to be layer names + models <- slot(object = merged.assay, name="SCTModel.list") + names(models) <- names(x = sct.assay.list) + slot(object = merged.assay, name="SCTModel.list") <- models + gc(verbose = FALSE) + return(merged.assay) + } + +#' Calculate pearson residuals of features not in the scale.data +#' +#' This function calls sctransform::get_residuals. +#' +#' @param object A seurat object +#' @param features Name of features to add into the scale.data +#' @param assay Name of the assay of the seurat object generated by SCTransform +#' @param layer Name (prefix) of the layer to pull counts from +#' @param umi.assay Name of the assay of the seurat object containing UMI matrix +#' and the default is RNA +#' @param clip.range Numeric of length two specifying the min and max values the +#' Pearson residual will be clipped to +#' @param reference.SCT.model reference.SCT.model If a reference SCT model should be used +#' for calculating the residuals. When set to not NULL, ignores the `SCTModel` +#' paramater. +#' @param replace.value Recalculate residuals for all features, even if they are +#' already present. Useful if you want to change the clip.range. +#' @param na.rm For features where there is no feature model stored, return NA +#' for residual value in scale.data when na.rm = FALSE. When na.rm is TRUE, only +#' return residuals for features with a model stored for all cells. +#' @param verbose Whether to print messages and progress bars +#' +#' @return Returns a Seurat object containing Pearson residuals of added +#' features in its scale.data +#' +#' @importFrom sctransform get_residuals +#' @importFrom matrixStats rowAnyNAs +#' +#' @export +#' @concept preprocessing +#' +#' @seealso \code{\link[sctransform]{get_residuals}} +FetchResiduals <- function( + object, + features, + assay = NULL, + umi.assay = "RNA", + layer = "counts", + clip.range = NULL, + reference.SCT.model = NULL, + replace.value = FALSE, + na.rm = TRUE, + verbose = TRUE) { + assay <- assay %||% DefaultAssay(object = object) + if (IsSCT(assay = object[[assay]])) { + object[[assay]] <- as(object[[assay]], "SCTAssay") + } + if (!inherits(x = object[[assay]], what = "SCTAssay")) { + stop(assay, " assay was not generated by SCTransform") + } + sct.models <- levels(x = object[[assay]]) + if (length(sct.models)==1){ + sct.models <- list(sct.models) + } + if (length(x = sct.models) == 0) { + warning("SCT model not present in assay", call. = FALSE, immediate. = TRUE) + return(object) + } + possible.features <- Reduce(f = union, x = lapply(X = sct.models, FUN = function(x) { + rownames(x = SCTResults(object = object[[assay]], slot = "feature.attributes", model = x)) + })) + bad.features <- setdiff(x = features, y = possible.features) + if (length(x = bad.features) > 0) { + warning("The following requested features are not present in any models: ", + paste(bad.features, collapse = ", "), + call. = FALSE + ) + features <- intersect(x = features, y = possible.features) + } + features.orig <- features + if (na.rm) { + # only compute residuals when feature model info is present in all + features <- names(x = which(x = table(unlist(x = lapply( + X = sct.models, + FUN = function(x) { + rownames(x = SCTResults(object = object[[assay]], slot = "feature.attributes", model = x)) + } + ))) == length(x = sct.models))) + if (length(x = features) == 0) { + return(object) + } + } + + features <- intersect(x = features.orig, y = features) + if (length(features) < 1){ + warning("The following requested features are not present in all the models: ", + paste(features.orig, collapse = ", "), + call. = FALSE + ) + return(NULL) + } #if (length(x = sct.models) > 1 & verbose) { + # message("This SCTAssay contains multiple SCT models. Computing residuals for cells using") + #} + + # Get all (count) layers + layers <- Layers(object = object[[umi.assay]], search = layer) + + # iterate over layer running sct model for each of the object names + new.residuals <- list() + total_cells <- 0 + all_cells <- c() + if (!is.null(x = reference.SCT.model)) { + if (inherits(x = reference.SCT.model, what = "SCTModel")) { + reference.SCT.model <- SCTModel_to_vst(SCTModel = reference.SCT.model) + } + if (is.list(x = reference.SCT.model) & inherits(x = reference.SCT.model[[1]], what = "SCTModel")) { + stop("reference.SCT.model must be one SCTModel rather than a list of SCTModel") + } + if (reference.SCT.model$model_str != "y ~ log_umi") { + stop("reference.SCT.model must be derived using default SCT regression formula, `y ~ log_umi`") + } + } + for (i in seq_along(along.with = layers)) { + l <- layers[i] + sct_model <- sct.models[[i]] + # these cells belong to this layer + layer_cells <- Cells(x = object[[umi.assay]], layer = l) + all_cells <- c(all_cells, layer_cells) + total_cells <- total_cells + length(layer_cells) + # calculate residual using this model and these cells + new.residuals[[i]] <- FetchResidualSCTModel( + object = object, + umi.assay = umi.assay, + assay = assay, + layer = l, + layer.cells = layer_cells, + SCTModel = sct_model, + reference.SCT.model = reference.SCT.model, + new_features = features, + replace.value = replace.value, + clip.range = clip.range, + verbose = verbose + ) + } + + existing.data <- GetAssayData(object = object, slot = "scale.data", assay = assay) + all.features <- union(x = rownames(x = existing.data), y = features) + new.scale <- matrix( + data = NA, + nrow = length(x = all.features), + ncol = total_cells, + dimnames = list(all.features, all_cells) + ) + common_cells <- intersect(colnames(new.scale), colnames(existing.data)) + if (nrow(x = existing.data) > 0) { + new.scale[rownames(x = existing.data), common_cells] <- existing.data[, common_cells] + } + if (length(x = new.residuals) == 1 & is.list(x = new.residuals)) { + new.residuals <- new.residuals[[1]] + } else { + new.residuals <- Reduce(cbind, new.residuals) + #new.residuals <- matrix(data = unlist(new.residuals), nrow = nrow(new.scale) , ncol = ncol(new.scale)) + #colnames(new.residuals) <- colnames(new.scale) + #rownames(new.residuals) <- rownames(new.scale) + } + new.scale[rownames(x = new.residuals), colnames(x = new.residuals)] <- new.residuals + + if (na.rm) { + new.scale <- new.scale[!rowAnyNAs(x = new.scale), ] + } + + return(new.scale[features, ]) +} + +#' Calculate pearson residuals of features not in the scale.data +#' This function is the secondary function under FetchResiduals +#' +#' @param object A seurat object +#' @param assay Name of the assay of the seurat object generated by +#' SCTransform. Default is "SCT" +#' @param umi.assay Name of the assay of the seurat object to fetch +#' UMIs from. Default is "RNA" +#' @param layer Name of the layer under `umi.assay` to fetch UMIs from. +#' Default is "counts" +#' @param chunk_size Number of cells to load in memory for calculating +#' residuals +#' @param layer.cells Vector of cells to calculate the residual for. +#' Default is NULL which uses all cells in the layer +#' @param SCTModel Which SCTmodel to use from the object for calculating +#' the residual. Will be ignored if reference.SCT.model is set +#' @param reference.SCT.model If a reference SCT model should be used +#' for calculating the residuals. When set to not NULL, ignores the `SCTModel` +#' paramater. +#' @param new_features A vector of features to calculate the residuals for +#' @param clip.range Numeric of length two specifying the min and max values +#' the Pearson residual will be clipped to. Useful if you want to change the +#' clip.range. +#' @param replace.value Whether to replace the value of residuals if it +#' already exists +#' @param verbose Whether to print messages and progress bars +#' +#' @return Returns a matrix containing centered pearson residuals of +#' added features +#' +#' @importFrom sctransform get_residuals +#' @importFrom Matrix colSums +#' +#' @keywords internal +FetchResidualSCTModel <- function( + object, + assay = "SCT", + umi.assay = "RNA", + layer = "counts", + chunk_size = 2000, + layer.cells = NULL, + SCTModel = NULL, + reference.SCT.model = NULL, + new_features = NULL, + clip.range = NULL, + replace.value = FALSE, + verbose = FALSE +) { + + model.cells <- character() + model.features <- Features(x = object, assay = assay) + if (is.null(x = reference.SCT.model)){ + clip.range <- clip.range %||% SCTResults(object = object[[assay]], slot = "clips", model = SCTModel)$sct + model.features <- rownames(x = SCTResults(object = object[[assay]], slot = "feature.attributes", model = SCTModel)) + model.cells <- Cells(x = slot(object = object[[assay]], name = "SCTModel.list")[[SCTModel]]) + sct.method <- SCTResults(object = object[[assay]], slot = "arguments", model = SCTModel)$sct.method %||% "default" + } + + layer.cells <- layer.cells %||% Cells(x = object[[umi.assay]], layer = layer) + if (!is.null(reference.SCT.model)) { + # use reference SCT model + sct.method <- "reference" + } + existing.scale.data <- NULL + if (is.null(x=reference.SCT.model)){ + existing.scale.data <- suppressWarnings(GetAssayData(object = object, assay = assay, slot = "scale.data")) + } + scale.data.cells <- colnames(x = existing.scale.data) + scale.data.cells.common <- intersect(scale.data.cells, layer.cells) + scale.data.cells <- intersect(x = scale.data.cells, y = scale.data.cells.common) + if (length(x = setdiff(x = layer.cells, y = scale.data.cells)) == 0) { + # existing.scale.data <- suppressWarnings(GetAssayData(object = object, assay = assay, slot = "scale.data")) + #full.scale.data <- matrix(data = NA, nrow = nrow(x = existing.scale.data), + # ncol = length(x = layer.cells), dimnames = list(rownames(x = existing.scale.data), layer.cells)) + #full.scale.data[rownames(x = existing.scale.data), colnames(x = existing.scale.data)] <- existing.scale.data + #existing_features <- names(x = which(x = !apply( + # X = full.scale.data, + # MARGIN = 1, + # FUN = anyNA + #))) + existing_features <- rownames(x = existing.scale.data) + } else { + existing_features <- character() + } + if (replace.value) { + features_to_compute <- new_features + } else { + features_to_compute <- setdiff(x = new_features, y = existing_features) + } + if (length(features_to_compute)<1){ + return (existing.scale.data[intersect(x = rownames(x = scale.data.cells), y = new_features),,drop=FALSE]) + } + + if (is.null(x = reference.SCT.model) & length(x = setdiff(x = model.cells, y = scale.data.cells)) == 0) { + existing_features <- names(x = which(x = ! apply( + X = GetAssayData(object = object, assay = assay, slot = "scale.data")[, model.cells], + MARGIN = 1, + FUN = anyNA) + )) + } else { + existing_features <- character() + } + if (sct.method == "reference.model") { + if (verbose) { + message("sct.model ", SCTModel, " is from reference, so no residuals will be recalculated") + } + features_to_compute <- character() + } + if (!umi.assay %in% Assays(object = object)) { + warning("The umi assay (", umi.assay, ") is not present in the object. ", + "Cannot compute additional residuals.", + call. = FALSE, immediate. = TRUE + ) + return(NULL) + } + # these features do not have feature attriutes + diff_features <- setdiff(x = features_to_compute, y = model.features) + intersect_features <- intersect(x = features_to_compute, y = model.features) + if (sct.method == "reference") { + vst_out <- SCTModel_to_vst(SCTModel = reference.SCT.model) + + # override clip.range + clip.range <- vst_out$arguments$sct.clip.range + umi.field <- paste0("nCount_", assay) + # get rid of the cell attributes + vst_out$cell_attr <- NULL + all.features <- intersect( + x = rownames(x = vst_out$gene_attr), + y = features_to_compute + ) + vst_out$gene_attr <- vst_out$gene_attr[all.features, , drop = FALSE] + vst_out$model_pars_fit <- vst_out$model_pars_fit[all.features, , drop = FALSE] + } else { + vst_out <- SCTModel_to_vst(SCTModel = slot(object = object[[assay]], name = "SCTModel.list")[[SCTModel]]) + clip.range <- vst_out$arguments$sct.clip.range + } + clip.max <- max(clip.range) + clip.min <- min(clip.range) + + layer.cells <- layer.cells %||% Cells(x = object[[umi.assay]], layer = layer) + if (length(x = diff_features) == 0) { + counts <- LayerData( + object = object[[umi.assay]], + layer = layer, + cells = layer.cells + ) + cells.vector <- 1:length(x = layer.cells) + cells.grid <- split(x = cells.vector, f = ceiling(x = seq_along(along.with = cells.vector)/chunk_size)) + new_residuals <- list() + + for (i in seq_len(length.out = length(x = cells.grid))) { + vp <- cells.grid[[i]] + block <- counts[,vp, drop=FALSE] + umi.all <- as.sparse(x = block) + + # calculate min_variance for get_residuals + # required when vst_out$arguments$min_variance == "umi_median" + # only calculated once + if (i==1){ + nz_median <- median(umi.all@x) + min_var_custom <- (nz_median / 5)^2 + } + umi <- umi.all[features_to_compute, , drop = FALSE] + + ## Add cell_attr for missing cells + cell_attr <- data.frame( + umi = colSums(umi.all), + log_umi = log10(x = colSums(umi.all)) + ) + rownames(cell_attr) <- colnames(umi.all) + if (sct.method %in% c("reference.model", "reference")) { + vst_out$cell_attr <- cell_attr[colnames(umi.all), ,drop=FALSE] + } else { + cell_attr_existing <- vst_out$cell_attr + cells_missing <- setdiff(rownames(cell_attr), rownames(cell_attr_existing)) + if (length(cells_missing)>0){ + cell_attr_missing <- cell_attr[cells_missing, ,drop=FALSE] + missing_cols <- setdiff(x = colnames(x = cell_attr_existing), + y = colnames(x = cell_attr_missing)) + + if (length(x = missing_cols) > 0) { + cell_attr_missing[, missing_cols] <- NA + } + vst_out$cell_attr <- rbind(cell_attr_existing, + cell_attr_missing) + vst_out$cell_attr <- vst_out$cell_attr[colnames(umi), , drop=FALSE] + } + } + if (verbose) { + if (sct.method == "reference.model") { + message("using reference sct model") + } else { + message("sct.model: ", SCTModel, " on ", ncol(x = umi), " cells: ", + colnames(x = umi.all)[1], " .. ", colnames(x = umi.all)[ncol(umi.all)]) + } + } + + if (vst_out$arguments$min_variance == "umi_median"){ + min_var <- min_var_custom + } else { + min_var <- vst_out$arguments$min_variance + } + if (nrow(umi)>0){ + vst_out.tmp <- vst_out + vst_out.tmp$cell_attr <- vst_out.tmp$cell_attr[colnames(x = umi),] + new_residual <- get_residuals( + vst_out = vst_out.tmp, + umi = umi, + residual_type = "pearson", + min_variance = min_var, + res_clip_range = c(clip.min, clip.max), + verbosity = as.numeric(x = verbose) * 2 + ) + } else { + return(matrix( + data = NA, + nrow = length(x = features_to_compute), + ncol = length(x = colnames(umi.all)), + dimnames = list(features_to_compute, colnames(umi.all)) + )) + } + new_residual <- as.matrix(x = new_residual) + new_residuals[[i]] <- new_residual + } + new_residual <- do.call(what = cbind, args = new_residuals) + # centered data if no reference model is provided + if (is.null(x = reference.SCT.model)){ + new_residual <- new_residual - rowMeans(x = new_residual) + } else { + # subtract residual mean from reference model + if (verbose){ + message("Using residual mean from reference for centering") + } + vst_out <- SCTModel_to_vst(SCTModel = reference.SCT.model) + ref.residuals.mean <- vst_out$gene_attr[rownames(x = new_residual),"residual_mean"] + new_residual <- sweep( + x = new_residual, + MARGIN = 1, + STATS = ref.residuals.mean, + FUN = "-" + ) + } + # return (new_residuals) + } else { + # Some features do not exist + warning( + "In the SCTModel ", SCTModel, ", the following ", length(x = diff_features), + " features do not exist in the counts slot: ", paste(diff_features, collapse = ", ") + ) + if (length(x = intersect_features) == 0) { + # No features exist + return(matrix( + data = NA, + nrow = length(x = features_to_compute), + ncol = length(x = model.cells), + dimnames = list(features_to_compute, model.cells) + )) + } + } + old.features <- setdiff(x = new_features, y = features_to_compute) + if (length(x = old.features) > 0) { + old_residuals <- GetAssayData(object = object[[assay]], slot = "scale.data")[old.features, model.cells, drop = FALSE] + new_residual <- rbind(new_residual, old_residuals)[new_features, ] + } + return(new_residual) +} + +#' @importFrom sctransform get_residuals +GetResidualsChunked <- function(vst_out, layer.counts, residual_type, min_variance, res_clip_range, verbose, chunk_size=5000) { + if (inherits(x = layer.counts, what = 'V3Matrix')) { + residuals <- get_residuals( + vst_out = vst_out, + umi = layer.counts, + residual_type = residual_type, + min_variance = min_variance, + res_clip_range = res_clip_range, + verbosity = as.numeric(x = verbose) * 2 + ) + } else if (inherits(x = layer.counts, what = "IterableMatrix")) { + cells.vector <- 1:ncol(x = layer.counts) + residuals.list <- list() + cells.grid <- split(x = cells.vector, f = ceiling(x = seq_along(along.with = cells.vector)/chunk_size)) + for (i in seq_len(length.out = length(x = cells.grid))) { + vp <- cells.grid[[i]] + counts.vp <- as.sparse(x = layer.counts[, vp]) + vst.out <- vst_out + vst.out$cell_attr <- vst.out$cell_attr[colnames(x = counts.vp),,drop=FALSE] + residuals.list[[i]] <- get_residuals( + vst_out = vst.out, + umi = counts.vp, + residual_type = residual_type, + min_variance = min_variance, + res_clip_range = res_clip_range, + verbosity = as.numeric(x = verbose) * 2 + ) + } + residuals <- Reduce(f = cbind, x = residuals.list) + } else { + stop("Data type not supported") + } + return (residuals) +} + +#' temporal function to get residuals from reference +#' @param object A seurat object +#' @param reference.SCT.model a reference SCT model that should be used +#' for calculating the residuals +#' @param features Names of features to compute +#' @param nCount_UMI UMI counts. If not specified, defaults to +#' column sums of object +#' @param verbose Whether to print messages and progress bars +#' @importFrom sctransform get_residuals +#' @importFrom Matrix colSums +#' +#' @keywords internal +FetchResiduals_reference <- function(object, + reference.SCT.model = NULL, + features = NULL, + nCount_UMI = NULL, + verbose = FALSE) { + ## Add cell_attr for missing cells + nCount_UMI <- nCount_UMI %||% colSums(object) + cell_attr <- data.frame( + umi = nCount_UMI, + log_umi = log10(x = nCount_UMI) + ) + features_to_compute <- features + features_to_compute <- intersect(features_to_compute, rownames(object)) + vst_out <- SCTModel_to_vst(SCTModel = reference.SCT.model) + + # override clip.range + clip.range <- vst_out$arguments$sct.clip.range + # get rid of the cell attributes + vst_out$cell_attr <- NULL + all.features <- intersect( + x = rownames(x = vst_out$gene_attr), + y = features_to_compute + ) + vst_out$gene_attr <- vst_out$gene_attr[all.features, , drop = FALSE] + vst_out$model_pars_fit <- vst_out$model_pars_fit[all.features, , drop = FALSE] + + clip.max <- max(clip.range) + clip.min <- min(clip.range) + + + umi <- object[features_to_compute, , drop = FALSE] + + + rownames(cell_attr) <- colnames(object) + vst_out$cell_attr <- cell_attr + + if (verbose) { + message("using reference sct model") + } + + if (vst_out$arguments$min_variance == "umi_median"){ + nz_median <- 1 + min_var_custom <- (nz_median / 5)^2 + min_var <- min_var_custom + } else { + min_var <- vst_out$arguments$min_variance + } + new_residual <- get_residuals( + vst_out = vst_out, + umi = umi, + residual_type = "pearson", + min_variance = min_var, + verbosity = as.numeric(x = verbose) * 2 + ) + + ref.residuals.mean <- vst_out$gene_attr[rownames(x = new_residual),"residual_mean"] + new_residual <- sweep( + x = new_residual, + MARGIN = 1, + STATS = ref.residuals.mean, + FUN = "-" + ) + new_residual <- MinMax(data = new_residual, min = clip.min, max = clip.max) + return(new_residual) +} + +#' Find variable features based on mean.var.plot +#' +#' @param data Data matrix +#' @param nselect Number of features to select based on dispersion values +#' @param verbose Whether to print messages and progress bars +#' @param mean.cutoff Numeric of length two specifying the min and max values +#' @param dispersion.cutoff Numeric of length two specifying the min and max values +#' +#' @keywords internal +#' +MVP <- function( + data, + verbose = TRUE, + nselect = 2000L, + mean.cutoff = c(0.1, 8), + dispersion.cutoff = c(1, Inf), + ... +) { + hvf.info <- DISP(data = data, nselect = nselect, verbose = verbose) + hvf.info$variable <- FALSE + hvf.info$rank <- NA + hvf.info <- hvf.info[order(hvf.info$mvp.dispersion, decreasing = TRUE), , drop = FALSE] + means.use <- (hvf.info[, 1] > mean.cutoff[1]) & (hvf.info[, 1] < mean.cutoff[2]) + dispersions.use <- (hvf.info[, 3] > dispersion.cutoff[1]) & (hvf.info[, 3] < dispersion.cutoff[2]) + hvf.info[which(x = means.use & dispersions.use), 'variable'] <- TRUE + rank.rows <- rownames(x = hvf.info)[which(x = means.use & dispersions.use)] + selected.indices <- which(rownames(x = hvf.info) %in% rank.rows) + hvf.info$rank[selected.indices] <- seq_along(selected.indices) + hvf.info <- hvf.info[order(as.numeric(row.names(hvf.info))), ] + # hvf.info[hvf.info$variable,'rank'] <- rank(x = hvf.info[hvf.info$variable,'rank']) + # hvf.info[!hvf.info$variable,'rank'] <- NA + return(hvf.info) +} diff --git a/R/reexports.R b/R/reexports.R index 4a12a6aa5..41cda45a0 100644 --- a/R/reexports.R +++ b/R/reexports.R @@ -145,12 +145,20 @@ NULL # Functions and Generics #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#' @importFrom generics components +#' @rdname reexports +#' @export +#' +generics::components + #' @importFrom SeuratObject %||% +#' @rdname reexports #' @export #' SeuratObject::`%||%` #' @importFrom SeuratObject %iff% +#' @rdname reexports #' @export #' SeuratObject::`%iff%` @@ -196,13 +204,11 @@ SeuratObject::Cells SeuratObject::CellsByIdentities #' @importFrom SeuratObject Command - #' @export #' SeuratObject::Command #' @importFrom SeuratObject CreateAssayObject - #' @export #' SeuratObject::CreateAssayObject @@ -213,7 +219,6 @@ SeuratObject::CreateAssayObject SeuratObject::CreateDimReducObject #' @importFrom SeuratObject CreateSeuratObject - #' @export #' SeuratObject::CreateSeuratObject @@ -304,7 +309,6 @@ SeuratObject::IsGlobal SeuratObject::JS #' @importFrom SeuratObject JS<- - #' @export #' SeuratObject::`JS<-` @@ -315,13 +319,11 @@ SeuratObject::`JS<-` SeuratObject::Key #' @importFrom SeuratObject Key<- - #' @export #' SeuratObject::`Key<-` #' @importFrom SeuratObject Loadings - #' @export #' SeuratObject::Loadings @@ -332,7 +334,6 @@ SeuratObject::Loadings SeuratObject::`Loadings<-` #' @importFrom SeuratObject LogSeuratCommand - #' @export #' SeuratObject::LogSeuratCommand @@ -438,7 +439,6 @@ SeuratObject::`Tool<-` SeuratObject::UpdateSeuratObject #' @importFrom SeuratObject VariableFeatures - #' @export #' SeuratObject::VariableFeatures diff --git a/R/roxygen.R b/R/roxygen.R new file mode 100644 index 000000000..0fe3d0e49 --- /dev/null +++ b/R/roxygen.R @@ -0,0 +1,61 @@ +#' @include zzz.R +#' +NULL + +#' @importFrom utils lsf.str +#' @importFrom rlang is_scalar_character +#' +.rd_methods <- function(method = 'integration') { + methods <- sapply( + X = grep(pattern = '^package:', x = search(), value = TRUE), + FUN = function(x) { + fxns <- as.character(x = lsf.str(pos = x)) + attrs <- vector(mode = 'logical', length = length(x = fxns)) + for (i in seq_along(along.with = fxns)) { + mthd <- attr(x = get(x = fxns[i], pos = x), which = 'Seurat.method') + attrs[i] <- is_scalar_character(x = mthd) && isTRUE(x = mthd == method) + } + return(fxns[attrs]) + }, + simplify = FALSE, + USE.NAMES = TRUE + ) + methods <- Filter(f = length, x = methods) + names(x = methods) <- gsub( + pattern = '^package:', + replacement = '', + x = names(x = methods) + ) + if (!length(x = methods)) { + return('') + } + ret <- vector( + mode = 'character', + length = sum(vapply( + X = methods, + FUN = length, + FUN.VALUE = integer(length = 1L) + )) + ) + j <- 1L + for (pkg in names(x = methods)) { + for (fxn in methods[[pkg]]) { + ret[j] <- ifelse( + test = pkg == 'Seurat', + yes = paste0('\\item \\code{\\link{', fxn, '}}'), + no = paste0( + '\\item \\code{\\link[', + pkg, + ':', + fxn, + ']{', + pkg, + '::', + fxn, '}}' + ) + ) + j <- j + 1L + } + } + return(paste('\\itemize{', paste0(' ', ret, collapse = '\n'), '}', sep = '\n')) +} diff --git a/R/sketching.R b/R/sketching.R new file mode 100644 index 000000000..c2e8535c5 --- /dev/null +++ b/R/sketching.R @@ -0,0 +1,746 @@ +#' @include zzz.R +#' @include generics.R +#' @importFrom rlang enquo is_quosure quo_get_env quo_get_expr +#' +NULL + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Functions +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +#' Sketch Data +#' +#' This function uses sketching methods to downsample high-dimensional single-cell RNA expression data, +#' which can help with scalability for large datasets. +#' +#' @param object A Seurat object. +#' @param assay Assay name. Default is NULL, in which case the default assay of the object is used. +#' @param ncells A positive integer or a named vector/list specifying the +#' number of cells to sample per layer. If a single integer is provided, the +#' same number of cells will be sampled from each layer. Default is 5000. +#' @param sketched.assay Sketched assay name. A sketch assay is created or overwrite with the sketch data. Default is 'sketch'. +#' @param method Sketching method to use. Can be 'LeverageScore' or 'Uniform'. +#' Default is 'LeverageScore'. +#' @param var.name A metadata column name to store the leverage scores. Default is 'leverage.score'. +#' @param over.write whether to overwrite existing column in the metadata. Default is FALSE. +#' @param seed A positive integer for the seed of the random number generator. Default is 123. +#' @param cast The type to cast the resulting assay to. Default is 'dgCMatrix'. +#' @param verbose Print progress and diagnostic messages +#' @param ... Arguments passed to other methods +#' +#' @return A Seurat object with the sketched data added as a new assay. +#' +#' @importFrom SeuratObject CastAssay Key Key<- Layers +#' +#' @export +#' @concept sketching +#' +SketchData <- function( + object, + assay = NULL, + ncells = 5000L, + sketched.assay = 'sketch', + method = c('LeverageScore', 'Uniform'), + var.name = "leverage.score", + over.write = FALSE, + seed = 123L, + cast = 'dgCMatrix', + verbose = TRUE, + features = NULL, + ... +) { + assay <- assay[1L] %||% DefaultAssay(object = object) + assay <- match.arg(arg = assay, choices = Assays(object = object)) + method <- match.arg(arg = method) + if (sketched.assay == assay) { + abort(message = "Cannot overwrite existing assays") + } + if (sketched.assay %in% Assays(object = object)) { + if (sketched.assay == DefaultAssay(object = object)) { + DefaultAssay(object = object) <- assay + } + object[[sketched.assay]] <- NULL + } + if (!over.write) { + var.name <- CheckMetaVarName(object = object, var.name = var.name) + } + + if (method == 'LeverageScore') { + if (verbose) { + message("Calcuating Leverage Score") + } + object <- LeverageScore( + object = object, + assay = assay, + var.name = var.name, + over.write = over.write, + seed = seed, + verbose = FALSE, + features = features, + ... + ) + } else if (method == 'Uniform') { + if (verbose) { + message("Uniformly sampling") + } + object[[var.name]] <- 1 + } + leverage.score <- object[[var.name]] + layer.names <- Layers(object = object[[assay]], search = 'data') + if (length(ncells) == 1) { + ncells <- rep(ncells, length(layer.names)) + } + if (is.null(names(ncells))) { + names(ncells) <- layer.names + } + # align index of `ncells` with `layer.names` + ncells <- ncells[layer.names] + cells <- mapply( + function( + layer.name, + ncells.layer + ) { + if (!is.null(seed)) { + set.seed(seed) + } + cells.layer <- Cells(object[[assay]], layer = layer.name) + if (length(cells.layer) < ncells.layer) { + cells.to.keep <- cells.layer + } else { + cells.to.keep <- sample( + x = cells.layer, + size = ncells.layer, + prob = leverage.score[cells.layer,] + ) + } + return (cells.to.keep) + }, + layer.names, + ncells + ) + sketched <- suppressWarnings(expr = subset( + x = object[[assay]], + cells = unlist(cells), + layers = Layers(object = object[[assay]], search = c('counts', 'data')) + )) + for (layer.name in layer.names) { + try( + expr = VariableFeatures(object = sketched, method = "sketch", layer = layer.name) <- + VariableFeatures(object = object[[assay]], layer = layer.name), + silent = TRUE + ) + } + if (!is.null(x = cast) && inherits(x = sketched, what = 'Assay5')) { + sketched <- CastAssay(object = sketched, to = cast, ...) + } + Key(object = sketched) <- Key(object = sketched.assay, quiet = TRUE) + object[[sketched.assay]] <- sketched + DefaultAssay(object = object) <- sketched.assay + return(object) +} + + +#' Project full data to the sketch assay +#' +#' +#' This function allows projection of high-dimensional single-cell RNA expression data from a full dataset +#' onto the lower-dimensional embedding of the sketch of the dataset. +#' +#' @param object A Seurat object. +#' @param assay Assay name for the full data. Default is 'RNA'. +#' @param sketched.assay Sketched assay name to project onto. Default is 'sketch'. +#' @param sketched.reduction Dimensional reduction results of the sketched assay to project onto. +#' @param full.reduction Dimensional reduction name for the projected full dataset. +#' @param dims Dimensions to include in the projection. +#' @param normalization.method Normalization method to use. Can be 'LogNormalize' or 'SCT'. +#' Default is 'LogNormalize'. +#' @param refdata An optional list for label transfer from sketch to full data. Default is NULL. +#' Similar to refdata in `MapQuery` +#' @param k.weight Number of neighbors to consider when weighting labels for transfer. Default is 50. +#' @param umap.model An optional pre-computed UMAP model. Default is NULL. +#' @param recompute.neighbors Whether to recompute the neighbors for label transfer. Default is FALSE. +#' @param recompute.weights Whether to recompute the weights for label transfer. Default is FALSE. +#' @param verbose Print progress and diagnostic messages. +#' +#' @return A Seurat object with the full data projected onto the sketched dimensional reduction results. +#' The projected data are stored in the specified full reduction. +#' +#' @export +#' @concept sketching +#' +ProjectData <- function( + object, + assay = 'RNA', + sketched.assay = 'sketch', + sketched.reduction, + full.reduction, + dims, + normalization.method = c("LogNormalize", "SCT"), + refdata = NULL, + k.weight = 50, + umap.model = NULL, + recompute.neighbors = FALSE, + recompute.weights = FALSE, + verbose = TRUE +) { + if (!full.reduction %in% Reductions(object)) { + if (verbose) { + message(full.reduction, ' is not in the object.' + ,' Data from all cells will be projected to ', sketched.reduction) + } + proj.emb <- ProjectCellEmbeddings( + query = object, + reference = object, + query.assay = assay, + dims = dims, + normalization.method = normalization.method, + reference.assay = sketched.assay, + reduction = sketched.reduction, + verbose = verbose) + object[[full.reduction]] <- CreateDimReducObject( + embeddings = proj.emb, + assay = assay, + key = Key(object = full.reduction, quiet = TRUE) + ) + } + object <- TransferSketchLabels( + object = object, + sketched.assay = sketched.assay, + reduction = full.reduction, + dims = dims, + k = k.weight, + refdata = refdata, + reduction.model = umap.model, + recompute.neighbors = recompute.neighbors, + recompute.weights = recompute.weights, + verbose = verbose) + return(object) +} + + +#' Transfer data from sketch data to full data +#' +#' This function transfers cell type labels from a sketched dataset to a full dataset +#' based on the similarities in the lower dimensional space. +#' +#' @param object A Seurat object. +#' @param sketched.assay Sketched assay name. Default is 'sketch'. +#' @param reduction Dimensional reduction name to use for label transfer. +#' @param dims An integer vector indicating which dimensions to use for label transfer. +#' @param refdata A list of character strings indicating the metadata columns containing labels to transfer. Default is NULL. +#' Similar to refdata in `MapQuery` +#' @param k Number of neighbors to use for label transfer. Default is 50. +#' @param reduction.model Dimensional reduction model to use for label transfer. Default is NULL. +#' @param neighbors An object storing the neighbors found during the sketching process. Default is NULL. +#' @param recompute.neighbors Whether to recompute the neighbors for label transfer. Default is FALSE. +#' @param recompute.weights Whether to recompute the weights for label transfer. Default is FALSE. +#' @param verbose Print progress and diagnostic messages +#' +#' @return A Seurat object with transferred labels stored in the metadata. If a UMAP model is provided, +#' the full data are also projected onto the UMAP space, with the results stored in a new reduction, full.`reduction.model` +#' +#' @export +#' @concept sketching +#' +TransferSketchLabels <- function( + object, + sketched.assay = 'sketch', + reduction, + dims, + refdata = NULL, + k = 50, + reduction.model = NULL, + neighbors = NULL, + recompute.neighbors = FALSE, + recompute.weights = FALSE, + verbose = TRUE +){ + full_sketch.nn <- neighbors %||% Tool( + object = object, + slot = 'TransferSketchLabels' + )$full_sketch.nn + full_sketch.weight <- Tool( + object = object, + slot = 'TransferSketchLabels' + )$full_sketch.weight + + compute.neighbors <- is.null(x = full_sketch.nn) || + !all(Cells(full_sketch.nn) == Cells(object[[reduction]])) || + max(Indices(full_sketch.nn)) > ncol(object[[sketched.assay]]) || + !identical(x = full_sketch.nn@alg.info$dims, y = dims) || + !identical(x = full_sketch.nn@alg.info$reduction, y = reduction) || + recompute.neighbors + + compute.weights <- is.null(x = full_sketch.weight) || + !all(colnames(full_sketch.weight) == Cells(object[[reduction]])) || + !all(rownames(full_sketch.weight) == colnames(object[[sketched.assay]])) || + recompute.weights || + recompute.neighbors + + if (compute.neighbors) { + if (verbose) { + message("Finding sketch neighbors") + } + full_sketch.nn <- NNHelper( + query = Embeddings(object[[reduction]])[, dims], + data = Embeddings(object[[reduction]])[colnames(object[[sketched.assay]]), dims], + k = k, + method = "annoy" + ) + slot(object = full_sketch.nn, name = 'alg.info')$dims <- dims + slot(object = full_sketch.nn, name = 'alg.info')$reduction <- reduction + } + if (compute.weights) { + if (verbose) { + message("Finding sketch weight matrix") + } + full_sketch.weight <- FindWeightsNN( + nn.obj = full_sketch.nn, + query.cells = Cells(object[[reduction]]), + reference.cells = colnames(object[[sketched.assay]]), + verbose = verbose) + rownames(full_sketch.weight) <- colnames(object[[sketched.assay]]) + colnames(full_sketch.weight) <- Cells(object[[reduction]]) + } + slot( + object = object, name = 'tools' + )$TransferSketchLabels$full_sketch.nn <- full_sketch.nn + slot( + object = object, name = 'tools' + )$TransferSketchLabels$full_sketch.weight <- full_sketch.weight + + if (!is.null(refdata)) { + if (length(refdata) == 1 & is.character(refdata)) { + refdata <- list(refdata) + names(refdata) <- unlist(refdata) + } + if (verbose) { + message("Transfering refdata from sketch") + } + for (rd in 1:length(x = refdata)) { + if (isFALSE(x = refdata[[rd]])) { + transfer.results[[rd]] <- NULL + next + } + rd.name <- names(x = refdata)[rd] + label.rd <- refdata[[rd]] + ## FetchData not work + if (!label.rd %in% colnames( object[[]])) { + stop(label.rd, ' is not in the meta.data') + } + reference.labels <- object[[]][colnames(object[[sketched.assay]]), label.rd] + predicted.labels.list <- TransferLablesNN( + reference.labels = reference.labels, + weight.matrix = full_sketch.weight) + object[[paste0(rd.name)]] <- predicted.labels.list$labels + object[[paste0(rd.name, '.score')]] <- predicted.labels.list$scores + } + } + if (!is.null(reduction.model)) { + umap.model <- Misc(object = object[[reduction.model]], slot = 'model') + if (is.null(umap.model)) { + warning(reduction.model, ' does not have a stored umap model') + return(object) + } + if (verbose) { + message("Projection to sketch umap") + } + if (ncol(full_sketch.nn) > umap.model$n_neighbors) { + full_sketch.nn@nn.idx <- full_sketch.nn@nn.idx[, 1:umap.model$n_neighbors] + full_sketch.nn@nn.dist <- full_sketch.nn@nn.dist[, 1:umap.model$n_neighbors] + } + proj.umap <- RunUMAP( + object = full_sketch.nn, + reduction.model = object[[reduction.model]], + verbose = verbose, + assay = slot(object = object[[reduction]], name = 'assay.used') + ) + full.umap.reduction <- rev( + x = make.unique( + names = c( + Reductions(object = object), + paste0('full.',reduction.model) + ) + ) + )[1] + Key(object = proj.umap) <- Key(object = full.umap.reduction) + object[[full.umap.reduction ]] <- proj.umap + } + return(object) +} + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Methods for Seurat-defined generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#' @param nsketch A positive integer. The number of sketches to be used in the approximation. +#' Default is 5000. +#' @param ndims A positive integer or NULL. The number of dimensions to use. If NULL, the number +#' of dimensions will default to the number of columns in the object. +#' @param method The sketching method to use, defaults to CountSketch. +#' @param eps A numeric. The error tolerance for the approximation in Johnson–Lindenstrauss embeddings, +#' defaults to 0.5. +#' @param seed A positive integer. The seed for the random number generator, defaults to 123. +#' @param verbose Print progress and diagnostic messages +#' @importFrom Matrix qrR t +#' @importFrom irlba irlba +#' +#' @rdname LeverageScore +#' @method LeverageScore default +#' +#' @export +#' @concept sketching +#' +LeverageScore.default <- function( + object, + nsketch = 5000L, + ndims = NULL, + method = CountSketch, + eps = 0.5, + seed = 123L, + verbose = TRUE, + ... +) { + # Check the dimensions of the object, nsketch, and ndims + ncells <- ncol(x = object) + if (ncells < nsketch * 1.5) { + nv <- ifelse(nrow(x = object) < 50, nrow(x = object) - 1, 50) + Z <- irlba(A = object, nv = 50, nu = 0, verbose = FALSE)$v + return(rowSums(x = Z ^ 2)) + } + if (nrow(x = object) > 5000L) { + abort(message = "too slow") + } else if (nrow(x = object) > (ncells / 1.1)) { + abort(message = "too square") + } + ndims <- ndims %||% ncells + if (nsketch < (1.1 * nrow(x = object))) { + nsketch <- 1.1 * nrow(x = object) + warning( + "'nsketch' is too close to the number of features, setting to ", + round(x = nsketch, digits = 2L), + call. = FALSE, + immediate. = TRUE + ) + } + nsketch <- min(nsketch, ndims) + # Check the method + if (is_quosure(x = method)) { + method <- eval( + expr = quo_get_expr(quo = method), + envir = quo_get_env(quo = method) + ) + } + if (is.character(x = method)) { + method <- match.fun(FUN = method) + } + stopifnot(is.function(x = method)) + # Run the sketching + if (isTRUE(x = verbose)) { + message("sampling ", nsketch, " cells for random sketching") + } + S <- method(nsketch = nsketch, ncells = ncells, seed = seed, ...) + object <- t(x = object) + if (isTRUE(x = verbose)) { + message("Performing QR decomposition") + } + if (inherits(x = object, what = 'IterableMatrix')) { + temp <- tempdir() + object.gene_index <- BPCells::transpose_storage_order(matrix = object, tmpdir = temp) + sa <- as(object = S %*% object.gene_index, Class = 'dgCMatrix') + rm(object.gene_index) + unlink(list.files(path = temp, full.names = TRUE)) + } else { + sa <- S %*% object + } + if (!inherits(x = sa, what = 'dgCMatrix')) { + sa <- as(object = sa, Class = 'dgCMatrix') + } + qr.sa <- base::qr(x = sa) + R <- if (inherits(x = qr.sa, what = 'sparseQR')) { + qrR(qr = qr.sa) + } else { + base::qr.R(qr = qr.sa) + } + R.inv <- as.sparse(x = backsolve(r = R, x = diag(x = ncol(x = R)))) + if (isTRUE(x = verbose)) { + message("Performing random projection") + } + JL <- as.sparse(x = JLEmbed( + nrow = ncol(x = R.inv), + ncol = ndims, + eps = eps, + seed = seed + )) + Z <- object %*% (R.inv %*% JL) + if (inherits(x = Z, what = 'IterableMatrix')) { + Z.score <- BPCells::matrix_stats(matrix = Z ^ 2, row_stats = 'mean' + )$row_stats['mean',]*ncol(x = Z) + } else { + Z.score <- rowSums(x = Z ^ 2) + } + return(Z.score) +} + +#' @param nsketch A positive integer. The number of sketches to be used in the approximation. +#' Default is 5000. +#' @param ndims A positive integer or NULL. The number of dimensions to use. If NULL, the number +#' of dimensions will default to the number of columns in the object. +#' @param method The sketching method to use, defaults to CountSketch. +#' @param vf.method VariableFeatures method +#' @param layer layer to use +#' @param eps A numeric. The error tolerance for the approximation in Johnson–Lindenstrauss embeddings, +#' defaults to 0.5. +#' @param seed A positive integer. The seed for the random number generator, defaults to 123. +#' @param verbose Print progress and diagnostic messages +#' @param features A vector of feature names to use for calculating leverage score. +#' +#' @importFrom SeuratObject EmptyDF +#' +#' @rdname LeverageScore +#' @method LeverageScore StdAssay +#' +#' @export +#' @concept sketching +#' +LeverageScore.StdAssay <- function( + object, + nsketch = 5000L, + ndims = NULL, + method = CountSketch, + vf.method = NULL, + layer = 'data', + eps = 0.5, + seed = 123L, + verbose = TRUE, + features = NULL, + ... +) { + layer <- unique(x = layer) %||% DefaultLayer(object = object) + layer <- Layers(object = object, search = layer) + if (!is_quosure(x = method)) { + method <- enquo(arg = method) + } + scores <- EmptyDF(n = ncol(x = object)) + row.names(x = scores) <- colnames(x = object) + scores[, 1] <- NA_real_ + for (i in seq_along(along.with = layer)) { + l <- layer[i] + if (isTRUE(x = verbose)) { + message("Running LeverageScore for layer ", l) + } + features.use <- features %||% tryCatch({ + VariableFeatures( + object = object, + method = vf.method, + layer = l + ) + }, error = function(e) { + stop("Unable to get variable features from layer `", l, "`. Try providing `features` argument instead.") + }) + + scores[Cells(x = object, layer = l), 1] <- LeverageScore( + object = LayerData( + object = object, + layer = l, + features = features.use, + fast = TRUE + ), + nsketch = nsketch, + ndims = ndims %||% ncol(x = object), + method = method, + eps = eps, + seed = seed, + verbose = verbose, + ... + ) + } + return(scores) +} + +#' @rdname LeverageScore +#' @method LeverageScore Assay +#' @export +#' +LeverageScore.Assay <- LeverageScore.StdAssay + + +#' @param assay assay to use +#' @param nsketch A positive integer. The number of sketches to be used in the approximation. +#' Default is 5000. +#' @param ndims A positive integer or NULL. The number of dimensions to use. If NULL, the number +#' of dimensions will default to the number of columns in the object. +#' @param method The sketching method to use, defaults to CountSketch. +#' @param var.name name of slot to store leverage scores +#' @param over.write whether to overwrite slot that currently stores leverage scores. Defaults +#' to FALSE, in which case the 'var.name' is modified if it already exists in the object +#' +#' @rdname LeverageScore +#' @method LeverageScore Seurat +#' +#' @export +#' @concept sketching +#' +LeverageScore.Seurat <- function( + object, + assay = NULL, + nsketch = 5000L, + ndims = NULL, + var.name = 'leverage.score', + over.write = FALSE, + method = CountSketch, + vf.method = NULL, + layer = 'data', + eps = 0.5, + seed = 123L, + verbose = TRUE, + features = NULL, + ... +) { + if (!over.write) { + var.name <- CheckMetaVarName(object = object, var.name = var.name) + } + assay <- assay[1L] %||% DefaultAssay(object = object) + assay <- match.arg(arg = assay, choices = Assays(object = object)) + method <- enquo(arg = method) + scores <- LeverageScore( + object = object[[assay]], + nsketch = nsketch, + ndims = ndims, + method = method, + vf.method = vf.method, + layer = 'data', + eps = eps, + seed = seed, + verbose = verbose, + features = features, + ... + ) + names(x = scores) <- var.name + object[[]] <- scores + return(object) +} + + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Methods for R-defined generics +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# Internal +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +#' Generate CountSketch random matrix +#' +#' @inheritParams base::set.seed +#' @param nsketch Number of sketching random cells +#' @param ncells Number of cells in the original data +#' @param ... Ignored +#' +#' @return ... +#' +#' @importFrom Matrix sparseMatrix +#' +#' @export +#' @concept sketching +#' +#' @keywords internal +#' +#' @references Clarkson, KL. & Woodruff, DP. +#' Low-rank approximation and regression in input sparsity time. +#' Journal of the ACM (JACM). 2017 Jan 30;63(6):1-45. +#' \url{https://dl.acm.org/doi/abs/10.1145/3019134}; + +CountSketch <- function(nsketch, ncells, seed = NA_integer_, ...) { + if (!is.na(x = seed)) { + set.seed(seed = seed) + } + iv <- xv <- vector(mode = "numeric", length = ncells) + jv <- seq_len(length.out = ncells) + for (i in jv) { + iv[i] <- sample(x = seq_len(length.out = nsketch), size = 1L) + xv[i] <- sample(x = c(-1L, 1L), size = 1L) + } + return(sparseMatrix( + i = iv, + j = jv, + x = xv, + dims = c(nsketch, ncells) + )) +} + +#' Gaussian sketching +#' +#' @inheritParams CountSketch +#' +#' @return ... +#' +#' @export +#' @concept sketching +#' +#' @keywords internal +#' +GaussianSketch <- function(nsketch, ncells, seed = NA_integer_, ...) { + if (!is.na(x = seed)) { + set.seed(seed = seed) + } + return(matrix( + data = rnorm(n = nsketch * ncells, mean = 0L, sd = 1 / (ncells ^ 2)), + nrow = nsketch, + ncol = ncells + )) +} + +#' Generate JL random projection embeddings +#' +#' @keywords internal +#' +#' @references Aghila G and Siddharth R (2020). +#' RandPro: Random Projection with Classification. R package version 0.2.2. +#' \url{https://CRAN.R-project.org/package=RandPro} +#' +#' @noRd +# +JLEmbed <- function(nrow, ncol, eps = 0.1, seed = NA_integer_, method = "li") { + if (!is.na(x = seed)) { + set.seed(seed = seed) + } + method <- method[1L] + method <- match.arg(arg = method) + if (!is.null(x = eps)) { + if (eps > 1 || eps <= 0) { + stop("'eps' must be 0 < eps <= 1") + } + ncol <- floor(x = 4 * log(x = ncol) / ((eps ^ 2) / 2 - (eps ^ 3 / 3))) + } + m <- switch( + EXPR = method, + "li" = { + s <- ceiling(x = sqrt(x = ncol)) + prob <- c( + 1 / (2 * s), + 1 - (1 / s), + 1 / (2 * s) + ) + matrix( + data = sample( + x = seq.int(from = -1L, to = 1L), + size = nrow * ncol, + replace = TRUE, + prob = prob + ), + nrow = nrow + ) + } + ) + return(m) +} + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# S4 Methods +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/R/tree.R b/R/tree.R index e27595183..f73587973 100644 --- a/R/tree.R +++ b/R/tree.R @@ -13,12 +13,12 @@ cluster.ape <- paste( #' Phylogenetic Analysis of Identity Classes #' -#' Constructs a phylogenetic tree relating the 'average' cell from each +#' Constructs a phylogenetic tree relating the 'aggregate' cell from each #' identity class. Tree is estimated based on a distance matrix constructed in #' either gene expression space or PCA space. #' -#' Note that the tree is calculated for an 'average' cell, so gene expression -#' or PC scores are averaged across all cells in an identity class before the +#' Note that the tree is calculated for an 'aggregate' cell, so gene expression +#' or PC scores are summed across all cells in an identity class before the #' tree is constructed. #' #' @param object Seurat object @@ -29,6 +29,7 @@ cluster.ape <- paste( #' overrides \code{features} #' @param reduction Name of dimension reduction to use. Only used if \code{dims} #' is not NULL. +#' @param slot slot/layer to use. #' @param graph If graph is passed, build tree based on graph connectivity between #' clusters; overrides \code{dims} and \code{features} #' @param reorder Re-order identity classes (factor ordering), according to @@ -37,7 +38,6 @@ cluster.ape <- paste( #' @param reorder.numeric Re-order identity classes according to position on #' the tree, assigning a numeric value ('1' is the leftmost node) #' @param verbose Show progress updates -#' @inheritParams AverageExpression #' #' @return A Seurat object where the cluster tree can be accessed with \code{\link{Tool}} #' @@ -49,13 +49,15 @@ cluster.ape <- paste( #' @concept tree #' #' @examples +#' \dontrun{ #' if (requireNamespace("ape", quietly = TRUE)) { #' data("pbmc_small") #' pbmc_small #' pbmc_small <- BuildClusterTree(object = pbmc_small) #' Tool(object = pbmc_small, slot = 'BuildClusterTree') #' } -#' +#' } +#' BuildClusterTree <- function( object, assay = NULL, @@ -68,7 +70,7 @@ BuildClusterTree <- function( reorder.numeric = FALSE, verbose = TRUE ) { - if (!PackageCheck('ape', error = FALSE)) { + if (!requireNamespace('ape', quietly = TRUE)) { stop(cluster.ape, call. = FALSE) } assay <- assay %||% DefaultAssay(object = object) @@ -133,14 +135,30 @@ BuildClusterTree <- function( } else { features <- features %||% VariableFeatures(object = object) features <- intersect(x = features, y = rownames(x = object)) - data.avg <- AverageExpression( - object = object, - assays = assay, - features = features, - slot = slot, - verbose = verbose - )[[1]] - data.dist <- dist(x = t(x = data.avg[features, ])) + # if `slot` is set to "counts" sum the expression of the + # ident groups, otherwise average them + if(slot == "counts") { + # AggregateExpression only operates on a "counts" matrix so `layer` + # cannot be specified + data.pseudobulk <- AggregateExpression( + object, + assays = assay, + features = features, + verbose = verbose + )[[1]] + } else { + data.pseudobulk <- suppressMessages( + AverageExpression( + object, + assays = assay, + features = features, + # explicitly pass in the value of `slot` in as `layer` + layer = slot, + verbose = verbose + ) + )[[1]] + } + data.dist <- dist(x = t(x = data.pseudobulk[features, ])) } data.tree <- ape::as.phylo(x = hclust(d = data.dist)) Tool(object = object) <- data.tree diff --git a/R/utilities.R b/R/utilities.R index b546bc5d0..f807ec298 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -18,12 +18,15 @@ NULL #' #' @return \code{object} with Azimuth results added #' +#' @export +#' @concept utilities +#' #' @examples #' \dontrun{ #' object <- AddAzimuthResults(object, filename = "azimuth_results.Rds") #' } #' -#' @export + AddAzimuthResults <- function(object = NULL, filename) { if (is.null(x = filename)) { stop("No Azimuth results provided.") @@ -83,6 +86,8 @@ AddAzimuthResults <- function(object = NULL, filename) { #' #' @return \code{object} with the mapping scores added #' +#' @keywords internal +#' #' @examples #' \dontrun{ #' object <- AddAzimuthScores(object, filename = "azimuth_pred.tsv") @@ -125,6 +130,7 @@ AddAzimuthScores <- function(object, filename) { #' @param search Search for symbol synonyms for features in \code{features} that #' don't match features in \code{object}? Searches the HGNC's gene names #' database; see \code{\link{UpdateSymbolList}} for more details +#' @param slot Slot to calculate score values off of. Defaults to data slot (i.e log-normalized counts) #' @param ... Extra parameters passed to \code{\link{UpdateSymbolList}} #' #' @return Returns a Seurat object with module scores added to object meta data; @@ -179,6 +185,7 @@ AddModuleScore <- function( name = 'Cluster', seed = 1, search = FALSE, + slot = 'data', ... ) { if (!is.null(x = seed)) { @@ -187,7 +194,7 @@ AddModuleScore <- function( assay.old <- DefaultAssay(object = object) assay <- assay %||% assay.old DefaultAssay(object = object) <- assay - assay.data <- GetAssayData(object = object) + assay.data <- GetAssayData(object = object, assay = assay, slot = slot) features.old <- features if (k) { .NotYetUsed(arg = 'k') @@ -321,39 +328,37 @@ AddModuleScore <- function( #' Aggregated feature expression by identity class #' -#' Returns aggregated (summed) expression values for each identity class +#' Returns summed counts ("pseudobulk") for each identity class. #' -#' If slot is set to 'data', this function assumes that the data has been log -#' normalized and therefore feature values are exponentiated prior to aggregating -#' so that sum is done in non-log space. Otherwise, if slot is set to -#' either 'counts' or 'scale.data', no exponentiation is performed prior to -#' aggregating -#' If \code{return.seurat = TRUE} and slot is not 'scale.data', aggregated values -#' are placed in the 'counts' slot of the returned object and the log of aggregated values -#' are placed in the 'data' slot. For the \code{\link{ScaleData}} is then run on the default assay +#' If \code{return.seurat = TRUE}, aggregated values are placed in the 'counts' +#' layer of the returned object. The data is then normalized by running \code{\link{NormalizeData}} +#' on the aggregated counts. \code{\link{ScaleData}} is then run on the default assay #' before returning the object. -#' If \code{return.seurat = TRUE} and slot is 'scale.data', the 'counts' slot is left empty, -#' the 'data' slot is filled with NA, and 'scale.data' is set to the aggregated values. #' #' @param object Seurat object #' @param assays Which assays to use. Default is all assays #' @param features Features to analyze. Default is all features in the assay #' @param return.seurat Whether to return the data as a Seurat object. Default is FALSE -#' @param group.by Categories for grouping (e.g, ident, replicate, celltype); 'ident' by default -#' @param add.ident (Deprecated) Place an additional label on each cell prior to pseudobulking -#' (very useful if you want to observe cluster pseudobulk values, separated by replicate, for example) -#' @param slot Slot(s) to use; if multiple slots are given, assumed to follow -#' the order of 'assays' (if specified) or object's assays +#' @param group.by Category (or vector of categories) for grouping (e.g, ident, replicate, celltype); 'ident' by default +#' To use multiple categories, specify a vector, such as c('ident', 'replicate', 'celltype') +#' @param add.ident (Deprecated). Place an additional label on each cell prior to pseudobulking +#' @param normalization.method Method for normalization, see \code{\link{NormalizeData}} +#' @param scale.factor Scale factor for normalization, see \code{\link{NormalizeData}} +#' @param margin Margin to perform CLR normalization, see \code{\link{NormalizeData}} #' @param verbose Print messages and show progress bar -#' @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}#' +#' @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}} +#' #' @return Returns a matrix with genes as rows, identity classes as columns. #' If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. #' @export #' @concept utilities #' #' @examples +#' \dontrun{ #' data("pbmc_small") -#' head(AggregateExpression(object = pbmc_small)) +#' head(AggregateExpression(object = pbmc_small)$RNA) +#' head(AggregateExpression(object = pbmc_small, group.by = c('ident', 'groups'))$RNA) +#' } #' AggregateExpression <- function( object, @@ -362,20 +367,25 @@ AggregateExpression <- function( return.seurat = FALSE, group.by = 'ident', add.ident = NULL, - slot = 'data', + normalization.method = "LogNormalize", + scale.factor = 10000, + margin = 1, verbose = TRUE, ... ) { return( PseudobulkExpression( object = object, - pb.method = 'aggregate', assays = assays, features = features, return.seurat = return.seurat, group.by = group.by, add.ident = add.ident, - slot = slot, + layer = 'counts', + method = 'aggregate', + normalization.method = normalization.method, + scale.factor = scale.factor, + margin = margin, verbose = verbose, ... ) @@ -384,29 +394,29 @@ AggregateExpression <- function( #' Averaged feature expression by identity class #' -#' Returns averaged expression values for each identity class +#' Returns averaged expression values for each identity class. #' -#' If slot is set to 'data', this function assumes that the data has been log +#' If layer is set to 'data', this function assumes that the data has been log #' normalized and therefore feature values are exponentiated prior to averaging -#' so that averaging is done in non-log space. Otherwise, if slot is set to -#' either 'counts' or 'scale.data', no exponentiation is performed prior to -#' averaging -#' If \code{return.seurat = TRUE} and slot is not 'scale.data', averaged values -#' are placed in the 'counts' slot of the returned object and the log of averaged values -#' are placed in the 'data' slot. \code{\link{ScaleData}} is then run on the default assay -#' before returning the object. -#' If \code{return.seurat = TRUE} and slot is 'scale.data', the 'counts' slot is left empty, -#' the 'data' slot is filled with NA, and 'scale.data' is set to the aggregated values. +#' so that averaging is done in non-log space. Otherwise, if layer is set to +#' either 'counts' or 'scale.data', no exponentiation is performed prior to averaging. +#' If \code{return.seurat = TRUE} and layer is not 'scale.data', averaged values +#' are placed in the 'counts' layer of the returned object and 'log1p' +#' is run on the averaged counts and placed in the 'data' layer \code{\link{ScaleData}} +#' is then run on the default assay before returning the object. +#' If \code{return.seurat = TRUE} and layer is 'scale.data', the 'counts' layer contains +#' average counts and 'scale.data' is set to the averaged values of 'scale.data'. #' #' @param object Seurat object #' @param assays Which assays to use. Default is all assays #' @param features Features to analyze. Default is all features in the assay #' @param return.seurat Whether to return the data as a Seurat object. Default is FALSE -#' @param group.by Categories for grouping (e.g, ident, replicate, celltype); 'ident' by default -#' @param add.ident (Deprecated) Place an additional label on each cell prior to pseudobulking -#' (very useful if you want to observe cluster pseudobulk values, separated by replicate, for example) -#' @param slot Slot(s) to use; if multiple slots are given, assumed to follow +#' @param group.by Category (or vector of categories) for grouping (e.g, ident, replicate, celltype); 'ident' by default +#' To use multiple categories, specify a vector, such as c('ident', 'replicate', 'celltype') +#' @param add.ident (Deprecated). Place an additional label on each cell prior to pseudobulking +#' @param layer Layer(s) to use; if multiple layers are given, assumed to follow #' the order of 'assays' (if specified) or object's assays +#' @param slot (Deprecated). Slots(s) to use #' @param verbose Print messages and show progress bar #' @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}} #' @@ -414,10 +424,12 @@ AggregateExpression <- function( #' If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. #' @export #' @concept utilities +#' @importFrom SeuratObject .FilterObjects #' #' @examples #' data("pbmc_small") -#' head(AverageExpression(object = pbmc_small)) +#' head(AverageExpression(object = pbmc_small)$RNA) +#' head(AverageExpression(object = pbmc_small, group.by = c('ident', 'groups'))$RNA) #' AverageExpression <- function( object, @@ -426,20 +438,22 @@ AverageExpression <- function( return.seurat = FALSE, group.by = 'ident', add.ident = NULL, - slot = 'data', + layer = 'data', + slot = deprecated(), verbose = TRUE, ... ) { return( PseudobulkExpression( object = object, - pb.method = 'average', assays = assays, features = features, return.seurat = return.seurat, group.by = group.by, add.ident = add.ident, + layer = layer, slot = slot, + method = 'average', verbose = verbose, ... ) @@ -971,10 +985,9 @@ GroupCorrelation <- function( grp.cors <- grp.cors[names(x = gene.grp)] grp.cors <- as.data.frame(x = grp.cors[which(x = !is.na(x = grp.cors))]) grp.cors$gene_grp <- gene.grp[rownames(x = grp.cors)] - colnames(x = grp.cors) <- c("cor", "feature_grp") - object[[assay]][["feature.grp"]] <- grp.cors[, "feature_grp", drop = FALSE] - object[[assay]][[paste0(var, "_cor")]] <- grp.cors[, "cor", drop = FALSE] - if (do.plot) { + colnames(x = grp.cors) <- c(paste0(var, "_cor"), "feature.grp") + object[[assay]][] <- grp.cors + if (isTRUE(x = do.plot)) { print(GroupCorrelationPlot( object = object, assay = assay, @@ -1119,7 +1132,7 @@ MinMax <- function(data, min, max) { #' PercentAbove(sample(1:100, 10), 75) #' PercentAbove <- function(x, threshold) { - return(length(x = x[x > threshold]) / length(x = x)) + return (sum(x > threshold, na.rm = T) / length(x)) } #' Calculate the percentage of all counts that belong to a given set of features @@ -1159,11 +1172,24 @@ PercentageFeatureSet <- function( ) { assay <- assay %||% DefaultAssay(object = object) if (!is.null(x = features) && !is.null(x = pattern)) { - warning("Both pattern and features provided. Pattern is being ignored.") - } - features <- features %||% grep(pattern = pattern, x = rownames(x = object[[assay]]), value = TRUE) - percent.featureset <- colSums(x = GetAssayData(object = object, assay = assay, slot = "counts")[features, , drop = FALSE])/ - object[[paste0("nCount_", assay)]] * 100 + warn(message = "Both pattern and features provided. Pattern is being ignored.") + } + percent.featureset <- list() + layers <- Layers(object = object, search = "counts") + for (i in seq_along(along.with = layers)) { + layer <- layers[i] + features.layer <- features %||% grep( + pattern = pattern, + x = rownames(x = object[[assay]][layer]), + value = TRUE) + layer.data <- LayerData(object = object, + assay = assay, + layer = layer) + layer.sums <- colSums(x = layer.data[features.layer, , drop = FALSE]) + layer.perc <- layer.sums / object[[]][colnames(layer.data), paste0("nCount_", assay)] * 100 + percent.featureset[[i]] <- layer.perc + } + percent.featureset <- unlist(percent.featureset) if (!is.null(x = col.name)) { object <- AddMetaData(object = object, metadata = percent.featureset, col.name = col.name) return(object) @@ -1171,56 +1197,224 @@ PercentageFeatureSet <- function( return(percent.featureset) } -# Pseudobulk feature expression by identity class -# -# Returns a representative expression value for each identity class -# -# @param object Seurat object -# @param pb.method Whether to 'average' (default) or 'aggregate' expression levels -# @param assays Which assays to use. Default is all assays -# @param features Features to analyze. Default is all features in the assay -# @param return.seurat Whether to return the data as a Seurat object. Default is FALSE -# @param group.by Categories for grouping (e.g, ident, replicate, celltype); 'ident' by default -# @param add.ident (Deprecated) Place an additional label on each cell prior to pseudobulking -# (very useful if you want to observe cluster pseudobulk values, separated by replicate, for example) -# @param slot Slot(s) to use; if multiple slots are given, assumed to follow -# the order of 'assays' (if specified) or object's assays -# @param verbose Print messages and show progress bar -# @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}} -# -# @return Returns a matrix with genes as rows, identity classes as columns. -# If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. -# -#' @importFrom Matrix rowMeans sparse.model.matrix -#' @importFrom stats as.formula -# @export -# -# @examples -# data("pbmc_small") -# head(PseudobulkExpression(object = pbmc_small)) +#' Pseudobulk feature expression by identity class +#' +#' Returns a representative expression value for each identity class +#' +#' @param object Seurat object +#' @param method Whether to 'average' (default) or 'aggregate' expression levels +#' @param assay The name of the passed assay - used primarily for warning/error messages +#' @param category.matrix A matrix defining groupings for pseudobulk expression +#' calculations; each column represents an identity class, and each row a sample +#' @param features Features to analyze. Default is all features in the assay +#' @param layer Layer(s) to user; if multiple are given, assumed to follow +#' the order of 'assays' (if specified) or object's assays +#' @param slot (Deprecated) See \code{layer} +#' @param verbose Print messages and show progress bar +#' @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}} # -PseudobulkExpression <- function( +#' @return Returns a matrix with genes as rows, identity classes as columns. +#' If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. +#' @method PseudobulkExpression Assay +#' @rdname PseudobulkExpression +#' @importFrom SeuratObject .IsFutureSeurat +#' @export +#' @concept utilities +#' +PseudobulkExpression.Assay <- function( + object, + assay, + category.matrix, + features = NULL, + layer = 'data', + slot = deprecated(), + verbose = TRUE, + ... +) { + if (is_present(arg = slot)) { + deprecate_soft( + when = '5.0.0', + what = 'GetAssayData(slot = )', + with = 'GetAssayData(layer = )' + ) + layer <- slot + } + data.use <- GetAssayData( + object = object, + layer = layer + ) + features.to.avg <- features %||% rownames(x = data.use) + if (IsMatrixEmpty(x = data.use)) { + warning( + "The ", layer, " layer for the ", assay, + " assay is empty. Skipping assay.", immediate. = TRUE, call. = FALSE) + return(NULL) + } + bad.features <- setdiff(x = features.to.avg, y = rownames(x = data.use)) + if (length(x = bad.features) > 0) { + warning( + "The following ", length(x = bad.features), + " features were not found in the ", assay, " assay: ", + paste(bad.features, collapse = ", "), call. = FALSE, immediate. = TRUE) + } + features.assay <- intersect(x = features.to.avg, y = rownames(x = data.use)) + if (length(x = features.assay) > 0) { + data.use <- data.use[features.assay, ] + } else { + warning("None of the features specified were found in the ", assay, + " assay.", call. = FALSE, immediate. = TRUE) + return(NULL) + } + if (layer == 'data') { + data.use <- expm1(x = data.use) + if (any(data.use == Inf)) { + warning("Exponentiation yielded infinite values. `data` may not be log-normed.") + } + } + data.return <- data.use %*% category.matrix + return(data.return) +} + +#' @method PseudobulkExpression StdAssay +#' @rdname PseudobulkExpression +#' @export +#' @concept utilities +#' +PseudobulkExpression.StdAssay <- function( + object, + assay, + category.matrix, + features = NULL, + layer = 'data', + slot = deprecated(), + verbose = TRUE, + ... +) { + if (is_present(arg = slot)) { + deprecate_soft( + when = '5.0.0', + what = 'GetAssayData(slot = )', + with = 'GetAssayData(layer = )' + ) + layer <- slot + } + layers.set <- Layers(object = object, search = layer) + features.to.avg <- features %||% rownames(x = object) + bad.features <- setdiff(x = features.to.avg, y = rownames(x = object)) + if (length(x = bad.features) > 0) { + warning( + "The following ", length(x = bad.features), + " features were not found in the ", assay, " assay: ", + paste(bad.features, collapse = ", "), call. = FALSE, immediate. = TRUE) + } + features.assay <- Reduce( + f = intersect, + x = c(list(features.to.avg), + lapply(X = layers.set, FUN = function(l) rownames(object[l])) + ) + ) + if (length(x = features.assay) == 0) { + warning("None of the features specified were found in the ", assay, + " assay.", call. = FALSE, immediate. = TRUE) + return(NULL) + } + data.return <- as.sparse( + x = matrix( + data = 0, + nrow = length(x = features.assay), + ncol = ncol(x = category.matrix) + ) + ) + for (i in seq_along(layers.set)) { + data.i <- LayerData(object = object, + layer = layers.set[i], + features = features.assay + ) + if (layers.set[i] == "data") { + data.use.i <- expm1(x = data.i) + if (any(data.use.i == Inf)) { + warning("Exponentiation yielded infinite values. `data` may not be log-normed.") + } + } else { + data.use.i <- data.i + } + category.matrix.i <- category.matrix[colnames(x = data.i),] + if (inherits(x = data.i, what = 'DelayedArray')) { + stop("PseudobulkExpression does not support DelayedArray objects") + } else { + data.return.i <- as.sparse(x = data.use.i %*% category.matrix.i) + } + data.return <- data.return + data.return.i + } + return(data.return) +} + +#' @param assays Which assays to use. Default is all assays +#' @param return.seurat Whether to return the data as a Seurat object. Default is FALSE +#' @param group.by Categories for grouping (e.g, "ident", "replicate", +#' "celltype"); "ident" by default +#' @param add.ident (Deprecated) See group.by +#' @param method The method used for calculating pseudobulk expression; one of: +#' "average" or "aggregate" +#' @param normalization.method Method for normalization, see \code{\link{NormalizeData}} +#' @param scale.factor Scale factor for normalization, see \code{\link{NormalizeData}} +#' @param margin Margin to perform CLR normalization, see \code{\link{NormalizeData}} +#' +#' @method PseudobulkExpression Seurat +#' @rdname PseudobulkExpression +#' @export +#' @concept utilities +#' +PseudobulkExpression.Seurat <- function( object, - pb.method = 'average', assays = NULL, features = NULL, return.seurat = FALSE, group.by = 'ident', add.ident = NULL, - slot = 'data', + layer = 'data', + slot = deprecated(), + method = 'average', + normalization.method = "LogNormalize", + scale.factor = 10000, + margin = 1, verbose = TRUE, ... ) { CheckDots(..., fxns = 'CreateSeuratObject') if (!is.null(x = add.ident)) { - .Deprecated(msg = "'add.ident' is a deprecated argument, please use the 'group.by' argument instead") + .Deprecated(msg = "'add.ident' is a deprecated argument. Please see documentation to see how to pass a vector to the 'group.by' argument to specify multiple grouping variables") group.by <- c('ident', add.ident) } - if (!(pb.method %in% c('average', 'aggregate'))) { - stop("'pb.method' must be either 'average' or 'aggregate'") + if (!(method %in% c('average', 'aggregate'))) { + stop("'method' must be either 'average' or 'aggregate'") + } + if (is_present(arg = slot)) { + deprecate_soft( + when = '5.0.0', + what = 'AverageExpression(slot = )', + with = 'AverageExpression(layer = )' + ) + layer <- slot + } + + if (method == "average") { + inform( + message = "As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.", + .frequency = "once", + .frequency_id = "AverageExpression" + ) } - object.assays <- FilterObjects(object = object, classes.keep = 'Assay') + + object.assays <- .FilterObjects(object = object, classes.keep = c('Assay', 'Assay5')) assays <- assays %||% object.assays + + # `features` is expected to be a 2D array - one vector per assay + # in the case the `features` a vector, duplicate it for each assay + if (!inherits(features, what = "list")) { + features <- rep(list(features), times = length(assays)) + } + if (!all(assays %in% object.assays)) { assays <- assays[assays %in% object.assays] if (length(x = assays) == 0) { @@ -1229,15 +1423,17 @@ PseudobulkExpression <- function( warning("Requested assays that do not exist in object. Proceeding with existing assays only.") } } - if (length(x = slot) == 1) { - slot <- rep_len(x = slot, length.out = length(x = assays)) - } else if (length(x = slot) != length(x = assays)) { - stop("Number of slots provided does not match number of assays") + if (length(x = layer) == 1) { + layer <- rep_len(x = layer, length.out = length(x = assays)) + } else if (length(x = layer) != length(x = assays)) { + stop("Number of layers provided does not match number of assays") } data <- FetchData(object = object, vars = rev(x = group.by)) + #only keep meta-data columns that are in object + group.by <- intersect(group.by, colnames(data)) data <- data[which(rowSums(x = is.na(x = data)) == 0), , drop = F] if (nrow(x = data) < ncol(x = object)) { - message("Removing cells with NA for 1 or more grouping variables") + inform("Removing cells with NA for 1 or more grouping variables") object <- subset(x = object, cells = rownames(x = data)) } for (i in 1:ncol(x = data)) { @@ -1250,184 +1446,180 @@ PseudobulkExpression <- function( } ) if (any(num.levels == 1)) { - message(paste0("The following grouping variables have 1 value and will be ignored: ", - paste0(colnames(x = data)[which(num.levels <= 1)], collapse = ", "))) + message( + paste0( + "The following grouping variables have 1 value and will be ignored: ", + paste0(colnames(x = data)[which(num.levels <= 1)], collapse = ", ") + ) + ) group.by <- colnames(x = data)[which(num.levels > 1)] data <- data[, which(num.levels > 1), drop = F] } - if (ncol(x = data) == 0) { - message("All grouping variables have 1 value only. Computing across all cells.") - category.matrix <- matrix( - data = 1, - nrow = ncol(x = object), - dimnames = list(Cells(x = object), 'all') + category.matrix <- CreateCategoryMatrix(labels = data, method = method) + #check if column names are numeric + col.names <- colnames(category.matrix) + if (any(!(grepl("^[a-zA-Z]|^\\.[^0-9]", col.names)))) { + col.names <- ifelse( + !(grepl("^[a-zA-Z]|^\\.[^0-9]", col.names)), + paste0("g", col.names), + col.names + ) + colnames(category.matrix) <- col.names + inform( + message = paste0("First group.by variable `", group.by[1], + "` starts with a number, appending `g` to ensure valid variable names"), + .frequency = "regularly", + .frequency_id = "PseudobulkExpression" ) - if (pb.method == 'average') { - category.matrix <- category.matrix / sum(category.matrix) - } - } else { - category.matrix <- sparse.model.matrix(object = as.formula( - object = paste0( - '~0+', - paste0( - "data[,", - 1:length(x = group.by), - "]", - collapse = ":" - ) - ) - )) - colsums <- colSums(x = category.matrix) - category.matrix <- category.matrix[, colsums > 0] - colsums <- colsums[colsums > 0] - if (pb.method == 'average') { - category.matrix <- Sweep( - x = category.matrix, - MARGIN = 2, - STATS = colsums, - FUN = "/") - } - colnames(x = category.matrix) <- sapply( - X = colnames(x = category.matrix), - FUN = function(name) { - name <- gsub(pattern = "data\\[, [1-9]*\\]", replacement = "", x = name) - return(paste0(rev(x = unlist(x = strsplit(x = name, split = ":"))), collapse = "_")) - }) } + data.return <- list() for (i in 1:length(x = assays)) { - data.use <- GetAssayData( - object = object, + data.return[[assays[i]]] <- PseudobulkExpression( + object = object[[assays[i]]], assay = assays[i], - slot = slot[i] + category.matrix = category.matrix, + features = features[[i]], + layer = layer[i], + verbose = verbose, + ... ) - features.to.avg <- features %||% rownames(x = data.use) - if (inherits(x = features, what = "list")) { - features.to.avg <- features[i] - } - if (IsMatrixEmpty(x = data.use)) { - warning( - "The ", slot[i], " slot for the ", assays[i], - " assay is empty. Skipping assay.", immediate. = TRUE, call. = FALSE) - next - } - bad.features <- setdiff(x = features.to.avg, y = rownames(x = data.use)) - if (length(x = bad.features) > 0) { - warning( - "The following ", length(x = bad.features), - " features were not found in the ", assays[i], " assay: ", - paste(bad.features, collapse = ", "), call. = FALSE, immediate. = TRUE) - } - features.assay <- intersect(x = features.to.avg, y = rownames(x = data.use)) - if (length(x = features.assay) > 0) { - data.use <- data.use[features.assay, ] - } else { - warning("None of the features specified were found in the ", assays[i], - " assay.", call. = FALSE, immediate. = TRUE) - next - } - if (slot[i] == 'data') { - data.use <- expm1(x = data.use) - if (any(data.use == Inf)) { - warning("Exponentiation yielded infinite values. `data` may not be log-normed.") - } - } - data.return[[i]] <- as.matrix(x = (data.use %*% category.matrix)) - names(x = data.return)[i] <- assays[[i]] } if (return.seurat) { - if (slot[1] == 'scale.data') { - na.matrix <- data.return[[1]] + op <- options(Seurat.object.assay.version = "v5", Seurat.object.assay.calcn = FALSE) + on.exit(expr = options(op), add = TRUE) + if (layer[1] == 'scale.data') { + na.matrix <- as.matrix(x = data.return[[1]]) na.matrix[1:length(x = na.matrix)] <- NA + #sum up counts to make seurat object + summed.counts <- PseudobulkExpression( + object = object[[assays[1]]], + assay = assays[1], + category.matrix = category.matrix, + features = features[[1]], + layer = "counts" + ) toRet <- CreateSeuratObject( - counts = na.matrix, - project = if (pb.method == "average") "Average" else "Aggregate", + counts = summed.counts, + project = if (method == "average") "Average" else "Aggregate", assay = names(x = data.return)[1], - check.matrix = FALSE, ... ) - toRet <- SetAssayData( + LayerData( object = toRet, - assay = names(x = data.return)[1], - slot = "counts", - new.data = matrix() - ) - toRet <- SetAssayData( - object = toRet, - assay = names(x = data.return)[1], - slot = "data", - new.data = na.matrix - ) - toRet <- SetAssayData( - object = toRet, - assay = names(x = data.return)[1], - slot = "scale.data", - new.data = data.return[[1]] - ) + layer = "scale.data", + assay = names(x = data.return)[i] + ) <- data.return[[1]] } else { toRet <- CreateSeuratObject( counts = data.return[[1]], - project = if (pb.method == "average") "Average" else "Aggregate", + project = if (method == "average") "Average" else "Aggregate", assay = names(x = data.return)[1], - check.matrix = FALSE, ... ) - toRet <- SetAssayData( - object = toRet, - assay = names(x = data.return)[1], - slot = "data", - new.data = log1p(x = as.matrix(x = data.return[[1]])) - ) + if (method == "aggregate") { + LayerData( + object = toRet, + layer = "data", + assay = names(x = data.return)[1] + ) <- NormalizeData( + as.matrix(x = data.return[[1]]), + normalization.method = normalization.method, + verbose = verbose + ) + } + else { + LayerData(object = toRet, + layer = "data", + assay = names(x = data.return)[1] + ) <- log1p(x = as.matrix(x = data.return[[1]])) + } } #for multimodal data if (length(x = data.return) > 1) { for (i in 2:length(x = data.return)) { - if (slot[i] == 'scale.data') { - na.matrix <- data.return[[i]] - na.matrix[1:length(x = na.matrix)] <- NA - toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = na.matrix, check.matrix = FALSE) - toRet <- SetAssayData( - object = toRet, - assay = names(x = data.return)[i], - slot = "counts", - new.data = matrix() - ) - toRet <- SetAssayData( - object = toRet, - assay = names(x = data.return)[i], - slot = "data", - new.data = na.matrix + if (layer[i] == 'scale.data') { + summed.counts <- PseudobulkExpression( + object = object[[assays[i]]], + assay = assays[i], + category.matrix = category.matrix, + features = features[[i]], + layer = "counts" ) - toRet <- SetAssayData( + toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = summed.counts) + LayerData( object = toRet, - assay = names(x = data.return)[i], - slot = "scale.data", - new.data = as.matrix(x = data.return[[i]]) - ) + layer = "scale.data", + assay = names(x = data.return)[i] + ) <- data.return[[i]] } else { - toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = data.return[[i]], check.matrix = FALSE) - toRet <- SetAssayData( - object = toRet, - assay = names(x = data.return)[i], - slot = "data", - new.data = log1p(x = as.matrix(x = data.return[[i]])) + toRet[[names(x = data.return)[i]]] <- CreateAssayObject( + counts = data.return[[i]], + check.matrix = FALSE ) + if (method == "aggregate") { + LayerData( + object = toRet, + layer = "data", + assay = names(x = data.return)[i] + ) <- NormalizeData( + as.matrix(x = data.return[[i]]), + normalization.method = normalization.method, + scale.factor = scale.factor, + margin = margin, + verbose = verbose + ) + } + else { + LayerData( + object = toRet, + layer = "data", + assay = names(x = data.return)[i] + ) <- log1p(x = as.matrix(x = data.return[[i]])) + } } - } } if (DefaultAssay(object = object) %in% names(x = data.return)) { DefaultAssay(object = toRet) <- DefaultAssay(object = object) - if (slot[which(DefaultAssay(object = object) %in% names(x = data.return))[1]] != 'scale.data') { + if (layer[which(DefaultAssay(object = object) %in% names(x = data.return))[1]] != 'scale.data') { toRet <- ScaleData(object = toRet, verbose = verbose) } } - if ('ident' %in% group.by) { - first.cells <- c() - for (i in 1:ncol(x = category.matrix)) { - first.cells <- c(first.cells, Position(x = category.matrix[,i], f = function(x) {x > 0})) + #add meta-data based on group.by variables + cells <- Cells(toRet) + for (i in 1:length(group.by)) { + if (group.by[i] != "ident") { + v <- sapply( + strsplit(cells, "_"), + function(x) {return(x[i])} + ) + names(v) <- cells + toRet <- AddMetaData(toRet, + metadata = v, + col.name = group.by[i] + ) } - Idents(object = toRet) <- Idents(object = object)[first.cells] + } + #set idents to pseudobulk variables + Idents(toRet) <- cells + + #make orig.ident variable + #orig.ident = ident if group.by includes `ident` + #if not, orig.ident is equal to pseudobulk cell names + if(any(group.by == "ident")) { + i = which(group.by == "ident") + v <- sapply( + strsplit(cells, "_"), + function(x) {return(x[i])} + ) + names(v) <- cells + toRet <- AddMetaData(toRet, + metadata = v, + col.name = "orig.ident" + ) + } else { + toRet$orig.ident <- cells } return(toRet) } else { @@ -1607,6 +1799,35 @@ as.data.frame.Matrix <- function( # Internal #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +#' Create Abbreviations +#' +#' @param x A character vector +#' @param digits Include digits in the abbreviation +#' +#' @return Abbreviated versions of \code{x} +#' +#' @keywords internal +#' +#' @examples +#' .Abbrv(c('HelloWorld, 'LetsGo3', 'tomato')) +#' .Abbrv(c('HelloWorld, 'LetsGo3', 'tomato'), digits = FALSE) +#' .Abbrv('Wow3', digits = FALSE) +#' +#' @noRd +#' +.Abbrv <- function(x, digits = TRUE) { + pattern <- ifelse(test = isTRUE(x = digits), yes = '[A-Z0-9]+', no = '[A-Z]+') + y <- vapply( + X = regmatches(x = x, m = gregexec(pattern = pattern, text = x)), + FUN = paste, + FUN.VALUE = character(length = 1L), + collapse = '' + ) + na <- nchar(x = y) <= 1L + y[na] <- x[na] + return(tolower(x = y)) +} + .AsList <- function(x) { x <- as.list(x = x) return(sapply( @@ -2488,3 +2709,262 @@ ToNumeric <- function(x){ } return(x) } + +# Merge a list of sparse matrixes +#' @importFrom Matrix summary sparseMatrix +MergeSparseMatrices <- function(...) { + + colname.new <- character() + rowname.new <- character() + x <- vector() + i <- numeric() + j <- numeric() + + for (mat in list(...)) { + colname.old <- colnames(x = mat) + rowname.old <- rownames(x = mat) + + # does not check if there are overlapping cells + colname.new <- union(x = colname.new, y = colname.old) + rowname.new <- union(x = rowname.new, y = rowname.old) + + colindex.new <- match(x = colname.old, table = colname.new) + rowindex.new <- match(x = rowname.old, table = rowname.new) + + ind <- summary(object = mat) + # Expand the list of indices and x + i <- c(i, rowindex.new[ind[,1]]) + j <- c(j, colindex.new[ind[,2]]) + x <- c(x, ind[,3]) + } + + merged.mat <- sparseMatrix(i=i, + j=j, + x=x, + dims=c(length(rowname.new), length(colname.new)), + dimnames=list(rowname.new, colname.new)) + return (merged.mat) +} + +# cross product from delayed array +# +crossprod_DelayedAssay <- function(x, y, block.size = 1e8) { + # perform t(x) %*% y in blocks for y + if (!inherits(x = y, 'DelayedMatrix')) { + stop('y should a DelayedMatrix') + } + if (nrow(x) != nrow(y)) { + stop('row of x and y should be the same') + } + sparse <- DelayedArray::is_sparse(x = y) + suppressMessages(expr = DelayedArray::setAutoBlockSize(size = block.size)) + cells.grid <- DelayedArray::colAutoGrid(x = y) + product.list <- list() + for (i in seq_len(length.out = length(x = cells.grid))) { + vp <- cells.grid[[i]] + block <- DelayedArray::read_block(x = y, viewport = vp, as.sparse = sparse) + if (sparse) { + block <- as(object = block, Class = 'dgCMatrix') + } else { + block <- as(object = block, Class = 'Matrix') + } + product.list[[i]] <- as.matrix(t(x) %*% block) + } + product.mat <- matrix(data = unlist(product.list), nrow = ncol(x) , ncol = ncol(y)) + colnames(product.mat) <- colnames(y) + rownames(product.mat) <- colnames(x) + return(product.mat) +} + + +# cross product from BPCells +# +crossprod_BPCells <- function(x, y) { + # perform t(x) %*% y, y is from BPCells + product.mat <- t(x) %*% y + colnames(product.mat) <- colnames(y) + rownames(product.mat) <- colnames(x) + return(product.mat) +} + +# nonzero element version of sweep +# +SweepNonzero <- function( + x, + MARGIN, + STATS, + FUN = "/" +) { + if (!inherits(x = x, what = 'dgCMatrix')) { + stop('input should be dgCMatrix. eg: x <- as(x, "CsparseMatrix")') + } + if (dim(x = x)[MARGIN] != length(STATS)){ + warning("Length of STATS is not equal to dim(x)[MARGIN]") + } + fun <- match.fun(FUN) + if (MARGIN == 1) { + idx <- x@i + 1 + x@x <- fun(x@x, STATS[idx]) + } else if (MARGIN == 2) { + x <- as(x, "RsparseMatrix") + idx <- x@j + 1 + x@x <- fun(x@x, STATS[idx]) + x <- as(x, "CsparseMatrix") + } + return(x) +} + + +#' Create one hot matrix for a given label +#' +#' @param labels A vector of labels +#' @param method Method to aggregate cells with the same label. Either 'aggregate' or 'average' +#' @param cells.name A vector of cell names +#' +#' @importFrom Matrix colSums sparse.model.matrix +#' @importFrom stats as.formula +#' @export +#' @concept utilities +#' +CreateCategoryMatrix <- function( + labels, + method = c('aggregate', 'average'), + cells.name = NULL + ) { + method <- match.arg(arg = method) + if (is.null(dim(labels))) { + if (length(x = unique(x = labels)) == 1) { + data <- matrix(nrow = length(x = labels), ncol = 0) + } else { + data <- cbind(labels = labels) + } + } else { + data <- labels + } + cells.name <- cells.name %||% rownames(data) + if (!is.null(cells.name) & length(cells.name) != nrow(data)) { + stop('length of cells name should be equal to the length of input labels') + } + if (ncol(x = data) == 0) { + message("All grouping variables have 1 value only. Computing across all cells.") + category.matrix <- matrix( + data = 1, + nrow = nrow(x = data), + dimnames = list(cells.name, 'all') + ) + if (method == 'average') { + category.matrix <- category.matrix / sum(category.matrix) + } + return(category.matrix) + } + group.by <- colnames(x = data) + category.matrix <- sparse.model.matrix(object = as.formula( + object = paste0( + '~0+', + paste0( + "data[,", + 1:length(x = group.by), + "]", + collapse = ":" + ) + ) + )) + colsums <- colSums(x = category.matrix) + category.matrix <- category.matrix[, colsums > 0] + colsums <- colsums[colsums > 0] + + if (method =='average') { + category.matrix <- SweepNonzero( + x = category.matrix, + MARGIN = 2, + STATS = colsums, + FUN = "/") + } + if (any(grepl(pattern = "_", x = colnames(x = category.matrix) ))) { + inform( + message = "Names of identity class contain underscores ('_'), replacing with dashes ('-')", + .frequency = "regularly", + .frequency_id = "CreateCategoryMatrix" + ) + colnames(x = category.matrix) <- gsub(pattern = '_', + replacement = '-', + x = colnames(x = category.matrix) + ) + } + colnames(x = category.matrix) <- unname(sapply( + X = colnames(x = category.matrix), + FUN = function(name) { + name <- gsub(pattern = "data\\[, [1-9]*\\]", replacement = "", x = name) + return(paste0(rev(x = unlist(x = strsplit(x = name, split = ":"))), collapse = "_")) + })) + rownames(category.matrix) <- cells.name + return(category.matrix) +} + +#' Construct an assay for spatial niche analysis +#' +#' This function will construct a new assay where each feature is a +#' cell label The values represents the sum of a particular cell label +#' neighboring a given cell. +#' +#' @param object A Seurat object +#' @param fov FOV object to gather cell positions from +#' @param group.by Cell classifications to count in spatial neighborhood +#' @param assay Name for spatial neighborhoods assay +#' @param cluster.name Name of output clusters +#' @param neighbors.k Number of neighbors to consider for each cell +#' @param niches.k Number of clusters to return based on the niche assay +#' +#' @importFrom stats kmeans +#' @return Seurat object containing a new assay +#' @concept clustering +#' @export +#' +BuildNicheAssay <- function( + object, + fov, + group.by, + assay = "niche", + cluster.name = "niches", + neighbors.k = 20, + niches.k = 4 +) { + # initialize an empty cells x groups binary matrix + cells <- Cells(object[[fov]]) + group.labels <- unlist(object[[group.by]][cells, ]) + groups <- sort(unique(group.labels)) + cell.type.mtx <- matrix( + data = 0, + nrow = length(cells), + ncol = length(groups) + ) + rownames(cell.type.mtx) <- cells + colnames(cell.type.mtx) <- groups + # populate the binary matrix + cells.idx <- seq_along(cells) + group.idx <- match(group.labels, groups) + cell.type.mtx[cbind(cells.idx, group.idx)] <- 1 + + # find neighbors based on tissue position + coords <- GetTissueCoordinates(object[[fov]], which = "centroids") + rownames(coords) <- coords[["cell"]] + coords <- as.matrix(coords[ , c("x", "y")]) + neighbors <- FindNeighbors(coords, k.param = neighbors.k, compute.SNN = FALSE) + + # create niche assay + sum.mtx <- as.matrix(neighbors[["nn"]] %*% cell.type.mtx) + niche.assay <- CreateAssayObject(counts = t(sum.mtx)) + object[[assay]] <- niche.assay + DefaultAssay(object) <- assay + + # cluster niche assay + object <- ScaleData(object) + results <- kmeans( + x = t(object[[assay]]@scale.data), + centers = niches.k, + nstart = 30 + ) + object[[cluster.name]] <- results[["cluster"]] + + return(object) +} diff --git a/R/visualization.R b/R/visualization.R index b46347fd0..d56389c06 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -202,6 +202,7 @@ DimHeatmap <- function( #' @param label Label the cell identies above the color bar #' @param size Size of text above color bar #' @param hjust Horizontal justification of text above color bar +#' @param vjust Vertical justification of text above color bar #' @param angle Angle of text above color bar #' @param raster If true, plot with geom_raster, else use geom_tile. geom_raster may look blurry on #' some viewing applications such as Preview due to how the raster is interpolated. Set this to FALSE @@ -242,6 +243,7 @@ DoHeatmap <- function( label = TRUE, size = 5.5, hjust = 0, + vjust = 0, angle = 45, raster = TRUE, draw.lines = TRUE, @@ -249,12 +251,12 @@ DoHeatmap <- function( group.bar.height = 0.02, combine = TRUE ) { - cells <- cells %||% colnames(x = object) + assay <- assay %||% DefaultAssay(object = object) + DefaultAssay(object = object) <- assay + cells <- cells %||% colnames(x = object[[assay]]) if (is.numeric(x = cells)) { cells <- colnames(x = object)[cells] } - assay <- assay %||% DefaultAssay(object = object) - DefaultAssay(object = object) <- assay features <- features %||% VariableFeatures(object = object) features <- rev(x = unique(x = features)) disp.max <- disp.max %||% ifelse( @@ -400,7 +402,7 @@ DoHeatmap <- function( stat = "identity", data = label.x.pos, aes_string(label = 'group', x = 'label.x.pos'), - y = y.max + y.max * 0.03 * 0.5, + y = y.max + y.max * 0.03 * 0.5 + vjust, angle = angle, hjust = hjust, size = size @@ -508,6 +510,7 @@ HTOHeatmap <- function( #' @param log plot the feature axis on log scale #' @param ncol Number of columns if multiple plots are displayed #' @param slot Slot to pull expression data from (e.g. "counts" or "data") +#' @param layer Layer to pull expression data from (e.g. "counts" or "data") #' @param stack Horizontally stack plots for each feature #' @param combine Combine plots into a single \code{\link[patchwork]{patchwork}ed} #' ggplot object. If \code{FALSE}, return a list of ggplot @@ -535,11 +538,20 @@ RidgePlot <- function( same.y.lims = FALSE, log = FALSE, ncol = NULL, - slot = 'data', + slot = deprecated(), + layer = 'data', stack = FALSE, combine = TRUE, fill.by = 'feature' ) { + if (is_present(arg = slot)) { + deprecate_soft( + when = '5.0.0', + what = 'RidgePlot(slot = )', + with = 'RidgePlot(layer = )' + ) + layer <- slot %||% layer + } return(ExIPlot( object = object, type = 'ridge', @@ -553,7 +565,7 @@ RidgePlot <- function( cols = cols, group.by = group.by, log = log, - slot = slot, + layer = layer, stack = stack, combine = combine, fill.by = fill.by @@ -566,8 +578,10 @@ RidgePlot <- function( #' scores, etc.) #' #' @inheritParams RidgePlot -#' @param pt.size Point size for geom_violin -#' @param split.by A variable to split the violin plots by, +#' @param pt.size Point size for points +#' @param alpha Alpha value for points +#' @param split.by A factor in object metadata to split the plot by, pass 'ident' +#' to split by cell identity' #' @param split.plot plot each group of the split violin plots by multiple or #' single violin shapes. #' @param adjust Adjust parameter for geom_violin @@ -595,6 +609,7 @@ VlnPlot <- function( features, cols = NULL, pt.size = NULL, + alpha = 1, idents = NULL, sort = FALSE, assay = NULL, @@ -605,7 +620,8 @@ VlnPlot <- function( same.y.lims = FALSE, log = FALSE, ncol = NULL, - slot = 'data', + slot = deprecated(), + layer = NULL, split.plot = FALSE, stack = FALSE, combine = TRUE, @@ -614,6 +630,37 @@ VlnPlot <- function( add.noise = TRUE, raster = NULL ) { + if (is_present(arg = slot)) { + deprecate_soft( + when = '5.0.0', + what = 'VlnPlot(slot = )', + with = 'VlnPlot(layer = )' + ) + layer <- slot %||% layer + } + layer.set <- suppressWarnings( + Layers( + object = object, + search = layer %||% 'data' + ) + ) + if (is.null(layer) && length(layer.set) == 1 && layer.set == 'scale.data'){ + warning('Default search for "data" layer yielded no results; utilizing "scale.data" layer instead.') + } + assay.name <- DefaultAssay(object) + if (is.null(layer.set) & is.null(layer) ) { + warning('Default search for "data" layer in "', assay.name, '" assay yielded no results; utilizing "counts" layer instead.', + call. = FALSE, immediate. = TRUE) + layer.set <- Layers( + object = object, + search = 'counts' + ) + } + if (is.null(layer.set)) { + stop('layer "', layer,'" is not found in assay: "', assay.name, '"') + } else { + layer <- layer.set + } if ( !is.null(x = split.by) & getOption(x = 'Seurat.warn.vlnplot.split', default = TRUE) @@ -639,11 +686,12 @@ VlnPlot <- function( same.y.lims = same.y.lims, adjust = adjust, pt.size = pt.size, + alpha = alpha, cols = cols, group.by = group.by, split.by = split.by, log = log, - slot = slot, + layer = layer, stack = stack, combine = combine, fill.by = fill.by, @@ -677,12 +725,14 @@ VlnPlot <- function( #' @seealso \code{\link{DimPlot}} #' #' @examples +#' \dontrun{ #' if (requireNamespace("ape", quietly = TRUE)) { #' data("pbmc_small") #' pbmc_small <- BuildClusterTree(object = pbmc_small, verbose = FALSE) #' PlotClusterTree(pbmc_small) #' ColorDimSplit(pbmc_small, node = 5) #' } +#' } #' ColorDimSplit <- function( object, @@ -747,8 +797,8 @@ ColorDimSplit <- function( #' @param reduction Which dimensionality reduction to use. If not specified, first searches for umap, then tsne, then pca #' @param group.by Name of one or more metadata columns to group (color) cells by #' (for example, orig.ident); pass 'ident' to group by identity class -#' @param split.by Name of a metadata column to split plot by; -#' see \code{\link{FetchData}} for more details +#' @param split.by A factor in object metadata to split the plot by, pass 'ident' +#' to split by cell identity' #' @param shape.by If NULL, all points are circles (default). You can specify any #' cell attribute (that can be pulled with FetchData) allowing for both #' different colors and different shapes on cells. Only applicable if \code{raster = FALSE}. @@ -763,6 +813,7 @@ ColorDimSplit <- function( #' @param label.color Sets the color of the label text #' @param label.box Whether to put a box around the label text (geom_text vs #' geom_label) +#' @param alpha Alpha value for plotting (default is 1) #' @param repel Repel labels #' @param cells.highlight A list of character or numeric vectors of cells to #' highlight. If only one group of cells desired, can simply @@ -772,7 +823,8 @@ ColorDimSplit <- function( #' @param cols.highlight A vector of colors to highlight the cells as; will #' repeat to the length groups in cells.highlight #' @param sizes.highlight Size of highlighted cells; will repeat to the length -#' groups in cells.highlight +#' groups in cells.highlight. If \code{sizes.highlight = TRUE} size of all +#' points will be this value. #' @param na.value Color value for NA points when using custom scale #' @param ncol Number of columns for display when combining plots #' @param combine Combine plots into a single \code{\link[patchwork]{patchwork}ed} @@ -802,7 +854,7 @@ ColorDimSplit <- function( #' @examples #' data("pbmc_small") #' DimPlot(object = pbmc_small) -#' DimPlot(object = pbmc_small, split.by = 'ident') +#' DimPlot(object = pbmc_small, split.by = 'letter.idents') #' DimPlot <- function( object, @@ -822,6 +874,7 @@ DimPlot <- function( label.color = 'black', label.box = FALSE, repel = FALSE, + alpha = 1, cells.highlight = NULL, cols.highlight = '#DE2D26', sizes.highlight = 1, @@ -831,19 +884,33 @@ DimPlot <- function( raster = NULL, raster.dpi = c(512, 512) ) { - if (length(x = dims) != 2) { - stop("'dims' must be a two-length vector") + if (!is_integerish(x = dims, n = 2L, finite = TRUE) || !all(dims > 0L)) { + abort(message = "'dims' must be a two-length integer vector") } reduction <- reduction %||% DefaultDimReduc(object = object) - cells <- cells %||% colnames(x = object) - - data <- Embeddings(object = object[[reduction]])[cells, dims] - data <- as.data.frame(x = data) + # cells <- cells %||% colnames(x = object) + ##### Cells for all cells in the assay. + #### Cells function should not only get default layer + cells <- cells %||% Cells( + x = object, + assay = DefaultAssay(object = object[[reduction]]) + ) + # data <- Embeddings(object = object[[reduction]])[cells, dims] + # data <- as.data.frame(x = data) dims <- paste0(Key(object = object[[reduction]]), dims) - object[['ident']] <- Idents(object = object) orig.groups <- group.by group.by <- group.by %||% 'ident' - data <- cbind(data, object[[group.by]][cells, , drop = FALSE]) + data <- FetchData( + object = object, + vars = c(dims, group.by), + cells = cells, + clean = 'project' + ) + # cells <- rownames(x = object) + # object[['ident']] <- Idents(object = object) + # orig.groups <- group.by + # group.by <- group.by %||% 'ident' + # data <- cbind(data, object[[group.by]][cells, , drop = FALSE]) group.by <- colnames(x = data)[3:ncol(x = data)] for (group in group.by) { if (!is.factor(x = data[, group])) { @@ -854,7 +921,9 @@ DimPlot <- function( data[, shape.by] <- object[[shape.by, drop = TRUE]] } if (!is.null(x = split.by)) { - data[, split.by] <- object[[split.by, drop = TRUE]] + split <- FetchData(object = object, vars = split.by, clean=TRUE)[split.by] + data <- data[rownames(split),] + data[, split.by] <- split } if (isTRUE(x = shuffle)) { set.seed(seed = seed) @@ -871,6 +940,7 @@ DimPlot <- function( pt.size = pt.size, shape.by = shape.by, order = order, + alpha = alpha, label = FALSE, cells.highlight = cells.highlight, cols.highlight = cols.highlight, @@ -927,10 +997,11 @@ DimPlot <- function( #' cells expressing given feature are getting buried. #' @param features Vector of features to plot. Features can come from: #' \itemize{ -#' \item An \code{Assay} feature (e.g. a gene name - "MS4A1") -#' \item A column name from meta.data (e.g. mitochondrial percentage - "percent.mito") -#' \item A column name from a \code{DimReduc} object corresponding to the cell embedding values -#' (e.g. the PC 1 scores - "PC_1") +#' \item An \code{Assay} feature (e.g. a gene name - "MS4A1") +#' \item A column name from meta.data (e.g. mitochondrial percentage - +#' "percent.mito") +#' \item A column name from a \code{DimReduc} object corresponding to the +#' cell embedding values (e.g. the PC 1 scores - "PC_1") #' } #' @param cols The two colors to form the gradient over. Provide as string vector with #' the first color corresponding to low values, the second to high. Also accepts a Brewer @@ -943,13 +1014,20 @@ DimPlot <- function( #' } #' @param min.cutoff,max.cutoff Vector of minimum and maximum cutoff values for each feature, #' may specify quantile in the form of 'q##' where '##' is the quantile (eg, 'q1', 'q10') -#' @param split.by A factor in object metadata to split the feature plot by, pass 'ident' -#' to split by cell identity'; similar to the old \code{FeatureHeatmap} +#' @param split.by A factor in object metadata to split the plot by, pass 'ident' +#' to split by cell identity' #' @param keep.scale How to handle the color scale across multiple plots. Options are: #' \itemize{ -#' \item{"feature" (default; by row/feature scaling):}{ The plots for each individual feature are scaled to the maximum expression of the feature across the conditions provided to 'split.by'.} -#' \item{"all" (universal scaling):}{ The plots for all features and conditions are scaled to the maximum expression value for the feature with the highest overall expression.} -#' \item{NULL (no scaling):}{ Each individual plot is scaled to the maximum expression value of the feature in the condition provided to 'split.by'. Be aware setting NULL will result in color scales that are not comparable between plots.} +#' \item \dQuote{feature} (default; by row/feature scaling): The plots for +#' each individual feature are scaled to the maximum expression of the +#' feature across the conditions provided to \code{split.by} +#' \item \dQuote{all} (universal scaling): The plots for all features and +#' conditions are scaled to the maximum expression value for the feature +#' with the highest overall expression +#' \item \code{all} (no scaling): Each individual plot is scaled to the +#' maximum expression value of the feature in the condition provided to +#' \code{split.by}. Be aware setting \code{NULL} will result in color +#' scales that are not comparable between plots #' } #' @param slot Which slot to pull expression data from? #' @param blend Scale and blend expression values to visualize coexpression of two features @@ -999,6 +1077,7 @@ FeaturePlot <- function( c('lightgrey', 'blue') }, pt.size = NULL, + alpha = 1, order = FALSE, min.cutoff = NA, max.cutoff = NA, @@ -1016,25 +1095,21 @@ FeaturePlot <- function( ncol = NULL, coord.fixed = FALSE, by.col = TRUE, - sort.cell = NULL, + sort.cell = deprecated(), interactive = FALSE, combine = TRUE, raster = NULL, raster.dpi = c(512, 512) ) { # TODO: deprecate fully on 3.2.0 - if (!is.null(x = sort.cell)) { - warning( - "The sort.cell parameter is being deprecated. Please use the order ", - "parameter instead for equivalent functionality.", - call. = FALSE, - immediate. = TRUE + if (is_present(arg = sort.cell)) { + deprecate_stop( + when = '4.9.0', + what = 'FeaturePlot(sort.cell = )', + with = 'FeaturePlot(order = )' ) - if (isTRUE(x = sort.cell)) { - order <- sort.cell - } } - if (interactive) { + if (isTRUE(x = interactive)) { return(IFeaturePlot( object = object, feature = features[1], @@ -1044,8 +1119,8 @@ FeaturePlot <- function( )) } # Check keep.scale param for valid entries - if (!(is.null(x = keep.scale)) && !(keep.scale %in% c("feature", "all"))) { - stop("`keep.scale` must be set to either `feature`, `all`, or NULL") + if (!is.null(x = keep.scale)) { + keep.scale <- arg_match0(arg = keep.scale, values = c('feature', 'all')) } # Set a theme to remove right-hand Y axis lines # Also sets right-hand Y axis text label formatting @@ -1061,61 +1136,51 @@ FeaturePlot <- function( ) # Get the DimReduc to use reduction <- reduction %||% DefaultDimReduc(object = object) - if (length(x = dims) != 2 || !is.numeric(x = dims)) { - stop("'dims' must be a two-length integer vector") + if (!is_integerish(x = dims, n = 2L, finite = TRUE) && !all(dims > 0L)) { + abort(message = "'dims' must be a two-length integer vector") } # Figure out blending stuff - if (blend && length(x = features) != 2) { - stop("Blending feature plots only works with two features") + if (isTRUE(x = blend) && length(x = features) != 2) { + abort(message = "Blending feature plots only works with two features") } # Set color scheme for blended FeaturePlots - if (blend) { + if (isTRUE(x = blend)) { default.colors <- eval(expr = formals(fun = FeaturePlot)$cols) cols <- switch( EXPR = as.character(x = length(x = cols)), '0' = { - warning( - "No colors provided, using default colors", - call. = FALSE, - immediate. = TRUE - ) + warn(message = "No colors provided, using default colors") default.colors }, '1' = { - warning( - "Only one color provided, assuming specified is double-negative and augmenting with default colors", - call. = FALSE, - immediate. = TRUE - ) + warn(message = paste( + "Only one color provided, assuming", + sQuote(x = cols), + "is double-negative and augmenting with default colors" + )) c(cols, default.colors[2:3]) }, '2' = { - warning( - "Only two colors provided, assuming specified are for features and agumenting with '", - default.colors[1], - "' for double-negatives", - call. = FALSE, - immediate. = TRUE - ) + warn(message = paste( + "Only two colors provided, assuming specified are for features and agumenting with", + sQuote(default.colors[1]), + "for double-negatives", + )) c(default.colors[1], cols) }, '3' = cols, { - warning( - "More than three colors provided, using only first three", - call. = FALSE, - immediate. = TRUE - ) + warn(message = "More than three colors provided, using only first three") cols[1:3] } ) } - if (blend && length(x = cols) != 3) { - stop("Blending feature plots only works with three colors; first one for negative cells") + if (isTRUE(x = blend) && length(x = cols) != 3) { + abort("Blending feature plots only works with three colors; first one for negative cells") } # Name the reductions dims <- paste0(Key(object = object[[reduction]]), dims) - cells <- cells %||% colnames(x = object) + cells <- cells %||% Cells(x = object[[reduction]]) # Get plotting data data <- FetchData( object = object, @@ -1125,17 +1190,16 @@ FeaturePlot <- function( ) # Check presence of features/dimensions if (ncol(x = data) < 4) { - stop( - "None of the requested features were found: ", + abort(message = paste( + "None of the requested features were found:", paste(features, collapse = ', '), - " in slot ", - slot, - call. = FALSE - ) + "in slot ", + slot + )) } else if (!all(dims %in% colnames(x = data))) { - stop("The dimensions requested were not found", call. = FALSE) + abort(message = "The dimensions requested were not found") } - features <- colnames(x = data)[4:ncol(x = data)] + features <- setdiff(x = names(x = data), y = c(dims, 'ident')) # Determine cutoffs min.cutoff <- mapply( FUN = function(cutoff, feature) { @@ -1165,39 +1229,36 @@ FeaturePlot <- function( FUN.VALUE = numeric(length = 1) )) if (length(x = check.lengths) != 1) { - stop("There must be the same number of minimum and maximum cuttoffs as there are features") + abort( + message = "There must be the same number of minimum and maximum cuttoffs as there are features" + ) } + names(x = min.cutoff) <- names(x = max.cutoff) <- features brewer.gran <- ifelse( test = length(x = cols) == 1, yes = brewer.pal.info[cols, ]$maxcolors, no = length(x = cols) ) # Apply cutoffs - data[, 4:ncol(x = data)] <- sapply( - X = 4:ncol(x = data), - FUN = function(index) { - data.feature <- as.vector(x = data[, index]) - min.use <- SetQuantile(cutoff = min.cutoff[index - 3], data.feature) - max.use <- SetQuantile(cutoff = max.cutoff[index - 3], data.feature) - data.feature[data.feature < min.use] <- min.use - data.feature[data.feature > max.use] <- max.use - if (brewer.gran == 2) { - return(data.feature) - } - data.cut <- if (all(data.feature == 0)) { - 0 - } - else { + for (i in seq_along(along.with = features)) { + f <- features[i] + data.feature <- data[[f]] + min.use <- SetQuantile(cutoff = min.cutoff[f], data = data.feature) + max.use <- SetQuantile(cutoff = max.cutoff[f], data = data.feature) + data.feature[data.feature < min.use] <- min.use + data.feature[data.feature > max.use] <- max.use + if (brewer.gran != 2) { + data.feature <- if (all(data.feature == 0)) { + rep_len(x = 0, length.out = length(x = data.feature)) + } else { as.numeric(x = as.factor(x = cut( x = as.numeric(x = data.feature), - breaks = brewer.gran + breaks = 2 ))) } - return(data.cut) } - ) - colnames(x = data)[4:ncol(x = data)] <- features - rownames(x = data) <- cells + data[[f]] <- data.feature + } # Figure out splits (FeatureHeatmap) data$split <- if (is.null(x = split.by)) { RandomName() @@ -1248,15 +1309,14 @@ FeaturePlot <- function( ident <- levels(x = data$split)[i] data.plot <- data[as.character(x = data$split) == ident, , drop = FALSE] # Blend expression values - if (blend) { + if (isTRUE(x = blend)) { features <- features[1:2] no.expression <- features[colMeans(x = data.plot[, features]) == 0] if (length(x = no.expression) != 0) { - stop( - "The following features have no value: ", - paste(no.expression, collapse = ', '), - call. = FALSE - ) + abort(message = paste( + "The following features have no value:", + paste(no.expression, collapse = ', ') + )) } data.plot <- cbind(data.plot[, c(dims, 'ident')], BlendExpression(data = data.plot[, features[1:2]])) features <- colnames(x = data.plot)[4:ncol(x = data.plot)] @@ -1265,7 +1325,7 @@ FeaturePlot <- function( for (j in 1:length(x = features)) { feature <- features[j] # Get blended colors - if (blend) { + if (isTRUE(x = blend)) { cols.use <- as.numeric(x = as.character(x = data.plot[, feature])) + 1 cols.use <- colors[[j]][sort(x = unique(x = cols.use))] } else { @@ -1279,6 +1339,7 @@ FeaturePlot <- function( col.by = feature, order = order, pt.size = pt.size, + alpha = alpha, cols = cols.use, shape.by = shape.by, label = FALSE, @@ -1291,7 +1352,7 @@ FeaturePlot <- function( CenterTitle() # theme(plot.title = element_text(hjust = 0.5)) # Add labels - if (label) { + if (isTRUE(x = label)) { plot <- LabelClusters( plot = plot, id = 'ident', @@ -1350,7 +1411,12 @@ FeaturePlot <- function( } else if (length(x = cols) > 1) { unique.feature.exp <- unique(data.plot[, feature]) if (length(unique.feature.exp) == 1) { - warning("All cells have the same value (", unique.feature.exp, ") of ", feature, ".") + warn(message = paste0( + "All cells have the same value (", + unique.feature.exp, + ") of ", + dQuote(x = feature) + )) if (unique.feature.exp == 0) { cols.grad <- cols[1] } else{ @@ -1382,7 +1448,7 @@ FeaturePlot <- function( } } # Add blended color key - if (blend) { + if (isTRUE(x = blend)) { blend.legend <- BlendMap(color.matrix = color.matrix) for (ii in 1:length(x = levels(x = data$split))) { suppressMessages(expr = plots <- append( @@ -1428,17 +1494,17 @@ FeaturePlot <- function( } } ncol <- ifelse( - test = is.null(x = split.by) || blend, + test = is.null(x = split.by) || isTRUE(x = blend), yes = ncol, no = length(x = features) ) - legend <- if (blend) { + legend <- if (isTRUE(x = blend)) { 'none' } else { split.by %iff% 'none' } # Transpose the FeatureHeatmap matrix (not applicable for blended FeaturePlots) - if (combine) { + if (isTRUE(x = combine)) { if (by.col && !is.null(x = split.by) && !blend) { plots <- lapply( X = plots, @@ -1487,7 +1553,10 @@ FeaturePlot <- function( } plots <- plots[c(do.call( what = rbind, - args = split(x = 1:length(x = plots), f = ceiling(x = seq_along(along.with = 1:length(x = plots)) / length(x = features))) + args = split( + x = 1:length(x = plots), + f = ceiling(x = seq_along(along.with = 1:length(x = plots)) / length(x = features)) + ) ))] # Set ncol to number of splits (nrow) and nrow to number of features (ncol) plots <- wrap_plots(plots, ncol = nrow, nrow = ncol) @@ -1900,21 +1969,25 @@ CellScatter <- function( #' @param cols Colors to use for identity class plotting. #' @param pt.size Size of the points on the plot #' @param shape.by Ignored for now +#' @param split.by A factor in object metadata to split the feature plot by, pass 'ident' +#' to split by cell identity' #' @param span Spline span in loess function call, if \code{NULL}, no spline added #' @param smooth Smooth the graph (similar to smoothScatter) #' @param slot Slot to pull data from, should be one of 'counts', 'data', or 'scale.data' #' @param combine Combine plots into a single \code{\link[patchwork]{patchwork}ed} #' @param plot.cor Display correlation in plot title +#' @param ncol Number of columns if plotting multiple plots #' @param raster Convert points to raster format, default is \code{NULL} #' which will automatically use raster if the number of points plotted is greater than #' 100,000 #' @param raster.dpi Pixel resolution for rasterized plots, passed to geom_scattermore(). #' Default is c(512, 512). #' @param jitter Jitter for easier visualization of crowded points (default is FALSE) +#' @param log Plot features on the log scale (default is FALSE) #' #' @return A ggplot object #' -#' @importFrom ggplot2 geom_smooth aes_string +#' @importFrom ggplot2 geom_smooth aes_string facet_wrap vars sym labs #' @importFrom patchwork wrap_plots #' #' @export @@ -1934,6 +2007,7 @@ FeatureScatter <- function( shuffle = FALSE, seed = 1, group.by = NULL, + split.by = NULL, cols = NULL, pt.size = 1, shape.by = NULL, @@ -1942,16 +2016,17 @@ FeatureScatter <- function( combine = TRUE, slot = 'data', plot.cor = TRUE, + ncol = NULL, raster = NULL, raster.dpi = c(512, 512), - jitter = FALSE + jitter = FALSE, + log = FALSE ) { cells <- cells %||% colnames(x = object) if (isTRUE(x = shuffle)) { set.seed(seed = seed) cells <- sample(x = cells) } - object[['ident']] <- Idents(object = object) group.by <- group.by %||% 'ident' data <- FetchData( object = object, @@ -1959,25 +2034,30 @@ FeatureScatter <- function( cells = cells, slot = slot ) - if (!grepl(pattern = feature1, x = colnames(x = data)[1])) { - stop("Feature 1 (", feature1, ") not found.", call. = FALSE) + if (!grepl(pattern = feature1, x = names(x = data)[1])) { + abort(message = paste("Feature 1", sQuote(x = feature1), "not found")) } - if (!grepl(pattern = feature2, x = colnames(x = data)[2])) { - stop("Feature 2 (", feature2, ") not found.", call. = FALSE) + if (!grepl(pattern = feature2, x = names(x = data)[2])) { + abort(message = paste("Feature 2", sQuote(x = feature2), "not found")) } - data <- as.data.frame(x = data) - feature1 <- colnames(x = data)[1] - feature2 <- colnames(x = data)[2] + feature1 <- names(x = data)[1] + feature2 <- names(x = data)[2] + group.by <- intersect(x = group.by, y = names(x = data)[3:ncol(x = data)]) for (group in group.by) { if (!is.factor(x = data[, group])) { data[, group] <- factor(x = data[, group]) } } + if (!is.null(x = split.by)) { + split <- FetchData(object = object, vars = split.by, clean=TRUE)[split.by] + data <- data[rownames(split),] + data[, split.by] <- split + } plots <- lapply( X = group.by, FUN = function(x) { - SingleCorPlot( - data = data[,c(feature1, feature2)], + plot <- SingleCorPlot( + data = data[,c(feature1, feature2, split.by)], col.by = data[, x], cols = cols, pt.size = pt.size, @@ -1989,6 +2069,21 @@ FeatureScatter <- function( raster.dpi = raster.dpi, jitter = jitter ) + if (!is.null(x = split.by)) { + plot <- plot + FacetTheme() + + facet_wrap( + facets = vars(!!sym(x = split.by)), + ncol = if (length(x = group.by) > 1 || is.null(x = ncol)) { + length(x = unique(x = data[, split.by])) + } else { + ncol + } + ) + } + if (log) { + plot <- plot + scale_x_log10() + scale_y_log10() + } + plot } ) if (isTRUE(x = length(x = plots) == 1)) { @@ -2041,12 +2136,15 @@ VariableFeaturePlot <- function( hvf.info <- HVFInfo( object = object, assay = assay, - selection.method = selection.method, + method = selection.method, status = TRUE ) - var.status <- c('no', 'yes')[unlist(x = hvf.info[, ncol(x = hvf.info)]) + 1] + status.col <- colnames(hvf.info)[grepl("variable", colnames(hvf.info))][[1]] + var.status <- c('no', 'yes')[unlist(hvf.info[[status.col]]) + 1] if (colnames(x = hvf.info)[3] == 'dispersion.scaled') { hvf.info <- hvf.info[, c(1, 2)] + } else if (colnames(x = hvf.info)[3] == 'variance.expected') { + hvf.info <- hvf.info[, c(1, 4)] } else { hvf.info <- hvf.info[, c(1, 3)] } @@ -2281,6 +2379,7 @@ PolyFeaturePlot <- function( #' \code{patchwork} ggplot object.If \code{FALSE}, #' return a list of ggplot objects #' @param coord.fixed Plot cartesian coordinates with fixed aspect ratio +#' @param flip_xy Flag to flip X and Y axes. Default is FALSE. #' #' @return If \code{combine = TRUE}, a \code{patchwork} #' ggplot object; otherwise, a list of ggplot objects @@ -2292,6 +2391,8 @@ PolyFeaturePlot <- function( #' DefaultBoundary FetchData Images Overlay #' #' @export +#' @concept visualization +#' @concept spatial #' ImageDimPlot <- function( object, @@ -2317,7 +2418,8 @@ ImageDimPlot <- function( overlap = FALSE, axes = FALSE, combine = TRUE, - coord.fixed = TRUE + coord.fixed = TRUE, + flip_xy = TRUE ) { cells <- cells %||% Cells(x = object) # Determine FOV to use @@ -2482,6 +2584,10 @@ ImageDimPlot <- function( if (isTRUE(coord.fixed)) { p <- p + coord_fixed() } + if(!isTRUE(flip_xy) && isTRUE(coord.fixed)){ + xy_ratio = (max(pdata[[i]]$x) - min(pdata[[i]]$x)) / (max(pdata[[i]]$y) - min(pdata[[i]]$y)) + p = p + coord_flip() + theme(aspect.ratio = 1/xy_ratio) + } plots[[idx]] <- p idx <- idx + 1L } @@ -2518,6 +2624,8 @@ ImageDimPlot <- function( #' FetchData Images Overlay #' #' @export +#' @concept visualization +#' @concept spatial #' ImageFeaturePlot <- function( object, @@ -2777,9 +2885,18 @@ ImageFeaturePlot <- function( names(x = pdata) <- pnames for (i in names(x = pdata)) { ul <- unlist(x = strsplit(x = i, split = '_')) - img <- paste(ul[1:length(ul)-1], collapse = '_') + # img <- paste(ul[1:length(ul)-1], collapse = '_') # Apply overlap - lyr <- ul[length(ul)] + # lyr <- ul[length(ul)] + if(length(ul) > 1) { + img <- paste(ul[1:length(ul)-1], collapse = '_') + lyr <- ul[length(ul)] + } else if (length(ul) == 1) { + img <- ul[1] + lyr <- "centroids" + } else { + stop("the length of ul is 0. please check.") + } if (is.na(x = lyr)) { lyr <- boundaries[[img]] } @@ -3032,9 +3149,9 @@ ImageFeaturePlot <- function( #' Visualize spatial and clustering (dimensional reduction) data in a linked, #' interactive framework #' -#' @inheritParams DimPlot -#' @inheritParams FeaturePlot #' @inheritParams SpatialPlot +#' @inheritParams FeaturePlot +#' @inheritParams DimPlot #' @param feature Feature to visualize #' @param image Name of the image to use in the plot #' @@ -3069,6 +3186,7 @@ LinkedDimPlot <- function( dims = 1:2, reduction = NULL, image = NULL, + image.scale = "lowres", group.by = NULL, alpha = c(0.1, 1), combine = TRUE @@ -3112,7 +3230,7 @@ LinkedDimPlot <- function( vars = group.by, cells = cells.use ) - coords <- GetTissueCoordinates(object = object[[image]]) + coords <- GetTissueCoordinates(object = object[[image]], scale = image.scale) embeddings <- Embeddings(object = object[[reduction]])[cells.use, dims] plot.data <- cbind(coords, group.data, embeddings) plot.data$selected_ <- FALSE @@ -3278,6 +3396,7 @@ LinkedFeaturePlot <- function( dims = 1:2, reduction = NULL, image = NULL, + image.scale = "lowres", slot = 'data', alpha = c(0.1, 1), combine = TRUE @@ -3317,7 +3436,7 @@ LinkedFeaturePlot <- function( vars = feature, cells = cells.use ) - coords <- GetTissueCoordinates(object = object[[image]]) + coords <- GetTissueCoordinates(object = object[[image]], scale = image.scale) embeddings <- Embeddings(object = object[[reduction]])[cells.use, dims] plot.data <- cbind(coords, group.data, embeddings) # Setup the server @@ -3391,8 +3510,8 @@ LinkedFeaturePlot <- function( #' Visualize clusters spatially and interactively #' -#' @inheritParams DimPlot #' @inheritParams SpatialPlot +#' @inheritParams DimPlot #' @inheritParams LinkedPlots #' #' @return Returns final plot as a ggplot object @@ -3409,6 +3528,7 @@ LinkedFeaturePlot <- function( ISpatialDimPlot <- function( object, image = NULL, + image.scale = "lowres", group.by = NULL, alpha = c(0.3, 1) ) { @@ -3442,7 +3562,8 @@ ISpatialDimPlot <- function( vars = group.by, cells = cells.use ) - coords <- GetTissueCoordinates(object = object[[image]]) + coords <- GetTissueCoordinates(object = object[[image]], scale = image.scale) + scale.factor <- ScaleFactors(object[[image]])[[image.scale]] plot.data <- cbind(coords, group.data) plot.data$selected_ <- FALSE Idents(object = object) <- group.by @@ -3498,14 +3619,26 @@ ISpatialDimPlot <- function( # Add hover text output$info <- renderPrint( expr = { - cell.hover <- rownames(x = nearPoints( + hovered <- nearPoints( df = plot.data, coordinfo = InvertCoordinate(x = input$hover), threshold = 10, maxpoints = 1 - )) - if (length(x = cell.hover) == 1) { - paste(cell.hover, paste('Group:', plot.data[cell.hover, group.by, drop = TRUE]), collapse = '
') + ) + if (nrow(hovered) == 1) { + cell.hover <- rownames(hovered) + # SingleSpatialPlot relies on the spatial coordinates appearing + # in the first two columns of the returned data.frame - it's kinda + # fragile but we're obligated to use the same behaviour here + coords.hover <- hovered[1, colnames(coords)[1:2]] / scale.factor + group.hover <- hovered[1, group.by] + sprintf( + "Cell: %s, Group: %s, Coordinates: (%.2f, %.2f)", + cell.hover, + group.hover, + coords.hover[[1]], + coords.hover[[2]] + ) } else { NULL } @@ -3517,8 +3650,8 @@ ISpatialDimPlot <- function( #' Visualize features spatially and interactively #' -#' @inheritParams FeaturePlot #' @inheritParams SpatialPlot +#' @inheritParams FeaturePlot #' @inheritParams LinkedPlots #' #' @return Returns final plot as a ggplot object @@ -3535,6 +3668,7 @@ ISpatialFeaturePlot <- function( object, feature, image = NULL, + image.scale = "lowres", slot = 'data', alpha = c(0.1, 1) ) { @@ -3626,7 +3760,7 @@ ISpatialFeaturePlot <- function( # Prepare plotting data image <- image %||% DefaultImage(object = object) cells.use <- Cells(x = object[[image]]) - coords <- GetTissueCoordinates(object = object[[image]]) + coords <- GetTissueCoordinates(object = object[[image]], scale = image.scale) feature.data <- FetchData( object = object, vars = feature, @@ -3733,15 +3867,24 @@ ISpatialFeaturePlot <- function( #' default, ggplot2 assigns colors #' @param image.alpha Adjust the opacity of the background images. Set to 0 to #' remove. +#' @param image.scale Choose the scale factor ("lowres"/"hires") to apply in +#' order to matchthe plot with the specified `image` - defaults to "lowres" #' @param crop Crop the plot in to focus on points plotted. Set to \code{FALSE} to show #' entire background image. #' @param slot If plotting a feature, which data slot to pull from (counts, #' data, or scale.data) #' @param keep.scale How to handle the color scale across multiple plots. Options are: #' \itemize{ -#' \item{"feature" (default; by row/feature scaling):}{ The plots for each individual feature are scaled to the maximum expression of the feature across the conditions provided to 'split.by'.} -#' \item{"all" (universal scaling):}{ The plots for all features and conditions are scaled to the maximum expression value for the feature with the highest overall expression.} -#' \item{NULL (no scaling):}{ Each individual plot is scaled to the maximum expression value of the feature in the condition provided to 'split.by'. Be aware setting NULL will result in color scales that are not comparable between plots.} +#' \item \dQuote{feature} (default; by row/feature scaling): The plots for +#' each individual feature are scaled to the maximum expression of the +#' feature across the conditions provided to \code{split.by} +#' \item \dQuote{all} (universal scaling): The plots for all features and +#' conditions are scaled to the maximum expression value for the feature +#' with the highest overall expression +#' \item \code{NULL} (no scaling): Each individual plot is scaled to the +#' maximum expression value of the feature in the condition provided to +#' \code{split.by}; be aware setting \code{NULL} will result in color +#' scales that are not comparable between plots #' } #' @param min.cutoff,max.cutoff Vector of minimum and maximum cutoff #' values for each feature, may specify quantile in the form of 'q##' where '##' @@ -3768,6 +3911,8 @@ ISpatialFeaturePlot <- function( #' @param alpha Controls opacity of spots. Provide as a vector specifying the #' min and max for SpatialFeaturePlot. For SpatialDimPlot, provide a single #' alpha value for each plot. +#' @param shape Control the shape of the spots - same as the ggplot2 parameter. +#' The default is 21, which plots circles - use 22 to plot squares. #' @param stroke Control the width of the border around the spots #' @param interactive Launch an interactive SpatialDimPlot or SpatialFeaturePlot #' session, see \code{\link{ISpatialDimPlot}} or @@ -3804,6 +3949,7 @@ SpatialPlot <- function( images = NULL, cols = NULL, image.alpha = 1, + image.scale = "lowres", crop = TRUE, slot = 'data', keep.scale = "feature", @@ -3821,7 +3967,8 @@ SpatialPlot <- function( combine = TRUE, pt.size.factor = 1.6, alpha = c(1, 1), - stroke = 0.25, + shape = 21, + stroke = NA, interactive = FALSE, do.identify = FALSE, identify.ident = NULL, @@ -3852,11 +3999,13 @@ SpatialPlot <- function( stop("`keep.scale` must be set to either `feature`, `all`, or NULL") } + cells <- unique(CellsByImage(object, images = images, unlist = TRUE)) if (is.null(x = features)) { if (interactive) { return(ISpatialDimPlot( object = object, image = images[1], + image.scale = image.scale, group.by = group.by, alpha = alpha )) @@ -3864,6 +4013,7 @@ SpatialPlot <- function( group.by <- group.by %||% 'ident' object[['ident']] <- Idents(object = object) data <- object[[group.by]] + data <- data[cells,,drop=F] for (group in group.by) { if (!is.factor(x = data[, group])) { data[, group] <- factor(x = data[, group]) @@ -3875,6 +4025,7 @@ SpatialPlot <- function( object = object, feature = features[1], image = images[1], + image.scale = image.scale, slot = slot, alpha = alpha )) @@ -3882,7 +4033,9 @@ SpatialPlot <- function( data <- FetchData( object = object, vars = features, - slot = slot + cells = cells, + layer = slot, + clean = FALSE ) features <- colnames(x = data) # Determine cutoffs @@ -3929,11 +4082,11 @@ SpatialPlot <- function( } ) colnames(x = data) <- features - rownames(x = data) <- Cells(x = object) + rownames(x = data) <- cells } features <- colnames(x = data) colnames(x = data) <- features - rownames(x = data) <- colnames(x = object) + rownames(x = data) <- cells facet.highlight <- facet.highlight && (!is.null(x = cells.highlight) && is.list(x = cells.highlight)) if (do.hover) { if (length(x = images) > 1) { @@ -3990,12 +4143,16 @@ SpatialPlot <- function( max.feature.value <- max(apply(data, 2, function(x) max(x, na.rm = TRUE))) } - for (i in 1:ncols) { plot.idx <- i image.idx <- ifelse(test = facet.highlight, yes = 1, no = i) image.use <- object[[images[[image.idx]]]] - coordinates <- GetTissueCoordinates(object = image.use) + + coordinates <- GetTissueCoordinates( + object = image.use, + scale = image.scale + ) + highlight.use <- if (facet.highlight) { cells.highlight[i] } else { @@ -4009,7 +4166,7 @@ SpatialPlot <- function( } # Get feature max for individual feature - if (!(is.null(x = keep.scale)) && keep.scale == "feature") { + if (!(is.null(x = keep.scale)) && keep.scale == "feature" && !inherits(x = data[, features[j]], what = "factor") ) { max.feature.value <- max(data[, features[j]]) } @@ -4019,6 +4176,7 @@ SpatialPlot <- function( data[rownames(x = coordinates), features[j], drop = FALSE] ), image = image.use, + image.scale = image.scale, image.alpha = image.alpha, col.by = features[j], cols = cols, @@ -4040,6 +4198,7 @@ SpatialPlot <- function( cells.highlight = highlight.use, cols.highlight = cols.highlight, pt.size.factor = pt.size.factor, + shape = shape, stroke = stroke, crop = crop ) @@ -4085,7 +4244,7 @@ SpatialPlot <- function( } # Plot multiple images depending on keep.scale - if (!(is.null(x = keep.scale))) { + if (!(is.null(x = keep.scale)) && !inherits(x = data[, features[j]], "factor")) { plot <- suppressMessages(plot & scale_fill_gradientn(colors = SpatialColors(n = 100), limits = c(NA, max.feature.value))) } @@ -4096,21 +4255,7 @@ SpatialPlot <- function( } } } - # if (do.identify) { - # return(CellSelector( - # plot = plot, - # object = identify.ident %iff% object, - # ident = identify.ident - # )) - # } else if (do.hover) { - # return(HoverLocator( - # plot = plots[[1]], - # information = information %||% data[, features, drop = FALSE], - # axes = FALSE, - # # cols = c('size' = 'point.size.factor', 'colour' = 'fill'), - # images = GetImage(object = object, mode = 'plotly', image = images) - # )) - # } + if (combine) { if (!is.null(x = ncol)) { return(wrap_plots(plots = plots, ncol = ncol)) @@ -4228,8 +4373,8 @@ BarcodeInflectionsPlot <- function(object) { #' @param dot.scale Scale the size of the points, similar to cex #' @param idents Identity classes to include in plot (default is all) #' @param group.by Factor to group the cells by -#' @param split.by Factor to split the groups by (replicates the functionality -#' of the old SplitDotPlotGG); +#' @param split.by A factor in object metadata to split the plot by, pass 'ident' +#' to split by cell identity' #' see \code{\link{FetchData}} for more details #' @param cluster.idents Whether to order identities by hierarchical clusters #' based on given features, default is FALSE @@ -4265,8 +4410,8 @@ BarcodeInflectionsPlot <- function(object) { #' DotPlot <- function( object, - assay = NULL, features, + assay = NULL, cols = c("lightgrey", "blue"), col.min = -2.5, col.max = 2.5, @@ -4308,8 +4453,7 @@ DotPlot <- function( features <- unlist(x = features) names(x = feature.groups) <- features } - cells <- unlist(x = CellsByIdentities(object = object, idents = idents)) - + cells <- unlist(x = CellsByIdentities(object = object, cells = colnames(object[[assay]]), idents = idents)) data.features <- FetchData(object = object, vars = features, cells = cells) data.features$id <- if (is.null(x = group.by)) { Idents(object = object)[cells, drop = TRUE] @@ -4322,10 +4466,10 @@ DotPlot <- function( id.levels <- levels(x = data.features$id) data.features$id <- as.vector(x = data.features$id) if (!is.null(x = split.by)) { - splits <- object[[split.by, drop = TRUE]][cells, drop = TRUE] + splits <- FetchData(object = object, vars = split.by)[cells, split.by] if (split.colors) { if (length(x = unique(x = splits)) > length(x = cols)) { - stop("Not enough colors for the number of groups") + stop(paste0("Need to specify at least ", length(x = unique(x = splits)), " colors using the cols parameter")) } cols <- cols[1:length(x = unique(x = splits))] names(x = cols) <- unique(x = splits) @@ -4391,7 +4535,7 @@ DotPlot <- function( FUN = function(x) { data.use <- data.plot[data.plot$features.plot == x, 'avg.exp'] if (scale) { - data.use <- scale(x = data.use) + data.use <- scale(x = log1p(data.use)) data.use <- MinMax(data = data.use, min = col.min, max = col.max) } else { data.use <- log1p(x = data.use) @@ -4411,18 +4555,19 @@ DotPlot <- function( data.plot$pct.exp[data.plot$pct.exp < dot.min] <- NA data.plot$pct.exp <- data.plot$pct.exp * 100 if (split.colors) { - splits.use <- vapply( - X = as.character(x = data.plot$id), - FUN = gsub, - FUN.VALUE = character(length = 1L), - pattern = paste0( - '^((', - paste(sort(x = levels(x = object), decreasing = TRUE), collapse = '|'), - ')_)' - ), - replacement = '', - USE.NAMES = FALSE - ) + splits.use <- unlist(x = lapply( + X = data.plot$id, + FUN = function(x) + sub( + paste0(".*_(", + paste(sort(unique(x = splits), decreasing = TRUE), + collapse = '|' + ),")$"), + "\\1", + x + ) + ) + ) data.plot$colors <- mapply( FUN = function(color, value) { return(colorRampPalette(colors = c('grey', color))(20)[value]) @@ -4552,9 +4697,10 @@ GroupCorrelationPlot <- function( cor = "nCount_RNA_cor" ) { assay <- assay %||% DefaultAssay(object = object) - data <- object[[assay]][[c(feature.group, cor)]] + data <- object[[assay]][c(feature.group, cor)] data <- data[complete.cases(data), ] colnames(x = data) <- c('grp', 'cor') + data$grp <- as.character(data$grp) plot <- ggplot(data = data, aes_string(x = "grp", y = "cor", fill = "grp")) + geom_boxplot() + theme_cowplot() + @@ -4682,11 +4828,13 @@ JackStrawPlot <- function( #' @concept visualization #' #' @examples +#' \dontrun{ #' if (requireNamespace("ape", quietly = TRUE)) { #' data("pbmc_small") #' pbmc_small <- BuildClusterTree(object = pbmc_small) #' PlotClusterTree(object = pbmc_small) #' } +#' } PlotClusterTree <- function(object, direction = "downwards", ...) { if (!PackageCheck('ape', error = FALSE)) { stop(cluster.ape, call. = FALSE) @@ -4885,7 +5033,7 @@ AutoPointSize <- function(data, raster = NULL) { #' hexadecimal codes #' @param threshold Intensity threshold for light/dark cutoff; intensities #' greater than \code{theshold} yield \code{dark}, others yield \code{light} -#' @param w3c Use \href{http://www.w3.org/TR/WCAG20/}{W3C} formula for calculating +#' @param w3c Use \href{https://www.w3.org/TR/WCAG20/}{W3C} formula for calculating #' background text color; ignores \code{threshold} #' @param dark Color for dark text #' @param light Color for light text @@ -6528,7 +6676,8 @@ Col2Hex <- function(...) { # @param y.max Maximum y axis value # @param same.y.lims Set all the y-axis limits to the same values # @param adjust Adjust parameter for geom_violin -# @param pt.size Point size for geom_violin +# @param pt.size Point size for points +# @param alpha Alpha value for points # @param cols Colors to use for plotting # @param group.by Group (color) cells in different ways (for example, orig.ident) # @param split.by A variable to split the plot by @@ -6563,10 +6712,12 @@ ExIPlot <- function( adjust = 1, cols = NULL, pt.size = 0, + alpha = 1, group.by = NULL, split.by = NULL, log = FALSE, - slot = 'data', + slot = deprecated(), + layer = 'data', stack = FALSE, combine = TRUE, fill.by = NULL, @@ -6574,8 +6725,12 @@ ExIPlot <- function( add.noise = TRUE, raster = NULL ) { + if (is_present(arg = slot)) { + layer <- layer %||% slot + } assay <- assay %||% DefaultAssay(object = object) DefaultAssay(object = object) <- assay + cells <- Cells(x = object, assay = NULL) if (isTRUE(x = stack)) { if (!is.null(x = ncol)) { warning( @@ -6598,14 +6753,15 @@ ExIPlot <- function( no = min(length(x = features), 3) ) } - data <- FetchData(object = object, vars = features, slot = slot) + if (!is.null(x = idents)) { + cells <- intersect( + x = names(x = Idents(object = object)[Idents(object = object) %in% idents]), + y = cells + ) + } + data <- FetchData(object = object, vars = features, slot = layer, cells = cells) pt.size <- pt.size %||% AutoPointSize(data = object) features <- colnames(x = data) - if (is.null(x = idents)) { - cells <- colnames(x = object) - } else { - cells <- names(x = Idents(object = object)[Idents(object = object) %in% idents]) - } data <- data[cells, , drop = FALSE] idents <- if (is.null(x = group.by)) { Idents(object = object)[cells] @@ -6618,7 +6774,7 @@ ExIPlot <- function( if (is.null(x = split.by)) { split <- NULL } else { - split <- object[[split.by, drop = TRUE]][cells] + split <- FetchData(object,split.by)[cells,split.by] if (!is.factor(x = split)) { split <- factor(x = split) } @@ -6674,6 +6830,7 @@ ExIPlot <- function( adjust = adjust, cols = cols, pt.size = pt.size, + alpha = alpha, log = log, add.noise = add.noise, raster = raster @@ -6701,7 +6858,8 @@ ExIPlot <- function( if (length(x = obj) == 1) { if (inherits(x = object[[obj]], what = 'DimReduc')) { plots[[i]] <- plots[[i]] + label.fxn(label = 'Embeddings Value') - } else if (inherits(x = object[[obj]], what = 'Assay')) { + } else if (inherits(x = object[[obj]], what = 'Assay') || + inherits(x = object[[obj]], what = 'Assay5')) { next } else { warning("Unknown object type ", class(x = object), immediate. = TRUE, call. = FALSE) @@ -6836,14 +6994,14 @@ GeomSpatial <- ggproto( "GeomSpatial", Geom, required_aes = c("x", "y"), - extra_params = c("na.rm", "image", "image.alpha", "crop"), + extra_params = c("na.rm", "image", "image.alpha", "image.scale", "crop"), default_aes = aes( shape = 21, colour = "black", point.size.factor = 1.0, fill = NA, alpha = NA, - stroke = 0.25 + stroke = NA ), setup_data = function(self, data, params) { data <- ggproto_parent(Geom, self)$setup_data(data, params) @@ -6852,19 +7010,21 @@ GeomSpatial <- ggproto( data }, draw_key = draw_key_point, - draw_panel = function(data, panel_scales, coord, image, image.alpha, crop) { + draw_panel = function(data, panel_scales, coord, image, image.alpha, image.scale, crop) { + img.dim <- dim(image) + # This should be in native units, where # Locations and sizes are relative to the x- and yscales for the current viewport. if (!crop) { - y.transform <- c(0, nrow(x = image)) - panel_scales$y.range + y.transform <- c(0, img.dim[[1]]) - panel_scales$y.range data$y <- data$y + sum(y.transform) - panel_scales$x$continuous_range <- c(0, ncol(x = image)) - panel_scales$y$continuous_range <- c(0, nrow(x = image)) - panel_scales$y.range <- c(0, nrow(x = image)) - panel_scales$x.range <- c(0, ncol(x = image)) + panel_scales$x$continuous_range <- c(0, img.dim[[2]]) + panel_scales$y$continuous_range <- c(0, img.dim[[1]]) + panel_scales$y.range <- c(0, img.dim[[1]]) + panel_scales$x.range <- c(0, img.dim[[2]]) } z <- coord$transform( - data.frame(x = c(0, ncol(x = image)), y = c(0, nrow(x = image))), + data.frame(x = c(0, img.dim[[2]]), y = c(0, img.dim[[1]])), panel_scales ) # Flip Y axis for image @@ -6878,12 +7038,11 @@ GeomSpatial <- ggproto( height = unit(x = hgth, units = "npc"), just = c("left", "bottom") ) - img.grob <- GetImage(object = image) - img <- editGrob(grob = img.grob, vp = vp) - # spot.size <- slot(object = image, name = "spot.radius") - spot.size <- Radius(object = image) + spot.size <- Radius(object = image, scale = image.scale) coords <- coord$transform(data, panel_scales) + + img <- editGrob(grob = GetImage(image), vp = vp) pts <- pointsGrob( x = coords$x, y = coords$y, @@ -6912,7 +7071,6 @@ GeomSpatial <- ggproto( # Replacement for ggname gt$name <- grobName(grob = gt, prefix = 'geom_spatial') return(gt) - # ggplot2:::ggname("geom_spatial", gt) } ) @@ -6926,6 +7084,7 @@ geom_spatial <- function( data = NULL, image = image, image.alpha = image.alpha, + image.scale = image.scale, crop = crop, stat = "identity", position = "identity", @@ -6942,7 +7101,14 @@ geom_spatial <- function( position = position, show.legend = show.legend, inherit.aes = inherit.aes, - params = list(na.rm = na.rm, image = image, image.alpha = image.alpha, crop = crop, ...) + params = list( + na.rm = na.rm, + image = image, + image.alpha = image.alpha, + image.scale = image.scale, + crop = crop, + ... + ) ) } @@ -7706,6 +7872,8 @@ ScaleColumn <- function(vec, cutoffs) { # @param cols.highlight Colors to highlight cells as # @param col.base Base color to use for unselected cells # @param pt.size Size of unselected cells +# @param raster Convert points to raster format, default is \code{NULL} which +# automatically rasterizes if plotting more than 100,000 cells # # @return A list will cell highlight information # \describe{ @@ -7721,7 +7889,8 @@ SetHighlight <- function( sizes.highlight, cols.highlight, col.base = 'black', - pt.size = 1 + pt.size = 1, + raster = NULL ) { if (is.character(x = cells.highlight)) { cells.highlight <- list(cells.highlight) @@ -7765,6 +7934,12 @@ SetHighlight <- function( size[index.check] <- sizes.highlight[i] } } + + # Check for raster + if (isTRUE(x = raster)) { + size <- sizes.highlight[1] + } + plot.order <- sort(x = unique(x = highlight), na.last = TRUE) plot.order[is.na(x = plot.order)] <- 'Unselected' highlight[is.na(x = highlight)] <- 'Unselected' @@ -7802,7 +7977,7 @@ globalVariables(names = '..density..', package = 'Seurat') #' @param cols An optional vector of colors to use #' @param pt.size Point size for the plot #' @param smooth Make a smoothed scatter plot -#' @param rows.highight A vector of rows to highlight (like cells.highlight in +#' @param rows.highlight A vector of rows to highlight (like cells.highlight in #' \code{\link{SingleDimPlot}}) #' @param legend.title Optional legend title #' @param raster Convert points to raster format, default is \code{NULL} @@ -7842,7 +8017,7 @@ SingleCorPlot <- function( jitter = TRUE ) { pt.size <- pt.size %||% AutoPointSize(data = data, raster = raster) - if ((nrow(x = data) > 1e5) & !isFALSE(raster)){ + if ((nrow(x = data) > 1e5) & is.null(x = raster)){ message("Rasterizing points since number of points exceeds 100,000.", "\nTo disable this behavior set `raster=FALSE`") } @@ -7864,6 +8039,12 @@ SingleCorPlot <- function( x = colnames(x = data), fixed = TRUE ) + names.plot <- colnames(x = data) <- gsub( + pattern = ' ', + replacement = '.', + x = colnames(x = data), + fixed = TRUE + ) if (ncol(x = data) < 2) { msg <- "Too few variables passed" if (ncol(x = data) == 1) { @@ -7884,7 +8065,8 @@ SingleCorPlot <- function( sizes.highlight = pt.size, cols.highlight = 'red', col.base = 'black', - pt.size = pt.size + pt.size = pt.size, + raster = raster ) cols <- highlight.info$color col.by <- factor( @@ -7998,6 +8180,7 @@ SingleCorPlot <- function( #' @param shape.by If NULL, all points are circles (default). You can specify #' any cell attribute (that can be pulled with \code{\link{FetchData}}) #' allowing for both different colors and different shapes on cells. +#' @param alpha Alpha value for plotting (default is 1) #' @param alpha.by Mapping variable for the point alpha value #' @param order Specify the order of plotting for the idents. This can be #' useful for crowded plots if points of interest are being buried. Provide @@ -8039,6 +8222,7 @@ SingleDimPlot <- function( cols = NULL, pt.size = NULL, shape.by = NULL, + alpha = 1, alpha.by = NULL, order = NULL, label = FALSE, @@ -8051,12 +8235,17 @@ SingleDimPlot <- function( raster = NULL, raster.dpi = NULL ) { - pt.size <- pt.size %||% AutoPointSize(data = data, raster = raster) - if ((nrow(x = data) > 1e5) & !isFALSE(raster)){ + if ((nrow(x = data) > 1e5) & is.null(x = raster)){ message("Rasterizing points since number of points exceeds 100,000.", "\nTo disable this behavior set `raster=FALSE`") } raster <- raster %||% (nrow(x = data) > 1e5) + pt.size <- pt.size %||% AutoPointSize(data = data, raster = raster) + + if (!is.null(x = cells.highlight) && pt.size != AutoPointSize(data = data, raster = raster) && sizes.highlight != pt.size && isTRUE(x = raster)) { + warning("When `raster = TRUE` highlighted and non-highlighted cells must be the same size. Plot will use the value provided to 'sizes.highlight'.") + } + if (!is.null(x = raster.dpi)) { if (!is.numeric(x = raster.dpi) || length(x = raster.dpi) != 2) stop("'raster.dpi' must be a two-length numeric vector") @@ -8083,7 +8272,8 @@ SingleDimPlot <- function( sizes.highlight = sizes.highlight %||% pt.size, cols.highlight = cols.highlight, col.base = cols[1] %||% '#C3C3C3', - pt.size = pt.size + pt.size = pt.size, + raster = raster ) order <- highlight.info$plot.order data$highlight <- highlight.info$highlight @@ -8149,6 +8339,7 @@ SingleDimPlot <- function( alpha = alpha.by ), pointsize = pt.size, + alpha = alpha, pixels = raster.dpi ) } else { @@ -8160,11 +8351,12 @@ SingleDimPlot <- function( shape = shape.by, alpha = alpha.by ), - size = pt.size + size = pt.size, + alpha = alpha ) } plot <- plot + - guides(color = guide_legend(override.aes = list(size = 3))) + + guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) + labs(color = NULL, title = col.by) + CenterTitle() if (label && !is.null(x = col.by)) { @@ -8201,6 +8393,7 @@ SingleDimPlot <- function( #' @param y.max Maximum Y value to plot #' @param adjust Adjust parameter for geom_violin #' @param pt.size Size of points for violin plots +#' @param alpha Alpha vlaue for violin plots #' @param cols Colors to use for plotting #' @param seed.use Random seed to use. If NULL, don't set a seed #' @param log plot Y axis on log10 scale @@ -8231,6 +8424,7 @@ SingleExIPlot <- function( y.max = NULL, adjust = 1, pt.size = 0, + alpha = 1, cols = NULL, seed.use = 42, log = FALSE, @@ -8245,7 +8439,7 @@ SingleExIPlot <- function( if (PackageCheck('ggrastr', error = FALSE)) { # Set rasterization to true if ggrastr is installed and # number of points exceeds 100,000 - if ((nrow(x = data) > 1e5) & !isFALSE(raster)){ + if ((nrow(x = data) > 1e5) & is.null(x = raster)){ message("Rasterizing points since number of points exceeds 100,000.", "\nTo disable this behavior set `raster=FALSE`") # change raster to TRUE @@ -8315,21 +8509,23 @@ SingleExIPlot <- function( ) if (is.null(x = split)) { if (isTRUE(x = raster)) { - jitter <- ggrastr::rasterize(geom_jitter(height = 0, size = pt.size, show.legend = FALSE)) + jitter <- ggrastr::rasterize(geom_jitter(height = 0, size = pt.size, alpha = alpha, show.legend = FALSE)) } else { - jitter <- geom_jitter(height = 0, size = pt.size, show.legend = FALSE) + jitter <- geom_jitter(height = 0, size = pt.size, alpha = alpha, show.legend = FALSE) } } else { if (isTRUE(x = raster)) { jitter <- ggrastr::rasterize(geom_jitter( position = position_jitterdodge(jitter.width = 0.4, dodge.width = 0.9), size = pt.size, + alpha = alpha, show.legend = FALSE )) } else { jitter <- geom_jitter( position = position_jitterdodge(jitter.width = 0.4, dodge.width = 0.9), size = pt.size, + alpha = alpha, show.legend = FALSE ) } @@ -8348,7 +8544,7 @@ SingleExIPlot <- function( scale_y_discrete(expand = c(0.01, 0)), scale_x_continuous(expand = c(0, 0)) ) - jitter <- geom_jitter(width = 0, size = pt.size, show.legend = FALSE) + jitter <- geom_jitter(width = 0, size = pt.size, alpha = alpha, show.legend = FALSE) log.scale <- scale_x_log10() axis.scale <- function(...) { invisible(x = NULL) @@ -8783,12 +8979,16 @@ SingleRasterMap <- function( #' colors #' @param image.alpha Adjust the opacity of the background images. Set to 0 to #' remove. +#' @param image.scale Choose the scale factor ("lowres"/"hires") to apply in +#' order to matchthe plot with the specified `image` - defaults to "lowres" #' @param pt.alpha Adjust the opacity of the points if plotting a #' \code{SpatialDimPlot} #' @param crop Crop the plot in to focus on points plotted. Set to \code{FALSE} #' to show entire background image. #' @param pt.size.factor Sets the size of the points relative to spot.radius #' @param stroke Control the width of the border around the spots +#' @param shape Control the shape of the spots - same as the ggplot2 parameter. +#' The default is 21, which plots cirlces - use 22 to plot squares. #' @param col.by Mapping variable for the point color #' @param alpha.by Mapping variable for the point alpha value #' @param cells.highlight A list of character or numeric vectors of cells to @@ -8817,10 +9017,12 @@ SingleSpatialPlot <- function( image, cols = NULL, image.alpha = 1, + image.scale = "lowres", pt.alpha = NULL, crop = TRUE, pt.size.factor = NULL, - stroke = 0.25, + shape = 21, + stroke = NA, col.by = NULL, alpha.by = NULL, cells.highlight = NULL, @@ -8864,7 +9066,9 @@ SingleSpatialPlot <- function( data = data, image = image, image.alpha = image.alpha, + image.scale = image.scale, crop = crop, + shape = shape, stroke = stroke, ) } else { @@ -8873,7 +9077,9 @@ SingleSpatialPlot <- function( data = data, image = image, image.alpha = image.alpha, + image.scale = image.scale, crop = crop, + shape = shape, stroke = stroke, alpha = pt.alpha ) @@ -8882,7 +9088,14 @@ SingleSpatialPlot <- function( }, 'interactive' = { plot + geom_spatial_interactive( - data = tibble(grob = list(GetImage(object = image, mode = 'grob'))), + data = tibble( + grob = list( + GetImage( + object = image, + mode = 'grob' + ) + ) + ), mapping = aes_string(grob = 'grob'), x = 0.5, y = 0.5 @@ -8897,7 +9110,11 @@ SingleSpatialPlot <- function( data[, c('x', 'y')] <- NULL data <- merge( x = data, - y = GetTissueCoordinates(object = image, qhulls = TRUE), + y = GetTissueCoordinates( + object = image, + qhulls = TRUE, + scale = image.scale + ), by = "cell" ) plot + geom_polygon( diff --git a/R/zzz.R b/R/zzz.R index 49e317374..0063b2fa0 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,4 +1,23 @@ #' @importFrom progressr progressor +#' @importFrom methods slot slot<- +#' @importFrom lifecycle deprecated deprecate_soft deprecate_stop +#' deprecate_warn is_present +#' @importFrom rlang !!! +#' abort +#' arg_match +#' arg_match0 +#' as_name +#' caller_env +#' check_installed +#' enquo +#' inform +#' is_integerish +#' is_na +#' is_quosure +#' is_scalar_integerish +#' quo_get_env +#' quo_get_expr +#' warn #' NULL @@ -39,11 +58,29 @@ seurat_default_options <- list( Seurat.memsafe = FALSE, Seurat.warn.umap.uwot = TRUE, Seurat.checkdots = "warn", - Seurat.limma.wilcox.msg = TRUE, + Seurat.presto.wilcox.msg = TRUE, #CHANGE Seurat.Rfast2.msg = TRUE, - Seurat.warn.vlnplot.split = TRUE + Seurat.warn.vlnplot.split = TRUE, + Seurat.object.assay.version = "v5" ) + +#' @importFrom methods setClassUnion +#' @importClassesFrom Matrix dgCMatrix +#' +NULL + +setClassUnion(name = 'V3Matrix', members = c('matrix', 'dgCMatrix')) + +AttachDeps <- function(deps) { + for (d in deps) { + if (!paste0('package:', d) %in% search()) { + packageStartupMessage("Attaching ", d) + attachNamespace(ns = d) + } + } +} + #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Hooks #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -60,6 +97,7 @@ seurat_default_options <- list( x = names(x = seurat_default_options), y = names(x = options()) ) + # toset <- names(x = seurat_default_options) if (length(x = toset)) { options(seurat_default_options[toset]) } diff --git a/README.md b/README.md index 427ab3a20..370cd40b7 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,15 @@ [![CRAN Version](https://www.r-pkg.org/badges/version/Seurat)](https://cran.r-project.org/package=Seurat) [![CRAN Downloads](https://cranlogs.r-pkg.org/badges/Seurat)](https://cran.r-project.org/package=Seurat) -# Seurat v4.3.0 + +# Seurat v5 Seurat is an R toolkit for single cell genomics, developed and maintained by the Satija Lab at NYGC. +We are excited to release Seurat v5! This updates introduces new functionality for spatial, multimodal, and scalable single-cell analysis. + +Seurat v5 is backwards-compatible with previous versions, so that users will continue to be able to re-run existing workflows. + Instructions, documentation, and tutorials can be found at: * https://satijalab.org/seurat diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 17c2a9464..90b8e0d5e 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -14,70 +14,72 @@ navbar: title: "Seurat" left: - text: "Install" - href: articles/install.html + href: articles/install_v5.html - text: "Get started" - href: articles/get_started.html + href: articles/get_started_v5_new.html - text: "Vignettes" menu: - text: Introductory Vignettes - text: "PBMC 3K guided tutorial" href: articles/pbmc3k_tutorial.html + - text: "Data visualization vignette" + href: articles/visualization_vignette.html + - text: "SCTransform, v2 regularization" + href: articles/sctransform_vignette.html - text: "Using Seurat with multi-modal data" href: articles/multimodal_vignette.html - - text: "Analysis, visualization, and integration of spatial datasets with Seurat" - href: articles/spatial_vignette.html + - text: "Seurat v5 Command Cheat Sheet" + href: articles/essential_commands.html - text: ------- - text: Data Integration - text: "Introduction to scRNA-seq integration" href: articles/integration_introduction.html + - text: "Integrative analysis in Seurat v5" + href: articles/seurat5_integration.html - text: "Mapping and annotating query datasets" href: articles/integration_mapping.html - - text: "Fast integration using reciprocal PCA (RPCA)" - href: articles/integration_rpca.html - - text: "Tips for integrating large datasets" - href: articles/integration_large_datasets.html - - text: "Integrating scRNA-seq and scATAC-seq data" - href: articles/atacseq_integration_vignette.html - - text: "Multimodal reference mapping" - href: articles/multimodal_reference_mapping.html - text: ------- - - text: New Statistical Methods + - text: Multi-assay data + - text: "Dictionary Learning for cross-modality integration" + href: articles/seurat5_integration_bridge.html - text: "Weighted Nearest Neighbor Analysis" href: articles/weighted_nearest_neighbor_analysis.html + - text: "Integrating scRNA-seq and scATAC-seq data" + href: articles/seurat5_atacseq_integration_vignette.html + - text: "Multimodal reference mapping" + href: articles/multimodal_reference_mapping.html - text: "Mixscape Vignette" href: articles/mixscape_vignette.html - - text: "Using sctransform in Seurat" - href: articles/sctransform_vignette.html - - text: "SCTransform, v2 regularization" - href: articles/sctransform_v2_vignette.html + - text: ------- + - text: Massively scalable analysis + - text: "Sketch-based analysis in Seurat v5" + href: articles/seurat5_sketch_analysis.html + - text: "Sketch integration using a 1 million cell dataset from Parse Biosciences" + href: articles/ParseBio_sketch_integration.html + - text: "Map COVID PBMC datasets to a healthy reference" + href: articles/COVID_SCTMapping.html + - text: "BPCells Interaction" + href: articles/seurat5_bpcells_interaction_vignette.html + - text: ------- + - text: Spatial analysis + - text: "Analysis of spatial datasets (Imaging-based)" + href: articles/seurat5_spatial_vignette_2.html + - text: "Analysis of spatial datasets (Sequencing-based)" + href: articles/spatial_vignette.html - text: ------- - text: Other - - text: "Data visualization vignette" - href: articles/visualization_vignette.html - text: "Cell-cycle scoring and regression" href: articles/cell_cycle_vignette.html - text: "Differential expression testing" href: articles/de_vignette.html - text: "Demultiplexing with hashtag oligos (HTOs)" href: articles/hashing_vignette.html - - text: "Interoperability between single-cell object formats" - href: articles/conversion_vignette.html - - text: "Parallelization in Seurat with future" - href: articles/future_vignette.html - - text: "Dimensional reduction vignette" - href: articles/dim_reduction_vignette.html - - text: "Seurat essential commands list" - href: articles/essential_commands.html - - text: "Seurat interaction tips" - href: articles/interaction_vignette.html - - text: "Merging Seurat objects" - href: articles/merge_vignette.html - text: Extensions href: articles/extensions.html - text: FAQ href: "https://github.com/satijalab/seurat/discussions" - text: "News" - href: news/index.html + href: articles/announcements.html - text: "Reference" href: reference/index.html - text: "Archive" @@ -126,6 +128,11 @@ reference: - contents: - has_concept("spatial") +- title: "Sketching" + desc: "Functions for flexible analysis of massively scalable datasets" +- contents: + - has_concept("sketching") + - title: "Mixscape" desc: > Functions related to the mixscape algorithm @@ -157,7 +164,21 @@ reference: - contents: - has_concept("convenience") +- title: "Sketching" + desc: "Functions for flexible analysis of massively scalable datasets" +- contents: + - has_concept("sketching") + - title: "Re-exports" desc: "Functions re-exported from other packages" - contents: - reexports + - Seurat-class + - Assay-class + - DimReduc-class + - Neighbor-class + - Graph-class + - SpatialImage-class + - JackStrawData-class + - SeuratCommand-class + - Seurat-package diff --git a/cran-comments.md b/cran-comments.md index 4da97cd64..09e951b1f 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,30 +1,31 @@ -# Seurat v4.2.1 +# Seurat v5.1.0 ## Test environments -* local Ubuntu 20.04 install, R 4.1.3 -* win-builder (release, devel) +* local ubuntu 20.04 install, R 4.3.2 +* local macOS 14.1, R 4.4.0 +* win-builder (oldrelease, release, devel) ## R CMD check results + There were no ERRORs or WARNINGs -There was one NOTE: +There were two NOTEs -* checking CRAN incoming feasibility ... NOTE -Maintainer: 'Paul Hoffman ' +> Suggests or Enhances not in mainstream repositories: +> BPCells, presto +> Availability using Additional_repositories specification: +> BPCells yes https://bnprks.r-universe.dev +> presto yes https://satijalab.r-universe.dev -Found the following (possibly) invalid URLs: - URL: https://www.science.org/doi/abs/10.1126/science.aad0501 - From: man/cc.genes.Rd - man/cc.genes.updated.2019.Rd - Status: 503 - Message: Service Unavailable +> * checking package dependencies ... NOTE +> Package suggested but not available for checking: 'BPCells', 'presto' -This URL is valid and the service still exists. When navigating to the URL either via the documentation or directly, you are taken to the correct article +BPCells and presto are hosted on R-universe and used conditionally in Seurat. ## Downstream dependencies -There no packages that depend on Seurat +There are three packages that depend on Seurat: CACIMAR, scCustomize, and SCdeconR; this update does not impact their functionality -There are sixteen packages that import Seurat: CAMML, CIDER, DR.SC, DUBStepR, maple, Platypus, rPanglaoDB, scDiffCom, scMappR, SCRIP, scRNAstat, Signac, SignacX, SoupX, spruce, and tidyseurat; this update does not impact their functionality +There are 30 packages that import Seurat: AnanseSeurat, APackOfTheClones, bbknnR, CAMML, DR.SC, DWLS, GeneNMF, ggsector, mixhvg, nebula, Platypus, PRECAST, ProFAST, rPanglaoDB, scAnnotate, scaper, sccca, scDiffCom, scGate, scGOclust, scMappR, scperturbR, scpoisson, SCRIP, scRNAstat, SignacX, SoupX, SPECK, STREAK, and tidyseurat; this update does not impact their functionality -There are twelve packages that suggest Seurat: BisqueRNA, CIARA, ClustAssess, clustree, conos, DIscBIO, dyngen, harmony, rliger, Rmagic, treefit, and VAM; this update does not impact their functionality. +There are 22 packages that suggest Seurat: BisqueRNA, Canek, cellpypes, CIARA, ClustAssess, clustree, combiroc, conos, countland, CRMetrics, CytoSimplex, DIscBIO, dyngen, grandR, harmony, RESET, rliger, SCORPIUS, SCpubr, Signac, treefit, and VAM; this update does not impact their functionality diff --git a/index.md b/index.md index 7925418e9..33f874f55 100644 --- a/index.md +++ b/index.md @@ -1,36 +1,50 @@ ![](articles/assets/seurat_banner.jpg) -# Official release of Seurat 4.0 +## **Seurat v5** -We are excited to release Seurat v4.0! This update brings the following new features and functionality: +We are excited to release Seurat v5! To install, please follow the instructions in our [install page](install.html). This update brings the following new features and functionality: -* **Integrative multimodal analysis.** The ability to make simultaneous measurements of multiple data types from the same cell, known as multimodal analysis, represents a new and exciting frontier for single-cell genomics. In Seurat v4, we introduce weighted nearest neighbor (WNN) analysis, an unsupervised strategy to learn the information content of each modality in each cell, and to define cellular state based on a weighted combination of both modalities. - In our new paper, we generate a CITE-seq dataset featuring paired measurements of the transcriptome and 228 surface proteins, and leverage WNN to define a multimodal reference of human PBMC. You can use WNN to analyze multimodal data from a variety of technologies, including CITE-seq, ASAP-seq, 10X Genomics ATAC + RNA, and SHARE-seq. +* **Integrative multimodal analysis:** The cellular transcriptome is just one aspect of cellular identity, and recent technologies enable routine profiling of chromatin accessibility, histone modifications, and protein levels from single cells. In Seurat v5, we introduce 'bridge integration', a statistical method to integrate experiments measuring different modalities (i.e. separate scRNA-seq and scATAC-seq datasets), using a separate multiomic dataset as a molecular 'bridge'. For example, we demonstrate how to map scATAC-seq datasets onto scRNA-seq datasets, to assist users in interpreting and annotating data from new modalities.\ +\ +We recognize that while the goal of matching shared cell types across datasets may be important for many problems, users may also be concerned about which method to use, or that integration could result in a loss of biological resolution. In Seurat v5, we also introduce flexible and streamlined workflows for the integration of multiple scRNA-seq datasets. This makes it easier to explore the results of different integration methods, and to compare these results to a workflow that excludes integration steps. - - Paper: [Integrated analysis of multimodal single-cell data](https://doi.org/10.1016/j.cell.2021.04.048) - - Vignette: [Multimodal clustering of a human bone marrow CITE-seq dataset](articles/weighted_nearest_neighbor_analysis.html) - - Portal: [Click here](https://atlas.fredhutch.org/nygc/multimodal-pbmc/) - - Dataset: [Download here](https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat) + - Paper: [Dictionary learning for integrative, multimodal, and scalable single-cell analysis](https://doi.org/10.1101/2022.02.24.481684) + - Vignette: [Streamlined integration of scRNA-seq data](articles/seurat5_integration.html) + - Vignette: [Cross-modality bridge integration](articles/seurat5_integration_bridge.html) + - Website: [Azimuth-ATAC, reference-mapping for scATAC-seq datasets](https://azimuth.hubmapconsortium.org/references/)\ +\ +* **Flexible, interactive, and highly scalable analsyis:** The size and scale of single-cell sequencing datasets is rapidly increasing, outpacing even Moore's law. In Seurat v5, we introduce new infrastructure and methods to analyze, interpret, and explore exciting datasets spanning millions of cells, even if they cannot be fully loaded into memory. We introduce support for 'sketch'-based analysis, where representative subsamples of a large dataset are stored in-memory to enable rapid and iterative analysis - while the full dataset remains accessible via on-disk storage.\ +\ +We enable high-performance via the BPCells package, developed by Ben Parks in the Greenleaf Lab. The BPCells package enables high-performance analysis via innovative bit-packing compression techniques, optimized C++ code, and use of streamlined and lazy operations. -* **Rapid mapping of query datasets to references.** We introduce Azimuth, a workflow to leverage high-quality reference datasets to rapidly map new scRNA-seq datasets (queries). For example, you can map any scRNA-seq dataset of human PBMC onto our reference, automating the process of visualization, clustering annotation, and differential expression. Azimuth can be run within Seurat, or using a standalone web application that requires no installation or programming experience. + - Vignette: [Sketch-based clustering of 1.3M brain cells (10x Genomics)](articles/seurat5_sketch_analysis.html) + - Vignette: [Sketch-based integration of 1M healthy and diabetic PBMC (Parse Biosciences)](articles/ParseBio_sketch_integration.html) + - Vignette: [Mapping 1.5M cells from multiple studies to an Azimuth reference](articles/COVID_SCTMapping.html) + - Vignette: [Interacting with BPCell matrices in Seurat v5](articles/seurat5_bpcells_interaction_vignette.html) + - BPCells R Package: [Scaling Single Cell Analysis to Millions of Cells](https://bnprks.github.io/BPCells/)\ +\ +* **Analysis of sequencing and imaging-based spatial datasets:** Spatially resolved datasets are redefining our understanding of cellular interactions and the organization of human tissues. Both sequencing-based(i.e. Visium, SLIDE-seq, etc.), and imaging-based (MERFISH/Vizgen, Xenium, CosMX, etc.) technologies have unique advantages, and require tailored analytical methods and software infrastructure. In Seurat v5, we introduce flexible and diverse support for a wide variety of spatially resolved data types, and support for analytical techniqiues for scRNA-seq integration, deconvolution, and niche identification. - - Vignette: [Mapping scRNA-seq queries onto reference datasets](articles/multimodal_reference_mapping.html) - - Web app: [Automated mapping, visualization, and annotation of scRNA-seq datasets from human PBMC](https://azimuth.hubmapconsortium.org/) + - Vignette: [Analysis of spatial datasets (Sequencing-based)](articles/seurat5_spatial_vignette.html) + - Vignette: [Analysis of spatial datasets (Imaging-based)](articles/seurat5_spatial_vignette_2.html)\ +\ +* **Backwards compatibility:** While Seurat v5 introduces new functionality, we have ensured that the software is backwards-compatible with previous versions, so that users will continue to be able to re-run existing workflows. Previous versions of Seurat, such as Seurat v4, can also be installed following the instructions in our [install page](install.html).\ -Additional speed and usability updates: We have made minor changes in v4, primarily to improve the performance of Seurat v4 on large datasets. These changes substantially improve the speed and memory requirements, but do not adversely impact downstream results. We provide a detailed description of key changes [here](articles/v4_changes.html). Users who wish to fully reproduce existing results can continue to do so by continuing to install Seurat v3. +## **Changes between v4 and v5** -We believe that users who are familiar with Seurat v3 should experience a smooth transition to Seurat v4. While we have introduced extensive new functionality, existing workflows, functions, and syntax are largely unchanged in this update. In addition, Seurat objects that have been previously generated in Seurat v3 can be seamlessly loaded into Seurat v4 for further analysis. +We have documented major changes between Seurat v4 and v5 in our [News page](announcements.html) for reference. -# About Seurat +## **About Seurat** Seurat is an R package designed for QC, analysis, and exploration of single-cell RNA-seq data. Seurat aims to enable users to identify and interpret sources of heterogeneity from single-cell transcriptomic measurements, and to integrate diverse types of single-cell data. If you use Seurat in your research, please considering citing: -* [Hao\*, Hao\*, et al., Cell 2021](https://doi.org/10.1016/j.cell.2021.04.048) [Seurat V4] -* [Stuart\*, Butler\*, et al., Cell 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) [Seurat V3] -* [Butler\* et al., Nat Biotechnol 2018](https://doi.org/10.1038/nbt.4096) [Seurat V2] -* [Satija\*, Farrell\*, et al., Nat Biotechnol 2015](https://doi.org/10.1038/nbt.3192) [Seurat V1] +* [Hao, et al., Nature Biotechnology 2023](https://www.nature.com/articles/s41587-023-01767-y) [Seurat v5] +* [Hao\*, Hao\*, et al., Cell 2021](https://doi.org/10.1016/j.cell.2021.04.048) [Seurat v4] +* [Stuart\*, Butler\*, et al., Cell 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) [Seurat v3] +* [Butler, et al., Nat Biotechnol 2018](https://doi.org/10.1038/nbt.4096) [Seurat v2] +* [Satija\*, Farrell\*, et al., Nat Biotechnol 2015](https://doi.org/10.1038/nbt.3192) [Seurat v1] All methods emphasize clear, attractive, and interpretable visualizations, and were designed to be [easily used](articles/get_started.html) by both dry-lab and wet-lab researchers. diff --git a/inst/CITATION b/inst/CITATION index 4aaa4d01f..76d447bbd 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,31 +1,55 @@ citHeader("To cite Seurat in publications, please use:") -citEntry(entry = "article", - author = personList(as.person("Yuhan Hao"), - as.person("Stephanie Hao"), - as.person("Erica Andersen-Nissen"), - as.person("William M. Mauck III"), - as.person("Shiwei Zheng"), - as.person("Andrew Butler"), - as.person("Maddie J. Lee"), - as.person("Aaron J. Wilk"), - as.person("Charlotte Darby"), - as.person("Michael Zagar"), - as.person("Paul Hoffman"), - as.person("Marlon Stoeckius"), - as.person("Efthymia Papalexi"), - as.person("Eleni P. Mimitou"), - as.person("Jaison Jain"), - as.person("Avi Srivastava"), - as.person("Tim Stuart"), - as.person("Lamar B. Fleming"), - as.person("Bertrand Yeung"), - as.person("Angela J. Rogers"), - as.person("Juliana M. McElrath"), - as.person("Catherine A. Blish"), - as.person("Raphael Gottardo"), - as.person("Peter Smibert"), - as.person("Rahul Satija")), +bibentry(bibtype = "article", + author = c( + as.person("Yuhan Hao"), + as.person("Tim Stuart"), + as.person("Madeline H Kowalski"), + as.person("Saket Choudhary"), + as.person("Paul Hoffman"), + as.person("Austin Hartman"), + as.person("Avi Srivastava"), + as.person("Gesmira Molla"), + as.person("Shaista Madad"), + as.person("Carlos Fernandez-Granda"), + as.person("Rahul Satija") + ), + title = "Dictionary learning for integrative, multimodal and scalable single-cell analysis", + journal = "Nature Biotechnology", + year = "2023", + doi = "10.1038/s41587-023-01767-y", + url = "https://doi.org/10.1038/s41587-023-01767-y", + textVersion = "Hao et al. Dictionary learning for integrative, multimodal and scalable single-cell analysis. Nature Biotechnology (2023) [Seurat V5]" +) + +bibentry(bibtype = "article", + author = c( + as.person("Yuhan Hao"), + as.person("Stephanie Hao"), + as.person("Erica Andersen-Nissen"), + as.person("William M. Mauck III"), + as.person("Shiwei Zheng"), + as.person("Andrew Butler"), + as.person("Maddie J. Lee"), + as.person("Aaron J. Wilk"), + as.person("Charlotte Darby"), + as.person("Michael Zagar"), + as.person("Paul Hoffman"), + as.person("Marlon Stoeckius"), + as.person("Efthymia Papalexi"), + as.person("Eleni P. Mimitou"), + as.person("Jaison Jain"), + as.person("Avi Srivastava"), + as.person("Tim Stuart"), + as.person("Lamar B. Fleming"), + as.person("Bertrand Yeung"), + as.person("Angela J. Rogers"), + as.person("Juliana M. McElrath"), + as.person("Catherine A. Blish"), + as.person("Raphael Gottardo"), + as.person("Peter Smibert"), + as.person("Rahul Satija") + ), title = "Integrated analysis of multimodal single-cell data", journal = "Cell", year = "2021", @@ -34,17 +58,19 @@ citEntry(entry = "article", textVersion = "Hao and Hao et al. Integrated analysis of multimodal single-cell data. Cell (2021) [Seurat V4]" ) -citEntry(entry = "article", - author = personList(as.person("Tim Stuart"), - as.person("Andrew Butler"), - as.person("Paul Hoffman"), - as.person("Christoph Hafemeister"), - as.person("Efthymia Papalexi"), - as.person("William M Mauck III"), - as.person("Yuhan Hao"), - as.person("Marlon Stoeckius"), - as.person("Peter Smibert"), - as.person("Rahul Satija")), +bibentry(bibtype = "article", + author = c( + as.person("Tim Stuart"), + as.person("Andrew Butler"), + as.person("Paul Hoffman"), + as.person("Christoph Hafemeister"), + as.person("Efthymia Papalexi"), + as.person("William M Mauck III"), + as.person("Yuhan Hao"), + as.person("Marlon Stoeckius"), + as.person("Peter Smibert"), + as.person("Rahul Satija") + ), title = "Comprehensive Integration of Single-Cell Data", journal = "Cell", year = "2019", @@ -55,12 +81,14 @@ citEntry(entry = "article", textVersion = "Stuart and Butler et al. Comprehensive Integration of Single-Cell Data. Cell (2019) [Seurat V3]" ) -citEntry(entry = "article", - author = personList(as.person("Andrew Butler"), - as.person("Paul Hoffman"), - as.person("Peter Smibert"), - as.person("Efthymia Papalexi"), - as.person("Rahul Satija")), +bibentry(bibtype = "article", + author = c( + as.person("Andrew Butler"), + as.person("Paul Hoffman"), + as.person("Peter Smibert"), + as.person("Efthymia Papalexi"), + as.person("Rahul Satija") + ), title = "Integrating single-cell transcriptomic data across different conditions, technologies, and species", journal = "Nature Biotechnology", year = "2018", @@ -71,12 +99,14 @@ citEntry(entry = "article", textVersion = "Butler et al. Integrating single-cell transcriptomic data across different conditions, technologies, and species. Nat Biotechnol (2018) [Seurat V2]" ) -citEntry(entry = "article", - author = personList(as.person("Rahul Satija"), - as.person("Jeffrey A Farrell"), - as.person("David Gennert"), - as.person("Alexander F Schier"), - as.person("Aviv Regev")), +bibentry(bibtype = "article", + author = c( + as.person("Rahul Satija"), + as.person("Jeffrey A Farrell"), + as.person("David Gennert"), + as.person("Alexander F Schier"), + as.person("Aviv Regev") + ), title = "Spatial reconstruction of single-cell gene expression data", journal = "Nature Biotechnology", year = "2015", diff --git a/man/AddAzimuthResults.Rd b/man/AddAzimuthResults.Rd index 2044e87b2..8f8a6e206 100644 --- a/man/AddAzimuthResults.Rd +++ b/man/AddAzimuthResults.Rd @@ -25,3 +25,4 @@ object <- AddAzimuthResults(object, filename = "azimuth_results.Rds") } } +\concept{utilities} diff --git a/man/AddAzimuthScores.Rd b/man/AddAzimuthScores.Rd index a49350531..3f1338df8 100644 --- a/man/AddAzimuthScores.Rd +++ b/man/AddAzimuthScores.Rd @@ -24,3 +24,4 @@ object <- AddAzimuthScores(object, filename = "azimuth_pred.tsv") } } +\keyword{internal} diff --git a/man/AddModuleScore.Rd b/man/AddModuleScore.Rd index 141dd6a5a..efafd56d2 100644 --- a/man/AddModuleScore.Rd +++ b/man/AddModuleScore.Rd @@ -15,6 +15,7 @@ AddModuleScore( name = "Cluster", seed = 1, search = FALSE, + slot = "data", ... ) } @@ -48,6 +49,8 @@ programs, the results will be stored as \code{name1}, \code{name2}, don't match features in \code{object}? Searches the HGNC's gene names database; see \code{\link{UpdateSymbolList}} for more details} +\item{slot}{Slot to calculate score values off of. Defaults to data slot (i.e log-normalized counts)} + \item{...}{Extra parameters passed to \code{\link{UpdateSymbolList}}} } \value{ diff --git a/man/AggregateExpression.Rd b/man/AggregateExpression.Rd index d54854e8e..71983b4de 100644 --- a/man/AggregateExpression.Rd +++ b/man/AggregateExpression.Rd @@ -11,7 +11,9 @@ AggregateExpression( return.seurat = FALSE, group.by = "ident", add.ident = NULL, - slot = "data", + normalization.method = "LogNormalize", + scale.factor = 10000, + margin = 1, verbose = TRUE, ... ) @@ -25,41 +27,40 @@ AggregateExpression( \item{return.seurat}{Whether to return the data as a Seurat object. Default is FALSE} -\item{group.by}{Categories for grouping (e.g, ident, replicate, celltype); 'ident' by default} +\item{group.by}{Category (or vector of categories) for grouping (e.g, ident, replicate, celltype); 'ident' by default +To use multiple categories, specify a vector, such as c('ident', 'replicate', 'celltype')} -\item{add.ident}{(Deprecated) Place an additional label on each cell prior to pseudobulking -(very useful if you want to observe cluster pseudobulk values, separated by replicate, for example)} +\item{add.ident}{(Deprecated). Place an additional label on each cell prior to pseudobulking} -\item{slot}{Slot(s) to use; if multiple slots are given, assumed to follow -the order of 'assays' (if specified) or object's assays} +\item{normalization.method}{Method for normalization, see \code{\link{NormalizeData}}} + +\item{scale.factor}{Scale factor for normalization, see \code{\link{NormalizeData}}} + +\item{margin}{Margin to perform CLR normalization, see \code{\link{NormalizeData}}} \item{verbose}{Print messages and show progress bar} -\item{...}{Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}#'} +\item{...}{Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}} } \value{ Returns a matrix with genes as rows, identity classes as columns. If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. } \description{ -Returns aggregated (summed) expression values for each identity class +Returns summed counts ("pseudobulk") for each identity class. } \details{ -If slot is set to 'data', this function assumes that the data has been log -normalized and therefore feature values are exponentiated prior to aggregating -so that sum is done in non-log space. Otherwise, if slot is set to -either 'counts' or 'scale.data', no exponentiation is performed prior to -aggregating -If \code{return.seurat = TRUE} and slot is not 'scale.data', aggregated values -are placed in the 'counts' slot of the returned object and the log of aggregated values -are placed in the 'data' slot. For the \code{\link{ScaleData}} is then run on the default assay +If \code{return.seurat = TRUE}, aggregated values are placed in the 'counts' +layer of the returned object. The data is then normalized by running \code{\link{NormalizeData}} +on the aggregated counts. \code{\link{ScaleData}} is then run on the default assay before returning the object. -If \code{return.seurat = TRUE} and slot is 'scale.data', the 'counts' slot is left empty, -the 'data' slot is filled with NA, and 'scale.data' is set to the aggregated values. } \examples{ +\dontrun{ data("pbmc_small") -head(AggregateExpression(object = pbmc_small)) +head(AggregateExpression(object = pbmc_small)$RNA) +head(AggregateExpression(object = pbmc_small, group.by = c('ident', 'groups'))$RNA) +} } \concept{utilities} diff --git a/man/AnchorSet-class.Rd b/man/AnchorSet-class.Rd index 25c7e4962..37c7687d0 100644 --- a/man/AnchorSet-class.Rd +++ b/man/AnchorSet-class.Rd @@ -28,6 +28,8 @@ the anchor.} \item{\code{offsets}}{The offsets used to enable cell look up in downstream functions} +\item{\code{weight.reduction}}{The weight dimensional reduction used to calculate weight matrix} + \item{\code{anchor.features}}{The features used when performing anchor finding.} \item{\code{neighbors}}{List containing Neighbor objects for reuse later (e.g. mapping)} diff --git a/man/AverageExpression.Rd b/man/AverageExpression.Rd index ca481cb3d..a40b3fe13 100644 --- a/man/AverageExpression.Rd +++ b/man/AverageExpression.Rd @@ -11,7 +11,8 @@ AverageExpression( return.seurat = FALSE, group.by = "ident", add.ident = NULL, - slot = "data", + layer = "data", + slot = deprecated(), verbose = TRUE, ... ) @@ -25,14 +26,16 @@ AverageExpression( \item{return.seurat}{Whether to return the data as a Seurat object. Default is FALSE} -\item{group.by}{Categories for grouping (e.g, ident, replicate, celltype); 'ident' by default} +\item{group.by}{Category (or vector of categories) for grouping (e.g, ident, replicate, celltype); 'ident' by default +To use multiple categories, specify a vector, such as c('ident', 'replicate', 'celltype')} -\item{add.ident}{(Deprecated) Place an additional label on each cell prior to pseudobulking -(very useful if you want to observe cluster pseudobulk values, separated by replicate, for example)} +\item{add.ident}{(Deprecated). Place an additional label on each cell prior to pseudobulking} -\item{slot}{Slot(s) to use; if multiple slots are given, assumed to follow +\item{layer}{Layer(s) to use; if multiple layers are given, assumed to follow the order of 'assays' (if specified) or object's assays} +\item{slot}{(Deprecated). Slots(s) to use} + \item{verbose}{Print messages and show progress bar} \item{...}{Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}} @@ -42,24 +45,24 @@ Returns a matrix with genes as rows, identity classes as columns. If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. } \description{ -Returns averaged expression values for each identity class +Returns averaged expression values for each identity class. } \details{ -If slot is set to 'data', this function assumes that the data has been log +If layer is set to 'data', this function assumes that the data has been log normalized and therefore feature values are exponentiated prior to averaging -so that averaging is done in non-log space. Otherwise, if slot is set to -either 'counts' or 'scale.data', no exponentiation is performed prior to -averaging -If \code{return.seurat = TRUE} and slot is not 'scale.data', averaged values -are placed in the 'counts' slot of the returned object and the log of averaged values -are placed in the 'data' slot. \code{\link{ScaleData}} is then run on the default assay -before returning the object. -If \code{return.seurat = TRUE} and slot is 'scale.data', the 'counts' slot is left empty, -the 'data' slot is filled with NA, and 'scale.data' is set to the aggregated values. +so that averaging is done in non-log space. Otherwise, if layer is set to +either 'counts' or 'scale.data', no exponentiation is performed prior to averaging. +If \code{return.seurat = TRUE} and layer is not 'scale.data', averaged values +are placed in the 'counts' layer of the returned object and 'log1p' +is run on the averaged counts and placed in the 'data' layer \code{\link{ScaleData}} +is then run on the default assay before returning the object. +If \code{return.seurat = TRUE} and layer is 'scale.data', the 'counts' layer contains +average counts and 'scale.data' is set to the averaged values of 'scale.data'. } \examples{ data("pbmc_small") -head(AverageExpression(object = pbmc_small)) +head(AverageExpression(object = pbmc_small)$RNA) +head(AverageExpression(object = pbmc_small, group.by = c('ident', 'groups'))$RNA) } \concept{utilities} diff --git a/man/BGTextColor.Rd b/man/BGTextColor.Rd index 99130d0dc..823f57dd1 100644 --- a/man/BGTextColor.Rd +++ b/man/BGTextColor.Rd @@ -22,7 +22,7 @@ hexadecimal codes} \item{threshold}{Intensity threshold for light/dark cutoff; intensities greater than \code{theshold} yield \code{dark}, others yield \code{light}} -\item{w3c}{Use \href{http://www.w3.org/TR/WCAG20/}{W3C} formula for calculating +\item{w3c}{Use \href{https://www.w3.org/TR/WCAG20/}{W3C} formula for calculating background text color; ignores \code{threshold}} \item{dark}{Color for dark text} diff --git a/man/BridgeCellsRepresentation.Rd b/man/BridgeCellsRepresentation.Rd new file mode 100644 index 000000000..9dfe0e225 --- /dev/null +++ b/man/BridgeCellsRepresentation.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{BridgeCellsRepresentation} +\alias{BridgeCellsRepresentation} +\title{Construct a dictionary representation for each unimodal dataset} +\usage{ +BridgeCellsRepresentation( + object.list, + bridge.object, + object.reduction, + bridge.reduction, + laplacian.reduction = "lap", + laplacian.dims = 1:50, + bridge.assay.name = "Bridge", + return.all.assays = FALSE, + l2.norm = TRUE, + verbose = TRUE +) +} +\arguments{ +\item{object.list}{A list of Seurat objects} + +\item{bridge.object}{A multi-omic bridge Seurat which is used as the basis to +represent unimodal datasets} + +\item{object.reduction}{A list of dimensional reductions from object.list used +to be reconstructed by bridge.object} + +\item{bridge.reduction}{A list of dimensional reductions from bridge.object used +to reconstruct object.reduction} + +\item{laplacian.reduction}{Name of bridge graph laplacian dimensional reduction} + +\item{laplacian.dims}{Dimensions used for bridge graph laplacian dimensional reduction} + +\item{bridge.assay.name}{Assay name used for bridge object reconstruction value (default is 'Bridge')} + +\item{return.all.assays}{Whether to return all assays in the object.list. +Only bridge assay is returned by default.} + +\item{l2.norm}{Whether to l2 normalize the dictionary representation} + +\item{verbose}{Print messages and progress} +} +\value{ +Returns a object list in which each object has a bridge cell derived assay +} +\description{ +Construct a dictionary representation for each unimodal dataset +} +\concept{integration} diff --git a/man/BridgeReferenceSet-class.Rd b/man/BridgeReferenceSet-class.Rd new file mode 100644 index 000000000..b3faf5b90 --- /dev/null +++ b/man/BridgeReferenceSet-class.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/objects.R +\docType{class} +\name{BridgeReferenceSet-class} +\alias{BridgeReferenceSet-class} +\alias{BridgeReferenceSet} +\title{The BridgeReferenceSet Class +The BridgeReferenceSet is an output from PrepareBridgeReference} +\description{ +The BridgeReferenceSet Class +The BridgeReferenceSet is an output from PrepareBridgeReference +} +\section{Slots}{ + +\describe{ +\item{\code{bridge}}{The multi-omic object} + +\item{\code{reference}}{The Reference object only containing bridge representation assay} + +\item{\code{params}}{A list of parameters used in the PrepareBridgeReference} + +\item{\code{command}}{Store log of parameters that were used} +}} + +\concept{objects} diff --git a/man/BuildClusterTree.Rd b/man/BuildClusterTree.Rd index f8c061167..243d02d54 100644 --- a/man/BuildClusterTree.Rd +++ b/man/BuildClusterTree.Rd @@ -34,8 +34,7 @@ is not NULL.} \item{graph}{If graph is passed, build tree based on graph connectivity between clusters; overrides \code{dims} and \code{features}} -\item{slot}{Slot(s) to use; if multiple slots are given, assumed to follow -the order of 'assays' (if specified) or object's assays} +\item{slot}{slot/layer to use.} \item{reorder}{Re-order identity classes (factor ordering), according to position on the tree. This groups similar classes together which can be @@ -50,22 +49,24 @@ the tree, assigning a numeric value ('1' is the leftmost node)} A Seurat object where the cluster tree can be accessed with \code{\link{Tool}} } \description{ -Constructs a phylogenetic tree relating the 'average' cell from each +Constructs a phylogenetic tree relating the 'aggregate' cell from each identity class. Tree is estimated based on a distance matrix constructed in either gene expression space or PCA space. } \details{ -Note that the tree is calculated for an 'average' cell, so gene expression -or PC scores are averaged across all cells in an identity class before the +Note that the tree is calculated for an 'aggregate' cell, so gene expression +or PC scores are summed across all cells in an identity class before the tree is constructed. } \examples{ +\dontrun{ if (requireNamespace("ape", quietly = TRUE)) { data("pbmc_small") pbmc_small pbmc_small <- BuildClusterTree(object = pbmc_small) Tool(object = pbmc_small, slot = 'BuildClusterTree') } +} } \concept{tree} diff --git a/man/BuildNicheAssay.Rd b/man/BuildNicheAssay.Rd new file mode 100644 index 000000000..2c4b39c8d --- /dev/null +++ b/man/BuildNicheAssay.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{BuildNicheAssay} +\alias{BuildNicheAssay} +\title{Construct an assay for spatial niche analysis} +\usage{ +BuildNicheAssay( + object, + fov, + group.by, + assay = "niche", + cluster.name = "niches", + neighbors.k = 20, + niches.k = 4 +) +} +\arguments{ +\item{object}{A Seurat object} + +\item{fov}{FOV object to gather cell positions from} + +\item{group.by}{Cell classifications to count in spatial neighborhood} + +\item{assay}{Name for spatial neighborhoods assay} + +\item{cluster.name}{Name of output clusters} + +\item{neighbors.k}{Number of neighbors to consider for each cell} + +\item{niches.k}{Number of clusters to return based on the niche assay} +} +\value{ +Seurat object containing a new assay +} +\description{ +This function will construct a new assay where each feature is a +cell label The values represents the sum of a particular cell label +neighboring a given cell. +} +\concept{clustering} diff --git a/man/CCAIntegration.Rd b/man/CCAIntegration.Rd new file mode 100644 index 000000000..b8399ad04 --- /dev/null +++ b/man/CCAIntegration.Rd @@ -0,0 +1,128 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration5.R +\name{CCAIntegration} +\alias{CCAIntegration} +\title{Seurat-CCA Integration} +\usage{ +CCAIntegration( + object = NULL, + assay = NULL, + layers = NULL, + orig = NULL, + new.reduction = "integrated.dr", + reference = NULL, + features = NULL, + normalization.method = c("LogNormalize", "SCT"), + dims = 1:30, + k.filter = NA, + scale.layer = "scale.data", + dims.to.integrate = NULL, + k.weight = 100, + weight.reduction = NULL, + sd.weight = 1, + sample.tree = NULL, + preserve.order = FALSE, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{Seurat} object} + +\item{assay}{Name of \code{Assay} in the \code{Seurat} object} + +\item{layers}{Names of layers in \code{assay}} + +\item{orig}{A \link[SeuratObject:DimReduc]{dimensional reduction} to correct} + +\item{new.reduction}{Name of new integrated dimensional reduction} + +\item{reference}{A reference \code{Seurat} object} + +\item{features}{A vector of features to use for integration} + +\item{normalization.method}{Name of normalization method used: LogNormalize +or SCT} + +\item{dims}{Dimensions of dimensional reduction to use for integration} + +\item{k.filter}{Number of anchors to filter} + +\item{scale.layer}{Name of scaled layer in \code{Assay}} + +\item{dims.to.integrate}{Number of dimensions to return integrated values for} + +\item{k.weight}{Number of neighbors to consider when weighting anchors} + +\item{weight.reduction}{Dimension reduction to use when calculating anchor +weights. This can be one of: +\itemize{ + \item{A string, specifying the name of a dimension reduction present in + all objects to be integrated} + \item{A vector of strings, specifying the name of a dimension reduction to + use for each object to be integrated} + \item{A vector of \code{\link{DimReduc}} objects, specifying the object to + use for each object in the integration} + \item{NULL, in which case the full corrected space is used for computing + anchor weights.} +}} + +\item{sd.weight}{Controls the bandwidth of the Gaussian kernel for weighting} + +\item{sample.tree}{Specify the order of integration. Order of integration +should be encoded in a matrix, where each row represents one of the pairwise +integration steps. Negative numbers specify a dataset, positive numbers +specify the integration results from a given row (the format of the merge +matrix included in the \code{\link{hclust}} function output). For example: +\code{matrix(c(-2, 1, -3, -1), ncol = 2)} gives: + +\if{html}{\out{
}}\preformatted{ [,1] [,2] + [1,] -2 -3 + [2,] 1 -1 +}\if{html}{\out{
}} + +Which would cause dataset 2 and 3 to be integrated first, then the resulting +object integrated with dataset 1. + +If NULL, the sample tree will be computed automatically.} + +\item{preserve.order}{Do not reorder objects based on size for each pairwise +integration.} + +\item{verbose}{Print progress} + +\item{...}{Arguments passed on to \code{FindIntegrationAnchors}} +} +\description{ +Seurat-CCA Integration +} +\examples{ +\dontrun{ +# Preprocessing +obj <- SeuratData::LoadData("pbmcsca") +obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +obj <- NormalizeData(obj) +obj <- FindVariableFeatures(obj) +obj <- ScaleData(obj) +obj <- RunPCA(obj) + +# After preprocessing, we integrate layers. +obj <- IntegrateLayers(object = obj, method = CCAIntegration, + orig.reduction = "pca", new.reduction = "integrated.cca", + verbose = FALSE) + +# Modifying parameters +# We can also specify parameters such as `k.anchor` to increase the strength of integration +obj <- IntegrateLayers(object = obj, method = CCAIntegration, + orig.reduction = "pca", new.reduction = "integrated.cca", + k.anchor = 20, verbose = FALSE) + +# Integrating SCTransformed data +obj <- SCTransform(object = obj) +obj <- IntegrateLayers(object = obj, method = CCAIntegration, + orig.reduction = "pca", new.reduction = "integrated.cca", + assay = "SCT", verbose = FALSE) +} + +} +\concept{integration} diff --git a/man/CalcDispersion.Rd b/man/CalcDispersion.Rd new file mode 100644 index 000000000..c95d13718 --- /dev/null +++ b/man/CalcDispersion.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing5.R +\name{CalcDispersion} +\alias{CalcDispersion} +\title{Calculate dispersion of features} +\usage{ +CalcDispersion( + object, + mean.function = FastExpMean, + dispersion.function = FastLogVMR, + num.bin = 20, + binning.method = "equal_width", + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{Data matrix} + +\item{mean.function}{Function to calculate mean} + +\item{dispersion.function}{Function to calculate dispersion} + +\item{num.bin}{Number of bins to use} + +\item{binning.method}{Method to use for binning. Options are 'equal_width' or 'equal_frequency'} + +\item{verbose}{Display progress} +} +\description{ +Calculate dispersion of features +} +\keyword{internal} diff --git a/man/CellsByImage.Rd b/man/CellsByImage.Rd index ca0acaa3c..3c867aeef 100644 --- a/man/CellsByImage.Rd +++ b/man/CellsByImage.Rd @@ -26,3 +26,4 @@ CellsByImage(object = object, images = "slice1") } } +\keyword{internal} diff --git a/man/ColorDimSplit.Rd b/man/ColorDimSplit.Rd index 4dab0677c..33efebabd 100644 --- a/man/ColorDimSplit.Rd +++ b/man/ColorDimSplit.Rd @@ -37,8 +37,8 @@ See \code{\link{DiscretePalette}} for details.} \item{\code{reduction}}{Which dimensionality reduction to use. If not specified, first searches for umap, then tsne, then pca} \item{\code{group.by}}{Name of one or more metadata columns to group (color) cells by (for example, orig.ident); pass 'ident' to group by identity class} - \item{\code{split.by}}{Name of a metadata column to split plot by; -see \code{\link{FetchData}} for more details} + \item{\code{split.by}}{A factor in object metadata to split the plot by, pass 'ident' +to split by cell identity'} \item{\code{shape.by}}{If NULL, all points are circles (default). You can specify any cell attribute (that can be pulled with FetchData) allowing for both different colors and different shapes on cells. Only applicable if \code{raster = FALSE}.} @@ -53,6 +53,7 @@ useful for crowded plots if points of interest are being buried. (default is FAL \item{\code{label.color}}{Sets the color of the label text} \item{\code{label.box}}{Whether to put a box around the label text (geom_text vs geom_label)} + \item{\code{alpha}}{Alpha value for plotting (default is 1)} \item{\code{repel}}{Repel labels} \item{\code{cells.highlight}}{A list of character or numeric vectors of cells to highlight. If only one group of cells desired, can simply @@ -62,7 +63,8 @@ will also resize to the size(s) passed to \code{sizes.highlight}} \item{\code{cols.highlight}}{A vector of colors to highlight the cells as; will repeat to the length groups in cells.highlight} \item{\code{sizes.highlight}}{Size of highlighted cells; will repeat to the length -groups in cells.highlight} +groups in cells.highlight. If \code{sizes.highlight = TRUE} size of all +points will be this value.} \item{\code{na.value}}{Color value for NA points when using custom scale} \item{\code{ncol}}{Number of columns for display when combining plots} \item{\code{combine}}{Combine plots into a single \code{\link[patchwork]{patchwork}ed} @@ -81,12 +83,14 @@ Returns a DimPlot colored based on whether the cells fall in clusters to the left or to the right of a node split in the cluster tree. } \examples{ +\dontrun{ if (requireNamespace("ape", quietly = TRUE)) { data("pbmc_small") pbmc_small <- BuildClusterTree(object = pbmc_small, verbose = FALSE) PlotClusterTree(pbmc_small) ColorDimSplit(pbmc_small, node = 5) } +} } \seealso{ diff --git a/man/CountSketch.Rd b/man/CountSketch.Rd new file mode 100644 index 000000000..802cc8e75 --- /dev/null +++ b/man/CountSketch.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sketching.R +\name{CountSketch} +\alias{CountSketch} +\title{Generate CountSketch random matrix} +\usage{ +CountSketch(nsketch, ncells, seed = NA_integer_, ...) +} +\arguments{ +\item{nsketch}{Number of sketching random cells} + +\item{ncells}{Number of cells in the original data} + +\item{seed}{a single value, interpreted as an integer, or \code{NULL} + (see \sQuote{Details}).} + +\item{...}{Ignored} +} +\value{ +... +} +\description{ +Generate CountSketch random matrix +} +\references{ +Clarkson, KL. & Woodruff, DP. +Low-rank approximation and regression in input sparsity time. +Journal of the ACM (JACM). 2017 Jan 30;63(6):1-45. +\url{https://dl.acm.org/doi/abs/10.1145/3019134}; +} +\concept{sketching} +\keyword{internal} diff --git a/man/CreateCategoryMatrix.Rd b/man/CreateCategoryMatrix.Rd new file mode 100644 index 000000000..08afede39 --- /dev/null +++ b/man/CreateCategoryMatrix.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities.R +\name{CreateCategoryMatrix} +\alias{CreateCategoryMatrix} +\title{Create one hot matrix for a given label} +\usage{ +CreateCategoryMatrix( + labels, + method = c("aggregate", "average"), + cells.name = NULL +) +} +\arguments{ +\item{labels}{A vector of labels} + +\item{method}{Method to aggregate cells with the same label. Either 'aggregate' or 'average'} + +\item{cells.name}{A vector of cell names} +} +\description{ +Create one hot matrix for a given label +} +\concept{utilities} diff --git a/man/CreateSCTAssayObject.Rd b/man/CreateSCTAssayObject.Rd index c3c1406c8..70f30f633 100644 --- a/man/CreateSCTAssayObject.Rd +++ b/man/CreateSCTAssayObject.Rd @@ -25,10 +25,10 @@ CreateSCTAssayObject( \item{min.cells}{Include features detected in at least this many cells. Will subset the counts matrix as well. To reintroduce excluded features, create a -new object with a lower cutoff.} +new object with a lower cutoff} \item{min.features}{Include cells where at least this many features are -detected.} +detected} \item{SCTModel.list}{list of SCTModels} } diff --git a/man/DEenrichRPlot.Rd b/man/DEenrichRPlot.Rd index 0bf4a36e9..5dd783d50 100644 --- a/man/DEenrichRPlot.Rd +++ b/man/DEenrichRPlot.Rd @@ -32,7 +32,7 @@ DEenrichRPlot( positive DE genes.If false, only positive DE gene will be displayed.} \item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +X-fold difference (log-scale) between the two groups of cells. Default is 0.25. Increasing logfc.threshold speeds up the function, but can miss weaker signals.} \item{assay}{Assay to use in differential expression testing} @@ -42,7 +42,11 @@ Increasing logfc.threshold speeds up the function, but can miss weaker signals.} \item{test.use}{Denotes which test to use. Available options are: \itemize{ \item{"wilcox"} : Identifies differentially expressed genes between two - groups of cells using a Wilcoxon Rank Sum test (default) + groups of cells using a Wilcoxon Rank Sum test (default); will use a fast + implementation by Presto if installed + \item{"wilcox_limma"} : Identifies differentially expressed genes between two + groups of cells using the limma implementation of the Wilcoxon Rank Sum test; + set this option to reproduce results from Seurat v4 \item{"bimod"} : Likelihood-ratio test for single cell gene expression, (McDavid et al., Bioinformatics, 2013) \item{"roc"} : Identifies 'markers' of gene expression using ROC analysis. diff --git a/man/DISP.Rd b/man/DISP.Rd new file mode 100644 index 000000000..e876f0823 --- /dev/null +++ b/man/DISP.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing5.R +\name{DISP} +\alias{DISP} +\title{Find variable features based on dispersion} +\usage{ +DISP(data, nselect = 2000L, verbose = TRUE, ...) +} +\arguments{ +\item{data}{Data matrix} + +\item{nselect}{Number of top features to select based on dispersion values} + +\item{verbose}{Display progress} +} +\description{ +Find variable features based on dispersion +} +\keyword{internal} diff --git a/man/DietSeurat.Rd b/man/DietSeurat.Rd index 35b66560e..d09ee9c38 100644 --- a/man/DietSeurat.Rd +++ b/man/DietSeurat.Rd @@ -6,39 +6,48 @@ \usage{ DietSeurat( object, - counts = TRUE, - data = TRUE, - scale.data = FALSE, + layers = NULL, features = NULL, assays = NULL, dimreducs = NULL, graphs = NULL, - misc = TRUE + misc = TRUE, + counts = deprecated(), + data = deprecated(), + scale.data = deprecated(), + ... ) } \arguments{ -\item{object}{Seurat object} +\item{object}{A \code{\link[SeuratObject]{Seurat}} object} -\item{counts}{Preserve the count matrices for the assays specified} - -\item{data}{Preserve the data slot for the assays specified} - -\item{scale.data}{Preserve the scale.data slot for the assays specified} +\item{layers}{A vector or named list of layers to keep} \item{features}{Only keep a subset of features, defaults to all features} \item{assays}{Only keep a subset of assays specified here} -\item{dimreducs}{Only keep a subset of DimReducs specified here (if NULL, -remove all DimReducs)} +\item{dimreducs}{Only keep a subset of DimReducs specified here (if +\code{NULL}, remove all DimReducs)} -\item{graphs}{Only keep a subset of Graphs specified here (if NULL, remove -all Graphs)} +\item{graphs}{Only keep a subset of Graphs specified here (if \code{NULL}, +remove all Graphs)} \item{misc}{Preserve the \code{misc} slot; default is \code{TRUE}} + +\item{counts}{Preserve the count matrices for the assays specified} + +\item{data}{Preserve the data matrices for the assays specified} + +\item{scale.data}{Preserve the scale data matrices for the assays specified} + +\item{...}{Ignored} +} +\value{ +\code{object} with only the sub-object specified retained } \description{ -Keep only certain aspects of the Seurat object. Can be useful in functions that utilize merge as -it reduces the amount of data in the merge. +Keep only certain aspects of the Seurat object. Can be useful in functions +that utilize merge as it reduces the amount of data in the merge } \concept{objects} diff --git a/man/DimPlot.Rd b/man/DimPlot.Rd index c93a78b01..a46a402f0 100644 --- a/man/DimPlot.Rd +++ b/man/DimPlot.Rd @@ -26,6 +26,7 @@ DimPlot( label.color = "black", label.box = FALSE, repel = FALSE, + alpha = 1, cells.highlight = NULL, cols.highlight = "#DE2D26", sizes.highlight = 1, @@ -61,8 +62,8 @@ See \code{\link{DiscretePalette}} for details.} \item{group.by}{Name of one or more metadata columns to group (color) cells by (for example, orig.ident); pass 'ident' to group by identity class} -\item{split.by}{Name of a metadata column to split plot by; -see \code{\link{FetchData}} for more details} +\item{split.by}{A factor in object metadata to split the plot by, pass 'ident' +to split by cell identity'} \item{shape.by}{If NULL, all points are circles (default). You can specify any cell attribute (that can be pulled with FetchData) allowing for both @@ -88,6 +89,8 @@ geom_label)} \item{repel}{Repel labels} +\item{alpha}{Alpha value for plotting (default is 1)} + \item{cells.highlight}{A list of character or numeric vectors of cells to highlight. If only one group of cells desired, can simply pass a vector instead of a list. If set, colors selected cells to the color(s) @@ -98,7 +101,8 @@ will also resize to the size(s) passed to \code{sizes.highlight}} repeat to the length groups in cells.highlight} \item{sizes.highlight}{Size of highlighted cells; will repeat to the length -groups in cells.highlight} +groups in cells.highlight. If \code{sizes.highlight = TRUE} size of all +points will be this value.} \item{na.value}{Color value for NA points when using custom scale} @@ -131,7 +135,7 @@ For the old \code{do.hover} and \code{do.identify} functionality, please see \examples{ data("pbmc_small") DimPlot(object = pbmc_small) -DimPlot(object = pbmc_small, split.by = 'ident') +DimPlot(object = pbmc_small, split.by = 'letter.idents') } \seealso{ diff --git a/man/DoHeatmap.Rd b/man/DoHeatmap.Rd index efa301bee..5747b1994 100644 --- a/man/DoHeatmap.Rd +++ b/man/DoHeatmap.Rd @@ -18,6 +18,7 @@ DoHeatmap( label = TRUE, size = 5.5, hjust = 0, + vjust = 0, angle = 45, raster = TRUE, draw.lines = TRUE, @@ -54,6 +55,8 @@ if \code{slot} is 'scale.data', 6 otherwise} \item{hjust}{Horizontal justification of text above color bar} +\item{vjust}{Vertical justification of text above color bar} + \item{angle}{Angle of text above color bar} \item{raster}{If true, plot with geom_raster, else use geom_tile. geom_raster may look blurry on diff --git a/man/DotPlot.Rd b/man/DotPlot.Rd index 31441433d..f59e9fa2e 100644 --- a/man/DotPlot.Rd +++ b/man/DotPlot.Rd @@ -7,8 +7,8 @@ \usage{ DotPlot( object, - assay = NULL, features, + assay = NULL, cols = c("lightgrey", "blue"), col.min = -2.5, col.max = 2.5, @@ -27,12 +27,12 @@ DotPlot( \arguments{ \item{object}{Seurat object} -\item{assay}{Name of assay to use, defaults to the active assay} - \item{features}{Input vector of features, or named list of feature vectors if feature-grouped panels are desired (replicates the functionality of the old SplitDotPlotGG)} +\item{assay}{Name of assay to use, defaults to the active assay} + \item{cols}{Colors to plot: the name of a palette from \code{RColorBrewer::brewer.pal.info}, a pair of colors defining a gradient, or 3+ colors defining multiple gradients (if split.by is set)} @@ -53,8 +53,8 @@ gene will have no dot drawn.} \item{group.by}{Factor to group the cells by} -\item{split.by}{Factor to split the groups by (replicates the functionality -of the old SplitDotPlotGG); +\item{split.by}{A factor in object metadata to split the plot by, pass 'ident' + to split by cell identity' see \code{\link{FetchData}} for more details} \item{cluster.idents}{Whether to order identities by hierarchical clusters diff --git a/man/FastRPCAIntegration.Rd b/man/FastRPCAIntegration.Rd new file mode 100644 index 000000000..4c6d8e55b --- /dev/null +++ b/man/FastRPCAIntegration.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{FastRPCAIntegration} +\alias{FastRPCAIntegration} +\title{Perform integration on the joint PCA cell embeddings.} +\usage{ +FastRPCAIntegration( + object.list, + reference = NULL, + anchor.features = 2000, + k.anchor = 20, + dims = 1:30, + scale = TRUE, + normalization.method = c("LogNormalize", "SCT"), + new.reduction.name = "integrated_dr", + npcs = 50, + findintegrationanchors.args = list(), + verbose = TRUE +) +} +\arguments{ +\item{object.list}{A list of \code{\link{Seurat}} objects between which to +find anchors for downstream integration.} + +\item{reference}{A vector specifying the object/s to be used as a reference +during integration. If NULL (default), all pairwise anchors are found (no +reference/s). If not NULL, the corresponding objects in \code{object.list} +will be used as references. When using a set of specified references, anchors +are first found between each query and each reference. The references are +then integrated through pairwise integration. Each query is then mapped to +the integrated reference.} + +\item{anchor.features}{Can be either: +\itemize{ + \item{A numeric value. This will call \code{\link{SelectIntegrationFeatures}} + to select the provided number of features to be used in anchor finding} + \item{A vector of features to be used as input to the anchor finding process} +}} + +\item{k.anchor}{How many neighbors (k) to use when picking anchors} + +\item{dims}{Which dimensions to use from the CCA to specify the neighbor +search space} + +\item{scale}{Whether or not to scale the features provided. Only set to FALSE +if you have previously scaled the features you want to use for each object in +the object.list} + +\item{normalization.method}{Name of normalization method used: LogNormalize +or SCT} + +\item{new.reduction.name}{Name of integrated dimensional reduction} + +\item{npcs}{Total Number of PCs to compute and store (50 by default)} + +\item{findintegrationanchors.args}{A named list of additional arguments to +\code{\link{FindIntegrationAnchors}}} + +\item{verbose}{Print messages and progress} +} +\value{ +Returns a Seurat object with integrated dimensional reduction +} +\description{ +This is a convenience wrapper function around the following three functions +that are often run together when perform integration. +#' \code{\link{FindIntegrationAnchors}}, \code{\link{RunPCA}}, +\code{\link{IntegrateEmbeddings}}. +} +\concept{integration} diff --git a/man/FeaturePlot.Rd b/man/FeaturePlot.Rd index b06367ba3..fa5c4293a 100644 --- a/man/FeaturePlot.Rd +++ b/man/FeaturePlot.Rd @@ -17,6 +17,7 @@ FeaturePlot( c("lightgrey", "blue") }, pt.size = NULL, + alpha = 1, order = FALSE, min.cutoff = NA, max.cutoff = NA, @@ -34,7 +35,7 @@ FeaturePlot( ncol = NULL, coord.fixed = FALSE, by.col = TRUE, - sort.cell = NULL, + sort.cell = deprecated(), interactive = FALSE, combine = TRUE, raster = NULL, @@ -46,10 +47,11 @@ FeaturePlot( \item{features}{Vector of features to plot. Features can come from: \itemize{ - \item An \code{Assay} feature (e.g. a gene name - "MS4A1") - \item A column name from meta.data (e.g. mitochondrial percentage - "percent.mito") - \item A column name from a \code{DimReduc} object corresponding to the cell embedding values - (e.g. the PC 1 scores - "PC_1") + \item An \code{Assay} feature (e.g. a gene name - "MS4A1") + \item A column name from meta.data (e.g. mitochondrial percentage - + "percent.mito") + \item A column name from a \code{DimReduc} object corresponding to the + cell embedding values (e.g. the PC 1 scores - "PC_1") }} \item{dims}{Dimensions to plot, must be a two-length numeric vector specifying x- and y-dimensions} @@ -68,6 +70,8 @@ When blend is \code{TRUE}, takes anywhere from 1-3 colors: \item{pt.size}{Adjust point size for plotting} +\item{alpha}{Alpha value for plotting (default is 1)} + \item{order}{Boolean determining whether to plot cells in order of expression. Can be useful if cells expressing given feature are getting buried.} @@ -76,14 +80,21 @@ may specify quantile in the form of 'q##' where '##' is the quantile (eg, 'q1', \item{reduction}{Which dimensionality reduction to use. If not specified, first searches for umap, then tsne, then pca} -\item{split.by}{A factor in object metadata to split the feature plot by, pass 'ident' -to split by cell identity'; similar to the old \code{FeatureHeatmap}} +\item{split.by}{A factor in object metadata to split the plot by, pass 'ident' +to split by cell identity'} \item{keep.scale}{How to handle the color scale across multiple plots. Options are: \itemize{ - \item{"feature" (default; by row/feature scaling):}{ The plots for each individual feature are scaled to the maximum expression of the feature across the conditions provided to 'split.by'.} - \item{"all" (universal scaling):}{ The plots for all features and conditions are scaled to the maximum expression value for the feature with the highest overall expression.} - \item{NULL (no scaling):}{ Each individual plot is scaled to the maximum expression value of the feature in the condition provided to 'split.by'. Be aware setting NULL will result in color scales that are not comparable between plots.} + \item \dQuote{feature} (default; by row/feature scaling): The plots for + each individual feature are scaled to the maximum expression of the + feature across the conditions provided to \code{split.by} + \item \dQuote{all} (universal scaling): The plots for all features and + conditions are scaled to the maximum expression value for the feature + with the highest overall expression + \item \code{all} (no scaling): Each individual plot is scaled to the + maximum expression value of the feature in the condition provided to + \code{split.by}. Be aware setting \code{NULL} will result in color + scales that are not comparable between plots }} \item{shape.by}{If NULL, all points are circles (default). You can specify any diff --git a/man/FeatureScatter.Rd b/man/FeatureScatter.Rd index a6634d840..5c1c6db47 100644 --- a/man/FeatureScatter.Rd +++ b/man/FeatureScatter.Rd @@ -13,6 +13,7 @@ FeatureScatter( shuffle = FALSE, seed = 1, group.by = NULL, + split.by = NULL, cols = NULL, pt.size = 1, shape.by = NULL, @@ -21,9 +22,11 @@ FeatureScatter( combine = TRUE, slot = "data", plot.cor = TRUE, + ncol = NULL, raster = NULL, raster.dpi = c(512, 512), - jitter = FALSE + jitter = FALSE, + log = FALSE ) } \arguments{ @@ -44,6 +47,9 @@ useful for crowded plots if points of interest are being buried. (default is FAL \item{group.by}{Name of one or more metadata columns to group (color) cells by (for example, orig.ident); pass 'ident' to group by identity class} +\item{split.by}{A factor in object metadata to split the feature plot by, pass 'ident' +to split by cell identity'} + \item{cols}{Colors to use for identity class plotting.} \item{pt.size}{Size of the points on the plot} @@ -60,6 +66,8 @@ useful for crowded plots if points of interest are being buried. (default is FAL \item{plot.cor}{Display correlation in plot title} +\item{ncol}{Number of columns if plotting multiple plots} + \item{raster}{Convert points to raster format, default is \code{NULL} which will automatically use raster if the number of points plotted is greater than 100,000} @@ -68,6 +76,8 @@ which will automatically use raster if the number of points plotted is greater t Default is c(512, 512).} \item{jitter}{Jitter for easier visualization of crowded points (default is FALSE)} + +\item{log}{Plot features on the log scale (default is FALSE)} } \value{ A ggplot object diff --git a/man/FetchResidualSCTModel.Rd b/man/FetchResidualSCTModel.Rd new file mode 100644 index 000000000..1c19d66a8 --- /dev/null +++ b/man/FetchResidualSCTModel.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing5.R +\name{FetchResidualSCTModel} +\alias{FetchResidualSCTModel} +\title{Calculate pearson residuals of features not in the scale.data +This function is the secondary function under FetchResiduals} +\usage{ +FetchResidualSCTModel( + object, + assay = "SCT", + umi.assay = "RNA", + layer = "counts", + chunk_size = 2000, + layer.cells = NULL, + SCTModel = NULL, + reference.SCT.model = NULL, + new_features = NULL, + clip.range = NULL, + replace.value = FALSE, + verbose = FALSE +) +} +\arguments{ +\item{object}{A seurat object} + +\item{assay}{Name of the assay of the seurat object generated by +SCTransform. Default is "SCT"} + +\item{umi.assay}{Name of the assay of the seurat object to fetch +UMIs from. Default is "RNA"} + +\item{layer}{Name of the layer under `umi.assay` to fetch UMIs from. +Default is "counts"} + +\item{chunk_size}{Number of cells to load in memory for calculating +residuals} + +\item{layer.cells}{Vector of cells to calculate the residual for. +Default is NULL which uses all cells in the layer} + +\item{SCTModel}{Which SCTmodel to use from the object for calculating +the residual. Will be ignored if reference.SCT.model is set} + +\item{reference.SCT.model}{If a reference SCT model should be used +for calculating the residuals. When set to not NULL, ignores the `SCTModel` +paramater.} + +\item{new_features}{A vector of features to calculate the residuals for} + +\item{clip.range}{Numeric of length two specifying the min and max values +the Pearson residual will be clipped to. Useful if you want to change the +clip.range.} + +\item{replace.value}{Whether to replace the value of residuals if it +already exists} + +\item{verbose}{Whether to print messages and progress bars} +} +\value{ +Returns a matrix containing centered pearson residuals of +added features +} +\description{ +Calculate pearson residuals of features not in the scale.data +This function is the secondary function under FetchResiduals +} +\keyword{internal} diff --git a/man/FetchResiduals.Rd b/man/FetchResiduals.Rd new file mode 100644 index 000000000..ba8fac0c1 --- /dev/null +++ b/man/FetchResiduals.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing5.R +\name{FetchResiduals} +\alias{FetchResiduals} +\title{Calculate pearson residuals of features not in the scale.data} +\usage{ +FetchResiduals( + object, + features, + assay = NULL, + umi.assay = "RNA", + layer = "counts", + clip.range = NULL, + reference.SCT.model = NULL, + replace.value = FALSE, + na.rm = TRUE, + verbose = TRUE +) +} +\arguments{ +\item{object}{A seurat object} + +\item{features}{Name of features to add into the scale.data} + +\item{assay}{Name of the assay of the seurat object generated by SCTransform} + +\item{umi.assay}{Name of the assay of the seurat object containing UMI matrix +and the default is RNA} + +\item{layer}{Name (prefix) of the layer to pull counts from} + +\item{clip.range}{Numeric of length two specifying the min and max values the +Pearson residual will be clipped to} + +\item{reference.SCT.model}{reference.SCT.model If a reference SCT model should be used +for calculating the residuals. When set to not NULL, ignores the `SCTModel` +paramater.} + +\item{replace.value}{Recalculate residuals for all features, even if they are +already present. Useful if you want to change the clip.range.} + +\item{na.rm}{For features where there is no feature model stored, return NA +for residual value in scale.data when na.rm = FALSE. When na.rm is TRUE, only +return residuals for features with a model stored for all cells.} + +\item{verbose}{Whether to print messages and progress bars} +} +\value{ +Returns a Seurat object containing Pearson residuals of added +features in its scale.data +} +\description{ +This function calls sctransform::get_residuals. +} +\seealso{ +\code{\link[sctransform]{get_residuals}} +} +\concept{preprocessing} diff --git a/man/FetchResiduals_reference.Rd b/man/FetchResiduals_reference.Rd new file mode 100644 index 000000000..3795a9496 --- /dev/null +++ b/man/FetchResiduals_reference.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing5.R +\name{FetchResiduals_reference} +\alias{FetchResiduals_reference} +\title{temporal function to get residuals from reference} +\usage{ +FetchResiduals_reference( + object, + reference.SCT.model = NULL, + features = NULL, + nCount_UMI = NULL, + verbose = FALSE +) +} +\arguments{ +\item{object}{A seurat object} + +\item{reference.SCT.model}{a reference SCT model that should be used +for calculating the residuals} + +\item{features}{Names of features to compute} + +\item{nCount_UMI}{UMI counts. If not specified, defaults to +column sums of object} + +\item{verbose}{Whether to print messages and progress bars} +} +\description{ +temporal function to get residuals from reference +} +\keyword{internal} diff --git a/man/FindAllMarkers.Rd b/man/FindAllMarkers.Rd index 622474624..a2f9f764e 100644 --- a/man/FindAllMarkers.Rd +++ b/man/FindAllMarkers.Rd @@ -9,10 +9,10 @@ FindAllMarkers( object, assay = NULL, features = NULL, - logfc.threshold = 0.25, + logfc.threshold = 0.1, test.use = "wilcox", slot = "data", - min.pct = 0.1, + min.pct = 0.01, min.diff.pct = -Inf, node = NULL, verbose = TRUE, @@ -38,13 +38,18 @@ FindAllMarkers( \item{features}{Genes to test. Default is to use all genes} \item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 -Increasing logfc.threshold speeds up the function, but can miss weaker signals.} +X-fold difference (log-scale) between the two groups of cells. Default is 0.1 +Increasing logfc.threshold speeds up the function, but can miss weaker signals. +If the \code{slot} parameter is "scale.data" no filtering is performed.} \item{test.use}{Denotes which test to use. Available options are: \itemize{ \item{"wilcox"} : Identifies differentially expressed genes between two - groups of cells using a Wilcoxon Rank Sum test (default) + groups of cells using a Wilcoxon Rank Sum test (default); will use a fast + implementation by Presto if installed + \item{"wilcox_limma"} : Identifies differentially expressed genes between two + groups of cells using the limma implementation of the Wilcoxon Rank Sum test; + set this option to reproduce results from Seurat v4 \item{"bimod"} : Likelihood-ratio test for single cell gene expression, (McDavid et al., Bioinformatics, 2013) \item{"roc"} : Identifies 'markers' of gene expression using ROC analysis. @@ -82,12 +87,12 @@ Increasing logfc.threshold speeds up the function, but can miss weaker signals.} https://bioconductor.org/packages/release/bioc/html/DESeq2.html }} -\item{slot}{Slot to pull data from; note that if \code{test.use} is "negbinom", "poisson", or "DESeq2", -\code{slot} will be set to "counts"} +\item{slot}{Slot to pull data from; note that if \code{test.use} is +"negbinom", "poisson", or "DESeq2", \code{slot} will be set to "counts"} \item{min.pct}{only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations. Meant to speed up the function -by not testing genes that are very infrequently expressed. Default is 0.1} +by not testing genes that are very infrequently expressed. Default is 0.01} \item{min.diff.pct}{only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default} @@ -113,7 +118,14 @@ of the two groups, currently only used for poisson and negative binomial tests} \item{min.cells.group}{Minimum number of cells in one of the groups} \item{mean.fxn}{Function to use for fold change or average difference calculation. -If NULL, the appropriate function will be chose according to the slot used} +The default depends on the the value of \code{fc.slot}: +\itemize{ + \item{"counts"} : difference in the log of the mean counts, with pseudocount. + \item{"data"} : difference in the log of the average exponentiated data, with pseudocount. + This adjusts for differences in sequencing depth between cells, and assumes that "data" + has been log-normalized. + \item{"scale.data"} : difference in the means of scale.data. +}} \item{fc.name}{Name of the fold change, average difference, or custom function column in the output data.frame. If NULL, the fold change column will be named @@ -124,7 +136,8 @@ slot "avg_diff".} \item{return.thresh}{Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)} -\item{densify}{Convert the sparse matrix to a dense form before running the DE test. This can provide speedups but might require higher memory; default is FALSE} +\item{densify}{Convert the sparse matrix to a dense form before running the +DE test. This can provide speedups but might require higher memory; default is FALSE} \item{...}{Arguments passed to other methods and to specific DE methods} } diff --git a/man/FindBridgeAnchor.Rd b/man/FindBridgeAnchor.Rd new file mode 100644 index 000000000..57d691bbe --- /dev/null +++ b/man/FindBridgeAnchor.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{FindBridgeAnchor} +\alias{FindBridgeAnchor} +\title{Find bridge anchors between two unimodal datasets} +\usage{ +FindBridgeAnchor( + object.list, + bridge.object, + object.reduction, + bridge.reduction, + anchor.type = c("Transfer", "Integration"), + reference = NULL, + laplacian.reduction = "lap", + laplacian.dims = 1:50, + reduction = c("direct", "cca"), + bridge.assay.name = "Bridge", + reference.bridge.stored = FALSE, + k.anchor = 20, + k.score = 50, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object.list}{A list of Seurat objects} + +\item{bridge.object}{A multi-omic bridge Seurat which is used as the basis to +represent unimodal datasets} + +\item{object.reduction}{A list of dimensional reductions from object.list used +to be reconstructed by bridge.object} + +\item{bridge.reduction}{A list of dimensional reductions from bridge.object used +to reconstruct object.reduction} + +\item{anchor.type}{The type of anchors. Can +be one of: +\itemize{ + \item{Integration: Generate IntegrationAnchors for integration} + \item{Transfer: Generate TransferAnchors for transfering data} +}} + +\item{reference}{A vector specifying the object/s to be used as a reference +during integration or transfer data.} + +\item{laplacian.reduction}{Name of bridge graph laplacian dimensional reduction} + +\item{laplacian.dims}{Dimensions used for bridge graph laplacian dimensional reduction} + +\item{reduction}{Dimensional reduction to perform when finding anchors. Can +be one of: +\itemize{ + \item{cca: Canonical correlation analysis} + \item{direct: Use assay data as a dimensional reduction} +}} + +\item{bridge.assay.name}{Assay name used for bridge object reconstruction value (default is 'Bridge')} + +\item{reference.bridge.stored}{If refernece has stored the bridge dictionary representation} + +\item{k.anchor}{How many neighbors (k) to use when picking anchors} + +\item{k.score}{How many neighbors (k) to use when scoring anchors} + +\item{verbose}{Print messages and progress} + +\item{...}{Additional parameters passed to \code{FindIntegrationAnchors} or +\code{FindTransferAnchors}} +} +\value{ +Returns an \code{\link{AnchorSet}} object that can be used as input to +\code{\link{IntegrateEmbeddings}}.or \code{\link{MapQuery}} +} +\description{ +First, bridge object is used to reconstruct two single-modality profiles and +then project those cells into bridage graph laplacian space. +Next, find a set of anchors between two single-modality objects. These +anchors can later be used to integrate embeddings or transfer data from the reference to +query object using the \code{\link{MapQuery}} object. +} +\details{ +\itemize{ + \item{ Bridge cells reconstruction + } + \item{ Find anchors between objects. It can be either IntegrationAnchors or TransferAnchor. + } +} +} +\keyword{internal} diff --git a/man/FindBridgeIntegrationAnchors.Rd b/man/FindBridgeIntegrationAnchors.Rd new file mode 100644 index 000000000..31e00069b --- /dev/null +++ b/man/FindBridgeIntegrationAnchors.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{FindBridgeIntegrationAnchors} +\alias{FindBridgeIntegrationAnchors} +\title{Find integration bridge anchors between query and extended bridge-reference} +\usage{ +FindBridgeIntegrationAnchors( + extended.reference, + query, + query.assay = NULL, + dims = 1:30, + scale = FALSE, + reduction = c("lsiproject", "pcaproject"), + integration.reduction = c("direct", "cca"), + verbose = TRUE +) +} +\arguments{ +\item{extended.reference}{BridgeReferenceSet object generated from +\code{\link{PrepareBridgeReference}}} + +\item{query}{A query Seurat object} + +\item{query.assay}{Assay name for query-bridge integration} + +\item{dims}{Number of dimensions for query-bridge integration} + +\item{scale}{Determine if scale the query data for projection} + +\item{reduction}{Dimensional reduction to perform when finding anchors. +Options are: +\itemize{ + \item{pcaproject: Project the PCA from the bridge onto the query. We + recommend using PCA when bridge and query datasets are from scRNA-seq} + \item{lsiproject: Project the LSI from the bridge onto the query. We + recommend using LSI when bridge and query datasets are from scATAC-seq or scCUT&TAG data. + This requires that LSI or supervised LSI has been computed for the bridge dataset, and the + same features (eg, peaks or genome bins) are present in both the bridge + and query. +} +}} + +\item{integration.reduction}{Dimensional reduction to perform when finding anchors +between query and reference. +Options are: +\itemize{ + \item{direct: find anchors directly on the bridge representation space} + \item{cca: perform cca on the on the bridge representation space and then find anchors +} +}} + +\item{verbose}{Print messages and progress} +} +\value{ +Returns an \code{AnchorSet} object that can be used as input to +\code{\link{IntegrateEmbeddings}}. +} +\description{ +Find a set of anchors between unimodal query and the other unimodal reference +using a pre-computed \code{\link{BridgeReferenceSet}}. +These integration anchors can later be used to integrate query and reference +using the \code{\link{IntegrateEmbeddings}} object. +} +\concept{integration} diff --git a/man/FindBridgeTransferAnchors.Rd b/man/FindBridgeTransferAnchors.Rd new file mode 100644 index 000000000..7012d6555 --- /dev/null +++ b/man/FindBridgeTransferAnchors.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{FindBridgeTransferAnchors} +\alias{FindBridgeTransferAnchors} +\title{Find bridge anchors between query and extended bridge-reference} +\usage{ +FindBridgeTransferAnchors( + extended.reference, + query, + query.assay = NULL, + dims = 1:30, + scale = FALSE, + reduction = c("lsiproject", "pcaproject"), + bridge.reduction = c("direct", "cca"), + verbose = TRUE +) +} +\arguments{ +\item{extended.reference}{BridgeReferenceSet object generated from +\code{\link{PrepareBridgeReference}}} + +\item{query}{A query Seurat object} + +\item{query.assay}{Assay name for query-bridge integration} + +\item{dims}{Number of dimensions for query-bridge integration} + +\item{scale}{Determine if scale the query data for projection} + +\item{reduction}{Dimensional reduction to perform when finding anchors. +Options are: +\itemize{ + \item{pcaproject: Project the PCA from the bridge onto the query. We + recommend using PCA when bridge and query datasets are from scRNA-seq} + \item{lsiproject: Project the LSI from the bridge onto the query. We + recommend using LSI when bridge and query datasets are from scATAC-seq or scCUT&TAG data. + This requires that LSI or supervised LSI has been computed for the bridge dataset, and the + same features (eg, peaks or genome bins) are present in both the bridge + and query. +} +}} + +\item{bridge.reduction}{Dimensional reduction to perform when finding anchors. Can +be one of: +\itemize{ + \item{cca: Canonical correlation analysis} + \item{direct: Use assay data as a dimensional reduction} +}} + +\item{verbose}{Print messages and progress} +} +\value{ +Returns an \code{AnchorSet} object that can be used as input to +\code{\link{TransferData}}, \code{\link{IntegrateEmbeddings}} and +\code{\link{MapQuery}}. +} +\description{ +Find a set of anchors between unimodal query and the other unimodal reference +using a pre-computed \code{\link{BridgeReferenceSet}}. +This function performs three steps: +1. Harmonize the bridge and query cells in the bridge query reduction space +2. Construct the bridge dictionary representations for query cells +3. Find a set of anchors between query and reference in the bridge graph laplacian eigenspace +These anchors can later be used to integrate embeddings or transfer data from the reference to +query object using the \code{\link{MapQuery}} object. +} +\concept{integration} diff --git a/man/FindClusters.Rd b/man/FindClusters.Rd index 2a69cad65..513cf8f36 100644 --- a/man/FindClusters.Rd +++ b/man/FindClusters.Rd @@ -29,6 +29,7 @@ FindClusters(object, ...) \method{FindClusters}{Seurat}( object, graph.name = NULL, + cluster.name = NULL, modularity.fxn = 1, initial.membership = NULL, node.sizes = NULL, @@ -81,6 +82,8 @@ Specify the ABSOLUTE path.} \item{verbose}{Print output} \item{graph.name}{Name of graph to use for the clustering algorithm} + +\item{cluster.name}{Name of output clusters} } \value{ Returns a Seurat object where the idents have been updated with new cluster info; diff --git a/man/FindConservedMarkers.Rd b/man/FindConservedMarkers.Rd index 429c7dd9e..1cb33e87f 100644 --- a/man/FindConservedMarkers.Rd +++ b/man/FindConservedMarkers.Rd @@ -29,8 +29,8 @@ use all other cells for comparison.} \item{assay}{of assay to fetch data for (default is RNA)} -\item{slot}{Slot to pull data from; note that if \code{test.use} is "negbinom", "poisson", or "DESeq2", -\code{slot} will be set to "counts"} +\item{slot}{Slot to pull data from; note that if \code{test.use} is +"negbinom", "poisson", or "DESeq2", \code{slot} will be set to "counts"} \item{min.cells.group}{Minimum number of cells in one of the groups} diff --git a/man/FindIntegrationAnchors.Rd b/man/FindIntegrationAnchors.Rd index 62495f03c..8c4d0831c 100644 --- a/man/FindIntegrationAnchors.Rd +++ b/man/FindIntegrationAnchors.Rd @@ -12,7 +12,7 @@ FindIntegrationAnchors( scale = TRUE, normalization.method = c("LogNormalize", "SCT"), sct.clip.range = NULL, - reduction = c("cca", "rpca", "rlsi"), + reduction = c("cca", "rpca", "jpca", "rlsi"), l2.norm = TRUE, dims = 1:30, k.anchor = 5, @@ -63,6 +63,7 @@ be one of: \itemize{ \item{cca: Canonical correlation analysis} \item{rpca: Reciprocal PCA} + \item{jpca: Joint PCA} \item{rlsi: Reciprocal LSI} }} diff --git a/man/FindMarkers.Rd b/man/FindMarkers.Rd index 160c69c0d..373d58190 100644 --- a/man/FindMarkers.Rd +++ b/man/FindMarkers.Rd @@ -15,13 +15,12 @@ FindMarkers(object, ...) \method{FindMarkers}{default}( object, slot = "data", - counts = numeric(), cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, + logfc.threshold = 0.1, test.use = "wilcox", - min.pct = 0.1, + min.pct = 0.01, min.diff.pct = -Inf, verbose = TRUE, only.pos = FALSE, @@ -30,7 +29,6 @@ FindMarkers(object, ...) latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, - pseudocount.use = 1, fc.results = NULL, densify = FALSE, ... @@ -42,48 +40,28 @@ FindMarkers(object, ...) cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, test.use = "wilcox", - min.pct = 0.1, - min.diff.pct = -Inf, - verbose = TRUE, - only.pos = FALSE, - max.cells.per.ident = Inf, - random.seed = 1, - latent.vars = NULL, - min.cells.feature = 3, - min.cells.group = 3, + fc.slot = "data", pseudocount.use = 1, + norm.method = NULL, mean.fxn = NULL, fc.name = NULL, base = 2, - densify = FALSE, - norm.method = NULL, ... ) \method{FindMarkers}{SCTAssay}( object, - slot = "data", cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, test.use = "wilcox", - min.pct = 0.1, - min.diff.pct = -Inf, - verbose = TRUE, - only.pos = FALSE, - max.cells.per.ident = Inf, - random.seed = 1, - latent.vars = NULL, - min.cells.feature = 3, - min.cells.group = 3, pseudocount.use = 1, + slot = "data", + fc.slot = "data", mean.fxn = NULL, fc.name = NULL, base = 2, - densify = FALSE, recorrect_umi = TRUE, ... ) @@ -93,9 +71,9 @@ FindMarkers(object, ...) cells.1 = NULL, cells.2 = NULL, features = NULL, - logfc.threshold = 0.25, + logfc.threshold = 0.1, test.use = "wilcox", - min.pct = 0.1, + min.pct = 0.01, min.diff.pct = -Inf, verbose = TRUE, only.pos = FALSE, @@ -104,10 +82,9 @@ FindMarkers(object, ...) latent.vars = NULL, min.cells.feature = 3, min.cells.group = 3, - pseudocount.use = 1, + densify = FALSE, mean.fxn = rowMeans, fc.name = NULL, - densify = FALSE, ... ) @@ -115,27 +92,11 @@ FindMarkers(object, ...) object, ident.1 = NULL, ident.2 = NULL, + latent.vars = NULL, group.by = NULL, subset.ident = NULL, assay = NULL, - slot = "data", reduction = NULL, - features = NULL, - logfc.threshold = 0.25, - test.use = "wilcox", - min.pct = 0.1, - min.diff.pct = -Inf, - verbose = TRUE, - only.pos = FALSE, - max.cells.per.ident = Inf, - random.seed = 1, - latent.vars = NULL, - min.cells.feature = 3, - min.cells.group = 3, - mean.fxn = NULL, - fc.name = NULL, - base = 2, - densify = FALSE, ... ) } @@ -144,12 +105,8 @@ FindMarkers(object, ...) \item{...}{Arguments passed to other methods and to specific DE methods} -\item{slot}{Slot to pull data from; note that if \code{test.use} is "negbinom", "poisson", or "DESeq2", -\code{slot} will be set to "counts"} - -\item{counts}{Count matrix if using scale.data for DE tests. This is used for -computing pct.1 and pct.2 and for filtering features based on fraction -expressing} +\item{slot}{Slot to pull data from; note that if \code{test.use} is +"negbinom", "poisson", or "DESeq2", \code{slot} will be set to "counts"} \item{cells.1}{Vector of cell names belonging to group 1} @@ -158,13 +115,18 @@ expressing} \item{features}{Genes to test. Default is to use all genes} \item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 -Increasing logfc.threshold speeds up the function, but can miss weaker signals.} +X-fold difference (log-scale) between the two groups of cells. Default is 0.1 +Increasing logfc.threshold speeds up the function, but can miss weaker signals. +If the \code{slot} parameter is "scale.data" no filtering is performed.} \item{test.use}{Denotes which test to use. Available options are: \itemize{ \item{"wilcox"} : Identifies differentially expressed genes between two - groups of cells using a Wilcoxon Rank Sum test (default) + groups of cells using a Wilcoxon Rank Sum test (default); will use a fast + implementation by Presto if installed + \item{"wilcox_limma"} : Identifies differentially expressed genes between two + groups of cells using the limma implementation of the Wilcoxon Rank Sum test; + set this option to reproduce results from Seurat v4 \item{"bimod"} : Likelihood-ratio test for single cell gene expression, (McDavid et al., Bioinformatics, 2013) \item{"roc"} : Identifies 'markers' of gene expression using ROC analysis. @@ -204,7 +166,7 @@ Increasing logfc.threshold speeds up the function, but can miss weaker signals.} \item{min.pct}{only test genes that are detected in a minimum fraction of min.pct cells in either of the two populations. Meant to speed up the function -by not testing genes that are very infrequently expressed. Default is 0.1} +by not testing genes that are very infrequently expressed. Default is 0.01} \item{min.diff.pct}{only test genes that show a minimum difference in the fraction of detection between the two groups. Set to -Inf by default} @@ -226,15 +188,29 @@ of the two groups, currently only used for poisson and negative binomial tests} \item{min.cells.group}{Minimum number of cells in one of the groups} +\item{fc.results}{data.frame from FoldChange} + +\item{densify}{Convert the sparse matrix to a dense form before running the +DE test. This can provide speedups but might require higher memory; default is FALSE} + +\item{fc.slot}{Slot used to calculate fold-change - will also affect the +default for \code{mean.fxn}, see below for more details.} + \item{pseudocount.use}{Pseudocount to add to averaged expression values when calculating logFC. 1 by default.} -\item{fc.results}{data.frame from FoldChange} - -\item{densify}{Convert the sparse matrix to a dense form before running the DE test. This can provide speedups but might require higher memory; default is FALSE} +\item{norm.method}{Normalization method for fold change calculation when +\code{slot} is \dQuote{\code{data}}} \item{mean.fxn}{Function to use for fold change or average difference calculation. -If NULL, the appropriate function will be chose according to the slot used} +The default depends on the the value of \code{fc.slot}: +\itemize{ + \item{"counts"} : difference in the log of the mean counts, with pseudocount. + \item{"data"} : difference in the log of the average exponentiated data, with pseudocount. + This adjusts for differences in sequencing depth between cells, and assumes that "data" + has been log-normalized. + \item{"scale.data"} : difference in the means of scale.data. +}} \item{fc.name}{Name of the fold change, average difference, or custom function column in the output data.frame. If NULL, the fold change column will be named @@ -243,10 +219,8 @@ slot "avg_diff".} \item{base}{The base with respect to which logarithms are computed.} -\item{norm.method}{Normalization method for fold change calculation when -\code{slot} is \dQuote{\code{data}}} - -\item{recorrect_umi}{Recalculate corrected UMI counts using minimum of the median UMIs when performing DE using multiple SCT objects; default is TRUE} +\item{recorrect_umi}{Recalculate corrected UMI counts using minimum of the +median UMIs when performing DE using multiple SCT objects; default is TRUE} \item{ident.1}{Identity class to define markers for; pass an object of class \code{phylo} or 'clustertree' to find markers for a node in a cluster tree; @@ -256,13 +230,16 @@ passing 'clustertree' requires \code{\link{BuildClusterTree}} to have been run} use all other cells for comparison; if an object of class \code{phylo} or 'clustertree' is passed to \code{ident.1}, must pass a node to find markers for} -\item{group.by}{Regroup cells into a different identity class prior to performing differential expression (see example)} +\item{group.by}{Regroup cells into a different identity class prior to +performing differential expression (see example)} -\item{subset.ident}{Subset a particular identity class prior to regrouping. Only relevant if group.by is set (see example)} +\item{subset.ident}{Subset a particular identity class prior to regrouping. +Only relevant if group.by is set (see example)} \item{assay}{Assay to use in differential expression testing} -\item{reduction}{Reduction to use in differential expression testing - will test for DE on cell embeddings} +\item{reduction}{Reduction to use in differential expression testing - will +test for DE on cell embeddings} } \value{ data.frame with a ranked list of putative markers as rows, and associated @@ -286,6 +263,7 @@ should be interpreted cautiously, as the genes used for clustering are the same genes tested for differential expression. } \examples{ +\dontrun{ data("pbmc_small") # Find markers for cluster 2 markers <- FindMarkers(object = pbmc_small, ident.1 = 2) @@ -303,6 +281,7 @@ if (requireNamespace("ape", quietly = TRUE)) { markers <- FindMarkers(object = pbmc_small, ident.1 = 'clustertree', ident.2 = 5) head(x = markers) } +} } \references{ diff --git a/man/FindNeighbors.Rd b/man/FindNeighbors.Rd index 4771bf893..b6c520bca 100644 --- a/man/FindNeighbors.Rd +++ b/man/FindNeighbors.Rd @@ -23,7 +23,6 @@ FindNeighbors(object, ...) annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, l2.norm = FALSE, cache.index = FALSE, index = NULL, @@ -42,7 +41,6 @@ FindNeighbors(object, ...) annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, l2.norm = FALSE, cache.index = FALSE, ... @@ -59,7 +57,6 @@ FindNeighbors(object, ...) annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, l2.norm = FALSE, cache.index = FALSE, ... @@ -80,7 +77,6 @@ FindNeighbors(object, ...) annoy.metric = "euclidean", nn.eps = 0, verbose = TRUE, - force.recalc = FALSE, do.plot = FALSE, graph.name = NULL, l2.norm = FALSE, @@ -127,8 +123,6 @@ default of 0.0 implies exact nearest neighbor search} \item{verbose}{Whether or not to print output to the console} -\item{force.recalc}{Force recalculation of (S)NN.} - \item{l2.norm}{Take L2Norm of the data} \item{cache.index}{Include cached index in returned Neighbor object diff --git a/man/FindSpatiallyVariableFeatures.Rd b/man/FindSpatiallyVariableFeatures.Rd index a94625155..1dbcbccd2 100644 --- a/man/FindSpatiallyVariableFeatures.Rd +++ b/man/FindSpatiallyVariableFeatures.Rd @@ -1,10 +1,12 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/generics.R, R/preprocessing.R +% Please edit documentation in R/generics.R, R/preprocessing.R, +% R/preprocessing5.R \name{FindSpatiallyVariableFeatures} \alias{FindSpatiallyVariableFeatures} \alias{FindSpatiallyVariableFeatures.default} \alias{FindSpatiallyVariableFeatures.Assay} \alias{FindSpatiallyVariableFeatures.Seurat} +\alias{FindSpatiallyVariableFeatures.StdAssay} \title{Find spatially variable features} \usage{ FindSpatiallyVariableFeatures(object, ...) @@ -48,6 +50,20 @@ FindSpatiallyVariableFeatures(object, ...) verbose = TRUE, ... ) + +\method{FindSpatiallyVariableFeatures}{StdAssay}( + object, + layer = "scale.data", + spatial.location, + selection.method = c("markvariogram", "moransi"), + features = NULL, + r.metric = 5, + x.cuts = NULL, + y.cuts = NULL, + nfeatures = nfeatures, + verbose = TRUE, + ... +) } \arguments{ \item{object}{A Seurat object, assay, or expression matrix} @@ -83,6 +99,8 @@ compute for all features.} \item{assay}{Assay to pull the features (marks) from} \item{image}{Name of image to pull the coordinates from} + +\item{layer}{Layer in the Assay5 to pull data from} } \description{ Identify features whose variability in expression can be explained to some diff --git a/man/FindTransferAnchors.Rd b/man/FindTransferAnchors.Rd index 398a55a03..f0dfbbc60 100644 --- a/man/FindTransferAnchors.Rd +++ b/man/FindTransferAnchors.Rd @@ -21,7 +21,7 @@ FindTransferAnchors( l2.norm = TRUE, dims = 1:30, k.anchor = 5, - k.filter = 200, + k.filter = NA, k.score = 30, max.features = 200, nn.method = "annoy", diff --git a/man/FindVariableFeatures.Rd b/man/FindVariableFeatures.Rd index 3db62df0d..e4a3deac9 100644 --- a/man/FindVariableFeatures.Rd +++ b/man/FindVariableFeatures.Rd @@ -3,7 +3,7 @@ \name{FindVariableFeatures} \alias{FindVariableFeatures} \alias{FindVariableGenes} -\alias{FindVariableFeatures.default} +\alias{FindVariableFeatures.V3Matrix} \alias{FindVariableFeatures.Assay} \alias{FindVariableFeatures.SCTAssay} \alias{FindVariableFeatures.Seurat} @@ -11,7 +11,7 @@ \usage{ FindVariableFeatures(object, ...) -\method{FindVariableFeatures}{default}( +\method{FindVariableFeatures}{V3Matrix}( object, selection.method = "vst", loess.span = 0.3, @@ -66,19 +66,21 @@ FindVariableFeatures(object, ...) \item{selection.method}{How to choose top variable features. Choose one of : \itemize{ - \item{vst:}{ First, fits a line to the relationship of log(variance) and - log(mean) using local polynomial regression (loess). Then standardizes the - feature values using the observed mean and expected variance (given by the - fitted line). Feature variance is then calculated on the standardized values - after clipping to a maximum (see clip.max parameter).} - \item{mean.var.plot (mvp):}{ First, uses a function to calculate average - expression (mean.function) and dispersion (dispersion.function) for each - feature. Next, divides features into num.bin (deafult 20) bins based on - their average expression, and calculates z-scores for dispersion within - each bin. The purpose of this is to identify variable features while - controlling for the strong relationship between variability and average - expression.} - \item{dispersion (disp):}{ selects the genes with the highest dispersion values} + \item \dQuote{\code{vst}}: First, fits a line to the relationship of + log(variance) and log(mean) using local polynomial regression (loess). + Then standardizes the feature values using the observed mean and + expected variance (given by the fitted line). Feature variance is then + calculated on the standardized values + after clipping to a maximum (see clip.max parameter). + \item \dQuote{\code{mean.var.plot}} (mvp): First, uses a function to + calculate average expression (mean.function) and dispersion + (dispersion.function) for each feature. Next, divides features into + \code{num.bin} (deafult 20) bins based on their average expression, + and calculates z-scores for dispersion within each bin. The purpose of + this is to identify variable features while controlling for the + strong relationship between variability and average expression + \item \dQuote{\code{dispersion}} (disp): selects the genes with the + highest dispersion values }} \item{loess.span}{(vst method) Loess span parameter used when fitting the @@ -100,10 +102,12 @@ is 20)} \item{binning.method}{Specifies how the bins should be computed. Available methods are: \itemize{ - \item{equal_width:}{ each bin is of equal width along the x-axis [default]} - \item{equal_frequency:}{ each bin contains an equal number of features (can - increase statistical power to detect overdispersed features at high - expression values, at the cost of reduced resolution along the x-axis)} + \item \dQuote{\code{equal_width}}: each bin is of equal width along the + x-axis (default) + \item \dQuote{\code{equal_frequency}}: each bin contains an equal number + of features (can increase statistical power to detect overdispersed + eatures at high expression values, at the cost of reduced resolution + along the x-axis) }} \item{verbose}{show progress bar for calculations} diff --git a/man/FoldChange.Rd b/man/FoldChange.Rd index edda396b5..cbdff2579 100644 --- a/man/FoldChange.Rd +++ b/man/FoldChange.Rd @@ -4,6 +4,7 @@ \alias{FoldChange} \alias{FoldChange.default} \alias{FoldChange.Assay} +\alias{FoldChange.SCTAssay} \alias{FoldChange.DimReduc} \alias{FoldChange.Seurat} \title{Fold Change} @@ -26,6 +27,19 @@ FoldChange(object, ...) ... ) +\method{FoldChange}{SCTAssay}( + object, + cells.1, + cells.2, + features = NULL, + slot = "data", + pseudocount.use = 1, + fc.name = NULL, + mean.fxn = NULL, + base = 2, + ... +) + \method{FoldChange}{DimReduc}( object, cells.1, @@ -48,7 +62,7 @@ FoldChange(object, ...) slot = "data", reduction = NULL, features = NULL, - pseudocount.use = NULL, + pseudocount.use = 1, mean.fxn = NULL, base = 2, fc.name = NULL, @@ -113,8 +127,10 @@ is returned instead of log fold change and the column is named "avg_diff". Otherwise, log2 fold change is returned with column named "avg_log2_FC". } \examples{ +\dontrun{ data("pbmc_small") FoldChange(pbmc_small, ident.1 = 1) +} } \seealso{ diff --git a/man/GaussianSketch.Rd b/man/GaussianSketch.Rd new file mode 100644 index 000000000..1e7e2d9f3 --- /dev/null +++ b/man/GaussianSketch.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sketching.R +\name{GaussianSketch} +\alias{GaussianSketch} +\title{Gaussian sketching} +\usage{ +GaussianSketch(nsketch, ncells, seed = NA_integer_, ...) +} +\arguments{ +\item{nsketch}{Number of sketching random cells} + +\item{ncells}{Number of cells in the original data} + +\item{seed}{a single value, interpreted as an integer, or \code{NULL} + (see \sQuote{Details}).} + +\item{...}{Ignored} +} +\value{ +... +} +\description{ +Gaussian sketching +} +\concept{sketching} +\keyword{internal} diff --git a/man/GetResidual.Rd b/man/GetResidual.Rd index 6a8835796..0d0525c40 100644 --- a/man/GetResidual.Rd +++ b/man/GetResidual.Rd @@ -8,7 +8,7 @@ GetResidual( object, features, assay = NULL, - umi.assay = NULL, + umi.assay = "RNA", clip.range = NULL, replace.value = FALSE, na.rm = TRUE, @@ -45,9 +45,11 @@ features in its scale.data This function calls sctransform::get_residuals. } \examples{ +\dontrun{ data("pbmc_small") pbmc_small <- SCTransform(object = pbmc_small, variable.features.n = 20) pbmc_small <- GetResidual(object = pbmc_small, features = c('MS4A1', 'TCL1A')) +} } \seealso{ diff --git a/man/GetTissueCoordinates.Rd b/man/GetTissueCoordinates.Rd index 5ccb69d68..4b3a95447 100644 --- a/man/GetTissueCoordinates.Rd +++ b/man/GetTissueCoordinates.Rd @@ -4,6 +4,7 @@ \alias{GetTissueCoordinates.SlideSeq} \alias{GetTissueCoordinates.STARmap} \alias{GetTissueCoordinates.VisiumV1} +\alias{GetTissueCoordinates.VisiumV2} \title{Get Tissue Coordinates} \usage{ \method{GetTissueCoordinates}{SlideSeq}(object, ...) @@ -16,6 +17,8 @@ cols = c("imagerow", "imagecol"), ... ) + +\method{GetTissueCoordinates}{VisiumV2}(object, scale = NULL, ...) } \arguments{ \item{object}{An object} diff --git a/man/HVFInfo.SCTAssay.Rd b/man/HVFInfo.SCTAssay.Rd index 6e26995d8..d29102bae 100644 --- a/man/HVFInfo.SCTAssay.Rd +++ b/man/HVFInfo.SCTAssay.Rd @@ -4,25 +4,12 @@ \alias{HVFInfo.SCTAssay} \title{Get Variable Feature Information} \usage{ -\method{HVFInfo}{SCTAssay}(object, selection.method, status = FALSE, ...) +\method{HVFInfo}{SCTAssay}(object, method, status = FALSE, ...) } \arguments{ \item{object}{An object} -\item{selection.method}{Which method to pull. For \code{HVFInfo} and -\code{VariableFeatures}, choose one from one of the -following: -\itemize{ - \item \dQuote{vst} - \item \dQuote{sctransform} or \dQuote{sct} - \item \dQuote{mean.var.plot}, \dQuote{dispersion}, \dQuote{mvp}, or - \dQuote{disp} -} -For \code{SVFInfo} and \code{SpatiallyVariableFeatures}, choose from: -\itemize{ - \item \dQuote{markvariogram} - \item \dQuote{moransi} -}} +\item{method}{method to determine variable features} \item{status}{Add variable status to the resulting data frame} @@ -32,11 +19,14 @@ For \code{SVFInfo} and \code{SpatiallyVariableFeatures}, choose from: Get variable feature information from \code{\link{SCTAssay}} objects } \examples{ +\dontrun{ # Get the HVF info directly from an SCTAssay object pbmc_small <- SCTransform(pbmc_small) -HVFInfo(pbmc_small[["SCT"]], selection.method = 'sct')[1:5, ] +HVFInfo(pbmc_small[["SCT"]], method = 'sct')[1:5, ] +} } \seealso{ \code{\link[SeuratObject]{HVFInfo}} } +\concept{objects} diff --git a/man/HarmonyIntegration.Rd b/man/HarmonyIntegration.Rd new file mode 100644 index 000000000..ea459e35a --- /dev/null +++ b/man/HarmonyIntegration.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration5.R +\name{HarmonyIntegration} +\alias{HarmonyIntegration} +\title{Harmony Integration} +\usage{ +HarmonyIntegration( + object, + orig, + features = NULL, + scale.layer = "scale.data", + new.reduction = "harmony", + layers = NULL, + npcs = 50L, + key = "harmony_", + theta = NULL, + lambda = NULL, + sigma = 0.1, + nclust = NULL, + tau = 0, + block.size = 0.05, + max.iter.harmony = 10L, + max.iter.cluster = 20L, + epsilon.cluster = 1e-05, + epsilon.harmony = 1e-04, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{An \code{\link[SeuratObject]{Assay5}} object} + +\item{orig}{A \link[SeuratObject:DimReduc]{dimensional reduction} to correct} + +\item{features}{Ignored} + +\item{scale.layer}{Ignored} + +\item{new.reduction}{Name of new integrated dimensional reduction} + +\item{layers}{Ignored} + +\item{npcs}{If doing PCA on input matrix, number of PCs to compute} + +\item{key}{Key for Harmony dimensional reduction} + +\item{theta}{Diversity clustering penalty parameter} + +\item{lambda}{Ridge regression penalty parameter} + +\item{sigma}{Width of soft kmeans clusters} + +\item{nclust}{Number of clusters in model} + +\item{tau}{Protection against overclustering small datasets with large ones} + +\item{block.size}{What proportion of cells to update during clustering} + +\item{max.iter.harmony}{Maximum number of rounds to run Harmony} + +\item{max.iter.cluster}{Maximum number of rounds to run clustering at each round of Harmony} + +\item{epsilon.cluster}{Convergence tolerance for clustering round of Harmony} + +\item{epsilon.harmony}{Convergence tolerance for Harmony} + +\item{verbose}{Whether to print progress messages. TRUE to print, FALSE to suppress} + +\item{...}{Ignored} +} +\value{ +... +} +\description{ +Harmony Integration +} +\note{ +This function requires the +\href{https://cran.r-project.org/package=harmony}{\pkg{harmony}} package +to be installed +} +\examples{ +\dontrun{ +# Preprocessing +obj <- SeuratData::LoadData("pbmcsca") +obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +obj <- NormalizeData(obj) +obj <- FindVariableFeatures(obj) +obj <- ScaleData(obj) +obj <- RunPCA(obj) + +# After preprocessing, we integrate layers with added parameters specific to Harmony: +obj <- IntegrateLayers(object = obj, method = HarmonyIntegration, orig.reduction = "pca", + new.reduction = 'harmony', verbose = FALSE) + +# Modifying Parameters +# We can also add arguments specific to Harmony such as theta, to give more diverse clusters +obj <- IntegrateLayers(object = obj, method = HarmonyIntegration, orig.reduction = "pca", + new.reduction = 'harmony', verbose = FALSE, theta = 3) +# Integrating SCTransformed data +obj <- SCTransform(object = obj) +obj <- IntegrateLayers(object = obj, method = HarmonyIntegration, + orig.reduction = "pca", new.reduction = 'harmony', + assay = "SCT", verbose = FALSE) +} + + +} +\seealso{ +\code{\link[harmony:HarmonyMatrix]{harmony::HarmonyMatrix}()} +} +\concept{integration} diff --git a/man/ISpatialDimPlot.Rd b/man/ISpatialDimPlot.Rd index a19d06119..ebe615fe8 100644 --- a/man/ISpatialDimPlot.Rd +++ b/man/ISpatialDimPlot.Rd @@ -4,15 +4,23 @@ \alias{ISpatialDimPlot} \title{Visualize clusters spatially and interactively} \usage{ -ISpatialDimPlot(object, image = NULL, group.by = NULL, alpha = c(0.3, 1)) +ISpatialDimPlot( + object, + image = NULL, + image.scale = "lowres", + group.by = NULL, + alpha = c(0.3, 1) +) } \arguments{ -\item{object}{Seurat object} +\item{object}{A Seurat object} \item{image}{Name of the image to use in the plot} -\item{group.by}{Name of one or more metadata columns to group (color) cells by -(for example, orig.ident); pass 'ident' to group by identity class} +\item{image.scale}{Choose the scale factor ("lowres"/"hires") to apply in +order to matchthe plot with the specified `image` - defaults to "lowres"} + +\item{group.by}{Name of meta.data column to group the data by} \item{alpha}{Controls opacity of spots. Provide as a vector specifying the min and max for SpatialFeaturePlot. For SpatialDimPlot, provide a single diff --git a/man/ISpatialFeaturePlot.Rd b/man/ISpatialFeaturePlot.Rd index 48528668a..e4bd477ba 100644 --- a/man/ISpatialFeaturePlot.Rd +++ b/man/ISpatialFeaturePlot.Rd @@ -8,18 +8,23 @@ ISpatialFeaturePlot( object, feature, image = NULL, + image.scale = "lowres", slot = "data", alpha = c(0.1, 1) ) } \arguments{ -\item{object}{Seurat object} +\item{object}{A Seurat object} \item{feature}{Feature to visualize} \item{image}{Name of the image to use in the plot} -\item{slot}{Which slot to pull expression data from?} +\item{image.scale}{Choose the scale factor ("lowres"/"hires") to apply in +order to matchthe plot with the specified `image` - defaults to "lowres"} + +\item{slot}{If plotting a feature, which data slot to pull from (counts, +data, or scale.data)} \item{alpha}{Controls opacity of spots. Provide as a vector specifying the min and max for SpatialFeaturePlot. For SpatialDimPlot, provide a single diff --git a/man/ImageDimPlot.Rd b/man/ImageDimPlot.Rd index ce3d3fd5d..43fe764d6 100644 --- a/man/ImageDimPlot.Rd +++ b/man/ImageDimPlot.Rd @@ -28,7 +28,8 @@ ImageDimPlot( overlap = FALSE, axes = FALSE, combine = TRUE, - coord.fixed = TRUE + coord.fixed = TRUE, + flip_xy = TRUE ) } \arguments{ @@ -44,8 +45,8 @@ segmentation boundaries} \item{group.by}{Name of one or more metadata columns to group (color) cells by (for example, orig.ident); pass 'ident' to group by identity class} -\item{split.by}{Name of a metadata column to split plot by; -see \code{\link{FetchData}} for more details} +\item{split.by}{A factor in object metadata to split the plot by, pass 'ident' +to split by cell identity'} \item{cols}{Vector of colors, each color corresponds to an identity class. This may also be a single character or numeric value corresponding to a palette as specified by \code{\link[RColorBrewer]{brewer.pal.info}}. @@ -68,8 +69,7 @@ RColorBrewer is used by default.} \item{nmols}{Max number of each molecule specified in `molecules` to plot} -\item{alpha}{Alpha value, should be between 0 and 1; when plotting multiple -boundaries, \code{alpha} is equivalent to max alpha} +\item{alpha}{Alpha value for plotting (default is 1)} \item{border.color}{Color of cell segmentation border; pass \code{NA} to suppress borders for segmentation-based plots} @@ -96,6 +96,8 @@ given (first is lowest)} return a list of ggplot objects} \item{coord.fixed}{Plot cartesian coordinates with fixed aspect ratio} + +\item{flip_xy}{Flag to flip X and Y axes. Default is FALSE.} } \value{ If \code{combine = TRUE}, a \code{patchwork} @@ -104,3 +106,5 @@ ggplot object; otherwise, a list of ggplot objects \description{ Visualize clusters or other categorical groupings in a spatial context } +\concept{spatial} +\concept{visualization} diff --git a/man/ImageFeaturePlot.Rd b/man/ImageFeaturePlot.Rd index 9a91337eb..95dc7fe69 100644 --- a/man/ImageFeaturePlot.Rd +++ b/man/ImageFeaturePlot.Rd @@ -43,10 +43,11 @@ ImageFeaturePlot( \item{features}{Vector of features to plot. Features can come from: \itemize{ - \item An \code{Assay} feature (e.g. a gene name - "MS4A1") - \item A column name from meta.data (e.g. mitochondrial percentage - "percent.mito") - \item A column name from a \code{DimReduc} object corresponding to the cell embedding values - (e.g. the PC 1 scores - "PC_1") + \item An \code{Assay} feature (e.g. a gene name - "MS4A1") + \item A column name from meta.data (e.g. mitochondrial percentage - + "percent.mito") + \item A column name from a \code{DimReduc} object corresponding to the + cell embedding values (e.g. the PC 1 scores - "PC_1") }} \item{fov}{Name of FOV to plot} @@ -71,8 +72,8 @@ When blend is \code{TRUE}, takes anywhere from 1-3 colors: \item{min.cutoff, max.cutoff}{Vector of minimum and maximum cutoff values for each feature, may specify quantile in the form of 'q##' where '##' is the quantile (eg, 'q1', 'q10')} -\item{split.by}{A factor in object metadata to split the feature plot by, pass 'ident' -to split by cell identity'; similar to the old \code{FeatureHeatmap}} +\item{split.by}{A factor in object metadata to split the plot by, pass 'ident' +to split by cell identity'} \item{molecules}{A vector of molecules to plot} @@ -83,8 +84,7 @@ RColorBrewer is used by default.} \item{nmols}{Max number of each molecule specified in `molecules` to plot} -\item{alpha}{Alpha value, should be between 0 and 1; when plotting multiple -boundaries, \code{alpha} is equivalent to max alpha} +\item{alpha}{Alpha value for plotting (default is 1)} \item{border.color}{Color of cell segmentation border; pass \code{NA} to suppress borders for segmentation-based plots} @@ -130,3 +130,5 @@ ggplot object; otherwise, a list of ggplot objects \description{ Visualize expression in a spatial context } +\concept{spatial} +\concept{visualization} diff --git a/man/IntegrateEmbeddings.Rd b/man/IntegrateEmbeddings.Rd index dc0469132..304d0500a 100644 --- a/man/IntegrateEmbeddings.Rd +++ b/man/IntegrateEmbeddings.Rd @@ -26,6 +26,7 @@ IntegrateEmbeddings(anchorset, ...) anchorset, reference, query, + query.assay = NULL, new.reduction.name = "integrated_dr", reductions = "pcaproject", dims.to.integrate = NULL, @@ -96,6 +97,8 @@ integration.} \item{query}{Query object used in anchorset construction} +\item{query.assay}{Name of the Assay to use from query} + \item{reuse.weights.matrix}{Can be used in conjunction with the store.weights parameter in TransferData to reuse a precomputed weights matrix.} } diff --git a/man/IntegrateLayers.Rd b/man/IntegrateLayers.Rd new file mode 100644 index 000000000..5ac814d34 --- /dev/null +++ b/man/IntegrateLayers.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration5.R +\name{IntegrateLayers} +\alias{IntegrateLayers} +\title{Integrate Layers} +\usage{ +IntegrateLayers( + object, + method, + orig.reduction = "pca", + assay = NULL, + features = NULL, + layers = NULL, + scale.layer = "scale.data", + ... +) +} +\arguments{ +\item{object}{A \code{\link[SeuratObject]{Seurat}} object} + +\item{method}{Integration method function} + +\item{orig.reduction}{Name of dimensional reduction for correction} + +\item{assay}{Name of assay for integration} + +\item{features}{A vector of features to use for integration} + +\item{layers}{Names of normalized layers in \code{assay}} + +\item{scale.layer}{Name(s) of scaled layer(s) in \code{assay}} + +\item{...}{Arguments passed on to \code{method}} +} +\value{ +\code{object} with integration data added to it +} +\description{ +Integrate Layers +} +\section{Integration Method Functions}{ + +The following integration method functions are available: +\Sexpr[stage=render,results=rd]{Seurat:::.rd_methods("integration")} +} + +\seealso{ +\link[Seurat:writing-integration]{Writing integration method functions} +} +\concept{integration} diff --git a/man/JointPCAIntegration.Rd b/man/JointPCAIntegration.Rd new file mode 100644 index 000000000..3f2ab73f7 --- /dev/null +++ b/man/JointPCAIntegration.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration5.R +\name{JointPCAIntegration} +\alias{JointPCAIntegration} +\title{Seurat-Joint PCA Integration} +\usage{ +JointPCAIntegration( + object = NULL, + assay = NULL, + layers = NULL, + orig = NULL, + new.reduction = "integrated.dr", + reference = NULL, + features = NULL, + normalization.method = c("LogNormalize", "SCT"), + dims = 1:30, + k.anchor = 20, + scale.layer = "scale.data", + dims.to.integrate = NULL, + k.weight = 100, + weight.reduction = NULL, + sd.weight = 1, + sample.tree = NULL, + preserve.order = FALSE, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{Seurat} object} + +\item{assay}{Name of \code{Assay} in the \code{Seurat} object} + +\item{layers}{Names of layers in \code{assay}} + +\item{orig}{A \link[SeuratObject:DimReduc]{dimensional reduction} to correct} + +\item{new.reduction}{Name of new integrated dimensional reduction} + +\item{reference}{A reference \code{Seurat} object} + +\item{features}{A vector of features to use for integration} + +\item{normalization.method}{Name of normalization method used: LogNormalize +or SCT} + +\item{dims}{Dimensions of dimensional reduction to use for integration} + +\item{k.anchor}{How many neighbors (k) to use when picking anchors} + +\item{scale.layer}{Name of scaled layer in \code{Assay}} + +\item{dims.to.integrate}{Number of dimensions to return integrated values for} + +\item{k.weight}{Number of neighbors to consider when weighting anchors} + +\item{weight.reduction}{Dimension reduction to use when calculating anchor +weights. This can be one of: +\itemize{ + \item{A string, specifying the name of a dimension reduction present in + all objects to be integrated} + \item{A vector of strings, specifying the name of a dimension reduction to + use for each object to be integrated} + \item{A vector of \code{\link{DimReduc}} objects, specifying the object to + use for each object in the integration} + \item{NULL, in which case the full corrected space is used for computing + anchor weights.} +}} + +\item{sd.weight}{Controls the bandwidth of the Gaussian kernel for weighting} + +\item{sample.tree}{Specify the order of integration. Order of integration +should be encoded in a matrix, where each row represents one of the pairwise +integration steps. Negative numbers specify a dataset, positive numbers +specify the integration results from a given row (the format of the merge +matrix included in the \code{\link{hclust}} function output). For example: +\code{matrix(c(-2, 1, -3, -1), ncol = 2)} gives: + +\if{html}{\out{
}}\preformatted{ [,1] [,2] + [1,] -2 -3 + [2,] 1 -1 +}\if{html}{\out{
}} + +Which would cause dataset 2 and 3 to be integrated first, then the resulting +object integrated with dataset 1. + +If NULL, the sample tree will be computed automatically.} + +\item{preserve.order}{Do not reorder objects based on size for each pairwise +integration.} + +\item{verbose}{Print progress} + +\item{...}{Arguments passed on to \code{FindIntegrationAnchors}} +} +\description{ +Seurat-Joint PCA Integration +} +\concept{integration} diff --git a/man/LeverageScore.Rd b/man/LeverageScore.Rd new file mode 100644 index 000000000..6b48c92fc --- /dev/null +++ b/man/LeverageScore.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generics.R, R/sketching.R +\name{LeverageScore} +\alias{LeverageScore} +\alias{LeverageScore.default} +\alias{LeverageScore.StdAssay} +\alias{LeverageScore.Assay} +\alias{LeverageScore.Seurat} +\title{Leverage Score Calculation} +\usage{ +LeverageScore(object, ...) + +\method{LeverageScore}{default}( + object, + nsketch = 5000L, + ndims = NULL, + method = CountSketch, + eps = 0.5, + seed = 123L, + verbose = TRUE, + ... +) + +\method{LeverageScore}{StdAssay}( + object, + nsketch = 5000L, + ndims = NULL, + method = CountSketch, + vf.method = NULL, + layer = "data", + eps = 0.5, + seed = 123L, + verbose = TRUE, + features = NULL, + ... +) + +\method{LeverageScore}{Assay}( + object, + nsketch = 5000L, + ndims = NULL, + method = CountSketch, + vf.method = NULL, + layer = "data", + eps = 0.5, + seed = 123L, + verbose = TRUE, + features = NULL, + ... +) + +\method{LeverageScore}{Seurat}( + object, + assay = NULL, + nsketch = 5000L, + ndims = NULL, + var.name = "leverage.score", + over.write = FALSE, + method = CountSketch, + vf.method = NULL, + layer = "data", + eps = 0.5, + seed = 123L, + verbose = TRUE, + features = NULL, + ... +) +} +\arguments{ +\item{object}{A matrix-like object} + +\item{...}{Arguments passed to other methods} + +\item{nsketch}{A positive integer. The number of sketches to be used in the approximation. +Default is 5000.} + +\item{ndims}{A positive integer or NULL. The number of dimensions to use. If NULL, the number +of dimensions will default to the number of columns in the object.} + +\item{method}{The sketching method to use, defaults to CountSketch.} + +\item{eps}{A numeric. The error tolerance for the approximation in Johnson–Lindenstrauss embeddings, +defaults to 0.5.} + +\item{seed}{A positive integer. The seed for the random number generator, defaults to 123.} + +\item{verbose}{Print progress and diagnostic messages} + +\item{vf.method}{VariableFeatures method} + +\item{layer}{layer to use} + +\item{features}{A vector of feature names to use for calculating leverage score.} + +\item{assay}{assay to use} + +\item{var.name}{name of slot to store leverage scores} + +\item{over.write}{whether to overwrite slot that currently stores leverage scores. Defaults +to FALSE, in which case the 'var.name' is modified if it already exists in the object} +} +\description{ +This function computes the leverage scores for a given object +It uses the concept of sketching and random projections. The function provides an approximation +to the leverage scores using a scalable method suitable for large matrices. +} +\references{ +Clarkson, K. L. & Woodruff, D. P. +Low-rank approximation and regression in input sparsity time. +JACM 63, 1–45 (2017). \url{https://dl.acm.org/doi/10.1145/3019134}; +} +\concept{sketching} diff --git a/man/LinkedPlots.Rd b/man/LinkedPlots.Rd index 103241f9f..6ed68241f 100644 --- a/man/LinkedPlots.Rd +++ b/man/LinkedPlots.Rd @@ -13,6 +13,7 @@ LinkedDimPlot( dims = 1:2, reduction = NULL, image = NULL, + image.scale = "lowres", group.by = NULL, alpha = c(0.1, 1), combine = TRUE @@ -24,13 +25,14 @@ LinkedFeaturePlot( dims = 1:2, reduction = NULL, image = NULL, + image.scale = "lowres", slot = "data", alpha = c(0.1, 1), combine = TRUE ) } \arguments{ -\item{object}{Seurat object} +\item{object}{A Seurat object} \item{dims}{Dimensions to plot, must be a two-length numeric vector specifying x- and y-dimensions} @@ -38,19 +40,22 @@ LinkedFeaturePlot( \item{image}{Name of the image to use in the plot} -\item{group.by}{Name of one or more metadata columns to group (color) cells by -(for example, orig.ident); pass 'ident' to group by identity class} +\item{image.scale}{Choose the scale factor ("lowres"/"hires") to apply in +order to matchthe plot with the specified `image` - defaults to "lowres"} + +\item{group.by}{Name of meta.data column to group the data by} \item{alpha}{Controls opacity of spots. Provide as a vector specifying the min and max for SpatialFeaturePlot. For SpatialDimPlot, provide a single alpha value for each plot.} -\item{combine}{Combine plots into a single \code{\link[patchwork]{patchwork}ed} -ggplot object. If \code{FALSE}, return a list of ggplot objects} +\item{combine}{Combine plots into a single gg object; note that if TRUE; +themeing will not work when plotting multiple features/groupings} \item{feature}{Feature to visualize} -\item{slot}{Which slot to pull expression data from?} +\item{slot}{If plotting a feature, which data slot to pull from (counts, +data, or scale.data)} } \value{ Returns final plots. If \code{combine}, plots are stiched together diff --git a/man/Load10X_Spatial.Rd b/man/Load10X_Spatial.Rd index cfe5c63d6..9bb2bbb5d 100644 --- a/man/Load10X_Spatial.Rd +++ b/man/Load10X_Spatial.Rd @@ -9,6 +9,7 @@ Load10X_Spatial( filename = "filtered_feature_bc_matrix.h5", assay = "Spatial", slice = "slice1", + bin.size = NULL, filter.matrix = TRUE, to.upper = FALSE, image = NULL, @@ -25,15 +26,16 @@ and the image data in a subdirectory called \code{spatial}} \item{slice}{Name for the stored image of the tissue slice} +\item{bin.size}{Specifies the bin sizes to read in - defaults to c(16, 8)} + \item{filter.matrix}{Only keep spots that have been determined to be over tissue} -\item{to.upper}{Converts all feature names to upper case. This can provide an -approximate conversion of mouse to human gene names which can be useful in an -explorative analysis. For cross-species comparisons, orthologous genes should -be identified across species and used instead.} +\item{to.upper}{Converts all feature names to upper case. Can be useful when +analyses require comparisons between human and mouse gene names for example.} -\item{image}{An object of class VisiumV1. Typically, an output from \code{\link{Read10X_Image}}} +\item{image}{\code{VisiumV1}/\code{VisiumV2} instance(s) - if a vector is +passed in it should be co-indexed with \code{`bin.size`}} \item{...}{Arguments passed to \code{\link{Read10X_h5}}} } diff --git a/man/LoadCurioSeeker.Rd b/man/LoadCurioSeeker.Rd new file mode 100644 index 000000000..5b86ef2c7 --- /dev/null +++ b/man/LoadCurioSeeker.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{LoadCurioSeeker} +\alias{LoadCurioSeeker} +\title{Load Curio Seeker data} +\usage{ +LoadCurioSeeker(data.dir, assay = "Spatial") +} +\arguments{ +\item{data.dir}{location of data directory that contains the counts matrix, +gene names, barcodes/beads, and barcodes/bead location files.} + +\item{assay}{Name of assay to associate spatial data to} +} +\value{ +A \code{\link{Seurat}} object +} +\description{ +Load Curio Seeker data +} +\concept{preprocessing} diff --git a/man/LogNormalize.Rd b/man/LogNormalize.Rd index 468b37a45..0c72744b8 100644 --- a/man/LogNormalize.Rd +++ b/man/LogNormalize.Rd @@ -1,23 +1,37 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/generics.R, R/preprocessing.R, +% R/preprocessing5.R \name{LogNormalize} \alias{LogNormalize} -\title{Normalize raw data} +\alias{LogNormalize.data.frame} +\alias{LogNormalize.V3Matrix} +\alias{LogNormalize.default} +\title{Normalize Raw Data} \usage{ -LogNormalize(data, scale.factor = 10000, verbose = TRUE) +LogNormalize(data, scale.factor = 10000, margin = 2L, verbose = TRUE, ...) + +\method{LogNormalize}{data.frame}(data, scale.factor = 10000, margin = 2L, verbose = TRUE, ...) + +\method{LogNormalize}{V3Matrix}(data, scale.factor = 10000, margin = 2L, verbose = TRUE, ...) + +\method{LogNormalize}{default}(data, scale.factor = 10000, margin = 2L, verbose = TRUE, ...) } \arguments{ \item{data}{Matrix with the raw count data} -\item{scale.factor}{Scale the data. Default is 1e4} +\item{scale.factor}{Scale the data; default is \code{1e4}} + +\item{margin}{Margin to normalize over} \item{verbose}{Print progress} + +\item{...}{Arguments passed to other methods} } \value{ -Returns a matrix with the normalize and log transformed data +A matrix with the normalized and log-transformed data } \description{ -Normalize count data per cell and transform to log scale +Normalize Raw Data } \examples{ mat <- matrix(data = rbinom(n = 25, size = 5, prob = 0.2), nrow = 5) diff --git a/man/MVP.Rd b/man/MVP.Rd new file mode 100644 index 000000000..516015275 --- /dev/null +++ b/man/MVP.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing5.R +\name{MVP} +\alias{MVP} +\title{Find variable features based on mean.var.plot} +\usage{ +MVP( + data, + verbose = TRUE, + nselect = 2000L, + mean.cutoff = c(0.1, 8), + dispersion.cutoff = c(1, Inf), + ... +) +} +\arguments{ +\item{data}{Data matrix} + +\item{verbose}{Whether to print messages and progress bars} + +\item{nselect}{Number of features to select based on dispersion values} + +\item{mean.cutoff}{Numeric of length two specifying the min and max values} + +\item{dispersion.cutoff}{Numeric of length two specifying the min and max values} +} +\description{ +Find variable features based on mean.var.plot +} +\keyword{internal} diff --git a/man/MapQuery.Rd b/man/MapQuery.Rd index 7ca0a68ce..035f75b4d 100644 --- a/man/MapQuery.Rd +++ b/man/MapQuery.Rd @@ -13,6 +13,7 @@ MapQuery( reference.reduction = NULL, reference.dims = NULL, query.dims = NULL, + store.weights = FALSE, reduction.model = NULL, transferdata.args = list(), integrateembeddings.args = list(), @@ -48,6 +49,8 @@ neighbor finding} \item{query.dims}{Dimensions (columns) to use from query} +\item{store.weights}{Determine if the weight and anchor matrices are stored.} + \item{reduction.model}{\code{DimReduc} object that contains the umap model} \item{transferdata.args}{A named list of additional arguments to @@ -62,8 +65,7 @@ neighbor finding} \item{verbose}{Print progress bars and output} } \value{ -Returns a modified query Seurat object containing: - +Returns a modified query Seurat object containing:#' \itemize{ \item{New Assays corresponding to the features transferred and/or their corresponding prediction scores from \code{\link{TransferData}}} diff --git a/man/MixscapeHeatmap.Rd b/man/MixscapeHeatmap.Rd index fabae8b35..c6c8ab8cc 100644 --- a/man/MixscapeHeatmap.Rd +++ b/man/MixscapeHeatmap.Rd @@ -37,7 +37,7 @@ use all other cells for comparison; if an object of class \code{phylo} or \item{balanced}{Plot an equal number of genes with both groups of cells.} \item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +X-fold difference (log-scale) between the two groups of cells. Default is 0.25. Increasing logfc.threshold speeds up the function, but can miss weaker signals.} \item{assay}{Assay to use in differential expression testing} @@ -47,7 +47,11 @@ Increasing logfc.threshold speeds up the function, but can miss weaker signals.} \item{test.use}{Denotes which test to use. Available options are: \itemize{ \item{"wilcox"} : Identifies differentially expressed genes between two - groups of cells using a Wilcoxon Rank Sum test (default) + groups of cells using a Wilcoxon Rank Sum test (default); will use a fast + implementation by Presto if installed + \item{"wilcox_limma"} : Identifies differentially expressed genes between two + groups of cells using the limma implementation of the Wilcoxon Rank Sum test; + set this option to reproduce results from Seurat v4 \item{"bimod"} : Likelihood-ratio test for single cell gene expression, (McDavid et al., Bioinformatics, 2013) \item{"roc"} : Identifies 'markers' of gene expression using ROC analysis. diff --git a/man/MixscapeLDA.Rd b/man/MixscapeLDA.Rd index 63f9b5b57..148a8ae2e 100644 --- a/man/MixscapeLDA.Rd +++ b/man/MixscapeLDA.Rd @@ -43,7 +43,7 @@ MixscapeLDA( \item{verbose}{Print progress bar.} \item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +X-fold difference (log-scale) between the two groups of cells. Default is 0.25. Increasing logfc.threshold speeds up the function, but can miss weaker signals.} } \value{ diff --git a/man/NNPlot.Rd b/man/NNPlot.Rd index b6d51c6a5..cf41671ee 100644 --- a/man/NNPlot.Rd +++ b/man/NNPlot.Rd @@ -40,7 +40,8 @@ NNPlot( \item{repel}{Repel labels} \item{sizes.highlight}{Size of highlighted cells; will repeat to the length -groups in cells.highlight} +groups in cells.highlight. If \code{sizes.highlight = TRUE} size of all +points will be this value.} \item{pt.size}{Adjust point size for plotting} diff --git a/man/NNtoGraph.Rd b/man/NNtoGraph.Rd new file mode 100644 index 000000000..95f6d36f4 --- /dev/null +++ b/man/NNtoGraph.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{NNtoGraph} +\alias{NNtoGraph} +\title{Convert Neighbor class to an asymmetrical Graph class} +\usage{ +NNtoGraph(nn.object, col.cells = NULL, weighted = FALSE) +} +\arguments{ +\item{nn.object}{A neighbor class object} + +\item{col.cells}{Cells names of the neighbors, cell names in nn.object is used by default} + +\item{weighted}{Determine if use distance in the Graph} +} +\value{ +Returns a Graph object +} +\description{ +Convert Neighbor class to an asymmetrical Graph class +} +\concept{integration} diff --git a/man/NormalizeData.Rd b/man/NormalizeData.Rd index cb3ac1635..1f6edd463 100644 --- a/man/NormalizeData.Rd +++ b/man/NormalizeData.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/generics.R, R/preprocessing.R \name{NormalizeData} \alias{NormalizeData} -\alias{NormalizeData.default} +\alias{NormalizeData.V3Matrix} \alias{NormalizeData.Assay} \alias{NormalizeData.Seurat} \title{Normalize Data} \usage{ NormalizeData(object, ...) -\method{NormalizeData}{default}( +\method{NormalizeData}{V3Matrix}( object, normalization.method = "LogNormalize", scale.factor = 10000, @@ -45,13 +45,14 @@ NormalizeData(object, ...) \item{normalization.method}{Method for normalization. \itemize{ - \item{LogNormalize: }{Feature counts for each cell are divided by the total - counts for that cell and multiplied by the scale.factor. This is then - natural-log transformed using log1p.} - \item{CLR: }{Applies a centered log ratio transformation} - \item{RC: }{Relative counts. Feature counts for each cell are divided by the total - counts for that cell and multiplied by the scale.factor. No log-transformation is applied. - For counts per million (CPM) set \code{scale.factor = 1e6}} + \item \dQuote{\code{LogNormalize}}: Feature counts for each cell are + divided by the total counts for that cell and multiplied by the + \code{scale.factor}. This is then natural-log transformed using \code{log1p} + \item \dQuote{\code{CLR}}: Applies a centered log ratio transformation + \item \dQuote{\code{RC}}: Relative counts. Feature counts for each cell + are divided by the total counts for that cell and multiplied by the + \code{scale.factor}. No log-transformation is applied. For counts per + million (CPM) set \code{scale.factor = 1e6} }} \item{scale.factor}{Sets the scale factor for cell-level normalization} diff --git a/man/PlotClusterTree.Rd b/man/PlotClusterTree.Rd index 567c6dd1f..248cd70bd 100644 --- a/man/PlotClusterTree.Rd +++ b/man/PlotClusterTree.Rd @@ -22,10 +22,12 @@ Plots dendogram (must be precomputed using BuildClusterTree), returns no value Plots previously computed tree (from BuildClusterTree) } \examples{ +\dontrun{ if (requireNamespace("ape", quietly = TRUE)) { data("pbmc_small") pbmc_small <- BuildClusterTree(object = pbmc_small) PlotClusterTree(object = pbmc_small) } } +} \concept{visualization} diff --git a/man/PolyFeaturePlot.Rd b/man/PolyFeaturePlot.Rd index 59a75466d..6c822aa49 100644 --- a/man/PolyFeaturePlot.Rd +++ b/man/PolyFeaturePlot.Rd @@ -21,10 +21,11 @@ PolyFeaturePlot( \item{features}{Vector of features to plot. Features can come from: \itemize{ - \item An \code{Assay} feature (e.g. a gene name - "MS4A1") - \item A column name from meta.data (e.g. mitochondrial percentage - "percent.mito") - \item A column name from a \code{DimReduc} object corresponding to the cell embedding values - (e.g. the PC 1 scores - "PC_1") + \item An \code{Assay} feature (e.g. a gene name - "MS4A1") + \item A column name from meta.data (e.g. mitochondrial percentage - + "percent.mito") + \item A column name from a \code{DimReduc} object corresponding to the + cell embedding values (e.g. the PC 1 scores - "PC_1") }} \item{cells}{Vector of cells to plot (default is all cells)} diff --git a/man/PrepLDA.Rd b/man/PrepLDA.Rd index 8e5121b6e..d1fedec4f 100644 --- a/man/PrepLDA.Rd +++ b/man/PrepLDA.Rd @@ -31,7 +31,7 @@ PrepLDA( \item{verbose}{Print progress bar.} \item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 +X-fold difference (log-scale) between the two groups of cells. Default is 0.25. Increasing logfc.threshold speeds up the function, but can miss weaker signals.} } \value{ diff --git a/man/PrepSCTFindMarkers.Rd b/man/PrepSCTFindMarkers.Rd index efec41676..4c29d71c0 100644 --- a/man/PrepSCTFindMarkers.Rd +++ b/man/PrepSCTFindMarkers.Rd @@ -24,10 +24,35 @@ as the sequencing depth covariate. The counts slot of the SCT assay is replaced with recorrected counts and the data slot is replaced with log1p of recorrected counts. } +\section{Progress Updates with \pkg{progressr}}{ + +This function uses +\href{https://cran.r-project.org/package=progressr}{\pkg{progressr}} to +render status updates and progress bars. To enable progress updates, wrap +the function call in \code{\link[progressr]{with_progress}} or run +\code{\link[progressr:handlers]{handlers(global = TRUE)}} before running +this function. For more details about \pkg{progressr}, please read +\href{https://progressr.futureverse.org/articles/progressr-intro.html}{\code{vignette("progressr-intro")}} +} + +\section{Parallelization with \pkg{future}}{ + +This function uses +\href{https://cran.r-project.org/package=future}{\pkg{future}} to enable +parallelization. Parallelization strategies can be set using +\code{\link[future]{plan}}. Common plans include \dQuote{\code{sequential}} +for non-parallelized processing or \dQuote{\code{multisession}} for parallel +evaluation using multiple \R sessions; for other plans, see the +\dQuote{Implemented evaluation strategies} section of +\code{\link[future:plan]{?future::plan}}. For a more thorough introduction +to \pkg{future}, see +\href{https://future.futureverse.org/articles/future-1-overview.html}{\code{vignette("future-1-overview")}} +} + \examples{ data("pbmc_small") -pbmc_small1 <- SCTransform(object = pbmc_small, variable.features.n = 20) -pbmc_small2 <- SCTransform(object = pbmc_small, variable.features.n = 20) +pbmc_small1 <- SCTransform(object = pbmc_small, variable.features.n = 20, vst.flavor="v1") +pbmc_small2 <- SCTransform(object = pbmc_small, variable.features.n = 20, vst.flavor="v1") pbmc_merged <- merge(x = pbmc_small1, y = pbmc_small2) pbmc_merged <- PrepSCTFindMarkers(object = pbmc_merged) markers <- FindMarkers( @@ -47,3 +72,4 @@ markers_subset <- FindMarkers( } \concept{differential_expression} +\concept{future} diff --git a/man/PrepareBridgeReference.Rd b/man/PrepareBridgeReference.Rd new file mode 100644 index 000000000..46f2c1a54 --- /dev/null +++ b/man/PrepareBridgeReference.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{PrepareBridgeReference} +\alias{PrepareBridgeReference} +\title{Prepare the bridge and reference datasets} +\usage{ +PrepareBridgeReference( + reference, + bridge, + reference.reduction = "pca", + reference.dims = 1:50, + normalization.method = c("SCT", "LogNormalize"), + reference.assay = NULL, + bridge.ref.assay = "RNA", + bridge.query.assay = "ATAC", + supervised.reduction = c("slsi", "spca", NULL), + bridge.query.reduction = NULL, + bridge.query.features = NULL, + laplacian.reduction.name = "lap", + laplacian.reduction.key = "lap_", + laplacian.reduction.dims = 1:50, + verbose = TRUE +) +} +\arguments{ +\item{reference}{A reference Seurat object} + +\item{bridge}{A multi-omic bridge Seurat object} + +\item{reference.reduction}{Name of dimensional reduction of the reference object (default is 'pca')} + +\item{reference.dims}{Number of dimensions used for the reference.reduction (default is 50)} + +\item{normalization.method}{Name of normalization method used: LogNormalize +or SCT} + +\item{reference.assay}{Assay name for reference (default is \code{\link{DefaultAssay}})} + +\item{bridge.ref.assay}{Assay name for bridge used for reference mapping. RNA by default} + +\item{bridge.query.assay}{Assay name for bridge used for query mapping. ATAC by default} + +\item{supervised.reduction}{Type of supervised dimensional reduction to be performed +for integrating the bridge and query. +#' Options are: +\itemize{ + \item{slsi: Perform supervised LSI as the dimensional reduction for + the bridge-query integration} + \item{spca: Perform supervised PCA as the dimensional reduction for + the bridge-query integration} + \item{NULL: no supervised dimensional reduction will be calculated. + bridge.query.reduction is used for the bridge-query integration} +}} + +\item{bridge.query.reduction}{Name of dimensions used for the bridge-query harmonization. +'bridge.query.reduction' and 'supervised.reduction' cannot be NULL together.} + +\item{bridge.query.features}{Features used for bridge query dimensional reduction +(default is NULL which uses VariableFeatures from the bridge object)} + +\item{laplacian.reduction.name}{Name of dimensional reduction name of graph laplacian eigenspace (default is 'lap')} + +\item{laplacian.reduction.key}{Dimensional reduction key (default is 'lap_')} + +\item{laplacian.reduction.dims}{Number of dimensions used for graph laplacian eigenspace (default is 50)} + +\item{verbose}{Print progress and message (default is TRUE)} +} +\value{ +Returns a \code{BridgeReferenceSet} that can be used as input to + \code{\link{FindBridgeTransferAnchors}}. +The parameters used are stored in the \code{BridgeReferenceSet} as well +} +\description{ +Preprocess the multi-omic bridge and unimodal reference datasets into +an extended reference. +This function performs the following three steps: +1. Performs within-modality harmonization between bridge and reference +2. Performs dimensional reduction on the SNN graph of bridge datasets via +Laplacian Eigendecomposition +3. Constructs a bridge dictionary representation for unimodal reference cells +} +\concept{integration} diff --git a/man/ProjectCellEmbeddings.Rd b/man/ProjectCellEmbeddings.Rd new file mode 100644 index 000000000..fca3c2d60 --- /dev/null +++ b/man/ProjectCellEmbeddings.Rd @@ -0,0 +1,136 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generics.R, R/integration.R +\name{ProjectCellEmbeddings} +\alias{ProjectCellEmbeddings} +\alias{ProjectCellEmbeddings.Seurat} +\alias{ProjectCellEmbeddings.Assay} +\alias{ProjectCellEmbeddings.SCTAssay} +\alias{ProjectCellEmbeddings.StdAssay} +\alias{ProjectCellEmbeddings.default} +\alias{ProjectCellEmbeddings.IterableMatrix} +\title{Project query data to the reference dimensional reduction} +\usage{ +ProjectCellEmbeddings(query, ...) + +\method{ProjectCellEmbeddings}{Seurat}( + query, + reference, + query.assay = NULL, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + normalization.method = c("LogNormalize", "SCT"), + scale = TRUE, + verbose = TRUE, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) + +\method{ProjectCellEmbeddings}{Assay}( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) + +\method{ProjectCellEmbeddings}{SCTAssay}( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) + +\method{ProjectCellEmbeddings}{StdAssay}( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) + +\method{ProjectCellEmbeddings}{default}( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + features = NULL, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + ... +) + +\method{ProjectCellEmbeddings}{IterableMatrix}( + query, + reference, + reference.assay = NULL, + reduction = "pca", + dims = 1:50, + scale = TRUE, + normalization.method = NULL, + verbose = TRUE, + features = features, + nCount_UMI = NULL, + feature.mean = NULL, + feature.sd = NULL, + block.size = 10000, + ... +) +} +\arguments{ +\item{query}{An object for query cells} + +\item{reference}{An object for reference cells} + +\item{query.assay}{Assay name for query object} + +\item{reference.assay}{Assay name for reference object} + +\item{reduction}{Name of dimensional reduction from reference object} + +\item{dims}{Dimensions used for reference dimensional reduction} + +\item{scale}{Determine if scale query data based on reference data variance} + +\item{verbose}{Print progress} + +\item{feature.mean}{Mean of features in reference} + +\item{feature.sd}{Standard variance of features in reference} +} +\value{ +A matrix with projected cell embeddings +} +\description{ +Project query data to the reference dimensional reduction +} +\keyword{internal} diff --git a/man/ProjectData.Rd b/man/ProjectData.Rd new file mode 100644 index 000000000..e57a8b595 --- /dev/null +++ b/man/ProjectData.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sketching.R +\name{ProjectData} +\alias{ProjectData} +\title{Project full data to the sketch assay} +\usage{ +ProjectData( + object, + assay = "RNA", + sketched.assay = "sketch", + sketched.reduction, + full.reduction, + dims, + normalization.method = c("LogNormalize", "SCT"), + refdata = NULL, + k.weight = 50, + umap.model = NULL, + recompute.neighbors = FALSE, + recompute.weights = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{object}{A Seurat object.} + +\item{assay}{Assay name for the full data. Default is 'RNA'.} + +\item{sketched.assay}{Sketched assay name to project onto. Default is 'sketch'.} + +\item{sketched.reduction}{Dimensional reduction results of the sketched assay to project onto.} + +\item{full.reduction}{Dimensional reduction name for the projected full dataset.} + +\item{dims}{Dimensions to include in the projection.} + +\item{normalization.method}{Normalization method to use. Can be 'LogNormalize' or 'SCT'. +Default is 'LogNormalize'.} + +\item{refdata}{An optional list for label transfer from sketch to full data. Default is NULL. +Similar to refdata in `MapQuery`} + +\item{k.weight}{Number of neighbors to consider when weighting labels for transfer. Default is 50.} + +\item{umap.model}{An optional pre-computed UMAP model. Default is NULL.} + +\item{recompute.neighbors}{Whether to recompute the neighbors for label transfer. Default is FALSE.} + +\item{recompute.weights}{Whether to recompute the weights for label transfer. Default is FALSE.} + +\item{verbose}{Print progress and diagnostic messages.} +} +\value{ +A Seurat object with the full data projected onto the sketched dimensional reduction results. +The projected data are stored in the specified full reduction. +} +\description{ +This function allows projection of high-dimensional single-cell RNA expression data from a full dataset +onto the lower-dimensional embedding of the sketch of the dataset. +} +\concept{sketching} diff --git a/man/ProjectDimReduc.Rd b/man/ProjectDimReduc.Rd new file mode 100644 index 000000000..503cf9d2c --- /dev/null +++ b/man/ProjectDimReduc.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{ProjectDimReduc} +\alias{ProjectDimReduc} +\title{Project query data to reference dimensional reduction} +\usage{ +ProjectDimReduc( + query, + reference, + mode = c("pcaproject", "lsiproject"), + reference.reduction, + combine = FALSE, + query.assay = NULL, + reference.assay = NULL, + features = NULL, + do.scale = TRUE, + reduction.name = NULL, + reduction.key = NULL, + verbose = TRUE +) +} +\arguments{ +\item{query}{Query object} + +\item{reference}{Reference object} + +\item{mode}{Projection mode name for projection + \itemize{ +\item{pcaproject: PCA projection} +\item{lsiproject: LSI projection} +}} + +\item{reference.reduction}{Name of dimensional reduction in the reference object} + +\item{combine}{Determine if query and reference objects are combined} + +\item{query.assay}{Assay used for query object} + +\item{reference.assay}{Assay used for reference object} + +\item{features}{Features used for projection} + +\item{do.scale}{Determine if scale expression matrix in the pcaproject mode} + +\item{reduction.name}{dimensional reduction name, reference.reduction is used by default} + +\item{reduction.key}{dimensional reduction key, the key in reference.reduction +is used by default} + +\item{verbose}{Print progress and message} +} +\value{ +Returns a query-only or query-reference combined seurat object +} +\description{ +Project query data to reference dimensional reduction +} +\concept{integration} diff --git a/man/ProjectIntegration.Rd b/man/ProjectIntegration.Rd new file mode 100644 index 000000000..8755eec08 --- /dev/null +++ b/man/ProjectIntegration.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{ProjectIntegration} +\alias{ProjectIntegration} +\title{Integrate embeddings from the integrated sketched.assay} +\usage{ +ProjectIntegration( + object, + sketched.assay = "sketch", + assay = "RNA", + reduction = "integrated_dr", + features = NULL, + layers = "data", + reduction.name = NULL, + reduction.key = NULL, + method = c("sketch", "data"), + ratio = 0.8, + sketched.layers = NULL, + seed = 123, + verbose = TRUE +) +} +\arguments{ +\item{object}{A Seurat object with all cells for one dataset} + +\item{sketched.assay}{Assay name for sketched-cell expression (default is 'sketch')} + +\item{assay}{Assay name for original expression (default is 'RNA')} + +\item{reduction}{Dimensional reduction name for batch-corrected embeddings +in the sketched object (default is 'integrated_dr')} + +\item{features}{Features used for atomic sketch integration} + +\item{layers}{Names of layers for correction.} + +\item{reduction.name}{Name to save new reduction as; defaults to +\code{paste0(reduction, '.orig')}} + +\item{reduction.key}{Key for new dimensional reduction; defaults to creating +one from \code{reduction.name}} + +\item{method}{Methods to construct sketch-cell representation +for all cells (default is 'sketch'). Can be one of: +\itemize{ + \item \dQuote{\code{sketch}}: Use random sketched data slot + \item \dQuote{\code{data}}: Use data slot +}} + +\item{ratio}{Sketch ratio of data slot when \code{dictionary.method} is set +to \dQuote{\code{sketch}}; defaults to 0.8} + +\item{sketched.layers}{Names of sketched layers, defaults to all +layers of \dQuote{\code{object[[assay]]}}} + +\item{seed}{A positive integer. The seed for the random number generator, defaults to 123.} + +\item{verbose}{Print progress and message} +} +\value{ +Returns a Seurat object with an integrated dimensional reduction +} +\description{ +The main steps of this procedure are outlined below. For a more detailed +description of the methodology, please see Hao, et al Biorxiv 2022: +\doi{10.1101/2022.02.24.481684} +} +\details{ +First learn a atom dictionary representation to reconstruct each cell. +Then, using this dictionary representation, +reconstruct the embeddings of each cell from the integrated atoms. +} +\concept{integration} diff --git a/man/PseudobulkExpression.Rd b/man/PseudobulkExpression.Rd new file mode 100644 index 000000000..f790383af --- /dev/null +++ b/man/PseudobulkExpression.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generics.R, R/utilities.R +\name{PseudobulkExpression} +\alias{PseudobulkExpression} +\alias{PseudobulkExpression.Assay} +\alias{PseudobulkExpression.StdAssay} +\alias{PseudobulkExpression.Seurat} +\title{Pseudobulk Expression} +\usage{ +PseudobulkExpression(object, ...) + +\method{PseudobulkExpression}{Assay}( + object, + assay, + category.matrix, + features = NULL, + layer = "data", + slot = deprecated(), + verbose = TRUE, + ... +) + +\method{PseudobulkExpression}{StdAssay}( + object, + assay, + category.matrix, + features = NULL, + layer = "data", + slot = deprecated(), + verbose = TRUE, + ... +) + +\method{PseudobulkExpression}{Seurat}( + object, + assays = NULL, + features = NULL, + return.seurat = FALSE, + group.by = "ident", + add.ident = NULL, + layer = "data", + slot = deprecated(), + method = "average", + normalization.method = "LogNormalize", + scale.factor = 10000, + margin = 1, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{Seurat object} + +\item{...}{Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}} + +\item{assay}{The name of the passed assay - used primarily for warning/error messages} + +\item{category.matrix}{A matrix defining groupings for pseudobulk expression +calculations; each column represents an identity class, and each row a sample} + +\item{features}{Features to analyze. Default is all features in the assay} + +\item{layer}{Layer(s) to user; if multiple are given, assumed to follow +the order of 'assays' (if specified) or object's assays} + +\item{slot}{(Deprecated) See \code{layer}} + +\item{verbose}{Print messages and show progress bar} + +\item{assays}{Which assays to use. Default is all assays} + +\item{return.seurat}{Whether to return the data as a Seurat object. Default is FALSE} + +\item{group.by}{Categories for grouping (e.g, "ident", "replicate", +"celltype"); "ident" by default} + +\item{add.ident}{(Deprecated) See group.by} + +\item{method}{The method used for calculating pseudobulk expression; one of: +"average" or "aggregate"} + +\item{normalization.method}{Method for normalization, see \code{\link{NormalizeData}}} + +\item{scale.factor}{Scale factor for normalization, see \code{\link{NormalizeData}}} + +\item{margin}{Margin to perform CLR normalization, see \code{\link{NormalizeData}}} +} +\value{ +Returns object after normalization + +Returns a matrix with genes as rows, identity classes as columns. +If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}. +} +\description{ +Normalize the count data present in a given assay. + +Returns a representative expression value for each identity class +} +\concept{utilities} diff --git a/man/RPCAIntegration.Rd b/man/RPCAIntegration.Rd new file mode 100644 index 000000000..0e4589d68 --- /dev/null +++ b/man/RPCAIntegration.Rd @@ -0,0 +1,137 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration5.R +\name{RPCAIntegration} +\alias{RPCAIntegration} +\title{Seurat-RPCA Integration} +\usage{ +RPCAIntegration( + object = NULL, + assay = NULL, + layers = NULL, + orig = NULL, + new.reduction = "integrated.dr", + reference = NULL, + features = NULL, + normalization.method = c("LogNormalize", "SCT"), + dims = 1:30, + k.filter = NA, + scale.layer = "scale.data", + dims.to.integrate = NULL, + k.weight = 100, + weight.reduction = NULL, + sd.weight = 1, + sample.tree = NULL, + preserve.order = FALSE, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{Seurat} object} + +\item{assay}{Name of \code{Assay} in the \code{Seurat} object} + +\item{layers}{Names of layers in \code{assay}} + +\item{orig}{A \link[SeuratObject:DimReduc]{dimensional reduction} to correct} + +\item{new.reduction}{Name of new integrated dimensional reduction} + +\item{reference}{A reference \code{Seurat} object} + +\item{features}{A vector of features to use for integration} + +\item{normalization.method}{Name of normalization method used: LogNormalize +or SCT} + +\item{dims}{Dimensions of dimensional reduction to use for integration} + +\item{k.filter}{Number of anchors to filter} + +\item{scale.layer}{Name of scaled layer in \code{Assay}} + +\item{dims.to.integrate}{Number of dimensions to return integrated values for} + +\item{k.weight}{Number of neighbors to consider when weighting anchors} + +\item{weight.reduction}{Dimension reduction to use when calculating anchor +weights. This can be one of: +\itemize{ + \item{A string, specifying the name of a dimension reduction present in + all objects to be integrated} + \item{A vector of strings, specifying the name of a dimension reduction to + use for each object to be integrated} + \item{A vector of \code{\link{DimReduc}} objects, specifying the object to + use for each object in the integration} + \item{NULL, in which case the full corrected space is used for computing + anchor weights.} +}} + +\item{sd.weight}{Controls the bandwidth of the Gaussian kernel for weighting} + +\item{sample.tree}{Specify the order of integration. Order of integration +should be encoded in a matrix, where each row represents one of the pairwise +integration steps. Negative numbers specify a dataset, positive numbers +specify the integration results from a given row (the format of the merge +matrix included in the \code{\link{hclust}} function output). For example: +\code{matrix(c(-2, 1, -3, -1), ncol = 2)} gives: + +\if{html}{\out{
}}\preformatted{ [,1] [,2] + [1,] -2 -3 + [2,] 1 -1 +}\if{html}{\out{
}} + +Which would cause dataset 2 and 3 to be integrated first, then the resulting +object integrated with dataset 1. + +If NULL, the sample tree will be computed automatically.} + +\item{preserve.order}{Do not reorder objects based on size for each pairwise +integration.} + +\item{verbose}{Print progress} + +\item{...}{Arguments passed on to \code{FindIntegrationAnchors}} +} +\description{ +Seurat-RPCA Integration +} +\examples{ +\dontrun{ +# Preprocessing +obj <- SeuratData::LoadData("pbmcsca") +obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +obj <- NormalizeData(obj) +obj <- FindVariableFeatures(obj) +obj <- ScaleData(obj) +obj <- RunPCA(obj) + +# After preprocessing, we run integration +obj <- IntegrateLayers(object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + verbose = FALSE) + +# Reference-based Integration +# Here, we use the first layer as a reference for integraion +# Thus, we only identify anchors between the reference and the rest of the datasets, +# saving computational resources +obj <- IntegrateLayers(object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + reference = 1, verbose = FALSE) + +# Modifying parameters +# We can also specify parameters such as `k.anchor` to increase the strength of +# integration +obj <- IntegrateLayers(object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + k.anchor = 20, verbose = FALSE) + +# Integrating SCTransformed data +obj <- SCTransform(object = obj) +obj <- IntegrateLayers(object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + assay = "SCT", verbose = FALSE) +} + +} +\concept{utilities} diff --git a/man/Radius.Rd b/man/Radius.Rd index a0b1a74db..31597ca51 100644 --- a/man/Radius.Rd +++ b/man/Radius.Rd @@ -4,16 +4,24 @@ \alias{Radius.SlideSeq} \alias{Radius.STARmap} \alias{Radius.VisiumV1} +\alias{Radius.VisiumV2} \title{Get Spot Radius} \usage{ -\method{Radius}{SlideSeq}(object) +\method{Radius}{SlideSeq}(object, ...) -\method{Radius}{STARmap}(object) +\method{Radius}{STARmap}(object, ...) -\method{Radius}{VisiumV1}(object) +\method{Radius}{VisiumV1}(object, scale = "lowres", ...) + +\method{Radius}{VisiumV1}(object, scale = "lowres", ...) } \arguments{ \item{object}{An image object} + +\item{...}{Arguments passed to other methods} + +\item{scale}{A factor to scale the radius by; one of: "hires", +"lowres", or \code{NULL} for the unscaled value.} } \description{ Get Spot Radius diff --git a/man/Read10X_Coordinates.Rd b/man/Read10X_Coordinates.Rd new file mode 100644 index 000000000..813dd5d8a --- /dev/null +++ b/man/Read10X_Coordinates.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{Read10X_Coordinates} +\alias{Read10X_Coordinates} +\title{Load 10X Genomics Visium Tissue Positions} +\usage{ +Read10X_Coordinates(filename, filter.matrix) +} +\arguments{ +\item{filename}{Path to a \code{tissue_positions_list.csv} file} + +\item{filter.matrix}{Filter spot/feature matrix to only include spots that +have been determined to be over tissue} +} +\value{ +A data.frame +} +\description{ +Load 10X Genomics Visium Tissue Positions +} +\concept{preprocessing} diff --git a/man/Read10X_Image.Rd b/man/Read10X_Image.Rd index a42388e80..2b9a81d06 100644 --- a/man/Read10X_Image.Rd +++ b/man/Read10X_Image.Rd @@ -7,29 +7,32 @@ Read10X_Image( image.dir, image.name = "tissue_lowres_image.png", - filter.matrix = TRUE, - ... + assay = "Spatial", + slice = "slice1", + filter.matrix = TRUE ) } \arguments{ \item{image.dir}{Path to directory with 10X Genomics visium image data; -should include files \code{tissue_lowres_image.png},} - -\item{image.name}{The file name of the image. Defaults to tissue_lowres_image.png. +should include files \code{tissue_lowres_image.png}, \code{scalefactors_json.json} and \code{tissue_positions_list.csv}} -\item{filter.matrix}{Filter spot/feature matrix to only include spots that -have been determined to be over tissue.} +\item{image.name}{PNG file to read in} + +\item{assay}{Name of associated assay} -\item{...}{Ignored for now} +\item{slice}{Name for the image, used to populate the instance's key} + +\item{filter.matrix}{Filter spot/feature matrix to only include spots that +have been determined to be over tissue} } \value{ -A \code{\link{VisiumV1}} object +A \code{\link{VisiumV2}} object } \description{ Load a 10X Genomics Visium Image } \seealso{ -\code{\link{VisiumV1}} \code{\link{Load10X_Spatial}} +\code{\link{VisiumV2}} \code{\link{Load10X_Spatial}} } \concept{preprocessing} diff --git a/man/Read10X_ScaleFactors.Rd b/man/Read10X_ScaleFactors.Rd new file mode 100644 index 000000000..bc6b86ea1 --- /dev/null +++ b/man/Read10X_ScaleFactors.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{Read10X_ScaleFactors} +\alias{Read10X_ScaleFactors} +\title{Load 10X Genomics Visium Scale Factors} +\usage{ +Read10X_ScaleFactors(filename) +} +\arguments{ +\item{filename}{Path to a \code{scalefactors_json.json} file} +} +\value{ +A scalefactors object +} +\description{ +Load 10X Genomics Visium Scale Factors +} +\concept{preprocessing} diff --git a/man/Read10X_probe_metadata.Rd b/man/Read10X_probe_metadata.Rd new file mode 100644 index 000000000..4d5702ed9 --- /dev/null +++ b/man/Read10X_probe_metadata.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocessing.R +\name{Read10X_probe_metadata} +\alias{Read10X_probe_metadata} +\title{Read10x Probe Metadata} +\usage{ +Read10X_probe_metadata(data.dir, filename = "raw_probe_bc_matrix.h5") +} +\arguments{ +\item{data.dir}{The directory where the file is located.} + +\item{filename}{The name of the file containing the raw probe barcode matrix in HDF5 format. The default filename is 'raw_probe_bc_matrix.h5'.} +} +\value{ +Returns a data.frame containing the probe metadata. +} +\description{ +This function reads the probe metadata from a 10x Genomics probe barcode matrix file in HDF5 format. +} +\concept{preprocessing} diff --git a/man/ReadAkoya.Rd b/man/ReadAkoya.Rd index adf300e57..9c0041368 100644 --- a/man/ReadAkoya.Rd +++ b/man/ReadAkoya.Rd @@ -40,11 +40,7 @@ quantification level to read in} \item{assay}{Name to store expression matrix as} -\item{...}{ - Arguments passed on to \code{\link[=ReadAkoya]{ReadAkoya}} - \describe{ - \item{\code{}}{} - }} +\item{...}{Ignored} } \value{ \code{ReadAkoya}: A list with some combination of the following values diff --git a/man/ReadXenium.Rd b/man/ReadXenium.Rd index df86aa864..dbe85e398 100644 --- a/man/ReadXenium.Rd +++ b/man/ReadXenium.Rd @@ -5,13 +5,23 @@ \alias{ReadXenium} \title{Read and Load 10x Genomics Xenium in-situ data} \usage{ -LoadXenium(data.dir, fov = "fov", assay = "Xenium") +LoadXenium( + data.dir, + fov = "fov", + assay = "Xenium", + mols.qv.threshold = 20, + cell.centroids = TRUE, + molecule.coordinates = TRUE, + segmentations = NULL, + flip.xy = FALSE +) ReadXenium( data.dir, - outs = c("matrix", "microns"), + outs = c("segmentation_method", "matrix", "microns"), type = "centroids", - mols.qv.threshold = 20 + mols.qv.threshold = 20, + flip.xy = F ) } \arguments{ @@ -22,10 +32,26 @@ default filenames} \item{assay}{Assay name} +\item{mols.qv.threshold}{Remove transcript molecules with +a QV less than this threshold. QV >= 20 is the standard threshold +used to construct the cell x gene count matrix.} + +\item{cell.centroids}{Whether or not to load cell centroids} + +\item{molecule.coordinates}{Whether or not to load molecule pixel coordinates} + +\item{segmentations}{One of "cell", "nucleus" or NULL (to load either cell +segmentations, nucleus segmentations or neither)} + +\item{flip.xy}{Whether or not to flip the x/y coordinates of the Xenium outputs +to match what is displayed in Xenium Explorer, or fit on your screen better.} + \item{outs}{Types of molecular outputs to read; choose one or more of: \itemize{ \item \dQuote{matrix}: the counts matrix \item \dQuote{microns}: molecule coordinates + \item \dQuote{segmentation_method}: cell segmentation method (for runs which + use multi-modal segmentation) }} \item{type}{Type of cell spatial coordinate matrices to read; choose one @@ -33,11 +59,8 @@ or more of: \itemize{ \item \dQuote{centroids}: cell centroids in pixel coordinate space \item \dQuote{segmentations}: cell segmentations in pixel coordinate space + \item \dQuote{nucleus_segmentations}: nucleus segmentations in pixel coordinate space }} - -\item{mols.qv.threshold}{Remove transcript molecules with -a QV less than this threshold. QV >= 20 is the standard threshold -used to construct the cell x gene count matrix.} } \value{ \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object diff --git a/man/RidgePlot.Rd b/man/RidgePlot.Rd index 7825a99a5..744e120e4 100644 --- a/man/RidgePlot.Rd +++ b/man/RidgePlot.Rd @@ -16,7 +16,8 @@ RidgePlot( same.y.lims = FALSE, log = FALSE, ncol = NULL, - slot = "data", + slot = deprecated(), + layer = "data", stack = FALSE, combine = TRUE, fill.by = "feature" @@ -49,6 +50,8 @@ expression of the attribute being potted, can also pass 'increasing' or 'decreas \item{slot}{Slot to pull expression data from (e.g. "counts" or "data")} +\item{layer}{Layer to pull expression data from (e.g. "counts" or "data")} + \item{stack}{Horizontally stack plots for each feature} \item{combine}{Combine plots into a single \code{\link[patchwork]{patchwork}ed} diff --git a/man/RunCCA.Rd b/man/RunCCA.Rd index 245366b04..aa2c6b14b 100644 --- a/man/RunCCA.Rd +++ b/man/RunCCA.Rd @@ -76,6 +76,7 @@ For details about stored CCA calculation parameters, see \code{PrintCCAParams}. } \examples{ +\dontrun{ data("pbmc_small") pbmc_small # As CCA requires two datasets, we will split our test object into two just for this example @@ -86,6 +87,7 @@ pbmc2[["group"]] <- "group2" pbmc_cca <- RunCCA(object1 = pbmc1, object2 = pbmc2) # Print results print(x = pbmc_cca[["cca"]]) +} } \seealso{ diff --git a/man/RunGraphLaplacian.Rd b/man/RunGraphLaplacian.Rd new file mode 100644 index 000000000..1e1994993 --- /dev/null +++ b/man/RunGraphLaplacian.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generics.R, R/integration.R +\name{RunGraphLaplacian} +\alias{RunGraphLaplacian} +\alias{RunGraphLaplacian.Seurat} +\alias{RunGraphLaplacian.default} +\title{Run Graph Laplacian Eigendecomposition} +\usage{ +RunGraphLaplacian(object, ...) + +\method{RunGraphLaplacian}{Seurat}( + object, + graph, + reduction.name = "lap", + reduction.key = "LAP_", + n = 50, + verbose = TRUE, + ... +) + +\method{RunGraphLaplacian}{default}(object, n = 50, reduction.key = "LAP_", verbose = TRUE, ...) +} +\arguments{ +\item{object}{A Seurat object} + +\item{...}{Arguments passed to eigs_sym} + +\item{graph}{The name of graph} + +\item{reduction.name}{dimensional reduction name, lap by default} + +\item{reduction.key}{dimensional reduction key, specifies the string before +the number for the dimension names. LAP by default} + +\item{n}{Total Number of Eigenvectors to compute and store (50 by default)} + +\item{verbose}{Print message and process} +} +\value{ +Returns Seurat object with the Graph laplacian eigenvector +calculation stored in the reductions slot +} +\description{ +Run a graph laplacian dimensionality reduction. It is used as a low +dimensional representation for a cell-cell graph. The input graph +should be symmetric +} +\concept{dimensional_reduction} diff --git a/man/RunMixscape.Rd b/man/RunMixscape.Rd index 1214043e2..953d95d0c 100644 --- a/man/RunMixscape.Rd +++ b/man/RunMixscape.Rd @@ -47,9 +47,10 @@ all are assigned NP.} \item{de.assay}{Assay to use when performing differential expression analysis. Usually RNA.} -\item{logfc.threshold}{Limit testing to genes which show, on average, at least -X-fold difference (log-scale) between the two groups of cells. Default is 0.25 -Increasing logfc.threshold speeds up the function, but can miss weaker signals.} +\item{logfc.threshold}{Limit testing to genes which show, on average, +at least X-fold difference (log-scale) between the two groups of cells. +Default is 0.25 Increasing logfc.threshold speeds up the function, but can miss +weaker signals.} \item{iter.num}{Number of normalmixEM iterations to run if convergence does not occur.} diff --git a/man/RunSLSI.Rd b/man/RunSLSI.Rd index 5b7a05ad6..65b79ad74 100644 --- a/man/RunSLSI.Rd +++ b/man/RunSLSI.Rd @@ -4,6 +4,7 @@ \alias{RunSLSI} \alias{RunSLSI.default} \alias{RunSLSI.Assay} +\alias{RunSLSI.StdAssay} \alias{RunSLSI.Seurat} \title{Run Supervised Latent Semantic Indexing} \usage{ @@ -32,6 +33,19 @@ RunSLSI(object, ...) ... ) +\method{RunSLSI}{StdAssay}( + object, + assay = NULL, + features = NULL, + n = 50, + reduction.key = "SLSI_", + graph = NULL, + layer = "data", + verbose = TRUE, + seed.use = 42, + ... +) + \method{RunSLSI}{Seurat}( object, assay = NULL, @@ -63,8 +77,10 @@ the number for the dimension names} \item{seed.use}{Set a random seed. Setting NULL will not set a seed.} -\item{features}{Features to compute SLSI on. If NULL, SLSI will be run -using the variable features for the Assay.} +\item{features}{Features to compute SLSI on. If features=NULL, SLSI will be run +using the variable features for the Assay5.} + +\item{layer}{Layer to run SLSI on} \item{reduction.name}{dimensional reduction name} } diff --git a/man/RunSPCA.Rd b/man/RunSPCA.Rd index fb166c30c..11c23986e 100644 --- a/man/RunSPCA.Rd +++ b/man/RunSPCA.Rd @@ -4,6 +4,7 @@ \alias{RunSPCA} \alias{RunSPCA.default} \alias{RunSPCA.Assay} +\alias{RunSPCA.Assay5} \alias{RunSPCA.Seurat} \title{Run Supervised Principal Component Analysis} \usage{ @@ -32,6 +33,19 @@ RunSPCA(object, ...) ... ) +\method{RunSPCA}{Assay5}( + object, + assay = NULL, + features = NULL, + npcs = 50, + reduction.key = "SPC_", + graph = NULL, + verbose = TRUE, + seed.use = 42, + layer = "scale.data", + ... +) + \method{RunSPCA}{Seurat}( object, assay = NULL, @@ -68,6 +82,8 @@ NULL will not set a seed.} \item{features}{Features to compute SPCA on. If features=NULL, SPCA will be run using the variable features for the Assay.} +\item{layer}{Layer to run SPCA on} + \item{reduction.name}{dimensional reduction name, spca by default} } \value{ diff --git a/man/RunTSNE.Rd b/man/RunTSNE.Rd index 82077c88d..edd94dbc1 100644 --- a/man/RunTSNE.Rd +++ b/man/RunTSNE.Rd @@ -68,15 +68,18 @@ RunTSNE(object, ...) \item{tsne.method}{Select the method to use to compute the tSNE. Available methods are: \itemize{ -\item{Rtsne: }{Use the Rtsne package Barnes-Hut implementation of tSNE (default)} -\item{FIt-SNE: }{Use the FFT-accelerated Interpolation-based t-SNE. Based on -Kluger Lab code found here: https://github.com/KlugerLab/FIt-SNE} + \item \dQuote{\code{Rtsne}}: Use the Rtsne package Barnes-Hut + implementation of tSNE (default) + \item \dQuote{\code{FIt-SNE}}: Use the FFT-accelerated Interpolation-based + t-SNE. Based on Kluger Lab code found here: + \url{https://github.com/KlugerLab/FIt-SNE} }} \item{dim.embed}{The dimensional space of the resulting tSNE embedding (default is 2). For example, set to 3 for a 3d tSNE} -\item{reduction.key}{dimensional reduction key, specifies the string before the number for the dimension names. tSNE_ by default} +\item{reduction.key}{dimensional reduction key, specifies the string before +the number for the dimension names. \dQuote{\code{tSNE_}} by default} \item{cells}{Which cells to analyze (default, all cells)} diff --git a/man/RunUMAP.Rd b/man/RunUMAP.Rd index 54b2319ae..fba7ec869 100644 --- a/man/RunUMAP.Rd +++ b/man/RunUMAP.Rd @@ -103,7 +103,7 @@ RunUMAP(object, ...) dens.var.shift = 0.1, verbose = TRUE, reduction.name = "umap", - reduction.key = "UMAP_", + reduction.key = NULL, ... ) } diff --git a/man/SCTAssay-class.Rd b/man/SCTAssay-class.Rd index b06638089..d116a62e9 100644 --- a/man/SCTAssay-class.Rd +++ b/man/SCTAssay-class.Rd @@ -78,9 +78,11 @@ the conversion will automagically fill the new slots with the data pbmc_small <- SCTransform(pbmc_small) } +\dontrun{ # SCTAssay objects are generated from SCTransform pbmc_small <- SCTransform(pbmc_small) pbmc_small[["SCT"]] +} \dontrun{ # Query and change SCT model names diff --git a/man/SCTransform.Rd b/man/SCTransform.Rd index 54d4c2e8a..4d3842666 100644 --- a/man/SCTransform.Rd +++ b/man/SCTransform.Rd @@ -1,10 +1,61 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/preprocessing.R +% Please edit documentation in R/generics.R, R/preprocessing.R, +% R/preprocessing5.R \name{SCTransform} \alias{SCTransform} -\title{Use regularized negative binomial regression to normalize UMI count data} +\alias{SCTransform.default} +\alias{SCTransform.Assay} +\alias{SCTransform.Seurat} +\alias{SCTransform.IterableMatrix} +\title{Perform sctransform-based normalization} \usage{ -SCTransform( +SCTransform(object, ...) + +\method{SCTransform}{default}( + object, + cell.attr, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = umi)/30), sqrt(x = ncol(x = umi)/30)), + vst.flavor = "v2", + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... +) + +\method{SCTransform}{Assay}( + object, + cell.attr, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object)/30), sqrt(x = ncol(x = object)/30)), + vst.flavor = "v2", + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... +) + +\method{SCTransform}{Seurat}( object, assay = "RNA", new.assay.name = "SCT", @@ -19,6 +70,29 @@ SCTransform( do.center = TRUE, clip.range = c(-sqrt(x = ncol(x = object[[assay]])/30), sqrt(x = ncol(x = object[[assay]])/30)), + vst.flavor = "v2", + conserve.memory = FALSE, + return.only.var.genes = TRUE, + seed.use = 1448145, + verbose = TRUE, + ... +) + +\method{SCTransform}{IterableMatrix}( + object, + cell.attr, + reference.SCT.model = NULL, + do.correct.umi = TRUE, + ncells = 5000, + residual.features = NULL, + variable.features.n = 3000, + variable.features.rv.th = 1.3, + vars.to.regress = NULL, + latent.data = NULL, + do.scale = FALSE, + do.center = TRUE, + clip.range = c(-sqrt(x = ncol(x = object)/30), sqrt(x = ncol(x = object)/30)), + vst.flavor = "v2", conserve.memory = FALSE, return.only.var.genes = TRUE, seed.use = 1448145, @@ -27,11 +101,11 @@ SCTransform( ) } \arguments{ -\item{object}{A seurat object} +\item{object}{UMI counts matrix} -\item{assay}{Name of assay to pull the count data from; default is 'RNA'} +\item{...}{Additional parameters passed to \code{sctransform::vst}} -\item{new.assay.name}{Name for the new assay containing the normalized data} +\item{cell.attr}{A metadata with cell attributes} \item{reference.SCT.model}{If not NULL, compute residuals for the object using the provided SCT model; supports only log_umi as the latent variable. @@ -54,7 +128,9 @@ ranking by residual variance; default is 3000. Only applied if residual.features use this residual variance cutoff; this is only used when \code{variable.features.n} is set to NULL; default is 1.3. Only applied if residual.features is not set.} -\item{vars.to.regress}{Variables to regress out in a second non-regularized linear +\item{vars.to.regress}{Variables to regress out in a second non-regularized linear} + +\item{latent.data}{Extra data to regress out, should be cells x latent data regression. For example, percent.mito. Default is NULL} \item{do.scale}{Whether to scale residuals to have unit variance; default is FALSE} @@ -64,6 +140,10 @@ regression. For example, percent.mito. Default is NULL} \item{clip.range}{Range to clip the residuals to; default is \code{c(-sqrt(n/30), sqrt(n/30))}, where n is the number of cells} +\item{vst.flavor}{When set to 'v2' sets method = glmGamPoi_offset, n_cells=2000, +and exclude_poisson = TRUE which causes the model to learn theta and intercept +only besides excluding poisson genes from learning and regularization} + \item{conserve.memory}{If set to TRUE the residual matrix for all genes is never created in full; useful for large data sets, but will take longer to run; this will also set return.only.var.genes to TRUE; default is FALSE} @@ -76,7 +156,9 @@ NULL will not set a seed.} \item{verbose}{Whether to print messages and progress bars} -\item{...}{Additional parameters passed to \code{sctransform::vst}} +\item{assay}{Name of assay to pull the count data from; default is 'RNA'} + +\item{new.assay.name}{Name for the new assay containing the normalized data; default is 'SCT'} } \value{ Returns a Seurat object with a new assay (named SCT by default) with @@ -86,17 +168,12 @@ slot of the new assay. } \description{ This function calls sctransform::vst. The sctransform package is available at -https://github.com/ChristophH/sctransform. +https://github.com/satijalab/sctransform. Use this function as an alternative to the NormalizeData, FindVariableFeatures, ScaleData workflow. Results are saved in a new assay (named SCT by default) with counts being (corrected) counts, data being log1p(counts), scale.data being pearson residuals; sctransform::vst intermediate results are saved in misc slot of new assay. -} -\examples{ -data("pbmc_small") -SCTransform(object = pbmc_small) - } \seealso{ \code{\link[sctransform]{correct_counts}} \code{\link[sctransform]{get_residuals}} diff --git a/man/STARmap-class.Rd b/man/STARmap-class.Rd index 7984f21c3..30ab87d92 100644 --- a/man/STARmap-class.Rd +++ b/man/STARmap-class.Rd @@ -16,7 +16,10 @@ The STARmap class priority for visualization when the assay is set as the active/default assay in a \code{Seurat} object} -\item{\code{key}}{Key for the image} +\item{\code{key}}{A one-length character vector with the object's key; keys must +be one or more alphanumeric characters followed by an underscore +\dQuote{\code{_}} (regex pattern +\dQuote{\code{^[a-zA-Z][a-zA-Z0-9]*_$}})} } } diff --git a/man/ScaleData.Rd b/man/ScaleData.Rd index 6deefcc8e..c77576375 100644 --- a/man/ScaleData.Rd +++ b/man/ScaleData.Rd @@ -3,6 +3,7 @@ \name{ScaleData} \alias{ScaleData} \alias{ScaleData.default} +\alias{ScaleData.IterableMatrix} \alias{ScaleData.Assay} \alias{ScaleData.Seurat} \title{Scale and center the data.} @@ -26,6 +27,15 @@ ScaleData(object, ...) ... ) +\method{ScaleData}{IterableMatrix}( + object, + features = NULL, + do.scale = TRUE, + do.center = TRUE, + scale.max = 10, + ... +) + \method{ScaleData}{Assay}( object, features = NULL, diff --git a/man/ScaleFactors.Rd b/man/ScaleFactors.Rd index bac4a5fee..e746f02e3 100644 --- a/man/ScaleFactors.Rd +++ b/man/ScaleFactors.Rd @@ -4,6 +4,7 @@ \alias{ScaleFactors} \alias{scalefactors} \alias{ScaleFactors.VisiumV1} +\alias{ScaleFactors.VisiumV2} \title{Get image scale factors} \usage{ ScaleFactors(object, ...) @@ -12,7 +13,7 @@ scalefactors(spot, fiducial, hires, lowres) \method{ScaleFactors}{VisiumV1}(object, ...) -\method{ScaleFactors}{VisiumV1}(object, ...) +\method{ScaleFactors}{VisiumV2}(object, ...) } \arguments{ \item{object}{An object to get scale factors from} diff --git a/man/SelectIntegrationFeatures5.Rd b/man/SelectIntegrationFeatures5.Rd new file mode 100644 index 000000000..f71b5cf07 --- /dev/null +++ b/man/SelectIntegrationFeatures5.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{SelectIntegrationFeatures5} +\alias{SelectIntegrationFeatures5} +\title{Select integration features} +\usage{ +SelectIntegrationFeatures5( + object, + nfeatures = 2000, + assay = NULL, + method = NULL, + layers = NULL, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{Seurat object} + +\item{nfeatures}{Number of features to return for integration} + +\item{assay}{Name of assay to use for integration feature selection} + +\item{method}{Which method to pull. For \code{HVFInfo} and +\code{VariableFeatures}, choose one from one of the +following: +\itemize{ + \item \dQuote{vst} + \item \dQuote{sctransform} or \dQuote{sct} + \item \dQuote{mean.var.plot}, \dQuote{dispersion}, \dQuote{mvp}, or + \dQuote{disp} +}} + +\item{layers}{Name of layers to use for integration feature selection} + +\item{verbose}{Print messages} + +\item{...}{Arguments passed on to \code{method}} +} +\description{ +Select integration features +} +\concept{integration} diff --git a/man/SelectSCTIntegrationFeatures.Rd b/man/SelectSCTIntegrationFeatures.Rd new file mode 100644 index 000000000..e12914bd5 --- /dev/null +++ b/man/SelectSCTIntegrationFeatures.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{SelectSCTIntegrationFeatures} +\alias{SelectSCTIntegrationFeatures} +\title{Select SCT integration features} +\usage{ +SelectSCTIntegrationFeatures( + object, + nfeatures = 3000, + assay = NULL, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{Seurat object} + +\item{nfeatures}{Number of features to return for integration} + +\item{assay}{Name of assay to use for integration feature selection} + +\item{verbose}{Print messages} + +\item{...}{Arguments passed on to \code{method}} +} +\description{ +Select SCT integration features +} +\concept{integration} diff --git a/man/Seurat-package.Rd b/man/Seurat-package.Rd index 351af75c9..616f0d59e 100644 --- a/man/Seurat-package.Rd +++ b/man/Seurat-package.Rd @@ -43,20 +43,27 @@ Useful links: } \author{ -\strong{Maintainer}: Paul Hoffman \email{seurat@nygenome.org} (\href{https://orcid.org/0000-0002-7693-8957}{ORCID}) +\strong{Maintainer}: Rahul Satija \email{seurat@nygenome.org} (\href{https://orcid.org/0000-0001-9448-8833}{ORCID}) Other contributors: \itemize{ \item Andrew Butler \email{abutler@nygenome.org} (\href{https://orcid.org/0000-0003-3608-0463}{ORCID}) [contributor] \item Saket Choudhary \email{schoudhary@nygenome.org} (\href{https://orcid.org/0000-0001-5202-7633}{ORCID}) [contributor] + \item David Collins \email{dcollins@nygenome.org} (\href{https://orcid.org/0000-0001-9243-7821}{ORCID}) [contributor] \item Charlotte Darby \email{cdarby@nygenome.org} (\href{https://orcid.org/0000-0003-2195-5300}{ORCID}) [contributor] \item Jeff Farrell \email{jfarrell@g.harvard.edu} [contributor] + \item Isabella Grabski \email{igrabski@nygenome.org} (\href{https://orcid.org/0000-0002-0616-5469}{ORCID}) [contributor] \item Christoph Hafemeister \email{chafemeister@nygenome.org} (\href{https://orcid.org/0000-0001-6365-8254}{ORCID}) [contributor] \item Yuhan Hao \email{yhao@nygenome.org} (\href{https://orcid.org/0000-0002-1810-0822}{ORCID}) [contributor] + \item Austin Hartman \email{ahartman@nygenome.org} (\href{https://orcid.org/0000-0001-7278-1852}{ORCID}) [contributor] + \item Paul Hoffman \email{hoff0792@umn.edu} (\href{https://orcid.org/0000-0002-7693-8957}{ORCID}) [contributor] \item Jaison Jain \email{jjain@nygenome.org} (\href{https://orcid.org/0000-0002-9478-5018}{ORCID}) [contributor] + \item Longda Jiang \email{ljiang@nygenome.org} (\href{https://orcid.org/0000-0003-4964-6497}{ORCID}) [contributor] + \item Madeline Kowalski \email{mkowalski@nygenome.org} (\href{https://orcid.org/0000-0002-5655-7620}{ORCID}) [contributor] + \item Skylar Li \email{sli@nygenome.org} [contributor] + \item Gesmira Molla \email{gmolla@nygenome.org} (\href{https://orcid.org/0000-0002-8628-5056}{ORCID}) [contributor] \item Efthymia Papalexi \email{epapalexi@nygenome.org} (\href{https://orcid.org/0000-0001-5898-694X}{ORCID}) [contributor] \item Patrick Roelli \email{proelli@nygenome.org} [contributor] - \item Rahul Satija \email{rsatija@nygenome.org} (\href{https://orcid.org/0000-0001-9448-8833}{ORCID}) [contributor] \item Karthik Shekhar \email{kshekhar@berkeley.edu} [contributor] \item Avi Srivastava \email{asrivastava@nygenome.org} (\href{https://orcid.org/0000-0001-9798-2079}{ORCID}) [contributor] \item Tim Stuart \email{tstuart@nygenome.org} (\href{https://orcid.org/0000-0002-3044-0897}{ORCID}) [contributor] diff --git a/man/SingleCorPlot.Rd b/man/SingleCorPlot.Rd index b250c32c6..7b45f430f 100644 --- a/man/SingleCorPlot.Rd +++ b/man/SingleCorPlot.Rd @@ -31,6 +31,9 @@ SingleCorPlot( \item{smooth}{Make a smoothed scatter plot} +\item{rows.highlight}{A vector of rows to highlight (like cells.highlight in +\code{\link{SingleDimPlot}})} + \item{legend.title}{Optional legend title} \item{raster}{Convert points to raster format, default is \code{NULL} @@ -43,9 +46,6 @@ Default is c(512, 512)} \item{plot.cor}{...} \item{jitter}{Jitter for easier visualization of crowded points} - -\item{rows.highight}{A vector of rows to highlight (like cells.highlight in -\code{\link{SingleDimPlot}})} } \value{ A ggplot2 object diff --git a/man/SingleDimPlot.Rd b/man/SingleDimPlot.Rd index 6fb2f6956..413bd721a 100644 --- a/man/SingleDimPlot.Rd +++ b/man/SingleDimPlot.Rd @@ -11,6 +11,7 @@ SingleDimPlot( cols = NULL, pt.size = NULL, shape.by = NULL, + alpha = 1, alpha.by = NULL, order = NULL, label = FALSE, @@ -42,6 +43,8 @@ default, ggplot2 assigns colors} any cell attribute (that can be pulled with \code{\link{FetchData}}) allowing for both different colors and different shapes on cells.} +\item{alpha}{Alpha value for plotting (default is 1)} + \item{alpha.by}{Mapping variable for the point alpha value} \item{order}{Specify the order of plotting for the idents. This can be diff --git a/man/SingleExIPlot.Rd b/man/SingleExIPlot.Rd index 636503c5d..da5398fbd 100644 --- a/man/SingleExIPlot.Rd +++ b/man/SingleExIPlot.Rd @@ -13,6 +13,7 @@ SingleExIPlot( y.max = NULL, adjust = 1, pt.size = 0, + alpha = 1, cols = NULL, seed.use = 42, log = FALSE, @@ -38,6 +39,8 @@ expression of the attribute being potted} \item{pt.size}{Size of points for violin plots} +\item{alpha}{Alpha vlaue for violin plots} + \item{cols}{Colors to use for plotting} \item{seed.use}{Random seed to use. If NULL, don't set a seed} diff --git a/man/SingleSpatialPlot.Rd b/man/SingleSpatialPlot.Rd index de8d25c46..f199eac83 100644 --- a/man/SingleSpatialPlot.Rd +++ b/man/SingleSpatialPlot.Rd @@ -9,10 +9,12 @@ SingleSpatialPlot( image, cols = NULL, image.alpha = 1, + image.scale = "lowres", pt.alpha = NULL, crop = TRUE, pt.size.factor = NULL, - stroke = 0.25, + shape = 21, + stroke = NA, col.by = NULL, alpha.by = NULL, cells.highlight = NULL, @@ -35,6 +37,9 @@ colors} \item{image.alpha}{Adjust the opacity of the background images. Set to 0 to remove.} +\item{image.scale}{Choose the scale factor ("lowres"/"hires") to apply in +order to matchthe plot with the specified `image` - defaults to "lowres"} + \item{pt.alpha}{Adjust the opacity of the points if plotting a \code{SpatialDimPlot}} @@ -43,6 +48,9 @@ to show entire background image.} \item{pt.size.factor}{Sets the size of the points relative to spot.radius} +\item{shape}{Control the shape of the spots - same as the ggplot2 parameter. +The default is 21, which plots cirlces - use 22 to plot squares.} + \item{stroke}{Control the width of the border around the spots} \item{col.by}{Mapping variable for the point color} diff --git a/man/SketchData.Rd b/man/SketchData.Rd new file mode 100644 index 000000000..97ffc2d78 --- /dev/null +++ b/man/SketchData.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sketching.R +\name{SketchData} +\alias{SketchData} +\title{Sketch Data} +\usage{ +SketchData( + object, + assay = NULL, + ncells = 5000L, + sketched.assay = "sketch", + method = c("LeverageScore", "Uniform"), + var.name = "leverage.score", + over.write = FALSE, + seed = 123L, + cast = "dgCMatrix", + verbose = TRUE, + features = NULL, + ... +) +} +\arguments{ +\item{object}{A Seurat object.} + +\item{assay}{Assay name. Default is NULL, in which case the default assay of the object is used.} + +\item{ncells}{A positive integer or a named vector/list specifying the +number of cells to sample per layer. If a single integer is provided, the +same number of cells will be sampled from each layer. Default is 5000.} + +\item{sketched.assay}{Sketched assay name. A sketch assay is created or overwrite with the sketch data. Default is 'sketch'.} + +\item{method}{Sketching method to use. Can be 'LeverageScore' or 'Uniform'. +Default is 'LeverageScore'.} + +\item{var.name}{A metadata column name to store the leverage scores. Default is 'leverage.score'.} + +\item{over.write}{whether to overwrite existing column in the metadata. Default is FALSE.} + +\item{seed}{A positive integer for the seed of the random number generator. Default is 123.} + +\item{cast}{The type to cast the resulting assay to. Default is 'dgCMatrix'.} + +\item{verbose}{Print progress and diagnostic messages} + +\item{...}{Arguments passed to other methods} +} +\value{ +A Seurat object with the sketched data added as a new assay. +} +\description{ +This function uses sketching methods to downsample high-dimensional single-cell RNA expression data, +which can help with scalability for large datasets. +} +\concept{sketching} diff --git a/man/SlideSeq-class.Rd b/man/SlideSeq-class.Rd index a81a02e3c..60cdb125f 100644 --- a/man/SlideSeq-class.Rd +++ b/man/SlideSeq-class.Rd @@ -22,7 +22,10 @@ The SlideSeq class represents spatial information from the Slide-seq platform priority for visualization when the assay is set as the active/default assay in a \code{Seurat} object} -\item{\code{key}}{Key for the image} +\item{\code{key}}{A one-length character vector with the object's key; keys must +be one or more alphanumeric characters followed by an underscore +\dQuote{\code{_}} (regex pattern +\dQuote{\code{^[a-zA-Z][a-zA-Z0-9]*_$}})} } } diff --git a/man/SpatialPlot.Rd b/man/SpatialPlot.Rd index 133501e2a..e58d5e56b 100644 --- a/man/SpatialPlot.Rd +++ b/man/SpatialPlot.Rd @@ -13,8 +13,10 @@ SpatialPlot( images = NULL, cols = NULL, image.alpha = 1, + image.scale = "lowres", crop = TRUE, slot = "data", + keep.scale = "feature", min.cutoff = NA, max.cutoff = NA, cells.highlight = NULL, @@ -29,7 +31,8 @@ SpatialPlot( combine = TRUE, pt.size.factor = 1.6, alpha = c(1, 1), - stroke = 0.25, + shape = 21, + stroke = NA, interactive = FALSE, do.identify = FALSE, identify.ident = NULL, @@ -55,7 +58,9 @@ SpatialDimPlot( pt.size.factor = 1.6, alpha = c(1, 1), image.alpha = 1, - stroke = 0.25, + image.scale = "lowres", + shape = 21, + stroke = NA, label.box = TRUE, interactive = FALSE, information = NULL @@ -67,6 +72,7 @@ SpatialFeaturePlot( images = NULL, crop = TRUE, slot = "data", + keep.scale = "feature", min.cutoff = NA, max.cutoff = NA, ncol = NULL, @@ -74,7 +80,9 @@ SpatialFeaturePlot( pt.size.factor = 1.6, alpha = c(1, 1), image.alpha = 1, - stroke = 0.25, + image.scale = "lowres", + shape = 21, + stroke = NA, interactive = FALSE, information = NULL ) @@ -97,12 +105,29 @@ default, ggplot2 assigns colors} \item{image.alpha}{Adjust the opacity of the background images. Set to 0 to remove.} +\item{image.scale}{Choose the scale factor ("lowres"/"hires") to apply in +order to matchthe plot with the specified `image` - defaults to "lowres"} + \item{crop}{Crop the plot in to focus on points plotted. Set to \code{FALSE} to show entire background image.} \item{slot}{If plotting a feature, which data slot to pull from (counts, data, or scale.data)} +\item{keep.scale}{How to handle the color scale across multiple plots. Options are: +\itemize{ + \item \dQuote{feature} (default; by row/feature scaling): The plots for + each individual feature are scaled to the maximum expression of the + feature across the conditions provided to \code{split.by} + \item \dQuote{all} (universal scaling): The plots for all features and + conditions are scaled to the maximum expression value for the feature + with the highest overall expression + \item \code{NULL} (no scaling): Each individual plot is scaled to the + maximum expression value of the feature in the condition provided to + \code{split.by}; be aware setting \code{NULL} will result in color + scales that are not comparable between plots +}} + \item{min.cutoff, max.cutoff}{Vector of minimum and maximum cutoff values for each feature, may specify quantile in the form of 'q##' where '##' is the quantile (eg, 'q1', 'q10')} @@ -141,6 +166,9 @@ themeing will not work when plotting multiple features/groupings} min and max for SpatialFeaturePlot. For SpatialDimPlot, provide a single alpha value for each plot.} +\item{shape}{Control the shape of the spots - same as the ggplot2 parameter. +The default is 21, which plots circles - use 22 to plot squares.} + \item{stroke}{Control the width of the border around the spots} \item{interactive}{Launch an interactive SpatialDimPlot or SpatialFeaturePlot diff --git a/man/TransferData.Rd b/man/TransferData.Rd index 6bbe17fcb..e30977df7 100644 --- a/man/TransferData.Rd +++ b/man/TransferData.Rd @@ -9,6 +9,7 @@ TransferData( refdata, reference = NULL, query = NULL, + query.assay = NULL, weight.reduction = "pcaproject", l2.norm = FALSE, dims = NULL, @@ -19,6 +20,7 @@ TransferData( verbose = TRUE, slot = "data", prediction.assay = FALSE, + only.weights = FALSE, store.weights = TRUE ) } @@ -42,6 +44,8 @@ TransferData( \item{query}{Query object into which the data will be transferred.} +\item{query.assay}{Name of the Assay to use from query} + \item{weight.reduction}{Dimensional reduction to use for the weighting anchors. Options are: \itemize{ @@ -78,6 +82,8 @@ or "counts"} \item{prediction.assay}{Return an \code{Assay} object with the prediction scores for each class stored in the \code{data} slot.} +\item{only.weights}{Only return weights matrix} + \item{store.weights}{Optionally store the weights matrix used for predictions in the returned query object.} } diff --git a/man/TransferSketchLabels.Rd b/man/TransferSketchLabels.Rd new file mode 100644 index 000000000..744866afb --- /dev/null +++ b/man/TransferSketchLabels.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sketching.R +\name{TransferSketchLabels} +\alias{TransferSketchLabels} +\title{Transfer data from sketch data to full data} +\usage{ +TransferSketchLabels( + object, + sketched.assay = "sketch", + reduction, + dims, + refdata = NULL, + k = 50, + reduction.model = NULL, + neighbors = NULL, + recompute.neighbors = FALSE, + recompute.weights = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{object}{A Seurat object.} + +\item{sketched.assay}{Sketched assay name. Default is 'sketch'.} + +\item{reduction}{Dimensional reduction name to use for label transfer.} + +\item{dims}{An integer vector indicating which dimensions to use for label transfer.} + +\item{refdata}{A list of character strings indicating the metadata columns containing labels to transfer. Default is NULL. +Similar to refdata in `MapQuery`} + +\item{k}{Number of neighbors to use for label transfer. Default is 50.} + +\item{reduction.model}{Dimensional reduction model to use for label transfer. Default is NULL.} + +\item{neighbors}{An object storing the neighbors found during the sketching process. Default is NULL.} + +\item{recompute.neighbors}{Whether to recompute the neighbors for label transfer. Default is FALSE.} + +\item{recompute.weights}{Whether to recompute the weights for label transfer. Default is FALSE.} + +\item{verbose}{Print progress and diagnostic messages} +} +\value{ +A Seurat object with transferred labels stored in the metadata. If a UMAP model is provided, +the full data are also projected onto the UMAP space, with the results stored in a new reduction, full.`reduction.model` +} +\description{ +This function transfers cell type labels from a sketched dataset to a full dataset +based on the similarities in the lower dimensional space. +} +\concept{sketching} diff --git a/man/UnSketchEmbeddings.Rd b/man/UnSketchEmbeddings.Rd new file mode 100644 index 000000000..2634b7595 --- /dev/null +++ b/man/UnSketchEmbeddings.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration.R +\name{UnSketchEmbeddings} +\alias{UnSketchEmbeddings} +\title{Transfer embeddings from sketched cells to the full data} +\usage{ +UnSketchEmbeddings( + atom.data, + atom.cells = NULL, + orig.data, + embeddings, + sketch.matrix = NULL +) +} +\arguments{ +\item{atom.data}{Atom data} + +\item{atom.cells}{Atom cells} + +\item{orig.data}{Original data} + +\item{embeddings}{Embeddings of atom cells} + +\item{sketch.matrix}{Sketch matrix} +} +\description{ +Transfer embeddings from sketched cells to the full data +} +\concept{integration} diff --git a/man/VST.Rd b/man/VST.Rd new file mode 100644 index 000000000..f59ce6b10 --- /dev/null +++ b/man/VST.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generics.R, R/preprocessing5.R +\name{VST} +\alias{VST} +\alias{VST.default} +\alias{VST.IterableMatrix} +\alias{VST.dgCMatrix} +\alias{VST.matrix} +\title{Variance Stabilizing Transformation} +\usage{ +VST(data, margin = 1L, nselect = 2000L, span = 0.3, clip = NULL, ...) + +\method{VST}{default}(data, margin = 1L, nselect = 2000L, span = 0.3, clip = NULL, ...) + +\method{VST}{IterableMatrix}( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + verbose = TRUE, + ... +) + +\method{VST}{dgCMatrix}( + data, + margin = 1L, + nselect = 2000L, + span = 0.3, + clip = NULL, + verbose = TRUE, + ... +) + +\method{VST}{matrix}(data, margin = 1L, nselect = 2000L, span = 0.3, clip = NULL, ...) +} +\arguments{ +\item{data}{A matrix-like object} + +\item{margin}{Unused} + +\item{nselect}{Number of of features to select} + +\item{span}{the parameter \eqn{\alpha} which controls the degree of + smoothing.} + +\item{clip}{Upper bound for values post-standardization; defaults to the +square root of the number of cells} + +\item{...}{Arguments passed to other methods} + +\item{verbose}{...} +} +\value{ +A data frame with the following columns: +\itemize{ + \item \dQuote{\code{mean}}: ... + \item \dQuote{\code{variance}}: ... + \item \dQuote{\code{variance.expected}}: ... + \item \dQuote{\code{variance.standardized}}: ... + \item \dQuote{\code{variable}}: \code{TRUE} if the feature selected as + variable, otherwise \code{FALSE} + \item \dQuote{\code{rank}}: If the feature is selected as variable, then how + it compares to other variable features with lower ranks as more variable; + otherwise, \code{NA} +} +} +\description{ +Apply variance stabilizing transformation for selection of variable features +} +\keyword{internal} diff --git a/man/VariableFeaturePlot.Rd b/man/VariableFeaturePlot.Rd index 9320b3f3d..b622025bc 100644 --- a/man/VariableFeaturePlot.Rd +++ b/man/VariableFeaturePlot.Rd @@ -26,20 +26,7 @@ VariableFeaturePlot( \item{log}{Plot the x-axis in log scale} -\item{selection.method}{Which method to pull. For \code{HVFInfo} and -\code{VariableFeatures}, choose one from one of the -following: -\itemize{ - \item \dQuote{vst} - \item \dQuote{sctransform} or \dQuote{sct} - \item \dQuote{mean.var.plot}, \dQuote{dispersion}, \dQuote{mvp}, or - \dQuote{disp} -} -For \code{SVFInfo} and \code{SpatiallyVariableFeatures}, choose from: -\itemize{ - \item \dQuote{markvariogram} - \item \dQuote{moransi} -}} +\item{selection.method}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}} \item{assay}{Assay to pull variable features from} diff --git a/man/VisiumV2-class.Rd b/man/VisiumV2-class.Rd new file mode 100644 index 000000000..b2feaa6ea --- /dev/null +++ b/man/VisiumV2-class.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/objects.R +\docType{class} +\name{VisiumV2-class} +\alias{VisiumV2-class} +\alias{VisiumV2} +\title{The VisiumV2 class} +\description{ +The VisiumV2 class represents spatial information from the 10X Genomics +Visium HD platform - it can also accomodate data from the standard +Visium platform +} +\section{Slots}{ + +\describe{ +\item{\code{image}}{A three-dimensional array with PNG image data, see +\code{\link[png]{readPNG}} for more details} + +\item{\code{scale.factors}}{An object of class \code{\link{scalefactors}}; see +\code{\link{scalefactors}} for more information} +}} + +\concept{objects} +\concept{spatial} diff --git a/man/VlnPlot.Rd b/man/VlnPlot.Rd index 951fc2afc..00be2eb10 100644 --- a/man/VlnPlot.Rd +++ b/man/VlnPlot.Rd @@ -9,6 +9,7 @@ VlnPlot( features, cols = NULL, pt.size = NULL, + alpha = 1, idents = NULL, sort = FALSE, assay = NULL, @@ -19,7 +20,8 @@ VlnPlot( same.y.lims = FALSE, log = FALSE, ncol = NULL, - slot = "data", + slot = deprecated(), + layer = NULL, split.plot = FALSE, stack = FALSE, combine = TRUE, @@ -37,7 +39,9 @@ anything that can be retreived by FetchData)} \item{cols}{Colors to use for plotting} -\item{pt.size}{Point size for geom_violin} +\item{pt.size}{Point size for points} + +\item{alpha}{Alpha value for points} \item{idents}{Which classes to include in the plot (default is all)} @@ -48,7 +52,8 @@ expression of the attribute being potted, can also pass 'increasing' or 'decreas \item{group.by}{Group (color) cells in different ways (for example, orig.ident)} -\item{split.by}{A variable to split the violin plots by,} +\item{split.by}{A factor in object metadata to split the plot by, pass 'ident' +to split by cell identity'} \item{adjust}{Adjust parameter for geom_violin} @@ -62,6 +67,8 @@ expression of the attribute being potted, can also pass 'increasing' or 'decreas \item{slot}{Slot to pull expression data from (e.g. "counts" or "data")} +\item{layer}{Layer to pull expression data from (e.g. "counts" or "data")} + \item{split.plot}{plot each group of the split violin plots by multiple or single violin shapes.} diff --git a/man/reexports.Rd b/man/reexports.Rd index e94d58954..c85e749d4 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -1,10 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/reexports.R \docType{import} -\name{reexports} -\alias{reexports} +\name{components} +\alias{components} \alias{\%||\%} \alias{\%iff\%} +\alias{reexports} \alias{AddMetaData} \alias{as.Graph} \alias{as.Neighbor} @@ -64,12 +65,19 @@ \alias{VariableFeatures<-} \alias{WhichCells} \title{Objects exported from other packages} +\usage{ +components(object, ...) + +x \%||\% y + +x \%iff\% y +} \keyword{internal} \description{ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ - \item{SeuratObject}{\code{\link[SeuratObject:set-if-null]{\%||\%}}, \code{\link[SeuratObject:set-if-null]{\%iff\%}}, \code{\link[SeuratObject]{AddMetaData}}, \code{\link[SeuratObject]{as.Graph}}, \code{\link[SeuratObject]{as.Neighbor}}, \code{\link[SeuratObject]{as.Seurat}}, \code{\link[SeuratObject]{as.sparse}}, \code{\link[SeuratObject:ObjectAccess]{Assays}}, \code{\link[SeuratObject]{Cells}}, \code{\link[SeuratObject]{CellsByIdentities}}, \code{\link[SeuratObject]{Command}}, \code{\link[SeuratObject]{CreateAssayObject}}, \code{\link[SeuratObject]{CreateDimReducObject}}, \code{\link[SeuratObject]{CreateSeuratObject}}, \code{\link[SeuratObject]{DefaultAssay}}, \code{\link[SeuratObject:DefaultAssay]{DefaultAssay<-}}, \code{\link[SeuratObject]{Distances}}, \code{\link[SeuratObject]{Embeddings}}, \code{\link[SeuratObject]{FetchData}}, \code{\link[SeuratObject:AssayData]{GetAssayData}}, \code{\link[SeuratObject]{GetImage}}, \code{\link[SeuratObject]{GetTissueCoordinates}}, \code{\link[SeuratObject:VariableFeatures]{HVFInfo}}, \code{\link[SeuratObject]{Idents}}, \code{\link[SeuratObject:Idents]{Idents<-}}, \code{\link[SeuratObject]{Images}}, \code{\link[SeuratObject]{Index}}, \code{\link[SeuratObject:Index]{Index<-}}, \code{\link[SeuratObject]{Indices}}, \code{\link[SeuratObject]{IsGlobal}}, \code{\link[SeuratObject]{JS}}, \code{\link[SeuratObject:JS]{JS<-}}, \code{\link[SeuratObject]{Key}}, \code{\link[SeuratObject:Key]{Key<-}}, \code{\link[SeuratObject]{Loadings}}, \code{\link[SeuratObject:Loadings]{Loadings<-}}, \code{\link[SeuratObject]{LogSeuratCommand}}, \code{\link[SeuratObject]{Misc}}, \code{\link[SeuratObject:Misc]{Misc<-}}, \code{\link[SeuratObject:ObjectAccess]{Neighbors}}, \code{\link[SeuratObject]{Project}}, \code{\link[SeuratObject:Project]{Project<-}}, \code{\link[SeuratObject]{Radius}}, \code{\link[SeuratObject:ObjectAccess]{Reductions}}, \code{\link[SeuratObject]{RenameCells}}, \code{\link[SeuratObject:Idents]{RenameIdents}}, \code{\link[SeuratObject:Idents]{ReorderIdent}}, \code{\link[SeuratObject]{RowMergeSparseMatrices}}, \code{\link[SeuratObject:AssayData]{SetAssayData}}, \code{\link[SeuratObject:Idents]{SetIdent}}, \code{\link[SeuratObject:VariableFeatures]{SpatiallyVariableFeatures}}, \code{\link[SeuratObject:Idents]{StashIdent}}, \code{\link[SeuratObject]{Stdev}}, \code{\link[SeuratObject:VariableFeatures]{SVFInfo}}, \code{\link[SeuratObject]{Tool}}, \code{\link[SeuratObject:Tool]{Tool<-}}, \code{\link[SeuratObject]{UpdateSeuratObject}}, \code{\link[SeuratObject]{VariableFeatures}}, \code{\link[SeuratObject:VariableFeatures]{VariableFeatures<-}}, \code{\link[SeuratObject]{WhichCells}}} + \item{SeuratObject}{\code{\link[SeuratObject]{AddMetaData}}, \code{\link[SeuratObject]{as.Graph}}, \code{\link[SeuratObject]{as.Neighbor}}, \code{\link[SeuratObject]{as.Seurat}}, \code{\link[SeuratObject]{as.sparse}}, \code{\link[SeuratObject:ObjectAccess]{Assays}}, \code{\link[SeuratObject]{Cells}}, \code{\link[SeuratObject]{CellsByIdentities}}, \code{\link[SeuratObject]{Command}}, \code{\link[SeuratObject]{CreateAssayObject}}, \code{\link[SeuratObject]{CreateDimReducObject}}, \code{\link[SeuratObject]{CreateSeuratObject}}, \code{\link[SeuratObject]{DefaultAssay}}, \code{\link[SeuratObject:DefaultAssay]{DefaultAssay<-}}, \code{\link[SeuratObject]{Distances}}, \code{\link[SeuratObject]{Embeddings}}, \code{\link[SeuratObject]{FetchData}}, \code{\link[SeuratObject:AssayData]{GetAssayData}}, \code{\link[SeuratObject]{GetImage}}, \code{\link[SeuratObject]{GetTissueCoordinates}}, \code{\link[SeuratObject:VariableFeatures]{HVFInfo}}, \code{\link[SeuratObject]{Idents}}, \code{\link[SeuratObject:Idents]{Idents<-}}, \code{\link[SeuratObject]{Images}}, \code{\link[SeuratObject:NNIndex]{Index}}, \code{\link[SeuratObject:NNIndex]{Index<-}}, \code{\link[SeuratObject]{Indices}}, \code{\link[SeuratObject]{IsGlobal}}, \code{\link[SeuratObject]{JS}}, \code{\link[SeuratObject:JS]{JS<-}}, \code{\link[SeuratObject]{Key}}, \code{\link[SeuratObject:Key]{Key<-}}, \code{\link[SeuratObject]{Loadings}}, \code{\link[SeuratObject:Loadings]{Loadings<-}}, \code{\link[SeuratObject]{LogSeuratCommand}}, \code{\link[SeuratObject]{Misc}}, \code{\link[SeuratObject:Misc]{Misc<-}}, \code{\link[SeuratObject:ObjectAccess]{Neighbors}}, \code{\link[SeuratObject]{Project}}, \code{\link[SeuratObject:Project]{Project<-}}, \code{\link[SeuratObject]{Radius}}, \code{\link[SeuratObject:ObjectAccess]{Reductions}}, \code{\link[SeuratObject]{RenameCells}}, \code{\link[SeuratObject:Idents]{RenameIdents}}, \code{\link[SeuratObject:Idents]{ReorderIdent}}, \code{\link[SeuratObject]{RowMergeSparseMatrices}}, \code{\link[SeuratObject:AssayData]{SetAssayData}}, \code{\link[SeuratObject:Idents]{SetIdent}}, \code{\link[SeuratObject:VariableFeatures]{SpatiallyVariableFeatures}}, \code{\link[SeuratObject:Idents]{StashIdent}}, \code{\link[SeuratObject]{Stdev}}, \code{\link[SeuratObject:VariableFeatures]{SVFInfo}}, \code{\link[SeuratObject]{Tool}}, \code{\link[SeuratObject:Tool]{Tool<-}}, \code{\link[SeuratObject]{UpdateSeuratObject}}, \code{\link[SeuratObject]{VariableFeatures}}, \code{\link[SeuratObject:VariableFeatures]{VariableFeatures<-}}, \code{\link[SeuratObject]{WhichCells}}} }} diff --git a/man/roxygen/templates/param-dotsi.R b/man/roxygen/templates/param-dotsi.R new file mode 100644 index 000000000..4fd71cedb --- /dev/null +++ b/man/roxygen/templates/param-dotsi.R @@ -0,0 +1 @@ +#' @param ... Ignored diff --git a/man/roxygen/templates/param-dotsm.R b/man/roxygen/templates/param-dotsm.R new file mode 100644 index 000000000..17d5b6da6 --- /dev/null +++ b/man/roxygen/templates/param-dotsm.R @@ -0,0 +1 @@ +#' @param ... Arguments passed to other methods diff --git a/man/writing-integration.Rd b/man/writing-integration.Rd new file mode 100644 index 000000000..60f151cdf --- /dev/null +++ b/man/writing-integration.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/integration5.R +\name{writing-integration} +\alias{writing-integration} +\title{Writing Integration Method Functions} +\description{ +Integration method functions can be written by anyone to implement any +integration method in Seurat. These methods should expect to take a +\link[SeuratObject:Assay5]{v5 assay} as input and return a named list of +objects that can be added back to a \code{Seurat} object (eg. a +\link[SeuratObject:DimReduc]{dimensional reduction} or cell-level meta data) +} +\section{Provided Parameters}{ + +Every integration method function should expect the following arguments: +\itemize{ + \item \dQuote{\code{object}}: an \code{\link[SeuratObject]{Assay5}} object + \item \dQuote{\code{orig}}: \link[SeuratObject:DimReduc]{dimensional + reduction} to correct + \item \dQuote{\code{layers}}: names of normalized layers in \code{object} + \item \dQuote{\code{scale.layer}}: name(s) of scaled layer(s) in + \code{object} + \item \dQuote{\code{features}}: a vector of features for integration + \item \dQuote{\code{groups}}: a one-column data frame with the groups for + each cell in \code{object}; the column name will be \dQuote{group} +} +} + +\section{Method Discovery}{ + +The documentation for \code{\link{IntegrateLayers}()} will automatically +link to integration method functions provided by packages in the +\code{\link[base]{search}()} space. To make an integration method function +discoverable by the documentation, simply add an attribute named +\dQuote{\code{Seurat.method}} to the function with a value of +\dQuote{\code{integration}} +\preformatted{ +attr(MyIntegrationFunction, which = "Seurat.method") <- "integration" +} +} + +\seealso{ +\code{\link{IntegrateLayers}()} +} +\concept{integration} +\keyword{internal} diff --git a/src/Makevars b/src/Makevars index a7f35101d..e9d976546 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1 +1 @@ -CXX_STD = CXX11 +CXX_STD = CXX17 diff --git a/src/integration.cpp b/src/integration.cpp index 0b613b0be..0a2b59580 100644 --- a/src/integration.cpp +++ b/src/integration.cpp @@ -11,15 +11,15 @@ typedef Eigen::Triplet T; // [[Rcpp::export(rng = false)]] Eigen::SparseMatrix FindWeightsC( - NumericVector cells2, - Eigen::MatrixXd distances, - std::vector anchor_cells2, - std::vector integration_matrix_rownames, - Eigen::MatrixXd cell_index, - Eigen::VectorXd anchor_score, - double min_dist, - double sd, - bool display_progress + NumericVector cells2, + Eigen::MatrixXd distances, + std::vector anchor_cells2, + std::vector integration_matrix_rownames, + Eigen::MatrixXd cell_index, + Eigen::VectorXd anchor_score, + double min_dist, + double sd, + bool display_progress ) { std::vector tripletList; tripletList.reserve(anchor_cells2.size() * 10); @@ -87,14 +87,24 @@ Eigen::SparseMatrix FindWeightsC( // [[Rcpp::export(rng = false)]] Eigen::SparseMatrix IntegrateDataC( - Eigen::SparseMatrix integration_matrix, - Eigen::SparseMatrix weights, - Eigen::SparseMatrix expression_cells2 + Eigen::SparseMatrix integration_matrix, + Eigen::SparseMatrix weights, + Eigen::SparseMatrix expression_cells2 ) { Eigen::SparseMatrix corrected = expression_cells2 - weights.transpose() * integration_matrix; return(corrected); } +template +std::vector sort_indexes(const std::vector &v) { + // initialize original index locations + std::vector idx(v.size()); + std::iota(idx.begin(), idx.end(), 0); + std::stable_sort(idx.begin(), idx.end(), + [&v](size_t i1, size_t i2) {return v[i1] < v[i2];}); + return idx; +} + // [[Rcpp::export]] std::vector ScoreHelper( Eigen::SparseMatrix snn, diff --git a/tests/testdata/visium/spatial/scalefactors_json.json b/tests/testdata/visium/spatial/scalefactors_json.json index 3b7167131..519589717 100644 --- a/tests/testdata/visium/spatial/scalefactors_json.json +++ b/tests/testdata/visium/spatial/scalefactors_json.json @@ -1 +1 @@ -{"spot_diameter_fullres": 96.3607212981978, "tissue_hires_scalef": 0.17211704, "fiducial_diameter_fullres": 144.54108194729673, "tissue_lowres_scalef": 0.051635113} \ No newline at end of file +{"tissue_hires_scalef": 0.17211704, "tissue_lowres_scalef": 0.051635113, "fiducial_diameter_fullres": 144.5324756763478, "spot_diameter_fullres": 89.47248494250101} \ No newline at end of file diff --git a/tests/testdata/visium/spatial/tissue_hires_image.png b/tests/testdata/visium/spatial/tissue_hires_image.png new file mode 100644 index 000000000..fcf39b512 Binary files /dev/null and b/tests/testdata/visium/spatial/tissue_hires_image.png differ diff --git a/tests/testdata/visium/spatial/tissue_lowres_image.png b/tests/testdata/visium/spatial/tissue_lowres_image.png index e061991c8..9363e01de 100644 Binary files a/tests/testdata/visium/spatial/tissue_lowres_image.png and b/tests/testdata/visium/spatial/tissue_lowres_image.png differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_008um/filtered_feature_bc_matrix.h5 b/tests/testdata/visium_hd/binned_outputs/square_008um/filtered_feature_bc_matrix.h5 new file mode 100644 index 000000000..24110a1ec Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_008um/filtered_feature_bc_matrix.h5 differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/scalefactors_json.json b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/scalefactors_json.json new file mode 100755 index 000000000..fdee0beb1 --- /dev/null +++ b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/scalefactors_json.json @@ -0,0 +1,9 @@ +{ + "spot_diameter_fullres": 29.22254153479092, + "bin_size_um": 8.0, + "microns_per_pixel": 0.27376126715315274, + "regist_target_img_scalef": 0.25404352, + "tissue_lowres_scalef": 0.025404353, + "fiducial_diameter_fullres": 1205.4298383101254, + "tissue_hires_scalef": 0.25404352 +} \ No newline at end of file diff --git a/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_hires_image.png b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_hires_image.png new file mode 100755 index 000000000..3c659c3c6 Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_hires_image.png differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_lowres_image.png b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_lowres_image.png new file mode 100755 index 000000000..634176afe Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_lowres_image.png differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_positions.parquet b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_positions.parquet new file mode 100644 index 000000000..fc8760a5c Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_008um/spatial/tissue_positions.parquet differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_016um/filtered_feature_bc_matrix.h5 b/tests/testdata/visium_hd/binned_outputs/square_016um/filtered_feature_bc_matrix.h5 new file mode 100644 index 000000000..24110a1ec Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_016um/filtered_feature_bc_matrix.h5 differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/scalefactors_json.json b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/scalefactors_json.json new file mode 100755 index 000000000..3ce1f9af4 --- /dev/null +++ b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/scalefactors_json.json @@ -0,0 +1,9 @@ +{ + "spot_diameter_fullres": 58.44508306958184, + "bin_size_um": 16.0, + "microns_per_pixel": 0.27376126715315274, + "regist_target_img_scalef": 0.25404352, + "tissue_lowres_scalef": 0.025404353, + "fiducial_diameter_fullres": 1205.4298383101254, + "tissue_hires_scalef": 0.25404352 +} \ No newline at end of file diff --git a/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_hires_image.png b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_hires_image.png new file mode 100755 index 000000000..3c659c3c6 Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_hires_image.png differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_lowres_image.png b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_lowres_image.png new file mode 100755 index 000000000..634176afe Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_lowres_image.png differ diff --git a/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_positions.parquet b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_positions.parquet new file mode 100644 index 000000000..8d82f7dad Binary files /dev/null and b/tests/testdata/visium_hd/binned_outputs/square_016um/spatial/tissue_positions.parquet differ diff --git a/tests/testthat.R b/tests/testthat.R index 72a15ce4b..0bea94fda 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,14 @@ library(testthat) library(Seurat) +# +# # Run tests for 'v3' +# message('Run tests for v3 assay') +# options(Seurat.object.assay.version = 'v3') +# test_check("Seurat") +# Run tests for 'v5' +message('Run tests for v5 assay') +options(Seurat.object.assay.version = 'v5') test_check("Seurat") + + diff --git a/tests/testthat/test_differential_expression.R b/tests/testthat/test_differential_expression.R index 373a6b6d9..f0c685670 100644 --- a/tests/testthat/test_differential_expression.R +++ b/tests/testthat/test_differential_expression.R @@ -7,81 +7,82 @@ set.seed(seed = 42) context("FindMarkers") clr.obj <- suppressWarnings(NormalizeData(pbmc_small, normalization.method = "CLR")) -sct.obj <- suppressWarnings(suppressMessages(SCTransform(pbmc_small))) +sct.obj <- suppressWarnings(suppressMessages(SCTransform(pbmc_small, vst.flavor = "v1"))) -markers.0 <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, verbose = FALSE, base = exp(1))) -markers.01 <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1))) -results.clr <- suppressWarnings(FindMarkers(object = clr.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1))) -results.sct <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1))) +markers.0 <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, verbose = FALSE, base = exp(1),pseudocount.use = 1)) +markers.01 <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1),pseudocount.use = 1)) +results.clr <- suppressWarnings(FindMarkers(object = clr.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1)) +results.sct <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1)) -test_that("Default settings work as expected", { + +test_that("Default settings work as expected with pseudocount = 1", { expect_error(FindMarkers(object = pbmc_small)) expect_error(FindMarkers(object = pbmc_small, ident.1 = "test")) expect_error(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = "test")) expect_equal(colnames(x = markers.0), c("p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj")) - expect_equal(markers.0[1, "p_val"], 9.572778e-13) - expect_equal(markers.0[1, "avg_logFC"], -4.034691, tolerance = 1e-6) + expect_equal(markers.0[1, "p_val"], 9.572778e-13, tolerance = 1e-18) + expect_equal(markers.0[1, "avg_logFC"], -4.180029, tolerance = 1e-6) expect_equal(markers.0[1, "pct.1"], 0.083) expect_equal(markers.0[1, "pct.2"], 0.909) - expect_equal(markers.0[1, "p_val_adj"], 2.201739e-10) - expect_equal(nrow(x = markers.0), 204) + expect_equal(markers.0[1, "p_val_adj"], 2.201739e-10, tolerance = 1e-15) + expect_equal(nrow(x = markers.0), 228) expect_equal(rownames(markers.0)[1], "HLA-DPB1") - expect_equal(markers.01[1, "p_val"], 1.702818e-11) - expect_equal(markers.01[1, "avg_logFC"], -2.539289, tolerance = 1e-6) + expect_equal(markers.01[1, "p_val"], 1.702818e-11, tolerance = 1e-16) + expect_equal(markers.01[1, "avg_logFC"], -2.638242, tolerance = 1e-6) expect_equal(markers.01[1, "pct.1"], 0.111) expect_equal(markers.01[1, "pct.2"], 1.00) - expect_equal(markers.01[1, "p_val_adj"], 3.916481e-09) - expect_equal(nrow(x = markers.01), 201) + expect_equal(markers.01[1, "p_val_adj"], 3.916481e-09, tolerance = 1e-14) + expect_equal(nrow(x = markers.01), 222) expect_equal(rownames(x = markers.01)[1], "TYMP") # CLR normalization - expect_equal(results.clr[1, "p_val"], 1.209462e-11) - expect_equal(results.clr[1, "avg_logFC"], -0.8290693, tolerance = 1e-6) + expect_equal(results.clr[1, "p_val"], 1.209462e-11, tolerance = 1e-16) + expect_equal(results.clr[1, "avg_logFC"], -2.946633, tolerance = 1e-6) expect_equal(results.clr[1, "pct.1"], 0.111) expect_equal(results.clr[1, "pct.2"], 0.96) - expect_equal(results.clr[1, "p_val_adj"], 2.781762e-09) - expect_equal(nrow(x = results.clr), 85) + expect_equal(results.clr[1, "p_val_adj"], 2.781762e-09, tolerance = 1e-14) + expect_equal(nrow(x = results.clr), 213) expect_equal(rownames(x = results.clr)[1], "S100A8") # SCT normalization - expect_equal(results.sct[1, "p_val"], 6.225491e-11) - expect_equal(results.sct[1, "avg_logFC"], -1.081321, tolerance = 1e-6) + expect_equal(results.sct[1, "p_val"], 6.225491e-11, tolerance = 1e-16) + expect_equal(results.sct[1, "avg_logFC"], -2.545867, tolerance = 1e-6) expect_equal(results.sct[1, "pct.1"], 0.111) expect_equal(results.sct[1, "pct.2"], 0.96) - expect_equal(results.sct[1, "p_val_adj"], 1.369608e-08) - expect_equal(nrow(x = results.sct), 158) + expect_equal(results.sct[1, "p_val_adj"], 1.369608e-08, tolerance = 1e-13) + expect_equal(nrow(x = results.sct), 214) expect_equal(rownames(x = results.sct)[1], "TYMP") }) -tymp.results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, features = "TYMP", verbose = FALSE, base = exp(1))) -vargenes.results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, features = VariableFeatures(object = pbmc_small), verbose = FALSE, base = exp(1))) +tymp.results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, features = "TYMP", verbose = FALSE, base = exp(1),pseudocount.use = 1)) +vargenes.results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, features = VariableFeatures(object = pbmc_small), verbose = FALSE, base = exp(1),pseudocount.use = 1)) test_that("features parameter behaves correctly ", { expect_equal(nrow(x = tymp.results), 1) - expect_equal(tymp.results[1, "p_val"], 3.227445e-07) - expect_equal(tymp.results[1, "avg_logFC"], -2.093928, tolerance = 1e-6) + expect_equal(tymp.results[1, "p_val"], 3.227445e-07, tolerance = 1e-12) + expect_equal(tymp.results[1, "avg_logFC"], -2.188179, tolerance = 1e-6) expect_equal(tymp.results[1, "pct.1"], 0.111) expect_equal(tymp.results[1, "pct.2"], 0.682) - expect_equal(tymp.results[1, "p_val_adj"], 7.423123e-05) + expect_equal(tymp.results[1, "p_val_adj"], 7.423123e-05, tolerance = 1e-10) expect_equal(rownames(x = tymp.results)[1], "TYMP") - expect_equal(nrow(x = vargenes.results), 19) - expect_equal(vargenes.results[19, "p_val"], 4.225151e-01, tolerance = 1e-6) - expect_equal(vargenes.results[19, "avg_logFC"], 1.5976958, tolerance = 1e-6) - expect_equal(vargenes.results[19, "pct.1"], 0.139) - expect_equal(vargenes.results[19, "pct.2"], 0.091) - expect_equal(vargenes.results[19, "p_val_adj"], 1.000000e+00) - expect_equal(rownames(x = vargenes.results)[19], "PARVB") + expect_equal(nrow(x = vargenes.results), 20) + expect_equal(vargenes.results[20, "p_val"], 4.225151e-01, tolerance = 1e-6) + expect_equal(vargenes.results[20, "avg_logFC"], 1.796863, tolerance = 1e-6) + expect_equal(vargenes.results[20, "pct.1"], 0.139) + expect_equal(vargenes.results[20, "pct.2"], 0.091) + expect_equal(vargenes.results[20, "p_val_adj"], 1.000000e+00) + expect_equal(rownames(x = vargenes.results)[20], "PARVB") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = Cells(x = pbmc_small)[1:40], ident.2 = Cells(x = pbmc_small)[41:80], verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = Cells(x = pbmc_small)[1:40], ident.2 = Cells(x = pbmc_small)[41:80], verbose = FALSE, base = exp(1),pseudocount.use = 1)) test_that("passing cell names works", { - expect_equal(nrow(x = results), 190) + expect_equal(nrow(x = results), 216) expect_equal(results[1, "p_val"], 0.0001690882) - expect_equal(results[1, "avg_logFC"], -1.790824, tolerance = 1e-6) + expect_equal(results[1, "avg_logFC"], -1.967123, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.075) expect_equal(results[1, "pct.2"], 0.450) expect_equal(results[1, "p_val_adj"], 0.03889028) @@ -90,143 +91,143 @@ test_that("passing cell names works", { results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 0.1)) results.clr <- suppressWarnings(FindMarkers(object = clr.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 0.1)) -results.sct <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 0.1)) +results.sct <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 0.1, vst.flavor = "v1")) test_that("setting pseudocount.use works", { - expect_equal(nrow(x = results), 202) - expect_equal(results[1, "avg_logFC"], -2.630395, tolerance = 1e-6) - expect_equal(nrow(x = results.clr), 182) - expect_equal(results.clr[1, "avg_logFC"], -2.317338, tolerance = 1e-6) - expect_equal(nrow(results.sct), 194) - expect_equal(results.sct[1, "avg_logFC"], -2.253920, tolerance = 1e-6) + expect_equal(nrow(x = results), 222) + expect_equal(results[1, "avg_logFC"], -2.640848, tolerance = 1e-6) + expect_equal(nrow(x = results.clr), 214) + expect_equal(results.clr[1, "avg_logFC"], -3.322368, tolerance = 1e-6) + expect_equal(nrow(results.sct), 215) + expect_equal(results.sct[1, "avg_logFC"], -2.668866, tolerance = 1e-6) }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), mean.fxn = rowMeans)) -results.clr <- suppressWarnings(FindMarkers(object = clr.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), mean.fxn = rowMeans)) -results.sct <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), mean.fxn = rowMeans)) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1, mean.fxn = rowMeans)) +results.clr <- suppressWarnings(FindMarkers(object = clr.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1, mean.fxn = rowMeans)) +results.sct <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1, mean.fxn = rowMeans, vst.flaovr = "v1")) test_that("setting mean.fxn works", { - expect_equal(nrow(x = results), 191) + expect_equal(nrow(x = results), 216) expect_equal(results[1, "avg_logFC"], -4.204346, tolerance = 1e-6) expect_equal(results.clr[1, "avg_logFC"], -1.353025, tolerance = 1e-6) expect_equal(results.sct[1, "avg_logFC"], -1.064042, tolerance = 1e-6) }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, logfc.threshold = 2, verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, logfc.threshold = 2, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("logfc.threshold works", { - expect_equal(nrow(x = results), 112) + expect_equal(nrow(x = results), 139) expect_gte(min(abs(x = results$avg_logFC)), 2) }) -results <- expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, logfc.threshold = 100, verbose = FALSE, base = exp(1))) +results <- expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, logfc.threshold = 100, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("logfc.threshold warns when none met", { expect_equal(nrow(x = results), 0) }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.pct = 0.5, verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.pct = 0.5, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("min.pct works", { - expect_equal(nrow(x = results), 65) + expect_equal(nrow(x = results), 66) expect_gte(min(apply(X = results, MARGIN = 1, FUN = function(x) max(x[3], x[4]))), 0.5) }) -results <- expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.pct = 2.0, verbose = FALSE, base = exp(1))) +results <- expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.pct = 2.0, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("min.pct warns when none met", { expect_equal(nrow(x = results), 0) }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.diff.pct = 0.5, verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.diff.pct = 0.5, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("min.diff.pct works", { expect_equal(nrow(x = results), 44) expect_gte(min(apply(X = results, MARGIN = 1, FUN = function(x) abs(x[4] - x[3]))), 0.5) }) -results <- expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.diff.pct = 1.0, verbose = FALSE, base = exp(1))) +results <- expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, min.diff.pct = 1.0, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("min.diff.pct warns when none met", { expect_equal(nrow(x = results), 0) }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, only.pos = TRUE, verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, only.pos = TRUE, verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("only.pos works", { - expect_equal(nrow(x = results), 116) + expect_equal(nrow(x = results), 127) expect_true(all(results$avg_logFC > 0)) }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, max.cells.per.ident = 20, verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, max.cells.per.ident = 20, verbose = FALSE, base = exp(1),pseudocount.use = 1)) test_that("max.cells.per.ident works", { - expect_equal(nrow(x = results), 201) - expect_equal(results[1, "p_val"], 3.428568e-08) - expect_equal(results[1, "avg_logFC"], -2.539289, tolerance = 1e-6) + expect_equal(nrow(x = results), 222) + expect_equal(results[1, "p_val"], 3.428568e-08, tolerance = 1e-13) + expect_equal(results[1, "avg_logFC"], -2.638242, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.111) expect_equal(results[1, "pct.2"], 1) expect_equal(results[1, "p_val_adj"], 7.885706e-06) expect_equal(rownames(x = results)[1], "TYMP") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, latent.vars= "groups", verbose = FALSE, test.use = 'LR', base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, latent.vars= "groups", verbose = FALSE, test.use = 'LR', base = exp(1), pseudocount.use = 1)) test_that("latent.vars works", { expect_error(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, latent.vars= "fake", verbose = FALSE)) expect_warning(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, latent.vars= "groups", verbose = FALSE)) - expect_equal(nrow(x = results), 201) - expect_equal(results[1, "p_val"], 2.130202e-16) - expect_equal(results[1, "avg_logFC"], -3.082150, tolerance = 1e-6) + expect_equal(nrow(x = results), 222) + expect_equal(results[1, "p_val"], 2.130202e-16, tolerance = 1e-21) + expect_equal(results[1, "avg_logFC"], -3.102866, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.417) expect_equal(results[1, "pct.2"], 1) - expect_equal(results[1, "p_val_adj"], 4.899466e-14) + expect_equal(results[1, "p_val_adj"], 4.899466e-14, tolerance = 1e-19) expect_equal(rownames(x = results)[1], "LYZ") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = "g1", ident.2 = "g2", group.by= "groups", verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = "g1", ident.2 = "g2", group.by= "groups", verbose = FALSE, base = exp(1), pseudocount.use = 1)) t2 <- pbmc_small Idents(object = t2) <- "groups" -results2 <- suppressWarnings(FindMarkers(object = t2, ident.1 = "g1", ident.2 = "g2", verbose = FALSE, base = exp(1))) +results2 <- suppressWarnings(FindMarkers(object = t2, ident.1 = "g1", ident.2 = "g2", verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("group.by works", { - expect_equal(nrow(x = results), 136) + expect_equal(nrow(x = results), 190) expect_equal(results, results2) expect_equal(results[1, "p_val"], 0.02870319) - expect_equal(results[1, "avg_logFC"], 0.8226720, tolerance = 1e-6) + expect_equal(results[1, "avg_logFC"], 0.8473584, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.455) expect_equal(results[1, "pct.2"], 0.194) expect_equal(results[1, "p_val_adj"], 1) expect_equal(rownames(x = results)[1], "NOSIP") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = "g1", ident.2 = "g2", group.by= "groups", subset.ident = 0, verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = "g1", ident.2 = "g2", group.by= "groups", subset.ident = 0, verbose = FALSE, base = exp(1), pseudocount.use = 1)) t2 <- subset(x = pbmc_small, idents = 0) Idents(object = t2) <- "groups" -results2 <- suppressWarnings(FindMarkers(object = t2, ident.1 = "g1", ident.2 = "g2", verbose = FALSE, base = exp(1))) +results2 <- suppressWarnings(FindMarkers(object = t2, ident.1 = "g1", ident.2 = "g2", verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("subset.ident works", { - expect_equal(nrow(x = results), 127) + expect_equal(nrow(x = results), 183) expect_equal(results, results2) expect_equal(results[1, "p_val"], 0.01293720) - expect_equal(results[1, "avg_logFC"], 1.799280, tolerance = 1e-6) + expect_equal(results[1, "avg_logFC"], 1.912603, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.50) expect_equal(results[1, "pct.2"], 0.125) expect_equal(results[1, "p_val_adj"], 1) expect_equal(rownames(x = results)[1], "TSPO") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, reduction = "pca", verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, reduction = "pca", verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("reduction works", { - expect_equal(results[1, "p_val"], 1.664954e-10) + expect_equal(results[1, "p_val"], 1.664954e-10, tolerance = 1e-15) expect_equal(results[1, "avg_diff"], -2.810453669, tolerance = 1e-6) - expect_equal(results[1, "p_val_adj"], 3.163412e-09) + expect_equal(results[1, "p_val_adj"], 3.163412e-09, tolerance = 1e-14) expect_equal(rownames(x = results)[1], "PC_2") }) -results <- FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "bimod", verbose = FALSE, base = exp(1)) +results <- FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "bimod", verbose = FALSE, base = exp(1), pseudocount.use = 1) test_that("bimod test works", { - expect_equal(nrow(x = results), 201) - expect_equal(results[1, "p_val"], 4.751376e-17) - expect_equal(results[1, "avg_logFC"], -2.552769, tolerance = 1e-6) + expect_equal(nrow(x = results), 222) + expect_equal(results[1, "p_val"], 4.751376e-17, tolerance = 1e-22) + expect_equal(results[1, "avg_logFC"], -2.57219, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.306) expect_equal(results[1, "pct.2"], 1.00) - expect_equal(results[1, "p_val_adj"], 1.092816e-14) + expect_equal(results[1, "p_val_adj"], 1.092816e-14, tolerance = 1e-19) expect_equal(rownames(x = results)[1], "CST3") }) -results <- FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "roc", verbose = FALSE, base = exp(1)) +results <- FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "roc", verbose = FALSE, base = exp(1), pseudocount.use = 1) test_that("roc test works", { - expect_equal(nrow(x = results), 201) + expect_equal(nrow(x = results), 222) # expect_equal(colnames(x = results), c("myAUC", "avg_diff", "power", "pct.1", "pct.2")) expect_equal(colnames(x = results), c("myAUC", "avg_diff", "power", "avg_logFC", "pct.1", "pct.2")) expect_equal(results["CST3", "myAUC"], 0.018) @@ -237,91 +238,173 @@ test_that("roc test works", { expect_equal(rownames(x = results)[1], "LYZ") }) -results <- FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "t", verbose = FALSE, base = exp(1)) -test_that("bimod test works", { - expect_equal(nrow(x = results), 201) - expect_equal(results["CST3", "p_val"], 1.170112e-15) - expect_equal(results["CST3", "avg_logFC"], -2.552769 , tolerance = 1e-6) +results <- FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "t", verbose = FALSE, base = exp(1), pseudocount.use = 1) +test_that("t test works", { + expect_equal(nrow(x = results), 222) + expect_equal(results["CST3", "p_val"], 1.170112e-15, tolerance = 1e-20) + expect_equal(results["CST3", "avg_logFC"], -2.57219, tolerance = 1e-6) expect_equal(results["CST3", "pct.1"], 0.306) expect_equal(results["CST3", "pct.2"], 1.00) - expect_equal(results["CST3", "p_val_adj"], 2.691258e-13) + expect_equal(results["CST3", "p_val_adj"], 2.691258e-13, tolerance = 1e-18) expect_equal(rownames(x = results)[1], "TYMP") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "negbinom", verbose = FALSE, base = exp(1))) +results <- suppressWarnings( + FindMarkers( + object = pbmc_small, + ident.1 = 0, ident.2 = 1, + test.use = "negbinom", + verbose = FALSE, + base = exp(1), + fc.slot = "counts", + pseudocount.use = 1 + ) +) test_that("negbinom test works", { - expect_equal(nrow(x = results), 149) - expect_equal(results["CST3", "p_val"], 1.354443e-17) - expect_equal(results["CST3", "avg_logFC"], -2.353701, tolerance = 1e-6) + expect_equal(nrow(x = results), 204) + expect_equal(results["CST3", "p_val"], 1.354443e-17, tolerance = 1e-22) + expect_equal(results["CST3", "avg_logFC"], -2.878123, tolerance = 1e-6) expect_equal(results["CST3", "pct.1"], 0.306) expect_equal(results["CST3", "pct.2"], 1.00) - expect_equal(results["CST3", "p_val_adj"], 3.115218e-15) + expect_equal(results["CST3", "p_val_adj"], 3.115218e-15, tolerance = 1e-20) expect_equal(rownames(x = results)[1], "LYZ") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "poisson", verbose = FALSE, base = exp(1))) +results <- suppressWarnings( + FindMarkers( + object = pbmc_small, + ident.1 = 0, + ident.2 = 1, + test.use = "poisson", + verbose = FALSE, + base = exp(1), + fc.slot = "counts", + pseudocount.use = 1 + ) +) test_that("poisson test works", { - expect_equal(nrow(x = results), 149) - expect_equal(results["CST3", "p_val"], 3.792196e-78) - expect_equal(results["CST3", "avg_logFC"], -2.353701, tolerance = 1e-6) + expect_equal(nrow(x = results), 204) + expect_equal(results["CST3", "p_val"], 3.792196e-78, tolerance = 1e-83) + expect_equal(results["CST3", "avg_logFC"], -2.878123, tolerance = 1e-6) expect_equal(results["CST3", "pct.1"], 0.306) expect_equal(results["CST3", "pct.2"], 1.00) - expect_equal(results["CST3", "p_val_adj"], 8.722050e-76) + expect_equal(results["CST3", "p_val_adj"], 8.722050e-76, tolerance = 1e-81) expect_equal(rownames(x = results)[1], "LYZ") }) -results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "LR", verbose = FALSE, base = exp(1))) +results <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, test.use = "LR", verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("LR test works", { - expect_equal(nrow(x = results), 201) - expect_equal(results["CST3", "p_val"], 3.990707e-16) - expect_equal(results["CST3", "avg_logFC"], -2.552769, tolerance = 1e-6) + expect_equal(nrow(x = results), 222) + expect_equal(results["CST3", "p_val"], 3.990707e-16, tolerance = 1e-21) + expect_equal(results["CST3", "avg_logFC"], -2.57219, tolerance = 1e-6) expect_equal(results["CST3", "pct.1"], 0.306) expect_equal(results["CST3", "pct.2"], 1.00) - expect_equal(results["CST3", "p_val_adj"], 9.178625e-14) + expect_equal(results["CST3", "p_val_adj"], 9.178625e-14, tolerance = 1e-19) expect_equal(rownames(x = results)[1], "LYZ") }) +test_that("FindMarkers with wilcox_limma works", { + skip_on_cran() + skip_if_not_installed("limma") + markers.0.limma <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, verbose = FALSE, base = exp(1),pseudocount.use = 1,test.use='wilcox_limma')) + markers.01.limma <- suppressWarnings(FindMarkers(object = pbmc_small, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1),pseudocount.use = 1,test.use='wilcox_limma')) + results.clr.limma <- suppressWarnings(FindMarkers(object = clr.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1,test.use='wilcox_limma')) + results.sct.limma <- suppressWarnings(FindMarkers(object = sct.obj, ident.1 = 0, ident.2 = 1, verbose = FALSE, base = exp(1), pseudocount.use = 1,test.use='wilcox_limma')) + + expect_equal(colnames(x = markers.0.limma), c("p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj")) + expect_equal(markers.0.limma[1, "p_val"], 9.572778e-13, tolerance = 1e-18) + expect_equal(markers.0.limma[1, "avg_logFC"], -4.180029, tolerance = 1e-6) + expect_equal(markers.0.limma[1, "pct.1"], 0.083) + expect_equal(markers.0.limma[1, "pct.2"], 0.909) + expect_equal(markers.0.limma[1, "p_val_adj"], 2.201739e-10, tolerance = 1e-15) + expect_equal(nrow(x = markers.0.limma), 228) + expect_equal(rownames(markers.0.limma)[1], "HLA-DPB1") + + expect_equal(markers.01.limma[1, "p_val"], 1.702818e-11, tolerance = 1e-16) + expect_equal(markers.01.limma[1, "avg_logFC"], -2.638242, tolerance = 1e-6) + expect_equal(markers.01.limma[1, "pct.1"], 0.111) + expect_equal(markers.01.limma[1, "pct.2"], 1.00) + expect_equal(markers.01.limma[1, "p_val_adj"], 3.916481e-09, tolerance = 1e-14) + expect_equal(nrow(x = markers.01.limma), 222) + expect_equal(rownames(x = markers.01.limma)[1], "TYMP") + + expect_equal(results.clr.limma[1, "p_val"], 1.209462e-11, tolerance = 1e-16) + expect_equal(results.clr.limma[1, "avg_logFC"], -2.946633, tolerance = 1e-6) + expect_equal(results.clr.limma[1, "pct.1"], 0.111) + expect_equal(results.clr.limma[1, "pct.2"], 0.96) + expect_equal(results.clr.limma[1, "p_val_adj"], 2.781762e-09, tolerance = 1e-14) + expect_equal(nrow(x = results.clr.limma), 213) + expect_equal(rownames(x = results.clr.limma)[1], "S100A8") + + expect_equal(results.sct.limma[1, "p_val"], 6.225491e-11, tolerance = 1e-16) + expect_equal(results.sct.limma[1, "avg_logFC"], -2.545867, tolerance = 1e-6) + expect_equal(results.sct.limma[1, "pct.1"], 0.111) + expect_equal(results.sct.limma[1, "pct.2"], 0.96) + expect_equal(results.sct.limma[1, "p_val_adj"], 1.369608e-08, tolerance = 1e-13) + expect_equal(nrow(x = results.sct.limma), 214) + expect_equal(rownames(x = results.sct.limma)[1], "TYMP") +}) + +test_that("BPCells FindMarkers gives same results", { + skip_on_cran() + skip_if_not_installed("BPCells") + library(BPCells) + library(Matrix) + mat_bpcells <- t(as(t(pbmc_small[['RNA']]$counts ), "IterableMatrix")) + pbmc_small[['RNAbp']] <- CreateAssay5Object(counts = mat_bpcells) + pbmc_small <- NormalizeData(pbmc_small, assay = "RNAbp") + markers.bp <- suppressWarnings(FindMarkers(object = pbmc_small, assay = "RNAbp", ident.1 = 0, verbose = FALSE, base = exp(1),pseudocount.use = 1)) + expect_equal(colnames(x = markers.bp), c("p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj")) + expect_equal(markers.bp[1, "p_val"], 9.572778e-13) + expect_equal(markers.bp[1, "avg_logFC"], -4.180029, tolerance = 1e-6) + expect_equal(markers.bp[1, "pct.1"], 0.083) + expect_equal(markers.bp[1, "pct.2"], 0.909) + expect_equal(markers.bp[1, "p_val_adj"], 2.201739e-10) + expect_equal(nrow(x = markers.bp), 228) + expect_equal(rownames(markers.bp)[1], "HLA-DPB1") +}) + # Tests for FindAllMarkers # ------------------------------------------------------------------------------- -results <- suppressMessages(suppressWarnings(FindAllMarkers(object = pbmc_small))) -results.clr <- suppressMessages(suppressWarnings(FindAllMarkers(object = clr.obj))) -results.sct <- suppressMessages(suppressWarnings(FindAllMarkers(object = sct.obj))) +results <- suppressMessages(suppressWarnings(FindAllMarkers(object = pbmc_small,pseudocount.use=1))) +results.clr <- suppressMessages(suppressWarnings(FindAllMarkers(object = clr.obj, pseudocount.use=1))) +results.sct <- suppressMessages(suppressWarnings(FindAllMarkers(object = sct.obj, pseudocount.use=1, vst.flavor = "v1"))) results.pseudo <- suppressMessages(suppressWarnings(FindAllMarkers(object = pbmc_small, pseudocount.use = 0.1))) test_that("FindAllMarkers works as expected", { expect_equal(colnames(x = results), c("p_val", "avg_log2FC", "pct.1", "pct.2", "p_val_adj", "cluster", "gene")) - expect_equal(results[1, "p_val"], 9.572778e-13) - expect_equal(results[1, "avg_log2FC"], -5.820829, tolerance = 1e-6) + expect_equal(results[1, "p_val"], 9.572778e-13, tolerance = 1e-18) + expect_equal(results[1, "avg_log2FC"], -6.030507, tolerance = 1e-6) expect_equal(results[1, "pct.1"], 0.083) expect_equal(results[1, "pct.2"], 0.909) - expect_equal(results[1, "p_val_adj"], 2.201739e-10) + expect_equal(results[1, "p_val_adj"], 2.201739e-10, tolerance = 1e-15) expect_equal(nrow(x = results), 222) expect_equal(rownames(results)[1], "HLA-DPB1") # CLR normalization - expect_equal(results.clr[1, "p_val"], 1.209462e-11) - expect_equal(results.clr[1, "avg_log2FC"], -1.079924, tolerance = 1e-6) + expect_equal(results.clr[1, "p_val"], 1.338858e-12, tolerance = 1e-17) + expect_equal(results.clr[1, "avg_log2FC"], -4.088546, tolerance = 1e-6) expect_equal(results.clr[1, "pct.1"], 0.083) expect_equal(results.clr[1, "pct.2"], 0.909) - expect_equal(results.clr[1, "p_val_adj"], 3.079373e-10) - expect_equal(nrow(x = results.clr), 200) + expect_equal(results.clr[1, "p_val_adj"], 3.079373e-10, tolerance = 1e-15) + expect_equal(nrow(x = results.clr), 222) expect_equal(rownames(x = results.clr)[1], "HLA-DPB1") # SCT normalization - expect_equal(results.sct[1, "p_val"], 4.25861e-12) - expect_equal(results.sct[1, "avg_log2FC"], -2.70188, tolerance = 1e-6) + expect_equal(results.sct[1, "p_val"], 4.25861e-12, tolerance = 1e-17) + expect_equal(results.sct[1, "avg_log2FC"], -5.088014, tolerance = 1e-6) expect_equal(results.sct[1, "pct.1"], 0.167) expect_equal(results.sct[1, "pct.2"], 0.909) - expect_equal(results.sct[1, "p_val_adj"], 9.368941e-10) - expect_equal(nrow(x = results.sct), 210) + expect_equal(results.sct[1, "p_val_adj"], 9.368941e-10, tolerance = 1e-15) + expect_equal(nrow(x = results.sct), 212) expect_equal(rownames(x = results.sct)[1], "HLA-DPB1") # pseudocount.use = 0.1 - expect_equal(results.pseudo[1, "p_val"], 9.572778e-13) - expect_equal(results.pseudo[1, "avg_log2FC"], -6.013818, tolerance = 1e-6) + expect_equal(results.pseudo[1, "p_val"], 9.572778e-13, tolerance = 1e-18) + expect_equal(results.pseudo[1, "avg_log2FC"], -6.036353, tolerance = 1e-6) expect_equal(results.pseudo[1, "pct.1"], 0.083) expect_equal(results.pseudo[1, "pct.2"], 0.909) - expect_equal(results.pseudo[1, "p_val_adj"], 2.201739e-10) + expect_equal(results.pseudo[1, "p_val_adj"], 2.201739e-10, tolerance = 1e-15) expect_equal(nrow(x = results.pseudo), 222) expect_equal(rownames(results.pseudo)[1], "HLA-DPB1") }) @@ -330,12 +413,16 @@ test_that("FindAllMarkers works as expected", { # Tests for running FindMarkers post integration/transfer ref <- pbmc_small ref <- FindVariableFeatures(object = ref, verbose = FALSE, nfeatures = 100) -query <- CreateSeuratObject( - counts = GetAssayData(object = pbmc_small[['RNA']], slot = "counts") + rpois(n = ncol(pbmc_small), lambda = 1) -) -query2 <- CreateSeuratObject( - counts = GetAssayData(object = pbmc_small[['RNA']], slot = "counts")[, 1:40] + rpois(n = ncol(pbmc_small), lambda = 1) -) +query <- CreateSeuratObject(CreateAssayObject( + counts = as.sparse(GetAssayData(object = pbmc_small[['RNA']], layer = "counts") + rpois(n = ncol(pbmc_small), lambda = 1)) +)) + +query2 <- CreateSeuratObject(CreateAssayObject( + counts = as.sparse(GetAssayData(object = pbmc_small[['RNA']], layer = "counts")[, 1:40] + rpois(n = ncol(pbmc_small), lambda = 1)) +)) + + + query.list <- list(query, query2) query.list <- lapply(X = query.list, FUN = NormalizeData, verbose = FALSE) query.list <- lapply(X = query.list, FUN = FindVariableFeatures, verbose = FALSE, nfeatures = 100) @@ -343,17 +430,41 @@ query.list <- lapply(X = query.list, FUN = ScaleData, verbose = FALSE) query.list <- suppressWarnings(lapply(X = query.list, FUN = RunPCA, verbose = FALSE, npcs = 20)) anchors <- suppressMessages(suppressWarnings(FindIntegrationAnchors(object.list = c(ref, query.list), k.filter = NA, verbose = FALSE))) -object <- suppressMessages(IntegrateData(anchorset = anchors, k.weight = 25, verbose = FALSE)) +object <- suppressWarnings(suppressMessages(IntegrateData(anchorset = anchors, k.weight = 25, verbose = FALSE))) object <- suppressMessages(ScaleData(object, verbose = FALSE)) object <- suppressMessages(RunPCA(object, verbose = FALSE)) object <- suppressMessages(FindNeighbors(object = object, verbose = FALSE)) object <- suppressMessages(FindClusters(object, verbose = FALSE)) -markers <- FindMarkers(object = object, ident.1="0", ident.2="1") -test_that("FindMarkers recognizes log normalizatio", { - expect_equal(markers[1, "p_val"], 1.598053e-14) - expect_equal(markers[1, "avg_log2FC"], -2.614686, tolerance = 1e-6) +markers <- FindMarkers(object = object, ident.1="0", ident.2="1",pseudocount.use = 1, verbose=FALSE) +test_that("FindMarkers recognizes log normalization", { + expect_equal(markers[1, "p_val"], 1.598053e-14, tolerance = 1e-19) + expect_equal(markers[1, "avg_log2FC"], -2.634458, tolerance = 1e-6) +}) + + +test_that("BPCells FindAllMarkers gives same results", { + skip_on_cran() + skip_if_not_installed("BPCells") + library(BPCells) + library(Matrix) + mat_bpcells <- t(as(t(pbmc_small[['RNA']]$counts ), "IterableMatrix")) + pbmc_small[['RNAbp']] <- CreateAssay5Object(counts = mat_bpcells) + pbmc_small <- NormalizeData(pbmc_small, assay = "RNAbp") + + results.bp <- suppressMessages(suppressWarnings(FindAllMarkers(object = pbmc_small, assay = "RNAbp", pseudocount.use=1))) + + expect_equal(colnames(x = results.bp), c("p_val", "avg_log2FC", "pct.1", "pct.2", "p_val_adj", "cluster", "gene")) + expect_equal(results.bp[1, "p_val"], 9.572778e-13) + expect_equal(results.bp[1, "avg_log2FC"], -6.030507, tolerance = 1e-6) + expect_equal(results.bp[1, "pct.1"], 0.083) + expect_equal(results.bp[1, "pct.2"], 0.909) + expect_equal(results.bp[1, "p_val_adj"], 2.201739e-10) + expect_equal(nrow(x = results.bp), 222) + expect_equal(rownames(results.bp)[1], "HLA-DPB1") }) + + # Tests for FindConservedMarkers # ------------------------------------------------------------------------------- @@ -361,25 +472,25 @@ if (requireNamespace('metap', quietly = TRUE)) { context("FindConservedMarkers") pbmc_small$groups - markers <- suppressWarnings(FindConservedMarkers(object = pbmc_small, ident.1 = 0, grouping.var = "groups", verbose = FALSE, base = exp(1))) + markers <- suppressWarnings(FindConservedMarkers(object = pbmc_small, ident.1 = 0, grouping.var = "groups", verbose = FALSE, base = exp(1), pseudocount.use = 1)) standard.names <- c("p_val", "avg_logFC", "pct.1", "pct.2", "p_val_adj") test_that("FindConservedMarkers works", { expect_equal(colnames(x = markers), c(paste0("g2_", standard.names), paste0("g1_", standard.names), "max_pval", "minimump_p_val")) expect_equal(markers[1, "g2_p_val"], 4.983576e-05) - expect_equal(markers[1, "g2_avg_logFC"], -4.125279, tolerance = 1e-6) + expect_equal(markers[1, "g2_avg_logFC"], -4.364959, tolerance = 1e-6) # expect_equal(markers[1, "g2_pct.1"], 0.062) expect_equal(markers[1, "g2_pct.2"], 0.75) expect_equal(markers[1, "g2_p_val_adj"], 0.0114622238) - expect_equal(markers[1, "g1_p_val"], 3.946643e-08) - expect_equal(markers[1, "g1_avg_logFC"], -3.589384, tolerance = 1e-6) + expect_equal(markers[1, "g1_p_val"], 3.946643e-08, tolerance = 1e-13) + expect_equal(markers[1, "g1_avg_logFC"], -3.69215, tolerance = 1e-6) expect_equal(markers[1, "g1_pct.1"], 0.10) expect_equal(markers[1, "g1_pct.2"], 0.958) expect_equal(markers[1, "g1_p_val_adj"], 9.077279e-06) expect_equal(markers[1, "max_pval"], 4.983576e-05) - expect_equal(markers[1, "minimump_p_val"], 7.893286e-08) - expect_equal(nrow(markers), 179) + expect_equal(markers[1, "minimump_p_val"], 7.893286e-08, tolerance = 1e-13) + expect_equal(nrow(markers), 219) expect_equal(rownames(markers)[1], "HLA-DRB1") expect_equal(markers[, "max_pval"], unname(obj = apply(X = markers, MARGIN = 1, FUN = function(x) max(x[c("g1_p_val", "g2_p_val")])))) }) @@ -394,17 +505,18 @@ if (requireNamespace('metap', quietly = TRUE)) { Idents(object = pbmc.test) <- "RNA_snn_res.1" pbmc.test$id.group <- paste0(pbmc.test$RNA_snn_res.1, "_", pbmc.test$groups) pbmc.test <- subset(x = pbmc.test, id.group == "0_g1", invert = TRUE) - markers.missing <- suppressWarnings(FindConservedMarkers(object = pbmc.test, ident.1 = 0, grouping.var = "groups", test.use = "t", verbose = FALSE, base = exp(1))) + markers.missing <- suppressWarnings(FindConservedMarkers(object = pbmc.test, ident.1 = 0, grouping.var = "groups", test.use = "t", verbose = FALSE, base = exp(1), pseudocount.use = 1)) test_that("FindConservedMarkers handles missing idents in certain groups", { expect_warning(FindConservedMarkers(object = pbmc.test, ident.1 = 0, grouping.var = "groups", test.use = "t")) expect_equal(colnames(x = markers.missing), paste0("g2_", standard.names)) - expect_equal(markers.missing[1, "g2_p_val"], 1.672911e-13) - expect_equal(markers.missing[1, "g2_avg_logFC"], -4.527888, tolerance = 1e-6) + expect_equal(markers.missing[1, "g2_p_val"], 1.672911e-13, tolerance = 1e-18) + expect_equal(markers.missing[1, "g2_avg_logFC"], -4.796379, tolerance = 1e-6) # expect_equal(markers.missing[1, "g2_pct.1"], 0.062) expect_equal(markers.missing[1, "g2_pct.2"], 0.95) - expect_equal(markers.missing[1, "g2_p_val_adj"], 3.847695e-11) - expect_equal(nrow(markers.missing), 205) + expect_equal(markers.missing[1, "g2_p_val_adj"], 3.847695e-11, tolerance = 1e-16) + expect_equal(nrow(markers.missing), 226) expect_equal(rownames(markers.missing)[1], "HLA-DPB1") }) } + diff --git a/tests/testthat/test_dimensional_reduction.R b/tests/testthat/test_dimensional_reduction.R index 9fadd6ea6..ebc529298 100644 --- a/tests/testthat/test_dimensional_reduction.R +++ b/tests/testthat/test_dimensional_reduction.R @@ -1,16 +1,18 @@ context("test-dimensional_reduction") -test_that("different ways of passing distance matrix", { - # Generate dummy data exp matrix - set.seed(1) - dummyexpMat <- matrix(data = sample(x = c(1:50), size = 1e4, replace = TRUE), - ncol = 100, nrow = 100) - colnames(dummyexpMat) <- paste0("cell", seq(ncol(dummyexpMat))) - row.names(dummyexpMat) <- paste0("gene", seq(nrow(dummyexpMat))) +set.seed(seed = 1) +dummyexpMat <- matrix( + data = sample(x = c(1:50), size = 1e4, replace = TRUE), + ncol = 100, nrow = 100 +) +colnames(x = dummyexpMat) <- paste0("cell", seq(ncol(x = dummyexpMat))) +row.names(x = dummyexpMat) <- paste0("gene", seq(nrow(x = dummyexpMat))) + +# Create Seurat object for testing +obj <- CreateSeuratObject(counts = as.sparse(dummyexpMat)) - # Create Seurat object for testing - obj <- CreateSeuratObject(counts = dummyexpMat) +test_that("different ways of passing distance matrix", { # Manually make a distance object to test distMat <- dist(t(dummyexpMat)) @@ -28,29 +30,19 @@ test_that("different ways of passing distance matrix", { ) }) -test_that("pca returns total variance (see #982)", { - # Generate dummy data exp matrix - set.seed(seed = 1) - dummyexpMat <- matrix( - data = sample(x = c(1:50), size = 1e4, replace = TRUE), - ncol = 100, nrow = 100 - ) - colnames(x = dummyexpMat) <- paste0("cell", seq(ncol(x = dummyexpMat))) - row.names(x = dummyexpMat) <- paste0("gene", seq(nrow(x = dummyexpMat))) +# Normalize, scale, and compute PCA, using RunPCA +obj <- NormalizeData(object = obj, verbose = FALSE) +obj <- ScaleData(object = obj, verbose = FALSE) - # Create Seurat object for testing - obj <- CreateSeuratObject(counts = dummyexpMat) - - # Scale and compute PCA, using RunPCA - obj <- ScaleData(object = obj, verbose = FALSE) - pca_result <- suppressWarnings(expr = RunPCA( - object = obj, - features = rownames(x = obj), - verbose = FALSE - )) +pca_result <- suppressWarnings(expr = RunPCA( + object = obj, + features = rownames(obj[['RNA']]$counts), + verbose = FALSE +)) +test_that("pca returns total variance (see #982)", { # Using stats::prcomp - scaled_data <- Seurat::GetAssayData(object = obj, slot = "scale.data") + scaled_data <- LayerData(object = obj, layer = "scale.data") prcomp_result <- stats::prcomp(scaled_data, center = FALSE, scale. = FALSE) # Compare @@ -58,3 +50,22 @@ test_that("pca returns total variance (see #982)", { sum(prcomp_result$sdev^2)) }) + +test_that("pca is equivalent for BPCells", { + skip_on_cran() + skip_if_not_installed("BPCells") + library(BPCells) + library(Matrix) + mat_bpcells <- t(x = as(object = t(x = obj[['RNA']]$counts ), Class = "IterableMatrix")) + obj[['RNAbp']] <- CreateAssay5Object(counts = mat_bpcells) + DefaultAssay(obj) <- "RNAbp" + obj <- NormalizeData(object = obj, verbose = FALSE) + obj <- ScaleData(object = obj, verbose=FALSE) + pca_result_bp <- suppressWarnings(expr = RunPCA( + object = obj, + features = rownames(obj[['RNAbp']]$counts), + assay = "RNAbp")) + expect_equivalent(abs(pca_result_bp[['pca']]@cell.embeddings), + abs(pca_result[['pca']]@cell.embeddings), + tolerance = 1e-5) +}) diff --git a/tests/testthat/test_integratedata.R b/tests/testthat/test_integratedata.R index ebc2036bd..7f6f27d37 100644 --- a/tests/testthat/test_integratedata.R +++ b/tests/testthat/test_integratedata.R @@ -6,10 +6,22 @@ pbmc_small <- suppressWarnings(UpdateSeuratObject(pbmc_small)) ref <- pbmc_small ref <- FindVariableFeatures(object = ref, verbose = FALSE, nfeatures = 100) query <- CreateSeuratObject( - counts = GetAssayData(object = pbmc_small[['RNA']], slot = "counts") + rpois(n = ncol(pbmc_small), lambda = 1) + counts = as.sparse( + GetAssayData( + object = pbmc_small[['RNA']], + layer = "counts") + rpois(n = ncol(pbmc_small), + lambda = 1 + ) + ) ) query2 <- CreateSeuratObject( - counts = GetAssayData(object = pbmc_small[['RNA']], slot = "counts")[, 1:40] + rpois(n = ncol(pbmc_small), lambda = 1) + counts = as.sparse( + LayerData( + object = pbmc_small[['RNA']], + layer = "counts")[, 1:40] + rpois(n = ncol(pbmc_small), + lambda = 1 + ) + ) ) query.list <- list(query, query2) query.list <- lapply(X = query.list, FUN = NormalizeData, verbose = FALSE) @@ -65,29 +77,29 @@ context("IntegrateData") test_that("IntegrateData with two objects default work", { expect_error(IntegrateData(anchorset = anchors2)) - int2 <- IntegrateData(anchorset = anchors2, k.weight = 50, verbose = FALSE) + int2 <- suppressWarnings(IntegrateData(anchorset = anchors2, k.weight = 50, verbose = FALSE)) expect_true(all(Assays(int2) %in% c("integrated", "RNA"))) expect_equal(Tool(int2), "Integration") expect_equal(dim(int2[["integrated"]]), c(133, 160)) expect_equal(length(VariableFeatures(int2)), 133) - expect_equal(GetAssayData(int2[["integrated"]], slot = "counts"), new("dgCMatrix")) - expect_equal(GetAssayData(int2[['integrated']], slot = "scale.data"), matrix()) - expect_equal(sum(GetAssayData(int2[["integrated"]])[1, ]), 44.97355, tolerance = 1e-3) - expect_equal(sum(GetAssayData(int2[["integrated"]])[, 1]), 78.8965706046, tolerance = 1e-6) + expect_equal(GetAssayData(int2[["integrated"]], layer = "counts"), new("dgCMatrix")) + expect_equal(GetAssayData(int2[['integrated']], layer = "scale.data"), matrix()) + expect_equal(sum(GetAssayData(int2[["integrated"]], layer = "data")[1, ]), 44.97355, tolerance = 1e-3) + expect_equal(sum(GetAssayData(int2[["integrated"]], layer = "data")[, 1]), 78.8965706046, tolerance = 1e-6) expect_equal(Tool(object = int2, slot = "Integration")@sample.tree, matrix(c(-1, -2), nrow = 1)) }) test_that("IntegrateData with three objects default work", { expect_error(IntegrateData(anchorset = anchors3, k.weight = 50)) - int3 <- IntegrateData(anchorset = anchors3, k.weight = 25, verbose = FALSE) + int3 <- suppressWarnings(IntegrateData(anchorset = anchors3, k.weight = 25, verbose = FALSE)) expect_true(all(Assays(int3) %in% c("integrated", "RNA"))) expect_equal(Tool(int3), "Integration") expect_equal(dim(int3[["integrated"]]), c(169, 200)) expect_equal(length(VariableFeatures(int3)), 169) - expect_equal(GetAssayData(int3[["integrated"]], slot = "counts"), new("dgCMatrix")) - expect_equal(GetAssayData(int3[['integrated']], slot = "scale.data"), matrix()) - expect_equal(sum(GetAssayData(int3[["integrated"]])[1, ]), 372.829, tolerance = 1e-6) - expect_equal(sum(GetAssayData(int3[["integrated"]])[, 1]), 482.5009, tolerance = 1e-6) + expect_equal(GetAssayData(int3[["integrated"]], layer = "counts"), new("dgCMatrix")) + expect_equal(GetAssayData(int3[['integrated']], layer = "scale.data"), matrix()) + expect_equal(sum(GetAssayData(int3[["integrated"]], layer = "data")[1, ]), 372.829, tolerance = 1e-6) + expect_equal(sum(GetAssayData(int3[["integrated"]], layer = "data")[, 1]), 482.5009, tolerance = 1e-6) expect_equal(Tool(object = int3, slot = "Integration")@sample.tree, matrix(c(-2, -3, 1, -1), nrow = 2, byrow = TRUE)) }) @@ -100,5 +112,3 @@ test_that("Input validates correctly ", { #expect_warning(IntegrateData(anchorset = anchors2, k.weight = 50, features = c(rownames(ref), "BAD"))) #expect_warning(IntegrateData(anchorset = anchors2, k.weight = 50, dims = 1:1000)) }) - - diff --git a/tests/testthat/test_integration.R b/tests/testthat/test_integration.R index ff5f57e01..94843afd3 100644 --- a/tests/testthat/test_integration.R +++ b/tests/testthat/test_integration.R @@ -5,7 +5,13 @@ pbmc_small <- suppressWarnings(UpdateSeuratObject(pbmc_small)) # Setup test objects ref <- pbmc_small query <- CreateSeuratObject( - counts = GetAssayData(object = pbmc_small[['RNA']], slot = "counts") + rpois(n = ncol(pbmc_small), lambda = 1) + counts = as.sparse( + GetAssayData( + object = pbmc_small[['RNA']], + layer = "counts") + rpois(n = ncol(pbmc_small), + lambda = 1 + ) + ) ) query <- NormalizeData(object = query, verbose = FALSE) query <- FindVariableFeatures(object = query, verbose = FALSE, nfeatures = 100) @@ -20,8 +26,8 @@ test_that("FindTransferAnchors defaults work", { co <- anchors@object.list[[1]] expect_equal(dim(co), c(100, 160)) expect_equal(Reductions(co), c("pcaproject", "pcaproject.l2")) - expect_equal(GetAssayData(co[["RNA"]])[1, 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")[1, 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer ="data")[1, 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")[1, 3], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], 0.4840944592, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)[1, 1], 0.2103563963, tolerance = 1e-7) @@ -73,7 +79,7 @@ test_that("FindTransferAnchors allows reference.reduction to be precomputed", { expect_equal(dim(co), c(100, 160)) expect_equal(Reductions(co), c("pcaproject", "pcaproject.l2")) expect_equal(GetAssayData(co[["RNA"]])[1, 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")[1, 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")[1, 3], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], 0.4840944592, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)[1, 1], 0.2103563963, tolerance = 1e-7) @@ -103,8 +109,8 @@ test_that("FindTransferAnchors with cca defaults work", { expect_equal(Reductions(co), c("cca", "cca.l2")) expect_equal(GetAssayData(co[["RNA"]])["PPBP", 3], 0) expect_equal(GetAssayData(co[["RNA"]])["PPBP", 1], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")["PPBP", 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")["PPBP", 1], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")["PPBP", 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")["PPBP", 1], 0) expect_equal(dim(co[['cca']]), c(160, 30)) expect_equal(Embeddings(co[['cca']])[1, 1], 0.04611130861, tolerance = 1e-7) expect_equal(Loadings(co[['cca']], projected = T)["PPBP", 1], 12.32379661, tolerance = 1e-7) @@ -132,10 +138,10 @@ test_that("FindTransferAnchors with project.query defaults work", { co <- anchors@object.list[[1]] expect_equal(dim(co), c(100, 160)) expect_equal(Reductions(co), c("pcaproject", "pcaproject.l2")) - expect_equal(GetAssayData(co[["RNA"]])["PPBP", 3], 0) - expect_equal(GetAssayData(co[["RNA"]])["PPBP", 1], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")["PPBP", 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")["PPBP", 1], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "data")["PPBP", 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "data")["PPBP", 1], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")["PPBP", 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")["PPBP", 1], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], 1.577959404, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)["PPBP", 1], 0.1145472305, tolerance = 1e-7) @@ -168,8 +174,8 @@ test_that("FindTransferAnchors with project.query and reference.reduction works" expect_equal(Reductions(co), c("pcaproject", "pcaproject.l2")) expect_equal(GetAssayData(co[["RNA"]])["PPBP", 3], 0) expect_equal(GetAssayData(co[["RNA"]])["PPBP", 1], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")["PPBP", 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")["PPBP", 1], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")["PPBP", 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")["PPBP", 1], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], 1.577959404, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)["PPBP", 1], 0.1145472305, tolerance = 1e-7) @@ -202,7 +208,7 @@ test_that("FindTransferAnchors with reference.neighbors precomputed works", { expect_equal(dim(co), c(100, 160)) expect_equal(Reductions(co), c("pcaproject", "pcaproject.l2")) expect_equal(GetAssayData(co[["RNA"]])[1, 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")[1, 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")[1, 3], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], 0.4840944592, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)[1, 1], 0.2103563963, tolerance = 1e-7) @@ -231,7 +237,7 @@ test_that("FindTransferAnchors with no l2 works", { expect_equal(dim(co), c(100, 160)) expect_equal(Reductions(co), c("pcaproject")) expect_equal(GetAssayData(co[["RNA"]])[1, 3], 0) - expect_equal(GetAssayData(co[["RNA"]], slot = "counts")[1, 3], 0) + expect_equal(GetAssayData(co[["RNA"]], layer = "counts")[1, 3], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], 0.4840944592, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)[1, 1], 0.2103563963, tolerance = 1e-7) @@ -250,9 +256,9 @@ test_that("FindTransferAnchors with no l2 works", { expect_equal(anchors@neighbors, list()) }) -# SCTransform tests -query <- suppressWarnings(SCTransform(object = query, verbose = FALSE)) -ref <- suppressWarnings(SCTransform(object = ref, verbose = FALSE)) +# SCTransform tests V1 +query <- suppressWarnings(SCTransform(object = query, verbose = FALSE,vst.flavor = 'v1')) +ref <- suppressWarnings(SCTransform(object = ref, verbose = FALSE,vst.flavor = 'v1')) test_that("FindTransferAnchors with default SCT works", { skip_on_cran() @@ -261,7 +267,7 @@ test_that("FindTransferAnchors with default SCT works", { expect_equal(dim(co), c(220, 160)) expect_equal(Reductions(co), c("pcaproject", "pcaproject.l2")) expect_equal(DefaultAssay(co), "SCT") - expect_equal(GetAssayData(co[["SCT"]], slot = "scale.data"), new(Class = "matrix")) + expect_equal(GetAssayData(co[["SCT"]], layer = "scale.data"), new(Class = "matrix")) expect_equal(GetAssayData(co[["SCT"]])[1, 1], 0) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], -1.852491719, tolerance = 1e-7) @@ -306,10 +312,10 @@ test_that("FindTransferAnchors with default SCT works", { expect_equal(GetAssayData(co[["SCT"]])[1, 1], 0) expect_equal(dim(co[['cca']]), c(160, 30)) expect_equal(Embeddings(co[['cca']])[1, 1], 0.0459135444, tolerance = 1e-7) - expect_equal(Loadings(co[['cca']], projected = T)[1, 1], 8.51477973, tolerance = 1e-7) + expect_equal(Loadings(co[['cca']], projected = T)["NKG7", 1], 8.51477973, tolerance = 1e-7) expect_equal(dim(co[['cca.l2']]), c(160, 30)) expect_equal(Embeddings(co[['cca.l2']])[1, 1], 0.0625989664, tolerance = 1e-7) - expect_equal(Loadings(co[['cca.l2']], projected = T)[1, 1], 8.51477973, tolerance = 1e-7) + expect_equal(Loadings(co[['cca.l2']], projected = T)["NKG7", 1], 8.51477973, tolerance = 1e-7) ref.cells <- paste0(Cells(ref), "_reference") query.cells <- paste0(Cells(query), "_query") expect_equal(anchors@reference.cells, ref.cells) @@ -335,10 +341,10 @@ test_that("FindTransferAnchors with SCT and project.query work", { expect_equal(GetAssayData(co[["SCT"]])[1, 1], 0) expect_equal(GetAssayData(co[["SCT"]], slot = "scale.data"), new("matrix")) expect_equal(dim(co[['pcaproject']]), c(160, 30)) - expect_equal(Embeddings(co[['pcaproject']])[1, 1], 0.3308694488, tolerance = 1e-7) + expect_equal(Embeddings(co[['pcaproject']])[1, 1], 0.3049308, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)[1, 1], 0.05788217444, tolerance = 1e-7) expect_equal(dim(co[['pcaproject.l2']]), c(160, 30)) - expect_equal(Embeddings(co[['pcaproject.l2']])[1, 1], 0.03807493471, tolerance = 1e-7) + expect_equal(Embeddings(co[['pcaproject.l2']])[1, 1], 0.04334884, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject.l2']], projected = T)[1, 1], 0.05788217444, tolerance = 1e-7) ref.cells <- paste0(Cells(ref), "_reference") query.cells <- paste0(Cells(query), "_query") @@ -346,8 +352,8 @@ test_that("FindTransferAnchors with SCT and project.query work", { expect_equal(anchors@query.cells, query.cells) expect_equal(anchors@reference.objects, logical()) anchor.mat <- anchors@anchors - expect_equal(dim(anchor.mat), c(288, 3)) - expect_equal(as.vector(anchor.mat[1, ]), c(1, 1, 0.6138996139), tolerance = 1e-7) + expect_equal(dim(anchor.mat), c(290, 3)) + expect_equal(as.vector(anchor.mat[1, ]), c(1, 1, 0.6315789), tolerance = 1e-7) expect_equal(max(anchor.mat[, 2]), 80) expect_null(anchors@offsets) expect_equal(length(anchors@anchor.features), 220) @@ -363,7 +369,7 @@ test_that("FindTransferAnchors with SCT and l2.norm FALSE work", { expect_equal(Reductions(co), c("pcaproject")) expect_equal(DefaultAssay(co), "SCT") expect_equal(GetAssayData(co[["SCT"]])[1, 1], 0) - expect_equal(GetAssayData(co[["SCT"]], slot = "scale.data"), new("matrix")) + expect_equal(GetAssayData(co[["SCT"]], layer = "scale.data"), new("matrix")) expect_equal(dim(co[['pcaproject']]), c(160, 30)) expect_equal(Embeddings(co[['pcaproject']])[1, 1], -1.852491719, tolerance = 1e-7) expect_equal(Loadings(co[['pcaproject']], projected = T)[1, 1], -0.1829401539, tolerance = 1e-7) diff --git a/tests/testthat/test_integration5.R b/tests/testthat/test_integration5.R new file mode 100644 index 000000000..f4f635790 --- /dev/null +++ b/tests/testthat/test_integration5.R @@ -0,0 +1,349 @@ +# Tests for IntegrateLayers +set.seed(42) + + +# checks that the absolute value of `x` and `y` are within `tolerance` +expect_abs_equal <- function(x, y, tolerance = 1.0e-04) { + expect_equal(abs(x), abs(y), tolerance = tolerance) +} + + +# setup shared fixtures +# update `pbmc_small` to use `Assay5` instances +test.data <- pbmc_small +suppressWarnings( + test.data[["RNA"]] <- CreateAssay5Object( + counts = LayerData( + test.data, + assay = "RNA", + layer = "counts" + ) + ) +) +# split the assay into multiple layers +test.data[["RNA"]] <- split(test.data[["RNA"]], f = test.data$groups) + + +context("IntegrateLayers") + +# setup fixtures for standard integration workflow +test.data.std <- NormalizeData(test.data, verbose = FALSE) +test.data.std <- FindVariableFeatures(test.data.std, verbose = FALSE) +test.data.std <- ScaleData(test.data.std, verbose = FALSE) +test.data.std <- suppressWarnings( + RunPCA(test.data.std, verbose = FALSE) +) + +test_that("IntegrateLayers works with HarmonyIntegration", { + skip_on_cran() + skip_if_not_installed("harmony") + + integrated <- suppressWarnings( + IntegrateLayers( + test.data.std, + method = HarmonyIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 0.3912 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 0.6668 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.7248 + ) +}) + +test_that("IntegrateLayers works with CCAIntegration", { + integrated <- suppressWarnings( + IntegrateLayers( + test.data.std, + method = CCAIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE, + # since `k.weight` must be less than the number of samples in the + # smallest layer being integrated, it must be set to accommodate the + # small dataset used for testing + k.weight = 10 + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 0.9174 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 1.4885 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.5442 + ) +}) + +test_that("IntegrateLayers works with RPCAIntegration", { + integrated <- suppressWarnings( + IntegrateLayers( + test.data.std, + method = RPCAIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE, + # since `k.weight` must be less than the number of samples in the + # smallest layer being integrated, it must be set to accommodate the + # small dataset used for testing + k.weight = 10 + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 0.1785 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 0.5832 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.5442 + ) +}) + +test_that("IntegrateLayers works with JointPCAIntegration", { + integrated <- suppressWarnings( + IntegrateLayers( + test.data.std, + method = JointPCAIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE, + # since `k.weight` must be less than the number of samples in the + # smallest layer being integrated, it must be set to accommodate the + # small dataset used for testing + k.weight = 10 + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 0.4092 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 0.3246 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.5442 + ) +}) + +test_that("IntegrateLayers fails when expected", { + # an error should be raised if a v3 assay is passed in + expect_error( + IntegrateLayers( + pbmc_small, + method = CCAIntegration, + orig.reduction = "pca", + assay = "RNA", + new.reduction = "integrated" + ) + ) + + # an error should be raised if a nonexistent `assay` is specified + expect_error( + IntegrateLayers( + test.data.std, + method = CCAIntegration, + orig.reduction = "pca", + assay = "DNA", + new.reduction = "integrated" + ) + ) + + # an error should be raised if a nonexistent `orig.reduction` is specified + expect_error( + IntegrateLayers( + test.data.std, + method = CCAIntegration, + orig.reduction = "lda", + new.reduction = "integrated" + ) + ) +}) + + +context("IntegrateData with SCTransform") + +# setup fixtures for SCTransform workflow +test.data.sct <- suppressWarnings( + SCTransform( + test.data, + # use v1 to avoid potentially different + # return values depending on if `glmGamPoi` + # is installed or not + vst.flavor="v1", + # set seed for reproducibility + seed.use = 12345, + verbose = FALSE + ) +) +test.data.sct <- suppressWarnings( + RunPCA(test.data.sct, verbose = FALSE) +) + +test_that("IntegrateLayers works with HarmonyIntegration & SCTransform", { + skip_on_cran() + skip_if_not_installed("harmony") + + integrated <- suppressWarnings( + IntegrateLayers( + test.data.sct, + method = HarmonyIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 1.1520 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 1.0302 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.1886 + ) +}) + +test_that("IntegrateLayers works with CCAIntegration & SCTransform", { + integrated <- suppressWarnings( + IntegrateLayers( + test.data.sct, + method = CCAIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE, + # since `k.weight` must be less than the number of samples in the + # smallest layer being integrated, it must be set to accommodate the + # small dataset used for testing + k.weight = 10, + # `CCAIntegration` needs to know how the data was normalized + normalization.method = "SCT" + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 1.6113 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 0.6927 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.0855 + ) +}) + +test_that("IntegrateLayers works with RPCAIntegration & SCTransform", { + integrated <- suppressWarnings( + IntegrateLayers( + test.data.sct, + method = RPCAIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE, + # since `k.weight` must be less than the number of samples in the + # smallest layer being integrated, it must be set to accommodate the + # small dataset used for testing + k.weight = 10, + # `RPCAIntegration` needs to know how the data was normalized + normalization.method = "SCT" + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 1.6492 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 0.7343 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.0855 + ) +}) + +test_that("IntegrateLayers works with JointPCAIntegration & SCTransform", { + integrated <- suppressWarnings( + IntegrateLayers( + test.data.sct, + method = JointPCAIntegration, + orig.reduction = "pca", + new.reduction = "integrated", + verbose = FALSE, + # since `k.weight` must be less than the number of samples in the + # smallest layer being integrated, it must be set to accommodate the + # small dataset used for testing + k.weight = 10, + # `JointPCAIntegration` needs to know how the data was normalized + normalization.method = "SCT" + ) + ) + # the integrated reduction should have the same dimensions as the original + expect_equal(dim(integrated[["integrated"]]), dim(integrated[["pca"]])) + # spot-check a few of the integrated values - since the integrated + # reductions sporadically flip sign only compare absolute values + expect_abs_equal( + Embeddings(integrated[["integrated"]])[5, 5], + 0.3427 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[40, 25], + 0.1015 + ) + expect_abs_equal( + Embeddings(integrated[["integrated"]])[75, 45], + 0.0855 + ) +}) diff --git a/tests/testthat/test_load_10X.R b/tests/testthat/test_load_10X.R index 02b23e6dd..f4f7a50f5 100644 --- a/tests/testthat/test_load_10X.R +++ b/tests/testthat/test_load_10X.R @@ -27,21 +27,212 @@ test_that("Read10X handles missing files properly", { expect_error(Read10X(dname, gene.column = 10)) }) -# Tests for reading in spatial 10x data -if (requireNamespace("hdf5r", quietly = TRUE)) { - context("Load10X_Spatial") - dname <- "../testdata/visium" - txsp <- Load10X_Spatial(data.dir = '../testdata/visium') - test_that("10x Spatial Data Parsing", { - expect_is(txsp, "Seurat") - expect_equal(ncol(x = txsp), 2695) - expect_equal(nrow(x = txsp), 100) - expect_equal(Cells(x = txsp)[1], "AAACAAGTATCTCCCA-1") - expect_equal(Assays(object = txsp), "Spatial") - expect_equal(GetAssayData(object = txsp[["Spatial"]], slot = "counts")[5, 9], 1) - }) - test_that("Read10X_Spatial handles missing files properly", { - expect_error(Load10X_Spatial(data.dir = ".")) - expect_error(Load10X_Spatial(data.dir = "./notadir/")) - }) -} + +context("Load10X_Spatial") + +# setup test fixtures +path.to.data <- file.path("../testdata") +path.to.visium <- file.path(path.to.data, "visium") +path.to.visium.hd <- file.path(path.to.data, "visium_hd") + +test_that("Read10X_h5 works as expected", { + skip_if_not_installed("hdf5r") + + path.to.counts <- file.path( + path.to.visium, + "filtered_feature_bc_matrix.h5" + ) + counts <- Read10X_h5(path.to.counts) + + # check that the shape of the returned matrix is correct + expect_equal(ncol(counts), 2695) + expect_equal(nrow(counts), 100) + # spot-check a few of the values + expect_equal(colnames(counts)[[151]], "AATGCAACCGGGTACC-1") + expect_equal(counts[1, 151], 1) + expect_equal(colnames(counts)[[9]], "AAACCGTTCGTCCAGG-1") + expect_equal(counts[5, 9], 1) + expect_equal(colnames(counts)[[2328]], "TCTCGAGGAGGTTCGC-1") + expect_equal(counts[99, 2328], 1) +}) + +test_that("Read10X_Image works as expected", { + path.to.images <- file.path(path.to.visium, "spatial") + coordinate.filenames <- "tissue_positions_list.csv" + # only test HD the dataset containing a parquet file if `arrow` is installed + if (requireNamespace("arrow", quietly = TRUE)) { + path.to.images <- c( + path.to.images, + file.path( + path.to.visium.hd, + "binned_outputs/square_008um/spatial" + ) + ) + coordinate.filenames <- c( + coordinate.filenames, + "tissue_positions.parquet" + ) + } + + for (i in seq_along(path.to.images)) { + path.to.image <- path.to.images[[i]] + coordinate.filename <- coordinate.filenames[[i]] + + # read in the coordinates as a data.frame - only keep the relevant columns + # and rename them to match the processed output + coordinates.expected <- Read10X_Coordinates( + file.path(path.to.image, coordinate.filename), + filter.matrix = TRUE + )[, c("imagerow", "imagecol")] + colnames(coordinates.expected) <- c("x", "y") + # read in the scale factors as an S3 object + scale.factors.expected <- Read10X_ScaleFactors( + file.path(path.to.image, "scalefactors_json.json") + ) + # default/lowres scaling + image.lowres <- Read10X_Image( + path.to.image, + image.name = "tissue_lowres_image.png" + ) + coordinates <- GetTissueCoordinates(image.lowres, scale = "lowres") + spot.radius <- Radius(image.lowres, scale = "lowres") + scale.factors <- ScaleFactors(image.lowres) + # check that the scale factors were read in as expected + expect_true(identical(scale.factors, scale.factors.expected)) + # check that `coordinates` contains values scaled for the low resolution PNG + expect_equal( + coordinates[, c("x", "y")] / scale.factors[["lowres"]], + coordinates.expected + ) + # check that the spot size is similarly scaled + expect_equal( + (spot.radius / scale.factors[["lowres"]] * max(dim(image.lowres))), + scale.factors.expected[["spot"]], + ) + + # hires scaling + image.hires <- Read10X_Image( + path.to.image, + image.name = "tissue_hires_image.png" + ) + coordinates <- GetTissueCoordinates(image.hires, scale = "hires") + spot.radius <- Radius(image.hires, scale = "hires") + scale.factors <- ScaleFactors(image.hires) + # check that the scale factors were read in as expected + expect_true(identical(scale.factors, scale.factors.expected)) + # check that `coordinates` contains values scaled for the high resolution PNG + expect_equal( + coordinates[, c("x", "y")] / scale.factors[["hires"]], + coordinates.expected + ) + # check that the spot size is similarly scaled + expect_equal( + (spot.radius / scale.factors[["hires"]] * max(dim(image.hires))), + scale.factors.expected[["spot"]] + ) + # the size of the two images should be different + expect_false(all(dim(image.hires) == dim(image.lowres))) + } +}) + +test_that("Load10X_Spatial works with SD data", { + skip_if_not_installed("hdf5r") + + path.to.counts <- file.path( + path.to.visium, + "filtered_feature_bc_matrix.h5" + ) + path.to.image <- file.path(path.to.visium, "spatial") + + # load the expected counts matrix + counts.expected <- Read10X_h5(path.to.counts) + + # load the expected image + image.expected <- Read10X_Image(path.to.image) + # set the image's key + Key(image.expected) <- "slice1_" + # update the expected image's assay to match the default + DefaultAssay(image.expected) <- "Spatial" + # align the expected image's identifiers with the expected count matrix + image.expected <- image.expected[colnames(counts.expected)] + + spatial <- Load10X_Spatial(path.to.visium) + + # check that `spatial` contains the expected counts matrix + expect_true( + identical( + LayerData(spatial, assay = "Spatial", layer = "counts"), + counts.expected + ) + ) + # check that `spatial` contains the expected image + expect_true( + identical( + spatial[["slice1"]], + image.expected + ) + ) +}) + +test_that("Load10X_Spatial works with HD data", { + skip_if_not_installed("hdf5r") + skip_if_not_installed("arrow") + + # since the "HD" test data is actually just the standard definition test + # data re-structured to look like it's been binned at multiple resolutions + # the `merge` call inside `Load10X_Spatial` throws a warning + spatial <- suppressWarnings(Load10X_Spatial(path.to.visium.hd)) + + bin.size <- c(16, 8) + for (i in seq_along(bin.size)) { + bin.size.pretty <- paste0(sprintf("%03d", bin.size[[i]]), "um") + assay.name <- paste0("Spatial.", bin.size.pretty) + image.key <- paste0("slice1", bin.size.pretty, "_") + image.key.pretty <- paste0("slice1.", bin.size.pretty) + + path.to.bin <- file.path( + path.to.visium.hd, + paste0("binned_outputs/square_", bin.size.pretty) + ) + path.to.counts <- file.path( + path.to.bin, + "filtered_feature_bc_matrix.h5" + ) + path.to.image <- file.path(path.to.bin, "spatial") + + # load the expected counts matrix + counts.expected <- Read10X_h5(path.to.counts) + # accomodate for the way cell identifiers will be reset during + # the call to `merge` inside `Load10X_Spatial` + colnames(counts.expected) <- paste0(colnames(counts.expected), "_", i) + # load the expected image + image.expected <- Read10X_Image(path.to.image, assay = assay.name) + # set the image's key + Key(image.expected) <- image.key + # again, accomodate for the way cell identifiers will be reset during + # the call to `merge` inside `Load10X_Spatial` + image.expected <- RenameCells(image.expected, paste0(Cells(image.expected), "_", i)) + # align the expected image's identifiers with the expected count matrix + image.expected <- image.expected[colnames(counts.expected)] + + # check that `spatial` contains the expected counts matrix + expect_true( + identical( + LayerData(spatial, assay = assay.name, layer = "counts"), + counts.expected + ) + ) + # check that `spatial` contains the expected image + expect_true( + identical( + spatial[[image.key.pretty]], + image.expected + ) + ) + } +}) + +test_that("Read10X_Spatial handles missing files properly", { + expect_error(Load10X_Spatial(data.dir = ".")) + expect_error(Load10X_Spatial(data.dir = "./notadir/")) +}) diff --git a/tests/testthat/test_objects.R b/tests/testthat/test_objects.R index ed6961b14..e0bf005d8 100644 --- a/tests/testthat/test_objects.R +++ b/tests/testthat/test_objects.R @@ -1,369 +1,24 @@ # Tests for functions in objects.R -# Tests for interacting with the meta.data slot -# ------------------------------------------------------------------------------ -context("Metadata") - -data("pbmc_small") - -pbmc_small <- suppressWarnings(suppressMessages(UpdateSeuratObject(pbmc_small))) -cluster_letters <- LETTERS[Idents(object = pbmc_small)] -names(cluster_letters) <- colnames(x = pbmc_small) -cluster_letters_shuffled <- sample(x = cluster_letters) - -test_that("AddMetaData adds in cell-level vector properly ", { - pbmc_small <- AddMetaData(object = pbmc_small, metadata = cluster_letters, col.name = 'letter.idents') - expect_equal(pbmc_small$letter.idents, cluster_letters) - pbmc_small <- AddMetaData(object = pbmc_small, metadata = cluster_letters_shuffled, col.name = 'letter.idents.shuffled') - expect_equal(pbmc_small$letter.idents, pbmc_small$letter.idents.shuffled) -}) - -cluster_letters_df <- data.frame(A = cluster_letters, B = cluster_letters_shuffled) -test_that("AddMetaData adds in data frame properly for cell-level metadata", { - pbmc_small <- AddMetaData(object = pbmc_small, metadata = cluster_letters_df) - expect_equal(pbmc_small[[c("A", "B")]], cluster_letters_df) -}) - -feature_letters <- sample(x = LETTERS, size = nrow(x = pbmc_small[["RNA"]]), replace = TRUE) -names(feature_letters) <- rownames(x = pbmc_small[["RNA"]]) -feature_letters_shuffled <- sample(x = feature_letters) - -test_that("AddMetaData adds feature level metadata", { - pbmc_small[["RNA"]] <- AddMetaData(object = pbmc_small[["RNA"]], metadata = feature_letters, col.name = 'feature_letters') - expect_equal(pbmc_small[["RNA"]][["feature_letters", drop = TRUE]], feature_letters) - pbmc_small[["RNA"]] <- AddMetaData(object = pbmc_small[["RNA"]], metadata = feature_letters_shuffled, col.name = 'feature_letters_shuffled') - expect_equal(pbmc_small[["RNA"]][["feature_letters", drop = TRUE]], pbmc_small[["RNA"]][["feature_letters_shuffled", drop = TRUE]]) -}) - -feature_letters_df <- data.frame(A = feature_letters, B = feature_letters_shuffled) -test_that("AddMetaData adds in data frame properly for Assays", { - pbmc_small[["RNA"]] <- AddMetaData(object = pbmc_small[["RNA"]], metadata = feature_letters_df) - expect_equal(pbmc_small[["RNA"]][[c("A", "B")]], feature_letters_df) -}) - -test_that("AddMetaData errors", { - expect_error(AddMetaData(object = pbmc_small, metadata = cluster_letters, col.name = "RNA")) - expect_error(AddMetaData(object = pbmc_small, metadata = c(unname(cluster_letters), "A"), col.name = "letter.idents")) - expect_error(AddMetaData(object = pbmc_small, metadata = feature_letters, col.name = "letter.idents")) - expect_error(AddMetaData(object = pbmc_small[["RNA"]], metadata = cluster_letters, col.name = "letter.idents")) -}) - -# Tests for creating an Assay object -# ------------------------------------------------------------------------------ -context("CreateAssayObject") - -pbmc.raw <- GetAssayData(object = pbmc_small[["RNA"]], slot = "counts") -rna.assay <- CreateAssayObject(counts = pbmc.raw) -rna.assay2 <- CreateAssayObject(data = pbmc.raw) - -test_that("CreateAssayObject works as expected", { - expect_equal(dim(x = rna.assay), c(230, 80)) - expect_equal(rownames(x = rna.assay), rownames(x = pbmc.raw)) - expect_equal(colnames(x = rna.assay), colnames(x = pbmc.raw)) - expect_equal(GetAssayData(object = rna.assay, slot = "counts"), pbmc.raw) - expect_equal(GetAssayData(object = rna.assay, slot = "data"), pbmc.raw) - expect_equal(GetAssayData(object = rna.assay, slot = "scale.data"), new(Class = "matrix")) - expect_equal(dim(rna.assay[[]]), c(230, 0)) - expect_equal(rownames(x = rna.assay[[]]), rownames(x = rna.assay)) - expect_equal(VariableFeatures(object = rna.assay), vector()) - expect_equal(rna.assay@misc, list()) - expect_equal(GetAssayData(object = rna.assay2, slot = "counts"), new(Class = "matrix")) -}) - -rna.assay2 <- CreateAssayObject(counts = pbmc.raw, min.cells = 10, min.features = 30) -test_that("CreateAssayObject filtering works", { - expect_equal(dim(x = rna.assay2), c(163, 77)) - expect_true(all(rowSums(GetAssayData(object = rna.assay2, slot = "counts")) >= 10)) - expect_true(all(colSums(GetAssayData(object = rna.assay2, slot = "counts")) >= 30)) -}) - -test_that("CreateAssayObject catches improper input", { - expect_error(CreateAssayObject()) - expect_error(CreateAssayObject(counts = pbmc.raw, data = pbmc.raw)) - pbmc.raw2 <- cbind(pbmc.raw[, 1:10], pbmc.raw[, 1:10]) - expect_warning(CreateAssayObject(counts = pbmc.raw2)) - expect_warning(CreateAssayObject(data = pbmc.raw2)) - pbmc.raw2 <- rbind(pbmc.raw[1:10, ], pbmc.raw[1:10, ]) - expect_warning(CreateAssayObject(counts = pbmc.raw2)) - expect_warning(CreateAssayObject(data = pbmc.raw2)) - pbmc.raw2 <- pbmc.raw - colnames(x = pbmc.raw2) <- c() - expect_error(CreateAssayObject(counts = pbmc.raw2)) - expect_error(CreateAssayObject(data = pbmc.raw2)) - pbmc.raw2 <- pbmc.raw - rownames(x = pbmc.raw2) <- c() - expect_error(CreateAssayObject(counts = pbmc.raw2)) - expect_error(CreateAssayObject(data = pbmc.raw2)) - pbmc.raw.mat <- as.matrix(x = pbmc.raw) - pbmc.raw.df <- as.data.frame(x = pbmc.raw.mat) - rna.assay3 <- CreateAssayObject(counts = pbmc.raw.df) - rna.assay4 <- CreateAssayObject(counts = pbmc.raw.mat) - expect_is(object = GetAssayData(object = rna.assay3, slot = "counts"), class = "dgCMatrix") - expect_is(object = GetAssayData(object = rna.assay4, slot = "counts"), class = "dgCMatrix") - pbmc.raw.underscores <- pbmc.raw - rownames(pbmc.raw.underscores) <- gsub(pattern = "-", replacement = "_", x = rownames(pbmc.raw.underscores)) - expect_warning(CreateAssayObject(counts = pbmc.raw.underscores)) -}) - -# Tests for creating an DimReduc object -# ------------------------------------------------------------------------------ -context("CreateDimReducObject") - -pca <- pbmc_small[["pca"]] -Key(object = pca) <- 'PC_' - -test_that("CreateDimReducObject works", { - pca.dr <- CreateDimReducObject( - embeddings = Embeddings(object = pca), - loadings = Loadings(object = pca), - projected = Loadings(object = pca, projected = TRUE), - assay = "RNA" - ) - expect_equal(Embeddings(object = pca.dr), Embeddings(object = pca)) - expect_equal(Loadings(object = pca.dr), Loadings(object = pca)) - expect_equal(Loadings(object = pca.dr, projected = TRUE), Loadings(object = pca, projected = TRUE)) - expect_equal(Key(object = pca.dr), "PC_") - expect_equal(pca.dr@assay.used, "RNA") -}) - -test_that("CreateDimReducObject catches improper input", { - bad.embeddings <- Embeddings(object = pca) - colnames(x = bad.embeddings) <- paste0("PCA", 1:ncol(x = bad.embeddings)) - expect_warning(CreateDimReducObject(embeddings = bad.embeddings, key = "PC")) - colnames(x = bad.embeddings) <- paste0("PC", 1:ncol(x = bad.embeddings), "X") - suppressWarnings(expect_error(CreateDimReducObject(embeddings = bad.embeddings, key = "PC"))) - suppressWarnings(expect_error(CreateDimReducObject(embeddings = bad.embeddings))) -}) - -# Tests for creating a Seurat object -# ------------------------------------------------------------------------------ -context("CreateSeuratObject") - -colnames(x = pbmc.raw) <- paste0(colnames(x = pbmc.raw), "-", pbmc_small$groups) -metadata.test <- pbmc_small[[]][, 5:7] -rownames(x = metadata.test) <- colnames(x = pbmc.raw) - -test_that("CreateSeuratObject works", { - seurat.object <- CreateSeuratObject( - counts = pbmc.raw, - project = "TESTING", - assay = "RNA.TEST", - names.field = 2, - names.delim = "-", - meta.data = metadata.test - ) - expect_equal(seurat.object[[]][, 4:6], metadata.test) - expect_equal(seurat.object@project.name, "TESTING") - expect_equal(names(x = seurat.object), "RNA.TEST") - expect_equal(as.vector(x = unname(obj = Idents(object = seurat.object))), unname(pbmc_small$groups)) -}) - -test_that("CreateSeuratObject handles bad names.field/names.delim", { - expect_warning(seurat.object <- CreateSeuratObject( - counts = pbmc.raw[1:5,1:5], - names.field = 3, - names.delim = ":", - meta.data = metadata.test - )) -}) - -# Tests for creating a Seurat object -# ------------------------------------------------------------------------------ -context("Merging") - -pbmc.assay <- pbmc_small[["RNA"]] -x <- merge(x = pbmc.assay, y = pbmc.assay) - -test_that("Merging Assays works properly", { - expect_equal(dim(GetAssayData(object = x, slot = "counts")), c(230, 160)) - expect_equal(dim(GetAssayData(object = x, slot = "data")), c(230, 160)) - expect_equal(GetAssayData(object = x, slot = "scale.data"), new(Class = "matrix")) - expect_equal(Key(object = x), "rna_") - expect_equal(VariableFeatures(object = x), vector()) - expect_equal(x[[]], data.frame(row.names = rownames(x = pbmc.assay))) -}) - -pbmc.assay2 <- pbmc.assay -pbmc.assay2@counts <- new("dgCMatrix") -test_that("Merging Assays handles case when counts not present", { - y <- merge(x = pbmc.assay2, y = pbmc.assay) - expect_equal(unname(colSums(x = GetAssayData(object = y, slot = "counts"))[1:80]), rep.int(x = 0, times = 80)) - z <- merge(x = pbmc.assay2, pbmc.assay2) - expect_equal(nnzero(x = GetAssayData(object = z, slot = "counts")), 0) -}) - -pbmc.assay2 <- pbmc.assay -pbmc.assay2@data <- new("dgCMatrix") -test_that("Merging Assays handles case when data not present", { - y <- merge(x = pbmc.assay2, y = pbmc.assay, merge.data = TRUE) - expect_equal(unname(colSums(x = GetAssayData(object = y, slot = "data"))[1:80]), rep.int(x = 0, times = 80)) - z <- merge(x = pbmc.assay2, y = pbmc.assay2, merge.data = TRUE) - expect_equal(nnzero(x = GetAssayData(object = z, slot = "data")), 0) -}) - -# Tests for Neighbor object -# ------------------------------------------------------------------------------ -context("Neighbor") - -# converting to Graph and back - -n.rann.ob <- NNHelper( - data = Embeddings(object = pbmc_small[["pca"]]), - query = Embeddings(object = pbmc_small[["pca"]]), - k = 10, - method = "rann") - -test_that("Neighbor object methods work", { - expect_equal(dim(x = Indices(object = n.rann.ob)), c(80, 10)) - expect_equal(dim(x = n.rann.ob), c(80, 10)) - expect_equal(as.numeric(Indices(object = n.rann.ob)[1, 7]), 45, ) - expect_equal(dim(x = Distances(object = n.rann.ob)), c(80, 10)) - expect_equal(as.numeric(Distances(object = n.rann.ob)[2, 2]), 2.643759, tolerance = 1e-6) - expect_equal(length(x = Cells(x = n.rann.ob)), 80) - expect_equal(Cells(x = n.rann.ob)[c(1, 20, 80)], c("ATGCCAGAACGACT", "TACATCACGCTAAC", "CTTGATTGATCTTC")) - pbmc_small[["n.ob"]] <- n.rann.ob - pbmc_small <- RenameCells(object = pbmc_small, add.cell.id = "test") - expect_equal(Cells(x = pbmc_small[['n.ob']])[1], c("test_ATGCCAGAACGACT")) - expect_equal(TopNeighbors(object = n.rann.ob, cell = "ATGCCAGAACGACT", n = 5)[5], "GATATAACACGCAT") - expect_equal(length(TopNeighbors(object = n.rann.ob, cell = "ATGCCAGAACGACT", n = 7)), 7) - nrg <- as.Graph(x = n.rann.ob) - expect_true(inherits(x = nrg, what = "Graph")) - expect_equal(as.numeric(Distances(object = n.rann.ob)[2, 3]), nrg[2, Indices(object = n.rann.ob)[2, 3]]) - nro2 <- as.Neighbor(x = nrg) - expect_true(inherits(x = nro2, what = "Neighbor")) - expect_equal(Distances(object = n.rann.ob)[2, 3], Distances(object = nro2)[2, 3]) - expect_equal(Indices(object = n.rann.ob)[1, 6], Indices(object = nro2)[1, 6]) -}) - -n.annoy.ob <- NNHelper( - data = Embeddings(object = pbmc_small[["pca"]]), - query = Embeddings(object = pbmc_small[["pca"]]), - k = 10, - method = "annoy", - cache.index = TRUE) -idx.file <- tempfile() -SaveAnnoyIndex(object = n.annoy.ob, file = idx.file) -nao2 <- LoadAnnoyIndex(object = n.annoy.ob, file = idx.file) - -test_that("Saving/Loading annoy index", { - expect_error(SaveAnnoyIndex(object = n.rann.ob, file = idx.file)) - expect_equal(head(Indices(n.annoy.ob)), head(Indices(nao2))) - expect_equal(head(Distances(n.annoy.ob)), head(Distances(nao2))) - expect_false(is.null(x = Index(nao2))) -}) - -# Tests for FetchData -# ------------------------------------------------------------------------------ -context("FetchData") - -# Features to test: -# able to pull cell embeddings, data, metadata -# subset of cells - -test_that("Fetching a subset of cells works", { - x <- FetchData(object = pbmc_small, cells = colnames(x = pbmc_small)[1:10], vars = rownames(x = pbmc_small)[1]) - expect_equal(rownames(x = x), colnames(x = pbmc_small)[1:10]) - random.cells <- sample(x = colnames(x = pbmc_small), size = 10) - x <- FetchData(object = pbmc_small, cells = random.cells, vars = rownames(x = pbmc_small)[1]) - expect_equal(rownames(x = x), random.cells) - x <- FetchData(object = pbmc_small, cells = 1:10, vars = rownames(x = pbmc_small)[1]) - expect_equal(rownames(x = x), colnames(x = pbmc_small)[1:10]) -}) - -suppressWarnings(pbmc_small[["RNA2"]] <- pbmc_small[["RNA"]]) -Key(pbmc_small[["RNA2"]]) <- "rna2_" - -test_that("Fetching keyed variables works", { - x <- FetchData(object = pbmc_small, vars = c(paste0("rna_", rownames(x = pbmc_small)[1:5]), paste0("rna2_", rownames(x = pbmc_small)[1:5]))) - expect_equal(colnames(x = x), c(paste0("rna_", rownames(x = pbmc_small)[1:5]), paste0("rna2_", rownames(x = pbmc_small)[1:5]))) - x <- FetchData(object = pbmc_small, vars = c(paste0("rna_", rownames(x = pbmc_small)[1:5]), paste0("PC_", 1:5))) - expect_equal(colnames(x = x), c(paste0("rna_", rownames(x = pbmc_small)[1:5]), paste0("PC_", 1:5))) -}) - -test_that("Fetching embeddings/loadings not present returns warning or errors", { - expect_warning(FetchData(object = pbmc_small, vars = c("PC_1", "PC_100"))) - expect_error(FetchData(object = pbmc_small, vars = "PC_100")) -}) - -bad.gene <- GetAssayData(object = pbmc_small[["RNA"]], slot = "data") -rownames(x = bad.gene)[1] <- paste0("rna_", rownames(x = bad.gene)[1]) -pbmc_small[["RNA"]]@data <- bad.gene - -# Tests for WhichCells -# ------------------------------------------------------------------------------ - -test_that("Specifying cells works", { - test.cells <- Cells(x = pbmc_small)[1:10] - expect_equal(WhichCells(object = pbmc_small, cells = test.cells), test.cells) - expect_equal(WhichCells(object = pbmc_small, cells = test.cells, invert = TRUE), setdiff(Cells(x = pbmc_small), test.cells)) -}) - -test_that("Specifying idents works", { - c12 <- WhichCells(object = pbmc_small, idents = c(1, 2)) - expect_equal(length(x = c12), 44) - expect_equal(c12[44], "CTTGATTGATCTTC") - expect_equal(c12, WhichCells(object = pbmc_small, idents = 0, invert = TRUE)) -}) - -test_that("downsample works", { - expect_equal(length(x = WhichCells(object = pbmc_small, downsample = 5)), 15) - expect_equal(length(x = WhichCells(object = pbmc_small, downsample = 100)), 80) -}) - -test_that("passing an expression works", { - lyz.pos <- WhichCells(object = pbmc_small, expression = LYZ > 1) - expect_true(all(GetAssayData(object = pbmc_small, slot = "data")["LYZ", lyz.pos] > 1)) - # multiple values in expression - lyz.pos <- WhichCells(object = pbmc_small, expression = LYZ > 1 & groups == "g1") - expect_equal(length(x = lyz.pos), 30) - expect_equal(lyz.pos[30], "CTTGATTGATCTTC") -}) - -# Tests for small other functions -# ------------------------------------------------------------------------------ -test_that("Top works", { - dat <- Embeddings(object = pbmc_small[['pca']])[, 1, drop = FALSE] - expect_warning(Top(data = dat, num = 1000, balanced = FALSE)) - tpc1 <- Top(data = dat, num = 20, balanced = FALSE) - expect_equal(length(x = tpc1), 20) - expect_equal(tpc1[1], "ACGTGATGCCATGA") - expect_equal(tpc1[20], "GTCATACTTCGCCT") - tpc1b <- Top(data = dat, num = 20, balanced = TRUE) - expect_equal(length(x = tpc1b), 2) - expect_equal(names(tpc1b), c("positive", "negative")) - expect_equal(length(tpc1b[[1]]), 10) - expect_equal(length(tpc1b[[2]]), 10) - expect_equal(tpc1b[[1]][1], "GTCATACTTCGCCT") - expect_equal(tpc1b[[1]][10], "CTTGATTGATCTTC") - expect_equal(tpc1b[[2]][1], "ACGTGATGCCATGA") - expect_equal(tpc1b[[2]][10], "ATTGTAGATTCCCG") - tpc1.sub <- Top(data = dat[1:79, , drop = FALSE], num = 79, balanced = TRUE) - expect_equal(length(tpc1.sub[[1]]), 40) - expect_equal(length(tpc1.sub[[2]]), 39) -}) - - # Tests for SCE conversion # ------------------------------------------------------------------------------ test_that("as.SingleCellExperiment works", { skip_on_cran() if (requireNamespace('SingleCellExperiment', quietly = TRUE)) { - mat <- matrix(1:100, ncol = 10) - colnames(mat) <- LETTERS[1:10] - rownames(mat) <- LETTERS[1:10] + mat <- pbmc_small[["RNA"]]$counts seuratObj <- Seurat::CreateSeuratObject(mat) - sce <- as.SingleCellExperiment(seuratObj) + sce <- suppressWarnings(as.SingleCellExperiment(seuratObj)) - expect_equal(ncol(sce), 10) - expect_equal(nrow(sce), 10) + expect_equal(ncol(sce), 80) + expect_equal(nrow(sce), 230) # expect_equal(length(SingleCellExperiment::altExps(sce)), 0) # expect_equal(SingleCellExperiment::mainExpName(sce), 'RNA') seuratObj <- Seurat::CreateSeuratObject(mat) seuratObj[['ADT']] <- CreateAssayObject(mat) - sce <- as.SingleCellExperiment(seuratObj) - expect_equal(ncol(sce), 10) - expect_equal(nrow(sce), 10) + sce <- suppressWarnings(as.SingleCellExperiment(seuratObj)) + expect_equal(ncol(sce), 80) + expect_equal(nrow(sce), 230) # expect_equal(names(SingleCellExperiment::altExps(sce)), 'ADT') # expect_equal(SingleCellExperiment::mainExpName(sce), 'RNA') } diff --git a/tests/testthat/test_preprocessing.R b/tests/testthat/test_preprocessing.R index d84ef3bdd..a28af6ddd 100644 --- a/tests/testthat/test_preprocessing.R +++ b/tests/testthat/test_preprocessing.R @@ -17,13 +17,14 @@ test_that("object initialization actually creates seurat object", { expect_is(object, "Seurat") }) -test_that("meta.data slot generated correctly", { - expect_equal(dim(object[[]]), c(80, 4)) - expect_equal(colnames(object[[]]), c("orig.ident", "nCount_RNA", "nFeature_RNA", "FMD")) - expect_equal(rownames(object[[]]), colnames(object)) - expect_equal(object[["nFeature_RNA"]][1:5, ], c(47, 52, 50, 56, 53)) - expect_equal(object[["nCount_RNA"]][75:80, ], c(228, 527, 202, 157, 150, 233)) -}) +#this should be moved to seurat object +# test_that("meta.data slot generated correctly", { +# expect_equal(dim(object[[]]), c(80, 4)) +# expect_equal(colnames(object[[]]), c("orig.ident", "nCount_RNA", "nFeature_RNA", "FMD")) +# expect_equal(rownames(object[[]]), colnames(object)) +# expect_equal(object[["nFeature_RNA"]][1:5, ], c(47, 52, 50, 56, 53)) +# expect_equal(object[["nCount_RNA"]][75:80, ], c(228, 527, 202, 157, 150, 233)) +# }) object.filtered <- CreateSeuratObject( counts = pbmc.test, @@ -32,17 +33,18 @@ object.filtered <- CreateSeuratObject( ) test_that("Filtering handled properly", { - expect_equal(nrow(x = GetAssayData(object = object.filtered, slot = "counts")), 163) - expect_equal(ncol(x = GetAssayData(object = object.filtered, slot = "counts")), 77) + expect_equal(nrow(x = LayerData(object = object.filtered, layer = "counts")), 163) + expect_equal(ncol(x = LayerData(object = object.filtered, layer = "counts")), 77) }) -test_that("Metadata check errors correctly", { - pbmc.md <- pbmc_small[[]] - pbmc.md.norownames <- as.matrix(pbmc.md) - rownames(pbmc.md.norownames) <- NULL - expect_error(CreateSeuratObject(counts = pbmc.test, meta.data = pbmc.md.norownames), - "Row names not set in metadata. Please ensure that rownames of metadata match column names of data matrix") -}) +#this should be moved to seurat object +# test_that("Metadata check errors correctly", { +# pbmc.md <- pbmc_small[[]] +# pbmc.md.norownames <- as.matrix(pbmc.md) +# rownames(pbmc.md.norownames) <- NULL +# expect_error(CreateSeuratObject(counts = pbmc.test, meta.data = pbmc.md.norownames), +# "Row names not set in metadata. Please ensure that rownames of metadata match column names of data matrix") +# }) # Tests for NormalizeData # -------------------------------------------------------------------------------- @@ -50,31 +52,31 @@ context("NormalizeData") test_that("NormalizeData error handling", { expect_error(NormalizeData(object = object, assay = "FAKE")) expect_equal( - object = GetAssayData( + object = LayerData( object = NormalizeData( object = object, normalization.method = NULL, verbose = FALSE ), - slot = "data" + layer = "data" ), - expected = GetAssayData(object = object, slot = "counts") + expected = LayerData(object = object, layer = "counts") ) }) object <- NormalizeData(object = object, verbose = FALSE, scale.factor = 1e6) test_that("NormalizeData scales properly", { - expect_equal(GetAssayData(object = object, slot = "data")[2, 1], 9.567085, tolerance = 1e-6) - expect_equal(GetAssayData(object = object, slot = "data")[161, 55], 8.415309, tolerance = 1e-6) + expect_equal(LayerData(object = object, layer = "data")[2, 1], 9.567085, tolerance = 1e-6) + expect_equal(LayerData(object = object, layer = "data")[161, 55], 8.415309, tolerance = 1e-6) expect_equal(Command(object = object, command = "NormalizeData.RNA", value = "scale.factor"), 1e6) expect_equal(Command(object = object, command = "NormalizeData.RNA", value = "normalization.method"), "LogNormalize") }) -normalized.data <- LogNormalize(data = GetAssayData(object = object[["RNA"]], slot = "counts"), verbose = FALSE) +normalized.data <- LogNormalize(data = GetAssayData(object = object[["RNA"]], layer = "counts"), verbose = FALSE) test_that("LogNormalize normalizes properly", { expect_equal( - LogNormalize(data = GetAssayData(object = object[["RNA"]], slot = "counts"), verbose = FALSE), - LogNormalize(data = as.data.frame(as.matrix(GetAssayData(object = object[["RNA"]], slot = "counts"))), verbose = FALSE) + as.matrix(LogNormalize(data = GetAssayData(object = object[["RNA"]], layer = "counts"), verbose = FALSE)), + as.matrix(LogNormalize(data = as.data.frame(as.matrix(GetAssayData(object = object[["RNA"]], layer = "counts"))), verbose = FALSE)) ) }) @@ -95,22 +97,101 @@ test_that("Relative count normalization returns expected values", { expect_equal(rc.counts[2, 1], 14285.71, tolerance = 1e-6) }) +# Tests for v5 NormalizeData +# -------------------------------------------------------------------------------- +context("v5 NormalizeData") + +if(class(object[['RNA']]) == "Assay5") { + fake.groups <- c(rep(1, floor(ncol(pbmc.test)/2)), + rep(2, ncol(pbmc.test) - (floor(ncol(pbmc.test)/2))) ) + object$groups <- fake.groups + object.split <- CreateSeuratObject(split(object[["RNA"]], f = object$groups)) + object.split <- NormalizeData(object = object.split) + + group1 <- subset(object, groups==1) + group1 <- NormalizeData(group1) + + test_that("Normalization is performed for each layer", { + expect_equal(Layers(object.split),c("counts.1", "counts.2", "data.1", "data.2")) + expect_equal(group1[['RNA']]$data, LayerData(object.split, layer="data.1")) + }) + + object.split <- NormalizeData(object = object.split, normalization.method = "CLR", verbose = FALSE) + group1 <- NormalizeData(object = group1, normalization.method = "CLR", verbose = FALSE) + test_that("CLR normalization works with multiple layers", { + expect_equal(Layers(object.split),c("counts.1", "counts.2", "data.1", "data.2")) + expect_equal(group1[['RNA']]$data, LayerData(object.split, layer="data.1")) + }) + + object.split <- NormalizeData(object = object.split, normalization.method = "RC", verbose = FALSE) + group1 <- NormalizeData(object = group1, normalization.method = "RC", verbose = FALSE) + test_that("RC normalization works with multiple layers", { + expect_equal(Layers(object.split),c("counts.1", "counts.2", "data.1", "data.2")) + expect_equal(group1[['RNA']]$data, LayerData(object.split, layer="data.1")) + }) +} + + + +test_that("NormalizeData scales properly for BPcells", { + # Tests for BPCells NormalizeData + # -------------------------------------------------------------------------------- + + skip_on_cran() + library(Matrix) + skip_if_not_installed("BPCells") + library(BPCells) + mat_bpcells <- t(as(t(object[['RNA']]$counts ), "IterableMatrix")) + object[['RNAbp']] <- CreateAssay5Object(counts = mat_bpcells) + + object <- NormalizeData(object = object, verbose = FALSE, scale.factor = 1e6, assay = "RNAbp") + object <- NormalizeData(object = object, verbose = FALSE, scale.factor = 1e6, assay = "RNA") + + expect_equal(as.matrix(object[['RNAbp']]$data), as.matrix(object[['RNA']]$data), tolerance = 1e-6) + expect_equal(Command(object = object, command = "NormalizeData.RNAbp", value = "scale.factor"), 1e6) + expect_equal(Command(object = object, command = "NormalizeData.RNAbp", value = "normalization.method"), "LogNormalize") +}) + + + +test_that("LogNormalize normalizes properly for BPCells", { + skip_on_cran() + library(Matrix) + skip_if_not_installed("BPCells") + library(BPCells) + mat_bpcells <- t(as(t(object[['RNA']]$counts ), "IterableMatrix")) + object[['RNAbp']] <- CreateAssay5Object(counts = mat_bpcells) + + object <- NormalizeData(object = object, verbose = FALSE, scale.factor = 1e6, assay = "RNAbp") + object <- NormalizeData(object = object, verbose = FALSE, scale.factor = 1e6, assay = "RNA") + + normalized.data.bp <- LogNormalize(data = GetAssayData(object = object[["RNAbp"]], layer = "counts"), verbose = FALSE) + normalized.data <- LogNormalize(data = GetAssayData(object = object[["RNA"]], layer = "counts"), verbose = FALSE) + + expect_equal( + as.matrix(normalized.data.bp), + as.matrix(normalized.data), + tolerance = 1e-6 + ) +}) + # Tests for ScaleData # -------------------------------------------------------------------------------- context("ScaleData") object <- ScaleData(object, verbose = FALSE) test_that("ScaleData returns expected values when input is a sparse matrix", { - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[1, 1], -0.4148587, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[75, 25], -0.2562305, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[162, 59], -0.4363939, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[1, 1], -0.4148587, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[75, 25], -0.2562305, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[162, 59], -0.4363939, tolerance = 1e-6) }) -new.data <- as.matrix(GetAssayData(object = object[["RNA"]], slot = "data")) +new.data <- as.matrix(GetAssayData(object = object[["RNA"]], layer = "data")) new.data[1, ] <- rep(x = 0, times = ncol(x = new.data)) object2 <- object -object2[["RNA"]] <- SetAssayData( - object = object[["RNA"]], +object2 <- SetAssayData( + object = object, + assay = "RNA", slot = "data", new.data = new.data ) @@ -118,13 +199,13 @@ object2 <- ScaleData(object = object2, verbose = FALSE) object <- ScaleData(object = object, verbose = FALSE) test_that("ScaleData returns expected values when input is not sparse", { - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[75, 25], -0.2562305, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[162, 59], -0.4363939, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[75, 25], -0.2562305, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[162, 59], -0.4363939, tolerance = 1e-6) }) test_that("ScaleData handles zero variance features properly", { - expect_equal(GetAssayData(object = object2[["RNA"]], slot = "scale.data")[1, 1], 0) - expect_equal(GetAssayData(object = object2[["RNA"]], slot = "scale.data")[1, 80], 0) + expect_equal(GetAssayData(object = object2[["RNA"]], layer = "scale.data")[1, 1], 0) + expect_equal(GetAssayData(object = object2[["RNA"]], layer = "scale.data")[1, 80], 0) }) ng1 <- rep(x = "g1", times = round(x = ncol(x = object) / 2)) @@ -135,39 +216,42 @@ g2 <- subset(x = object, group == "g2") g2 <- ScaleData(object = g2, features = rownames(x = g2), verbose = FALSE) object <- ScaleData(object = object, features = rownames(x = object), verbose = FALSE, split.by = "group") -test_that("split.by option works", { - expect_equal(GetAssayData(object = object, slot = "scale.data")[, Cells(x = g1)], - GetAssayData(object = g1, slot = "scale.data")) - expect_equal(GetAssayData(object = object, slot = "scale.data")[, Cells(x = g2)], - GetAssayData(object = g2, slot = "scale.data")) -}) +#move to SeuratObject +# test_that("split.by option works", { +# expect_equal(GetAssayData(object = object, layer = "scale.data")[, Cells(x = g1)], +# GetAssayData(object = g1, layer = "scale.data")) +# expect_equal(GetAssayData(object = object, layer = "scale.data")[, Cells(x = g2)], +# GetAssayData(object = g2, layer = "scale.data")) +# }) g1 <- ScaleData(object = g1, features = rownames(x = g1), vars.to.regress = "nCount_RNA", verbose = FALSE) g2 <- ScaleData(object = g2, features = rownames(x = g2), vars.to.regress = "nCount_RNA", verbose = FALSE) object <- ScaleData(object = object, features = rownames(x = object), verbose = FALSE, split.by = "group", vars.to.regress = "nCount_RNA") test_that("split.by option works with regression", { - expect_equal(GetAssayData(object = object, slot = "scale.data")[, Cells(x = g1)], - GetAssayData(object = g1, slot = "scale.data")) - expect_equal(GetAssayData(object = object, slot = "scale.data")[, Cells(x = g2)], - GetAssayData(object = g2, slot = "scale.data")) + expect_equal(LayerData(object = object, layer = "scale.data")[, Cells(x = g1)], + LayerData(object = g1, layer = "scale.data")) + expect_equal(LayerData(object = object, layer = "scale.data")[, Cells(x = g2)], + LayerData(object = g2, layer = "scale.data")) }) # Tests for various regression techniques context("Regression") -object <- ScaleData( +suppressWarnings({ + object <- ScaleData( object = object, vars.to.regress = "nCount_RNA", features = rownames(x = object)[1:10], verbose = FALSE, model.use = "linear") + }) test_that("Linear regression works as expected", { - expect_equal(dim(x = GetAssayData(object = object[["RNA"]], slot = "scale.data")), c(10, 80)) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[1, 1], -0.6436435, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[5, 25], -0.09035383, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[10, 80], -0.2723782, tolerance = 1e-6) + expect_equal(dim(x = GetAssayData(object = object[["RNA"]], layer = "scale.data")), c(10, 80)) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[1, 1], -0.6436435, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[5, 25], -0.09035383, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[10, 80], -0.2723782, tolerance = 1e-6) }) object <- ScaleData( @@ -178,10 +262,10 @@ object <- ScaleData( model.use = "negbinom") test_that("Negative binomial regression works as expected", { - expect_equal(dim(x = GetAssayData(object = object[["RNA"]], slot = "scale.data")), c(10, 80)) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[1, 1], -0.5888811, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[5, 25], -0.2553394, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[10, 80], -0.1921429, tolerance = 1e-6) + expect_equal(dim(x = GetAssayData(object = object[["RNA"]], layer = "scale.data")), c(10, 80)) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[1, 1], -0.5888811, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[5, 25], -0.2553394, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[10, 80], -0.1921429, tolerance = 1e-6) }) test_that("Regression error handling checks out", { @@ -196,10 +280,10 @@ object <- ScaleData( model.use = "poisson") test_that("Poisson regression works as expected", { - expect_equal(dim(x = GetAssayData(object = object[["RNA"]], slot = "scale.data")), c(10, 80)) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[1, 1], -1.011717, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[5, 25], 0.05575307, tolerance = 1e-6) - expect_equal(GetAssayData(object = object[["RNA"]], slot = "scale.data")[10, 80], -0.1662119, tolerance = 1e-6) + expect_equal(dim(x = GetAssayData(object = object[["RNA"]], layer = "scale.data")), c(10, 80)) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[1, 1], -1.011717, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[5, 25], 0.05575307, tolerance = 1e-6) + expect_equal(GetAssayData(object = object[["RNA"]], layer = "scale.data")[10, 80], -0.1662119, tolerance = 1e-6) }) @@ -208,22 +292,22 @@ test_that("Poisson regression works as expected", { context("SampleUMI") downsampled.umis <- SampleUMI( - data = GetAssayData(object = object, slot = "counts"), + data = LayerData(object = object, layer = "counts"), max.umi = 100, verbose = FALSE ) downsampled.umis.p.cell <- SampleUMI( - data = GetAssayData(object = object, slot = "counts"), + data = LayerData(object = object, layer = "counts"), max.umi = seq(50, 1640, 20), verbose = FALSE, upsample = TRUE ) test_that("SampleUMI gives reasonable downsampled/upsampled UMI counts", { expect_true(!any(colSums(x = downsampled.umis) < 30, colSums(x = downsampled.umis) > 120)) - expect_error(SampleUMI(data = GetAssayData(object = object, slot = "raw.data"), max.umi = rep(1, 5))) + expect_error(SampleUMI(data = LayerData(object = object, layer = "counts"), max.umi = rep(1, 5))) expect_true(!is.unsorted(x = colSums(x = downsampled.umis.p.cell))) expect_error(SampleUMI( - data = GetAssayData(object = object, slot = "counts"), + data = LayerData(object = object, layer = "counts"), max.umi = seq(50, 900, 10), verbose = FALSE, upsample = TRUE @@ -238,31 +322,36 @@ object <- FindVariableFeatures(object = object, selection.method = "mean.var.plo test_that("mean.var.plot selection option returns expected values", { expect_equal(VariableFeatures(object = object)[1:4], c("PTGDR", "SATB1", "ZNF330", "S100B")) expect_equal(length(x = VariableFeatures(object = object)), 20) - expect_equal(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')$mean[1:2], c(8.328927, 8.444462), tolerance = 1e-6) - expect_equal(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')$dispersion[1:2], c(10.552507, 10.088223), tolerance = 1e-6) - expect_equal(as.numeric(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')$dispersion.scaled[1:2]), c(0.1113214, -0.1332181523), tolerance = 1e-6) + hvf_info <- HVFInfo(object = object[["RNA"]], method = 'mvp') + expect_equal(hvf_info[[grep("mean$", colnames(hvf_info), value = TRUE)]][1:2], c(8.328927, 8.444462), tolerance = 1e-6) + expect_equal(hvf_info[[grep("dispersion$", colnames(hvf_info), value = TRUE)]][1:2], c(10.552507, 10.088223), tolerance = 1e-6) + expect_equal(as.numeric(hvf_info[[grep("dispersion.scaled$", colnames(hvf_info), value = TRUE)]][1:2]), c(0.1113214, -0.1332181523), tolerance = 1e-6) }) object <- FindVariableFeatures(object, selection.method = "dispersion", verbose = FALSE) test_that("dispersion selection option returns expected values", { expect_equal(VariableFeatures(object = object)[1:4], c("PCMT1", "PPBP", "LYAR", "VDAC3")) expect_equal(length(x = VariableFeatures(object = object)), 230) - expect_equal(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')$mean[1:2], c(8.328927, 8.444462), tolerance = 1e-6) - expect_equal(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')$dispersion[1:2], c(10.552507, 10.088223), tolerance = 1e-6) - expect_equal(as.numeric(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')$dispersion.scaled[1:2]), c(0.1113214, -0.1332181523), tolerance = 1e-6) - expect_true(!is.unsorted(rev(HVFInfo(object = object[["RNA"]], selection.method = 'mvp')[VariableFeatures(object = object), "dispersion"]))) + hvf_info <- HVFInfo(object = object[["RNA"]], method = 'mvp') + expect_equal(hvf_info[[grep("mean$", colnames(hvf_info), value = TRUE)]][1:2], c(8.328927, 8.444462), tolerance = 1e-6) + expect_equal(hvf_info[[grep("dispersion$", colnames(hvf_info), value = TRUE)]][1:2], c(10.552507, 10.088223), tolerance = 1e-6) + expect_equal(as.numeric(hvf_info[[grep("dispersion.scaled$", colnames(hvf_info), value = TRUE)]][1:2]), c(0.1113214, -0.1332181523), tolerance = 1e-6) + expect_true(!is.unsorted(rev(hvf_info[VariableFeatures(object = object), "dispersion"]))) }) object <- FindVariableFeatures(object, selection.method = "vst", verbose = FALSE) test_that("vst selection option returns expected values", { expect_equal(VariableFeatures(object = object)[1:4], c("PPBP", "IGLL5", "VDAC3", "CD1C")) expect_equal(length(x = VariableFeatures(object = object)), 230) - expect_equal(unname(object[["RNA"]][["vst.variance", drop = TRUE]][1:2]), c(1.0251582, 1.2810127), tolerance = 1e-6) - expect_equal(unname(object[["RNA"]][["vst.variance.expected", drop = TRUE]][1:2]), c(1.1411616, 2.7076228), tolerance = 1e-6) - expect_equal(unname(object[["RNA"]][["vst.variance.standardized", drop = TRUE]][1:2]), c(0.8983463, 0.4731134), tolerance = 1e-6) - expect_true(!is.unsorted(rev(object[["RNA"]][["vst.variance.standardized", drop = TRUE]][VariableFeatures(object = object)]))) + hvf_info <- HVFInfo(object = object[["RNA"]], method = 'vst') + expect_equal(hvf_info[[grep("variance$", colnames(hvf_info), value = TRUE)]][1:2], c(1.0251582, 1.2810127), tolerance = 1e-6) + expect_equal(hvf_info[[grep("variance.standardized$", colnames(hvf_info), value = TRUE)]][1:2], c(0.8983463, 0.4731134), tolerance = 1e-6) + expect_true(!is.unsorted(rev(hvf_info[VariableFeatures(object = object), grep("variance.standardized$", colnames(hvf_info))]))) }) +#object <- FindVariableFeatures(object, assay = "RNAbp") +#this breaks currently + # Tests for internal functions # ------------------------------------------------------------------------------ norm.fxn <- function(x) {x / mean(x)} @@ -287,18 +376,18 @@ test_that("CustomNormalize works as expected", { }) # Tests for SCTransform -# ------------------------------------------------------------------------------ +# -------------------------------------------------------------------------------- context("SCTransform") -object <- suppressWarnings(SCTransform(object = object, verbose = FALSE)) +object <- suppressWarnings(SCTransform(object = object, verbose = FALSE, vst.flavor = "v1", seed.use = 1448145)) -test_that("SCTransform wrapper works as expected", { +test_that("SCTransform v1 works as expected", { expect_true("SCT" %in% names(object)) - expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], slot = "scale.data"))[1]), 11.40288448) - expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], slot = "scale.data"))[5]), 0) - expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], slot = "data"))[1]), 57.7295742, tolerance = 1e-6) - expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], slot = "data"))[5]), 11.74403719, tolerance = 1e-6) - expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], slot = "counts"))[1]), 129) - expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], slot = "counts"))[5]), 28) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "scale.data"))[1]), 11.40288448) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "scale.data"))[5]), 0) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "data"))[1]), 57.7295742, tolerance = 1e-6) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "data"))[5]), 11.74403719, tolerance = 1e-6) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "counts"))[1]), 129) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "counts"))[5]), 28) expect_equal(length(VariableFeatures(object[["SCT"]])), 220) fa <- SCTResults(object = object, assay = "SCT", slot = "feature.attributes") expect_equal(fa["MS4A1", "detection_rate"], 0.15) @@ -309,33 +398,175 @@ test_that("SCTransform wrapper works as expected", { }) suppressWarnings(RNGversion(vstr = "3.5.0")) -object <- suppressWarnings(SCTransform(object = object, ncells = 40, verbose = FALSE, seed.use = 42)) +object <- suppressWarnings(SCTransform(object = object, vst.flavor = "v1", ncells = 80, verbose = FALSE, seed.use = 42)) test_that("SCTransform ncells param works", { expect_true("SCT" %in% names(object)) - expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], slot = "scale.data"))[1]), 12.02126, tolerance = 1e6) - expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], slot = "scale.data"))[5]), 0) - expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], slot = "data"))[1]), 60.65299, tolerance = 1e-6) - expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], slot = "data"))[5]), 11.74404, tolerance = 1e-6) - expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], slot = "counts"))[1]), 136) - expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], slot = "counts"))[5]), 28) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "scale.data"))[1]), 11.40288, tolerance = 1e-6) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "scale.data"))[5]), 0) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "data"))[1]), 57.72957, tolerance = 1e-6) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "data"))[5]), 11.74404, tolerance = 1e-6) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "counts"))[1]), 129) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "counts"))[5]), 28) expect_equal(length(VariableFeatures(object[["SCT"]])), 220) fa <- SCTResults(object = object, assay = "SCT", slot = "feature.attributes") expect_equal(fa["MS4A1", "detection_rate"], 0.15) expect_equal(fa["MS4A1", "gmean"], 0.2027364, tolerance = 1e-6) expect_equal(fa["MS4A1", "variance"], 1.025158, tolerance = 1e-6) - expect_equal(fa["MS4A1", "residual_mean"], 0.2829672, tolerance = 1e-3) - expect_equal(fa["MS4A1", "residual_variance"], 3.674079, tolerance = 1e-3) + expect_equal(fa["MS4A1", "residual_mean"], 0.2362887, tolerance = 1e-3) + expect_equal(fa["MS4A1", "residual_variance"], 2.875761, tolerance = 1e-3) }) suppressWarnings(object[["SCT_SAVE"]] <- object[["SCT"]]) -object[["SCT"]] <- SetAssayData(object = object[["SCT"]], slot = "scale.data", new.data = GetAssayData(object = object[["SCT"]], slot = "scale.data")[1:100, ]) +object[["SCT"]] <- suppressWarnings({SetAssayData(object = object[["SCT"]], slot = "scale.data", new.data = GetAssayData(object = object[["SCT"]], layer = "scale.data")[1:100, ])}) object <- GetResidual(object = object, features = rownames(x = object), verbose = FALSE) test_that("GetResidual works", { - expect_equal(dim(GetAssayData(object = object[["SCT"]], slot = "scale.data")), c(220, 80)) + expect_equal(dim(GetAssayData(object = object[["SCT"]], layer = "scale.data")), c(220, 80)) expect_equal( - GetAssayData(object = object[["SCT"]], slot = "scale.data"), - GetAssayData(object = object[["SCT_SAVE"]], slot = "scale.data") + GetAssayData(object = object[["SCT"]], layer = "scale.data"), + GetAssayData(object = object[["SCT_SAVE"]], layer = "scale.data") ) expect_warning(GetResidual(object, features = "asd")) }) + +test_that("SCTransform v2 works as expected", { + skip_on_cran() + skip_if_not_installed("glmGamPoi") + object <- suppressWarnings(SCTransform(object = object, verbose = FALSE, vst.flavor = "v2", seed.use = 1448145)) + + expect_true("SCT" %in% names(object)) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "scale.data"))[1]), 24.5813, tolerance = 1e-4) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "scale.data"))[5]), 0) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "data"))[1]), 58.65829, tolerance = 1e-6) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "data"))[5]), 13.75449, tolerance = 1e-6) + expect_equal(as.numeric(colSums(GetAssayData(object = object[["SCT"]], layer = "counts"))[1]), 141) + expect_equal(as.numeric(rowSums(GetAssayData(object = object[["SCT"]], layer = "counts"))[5]), 40) + expect_equal(length(VariableFeatures(object[["SCT"]])), 220) + fa <- SCTResults(object = object, assay = "SCT", slot = "feature.attributes") + expect_equal(fa["MS4A1", "detection_rate"], 0.15) + expect_equal(fa["MS4A1", "gmean"], 0.2027364, tolerance = 1e-6) + expect_equal(fa["MS4A1", "variance"], 1.025158, tolerance = 1e-6) + expect_equal(fa["MS4A1", "residual_mean"], 0.2763993, tolerance = 1e-6) + expect_equal(fa["MS4A1", "residual_variance"], 3.023062, tolerance = 1e-6) + expect_equal(fa["FCER2", "theta"], Inf) +}) + +test_that("SCTransform `clip.range` param works as expected", { + # make a copy of the testing data + test.data <- object + # override defaults for ease of testing + clip.min <- -0.1 + clip.max <- 0.1 + + # for some reason, the clipping seems to be a little fuzzy at the upper end, + # since this is expected behaviour we'll need to accomodate the difference + clip.max.tolerance <- 0.1 + + test.result <- suppressWarnings( + SCTransform( + test.data, + clip.range = c(clip.min, clip.max), + ) + ) + scale.data <- LayerData(test.result[["SCT"]], layer = "scale.data") + expect_true(min(scale.data) >= clip.min) + expect_true(max(scale.data) <= (clip.max + clip.max.tolerance)) + + # when `ncells` is less than the size of the dataset the residuals will get + # re-clipped in batches, make sure this clipping is done correctly as well + test.result <- suppressWarnings( + SCTransform( + test.data, + clip.range = c(clip.min, clip.max), + ncells = 40 + ) + ) + scale.data <- LayerData(test.result[["SCT"]], layer = "scale.data") + expect_true(min(scale.data) >= clip.min) + expect_true(max(scale.data) <= (clip.max + clip.max.tolerance)) +}) + +test_that("SCTransform `vars.to.regress` param works as expected", { + # make a copy of the testing data + test.data <- object + # add a fake mitochondrial gene to the counts matrix + counts <- LayerData(test.data, assay = "RNA", layer = "counts") + counts <- rbind(counts, 5) + rownames(counts)[nrow(counts)] <- "MT-TEST" + # use the fake feature to populate a new meta.data column + test.data[[ "percent.mt" ]] <- PercentageFeatureSet( + test.data, + pattern="^MT-" + ) + + # make sure that `ncells` is smaller than the datset being transformed + # so tha the regression model is trained on a subset of the data - make sure + # the regression is applied to the entire dataset + left <- suppressWarnings( + SCTransform( + test.data, + vars.to.regress = NULL, + ncells = ncol(test.data) / 2, + verbose = FALSE + ) + ) + right <- suppressWarnings( + SCTransform( + test.data, + vars.to.regress = "percent.mt", + ncells = ncol(test.data) / 2, + verbose = FALSE + ) + ) + expect_false(identical(left[["SCT"]]$scale.data, right[["SCT"]]$scale.data)) + + # if the `assay` points to an `Assay5` instance the regression is handled + # using separate logic + test.data[["RNAv5"]] <- CreateAssay5Object( + counts = LayerData( + test.data, + assay = "RNA", + layer = "counts" + ) + ) + left <- suppressWarnings( + SCTransform( + test.data, + assay = "RNAv5", + vars.to.regress = NULL, + ncells = ncol(test.data) / 2, + verbose = FALSE + ) + ) + right <- suppressWarnings( + SCTransform( + test.data, + assay = "RNAv5", + vars.to.regress = "percent.mt", + ncells = ncol(test.data) / 2, + verbose = FALSE + ) + ) + expect_false(identical(left[["SCT"]]$scale.data, right[["SCT"]]$scale.data)) +}) + +test_that("SCTransform is equivalent for BPcells ", { + skip_on_cran() + skip_on_cran() + skip_if_not_installed("glmGamPoi") + + library(Matrix) + skip_if_not_installed("BPCells") + library(BPCells) + mat_bpcells <- t(as(t(object[['RNA']]$counts ), "IterableMatrix")) + object[['RNAbp']] <- CreateAssay5Object(counts = mat_bpcells) + object <- suppressWarnings(SCTransform(object = object, assay = "RNA", new.assay.name = "SCT", + verbose = FALSE, vst.flavor = "v2", seed.use = 1448145)) + + object <- suppressWarnings(SCTransform(object = object, assay = "RNAbp", new.assay.name = "SCTbp", + verbose = FALSE, vst.flavor = "v2", seed.use = 1448145)) + + expect_equal(as.matrix(LayerData(object = object[["SCT"]], layer = "data")), + as.matrix(LayerData(object = object[["SCTbp"]], layer = "data")), + tolerance = 1e-6) +}) diff --git a/tests/testthat/test_sketching.R b/tests/testthat/test_sketching.R new file mode 100644 index 000000000..2f724d036 --- /dev/null +++ b/tests/testthat/test_sketching.R @@ -0,0 +1,289 @@ +# setup shared test fixtures +path_to_counts <- system.file("extdata", "pbmc_raw.txt", package = "Seurat") + + +build_test_data <- function(multi_layer = FALSE) { + counts <- read.table(path_to_counts, sep = "\t", row.names = 1) + counts <- as.sparse(as.matrix(counts)) + + if (multi_layer) { + barcodes <- colnames(counts) + + counts.layer1 <- counts + colnames(counts.layer1) <- paste0(barcodes, "_layer1") + + counts.layer2 <- counts + colnames(counts.layer1) <- paste0(barcodes, "_layer2") + + counts_layers <- c( + counts.layer1 = counts.layer1, + counts.layer2 = counts.layer2 + ) + + test_data <- CreateSeuratObject(counts_layers) + + } else{ + test_data <- CreateSeuratObject(counts) + } + + test_data <- NormalizeData(test_data, verbose = FALSE) + test_data <- FindVariableFeatures(test_data, verbose = FALSE) + + return (test_data) +} + + +context("SketchData") + +test_that("SketchData defaults work", { + test_case <- build_test_data() + + result <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = 50, + method = "LeverageScore", + sketched.assay = "sketch", + set.seed = 42 + ) + ) + expect_equal( + dim(result[["sketch"]]), + c(230, 50) + ) + expect_equal( + as.numeric(result$leverage.score[1]), + 0.9036446, + tolerance = 1e-5 + ) + expect_equal( + colnames(result[["sketch"]])[1], + "ATGCCAGAACGACT" + ) + + result_2 <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = c("data" = 50), + method = "LeverageScore", + sketched.assay = "sketch" + ) + ) + expect_equal( + dim(result_2[["sketch"]]), + dim(result[["sketch"]]) + ) + expect_equal( + as.numeric(result_2$leverage.score[1]), + as.numeric(result$leverage.score[1]), + tolerance = 1e-6 + ) + expect_equal( + colnames(result_2[["sketch"]])[1], + colnames(result[["sketch"]])[1] + ) +}) + +test_that("SketchData with multiple layers works", { # (and one is less than the number of cells in that layer) + test_case <- build_test_data(multi_layer = TRUE) + + result <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = 80, + method = "LeverageScore", + sketched.assay = "sketch", + set.seed = 42 + ) + ) + expect_equal( + dim(result[["sketch"]]$data.layer1), + c(230, 80) + ) + expect_equal( + dim(result[["sketch"]]$data.layer2), + c(230, 80) + ) + expect_equal( + as.numeric(result$leverage.score[1]), + 0.9036446, + tolerance = 1e-6 + ) + expect_equal( + colnames(result[["sketch"]])[1], + "ATGCCAGAACGACT_layer2" + ) +}) + +test_that("SketchData with a different number of cells per layer works", { + test_case <- build_test_data(multi_layer = TRUE) + + result <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = c(50, 30), + method = "LeverageScore", + sketched.assay = "sketch", + set.seed = 42 + ) + ) + expect_equal( + dim(result[["sketch"]]$data.layer1), + c(230, 50) + ) + expect_equal( + dim(result[["sketch"]]$data.layer2), + c(230, 30) + ) + expect_equal( + as.numeric(result$leverage.score[1]), + 0.9036446, + tolerance = 1e-5 + ) + expect_equal( + colnames(result[["sketch"]])[1], + "ATGCCAGAACGACT_layer2" + ) + + result_2 <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = c("data.layer1" = 50, "data.layer2" = 30), + method = "LeverageScore", + sketched.assay = "sketch", + set.seed = 42 + ) + ) + expect_equal( + dim(result_2[["sketch"]]$data.layer1), + dim(result[["sketch"]]$data.layer1) + ) + expect_equal( + dim(result_2[["sketch"]]$data.layer2), + dim(result[["sketch"]]$data.layer2) + ) + expect_equal( + as.numeric(result_2$leverage.score[1]), + as.numeric(result$leverage.score[1]), + tolerance = 1e-5 + ) + expect_equal( + colnames(result_2[["sketch"]])[1], + colnames(result[["sketch"]])[1] + ) +}) + +test_that("SketchData with specified features works", { + test_case <- build_test_data() + + result <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = 50, + method = "LeverageScore", + sketched.assay = "sketch", + features = VariableFeatures(test_case)[1:100] + ) + ) + expect_equal( + dim(result[["sketch"]]), + c(230, 50) + ) + expect_equal( + as.numeric(result$leverage.score[1]), + 0.7202897, + tolerance = 1e-6 + ) + expect_equal( + colnames(result[["sketch"]])[1], + "ATGCCAGAACGACT" + ) +}) + +test_that("SketchData with specified features and multiple layers works", { + test_case <- build_test_data(multi_layer = TRUE) + + result <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = c(50, 30), + method = "LeverageScore", + sketched.assay = "sketch", + features = VariableFeatures(test_case)[1:100], + set.seed = 42 + ) + ) + expect_equal( + dim(result[["sketch"]]$data.layer1), + c(230, 50) + ) + expect_equal( + dim(result[["sketch"]]$data.layer2), + c(230, 30) + ) + expect_equal( + as.numeric(result$leverage.score[1]), + 0.7202896, + tolerance = 1e-6 + ) + expect_equal( + colnames(result[["sketch"]])[1], + "ATGCCAGAACGACT_layer2" + ) +}) + +test_that("SketchData when setting your own variable features and specifying features works", { + test_case <- build_test_data() + top_features <- VariableFeatures(test_case)[1:100] + VariableFeatures(test_case) <- top_features + + result <- suppressWarnings( + SketchData( + test_case, + assay = "RNA", + ncells = 50, + method = "LeverageScore", + sketched.assay = "sketch", + features = top_features, + set.seed = 42 + ) + ) + expect_equal( + dim(result[["sketch"]]), + c(230,50) + ) + expect_equal( + as.numeric(result$leverage.score[1]), + 0.7202896, + tolerance = 1e-6 + ) + expect_equal( + colnames(result[["sketch"]])[1], + "ATGCCAGAACGACT" + ) +}) + +test_that("SketchData when setting your own variable features and not specifying features errors out", { + test_case <- build_test_data() + top_features <- VariableFeatures(test_case)[1:100] + VariableFeatures(test_case) <- top_features + + expect_error( + suppressWarnings( + SketchData( + pbmc_new, + assay = "RNA", + ncells = 50, + method = "LeverageScore", + sketched.assay = "sketch" + ) + ) + ) +}) diff --git a/tests/testthat/test_spatial.R b/tests/testthat/test_spatial.R new file mode 100644 index 000000000..e38ceac3e --- /dev/null +++ b/tests/testthat/test_spatial.R @@ -0,0 +1,352 @@ +# setup shared test fixtures +path_to_counts <- system.file("extdata", "pbmc_raw.txt", package = "Seurat") +path_to_image = file.path("../testdata/visium/spatial") + + +build_spatial_data <- function(assay_name, image_name, id_prefix) { + raw_counts <- read.table(path_to_counts, sep = "\t", row.names = 1) + + image <- Read10X_Image( + path_to_image, + assay = assay_name, + slice = image_name + ) + cell_names <- Cells(image) + + counts <- do.call(cbind, replicate(34, raw_counts, simplify = FALSE)) + counts <- counts[1:length(cell_names)] + counts <- as.sparse(as.matrix(counts)) + colnames(counts) <- cell_names + + test_data <- CreateSeuratObject(counts, assay = assay_name) + test_data[[image_name]] <- image + test_data <- RenameCells( + test_data, + add.cell.id = id_prefix + ) + + return (test_data) +} + +test_render <- function(plot) { + grDevices::pdf(NULL) + print(plot) + dev.off() +} + +equivalent_plots <- function(plot1, plot2) { + if (length(plot1$layers) != length(plot2$layers)) { + return(FALSE) + } + + for (i in seq_along(plot1$layers)) { + layer1 <- plot1$layers[[i]] + layer2 <- plot2$layers[[i]] + + if (class(layer1$geom)[1] != class(layer2$geom)[1]) { + return(FALSE) + } + + if (!identical(layer1$data, layer2$data)) { + return(FALSE) + } + } + return(TRUE) +} + +test.data.1 <- build_spatial_data( + assay_name = "Spatial.A", + image_name = "slice1.A", + id_prefix = "test-data-1" +) + +test.data.2 <- build_spatial_data( + assay_name = "Spatial.A", + image_name = "slice2.A", + id_prefix = "test-data-2" +) + +test.data.3 <- build_spatial_data( + assay_name = "Spatial.B", + image_name = "slice2.B", + id_prefix = "test-data-3" +) + +context("SpatialFeaturePlot") + +test_that("SpatialFeaturePlot works with a single assay/image", { + test.case <- test.data.1 + + plot.1 <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.A") + plot.2 <- SpatialFeaturePlot( + test.case, + images = "slice1.A", + features = "nCount_Spatial.A" + ) + + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plot.1, plot.2)) +}) + +test_that("SpatialFeaturePlot works with multiple layers & images", { + test.case <- merge(test.data.1, test.data.2) + + plots <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.A") + plot.1 <- SpatialFeaturePlot( + test.case, + images = "slice1.A", + features = "nCount_Spatial.A" + ) + plot.2 <- SpatialFeaturePlot( + test.case, + images = "slice2.A", + features = "nCount_Spatial.A" + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) +}) + +test_that("SpatialFeaturePlot works with multiple overlapping images", { + skip_if_not_installed("sf") + + test.case <- test.data.1 + suppressWarnings( + test.case[["slice1.crop"]] <- Crop( + test.case[["slice1.A"]], + x = c(0, 5000), + y = c(0, 5000) + ) + ) + + plots <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.A") + plot.1 <- SpatialFeaturePlot( + test.case, + images = "slice1.A", + features = "nCount_Spatial.A" + ) + plot.2 <- SpatialFeaturePlot( + test.case, + images = "slice1.crop", + features = "nCount_Spatial.A" + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) +}) + +test_that("SpatialFeaturePlot works with multiple assays & images", { + test.case <- merge(test.data.1, test.data.3) + + DefaultAssay(test.case) <- "Spatial.A" + plot.1 <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.A") + plot.2 <- SpatialFeaturePlot( + test.case, + features = "nCount_Spatial.A", + images = "slice2.B" + ) + plots <- SpatialFeaturePlot( + test.case, + features = "nCount_Spatial.A", + images = c("slice1.A", "slice2.B") + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) + + DefaultAssay(test.case) <- "Spatial.B" + plot.1 <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.B") + plot.2 <- SpatialFeaturePlot( + test.case, + features = "nCount_Spatial.B", + images = "slice1.A" + ) + plots <- SpatialFeaturePlot( + test.case, + features = "nCount_Spatial.B", + images = c("slice1.A", "slice2.B") + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[2]], plot.1)) + expect_true(expect_true(equivalent_plots(plots[[1]], plot.2))) +}) + +test_that("SpatialFeaturePlot works with multiple assays, layers, & images", { + test.case <- merge(test.data.1, c(test.data.2, test.data.3)) + + DefaultAssay(test.case) <- "Spatial.A" + plots <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.A") + plot.1 <- SpatialFeaturePlot( + test.case, + images = "slice1.A", + features = "nCount_Spatial.A" + ) + plot.2 <- SpatialFeaturePlot( + test.case, + images = "slice2.A", + features = "nCount_Spatial.A" + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) + + DefaultAssay(test.case) <- "Spatial.B" + plot.1 <- SpatialFeaturePlot(test.case, features = "nCount_Spatial.B") + plot.2 <- SpatialFeaturePlot( + test.case, + features = "nCount_Spatial.B", + images = "slice2.A" + ) + plots <- SpatialFeaturePlot( + test.case, + features = "nCount_Spatial.B", + images = c("slice2.A", "slice2.B") + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.2)) + expect_true(equivalent_plots(plots[[2]], plot.1)) +}) + + +context("SpatialDimPlot") + +test_that("SpatialDimPlot works with a single assay/image", { + test.case <- test.data.1 + + plot.1 <- SpatialDimPlot(test.case) + plot.2 <- SpatialDimPlot( + test.case, + images = "slice1.A" + ) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plot.1, plot.2)) +}) + +test_that("SpatialDimPlot works with multiple layers/images", { + test.case <- merge(test.data.1, test.data.2) + + plots <- SpatialDimPlot(test.case) + plot.1 <- SpatialDimPlot( + test.case, + images = "slice1.A" + ) + plot.2 <- SpatialDimPlot( + test.case, + images = "slice2.A" + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) +}) + +test_that("SpatialDimPlot works with multiple overlapping images", { + skip_if_not_installed("sf") + + test.case <- test.data.1 + suppressWarnings( + test.case[["slice1.crop"]] <- Crop( + test.case[["slice1.A"]], + x = c(0, 5000), + y = c(0, 5000) + ) + ) + + plots <- SpatialDimPlot(test.case) + plot.1 <- SpatialDimPlot( + test.case, + images = "slice1.A" + ) + plot.2 <- SpatialDimPlot( + test.case, + images = "slice1.crop" + ) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) +}) + +test_that("SpatialDimPlot works with multiple assays/images", { + test.case <- merge(test.data.1, test.data.3) + + DefaultAssay(test.case) <- "Spatial.A" + plot.1 <- SpatialDimPlot(test.case) + plot.2 <- SpatialDimPlot( + test.case, + images = "slice2.B" + ) + plots <- SpatialDimPlot( + test.case, + images = c("slice1.A", "slice2.B") + ) + + expect_equal(length(plots), 2) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) + + DefaultAssay(test.case) <- "Spatial.B" + plot.1 <- SpatialDimPlot(test.case) + plot.2 <- SpatialDimPlot( + test.case, + images = "slice1.A" + ) + plots <- SpatialDimPlot(test.case, images = c("slice1.A", "slice2.B")) + + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[2]], plot.1)) + expect_true(expect_true(equivalent_plots(plots[[1]], plot.2))) +}) + +test_that("SpatialDimPlot works with multiple assays, layers, & images", { + test.case <- merge(test.data.1, c(test.data.2, test.data.3)) + + DefaultAssay(test.case) <- "Spatial.A" + plots <- SpatialDimPlot(test.case) + plot.1 <- SpatialDimPlot(test.case, images = "slice1.A") + plot.2 <- SpatialDimPlot(test.case, images = "slice2.A") + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.1)) + expect_true(equivalent_plots(plots[[2]], plot.2)) + + DefaultAssay(test.case) <- "Spatial.B" + plot.1 <- SpatialDimPlot(test.case) + plot.2 <- SpatialDimPlot(test.case, images = "slice2.A") + plots <- SpatialDimPlot(test.case, images = c("slice2.A", "slice2.B")) + expect_equal(length(plots), 2) + expect_no_error(test_render(plots)) + expect_no_error(test_render(plot.1)) + expect_no_error(test_render(plot.2)) + expect_true(equivalent_plots(plots[[1]], plot.2)) + expect_true(equivalent_plots(plots[[2]], plot.1)) +}) diff --git a/tests/testthat/test_transferdata.R b/tests/testthat/test_transferdata.R index d4e8f2103..fbaf6e5b8 100644 --- a/tests/testthat/test_transferdata.R +++ b/tests/testthat/test_transferdata.R @@ -5,7 +5,13 @@ pbmc_small <- suppressWarnings(UpdateSeuratObject(pbmc_small)) # Setup test objects ref <- pbmc_small query <- CreateSeuratObject( - counts = GetAssayData(object = pbmc_small[['RNA']], slot = "counts") + rpois(n = ncol(pbmc_small), lambda = 1) + counts = as.sparse( + GetAssayData( + object = pbmc_small[['RNA']], + layer = "counts") + rpois(n = ncol(pbmc_small), + lambda = 1 + ) + ) ) query <- NormalizeData(object = query, verbose = FALSE) query <- FindVariableFeatures(object = query, verbose = FALSE, nfeatures = 100) @@ -30,8 +36,8 @@ test_that("TransferData default work", { # continuous assay data pred.assay <- TransferData(anchorset = anchors, refdata = GetAssayData(ref[["RNA"]]), verbose = FALSE) expect_equal(dim(pred.assay), c(230, 80)) - expect_equal(GetAssayData(pred.assay, slot = "counts"), new("matrix")) - expect_equal(GetAssayData(pred.assay, slot = "scale.data"), new("matrix")) + expect_equal(GetAssayData(pred.assay, layer = "counts"), new("matrix")) + expect_equal(GetAssayData(pred.assay, layer = "scale.data"), new("matrix")) expect_equal(colnames(pred.assay), Cells(query)) expect_equal(rownames(pred.assay), rownames(ref[["RNA"]])) expect_equal(sum(GetAssayData(pred.assay)[1, ]), 64.46388, tolerance = 1e-6) @@ -45,8 +51,8 @@ test_that("TransferData can return predictions assay, ", { pred.assay <- TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, prediction.assay = TRUE, verbose = FALSE) expect_true(inherits(pred.assay, "Assay")) expect_equal(dim(pred.assay), c(4, 80)) - expect_equal(GetAssayData(pred.assay, slot = "counts"), new("matrix")) - expect_equal(GetAssayData(pred.assay, slot = "scale.data"), new("matrix")) + expect_equal(GetAssayData(pred.assay, layer = "counts"), new("matrix")) + expect_equal(GetAssayData(pred.assay, layer = "scale.data"), new("matrix")) expect_equal(colnames(pred.assay), Cells(query)) expect_equal(pred.assay@var.features, logical(0)) expect_equal(ncol(pred.assay@meta.features), 0) @@ -68,7 +74,7 @@ test_that("TransferData handles weight.reduction properly, ", { cdr.preds <- TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, weight.reduction = custom.dr, verbose = FALSE, dims = 1:30) expect_equal(preds, cdr.preds) # weight.reduction = "pca - pca.preds <- TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, query = query, weight.reduction = "pca", verbose = FALSE) + pca.preds <- suppressWarnings(TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, query = query, weight.reduction = "pca", verbose = FALSE)) expect_true(inherits(pca.preds, "Seurat")) expect_equal(sum(GetAssayData(pca.preds[['prediction.score.id']])[1, ]), 27.83330252, tolerance = 1e-6) # weight.reduction = "cca" @@ -128,11 +134,11 @@ test_that("TransferData with multiple items to transfer works ", { }) test_that("TransferData can return a modified query object ", { - query <- TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, query = query, verbose = FALSE) + query <- suppressWarnings(TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, query = query, verbose = FALSE)) expect_true("prediction.score.id" %in% Assays(query)) expect_true("predicted.id" %in% colnames(query[[]])) expect_true("predicted.id.score" %in% colnames(query[[]])) - query <- TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, query = query, store.weights = TRUE, verbose = FALSE) + query <- suppressWarnings(TransferData(anchorset = anchors, refdata = ref$RNA_snn_res.1, query = query, store.weights = TRUE, verbose = FALSE)) expect_equal(dim(Tool(query, slot = "TransferData")$weights.matrix), c(128, 80)) }) diff --git a/tests/testthat/test_tree.R b/tests/testthat/test_tree.R new file mode 100644 index 000000000..5349c7e38 --- /dev/null +++ b/tests/testthat/test_tree.R @@ -0,0 +1,54 @@ +PATH_TO_DATA <- system.file("extdata", "pbmc_raw.txt", package = "Seurat") + +TOLERANCE = 1.0e-4 + +setup_test_data <- function(cluster_resolution) { + counts <- as.sparse( + as.matrix(read.table(PATH_TO_DATA, sep = "\t", row.names = 1)) + ) + test.data <- CreateSeuratObject(counts) + + test.data <- NormalizeData(test.data) + test.data <- FindVariableFeatures(test.data) + test.data <- ScaleData(test.data) + + test.data <- RunPCA(test.data, npcs = 20) + test.data <- FindNeighbors(test.data, dims = 1:20) + test.data <- FindClusters(test.data, resolution = cluster_resolution) + + return(test.data) +} + +context("BuildClusterTree") + +test_that("BuildClusterTree works as expected", { + skip_if_not_installed("ape") + + # TODO: this test doesn't really check that the phylogenic trees being + # generated are biologically valid but that would be more useful + # for testing purposes we'll over-cluster our dataset to introduce + # some structure into the resulting phylogeny + test_case <- setup_test_data(cluster_resolution = 5) + + result <- BuildClusterTree(test_case, slot = "counts") + tree <- Tool(result, slot = "BuildClusterTree") + # check that the tree contains the expected number of leaf nodes + expect_equal(length(tree$tip.label), 6) + # check that the tree contains the expected number of edges + expect_equal(length(tree$edge), 20) + # spot check a few edge lengths + expect_equal(tree$edge.length[[1]], 1127.60199, tolerance = TOLERANCE) + expect_equal(tree$edge.length[[5]], 182.42944, tolerance = TOLERANCE) + expect_equal(tree$edge.length[[10]], 59.24525, tolerance = TOLERANCE) + + result <- BuildClusterTree(test_case, slot = "data") + tree <- Tool(result, slot = "BuildClusterTree") + # check that the tree contains the expected number of leaf nodes + expect_equal(length(tree$tip.label), 6) + # check that the tree contains the expected number of edges + expect_equal(length(tree$edge), 20) + # spot check a few edge lengths + expect_equal(tree$edge.length[[1]], 1206.4312, tolerance = TOLERANCE) + expect_equal(tree$edge.length[[5]], 657.0097, tolerance = TOLERANCE) + expect_equal(tree$edge.length[[10]], 982.9843, tolerance = TOLERANCE) +}) diff --git a/tests/testthat/test_utilities.R b/tests/testthat/test_utilities.R index 079634451..2e533b1c8 100644 --- a/tests/testthat/test_utilities.R +++ b/tests/testthat/test_utilities.R @@ -14,128 +14,280 @@ object <- CreateSeuratObject( min.features = 30, meta.data = meta.data ) +object <- NormalizeData(object) object <- SetIdent(object, value = 'a') -test_that("AverageExpression works for different slots", { - average.expression <- AverageExpression(object, slot = 'data')$RNA +group.by = "a" +data <- FetchData(object = object, vars = rev(x = group.by)) +data <- data[which(rowSums(x = is.na(x = data)) == 0), , drop = F] +category.matrix.avg <- CreateCategoryMatrix(labels = data, method = 'average') +category.matrix.sum <- CreateCategoryMatrix(labels = data, method = 'aggregate') + + +test_that("CreateCategoryMatrix works for average and aggregate", { + expect_equal(unname(colSums(category.matrix.avg)), c(1, 1, 1)) + expect_equal(unname(colSums(category.matrix.sum)), c(27, 26, 24)) +}) + +test_that("AverageExpression works for different layers", { + #average expression on data layer is equal to log of average exponentiated data + suppressWarnings(average.expression <- AverageExpression(object, layer = 'data')$RNA) + counts.from.data.avg <- expm1(object[['RNA']]$data) %*% category.matrix.avg expect_equivalent( - average.expression['KHDRBS1', 1:3], - c(a = 7.278237e-01, b = 1.658166e+14, c = 1.431902e-01), + log1p(counts.from.data.avg), + average.expression, tolerance = 1e-6 ) + #average expression on counts layer is equal to average of counts + suppressWarnings(average.counts <- AverageExpression(object, layer = 'counts')$RNA) + avg.counts <- object[['RNA']]$data %*% category.matrix.avg expect_equivalent( - average.expression['DNAJB1', 1:3] , - c(a = 1.374079e+00, b = 5.100840e-01, c = 5.011655e-01), - tolerance = 1e-6 - ) - avg.counts <- AverageExpression(object, slot = 'counts')$RNA - expect_equal( - avg.counts['MS4A1', ], - c(a = 0.37037037, b = 0.3461538, c = 0.3333333), - tolerance = 1e-6 - ) - expect_equal( - avg.counts['SPON2', ], - c(a = 0.5185185, b = 0.6153846, c = 0.08333333), + avg.counts, + average.counts, tolerance = 1e-6 ) - expect_warning(AverageExpression(object, slot = 'scale.data')) - object <- ScaleData(object = object, verbose = FALSE) - avg.scale <- AverageExpression(object, slot = "scale.data")$RNA - expect_equal( - avg.scale['MS4A1', ], - c(a = 0.02092088, b = -0.004769018, c = -0.018369549), - tolerance = 1e-6 - ) - expect_equal( - avg.scale['SPON2', ], - c(a = 0.1052434, b = 0.2042827, c = -0.3397051), + #average expression on scale.data layer is equal to average of scale.data + object <- ScaleData(object, features = rownames(object[['RNA']]$data)) + suppressWarnings(average.scale.data <- AverageExpression(object, layer = 'scale.data')$RNA) + avg.scale <- object[['RNA']]$scale.data %*% category.matrix.avg + expect_equivalent( + average.scale.data, + avg.scale, tolerance = 1e-6 ) -}) + }) test_that("AverageExpression handles features properly", { features <- rownames(x = object)[1:10] - average.expression <- AverageExpression(object, slot = 'data', features = features)$RNA + + # check that the average expression is calcualted for the specifed features + average.expression <- AverageExpression( + object, + layer = "data", + features = features + )$RNA expect_equal(rownames(x = average.expression), features) - expect_warning(AverageExpression(object, slot = 'data', features = "BAD")) - expect_warning(AverageExpression(object, slot = "data", features = c(features, "BAD"))) + + # check that an error is raised if none of the specified features are present + expect_warning(AverageExpression(object, layer = 'data', features = "BAD")) + # check that an error is raised if any of the features are missing + expect_warning( + AverageExpression( + object, + layer = "data", + features = c(features, "BAD") + ) + ) + + # check that features can be specified as a simple vector even when + # `layer="scale.data"` and `return.seurat=TRUE` + object <- ScaleData(object = object, verbose = FALSE) + avg.scale <- AverageExpression( + object, + layer = "scale.data", + return.seurat = TRUE, + features = features, + verbose = FALSE + )$RNA + expect_equal(rownames(avg.scale), features) }) test_that("AverageExpression with return.seurat", { # counts - avg.counts <- AverageExpression(object, slot = "counts", return.seurat = TRUE, verbose = FALSE) - expect_s4_class(object = avg.counts, "Seurat") - avg.counts.mat <- AverageExpression(object, slot = 'counts')$RNA - expect_equal(as.matrix(GetAssayData(avg.counts[["RNA"]], slot = "counts")), avg.counts.mat) - avg.data <- GetAssayData(avg.counts[["RNA"]], slot = "data") - expect_equal( - avg.data['MS4A1', ], - c(a = 0.31508105, b = 0.2972515, c = 0.2876821), - tolerance = 1e-6 - ) - expect_equal( - avg.data['SPON2', ], - c(a = 0.4177352, b = 0.4795731, c = 0.08004271), + avg.counts <- AverageExpression(object, layer = "counts", return.seurat = TRUE, verbose = FALSE) + avg.counts.calc <- object[['RNA']]$counts %*% category.matrix.avg + #test that counts are indeed equal to average counts + expect_equivalent( + as.matrix(avg.counts[['RNA']]$counts), + as.matrix(avg.counts.calc), tolerance = 1e-6 ) - avg.scale <- GetAssayData(avg.counts[["RNA"]], slot = "scale.data") - expect_equal( - avg.scale['MS4A1', ], - c(a = 1.0841908, b = -0.1980056, c = -0.8861852), + expect_s4_class(object = avg.counts, "Seurat") + avg.counts.mat <- AverageExpression(object, layer = 'counts')$RNA + expect_equal(unname(as.matrix(LayerData(avg.counts[["RNA"]], layer = "counts"))), + unname(as.matrix(avg.counts.mat))) + avg.data <- LayerData(avg.counts[["RNA"]], layer = "data") + #test that data returned is log1p of average counts + expect_equivalent( + as.matrix(log1p(avg.counts.mat)), + as.matrix(avg.data), tolerance = 1e-6 ) + #test that scale.data returned is scaled data + avg.scale <- LayerData(avg.counts[["RNA"]], layer = "scale.data") expect_equal( - avg.scale['SPON2', ], - c(a = 0.4275778, b = 0.7151260, c = -1.1427038), + avg.scale, + ScaleData(avg.counts)[['RNA']]$scale.data, tolerance = 1e-6 ) # data - avg.data <- AverageExpression(object, slot = "data", return.seurat = TRUE, verbose = FALSE) + avg.data <- AverageExpression(object, layer = "data", return.seurat = TRUE, verbose = FALSE) expect_s4_class(object = avg.data, "Seurat") - avg.data.mat <- AverageExpression(object, slot = 'data')$RNA - expect_equal(as.matrix(GetAssayData(avg.data[["RNA"]], slot = "counts")), avg.data.mat) - expect_equal(unname(as.matrix(GetAssayData(avg.data[["RNA"]], slot = "data"))), unname(log1p(x = avg.data.mat))) - avg.scale <- GetAssayData(avg.data[["RNA"]], slot = "scale.data") + avg.data.mat <- AverageExpression(object, layer = 'data')$RNA + expect_equal(unname(as.matrix(LayerData(avg.data[["RNA"]], layer = "counts"))), + unname(as.matrix(avg.data.mat))) + expect_equal(unname(as.matrix(LayerData(avg.data[["RNA"]], layer = "data"))), + as.matrix(unname(log1p(x = avg.data.mat)))) + avg.scale <- LayerData(avg.data[["RNA"]], layer = "scale.data") expect_equal( avg.scale['MS4A1', ], - c(a = 0.721145238, b = -1.1415734, c = 0.4204281), + c(a = -0.07823997, b = 1.0368218, c = -0.9585818), tolerance = 1e-6 ) expect_equal( avg.scale['SPON2', ], - c(a = 0.08226771, b = 0.9563249, c = -1.0385926), + c(a = 0.1213127, b = 0.9338096, c = -1.0551222), tolerance = 1e-6 ) # scale.data object <- ScaleData(object = object, verbose = FALSE) - avg.scale <- AverageExpression(object, slot = "scale.data", return.seurat = TRUE, verbose = FALSE) + avg.scale <- AverageExpression(object, layer = "scale.data", return.seurat = TRUE, verbose = FALSE) expect_s4_class(object = avg.scale, "Seurat") - avg.scale.mat <- AverageExpression(object, slot = 'scale.data')$RNA - expect_equal(unname(as.matrix(GetAssayData(avg.scale[["RNA"]], slot = "scale.data"))), unname(avg.scale.mat)) - expect_true(all(is.na(GetAssayData(avg.scale[["RNA"]], slot = "data")))) - expect_equal(GetAssayData(avg.scale[["RNA"]], slot = "counts"), matrix()) + avg.scale.mat <- AverageExpression(object, layer = 'scale.data')$RNA + expect_equal(unname(as.matrix(LayerData(avg.scale[["RNA"]], layer = "scale.data"))), unname(as.matrix(avg.scale.mat))) }) -test.dat <- GetAssayData(object = object, slot = "data") +test.dat <- LayerData(object = object, layer = "data") rownames(x = test.dat) <- paste0("test-", rownames(x = test.dat)) -object[["TEST"]] <- CreateAssayObject(data = test.dat) +suppressWarnings(object[["TEST"]] <- CreateAssayObject(data = test.dat)) test_that("AverageExpression with multiple assays", { - avg.test <- AverageExpression(object = object, assays = "TEST") + avg.test <- AverageExpression(object = object, assays = "TEST", layer = "data") expect_equal(names(x = avg.test), "TEST") expect_equal(length(x = avg.test), 1) expect_equivalent( avg.test[[1]]['test-KHDRBS1', 1:3], - c(a = 7.278237e-01, b = 1.658166e+14, c = 1.431902e-01), + c(a = 10.329153, b = 92.287109, c = 5.620942), tolerance = 1e-6 ) expect_equivalent( avg.test[[1]]['test-DNAJB1', 1:3] , - c(a = 1.374079e+00, b = 5.100840e-01, c = 5.011655e-01), + c(a = 42.32240, b = 15.94807, c = 15.96319), tolerance = 1e-6 ) - avg.all <- AverageExpression(object = object) + avg.all <- AverageExpression(object = object, layer = "data") expect_equal(names(x = avg.all), c("RNA", "TEST")) expect_equal(length(x = avg.all), 2) }) + + +meta.data.2 <- data.frame( + b = rep(as.factor(c('c', 'd', 'e')), length.out = ncol(pbmc.test)), + row.names = colnames(pbmc.test) +) +object <- AddMetaData(object, meta.data.2) +if(class(object[['RNA']]) == "Assay5") { + test_that("AggregateExpression works with multiple layers", { + object.split <- split(object, f = object$b) + aggregate.split <- AggregateExpression(object.split, assay = "RNA") + aggregate <- AggregateExpression(object, assay = "RNA") + expect_equivalent( + aggregate.split$RNA, + aggregate$RNA, + tolerance = 1e-6 + ) + avg.split <- AverageExpression(object.split, assay = "RNA") + avg <- AverageExpression(object, assay = "RNA") + expect_equivalent( + avg.split$RNA, + avg$RNA, + tolerance = 1e-6 + ) + }) +} + +test_that("PercentAbove works as expected", { + vals <- c(1, 1, 2, 2, NA) + expect_equal(PercentAbove(vals, threshold = 1), 0.4) +}) + + +context("BuildNicheAssay") + +test_that("BuildNicheAssay works as expected", { + test.data <- pbmc_small + + # generate fake coordinates arranging the cells from pbmc_small into a grid + test.coordinates <- data.frame( + cell = Cells(test.data), + x = rep(1:4, times = 20), + y = rep(1:4, each = 20) + ) + # associate the coordinates and counts with a FOV + fov <- CreateFOV( + test.coordinates, + type = "centroids", + assay = "RNA" + ) + test.data[["fov"]] <- fov + + # dividing the grid into 4 along each axis creates 16 regions - label + # each cell with the region containing it's x, y position + x.regions <- cut(test.coordinates[["x"]], breaks = 4, labels = FALSE) + y.regions <- cut(test.coordinates[["y"]], breaks = 4, labels = FALSE) + test.data[["test_labels"]] <- ((y.regions - 1) * 4 + x.regions) + + results <- BuildNicheAssay( + test.data, + fov = "fov", + group.by = "test_labels", + assay = "niche", + cluster.name = "niches" + ) + # the new niche assay should contain the same number of cells as the input + # and a feature for each unique label from the specified `group.by` variable + expect_equal( + c(16, ncol(test.data[["RNA"]])), + dim(results[["niche"]]) + ) + # exaclty a quarter of the cells should be assigned to each niche + for (niche in 1:4) { + expect_equal( + 20, + length(results[["niches"]][results[["niches"]]["niches"] == niche]) + ) + } +}) + +test_that("BuildNicheAssay works with FOV and VisiumV2 instances", { + skip_if_not_installed("hdf5r") + + path.to.data = file.path("../testdata/visium") + + test.case <- Load10X_Spatial( + path.to.data, + assay = "Spatial", + slice = "slice" + ) + + # populate a meta.data column with random labels + random_labels <- sample(1:3, size = ncol(test.case), replace = TRUE) + test.case[["random_labels"]] <- random_labels + + fov <- CreateFOV( + GetTissueCoordinates(test.case[["slice"]]), + type = "centroids", + radius = Radius(test.case[["slice"]]), + assay = "Spatial", + key = "fov" + ) + + test.case[["fov"]] <- fov + + left <- BuildNicheAssay( + test.case, + fov = "fov", + group.by = "random_labels" + ) + + right <- BuildNicheAssay( + test.case, + fov = "fov", + group.by = "random_labels" + ) + + expect_equal( + LayerData(left, layer = "scale.data"), + LayerData(left, layer = "scale.data") + ) +}) diff --git a/tests/testthat/test_visualization.R b/tests/testthat/test_visualization.R index bb4513552..037c6ceaf 100644 --- a/tests/testthat/test_visualization.R +++ b/tests/testthat/test_visualization.R @@ -4,7 +4,7 @@ set.seed(42) # Tests for visualization utilities # ------------------------------------------------------------------------------ -pbmc_small[["tsne_new"]] <- CollapseEmbeddingOutliers(pbmc_small, +pbmc_small[["tsne_new"]] <- CollapseEmbeddingOutliers(pbmc_small, reduction = "tsne", reduction.key = 'tsne_', outlier.sd = 0.5) test_that("CollapseEmbeddingOutliers works", { diff --git a/vignettes/COVID_SCTMapping.Rmd b/vignettes/COVID_SCTMapping.Rmd new file mode 100644 index 000000000..cc09ebd74 --- /dev/null +++ b/vignettes/COVID_SCTMapping.Rmd @@ -0,0 +1,180 @@ +--- +title: "Map COVID PBMC datasets to a healthy reference" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = 'styler', + message = FALSE, + warning = FALSE, + fig.width = 10, + time_it = TRUE, + error = TRUE +) +``` + +```{r, warning=F, message=F} +devtools::load_all() +library(BPCells) +library(dplyr) +library(patchwork) +library(ggplot2) +options(future.globals.maxSize = 1e9) +``` + + +## Introduction: Reference mapping analysis in Seurat v5 + +In Seurat v5, we introduce a scalable approach for reference mapping datasets from separate studies or individuals. Reference mapping is a powerful approach to identify consistent labels across studies and perform cross-dataset analysis. We emphasize that while individual datasets are manageable in size, the aggregate of many datasets often amounts to millions of cell which do not fit in-memory. Furthermore, cross-dataset analysis is often challenged by disparate or unique cell type labels. Through reference mapping, we annotate all cells with a common reference for consistent cell type labels. Importantly, we never simultaneously load all of the cells in-memory to maintain low memory usage. + +In this vignette, we reference map three publicly available datasets totaling 1,498,064 cells and 277 donors which are available through [CZI cellxgene collections](https://cellxgene.cziscience.com/collections): [Ahern, et al., Nature 2022](https://cellxgene.cziscience.com/collections/8f126edf-5405-4731-8374-b5ce11f53e82), [Jin, et al., Science 2021](https://cellxgene.cziscience.com/collections/b9fc3d70-5a72-4479-a046-c2cc1ab19efc), and [Yoshida, et al., Nature 2022](https://cellxgene.cziscience.com/collections/03f821b4-87be-4ff4-b65a-b5fc00061da7). Each dataset consists of PBMCs from both healthy donors and donors diagnosed with COVID-19. Using the harmonized annotations, we demonstrate how to prepare a pseudobulk object to perform differential expression analysis across disease within cell types. + +Prior to running this vignette, please [install Seurat v5](install.html) and see the [BPCells vignette](seurat5_bpcells_interaction_vignette.html) to construct the on-disk object used in this vignette. Additionally, we map to our annotated CITE-seq reference containing 162,000 cells and 228 antibodies ([Hao*, Hao*, et al., Cell 2021](https://doi.org/10.1016/j.cell.2021.04.048)) which is available for download [here](https://zenodo.org/record/7779017#.ZCMojezMJqs). + +## Load the PBMC Reference Dataset and Query Datasets +We first load the reference (available [here](https://zenodo.org/record/7779017#.ZCMojezMJqs)) and normalize the query Seurat object prepared in the [BPCells interaction vignette](seurat5_bpcells_interaction_vignette.html). The query object consists of datasets from three different studies constructed using the `CreateSeuratObject` function, which accepts a list of BPCells matrices as input. Within the Seurat object, the three datasets reside in the `RNA` assay in three separate `layers` on-disk. + + +```{r load.data} +reference <- readRDS("/brahms/hartmana/vignette_data/pbmc_multimodal_2023.rds") +object <- readRDS("/brahms/mollag/seurat_v5/vignette_data/merged_covid_object.rds") +object <- NormalizeData(object, verbose = FALSE) +``` + +## Mapping +Using the same code from the [v4 reference mapping vignette](articles/multimodal_reference_mapping.html), we find anchors between the reference and query in the precomputed supervised PCA. We recommend the use of supervised PCA for CITE-seq reference datasets, and demonstrate how to compute this transformation in [v4 reference mapping vignette](articles/multimodal_reference_mapping.html). In Seurat v5, we only need to call `FindTransferAnchors` and `MapQuery` once to map all three datasets as they are all contained within the query object. Furthermore, utilizing the on-disk capabilities of [BPCells](https://github.com/bnprks/BPCells), we map 1.5 million cells without ever loading them all into memory. + +```{r} +anchor <- FindTransferAnchors( + reference = reference, + query = object, + reference.reduction = 'spca', + normalization.method = 'SCT', + dims = 1:50) +object <- MapQuery( + anchorset = anchor, + query = object, + reference = reference, + refdata = list( + celltype.l1 = "celltype.l1", + celltype.l2 = "celltype.l2" + ), + reduction.model = 'wnn.umap' +) +``` + +## Explore the mapping results +Next, we visualize all cells from the three studies which have been projected into a UMAP-space defined by the reference. Each cell is annotated at two levels of granularity (`predicted.celltype.l1` and `predicted.celltype.l2`). We can compare the differing ontologies used in the original annotations (`cell_type`) to the now harmonized annotations (`predicted.celltype.l2`, for example) that were predicted from reference-mapping. Previously, the lack of standardization prevented us from directly performing integrative analysis across studies, but now we can easily compare. + +```{r, fig.width=10, fig.height=6} +DimPlot(object, reduction = 'ref.umap', group.by = 'cell_type',alpha = 0.1, label = TRUE, split.by = 'publication', ncol = 3, label.size = 3) + NoLegend() +``` +```{r, fig.width=10, fig.height=6} +DimPlot(object, reduction = 'ref.umap', group.by = 'predicted.celltype.l2',alpha = 0.1, label = TRUE, split.by = 'publication', ncol = 3, label.size = 3) + NoLegend() +``` + +## Differential composition analysis +We utilize our harmonized annotations to identify differences in the proportion of different cell types between healthy individuals and COVID-19 patients. For example, we noticed a reduction in MAIT cells as well as an increase in plasmablasts among COVID-19 patients. + +```{r} +df_comp <- as.data.frame.matrix(table(object$donor_id, object$predicted.celltype.l2)) +select.donors <- rownames(df_comp)[rowSums(df_comp)> 50] +df_comp <- df_comp[select.donors, ] +df_comp_relative <- sweep(x = df_comp, MARGIN = 1, STATS = rowSums(df_comp), FUN = '/') + +df_disease <- as.data.frame.matrix(table(object$donor_id, object$disease))[select.donors, ] + +df_comp_relative$disease <- 'other' +df_comp_relative$disease[df_disease$normal!=0] <- 'normal' +df_comp_relative$disease[df_disease$`COVID-19`!=0] <- 'COVID-19' +df_comp_relative$disease <- factor(df_comp_relative$disease, levels = c('normal','COVID-19','other')) +df_comp_relative <- df_comp_relative[df_comp_relative$disease %in% c('normal','COVID-19'),] +``` + +```{r, fig.width=10, fig.height=4} +p1 <- ggplot(data = df_comp_relative, mapping = aes(x = disease, y = MAIT, fill = disease)) + + geom_boxplot(outlier.shape = NA) + + scale_fill_manual(values = c("#377eb8", "#e41a1c")) + + xlab("") + ylab('relative abundance') + + ggtitle('MAIT') + + geom_jitter(color="black", size=0.4, alpha=0.9 ) + + theme_bw() + + theme( axis.title = element_text(size = 12), + axis.text = element_text(size = 12), + plot.title = element_text(size = 15, hjust = 0.5, face = "bold") + ) + +p2 <- ggplot(data = df_comp_relative, mapping = aes(x = disease, y = Plasmablast, fill = disease)) + + geom_boxplot(outlier.shape = NA) + + scale_fill_manual(values = c("#377eb8", "#e41a1c")) + + xlab("") + ylab('relative abundance') + + ggtitle('Plasmablast') + + geom_jitter(color="black", size=0.4, alpha=0.9 ) + + theme_bw() + + theme( axis.title = element_text(size = 12), + axis.text = element_text(size = 12), + plot.title = element_text(size = 15, hjust = 0.5, face = "bold") + ) + +p1 + p2 + plot_layout(ncol = 2) +``` +```{r, include=FALSE} +#saveRDS(object, '/home/lis/seurat-private/output/covid_object.rds') +``` +## Differential expression analysis +In addition to composition analysis, we use an aggregation-based (pseudobulk) workflow to explore differential genes between healthy individuals and COVID-19 donors. We aggregate all cells within the same cell type and donor using the `AggregateExpression` function. This returns a Seurat object where each ‘cell’ represents the pseudobulk profile of one cell type in one individual. + +```{r} +bulk <- AggregateExpression(object, + return.seurat = TRUE, + assays = 'RNA', + group.by = c("predicted.celltype.l2", "donor_id", "disease") +) +``` + +```{r} +bulk <- subset(bulk, subset = disease %in% c('normal', 'COVID-19') ) +bulk <- subset(bulk, subset = predicted.celltype.l2 != 'Doublet') +bulk$disease <- factor(bulk$disease, levels = c('normal', 'COVID-19')) +``` + +Once a pseudobulk object is created, we can perform cell type-specific differential expression analysis between healthy individuals and COVID-19 donors. Here, we only visualize certain interferon-stimulated genes which are often upregulated during viral infection. + +```{r, fig.width=10, fig.height=12} +p1 <- VlnPlot( + object = bulk, features = 'IFI6', group.by = 'predicted.celltype.l2', + split.by = 'disease', cols = c("#377eb8", "#e41a1c")) +p2 <- VlnPlot( + object = bulk, features = c('ISG15'), group.by = 'predicted.celltype.l2', + split.by = 'disease', cols = c("#377eb8", "#e41a1c")) +p3 <- VlnPlot( + object = bulk, features = c('IFIT5'), group.by = 'predicted.celltype.l2', + split.by = 'disease', cols = c("#377eb8", "#e41a1c")) +p1 + p2 + p3 + plot_layout(ncol = 1) +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/ParseBio_sketch_integration.Rmd b/vignettes/ParseBio_sketch_integration.Rmd new file mode 100644 index 000000000..07b4df155 --- /dev/null +++ b/vignettes/ParseBio_sketch_integration.Rmd @@ -0,0 +1,178 @@ +--- +title: "Sketch integration using a 1 million cell dataset from Parse Biosciences" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + fig.width = 10, + time_it = TRUE, + error = TRUE +) +``` + +The recent increase in publicly available single-cell datasets poses a significant challenge for integrative analysis. For example, multiple tissues have now been profiled across dozens of studies, representing hundreds of individuals and millions of cells. In [Hao et al, 2023](https://www.nature.com/articles/s41587-023-01767-y) proposed a dictionary learning based method, atomic sketch integration, could also enable efficient and large-scale integrative analysis. Our procedure enables the integration of large compendiums of datasets without ever needing to load the full scale of data into memory. In [our manuscript](https://www.nature.com/articles/s41587-023-01767-y) we use atomic sketch integration to integrate millions of scRNA-seq from human lung and human PBMC. + +In this vignette, we demonstrate how to use atomic sketch integration to harmonize scRNA-seq experiments 1M cells, though we have used this procedure to integrate datasets of 10M+ cells as well. We analyze a dataset from Parse Biosciences, in which PBMC from 24 human samples (12 healthy donors, 12 Type-1 diabetes donors), which is available [here](https://cdn.parsebiosciences.com/1M_PBMC_T1D_Parse.zip). + +* Sample a representative subset of cells ('atoms') from each dataset +* Integrate the atoms from each dataset, and define a set of cell states +* Reconstruct (integrate) the full datasets, based on the atoms +* Annotate all cells in the full datasets +* Identify cell-type specific differences between healthy and diabetic patients + +Prior to running this vignette, please [install Seurat v5](install.html), as well as the [BPCells](https://github.com/bnprks/BPCells) package, which we use for on-disk storage. You can read more about using BPCells in Seurat v5 [here](seurat5_bpcells_interaction_vignette.html). We also recommend reading the [Sketch-based analysis in Seurat v5](seurat5_sketch_analysis.html) vignette, which introduces the concept of on-disk and in-memory storage in Seurat v5. +```{r, warning=F, message=F} +library(Seurat) +library(BPCells) +library(dplyr) +library(ggplot2) +library(ggrepel) +library(patchwork) +# set this option when analyzing large datasets +options(future.globals.maxSize = 3e9) +``` +## Create a Seurat object containing data from 24 patients +We downloaded the original dataset and donor metadata from [Parse Biosciences](https://cdn.parsebiosciences.com/1M_PBMC_T1D_Parse.zip). While the BPCells package can work directly with h5ad files, for optimal performance, we converted the dataset to the compressed sparse format used by BPCells, as described [here](seurat5_bpcells_interaction_vignette.html). +We create a Seurat object for this dataset. Since the input to `CreateSeuratObject` is a BPCells matrix, the data remains on-disk and is not loaded into memory. After creating the object, we split the dataset into 24 [layers](seurat5_essential_commands.html), one for each sample (i.e. patient), to facilitate integration. +```{r, warning=F, message=F} +parse.mat <- open_matrix_dir(dir = "/brahms/hartmana/vignette_data/bpcells/parse_1m_pbmc") +# need to move +metadata <- readRDS("/brahms/haoy/vignette_data/ParseBio_PBMC_meta.rds") +object <- CreateSeuratObject(counts = parse.mat, meta.data = metadata) + +object <- NormalizeData(object) +# split assay into 24 layers +object[['RNA']] <- split(object[['RNA']], f = object$sample) +object <- FindVariableFeatures(object, verbose = FALSE) +``` +## Sample representative cells from each dataset +Inspired by pioneering work aiming to identify ['sketches'](https://www.sciencedirect.com/science/article/pii/S2405471219301528) of scRNA-seq data, our first step is to sample a representative set of cells from each dataset. We compute a leverage score (estimate of ['statistical leverage'](https://arxiv.org/abs/1109.3843)) for each cell, which helps to identify cells that are likely to be member of rare subpopulations and ensure that these are included in our representative sample. Importantly, the estimation of leverage scores only requires data normalization, can be computed efficiently for sparse datasets, and does not require any intensive computation or dimensional reduction steps. +We load each object separately, perform basic preprocessing (normalization and variable feature selection), and select and store 5,000 representative cells from each dataset. Since there are 24 datasets, the sketched dataset now contains 120,000 cells. These cells are stored in a new `sketch` assay, and are loaded in-memory. +```{r, warning=F, message=F} +object <- SketchData(object = object, ncells = 5000, method = 'LeverageScore', sketched.assay = 'sketch') +object +``` + +## Perform integration on the sketched cells across samples +Next we perform integrative analysis on the 'atoms' from each of the datasets. Here, we perform integration using the streamlined [Seurat v5 integration worfklow](seurat5_integration.html), and utilize the reference-based `RPCAIntegration` method. The function performs all corrections in low-dimensional space (rather than on the expression values themselves) to further improve speed and memory usage, and outputs a merged Seurat object where all cells have been placed in an integrated low-dimensional space (stored as `integrated.rpca`). +However, we emphasize that you can perform integration here using any analysis technique that places cells across datasets into a shared space. This includes CCA Integration, Harmony, and scVI. We demonstrate how to use these tools in Seurat v5 [here](seurat5_integration.html). +```{r} +DefaultAssay(object) <- 'sketch' +object <- FindVariableFeatures(object, verbose = F) +object <- ScaleData(object, verbose = F) +object <- RunPCA(object, verbose = F) +# integrate the datasets +object <- IntegrateLayers(object, method = RPCAIntegration, orig = 'pca', + new.reduction = 'integrated.rpca', dims = 1:30, k.anchor = 20, + reference = which(Layers(object, search = 'data') %in% c( 'data.H_3060')), + verbose = F) +# cluster the integrated data +object <- FindNeighbors(object, reduction = 'integrated.rpca', dims = 1:30) +object <- FindClusters(object, resolution = 2) +object <- RunUMAP(object, reduction = 'integrated.rpca', dims = 1:30, return.model = T, verbose = F) +``` + +```{r} +# you can now rejoin the layers in the sketched assay +# this is required to perform differential expression +object[['sketch']] <- JoinLayers(object[['sketch']]) +c10_markers <- FindMarkers(object = object, ident.1 = 10, max.cells.per.ident = 500, only.pos = TRUE) +head(c10_markers) + +# You can now annotate clusters using marker genes. +# We performed this step, and include the results in the 'sketch.celltype' metadata column + +plot.s1 <- DimPlot(object, group.by = 'sample', reduction = 'umap') +plot.s2 <- DimPlot(object, group.by = 'celltype.manual', reduction = 'umap') +``` + +```{r, fig.width=10, fig.height=10} +plot.s1 + plot.s2 + plot_layout(ncol = 1) +``` + +## Integrate the full datasets +Now that we have integrated the subset of atoms of each dataset, placing them each in an integrated low-dimensional space, we can now place each cell from each dataset in this space as well. We load the full datasets back in individually, and use the `ProjectIntegration` function to integrate all cells. After this function is run, the `integrated.rpca.full` space now embeds all cells in the dataset.Even though all cells in the dataset have been integrated together, the non-sketched cells are not loaded into memory. Users can still switch between the `sketch` (sketched cells, in-memory) and `RNA` (full dataset, on disk) for analysis. After integration, we can also project cell type labels from the sketched cells onto the full dataset using `ProjectData`. + +```{r} +# resplit the sketched cell assay into layers +# this is required to project the integration onto all cells +object[['sketch']] <- split(object[['sketch']], f = object$sample) + +object <- ProjectIntegration(object = object, + sketched.assay = 'sketch', + assay = 'RNA', + reduction = 'integrated.rpca' + ) + + +object <- ProjectData(object = object, + sketched.assay = 'sketch', + assay = 'RNA', + sketched.reduction = 'integrated.rpca.full', + full.reduction = 'integrated.rpca.full', + dims = 1:30, + refdata = list(celltype.full = 'celltype.manual') + ) + +``` + +```{r} +object <- RunUMAP(object, reduction = 'integrated.rpca.full', dims = 1:30 , reduction.name = 'umap.full', reduction.key = 'UMAP_full_') +``` + +```{r, fig.width=10, fig.height=10, eval = FALSE} +p1 <- DimPlot(object, reduction = 'umap.full', group.by = 'sample',alpha = 0.1) +p2 <- DimPlot(object, reduction = 'umap.full', group.by = 'celltype.full', alpha = 0.1) +p1 + p2 + plot_layout(ncol = 1) +``` + +## Compare healthy and diabetic samples + +By integrating all samples together, we can now compare healthy and diabetic cells in matched cell states. To maximize statistical power, we want to use all cells - not just the sketched cells - to perform this analysis. As recommended by [Soneson et all.](https://www.nature.com/articles/nmeth.4612) and [Crowell et al.](https://www.nature.com/articles/s41467-020-19894-4), we use an aggregation-based (pseudobulk) workflow. We aggregate all cells within the same cell type and sample using the `AggregateExpression` function. This returns a Seurat object where each 'cell' represents the pseudobulk profile of one cell type in one individual. + +After we aggregate cells, we can perform celltype-specific differential expression between healthy and diabetic samples using DESeq2. We demonstrate this for CD14 monocytes. + +```{r} +bulk <- AggregateExpression(object, return.seurat = T, slot = 'counts', + assays = 'RNA', group.by = c("celltype.full","sample", 'disease')) +``` +```{r} +# each sample is an individual-specific celltype-specific pseudobulk profile +tail(Cells(bulk)) + +cd14.bulk <- subset(bulk,celltype.full == "CD14 Mono") +Idents(cd14.bulk) <- 'disease' +de_markers <- FindMarkers(cd14.bulk, ident.1 = 'D',ident.2 = 'H', slot = 'counts', test.use = 'DESeq2', verbose = F ) +de_markers$gene <- rownames(de_markers) +ggplot(de_markers, aes(avg_log2FC, -log10(p_val))) + geom_point(size=0.5, alpha=0.5) + theme_bw() + ylab("-log10(unadjusted p-value)")+geom_text_repel(aes(label = ifelse(p_val_adj<0.01, gene, "")),colour = 'red', size = 3) + +``` + +
+ **Session Info** +```{r, eval = TRUE} +sessionInfo() +``` +
diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd new file mode 100644 index 000000000..e6c0c8e13 --- /dev/null +++ b/vignettes/announcements.Rmd @@ -0,0 +1,50 @@ +--- +title: "News" +output: + html_document: + theme: united + df_print: kable +--- + +## **Changes in Seurat v5** + +We are excited to release Seurat v5 on CRAN, where it is now the default version for new installs. Seurat v5 is designed to be backwards compatible with Seurat v4 so existing code will continue to run, but we have made some changes to the software that will affect user results. We note that users who aim to reproduce their previous workflows in Seurat v4 can still install this version using the instructions on our [install page](link). + +In particular, we have made changes to: + +* **Seurat Object and Assay class:** +\ +Seurat v5 now includes support for additional assay and data types, including on-disk matrices. To facilitate this, we have introduced an updated Seurat v5 assay. Users can check out this [vignette for more information]. Briefly, Seurat v5 assays store data in layers (previously referred to as 'slots'). + + For example, these layers can store: raw counts `(layer='counts')`, normalized data `(layer='data')`, or z-scored/variance-stabilized data `(layer='scale.data')`. + + Data can be accessed using the `$` accessor (i.e. `obj[["RNA"]]$counts`), or the ``LayerData` function (i.e. `LayerData(obj, assay="RNA", layer='counts')`. +\ + + We've designed these updates to minimize changes for users. Existing Seurat functions and workflows from v4 continue to work in v5. For example, the command `GetAssayData(obj, assay="RNA", slot='counts')`, will run successfully in both Seurat v4 and Seurat v5. + + +* **Integration workflow:** +\ +Seurat v5 introduces a [streamlined integration](integration_introduction.html) and [data transfer](intregration_mapping.html) workflows that performs integration in low-dimensional space, and improves speed and memory efficiency. The results of integration are not identical between the two workflows, but users can still run the [v4 integration workflow](integration_introduction.html) in Seurat v5 if they wish. +\ + + In previous versions of Seurat, the integration workflow required a list of multiple Seurat objects as input. In Seurat v5, all the data can be kept as a single object, but prior to integration, users can simply split the layers. See our [introduction to integration](integration_introduction.html) vignette for more information. + + +* **Differential expression:** +\ +Seurat v5 now uses the [presto package](https://github.com/immunogenomics/presto) (from the Korunsky and Raychaudhari labs), when available, to perform differential expression analysis. Using presto can dramatically speed up DE testing, and we encourage users to install it. +\ +In addition, in Seurat v5 we implement a pseudocount (when calculating log-FC) at the group level instead of the cell level. As a result, users will observe higher logFC estimates in v5 - but should note that these estimates may be more unstable - particularly for genes that are very lowly expressed in one of the two groups. We gratefully acknowledge feedback from the McCarthy and Pachter labs on this topic. + +* **SCTransform v2:** +\ +In [Choudhary and Satija, Genome Biology, 2022](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9), we implement an updated version 2 of sctransform. This is now the default version when running `SCTransform` in Seurat v5. Users who wish to run the previous workflow can set the `vst.flavor = "v1"` argument in the `SCTransform` function. +\ +\ +* **Pseudobulk analysis:** +\ +Once a single-cell dataset has been analyzed to annotate cell subpopulations, pseudobulk analyses (i.e. aggregating together cells within a given subpopulation and sample) can reduce noise, improve quantification of lowly expressed genes, and reduce the size of the data matrix. In Seurat v5, we encourage the use of the `AggregateExpression` function to perform pseudobulk analysis. +\ +Check out our [differential expression vignette](de_vignette.html) as well as our [pancreatic/healthy PBMC comparison](ParseBio_sketch_integration.html), for examples of how to use `AggregateExpression` to perform robust differential expression of scRNA-seq data from multiple different conditions. \ No newline at end of file diff --git a/vignettes/archive.yaml b/vignettes/archive.yaml index 868c8e1dc..790109365 100644 --- a/vignettes/archive.yaml +++ b/vignettes/archive.yaml @@ -1,25 +1,25 @@ vignettes: - name: atacseq_integration_vignette title: PBMC scATAC-seq Vignette - versions: [v3.2, v3.1, v3.0] + versions: [v4.3, v3.2, v3.1, v3.0] - name: cell_cycle_vignette title: Cell-Cycle Scoring and Regression - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: conversion_vignette title: Interoperability between single-cell object formats - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v4.3, v3.2, v3.1, v3.0, v2.4] - name: de_vignette title: Differential expression testing - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: dim_reduction_vignette title: Seurat - Dimensional Reduction Vignette - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: future_vignette title: Parallelization in Seurat with future - versions: [v3.2, v3.1, v3.0] + versions: [v4.3, v3.2, v3.1, v3.0] - name: hashing_vignette title: Demultiplexing with hashtag oligos (HTOs) - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: immune_alignment title: "Tutorial: Integrating stimulated vs. control PBMC datasets to learn cell- type specific responses" @@ -27,29 +27,47 @@ type specific responses" - name: integration title: Integration and Label Transfer versions: [v3.2, v3.1, v3.0] + - name: integration_mapping + title: Mapping and annotating query datasets + versions: [v4.3] + - name: integration_large_datasets + title: Tips for integrating large datasets + versions: [v4.3] + - name: integration_rpca + title: Fast integration using reciprocal PCA (RPCA) + versions: [v4.3] - name: interaction_vignette title: Seurat - Interaction Tips - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: mca title: Guided Clustering of the Microwell-seq Mouse Cell Atlas versions: [v3.2, v3.1, v3.0, v2.4] + - name: merge + title: Seurat - Combining Two 10X Runs + versions: [v4.3] - name: mixscape title: Mixscape Vignette - versions: [v3.2, v3.1] + versions: [v4.3, v3.2, v3.1] + - name: multimodal_reference_mapping + title: Multimodal reference mapping + versions: [v4.3] - name: multimodal_vignette title: Using Seurat with multi-modal data - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: pbmc3k_tutorial title: Guided tutorial --- 2,700 PBMCs - versions: [v3.2, v3.1, v3.0, v2.4, v1.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4, v1.4] - name: sctransform_vignette title: Using sctransform in Seurat - versions: [v3.2, v3.1, v3.0] + versions: [v4.3, v3.2, v3.1, v3.0] - name: spatial_vignette title: Analysis, visualization, and integration of spatial datasets with Seurat - versions: [v3.2] + versions: [v4.3, v3.2] - name: visualization_vignette title: New data visualization methods in v3.0 - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] + - name: weighted_nearest_neighbor_analysis + title: Weighted Nearest Neighbor Analysis + versions: [v4.3] diff --git a/vignettes/atacseq_integration_vignette.Rmd b/vignettes/atacseq_integration_vignette.Rmd index 59f391a69..bbe70ac34 100644 --- a/vignettes/atacseq_integration_vignette.Rmd +++ b/vignettes/atacseq_integration_vignette.Rmd @@ -26,7 +26,8 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) options(SeuratData.repo.use = 'satijalab04.nygenome.org') ``` @@ -100,7 +101,7 @@ p2 <- DimPlot(pbmc.atac, group.by = 'orig.ident', label = FALSE) + NoLegend() + p1 + p2 ``` -```{r save.img, include = FALSE} +```{r save.img, include = TRUE} plot <- (p1 + p2) & xlab("UMAP 1") & ylab("UMAP 2") & theme(axis.title = element_text(size = 18)) diff --git a/vignettes/cell_cycle_vignette.Rmd b/vignettes/cell_cycle_vignette.Rmd index d74c5c0c6..bb3340f62 100644 --- a/vignettes/cell_cycle_vignette.Rmd +++ b/vignettes/cell_cycle_vignette.Rmd @@ -25,7 +25,7 @@ knitr::opts_chunk$set( tidy = TRUE, tidy.opts = list(width.cutoff = 95), warning = FALSE, - error = FALSE, + error = TRUE, message = FALSE, fig.width = 8, time_it = TRUE @@ -39,7 +39,7 @@ library(Seurat) # Read in the expression matrix # The first row is a header row, the first column is rownames -exp.mat <- read.table(file = "../data/nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE, as.is = TRUE, row.names = 1) +exp.mat <- read.table(file = "/brahms/shared/vignette-data/cell_cycle_vignette_files/nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE, as.is = TRUE, row.names = 1) # A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. # We can segregate this list into markers of G2/M phase and markers of S phase @@ -47,7 +47,7 @@ s.genes <- cc.genes$s.genes g2m.genes <- cc.genes$g2m.genes # Create our Seurat object and complete the initalization steps -marrow <- CreateSeuratObject(counts = exp.mat) +marrow <- CreateSeuratObject(counts = Matrix::Matrix(as.matrix(exp.mat),sparse = T)) marrow <- NormalizeData(marrow) marrow <- FindVariableFeatures(marrow, selection.method = 'vst') marrow <- ScaleData(marrow, features = rownames(marrow)) @@ -81,7 +81,7 @@ marrow <- RunPCA(marrow, features = c(s.genes, g2m.genes)) DimPlot(marrow) ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} library(ggplot2) plot <- DimPlot(marrow) + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + diff --git a/vignettes/conversion_vignette.Rmd b/vignettes/conversion_vignette.Rmd index ac4917b27..7b9af30c9 100644 --- a/vignettes/conversion_vignette.Rmd +++ b/vignettes/conversion_vignette.Rmd @@ -27,7 +27,8 @@ knitr::opts_chunk$set( fig.width = 10, message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` diff --git a/vignettes/de_vignette.Rmd b/vignettes/de_vignette.Rmd index 125c0977a..eb350de5a 100644 --- a/vignettes/de_vignette.Rmd +++ b/vignettes/de_vignette.Rmd @@ -8,135 +8,189 @@ output: date: 'Compiled: `r Sys.Date()`' --- -```{r setup, include=FALSE} -all_times <- list() # store the time for each chunk -knitr::knit_hooks$set(time_it = local({ - now <- NULL - function(before, options) { - if (before) { - now <<- Sys.time() - } else { - res <- difftime(Sys.time(), now, units = "secs") - all_times[[options$label]] <<- res - } - } -})) -knitr::opts_chunk$set( - tidy = TRUE, - tidy.opts = list(width.cutoff = 95), - message = FALSE, - warning = FALSE, - time_it = TRUE -) -``` - -```{r, include = FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") -``` - -# Load in the data +### Load in the data -This vignette highlights some example workflows for performing differential expression in Seurat. For demonstration purposes, we will be using the 2,700 PBMC object that is available via the [SeuratData](https://github.com/satijalab/seurat-data) package). +This vignette highlights some example workflows for performing differential expression in Seurat. For demonstration purposes, we will be using the interferon-beta stimulated human PBMCs dataset ([ifnb](https://www.nature.com/articles/nbt.4042)) that is available via the [SeuratData](https://github.com/satijalab/seurat-data) package. -```{r load_data} +```{r echo=TRUE, message=FALSE, warning=FALSE} library(Seurat) library(SeuratData) -pbmc <- LoadData("pbmc3k", type = "pbmc3k.final") +library(ggplot2) +ifnb <- LoadData("ifnb") ``` -# Perform default differential expression tests +### Perform default differential expression tests -The bulk of Seurat's differential expression features can be accessed through the `FindMarkers()` function. As a default, Seurat performs differential expression based on the non-parametric Wilcoxon rank sum test. This replaces the previous default test ('bimod'). To test for differential expression between two specific groups of cells, specify the `ident.1` and `ident.2` parameters. +The bulk of Seurat’s differential expression features can be accessed through the `FindMarkers()` function. By default, Seurat performs differential expression (DE) testing based on the non-parametric Wilcoxon rank sum test. To test for DE genes between two specific groups of cells, specify the `ident.1` and `ident.2` parameters. -```{r basic_de} -# list options for groups to perform differential expression on -levels(pbmc) -# Find differentially expressed features between CD14+ and FCGR3A+ Monocytes -monocyte.de.markers <- FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono") +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# Normalize the data +ifnb <- NormalizeData(ifnb) + +# Find DE features between CD16 Mono and CD1 Mono +Idents(ifnb) <- "seurat_annotations" +monocyte.de.markers <- FindMarkers(ifnb, ident.1 = "CD16 Mono", ident.2 = "CD14 Mono") # view results head(monocyte.de.markers) ``` -The results data frame has the following columns : +The results data frame has the following columns : - * p_val : p_val (unadjusted) - * avg_log2FC : log fold-change of the average expression between the two groups. Positive values indicate that the feature is more highly expressed in the first group. - * pct.1 : The percentage of cells where the feature is detected in the first group - * pct.2 : The percentage of cells where the feature is detected in the second group - * p_val_adj : Adjusted p-value, based on Bonferroni correction using all features in the dataset. +* p_val : p-value (unadjusted) +* avg_log2FC : log fold-change of the average expression between the two groups. Positive values indicate that the feature is more highly expressed in the first group. +* pct.1 : The percentage of cells where the feature is detected in the first group +* pct.2 : The percentage of cells where the feature is detected in the second group +* p_val_adj : Adjusted p-value, based on Bonferroni correction using all features in the dataset. -If the `ident.2` parameter is omitted or set to NULL, `FindMarkers()` will test for differentially expressed features between the group specified by `ident.1` and all other cells. +If the `ident.2` parameter is omitted or set to `NULL`, `FindMarkers()` will test for differentially expressed features between the group specified by `ident.1` and all other cells. Additionally, the parameter `only.pos` can be set to `TRUE` to only search for positive markers, i.e. features that are more highly expressed in the `ident.1` group. -```{r basic_de_2} -# Find differentially expressed features between CD14+ Monocytes and all other cells, only search for positive markers -monocyte.de.markers <- FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = NULL, only.pos = TRUE) +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# Find differentially expressed features between CD14+ Monocytes and all other cells, only +# search for positive markers +monocyte.de.markers <- FindMarkers(ifnb, ident.1 = "CD16 Mono", ident.2 = NULL, only.pos = TRUE) # view results head(monocyte.de.markers) ``` -# Prefilter features or cells to increase the speed of DE testing +### Perform DE analysis within the same cell type across conditions + +Since this dataset contains treatment information (control versus stimulated with interferon-beta), we can also ask what genes change in different conditions for cells of the same type. First, we create a column in the `meta.data` slot to hold both the cell type and treatment information and switch the current `Idents` to that column. Then we use `FindMarkers()` to find the genes that are different between control and stimulated CD14 monocytes. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +ifnb$celltype.stim <- paste(ifnb$seurat_annotations, ifnb$stim, sep = "_") +Idents(ifnb) <- "celltype.stim" +mono.de <- FindMarkers(ifnb, ident.1 = "CD14 Mono_STIM", ident.2 = "CD14 Mono_CTRL", verbose = FALSE) +head(mono.de, n = 10) +``` -To increase the speed of marker discovery, particularly for large datasets, Seurat allows for pre-filtering of features or cells. For example, features that are very infrequently detected in either group of cells, or features that are expressed at similar average levels, are unlikely to be differentially expressed. Example use cases of the `min.pct`, `logfc.threshold`, `min.diff.pct`, and `max.cells.per.ident` parameters are demonstrated below. +However, the p-values obtained from this analysis should be interpreted with caution, because these tests treat each cell as an independent replicate and ignore inherent correlations between cells originating from the same sample. Such analyses have been shown to find a large number of false positive associations, as has been demonstrated by [Squair et al., 2021](https://www.nature.com/articles/s41467-021-25960-2), [Zimmerman et al., 2021](https://www.nature.com/articles/s41467-021-21038-1), [Junttila et al., 2022](https://academic.oup.com/bib/article/23/5/bbac286/6649780), and others. Below, we show how pseudobulking can be used to account for such within-sample correlation. -```{r prefilter} -# Pre-filter features that are detected at <50% frequency in either CD14+ Monocytes or FCGR3A+ Monocytes -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", min.pct = 0.5)) +#### Perform DE analysis after pseudobulking -# Pre-filter features that have less than a two-fold change between the average expression of CD14+ Monocytes vs FCGR3A+ Monocytes -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", logfc.threshold = log(2))) +To pseudobulk, we will use [AggregateExpression()](https://satijalab.org/seurat/reference/aggregateexpression) to sum together gene counts of all the cells from the same sample for each cell type. This results in one gene expression profile per sample and cell type. We can then perform DE analysis using [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) on the sample level. This treats the samples, rather than the individual cells, as independent observations. -# Pre-filter features whose detection percentages across the two groups are similar (within 0.25) -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", min.diff.pct = 0.25)) +First, we need to retrieve the sample information for each cell. This is not loaded in the metadata, so we will load it from the [Github repo](https://github.com/yelabucsf/demuxlet_paper_code/tree/master/) of the source data for the original paper. -# Increasing min.pct, logfc.threshold, and min.diff.pct, will increase the speed of DE testing, but could also miss features that are prefiltered +
**Add sample information to the dataset** -# Subsample each group to a maximum of 200 cells. Can be very useful for large clusters, or computationally-intensive DE tests -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", max.cells.per.ident = 200)) +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# load the inferred sample IDs of each cell +ctrl <- read.table(url("https://raw.githubusercontent.com/yelabucsf/demuxlet_paper_code/master/fig3/ye1.ctrl.8.10.sm.best"), head = T, stringsAsFactors = F) +stim <- read.table(url("https://raw.githubusercontent.com/yelabucsf/demuxlet_paper_code/master/fig3/ye2.stim.8.10.sm.best"), head = T, stringsAsFactors = F) +info <- rbind(ctrl, stim) + +# rename the cell IDs by substituting the '-' into '.' +info$BARCODE <- gsub(pattern = "\\-", replacement = "\\.", info$BARCODE) + +# only keep the cells with high-confidence sample ID +info <- info[grep(pattern = "SNG", x = info$BEST), ] + +# remove cells with duplicated IDs in both ctrl and stim groups +info <- info[!duplicated(info$BARCODE) & !duplicated(info$BARCODE, fromLast = T), ] + +# now add the sample IDs to ifnb +rownames(info) <- info$BARCODE +info <- info[, c("BEST"), drop = F] +names(info) <- c("donor_id") +ifnb <- AddMetaData(ifnb, metadata = info) + +# remove cells without donor IDs +ifnb$donor_id[is.na(ifnb$donor_id)] <- "unknown" +ifnb <- subset(ifnb, subset = donor_id != "unknown") ``` +
+\ + +We can now perform pseudobulking (`AggregateExpression()`) based on the donor IDs. -# Perform DE analysis using alternative tests - -The following differential expression tests are currently supported: - - * "wilcox" : Wilcoxon rank sum test (default) - * "bimod" : Likelihood-ratio test for single cell feature expression, [(McDavid et al., Bioinformatics, 2013)](https://www.ncbi.nlm.nih.gov/pubmed/23267174) - * "roc" : Standard AUC classifier - * "t" : Student's t-test - * "poisson" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets - * "negbinom" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets - * "LR" : Uses a logistic regression framework to determine differentially expressed genes. Constructs a logistic regression model predicting group membership based on each feature individually and compares this to a null model with a likelihood ratio test. - * "MAST" : GLM-framework that treates cellular detection rate as a covariate [(Finak et al, Genome Biology, 2015)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4676162/) ([Installation instructions](https://github.com/RGLab/MAST)) - * "DESeq2" : DE based on a model using the negative binomial distribution [(Love et al, Genome Biology, 2014)](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) ([Installation instructions](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)) - -For MAST and DESeq2, please ensure that these packages are installed separately in order to use them as part of Seurat. Once installed, use the `test.use` parameter can be used to specify which DE test to use. - -```{r include = FALSE} -# necessary to get MAST to work properly -library(SingleCellExperiment) +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# pseudobulk the counts based on donor-condition-celltype +pseudo_ifnb <- AggregateExpression(ifnb, assays = "RNA", return.seurat = T, group.by = c("stim", "donor_id", "seurat_annotations")) + +# each 'cell' is a donor-condition-celltype pseudobulk profile +tail(Cells(pseudo_ifnb)) +pseudo_ifnb$celltype.stim <- paste(pseudo_ifnb$seurat_annotations, pseudo_ifnb$stim, sep = "_") ``` - -```{r multiple test} -# Test for DE features using the MAST package -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", test.use = "MAST")) -# Test for DE features using the DESeq2 package. Throws an error if DESeq2 has not already been installed -# Note that the DESeq2 workflows can be computationally intensive for large datasets, but are incompatible with some feature pre-filtering options -# We therefore suggest initially limiting the number of cells used for testing -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", test.use = "DESeq2", max.cells.per.ident = 50)) +Next, we perform DE testing on the pseudobulk level for CD14 monocytes, and compare it against the previous single-cell-level DE results. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +Idents(pseudo_ifnb) <- "celltype.stim" + +bulk.mono.de <- FindMarkers(object = pseudo_ifnb, + ident.1 = "CD14 Mono_STIM", + ident.2 = "CD14 Mono_CTRL", + test.use = "DESeq2") +head(bulk.mono.de, n = 15) + +# compare the DE P-values between the single-cell level and the pseudobulk level results +names(bulk.mono.de) <- paste0(names(bulk.mono.de), ".bulk") +bulk.mono.de$gene <- rownames(bulk.mono.de) + +names(mono.de) <- paste0(names(mono.de), ".sc") +mono.de$gene <- rownames(mono.de) + +merge_dat <- merge(mono.de, bulk.mono.de, by = "gene") +merge_dat <- merge_dat[order(merge_dat$p_val.bulk), ] + +# Number of genes that are marginally significant in both; marginally significant only in bulk; and marginally significant only in single-cell +common <- merge_dat$gene[which(merge_dat$p_val.bulk < 0.05 & + merge_dat$p_val.sc < 0.05)] +only_sc <- merge_dat$gene[which(merge_dat$p_val.bulk > 0.05 & + merge_dat$p_val.sc < 0.05)] +only_bulk <- merge_dat$gene[which(merge_dat$p_val.bulk < 0.05 & + merge_dat$p_val.sc > 0.05)] +print(paste0('# Common: ',length(common))) +print(paste0('# Only in single-cell: ',length(only_sc))) +print(paste0('# Only in bulk: ',length(only_bulk))) ``` - -# Acknowledgements -We thank the authors of the MAST and DESeq2 packages for their kind assistance and advice. We also point users to the following [study](https://www.nature.com/articles/nmeth.4612) by Charlotte Soneson and Mark Robinson, which performs careful and extensive evaluation of methods for single cell differential expression testing. +We can see that while the p-values are correlated between the single-cell and pseudobulk data, the single-cell p-values are often smaller and suggest higher levels of significance. In particular, there are 3,519 genes with evidence of differential expression (prior to multiple hypothesis testing) in both analyses, 1,649 genes that only appear to be differentially expressed in the single-cell analysis, and just 204 genes that only appear to be differentially expressed in the bulk analysis. We can investigate these discrepancies using `VlnPlot`. + +First, we can examine the top genes that are differentially expressed in both analyses. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# create a new column to annotate sample-condition-celltype in the single-cell dataset +ifnb$donor_id.stim <- paste0(ifnb$stim, "-", ifnb$donor_id) -```{r save.times, include = FALSE} -write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/de_vignette_times.csv") +# generate violin plot +Idents(ifnb) <- "celltype.stim" +print(merge_dat[merge_dat$gene%in%common[1:2],c('gene','p_val.sc','p_val.bulk')]) +VlnPlot(ifnb, features = common[1:2], idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "stim") +VlnPlot(ifnb, features = common[1:2], idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "donor_id.stim", ncol = 1) ``` -
- **Session Info** -```{r} -sessionInfo() +In both the pseudobulk and single-cell analyses, the p-values for these two genes are astronomically small. For both of these genes, when just comparing all stimulated CD4 monocytes to all control CD4 monocytes across samples, we see much higher expression in the stimulated cells. When breaking down these cells by sample, we continue to see consistently higher expression levels in the stimulated samples compared to the control samples; in other words, this finding is not driven by just one or two samples. Because of this consistency, we find this signal in both analyses. + +By contrast, we can examine examples of genes that are only DE under the single-cell analysis. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +print(merge_dat[merge_dat$gene%in%c('SRGN','HLA-DRA'),c('gene','p_val.sc','p_val.bulk')]) +VlnPlot(ifnb, features <- c('SRGN','HLA-DRA'), idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "stim") +VlnPlot(ifnb, features <- c('SRGN','HLA-DRA'), idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "donor_id.stim", ncol = 1) +``` + +Here, SRGN and HLA-DRA both have very small p-values in the single-cell analysis (on the orders of $10^{-21}$ and $10^{-9}$), but much larger p-values around 0.18 in the pseudobulk analysis. While there appears to be a difference between control and simulated cells when ignoring sample information, the signal is much weaker on the sample level, and we can see notable variability from sample to sample. + +### Perform DE analysis using alternative tests + +Finally, we also support many other DE tests using other methods. For completeness, the following tests are currently supported: + +* "wilcox" : Wilcoxon rank sum test (default, using '[presto](https://github.com/immunogenomics/presto)' package) +* "wilcox_limma" : Wilcoxon rank sum test (using '[limma](https://bioconductor.org/packages/release/bioc/html/limma.html)' package) +* "bimod" : Likelihood-ratio test for single cell feature expression, (McDavid et al., Bioinformatics, 2013) +* "roc" : Standard AUC classifier +* "t" : Student’s t-test +* "poisson" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets +* "negbinom" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets +* "LR" : Uses a logistic regression framework to determine differentially expressed genes. Constructs a logistic regression model predicting group membership based on each feature individually and compares this to a null model with a likelihood ratio test. +* "MAST" : GLM-framework that treates cellular detection rate as a covariate (Finak et al, Genome Biology, 2015) (Installation instructions) +* "DESeq2" : DE based on a model using the negative binomial distribution (Love et al, Genome Biology, 2014) (Installation instructions) +For MAST and DESeq2, please ensure that these packages are installed separately in order to use them as part of Seurat. Once installed, use the test.use parameter can be used to specify which DE test to use. + +```{r echo=TRUE, message=FALSE,warning=FALSE,results=TRUE,eval=FALSE} +# Test for DE features using the MAST package +Idents(ifnb) <- "seurat_annotations" +head(FindMarkers(ifnb, ident.1 = "CD14 Mono", ident.2 = "CD16 Mono", test.use = "MAST")) ``` -
diff --git a/vignettes/dim_reduction_vignette.Rmd b/vignettes/dim_reduction_vignette.Rmd index fa66c107e..a13721b77 100644 --- a/vignettes/dim_reduction_vignette.Rmd +++ b/vignettes/dim_reduction_vignette.Rmd @@ -25,7 +25,8 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -96,7 +97,7 @@ FeatureScatter(pbmc, feature1 = "MDS_1", feature2 = "PC_1") ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} library(ggplot2) plot <- DimPlot(pbmc, reduction = "mds", pt.size = 0.5) ggsave(filename = "../output/images/pbmc_mds.jpg", height = 7, width = 12, plot = plot, quality = 50) diff --git a/vignettes/essential_commands.Rmd b/vignettes/essential_commands.Rmd index 40edf0bc1..f33e5d494 100644 --- a/vignettes/essential_commands.Rmd +++ b/vignettes/essential_commands.Rmd @@ -16,158 +16,314 @@ knitr::opts_chunk$set( message = FALSE, warning = FALSE, results = 'hold', - eval = FALSE + eval = FALSE, + error = TRUE ) ``` ```{r load-data, echo=FALSE} library(Seurat) library(ggplot2) -pbmc <- readRDS(file = '~/Downloads/pbmc3k/pbmc3k_final.rds') -cbmc.rna <- as.sparse(x = read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", sep = ",", header = TRUE, row.names = 1)) +library(SeuratData) -# To make life a bit easier going forward, we're going to discard all but the top 100 most highly expressed mouse genes, and remove the "HUMAN_" from the CITE-seq prefix -cbmc.rna <- CollapseSpeciesExpressionMatrix(object = cbmc.rna) +# install datasets +InstallData('pbmc3k') +InstallData('cbmc') +InstallData('ifnb') -# Load in the ADT UMI matrix -cbmc.adt <- as.sparse(x = read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz", sep = "," ,header = TRUE, row.names = 1)) +#load in pbmc example +pbmc <- LoadData('pbmc3k') -# When adding multimodal data to Seurat, it's okay to have duplicate feature names. Each set of modal data (eg. RNA, ADT, etc.) is stored in its own Assay object. -# One of these Assay objects is called the "default assay", meaning it's used for all analyses and visualization. -# To pull data from an assay that isn't the default, you can specify a key that's linked to an assay for feature pulling. -# To see all keys for all objects, use the Key function. -# Lastly, we observed poor enrichments for CCR5, CCR7, and CD10 - and therefore remove them from the matrix (optional) -cbmc.adt <- cbmc.adt[setdiff(x = rownames(x = cbmc.adt), y = c('CCR5', 'CCR7', 'CD10')), ] -``` +# load in multimodal example +# type ?cbmc for more info +cbmc <- LoadData('cbmc') -# Seurat Standard Worflow +#load in case/control example +# type ?ifnb for more info +ifnb <- LoadData('ifnb') -The standard Seurat workflow takes raw single-cell expression data and aims to find clusters within the data. For full details, please read our tutorial. This process consists of data normalization and variable feature selection, data scaling, a PCA on variable features, construction of a shared-nearest-neighbors graph, and clustering using a modularity optimizer. Finally, we use a t-SNE to visualize our clusters in a two-dimensional space. +``` -```{r seurat-standard-workflow} -pbmc.counts <- Read10X(data.dir = "~/Downloads/pbmc3k/filtered_gene_bc_matrices/hg19/") -pbmc <- CreateSeuratObject(counts = pbmc.counts) +# Standard `Seurat` workflow +```{r seurat-standard-workflow-2} pbmc <- NormalizeData(object = pbmc) pbmc <- FindVariableFeatures(object = pbmc) pbmc <- ScaleData(object = pbmc) pbmc <- RunPCA(object = pbmc) -pbmc <- FindNeighbors(object = pbmc) +pbmc <- FindNeighbors(object = pbmc, dims = 1:30) pbmc <- FindClusters(object = pbmc) -pbmc <- RunTSNE(object = pbmc) -DimPlot(object = pbmc, reduction = 'tsne') +pbmc <- RunUMAP(object = pbmc, dims = 1:30) +DimPlot(object = pbmc, reduction = 'umap') + ``` -# `Seurat` Object Interaction +## SCtransform version +```{r seurat-standard-workflow} +pbmc <- SCTransform(object = pbmc) +pbmc <- RunPCA(object = pbmc) +pbmc <- FindNeighbors(object = pbmc, dims = 1:30) +pbmc <- FindClusters(object = pbmc) +pbmc <- RunUMAP(object = pbmc, dims = 1:30) +DimPlot(object = pbmc, reduction = 'umap') +``` -Since Seurat v3.0, we’ve made improvements to the Seurat object, and added new methods for user interaction. We also introduce simple functions for common tasks, like subsetting and merging, that mirror standard R functions. +```{r chaincmds,eval=FALSE} +# note that you can chain multiple commands together with %>% +pbmc <- SCTransform(pbmc) %>% RunPCA() %>% FindNeighbors(dims=1:30) %>% + FindClusters() %>% RunUMAP(dims=1:30) +``` - +# `Seurat` Object Data Access +## Cell, feature, and layer names ```{r features-and-cells} # Get cell and feature names, and total numbers -colnames(x = pbmc) -Cells(object = pbmc) -rownames(x = pbmc) -ncol(x = pbmc) -nrow(x = pbmc) +# We show multiple ways to get the same output +# cell names +colnames(pbmc) +Cells(pbmc) + +# feature names +Features(pbmc) +rownames(pbmc) + +# number of cells/features +num_cells <- ncol(pbmc) +num_features <- nrow(pbmc) + +# List of object layers +Layers(pbmc) + +# working with multimodal objects +# list assays +Assays(cbmc) + +# Assay-specific features (genes/ADT) +Features(cbmc[["RNA"]]) +Features(cbmc[["ADT"]]) + +# Variable feature names +VariableFeatures(pbmc) + ``` +```{r setvarfeatures, eval=FALSE} +# Set variable features +VariableFeatures(cbmc) <- var.gene.names + +# set for a specific assay +VariableFeatures(cbmc[["ADT"]]) <- var.gene.names +``` + +## Identity class labels ```{r idents} +# Setting and retrieving cell identities + +# Set identity classes to an existing column in meta data +Idents(object = pbmc) <- 'seurat_annotations' + +# View cell identities, get summary table +Idents(pbmc) +table(Idents(pbmc)) + +# Set identity to CD4 T cells for all cells +Idents(pbmc) <- 'CD4 T cells' + +# Set for a selected group of cells +pbmc.cells <- Cells(pbmc) +Idents(object = pbmc, cells = pbmc.cells[1:10]) <- 'CD4 T cells' + # Get cell identity classes Idents(object = pbmc) levels(x = pbmc) -# Stash cell identity classes +# Stash cell identity classes in metadata pbmc[['old.ident']] <- Idents(object = pbmc) pbmc <- StashIdent(object = pbmc, save.name = 'old.ident') -# Set identity classes -Idents(object = pbmc) <- 'CD4 T cells' -Idents(object = pbmc, cells = 1:10) <- 'CD4 T cells' - -# Set identity classes to an existing column in meta data -Idents(object = pbmc, cells = 1:10) <- 'orig.ident' -Idents(object = pbmc) <- 'orig.ident' - # Rename identity classes pbmc <- RenameIdents(object = pbmc, 'CD4 T cells' = 'T Helper cells') ``` +## Cell metadata +```{r metadata} +# View metadata data frame, stored in object@meta.data +pbmc[[]] + +# Retrieve specific values from the metadata +pbmc$nCount_RNA +pbmc[[c('percent.mito', 'nFeature_RNA')]] + +# Add metadata, see ?AddMetaData +random_group_labels <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc), replace = TRUE) +pbmc$groups <- random_group_labels +``` + +## Expression data (stored as layers in Seurat v5) + +```{r expression-matrices, eval=FALSE} +# Retrieve data in an expression matrix +# RNA counts matrix +pbmc[["RNA"]]$counts + +# Alternate accessor function with the same result +LayerData(pbmc,assay = 'RNA',layer = 'counts') + +# GetAssayData from Seurat v4 is still supported +GetAssayData(object = pbmc, assay = 'RNA', slot = 'counts') + +# ADT counts matrix (multimodal object) +cbmc[["ADT"]]$counts + +``` + +```{r setdata, eval = FALSE} +# Set expression data +# assume new.data is a new expression matrix +pbmc[["RNA"]]$counts <- new.data + +# Alternate setter function with the same result +LayerData(pbmc,assay = 'RNA',layer = 'counts') <- new.data + +# SetAssayData from Seurat v4 is still supported +pbmc <- SetAssayData(object = pbmc, slot = 'counts',new.data = new.data) + +``` + +## Dimensional reductions +```{r embeddings-loadings} +# Get cell embeddings and feature loadings +# stored on pbmc[["pca"]]@cell.embeddings +Embeddings(pbmc, reduction = 'pca') + +# stored in pbmc[["pca]]@feature.loadings +Loadings(pbmc, reduction = 'pca') +``` + +```{r customreduc, eval=FALSE} +# Create custom dimensional reduction +# loadings matrix is optional +new_reduction <- CreateDimReducObject(embeddings = new.embeddings,loadings = new.loadings, key = "custom_pca") +pbmc[["custom_pca"]] <- new_reduction +``` + +## FetchData +```{r fetchdata} +# FetchData can access anything from expression matrices, cell embeddings, or metadata +# Use the previously listed commands to access entire matrices +# Use FetchData to access individual/small groups of variables +FetchData(object = pbmc, vars = c('PC_1', 'nFeature_RNA', 'MS4A1'),layer = 'counts') +``` + +# Subsetting and merging + +## Subset Seurat objects ```{r subsetting} # Subset Seurat object based on identity class, also see ?SubsetData -subset(x = pbmc, idents = 'B cells') -subset(x = pbmc, idents = c('CD4 T cells', 'CD8 T cells'), invert = TRUE) +subset(x = pbmc, idents = 'B') +subset(x = pbmc, idents = c('Naive CD4 T', 'CD8 T'), invert = TRUE) # Subset on the expression level of a gene/feature -subset(x = pbmc, subset = MS4A1 > 3) +subset(x = pbmc, subset = MS4A1 > 2.5) # Subset on a combination of criteria -subset(x = pbmc, subset = MS4A1 > 3 & PC1 > 5) -subset(x = pbmc, subset = MS4A1 > 3, idents = 'B cells') +subset(x = pbmc, subset = MS4A1 > 2.5 & PC_1 > 5) +subset(x = pbmc, subset = MS4A1 > 2.5, idents = 'B') # Subset on a value in the object meta data -subset(x = pbmc, subset = orig.ident == "Replicate1") +subset(x = pbmc, subset = groups == "g1") # Downsample the number of cells per identity class subset(x = pbmc, downsample = 100) + +``` + +## Split layers +```{r splitlayers, results = 'hide'} +# In Seurat v5, users can now split in object directly into different layers +# keeps expression data in one object, but splits multiple samples into layers +# can proceed directly to integration workflow after splitting layers +ifnb[["RNA"]] <- split(ifnb[["RNA"]],f = ifnb$stim) +Layers(ifnb) + +# If desired, for example after intergation, the layers can be joined together again +ifnb <- JoinLayers(ifnb) +``` + +## Split objects +```{r splitting, results = 'hide'} +# In line with prior workflows, you can also into split your object into a list of multiple objects +# based on a metadata column +# creates a list of two objects +ifnb_list <- SplitObject(ifnb,split.by = 'stim') +ifnb_list$CTRL +ifnb_list$STIM ``` -```{r merging, eval=FALSE} +## Merge objects (without integration) +In Seurat v5, merging creates a single object, but keeps the expression information split into different layers for integration. If not proceeding with integration, rejoin the layers after merging. + +```{r merging-2, eval=FALSE, results = 'hide'} # Merge two Seurat objects -merge(x = pbmc1, y = pbmc2) -# Merge more than two Seurat objects +merged_obj <- merge(x = ifnb_list$CTRL, y = ifnb_list$STIM) +merged_obj[["RNA"]] <- JoinLayers(merged_obj) + +# Example to merge more than two Seurat objects merge(x = pbmc1, y = list(pbmc2, pbmc3)) ``` -# Data Access +## Merge objects (with integration) +See [introduction to integration](integration_introduction.html) for more information. -Accessing data in Seurat is simple, using clearly defined accessors and setters to quickly find the data needed. +```{r merging-3, eval=FALSE, results = 'hide'} +merged_obj <- merge(x = ifnb_list$CTRL, y = ifnb_list$STIM) +merged_obj <- NormalizeData(merged_obj) +merged_obj <- FindVariableFeatures(merged_obj) +merged_obj <- ScaleData(merged_obj) +merged_obj <- RunPCA(merged_obj) +merged_obj <- IntegrateLayers( + object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + verbose = FALSE) -```{r metadata} -# View metadata data frame, stored in object@meta.data -pbmc[[]] +# now that integration is complete, rejoin layers +merged_obj[["RNA"]] <- JoinLayers(merged_obj) -# Retrieve specific values from the metadata -pbmc$nCount_RNA -pbmc[[c('percent.mito', 'nFeature_RNA')]] - -# Add metadata, see ?AddMetaData -random_group_labels <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc), replace = TRUE) -pbmc$groups <- random_group_labels ``` -```{r expression-matrices, eval=FALSE} -# Retrieve or set data in an expression matrix ('counts', 'data', and 'scale.data') -GetAssayData(object = pbmc, slot = 'counts') -pbmc <- SetAssayData(object = pbmc, slot = 'scale.data', new.data = new.data) -``` +# Pseudobulk analysis -```{r embeddings-loadings} -# Get cell embeddings and feature loadings -Embeddings(object = pbmc, reduction = 'pca') -Loadings(object = pbmc, reduction = 'pca') -Loadings(object = pbmc, reduction = 'pca', projected = TRUE) -``` +## Group cells together, based on multiple categories +See [DE vignette](de_vignette.html) for information on how to add the `donor_id` column to meta data. -```{r fetchdata} -# FetchData can pull anything from expression matrices, cell embeddings, or metadata -FetchData(object = pbmc, vars = c('PC_1', 'percent.mito', 'MS4A1')) +```{r, eval=FALSE} +# pseudobulk cells only by cell type +bulk <- AggregateExpression(ifnb,group.by ='seurat_annotations', return.seurat = TRUE) +Cells(bulk) + +# pseudobulk cells by stimulation condition AND cell type +bulk <- AggregateExpression(ifnb,group.by = c('stim','seurat_annotations'), return.seurat = TRUE) +Cells(bulk) + +# pseudobulk cells by stimulation condition AND cell type AND donor +bulk <- AggregateExpression(ifnb,group.by = c('stim','seurat_annotations',"donor_id"), return.seurat = TRUE) +Cells(bulk) ``` -# Visualization in Seurat +# Visualization in `Seurat` Seurat has a vast, ggplot2-based plotting library. All plotting functions will return a ggplot2 plot by default, allowing easy customization with ggplot2. ```{r visualization} -# Dimensional reduction plot for PCA or tSNE -DimPlot(object = pbmc, reduction = 'tsne') +# Dimensional reduction plot DimPlot(object = pbmc, reduction = 'pca') # Dimensional reduction plot, with cells colored by a quantitative feature +# Defaults to UMAP if available FeaturePlot(object = pbmc, features = "MS4A1") -# Scatter plot across single cells, replaces GenePlot +# Scatter plot across single cells FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "PC_1") FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "CD3D") @@ -179,18 +335,33 @@ VariableFeaturePlot(object = pbmc) #Violin and Ridge plots VlnPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32")) RidgePlot(object = pbmc, feature = c("LYZ", "CCL5", "IL32")) +``` -# Heatmaps -DoHeatmap(object = pbmc,features = heatmap_markers) +```{r heatmaps} +# Heatmaps (visualize scale.data slot) DimHeatmap(object = pbmc,reduction = 'pca', cells = 200) +# standard workflow +pbmc <- ScaleData(pbmc,features = heatmap_markers) +DoHeatmap(object = pbmc,features = heatmap_markers) + +# sctransform workflow +pbmc <- GetResidual(pbmc,features = heatmap_markers) +DoHeatmap(object = pbmc,features = heatmap_markers) + +# heatmap with maximum of 100 cells per group +DoHeatmap(pbmc,heatmap_markers,cells = subset(pbmc,downsample = 100)) + +``` + +```{r newthings} # New things to try! # Note that plotting functions now return ggplot2 objects, so you can add themes, titles, and options onto them VlnPlot(object = pbmc, features = "MS4A1", split.by = "groups") DotPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32"), split.by = "groups") FeaturePlot(object = pbmc, features = c("MS4A1", "CD79A"), blend = TRUE) DimPlot(object = pbmc) + DarkTheme() -DimPlot(object = pbmc) + labs(title = '2,700 PBMCs clustered using Seurat and viewed\non a two-dimensional tSNE') +DimPlot(object = pbmc) + labs(title = '2,700 PBMCs clustered using Seurat and viewed\non a two-dimensional UMAP') ``` Seurat provides many prebuilt themes that can be added to ggplot2 plots for quick customization @@ -244,36 +415,12 @@ FetchData(object = cbmc, vars = c('rna_CD3E', 'adt_CD3')) FeatureScatter(object = cbmc, feature1 = "rna_CD3E", feature2 = "adt_CD3") ``` -# Seurat v2.X vs v3.X - -| Seurat v2.X | Seurat v3.X | -| ----------- | ----------- | -| `object@data` | `GetAssayData(object = object)` | -| `object@raw.data` | `GetAssayData(object = object, slot = "counts")` | -| `object@scale.data` | `GetAssayData(object = object, slot = "scale.data")` | -| `object@cell.names` | `colnames(x = object)` | -| `rownames(x = object@data)` | `rownames(x = object)` | -| `object@var.genes` | `VariableFeatures(object = object)` | -| `object@hvg.info` | `HVFInfo(object = object)` | -| `object@assays$assay.name` | `object[["assay.name"]]` | -| `object@dr$pca` | `object[["pca"]]` | -| `GetCellEmbeddings(object = object, reduction.type = "pca")` | `Embeddings(object = object, reduction = "pca")` | -| `GetGeneLoadings(object = object, reduction.type = "pca")` | `Loadings(object = object, reduction = "pca")` | -| `AddMetaData(object = object, metadata = vector, col.name = "name")` | `object$name <- vector` | -| `object@meta.data$name` | `object$name` | -| `object@idents` | `Idents(object = object)` | -| `SetIdent(object = object, ident.use = "new.idents")` | `Idents(object = object) <- "new.idents"` | -| `SetIdent(object = object, cells.use = 1:10, ident.use = "new.idents")` | `Idents(object = object, cells = 1:10) <- "new.idents"` | -| `StashIdent(object = object, save.name = "saved.idents")` | `object$saved.idents <- Idents(object = object)` | -| `levels(x = object@idents)` | `levels(x = object)` | -| `RenameIdent(object = object, old.ident.name = "old.ident", new.ident.name = "new.ident")` | `RenameIdents(object = object, "old.ident" = "new.ident")` | -| `WhichCells(object = object, ident = "ident.keep")` | `WhichCells(object = object, idents = "ident.keep")` | -| `WhichCells(object = object, ident.remove = "ident.remove")` | `WhichCells(object = object, idents = "ident.remove", invert = TRUE)` | -| `WhichCells(object = object, max.cells.per.ident = 500)` | `WhichCells(object = object, downsample = 500)` | -| `WhichCells(object = object, subset.name = "name", low.threshold = low, high.threshold = high)` | `WhichCells(object = object, expression = name > low & name < high)` | -| `FilterCells(object = object, subset.names = "name", low.threshold = low, high.threshold = high)` | `subset(x = object, subset = name > low & name < high)` | -| `SubsetData(object = object, subset.name = "name", low.threshold = low, high.threshold = high)` | `subset(x = object, subset = name > low & name < high)` | -| `MergeSeurat(object1 = object1, object2 = object2)` | `merge(x = object1, y = object2)` | +# Additional resources + +Users who are particularly interested in some of the technical changes to data storage in Seurat v5 can explore the following resources: + +* [SeuratObject manual](https://mojaveazure.github.io/seurat-object/) +* [Seurat v5 and Assay5 introductory vignette ](seurat5_essential_commands.html)
**Session Info** diff --git a/vignettes/extensions.Rmd b/vignettes/extensions.Rmd index 787bdcdb6..a99215424 100644 --- a/vignettes/extensions.Rmd +++ b/vignettes/extensions.Rmd @@ -33,3 +33,9 @@ The SeuratDisk package introduces the h5Seurat file format for the storage and a Azimuth is a web application that uses an annotated reference dataset to automate the processing, analysis, and interpretation of a new single-cell RNA-seq experiment. Azimuth leverages a 'reference-based mapping' pipeline that inputs a counts matrix of gene expression in single cells, and performs normalization, visualization, cell annotation, and differential expression (biomarker discovery). All results can be explored within the app, and easily downloaded for additional downstream analysis. To use the Azimuth web app, visit the Azimuth website [here](https://azimuth.hubmapconsortium.org/). +# BPCells + +BPCells is an R package that allows for computationally efficient single-cell analysis. It utilizes bit-packing compression to store counts matrices on disk and C++ code to cache operations. BPCells is an R package that allows for computationally efficient single-cell analysis. It utilizes bit-packing compression to store counts matrices on disk and C++ code to cache operations. + +# presto +Presto performs a fast Wilcoxon rank sum test and auROC analysis. Seurat uses the presto package to perform fast differential expression. \ No newline at end of file diff --git a/vignettes/future_vignette.Rmd b/vignettes/future_vignette.Rmd index b8c58c4a3..cb7e747f6 100644 --- a/vignettes/future_vignette.Rmd +++ b/vignettes/future_vignette.Rmd @@ -25,7 +25,8 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` In Seurat, we have chosen to use the `future` framework for parallelization. In this vignette, we will demonstrate how you can take advantage of the `future` implementation of certain Seurat functions from a user's perspective. If you are interested in learning more about the `future` framework beyond what is described here, please see the package vignettes [here](https://cran.r-project.org/web/packages/future/index.html) for a comprehensive and detailed description. @@ -59,6 +60,7 @@ For example, to run the parallel version of `FindMarkers()`, you simply need to ```{r demo} library(Seurat) pbmc <- readRDS("../data/pbmc3k_final.rds") +pbmc <- UpdateSeuratObject(pbmc) # Enable parallelization plan('multiprocess', workers = 4) diff --git a/vignettes/get_started_v5.Rmd b/vignettes/get_started_v5.Rmd new file mode 100644 index 000000000..3643560d5 --- /dev/null +++ b/vignettes/get_started_v5.Rmd @@ -0,0 +1,140 @@ +--- +title: "Introduction to Seurat v5" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +--- + +```{r fxns, include = FALSE} +library('htmlTable') +make_list <- function(items) { + paste0("
    ", sprintf('
  • %s
  • ', items), '
', collapse = '') +} +make_href <- function(url, text){ + paste0("") +} +make_href2 <- function(url, text){ + paste0("", text, "") +} +process_entry <- function(dat) { + if (grepl(pattern = "https://satijalab.org/img/vignette_images", x = dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else if (grepl(pattern = "assets/", x= dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else { + img <- paste0('![](', '../output/images/', dat$image, '){width=3000px}') + } + + if (dat$name == "seurat5_run_azimuth") { + link <- "https://satijalab.github.io/azimuth/articles/run_azimuth_tutorial.html" + } else if (grepl(pattern = "https://satijalab.org/", x = dat$name)) { + link <- dat$name + } else { + link <- paste0(dat$name, ".html") + } + go.button <- paste0('GO') + data.frame( + title = make_href(url = link, text = dat$title), + img = img, + desc = dat$summary, + btn = go.button + ) +} +process_wrapper_entry <- function(dat) { + data.frame( + Package = dat$name, + Vignette = make_href2(url = dat$link, text = dat$title), + Reference = make_href2(url = dat$reference, text = dat$citation), + Source = make_href2(url = dat$source, text = dat$source) + ) +} +make_vignette_card_section <- function(vdat, cat) { + vignettes <- vdat[[cat]]$vignettes + dat <- data.frame(title = character(), img = character(), desc = character()) + for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_entry(vignettes[[v]])) + if(nrow(x = dat) == 3 | v == length(x = vignettes)){ + colnames(dat) <- NULL + dat <- t(dat) + if (ncol(x = dat) == 2) { + print(htmlTable( + dat, + align = '|l|l|', + css.cell = "padding-left: .75em; width: 50%", + css.class = "two-column-htmltable" + )) + } else if (ncol(x = dat) == 1){ + print(htmlTable( + dat, + align = '|l|', + css.cell = "padding-left: .75em; width: 100%", + css.class = "one-column-htmltable" + )) + } else { + print(htmlTable( + dat, + align = '|l|l|l|', + css.cell = "padding-left: .75em; width: 30%" + )) + } + dat <- data.frame(title = character(), img = character(), desc = character()) + } + } +} +``` + +```{r yaml, include = FALSE} +library(yaml) +vdat <- read_yaml(file = "vignettes_v5.yaml") +``` + +```{=html} + +``` + +We provide a series of vignettes, tutorials, and analysis walkthroughs to help users get started with Seurat v5. These vignettes are meant to highlight new functions and features supported by Seurat v5. Seurat v5 is backwards compatible with previous versions, so existing user workflows (as well as [previously released Seurat vignettes](get_started.html)) will continue to work even when using Seurat v5. + +# Spatial analysis + +These vignettes will help introduce users to the analysis of spatial datasets in Seurat v5, including technologies that leverage sequencing-based readouts, as well as technologies that leverage in-situ imaging-based readouts. The vignettes introduce data from multiple platforms including 10x Visium, SLIDE-seq, Vizgen MERSCOPE, 10x Xenium, Nanostring CosMx, and Akoya CODEX. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 1) +``` + +# Streamlined and multimodal integration + +Performing integrative analysis in order to identify shared cell types across multiple datasets is an increasingly important analytical step in single-cell workflows. These vignettes demonstrate new methods and infrastructure for integrative analysis in Seurat v5. They include a streamlined analytical workflow to integrate scRNA-seq datasets, and the use of 'bridge integration' for harmonizing datasets across modalities. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 2) +``` + +# Flexible analysis of massively scalable datasets + +In Seurat v5, we introduce new infrastructure and methods to analyze, interpret, and explore datasets that extend to millions of cells. We introduce support for 'sketch-based' techniques, where a subset of representative cells are stored in memory to enable rapid and iterative exploration, while the remaining cells are stored on-disk. Users can flexibly switch between both data representations, and we leverage the [BPCells package](https://bnprks.github.io/BPCells/) from Ben Parks in the Greenleaf lab to enable high-performance analysis of disk-backed data. +\ +\ +The vignettes below demonstrate three scalable analyses in Seurat v5: Unsupervised clustering analysis of a large dataset (1.3M neurons), Unsupervised integration and comparison of 1M PBMC from healthy and diabetic patients, and Supervised mapping of 1.5M immune cells from healthy and COVID donors. In all cases, the vignettes perform these analyses without ever loading the full datasets into memory. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 3) +``` + +# References and additional documentation + +We include brief vignettes describing install instructions, a 'cheat sheet' of commands for interacting with Seurat v5 assays, and additional documentation for using the BPCells package together with Seurat. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 4) +``` diff --git a/vignettes/get_started_v5_new.Rmd b/vignettes/get_started_v5_new.Rmd new file mode 100644 index 000000000..652676190 --- /dev/null +++ b/vignettes/get_started_v5_new.Rmd @@ -0,0 +1,172 @@ +--- +title: "Getting Started with Seurat" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +--- + +```{r fxns, include = FALSE} +library('htmlTable') +make_list <- function(items) { + paste0("
    ", sprintf('
  • %s
  • ', items), '
', collapse = '') +} +make_href <- function(url, text){ + paste0("") +} +make_href2 <- function(url, text){ + paste0("", text, "") +} +process_entry <- function(dat) { + if (grepl(pattern = "https://satijalab.org/img/vignette_images", x = dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else if (grepl(pattern = "assets/", x= dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else { + img <- paste0('![](', '../output/images/', dat$image, '){width=3000px}') + } + if (grepl(pattern = "https://satijalab.org/", x = dat$name)) { + link <- dat$name + } else { + link <- paste0(dat$name, ".html") + } + go.button <- paste0('GO') + data.frame( + title = make_href(url = link, text = dat$title), + img = img, + desc = dat$summary, + btn = go.button + ) +} +process_wrapper_entry <- function(dat) { + data.frame( + Package = dat$name, + Vignette = make_href2(url = dat$link, text = dat$title), + Reference = make_href2(url = dat$reference, text = dat$citation), + Source = make_href2(url = dat$source, text = dat$source) + ) +} +make_vignette_card_section <- function(vdat, cat) { + vignettes <- vdat[[cat]]$vignettes + dat <- data.frame(title = character(), img = character(), desc = character()) + for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_entry(vignettes[[v]])) + if(nrow(x = dat) == 3 | v == length(x = vignettes)){ + colnames(dat) <- NULL + dat <- t(dat) + if (ncol(x = dat) == 2) { + print(htmlTable( + dat, + align = '|l|l|', + css.cell = "padding-left: .75em; width: 50%", + css.class = "two-column-htmltable" + )) + } else if (ncol(x = dat) == 1){ + print(htmlTable( + dat, + align = '|l|', + css.cell = "padding-left: .75em; width: 100%", + css.class = "one-column-htmltable" + )) + } else { + print(htmlTable( + dat, + align = '|l|l|l|', + css.cell = "padding-left: .75em; width: 30%" + )) + } + dat <- data.frame(title = character(), img = character(), desc = character()) + } + } +} +``` + +```{r yaml, include = FALSE} +library(yaml) +vdat <- read_yaml(file = "vignettes_v5_new.yaml") +``` + +```{=html} + +``` + +We provide a series of vignettes, tutorials, and analysis walkthroughs to help users get started with Seurat. You can also check out our [Reference page](../reference/index.html) which contains a full list of functions available to users. + +Our previous Get Started page for Seurat v4 is archived [here](get_started.html). + +# Introductory Vignettes + +For new users of Seurat, we suggest starting with a guided walk through of a dataset of 2,700 Peripheral Blood Mononuclear Cells (PBMCs) made publicly available by 10X Genomics. This tutorial implements the major components of a standard unsupervised clustering workflow including QC and data filtration, calculation of high-variance genes, dimensional reduction, graph-based clustering, and the identification of cluster markers. We provide additional vignettes introducing visualization techniques in Seurat, the sctransform normalization workflow, and storage/interaction with multimodal datasets. We also provide an 'essential commands cheatsheet' as a quick reference. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 1) +``` + +# scRNA Data Integration + +We have developed [computational methods](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) for integrated analysis of single-cell datasets generated across different conditions, technologies, or species. As an example, we provide a guided walk through for integrating and comparing PBMC datasets generated under different stimulation conditions. We provide additional vignettes demonstrating how to leverage an annotated scRNA-seq reference to map and label cells from a query, and to efficiently integrate large datasets. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 2) +``` + +# Multi-assay data + +Seurat also offers support for a suite of statistical methods for analyzing multimodal single-cell data. These include methods to integrate modalities that are simultaneously measured in the same cells, modalities that are measured in different cells, and techniques to analyze pooled CRISPR screens. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 3) +``` + +# Flexible analysis of massively scalable datasets + +In Seurat v5, we introduce new infrastructure and methods to analyze, interpret, and explore datasets that extend to millions of cells. We introduce support for 'sketch-based' techniques, where a subset of representative cells are stored in memory to enable rapid and iterative exploration, while the remaining cells are stored on-disk. Users can flexibly switch between both data representations, and we leverage the [BPCells package](https://bnprks.github.io/BPCells/) from Ben Parks in the Greenleaf lab to enable high-performance analysis of disk-backed data. + +The vignettes below demonstrate three scalable analyses in Seurat v5: Unsupervised clustering analysis of a large dataset (1.3M neurons), Unsupervised integration and comparison of 1M PBMC from healthy and diabetic patients, and Supervised mapping of 1.5M immune cells from healthy and COVID donors. In all cases, the vignettes perform these analyses without ever loading the full datasets into memory. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 4) +``` + +# Spatial analysis + +These vignettes will help introduce users to the analysis of spatial datasets in Seurat v5, including technologies that leverage sequencing-based readouts, as well as technologies that leverage in-situ imaging-based readouts. The vignettes introduce data from multiple platforms including 10x Visium, SLIDE-seq, Vizgen MERSCOPE, 10x Xenium, Nanostring CosMx, and Akoya CODEX. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 5) +``` + +# Other + +Here we provide a series of short vignettes to demonstrate a number of features that are commonly used in Seurat. We've focused the vignettes around questions that we frequently receive from users. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 6) +``` + +# SeuratWrappers + +In order to facilitate the use of community tools with Seurat, we provide the Seurat Wrappers package, which contains code to run other analysis tools on Seurat objects. For the initial release, we provide wrappers for a few packages in the table below but would encourage other package developers interested in interfacing with Seurat to check out our contributor guide [here](https://github.com/satijalab/seurat.wrappers/wiki/Submission-Process). + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +library(knitr) +library(kableExtra) +cat <- 7 +vignettes <- vdat[[cat]]$vignettes +dat <- data.frame(Package = character(), Vignette = character(), Reference = character(), Source = character()) +for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_wrapper_entry(vignettes[[v]])) +} +dat %>% + kable(format = "html", escape = FALSE) %>% + kable_styling(bootstrap_options = c("striped", "hover")) +``` \ No newline at end of file diff --git a/vignettes/hashing_vignette.Rmd b/vignettes/hashing_vignette.Rmd index 5d92ded3f..0a34ae283 100644 --- a/vignettes/hashing_vignette.Rmd +++ b/vignettes/hashing_vignette.Rmd @@ -26,7 +26,8 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -71,11 +72,11 @@ Read in data ```{r read_Data} # Load in the UMI matrix -pbmc.umis <- readRDS("../data/pbmc_umi_mtx.rds") +pbmc.umis <- readRDS("/brahms/shared/vignette-data/pbmc_umi_mtx.rds") # For generating a hashtag count matrix from FASTQ files, please refer to https://github.com/Hoohm/CITE-seq-Count. # Load in the HTO count matrix -pbmc.htos <- readRDS("../data/pbmc_hto_mtx.rds") +pbmc.htos <- readRDS("/brahms/shared/vignette-data/pbmc_hto_mtx.rds") # Select cell barcodes detected by both RNA and HTO # In the example datasets we have already filtered the cells for you, but perform this step for clarity. @@ -93,7 +94,7 @@ Setup Seurat object and add in the HTO data ```{r hashtag_setup} # Setup Seurat object -pbmc.hashtag <- CreateSeuratObject(counts = pbmc.umis) +pbmc.hashtag <- CreateSeuratObject(counts = Matrix::Matrix(as.matrix(pbmc.umis),sparse = T)) # Normalize RNA data with log normalization pbmc.hashtag <- NormalizeData(pbmc.hashtag) @@ -153,7 +154,7 @@ Idents(pbmc.hashtag) <- 'HTO_classification.global' VlnPlot(pbmc.hashtag, features = 'nCount_RNA', pt.size = 0.1, log = TRUE) ``` -Generate a two dimensional tSNE embedding for HTOs.Here we are grouping cells by singlets and doublets for simplicity. +Generate a two dimensional tSNE embedding for HTOs. Here we are grouping cells by singlets and doublets for simplicity. ```{r hashtag_sub_tsne, fig.width=9} #First, we will remove negative cells from the object @@ -219,16 +220,16 @@ DimPlot(pbmc.singlet, group.by = "HTO_classification") ```{r hto_setup} # Read in UMI count matrix for RNA -hto12.umis <- readRDS("../data/hto12_umi_mtx.rds") +hto12.umis <- readRDS("/brahms/shared/vignette-data/hto12_umi_mtx.rds") # Read in HTO count matrix -hto12.htos <- readRDS("../data/hto12_hto_mtx.rds") +hto12.htos <- readRDS("/brahms/shared/vignette-data/hto12_hto_mtx.rds") # Select cell barcodes detected in both RNA and HTO cells.use <- intersect(rownames(hto12.htos), colnames(hto12.umis)) # Create Seurat object and add HTO data -hto12 <- CreateSeuratObject(counts = hto12.umis[, cells.use], min.features = 300) +hto12 <- CreateSeuratObject(counts = Matrix::Matrix(as.matrix(hto12.umis[, cells.use]), sparse = T), min.features = 300) hto12[['HTO']] <- CreateAssayObject(counts = t(x = hto12.htos[colnames(hto12), 1:12])) # Normalize data diff --git a/vignettes/install.Rmd b/vignettes/install.Rmd index b5e3f39eb..7cc7d55c2 100644 --- a/vignettes/install.Rmd +++ b/vignettes/install.Rmd @@ -5,6 +5,36 @@ output: html_document To install Seurat, [R](https://www.r-project.org/) version 4.0 or greater is required. We also recommend installing [R Studio](https://www.rstudio.com/). +# ![Seurat v5:](../output/images/SeuratV5.png){#id .class width=60 height=60} Seurat 5: Install from GitHub + +Copy the code below to install Seurat v5: + +```{r required, eval=FALSE} +remotes::install_github("satijalab/seurat", "seurat5", quiet = TRUE) +``` + +The following packages are not required but are used in many Seurat v5 vignettes: + +* SeuratData: automatically load datasets pre-packaged as Seurat objects +* Azimuth: local annotation of scRNA-seq and scATAC-seq queries across multiple organs and tissues +* SeuratWrappers: enables use of additional integration and differential expression methods +* Signac: analysis of single-cell chromatin data + +```{r additional, eval=FALSE} +remotes::install_github("satijalab/seurat-data", "seurat5", quiet = TRUE) +remotes::install_github("satijalab/azimuth", "seurat5", quiet = TRUE) +remotes::install_github("satijalab/seurat-wrappers", "seurat5", quiet = TRUE) +remotes::install_github("stuart-lab/signac", "seurat5", quiet = TRUE) +``` + +Seurat v5 utilizes BPCells to support analysis of extremely large datasets: + +```{r bpcells, eval=FALSE} +remotes::install_github("bnprks/BPCells", quiet = TRUE) +``` + +For more information on BPCells installation, please see the [installation instructions](https://bnprks.github.io/BPCells/#installation). For macOS users, the following GitHub issues concerning [M1 chip installation](https://github.com/bnprks/BPCells/issues/6) and [compiler compatibility](https://github.com/bnprks/BPCells/issues/3) may be of use. + # Install from CRAN Seurat is available on [CRAN](https://cran.r-project.org/package=Seurat) for all platforms. To install, run: diff --git a/vignettes/install_v5.Rmd b/vignettes/install_v5.Rmd new file mode 100644 index 000000000..c9efee6a9 --- /dev/null +++ b/vignettes/install_v5.Rmd @@ -0,0 +1,87 @@ +--- +title: "Installation Instructions for Seurat" +output: html_document +--- + +To install Seurat, [R](https://www.r-project.org/) version 4.0 or greater is required. We also recommend installing [R Studio](https://www.rstudio.com/). + +# ![Seurat v5:](../output/images/SeuratV5.png){#id .class width=60 height=60} Seurat 5: Install from CRAN + +Seurat is available on [CRAN](https://cran.r-project.org/package=Seurat) for all platforms. To install, run: + +```{r, eval = FALSE} +# Enter commands in R (or R studio, if installed) +install.packages('Seurat') +library(Seurat) +``` + +Seurat does not require, but makes use of, packages developed by other labs that can substantially enhance speed and performance. These include [presto](https://github.com/immunogenomics/presto) (Korunsky/Raychaudhari labs), [BPCells](https://github.com/bnprks/BPCells) (Greenleaf Lab), and [glmGamPoi](https://github.com/const-ae/glmGamPoi) (Huber Lab). We recommend users install these along with users: + +```{r, eval = FALSE} +setRepositories(ind = 1:3, addURLs = c('https://satijalab.r-universe.dev', 'https://bnprks.r-universe.dev/')) +install.packages(c("BPCells", "presto", "glmGamPoi")) +``` + +We also recommend installing these additional packages, which are used in our vignettes, and enhance the functionality of Seurat: + +* [Signac](https://github.com/stuart-lab/signac): analysis of single-cell chromatin data +* [SeuratData](https://github.com/satijalab/seurat-data): automatically load datasets pre-packaged as Seurat objects +* [Azimuth](https://github.com/satijalab/azimuth): local annotation of scRNA-seq and scATAC-seq queries across multiple organs and tissues +* [SeuratWrappers](https://github.com/satijalab/seurat-wrappers): enables use of additional integration and differential expression methods + +```{r additional, eval=FALSE} +install.packages('Signac') +remotes::install_github("satijalab/seurat-data", quiet = TRUE) +remotes::install_github("satijalab/azimuth", quiet = TRUE) +remotes::install_github("satijalab/seurat-wrappers", quiet = TRUE) +``` + +# Install previous versions of Seurat + +## Install Seurat v4 + +Seurat v4.4.0 can be installed with the following command: + +```{r eval = FALSE} +install.packages('Seurat', repos = c('https://satijalab.r-universe.dev', 'https://cloud.r-project.org')) +``` + +## Older versions of Seurat +Old versions of Seurat, from Seurat v2.0.1 and up, are hosted in CRAN's archive. To install an old version of Seurat, run: + +```{r eval = FALSE} +# Install the remotes package +install.packages('remotes') +# Replace 'X.X.X' with your desired version +remotes::install_version(package = 'Seurat', version = package_version('X.X.X')) +``` + +For versions of Seurat older than those not hosted on CRAN (versions 1.3.0 and 1.4.0), please download the packaged source code from our [releases page](https://github.com/satijalab/seurat/releases) and [install from the tarball](https://stackoverflow.com/questions/4739837/how-do-i-install-an-r-package-from-the-source-tarball-on-windows). + +# Install the development version of Seurat + +Install the development version of Seurat - directly from [GitHub](https://github.com/satijalab/seurat/tree/develop). + +```{r eval = FALSE} +# Enter commands in R (or R studio, if installed) +# Install the remotes package +install.packages('remotes') +remotes::install_github(repo = 'satijalab/seurat', ref = 'develop') +library(Seurat) +``` + +# Docker + +We provide docker images for Seurat via [dockerhub](https://hub.docker.com/r/satijalab/seurat). + +To pull the latest image from the command line: + +```sh +docker pull satijalab/seurat:latest +``` + +To use as a base image in a new Dockerfile: + +```sh +FROM satijalab/seurat:latest +``` \ No newline at end of file diff --git a/vignettes/integration_introduction.Rmd b/vignettes/integration_introduction.Rmd index bec4b4dd3..8a814ebf7 100644 --- a/vignettes/integration_introduction.Rmd +++ b/vignettes/integration_introduction.Rmd @@ -26,31 +26,30 @@ knitr::opts_chunk$set( fig.width = 10, message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` # Introduction to scRNA-seq integration -The joint analysis of two or more single-cell datasets poses unique challenges. In particular, identifying cell populations that are present across multiple datasets can be problematic under standard workflows. Seurat v4 includes a set of methods to match (or ‘align’) shared cell populations across datasets. These methods first identify cross-dataset pairs of cells that are in a matched biological state (‘anchors’), can be used both to correct for technical differences between datasets (i.e. batch effect correction), and to perform comparative scRNA-seq analysis of across experimental conditions. - -Below, we demonstrate methods for scRNA-seq integration as described in [Stuart\*, Butler\* et al, 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) to perform a comparative analysis of human immune cells (PBMC) in either a [resting or interferon-stimulated state](https://www.nature.com/articles/nbt.4042). +Integration of single-cell sequencing datasets, for example across experimental batches, donors, or conditions, is often an important step in scRNA-seq workflows. Integrative analysis can help to match shared cell types and states across datasets, which can boost statistical power, and most importantly, facilitate accurate comparative analysis across datasets. In previous versions of Seurat we introduced methods for integrative analysis, including our ‘anchor-based’ integration workflow. Many labs have also published powerful and pioneering methods, including [Harmony](https://portals.broadinstitute.org/harmony/) and [scVI](https://docs.scvi-tools.org/en/stable/index.html), for integrative analysis. Please see our [Integrating scRNA-seq data with multiple tools](https://satijalab.org/seurat/articles/seurat5_integration) vignette. -## Integration goals + +# Integration goals The following tutorial is designed to give you an overview of the kinds of comparative analyses on complex cell types that are possible using the Seurat integration procedure. Here, we address a few key goals: -* Create an 'integrated' data assay for downstream analysis -* Identify cell types that are present in both datasets +* Identify cell subpopulations that are present in both datasets * Obtain cell type markers that are conserved in both control and stimulated cells * Compare the datasets to find cell-type specific responses to stimulation -## Setup the Seurat objects +# Setup the Seurat objects For convenience, we distribute this dataset through our [SeuratData](https://github.com/satijalab/seurat-data) package. ```{r, include = FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(SeuratData.repo.use = "http://seurat.nygenome.org") ``` ```{r data} @@ -64,197 +63,186 @@ library(patchwork) InstallData('ifnb') ``` -```{r init, results='hide', message=FALSE, fig.keep='none'} -# load dataset -LoadData('ifnb') +The object contains data from human PBMC from two conditions, interferon-stimulated and control cells (stored in the `stim` column in the object metadata). We will aim to integrate the two conditions together, so that we can jointly identify cell subpopulations across datasets, and then explore how each group differs across conditions -# split the dataset into a list of two seurat objects (stim and CTRL) -ifnb.list <- SplitObject(ifnb, split.by = "stim") +In previous versions of Seurat, we would require the data to be represented as two different Seurat objects. In Seurat v5, we keep all the data in one object, but simply split it into multiple 'layers'. To learn more about layers, check out our [Seurat object interaction vignette](https://satijalab.org/seurat/articles/interaction_vignette). -# normalize and identify variable features for each dataset independently -ifnb.list <- lapply(X = ifnb.list, FUN = function(x) { - x <- NormalizeData(x) - x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000) -}) +```{r init, results='hide', message=FALSE, fig.keep='none'} +# load dataset +ifnb <- LoadData('ifnb') +# split the RNA measurements into two layers +# one for control cells, one for stimulated cells -# select features that are repeatedly variable across datasets for integration -features <- SelectIntegrationFeatures(object.list = ifnb.list) +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) +ifnb ``` -## Perform integration +# Perform analysis without integration + +We can first analyze the dataset without integration. The resulting clusters are defined both by cell type and stimulation condition, which creates challenges for downstream analysis. -We then identify anchors using the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with `IntegrateData()`. +```{r} -```{r find.anchors} -immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features) +# run standard anlaysis workflow +ifnb <- NormalizeData(ifnb) +ifnb <- FindVariableFeatures(ifnb) +ifnb <- ScaleData(ifnb) +ifnb <- RunPCA(ifnb) ``` -```{r integrate.data} -# this command creates an 'integrated' data assay -immune.combined <- IntegrateData(anchorset = immune.anchors) +```{r unintegratedUMAP, fig.height=5} +ifnb <- FindNeighbors(ifnb, dims=1:30, reduction = 'pca') +ifnb <- FindClusters(ifnb, resolution = 2, cluster.name = "unintegrated_clusters") +ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = 'pca', reduction.name = 'umap.unintegrated') +DimPlot(ifnb, reduction = 'umap.unintegrated', group.by=c('stim','seurat_clusters')) ``` -## Perform an integrated analysis +# Perform integration -Now we can run a single integrated analysis on all cells! +We now aim to integrate data from the two conditions, so that cells from the same cell type/subpopulation will cluster together. -```{r clustering, results='hide', message=FALSE} -# specify that we will perform downstream analysis on the corrected data -# note that the original unmodified data still resides in the 'RNA' assay -DefaultAssay(immune.combined) <- "integrated" +We often refer to this procedure as intergration/alignment. When aligning two genome sequences together, identification of shared/homologous regions can help to interpret differences between the sequences as well. Similarly for scRNA-seq integration, our goal is not to remove biological differences across conditions, but to learn shared cell types/states in an initial step - specifically because that will enable us to compare control stimulated and control profiles for these individual cell types. -# Run the standard workflow for visualization and clustering -immune.combined <- ScaleData(immune.combined, verbose = FALSE) -immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE) -immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30) -immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30) -immune.combined <- FindClusters(immune.combined, resolution = 0.5) +The Seurat v5 integration procedure aims to return a single dimensional reduction that captures the shared sources of variance across multiple layers, so that cells in a similar biological state will cluster. The method returns a dimensional reduction (i.e. `integrated.cca`) which can be used for visualization and unsupervised clustering analysis. For evaluating performance, we can use cell type labels that are pre-loaded in the `seurat_annotations` metadata column. + +```{r} +ifnb <- IntegrateLayers( + object = ifnb, method = CCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.cca', + verbose = FALSE) + +# re-join layers after integration +ifnb[["RNA"]] <- JoinLayers(ifnb[["RNA"]]) + +ifnb <- FindNeighbors(ifnb, reduction = "integrated.cca", dims = 1:30) +ifnb <- FindClusters(ifnb, resolution = 1) +ifnb <- RunUMAP(ifnb,dims = 1:30,reduction = 'integrated.cca') ``` ```{r viz, results='hide', message=FALSE} # Visualization -p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim") -p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE) -p1 + p2 +DimPlot(ifnb, reduction = 'umap', group.by=c('stim','seurat_annotations')) + ``` To visualize the two conditions side-by-side, we can use the `split.by` argument to show each condition colored by cluster. ```{r split.dim} -DimPlot(immune.combined, reduction = "umap", split.by = "stim") +DimPlot(ifnb, reduction = "umap", split.by = "stim") ``` -## Identify conserved cell type markers +# Identify conserved cell type markers To identify canonical cell type marker genes that are conserved across conditions, we provide the `FindConservedMarkers()` function. This function performs differential gene expression testing for each dataset/group and combines the p-values using meta-analysis methods from the MetaDE R package. For example, we can calculated the genes that are conserved markers irrespective of stimulation condition in cluster 6 (NK cells). ```{r conserved.markers, warning=FALSE} -# For performing differential expression after integration, we switch back to the original data -DefaultAssay(immune.combined) <- "RNA" -nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "stim", verbose = FALSE) +Idents(ifnb) <- "seurat_annotations" +nk.markers <- FindConservedMarkers(ifnb, ident.1 = "NK", grouping.var = "stim", verbose = FALSE) head(nk.markers) ``` -We can explore these marker genes for each cluster and use them to annotate our clusters as specific cell types. - -```{r annotate, results = 'hide', message=FALSE, fig.height = 8} -FeaturePlot(immune.combined, features = c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), min.cutoff = "q9") -immune.combined <- RenameIdents(immune.combined, "0" = "CD14 Mono", "1" = "CD4 Naive T", "2" = "CD4 Memory T", "3" = "CD16 Mono", "4" = "B", "5" = "CD8 T", "6" = "NK" , "7" = "T activated", "8" = "DC", "9" = "B Activated", "10" = "Mk", "11" = "pDC", "12" = "Eryth", "13" = "Mono/Mk Doublets", "14" = "HSPC") -DimPlot(immune.combined, label = TRUE) -``` +You can perform these same analysis on the unsupervised clustering results (stored in `seurat_clusters`), and use these conserved markers to annotate cell types in your dataset. The `DotPlot()` function with the `split.by` parameter can be useful for viewing conserved cell type markers across conditions, showing both the expression level and the percentage of cells in a cluster expressing any given gene. Here we plot 2-3 strong marker genes for each of our 14 clusters. ```{r splitdotplot, fig.height = 10} -Idents(immune.combined) <- factor( - Idents(immune.combined), - levels = c("HSPC", "Mono/Mk Doublets", "pDC", "Eryth","Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", "CD4 Naive T", "CD4 Memory T")) + +#NEEDS TO BE FIXED AND SET ORDER CORRECTLY +Idents(ifnb) <- factor( + Idents(ifnb), + levels = c("pDC", "Eryth","Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", "CD4 Naive T", "CD4 Memory T")) + markers.to.plot <- c("CD3D","CREM","HSPH1","SELL","GIMAP5","CACYBP","GNLY","NKG7","CCL5","CD8A","MS4A1","CD79A","MIR155HG","NME1","FCGR3A","VMO1","CCL2","S100A9","HLA-DQA1","GPR183","PPBP","GNG11","HBA2","HBB","TSPAN13","IL3RA","IGJ","PRSS57") -DotPlot(immune.combined, features = markers.to.plot, cols = c('blue', 'red'), dot.scale = 8, split.by = "stim") + RotatedAxis() +DotPlot(ifnb, features = markers.to.plot, cols = c('blue', 'red'), dot.scale = 8, split.by = "stim") + RotatedAxis() ``` -```{r save.img, include = FALSE} -library(ggplot2) -plot <- DotPlot(immune.combined, features = markers.to.plot, cols = c('blue', 'red'), - dot.scale = 6, split.by = "stim") + RotatedAxis() -ggsave(filename = "../output/images/pbmc_alignment.jpg", height = 7, width = 12, plot = plot, quality = 50) -``` +# Identify differential expressed genes across conditions + +Now that we've aligned the stimulated and control cells, we can start to do comparative analyses and look at the differences induced by stimulation. -### Identify differential expressed genes across conditions +We can aggregate cells of a similar type and condition together to create "pseudobulk" profiles using the `AggregateExpression` command. As an initial exploratory analysis, we can compare pseudobulk profiles of two cell types (naive CD4 T cells, and CD14 monocytes), and compare their gene expression profiles before and after stimulation. We highlight genes that exhibit dramatic responses to interferon stimulation. As you can see, many of the same genes are upregulated in both of these cell types and likely represent a conserved interferon response pathway, though CD14 monocytes exhibit a stronger transcriptional response. -Now that we've aligned the stimulated and control cells, we can start to do comparative analyses and look at the differences induced by stimulation. One way to look broadly at these changes is to plot the average expression of both the stimulated and control cells and look for genes that are visual outliers on a scatter plot. Here, we take the average expression of both the stimulated and control naive T cells and CD14 monocyte populations and generate the scatter plots, highlighting genes that exhibit dramatic responses to interferon stimulation. ```{r scatterplots, results = 'hide', message=FALSE} library(ggplot2) library(cowplot) theme_set(theme_cowplot()) -t.cells <- subset(immune.combined, idents = "CD4 Naive T") -Idents(t.cells) <- "stim" -avg.t.cells <- as.data.frame(log1p(AverageExpression(t.cells, verbose = FALSE)$RNA)) -avg.t.cells$gene <- rownames(avg.t.cells) - -cd14.mono <- subset(immune.combined, idents = "CD14 Mono") -Idents(cd14.mono) <- "stim" -avg.cd14.mono <- as.data.frame(log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA)) -avg.cd14.mono$gene <- rownames(avg.cd14.mono) +aggregate_ifnb <- AggregateExpression(ifnb,group.by = c("seurat_annotations","stim"),return.seurat = TRUE) genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8") -p1 <- ggplot(avg.t.cells, aes(CTRL, STIM)) + geom_point() + ggtitle("CD4 Naive T Cells") -p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE) -p2 <- ggplot(avg.cd14.mono, aes(CTRL, STIM)) + geom_point() + ggtitle("CD14 Monocytes") -p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE) -p1 + p2 + +p1 <- CellScatter(aggregate_ifnb, "CD14 Mono_CTRL","CD14 Mono_STIM",highlight = genes.to.label) +p2 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE) + +p3 <- CellScatter(aggregate_ifnb, "CD4 Naive T_CTRL","CD4 Naive T_STIM",highlight = genes.to.label) +p4 <- LabelPoints(plot = p3, points = genes.to.label, repel = TRUE) + +p2 + p4 ``` -As you can see, many of the same genes are upregulated in both of these cell types and likely represent a conserved interferon response pathway. -Because we are confident in having identified common cell types across condition, we can ask what genes change in different conditions for cells of the same type. First, we create a column in the meta.data slot to hold both the cell type and stimulation information and switch the current ident to that column. Then we use `FindMarkers()` to find the genes that are different between stimulated and control B cells. Notice that many of the top genes that show up here are the same as the ones we plotted earlier as core interferon response genes. Additionally, genes like CXCL10 which we saw were specific to monocyte and B cell interferon response show up as highly significant in this list as well. +We can now ask what genes change in different conditions for cells of the same type. First, we create a column in the meta.data slot to hold both the cell type and stimulation information and switch the current ident to that column. Then we use `FindMarkers()` to find the genes that are different between stimulated and control B cells. Notice that many of the top genes that show up here are the same as the ones we plotted earlier as core interferon response genes. Additionally, genes like CXCL10 which we saw were specific to monocyte and B cell interferon response show up as highly significant in this list as well. + +Please note that p-values obtained from this analysis should be interpreted with caution, as these tests treat each cell as an independent replicate, and ignore inherent correlations between cells originating from the same sample. As discussed [here](https://pubmed.ncbi.nlm.nih.gov/33257685/), DE tests across multiple conditions should expressly utilize multiple samples/replicates, and can be performed after aggregating ('pseudobulking') cells from the same sample and subpopulation together. We do not perform this analysis here, as there is a single replicate in the data, but please see our [vignette comparing healthy and diabetic samples](https://satijalab.org/seurat/articles/parsebio_sketch_integration) as an example for how to perform DE analysis across conditions. + ```{r de.genes} -immune.combined$celltype.stim <- paste(Idents(immune.combined), immune.combined$stim, sep = "_") -immune.combined$celltype <- Idents(immune.combined) -Idents(immune.combined) <- "celltype.stim" -b.interferon.response <- FindMarkers(immune.combined, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) +ifnb$celltype.stim <- paste(ifnb$seurat_annotations, ifnb$stim, sep = "_") +Idents(ifnb) <- "celltype.stim" +b.interferon.response <- FindMarkers(ifnb, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) head(b.interferon.response, n = 15) ``` + Another useful way to visualize these changes in gene expression is with the `split.by` option to the `FeaturePlot()` or `VlnPlot()` function. This will display FeaturePlots of the list of given genes, split by a grouping variable (stimulation condition here). Genes such as CD3D and GNLY are canonical cell type markers (for T cells and NK/CD8 T cells) that are virtually unaffected by interferon stimulation and display similar gene expression patterns in the control and stimulated group. IFI6 and ISG15, on the other hand, are core interferon response genes and are upregulated accordingly in all cell types. Finally, CD14 and CXCL10 are genes that show a cell type specific interferon response. CD14 expression decreases after stimulation in CD14 monocytes, which could lead to misclassification in a supervised analysis framework, underscoring the value of integrated analysis. CXCL10 shows a distinct upregulation in monocytes and B cells after interferon stimulation but not in other cell types. -```{r feature.heatmaps, fig.height = 14} -FeaturePlot(immune.combined, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 3, cols = c("grey", "red")) +```{r feature.heatmaps, fig.height = 10, fig.width=10} +FeaturePlot(ifnb, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 3, cols = c("grey", "red"), reduction='umap') ``` ```{r splitvln, fig.height = 12} -plots <- VlnPlot(immune.combined, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "celltype", pt.size = 0, combine = FALSE) +plots <- VlnPlot(ifnb, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "seurat_annotations", pt.size = 0, combine = FALSE) wrap_plots(plots = plots, ncol = 1) ``` -```{r save, include = FALSE} -saveRDS(immune.combined, file = "../output/immune.combined.rds") +```{r save, include = TRUE} ``` -# Performing integration on datasets normalized with SCTransform +# Perform integration with SCTransform-normalized datasets -In [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), we introduced an improved method for the normalization of scRNA-seq, based on regularized negative binomial regression. The method is named 'sctransform', and avoids some of the pitfalls of standard normalization workflows, including the addition of a pseudocount, and log-transformation. You can read more about sctransform in the [manuscript](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) or our [SCTransform vignette](sctransform_vignette.html). +As an alternative to log-normalization, Seurat also includes support for preprocessing of scRNA-seq using the [sctransform workflow](https://satijalab.org/seurat/articles/sctransform_vignette). The `IntegrateLayers` function also supports SCTransform-normalized data, by setting the `normalization.method` parameter, as shown below. -Below, we demonstrate how to modify the Seurat integration workflow for datasets that have been normalized with the sctransform workflow. The commands are largely similar, with a few key differences: +```{r sct, fig.height = 5} -* Normalize datasets individually by `SCTransform()`, instead of `NormalizeData()` prior to integration -* As discussed further in our [SCTransform vignette](sctransform_vignette.html), we typically use 3,000 or more features for analysis downstream of sctransform. -* Run the `PrepSCTIntegration()` function prior to identifying anchors -* When running `FindIntegrationAnchors()`, and `IntegrateData()`, set the `normalization.method` parameter to the value `SCT`. -* When running sctransform-based workflows, including integration, do not run the `ScaleData()` function +ifnb <- LoadData('ifnb') +# split datasets and process without integration +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f=ifnb$stim) +ifnb <- SCTransform(ifnb) +ifnb <- RunPCA(ifnb) +ifnb <- RunUMAP(ifnb,dims = 1:30) +DimPlot(ifnb,reduction = 'umap',group.by = c("stim","seurat_annotations")) -```{r panc8.cca.sct.init, results='hide', message=FALSE, fig.keep='none'} -LoadData('ifnb') -ifnb.list <- SplitObject(ifnb, split.by = "stim") -ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform) -features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000) -ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features) -``` +# integrate datasets +ifnb <- IntegrateLayers(object = ifnb, + method = CCAIntegration, + normalization.method="SCT", + verbose = F) +ifnb <- FindNeighbors(ifnb,reduction = 'integrated.dr',dims = 1:30) +ifnb <- FindClusters(ifnb,resolution = 0.6) +ifnb <- RunUMAP(ifnb,dims = 1:30,reduction = 'integrated.dr') +DimPlot(ifnb,reduction = 'umap',group.by = c("stim","seurat_annotations")) -```{r ifnb.cca.sct.anchors} -immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, normalization.method = 'SCT', anchor.features = features) -immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = 'SCT') -``` -```{r ifnb.cca.sct.clustering, results='hide', message=FALSE} -immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE) -immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30) -``` +# perform differential expression +ifnb <- PrepSCTFindMarkers(ifnb) +ifnb$celltype.stim <- paste(ifnb$seurat_annotations, ifnb$stim, sep = "_") +Idents(ifnb) <- 'celltype.stim' +b.interferon.response <- FindMarkers(ifnb, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) -```{r immunesca.cca.sct.split.dims} -p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim") -p2 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = 'seurat_annotations',label = TRUE, repel = TRUE) -p1 + p2 ``` -Now that the datasets have been integrated, you can follow the previous steps in this vignette identify cell types and cell type-specific responses. - -```{r save.times, include = FALSE} -write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/integration_introduction.csv") ```
diff --git a/vignettes/integration_large_datasets.Rmd b/vignettes/integration_large_datasets.Rmd index c1870ac0e..bf5867144 100644 --- a/vignettes/integration_large_datasets.Rmd +++ b/vignettes/integration_large_datasets.Rmd @@ -27,7 +27,8 @@ knitr::opts_chunk$set( fig.width = 10, message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -98,7 +99,7 @@ bm280k.integrated <- RunUMAP(bm280k.integrated, dims = 1:50) DimPlot(bm280k.integrated, group.by = "orig.ident") ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} library(ggplot2) plot <- DimPlot(bm280k.integrated, group.by = "orig.ident") + xlab("UMAP 1") + ylab("UMAP 2") + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + diff --git a/vignettes/integration_mapping.Rmd b/vignettes/integration_mapping.Rmd index 71f5769cd..9702c213c 100644 --- a/vignettes/integration_mapping.Rmd +++ b/vignettes/integration_mapping.Rmd @@ -7,7 +7,7 @@ output: pdf_document: default date: 'Compiled: `r Sys.Date()`' --- -*** + ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk knitr::knit_hooks$set(time_it = local({ @@ -28,7 +28,8 @@ knitr::opts_chunk$set( error = FALSE, message = FALSE, fig.width = 8, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -43,76 +44,44 @@ For the purposes of this example, we've chosen human pancreatic islet cell datas ```{r libraries, message=FALSE, warning=FALSE} library(Seurat) library(SeuratData) +library(ggplot2) ``` ```{r install.data, eval=FALSE} InstallData('panc8') ``` -To construct a reference, we will identify 'anchors' between the individual datasets. First, we split the combined object into a list, with each dataset as an element (this is only necessary because the data was bundled together for easy distribution). +As a demonstration, we will use a subset of technologies to construct a reference. We will then map the remaining datasets onto this reference. We start by selecting cells from four technologies, and performing an analysis without integration. ```{r preprocessing1} -data('panc8') -pancreas.list <- SplitObject(panc8, split.by = "tech") -pancreas.list <- pancreas.list[c("celseq", "celseq2", "fluidigmc1", "smartseq2")] -``` - -Prior to finding anchors, we perform standard preprocessing (log-normalization), and identify variable features individually for each. Note that Seurat implements an improved method for variable feature selection based on a variance stabilizing transformation (`"vst"`) - -```{r preprocessing3} -for (i in 1:length(pancreas.list)) { - pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE) - pancreas.list[[i]] <- FindVariableFeatures(pancreas.list[[i]], selection.method = "vst", - nfeatures = 2000, verbose = FALSE) -} -``` - -# Integration of 3 pancreatic islet cell datasets - -Next, we identify anchors using the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input. Here, we integrate three of the objects into a reference (we will use the fourth later in this vignette as a query dataset to demonstrate mapping). - -* We use all default parameters here for identifying anchors, including the 'dimensionality' of the dataset (30; feel free to try varying this parameter over a broad range, for example between 10 and 50). - -```{r integration.anchors, warning = FALSE, message = FALSE} -reference.list <- pancreas.list[c("celseq", "celseq2", "smartseq2")] -pancreas.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30) -``` - -We then pass these anchors to the `IntegrateData()` function, which returns a Seurat object. - -* The returned object will contain a new `Assay`, which holds an integrated (or 'batch-corrected') expression matrix for all cells, enabling them to be jointly analyzed. - -```{r data.integration, warning = FALSE, message = FALSE} -pancreas.integrated <- IntegrateData(anchorset = pancreas.anchors, dims = 1:30) -``` - -After running `IntegrateData()`, the `Seurat` object will contain a new `Assay` with the integrated expression matrix. Note that the original (uncorrected values) are still stored in the object in the "RNA" assay, so you can switch back and forth. - -We can then use this new integrated matrix for downstream analysis and visualization. Here we scale the integrated data, run PCA, and visualize the results with UMAP. The integrated datasets cluster by cell type, instead of by technology. - -```{r analysis, message = FALSE, warning=FALSE, fig.width=10} -library(ggplot2) -library(cowplot) -library(patchwork) -#switch to integrated assay. The variable features of this assay are automatically -#set during IntegrateData -DefaultAssay(pancreas.integrated) <- 'integrated' -# Run the standard workflow for visualization and clustering -pancreas.integrated <- ScaleData(pancreas.integrated, verbose = FALSE) -pancreas.integrated <- RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE) -pancreas.integrated <- RunUMAP(pancreas.integrated, reduction = "pca", dims = 1:30, - verbose = FALSE) -p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "tech") -p2 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", - label = TRUE, repel = TRUE) + NoLegend() -p1 + p2 -``` - -```{r save.img, include = FALSE} -plot <- DimPlot(pancreas.integrated, reduction = "umap", label = TRUE, label.size = 4.5) + xlab("UMAP 1") + ylab("UMAP 2") + - theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + - guides(colour = guide_legend(override.aes = list(size = 10))) -ggsave(filename = "pancreas_integrated_umap.jpg", height = 7, width = 12, plot = plot, quality = 50) +panc8 <- LoadData('panc8') +table(panc8$tech) + +# we will use data from 2 technologies for the reference +pancreas.ref <- subset(panc8, tech %in% c("celseq2", "smartseq2")) +pancreas.ref[["RNA"]] <- split(pancreas.ref[["RNA"]], f = pancreas.ref$tech) + +# pre-process dataset (without integration) +pancreas.ref <- NormalizeData(pancreas.ref) +pancreas.ref <- FindVariableFeatures(pancreas.ref) +pancreas.ref <- ScaleData(pancreas.ref) +pancreas.ref <- RunPCA(pancreas.ref) +pancreas.ref <- FindNeighbors(pancreas.ref, dims=1:30) +pancreas.ref <- FindClusters(pancreas.ref) +pancreas.ref <- RunUMAP(pancreas.ref, dims = 1:30) +DimPlot(pancreas.ref,group.by = c("celltytpe","tech")) +``` + +Next, we integrate the datasets into a shared reference. Please see our [introduction to integration vignette](https://satijalab.org/seurat/articles/integration_introduction) +```{r preprocessing3, fig.height = 4} +pancreas.ref <- IntegrateLayers( + object = pancreas.ref, method = CCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.cca', + verbose = FALSE) +pancreas.ref <- FindNeighbors(pancreas.ref,reduction='integrated.cca',dims=1:30) +pancreas.ref <- FindClusters(pancreas.ref) +pancreas.ref <- RunUMAP(pancreas.ref,reduction='integrated.cca',dims=1:30) +DimPlot(pancreas.ref,group.by = c("tech","celltype")) ``` # Cell type classification using an integrated reference @@ -125,9 +94,12 @@ Seurat also supports the projection of reference data (or meta data) onto a quer After finding anchors, we use the `TransferData()` function to classify the query cells based on reference data (a vector of reference cell type labels). `TransferData()` returns a matrix with predicted IDs and prediction scores, which we can add to the query metadata. ```{r label.transfer, warning = FALSE, message = FALSE} -pancreas.query <- pancreas.list[["fluidigmc1"]] -pancreas.anchors <- FindTransferAnchors(reference = pancreas.integrated, query = pancreas.query, dims = 1:30, reference.reduction = "pca") -predictions <- TransferData(anchorset = pancreas.anchors, refdata = pancreas.integrated$celltype, dims = 1:30) + +# select two technologies for the query datasets +pancreas.query <- subset(panc8, tech %in% c("fluidigmc1","celseq")) +pancreas.query <- NormalizeData(pancreas.query) +pancreas.anchors <- FindTransferAnchors(reference = pancreas.ref, query = pancreas.query, dims = 1:30, reference.reduction="pca") +predictions <- TransferData(anchorset = pancreas.anchors, refdata = pancreas.ref$celltype, dims = 1:30) pancreas.query <- AddMetaData(pancreas.query, metadata = predictions) ``` @@ -147,16 +119,16 @@ VlnPlot(pancreas.query, c("REG1A", "PPY", "SST", "GHRL", "VWF", "SOX10"), group. # Unimodal UMAP Projection -In Seurat v4, we also enable projection of a query onto the reference UMAP structure. This can be achieved by computing the reference UMAP model and then calling `MapQuery()` instead of `TransferData()`. +We also enable projection of a query onto the reference UMAP structure. This can be achieved by computing the reference UMAP model and then calling `MapQuery()` instead of `TransferData()`. ```{r label.transfer.v4, warning = FALSE, message = FALSE} -pancreas.integrated <- RunUMAP(pancreas.integrated, dims = 1:30, reduction = "pca", return.model = TRUE) +pancreas.ref <- RunUMAP(pancreas.ref, dims = 1:30, reduction = "integrated.cca", return.model = TRUE) pancreas.query <- MapQuery( anchorset = pancreas.anchors, - reference = pancreas.integrated, + reference = pancreas.ref, query = pancreas.query, refdata = list(celltype = 'celltype'), - reference.reduction = 'pca', + reference.reduction = "pca", reduction.model = 'umap' ) ``` @@ -169,20 +141,20 @@ pancreas.query <- MapQuery( ```{r, eval=FALSE} pancreas.query <- TransferData( anchorset = pancreas.anchors, - reference = pancreas.integrated, + reference = pancreas.ref, query = pancreas.query, refdata = list(celltype = "celltype") ) pancreas.query <- IntegrateEmbeddings( anchorset = pancreas.anchors, - reference = pancreas.integrated, + reference = pancreas.ref, query = pancreas.query, new.reduction.name = "ref.pca" ) pancreas.query <- ProjectUMAP( query = pancreas.query, query.reduction = "ref.pca", - reference = pancreas.integrated, + reference = pancreas.ref, reference.reduction = "pca", reduction.model = "umap" ) @@ -192,7 +164,7 @@ pancreas.query <- ProjectUMAP( We can now visualize the query cells alongside our reference. ```{r panc.refdimplots, fig.width=10} -p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", label = TRUE, +p1 <- DimPlot(pancreas.ref, reduction = "umap", group.by = "celltype", label = TRUE, label.size = 3 ,repel = TRUE) + NoLegend() + ggtitle("Reference annotations") p2 <- DimPlot(pancreas.query, reduction = "ref.umap", group.by = "predicted.celltype", label = TRUE, label.size = 3 ,repel = TRUE) + NoLegend() + ggtitle("Query transferred labels") diff --git a/vignettes/integration_rpca.Rmd b/vignettes/integration_rpca.Rmd index d1ae0d266..f249d13c0 100644 --- a/vignettes/integration_rpca.Rmd +++ b/vignettes/integration_rpca.Rmd @@ -26,7 +26,8 @@ knitr::opts_chunk$set( fig.width = 10, message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -34,7 +35,8 @@ In this vignette, we present a slightly modified workflow for the integration of By identifying shared sources of variation between datasets, CCA is well-suited for identifying anchors when cell types are conserved, but there are very substantial differences in gene expression across experiments. CCA-based integration therefore enables integrative analysis when experimental conditions or disease states introduce very strong expression shifts, or when integrating datasets across modalities and species. However, CCA-based integration may also lead to overcorrection, especially when a large proportion of cells are non-overlapping across datasets. -RPCA-based integration runs significantly faster, and also represents a more conservative approach where cells in different biological states are less likely to 'align' after integration. We therefore,recommend RPCA during integrative analysis where: +RPCA-based integration runs significantly faster, and also represents a more conservative approach where cells in different biological states are less likely to 'align' after integration. We therefore recommend RPCA during integrative analysis where: + * A substantial fraction of cells in one dataset have no matching type in the other * Datasets originate from the same platform (i.e. multiple lanes of 10x genomics) * There are a large number of datasets or cells to integrate (see [here](integration_large_datasets.html) for more tips on integrating large datasets) @@ -46,6 +48,7 @@ options(SeuratData.repo.use = "http://satijalab04.nygenome.org") ``` ```{r installdata} +library(Seurat) library(SeuratData) # install dataset InstallData('ifnb') @@ -53,7 +56,7 @@ InstallData('ifnb') ```{r init, results='hide', message=FALSE, fig.keep='none'} # load dataset -LoadData('ifnb') +ifnb <- LoadData('ifnb') # split the dataset into a list of two seurat objects (stim and CTRL) ifnb.list <- SplitObject(ifnb, split.by = "stim") @@ -130,7 +133,7 @@ p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE) p1 + p2 ``` -```{r save.img, include = FALSE} +```{r save.img, include = TRUE} library(ggplot2) plot <- DimPlot(immune.combined, group.by = "stim") + xlab("UMAP 1") + ylab("UMAP 2") + @@ -146,7 +149,7 @@ Now that the datasets have been integrated, you can follow the previous steps in As an additional example, we repeat the analyses performed above, but normalize the datasets using [SCTransform](sctransform_vignette.html). We may choose to set the `method` parameter to `glmGamPoi` (install [here](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html)) in order to enable faster estimation of regression parameters in `SCTransform()`. ```{r panc8.cca.sct.init, results='hide', message=FALSE, fig.keep='none'} -LoadData('ifnb') +ifnb <- LoadData('ifnb') ifnb.list <- SplitObject(ifnb, split.by = "stim") ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform, method = "glmGamPoi") features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000) diff --git a/vignettes/interaction_vignette.Rmd b/vignettes/interaction_vignette.Rmd index 2c52fe43c..2b007dbfb 100644 --- a/vignettes/interaction_vignette.Rmd +++ b/vignettes/interaction_vignette.Rmd @@ -25,11 +25,12 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` -```{r, include = FALSE} +```{r, include = TRUE} options(SeuratData.repo.use = "http://satijalab04.nygenome.org") ``` @@ -116,27 +117,27 @@ cluster.averages <- AverageExpression(pbmc) head(cluster.averages[['RNA']][, 1:5]) # Return this information as a Seurat object (enables downstream plotting and analysis) -# First, replace spaces with underscores '_' so ggplot2 doesn't fail +# First, replace spaces with underscores '-' so ggplot2 doesn't fail orig.levels <- levels(pbmc) -Idents(pbmc) <- gsub(pattern = ' ', replacement = '_', x = Idents(pbmc)) -orig.levels <- gsub(pattern = ' ', replacement = '_', x = orig.levels) +Idents(pbmc) <- gsub(pattern = ' ', replacement = '-', x = Idents(pbmc)) +orig.levels <- gsub(pattern = ' ', replacement = '-', x = orig.levels) levels(pbmc) <- orig.levels cluster.averages <- AverageExpression(pbmc, return.seurat = TRUE) cluster.averages # How can I plot the average expression of NK cells vs. CD8 T cells? # Pass do.hover = T for an interactive plot to identify gene outliers -CellScatter(cluster.averages, cell1 = "NK", cell2 = "CD8_T") +CellScatter(cluster.averages, cell1 = "NK", cell2 = "CD8-T") # How can I calculate expression averages separately for each replicate? cluster.averages <- AverageExpression(pbmc, return.seurat = TRUE, add.ident = "replicate") -CellScatter(cluster.averages, cell1 = "CD8_T_rep1", cell2 = "CD8_T_rep2") +CellScatter(cluster.averages, cell1 = "CD8-T_rep1", cell2 = "CD8-T_rep2") # You can also plot heatmaps of these 'in silico' bulk datasets to visualize agreement between replicates DoHeatmap(cluster.averages, features = unlist(TopFeatures(pbmc[['pca']], balanced = TRUE)), size = 3, draw.lines = FALSE) ``` -```{r save.times, include = FALSE} +```{r save.times, include = TRUE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/interaction_vignette_times.csv") ``` diff --git a/vignettes/merge_vignette.Rmd b/vignettes/merge_vignette.Rmd index 2fd230f7d..4d1adca2d 100644 --- a/vignettes/merge_vignette.Rmd +++ b/vignettes/merge_vignette.Rmd @@ -26,11 +26,12 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` -```{r, include = FALSE} +```{r, include = TRUE} options(SeuratData.repo.use = "http://satijalab04.nygenome.org") ``` @@ -96,7 +97,7 @@ GetAssayData(pbmc.combined)[1:10, 1:15] GetAssayData(pbmc.normalized)[1:10, 1:15] ``` -```{r save.times, include = FALSE} +```{r save.times, include = TRUE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/merge_vignette_times.csv") ``` diff --git a/vignettes/mixscape_vignette.Rmd b/vignettes/mixscape_vignette.Rmd index 4edf7fe62..18bd24863 100644 --- a/vignettes/mixscape_vignette.Rmd +++ b/vignettes/mixscape_vignette.Rmd @@ -23,14 +23,15 @@ knitr::knit_hooks$set(time_it = local({ knitr::opts_chunk$set( message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) -options(SeuratData.repo.use = 'satijalab04.nygenome.org') +options(SeuratData.repo.use = 'seurat.nygenome.org') ``` # Overview -This tutorial demonstrates how to use mixscape for the analyses of single-cell pooled CRSIPR screens. We introduce new Seurat functions for: +This tutorial demonstrates how to use mixscape for the analyses of single-cell pooled CRISPR screens. We introduce new Seurat functions for: 1. Calculating the perturbation-specific signature of every cell. 2. Identifying and removing cells that have 'escaped' CRISPR perturbation. @@ -301,11 +302,13 @@ VlnPlot( theme(axis.text.x = element_text(angle = 0, hjust = 0.5), plot.title = element_text(size = 20), axis.text = element_text(size = 16)) ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} p <- VlnPlot(object = eccite, features = "adt_PDL1", idents = c("NT","JAK2","STAT1","IFNGR1","IFNGR2", "IRF1"), group.by = "gene", pt.size = 0.2, sort = T, split.by = "mixscape_class.global", cols = c("coral3","grey79","grey39")) +ggtitle("PD-L1 protein") +theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) -ggsave(filename = "../output/images/mixscape_vignette.jpg", height = 7, width = 12, plot = p, quality = 50) ``` +```{r, include=FALSE} +ggsave(filename = "../output/images/mixscape_vignette.jpg", height = 7, width = 12, plot = p, quality = 50) +``` # Visualizing perturbation responses with Linear Discriminant Analysis (LDA) We use LDA as a dimensionality reduction method to visualize perturbation-specific clusters. LDA is trying to maximize the separability of known labels (mixscape classes) using both gene expression and the labels as input. diff --git a/vignettes/multimodal_reference_mapping.Rmd b/vignettes/multimodal_reference_mapping.Rmd index 2d0be4f07..0decf0ee0 100644 --- a/vignettes/multimodal_reference_mapping.Rmd +++ b/vignettes/multimodal_reference_mapping.Rmd @@ -23,7 +23,8 @@ knitr::knit_hooks$set(time_it = local({ knitr::opts_chunk$set( message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -41,30 +42,24 @@ In this vignette, we demonstrate how to use a previously established reference t To run this vignette please install Seurat v4, available on CRAN. Additionally, you will need to install the `SeuratDisk` package. -```{r install, eval = FALSE} -install.packages("Seurat") -remotes::install_github("mojaveazure/seurat-disk") -``` - ```{r packages, cache=FALSE} library(Seurat) -library(SeuratDisk) library(ggplot2) library(patchwork) ``` -```{r, include = FALSE, cache=FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +```{r, include = TRUE, cache=FALSE} +options(SeuratData.repo.use = "http://seurat.nygenome.org") ``` # Example 1: Mapping human peripheral blood cells ## A Multimodal PBMC Reference Dataset -We load the reference (download [here](https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat)) from our recent [paper](https://doi.org/10.1016/j.cell.2021.04.048), and visualize the pre-computed UMAP. This reference is stored as an h5Seurat file, a format that enables on-disk storage of multimodal Seurat objects (more details on h5Seurat and `SeuratDisk` can be found [here](https://mojaveazure.github.io/seurat-disk/index.html)). +We load the reference from our recent [paper](https://doi.org/10.1016/j.cell.2021.04.048), and visualize the pre-computed UMAP. ```{r pbmc.ref} -reference <- LoadH5Seurat("../data/pbmc_multimodal.h5seurat") +reference <- readRDS("/brahms/hartmana/vignette_data/pbmc_multimodal_2023.rds") ``` ```{r ref.dimplot} @@ -78,6 +73,9 @@ To demonstrate mapping to this multimodal reference, we will use a dataset of 2, ```{r 3k.load} library(SeuratData) InstallData('pbmc3k') + +pbmc3k <- LoadData('pbmc3k') +pbmc3k <- UpdateSeuratObject(pbmc3k) ``` The reference was normalized using `SCTransform()`, so we use the same approach to normalize the query here. @@ -165,7 +163,7 @@ Each prediction is assigned a score between 0 and 1. FeaturePlot(pbmc3k, features = c("pDC", "CD16 Mono", "Treg"), reduction = "ref.umap", cols = c("lightgrey", "darkred"), ncol = 3) & theme(plot.title = element_text(size = 10)) ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} library(ggplot2) plot <- FeaturePlot(pbmc3k, features = "CD16 Mono", reduction = "ref.umap", cols = c("lightgrey", "darkred")) + ggtitle("CD16 Mono") + theme(plot.title = element_text(hjust = 0.5, size = 30)) + labs(color = "Prediction Score") + xlab("UMAP 1") + ylab("UMAP 2") + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18), legend.title = element_text(size = 25)) @@ -198,7 +196,7 @@ In our [manuscript](https://doi.org/10.1016/j.cell.2021.04.048), we map a query We emphasize that if users are attempting to map datasets where the underlying samples are not PBMC, or contain cell types that are not present in the reference, computing a 'de novo' visualization is an important step in interpreting their dataset. -```{r hiddendiet, include=FALSE} +```{r hiddendiet, include=TRUE} reference <- DietSeurat(reference, counts = FALSE, dimreducs = "spca") pbmc3k <- DietSeurat(pbmc3k, counts = FALSE, dimreducs = "ref.spca") ``` @@ -283,8 +281,8 @@ If you want to save and load a cached index for a `Neighbor` object generated wi ```{r neighbor.demo} bm[["spca.annoy.neighbors"]] -SaveAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "../data/reftmp.idx") -bm[["spca.annoy.neighbors"]] <- LoadAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "../data/reftmp.idx") +SaveAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "/brahms/shared/vignette-data/reftmp.idx") +bm[["spca.annoy.neighbors"]] <- LoadAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "/brahms/shared/vignette-data/reftmp.idx") ```
diff --git a/vignettes/multimodal_vignette.Rmd b/vignettes/multimodal_vignette.Rmd index 5c4911f69..7f9193618 100644 --- a/vignettes/multimodal_vignette.Rmd +++ b/vignettes/multimodal_vignette.Rmd @@ -7,7 +7,6 @@ output: pdf_document: default date: 'Compiled: `r Sys.Date()`' --- -*** ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk @@ -28,15 +27,16 @@ knitr::opts_chunk$set( message = FALSE, warning = FALSE, time_it = TRUE, - fig.width = 10 + fig.width = 9, + error = TRUE ) ``` # Load in the data -The ability to make simultaneous measurements of multiple data types from the same cell, known as multimodal analysis, represents a new and exciting frontier for single-cell genomics. For example, [CITE-seq](http://www.nature.com/nmeth/journal/v14/n9/full/nmeth.4380.html) enables the simultaneous measurements of transcriptomes and cell-surface proteins from the same cell. Other exciting multimodal technologies, such as the [10x multiome kit](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression) allow for the paired measurements of cellular transcriptome and chromatin accessibility (i.e scRNA-seq+scATAC-seq). Other modalities that can be measured alongside cellular transcriptomes include genetic perturbations, cellular methylomes, and hashtag oligos from [Cell Hashing](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1). We have designed Seurat4 to enable for the seamless storage, analysis, and exploration of diverse multimodal single-cell datasets. +The ability to make simultaneous measurements of multiple data types from the same cell, known as multimodal analysis, represents a new and exciting frontier for single-cell genomics. For example, [CITE-seq](http://www.nature.com/nmeth/journal/v14/n9/full/nmeth.4380.html) enables the simultaneous measurements of transcriptomes and cell-surface proteins from the same cell. Other exciting multimodal technologies, such as the [10x multiome kit](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression) allow for the paired measurements of cellular transcriptome and chromatin accessibility (i.e scRNA-seq+scATAC-seq). Other modalities that can be measured alongside cellular transcriptomes include genetic perturbations, cellular methylomes, and hashtag oligos from [Cell Hashing](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1). We have designed Seurat to enable for the seamless storage, analysis, and exploration of diverse multimodal single-cell datasets. -In this vignette, we present an introductory workflow for creating a multimodal Seurat object and performing an initial analysis. For example, we demonstrate how to cluster a CITE-seq dataset on the basis of the measured cellular transcriptomes, and subsequently discover cell surface proteins that are enriched in each cluster. We note that Seurat4 also enables more advanced techniques for the analysis of multimodal data, in particular the application of our [Weighted Nearest Neighbors (WNN) approach](https://doi.org/10.1016/j.cell.2021.04.048) that enables simultaneous clustering of cells based on a weighted combination of both modalities, and you can explore this functionality [here](weighted_nearest_neighbor_analysis.html). +In this vignette, we present an introductory workflow for creating a multimodal Seurat object and performing an initial analysis. For example, we demonstrate how to cluster a CITE-seq dataset on the basis of the measured cellular transcriptomes, and subsequently discover cell surface proteins that are enriched in each cluster. We note that Seurat also enables more advanced techniques for the analysis of multimodal data, in particular the application of our [Weighted Nearest Neighbors (WNN) approach](https://doi.org/10.1016/j.cell.2021.04.048) that enables simultaneous clustering of cells based on a weighted combination of both modalities, and you can explore this functionality [here](weighted_nearest_neighbor_analysis.html). Here, we analyze a dataset of 8,617 cord blood mononuclear cells (CBMCs), where transcriptomic measurements are paired with abundance estimates for 11 surface proteins, whose levels are quantified with DNA-barcoded antibodies. First, we load in two count matrices : one for the RNA measurements, and one for the antibody-derived tags (ADT). You can download the ADT file [here](ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz) and the RNA file [here](ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz) @@ -50,13 +50,13 @@ library(patchwork) # Load in the RNA UMI matrix # Note that this dataset also contains ~5% of mouse cells, which we can use as negative controls for the protein measurements. For this reason, the gene expression matrix has HUMAN_ or MOUSE_ appended to the beginning of each gene. -cbmc.rna <- as.sparse(read.csv(file = '../data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) +cbmc.rna <- as.sparse(read.csv(file = '/brahms/shared/vignette-data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) # To make life a bit easier going forward, we're going to discard all but the top 100 most highly expressed mouse genes, and remove the "HUMAN_" from the CITE-seq prefix cbmc.rna <- CollapseSpeciesExpressionMatrix(cbmc.rna) # Load in the ADT UMI matrix -cbmc.adt <- as.sparse(read.csv(file = '../data/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) +cbmc.adt <- as.sparse(read.csv(file = '/brahms/shared/vignette-data/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) # Note that since measurements were made in the same cells, the two matrices have identical column names all.equal(colnames(cbmc.rna),colnames(cbmc.adt)) @@ -74,7 +74,7 @@ cbmc <- CreateSeuratObject(counts = cbmc.rna) Assays(cbmc) # create a new assay to store ADT information -adt_assay <- CreateAssayObject(counts = cbmc.adt) +adt_assay <- CreateAssay5Object(counts = cbmc.adt) # add this assay to the previously created Seurat object cbmc[["ADT"]] <- adt_assay @@ -186,7 +186,7 @@ FeatureScatter(cbmc, feature1 = 'adt_CD4', feature2 = 'adt_CD8', slot = 'counts' Seurat is also able to analyze data from multimodal 10X experiments processed using CellRanger v3; as an example, we recreate the plots above using a dataset of 7,900 peripheral blood mononuclear cells (PBMC), freely available from 10X Genomics [here](https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_10k_protein_v3). ```{r pbmc10x, fig.height=4.5, fig.width=10} -pbmc10k.data <- Read10X(data.dir = '../data/pbmc10k/filtered_feature_bc_matrix/') +pbmc10k.data <- Read10X(data.dir = '/brahms/shared/vignette-data/pbmc10k/filtered_feature_bc_matrix/') rownames(x = pbmc10k.data[['Antibody Capture']]) <- gsub( pattern = '_[control_]*TotalSeqB', replacement = '', @@ -204,7 +204,7 @@ plot3 <- FeatureScatter(pbmc10k, feature1 = 'adt_CD3', feature2 = 'CD3E', pt.siz (plot1 + plot2 + plot3) & NoLegend() ``` -```{r save.img, include = FALSE} +```{r save.img, include = TRUE} plot <- FeatureScatter(cbmc, feature1 = "adt_CD19", feature2 = "adt_CD3") + NoLegend() + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) ggsave(filename = "../output/images/citeseq_plot.jpg", height = 7, width = 12, plot = plot, quality = 50) @@ -219,11 +219,12 @@ write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/multimodal_ Seurat v4 also includes additional functionality for the analysis, visualization, and integration of multimodal datasets. For more information, please explore the resources below: * Defining cellular identity from multimodal data using WNN analysis in Seurat v4 [vignette](weighted_nearest_neighbor_analysis.html) -* Mapping scRNA-seq data onto CITE-seq references [[vignette](reference_mapping.html)] -* Introduction to the analysis of spatial transcriptomics analysis [[vignette](spatial_vignette.html)] -* Analysis of 10x multiome (paired scRNA-seq + ATAC) using WNN analysis [[vignette](weighted_nearest_neighbor_analysis.html)] -* Signac: Analysis, interpretation, and exploration of single-cell chromatin datasets [[package](https://satijalab.org/signac/)] -* Mixscape: an analytical toolkit for pooled single-cell genetic screens [[vignette](mixscape_vignette.html)] +* Mapping scRNA-seq data onto CITE-seq references [vignette](multimodal_reference_mapping.html) +* Introduction to the analysis of spatial transcriptomics analysis [vignette](spatial_vignette.html) +* Analysis of 10x multiome (paired scRNA-seq + ATAC) using WNN analysis [vignette](weighted_nearest_neighbor_analysis.html) +* Signac: Analysis, interpretation, and exploration of single-cell chromatin datasets [package](https://satijalab.org/signac/) +* Mixscape: an analytical toolkit for pooled single-cell genetic screens [vignette](mixscape_vignette.html) +* Bridge integration: mapping multi-omic datasets across molecular modalities [vignette](https://satijalab.org/seurat/articles/seurat5_integration_bridge)
**Session Info** diff --git a/vignettes/pancreas_integrated_umap.jpg b/vignettes/pancreas_integrated_umap.jpg new file mode 100644 index 000000000..560a2b1a6 Binary files /dev/null and b/vignettes/pancreas_integrated_umap.jpg differ diff --git a/vignettes/pbmc3k_tutorial.Rmd b/vignettes/pbmc3k_tutorial.Rmd index a51c8b463..777303216 100644 --- a/vignettes/pbmc3k_tutorial.Rmd +++ b/vignettes/pbmc3k_tutorial.Rmd @@ -6,7 +6,6 @@ output: df_print: kable date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' --- -*** ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk @@ -26,7 +25,8 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -34,9 +34,9 @@ knitr::opts_chunk$set( For this tutorial, we will be analyzing the a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10X Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. The raw data can be found [here](https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz). -We start by reading in the data. The `Read10X()` function reads in the output of the [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline from 10X, returning a unique molecular identified (UMI) count matrix. The values in this matrix represent the number of molecules for each feature (i.e. gene; row) that are detected in each cell (column). +We start by reading in the data. The `Read10X()` function reads in the output of the [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline from 10X, returning a unique molecular identified (UMI) count matrix. The values in this matrix represent the number of molecules for each feature (i.e. gene; row) that are detected in each cell (column). Note that more recent versions of cellranger now also output using the [h5 file format](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices), which can be read in using the `Read10X_h5()` function in Seurat. -We next use the count matrix to create a `Seurat` object. The object serves as a container that contains both data (like the count matrix) and analysis (like PCA, or clustering results) for a single-cell dataset. For a technical discussion of the `Seurat` object structure, check out our [GitHub Wiki](https://github.com/satijalab/seurat/wiki). For example, the count matrix is stored in `pbmc[["RNA"]]@counts`. +We next use the count matrix to create a `Seurat` object. The object serves as a container that contains both data (like the count matrix) and analysis (like PCA, or clustering results) for a single-cell dataset. For more information, check out our [Seurat object interaction vignette], or our [GitHub Wiki](https://github.com/satijalab/seurat/wiki). For example, in Seurat v5, the count matrix is stored in `pbmc[["RNA"]]$counts`. ```{r init} library(dplyr) @@ -44,7 +44,7 @@ library(Seurat) library(patchwork) # Load the PBMC dataset -pbmc.data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc.data <- Read10X(data.dir = "/brahms/mollag/practice/filtered_gene_bc_matrices/hg19/") # Initialize the Seurat object with the raw (non-normalized data). pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) pbmc @@ -59,7 +59,7 @@ pbmc.data[c("CD3D","TCL1A","MS4A1"), 1:30] The `.` values in the matrix represent 0s (no molecules detected). Since most values in an scRNA-seq matrix are 0, Seurat uses a sparse-matrix representation whenever possible. This results in significant memory and speed savings for Drop-seq/inDrop/10x data. -````{r} +```{r} dense.size <- object.size(as.matrix(pbmc.data)) dense.size sparse.size <- object.size(pbmc.data) @@ -86,7 +86,7 @@ Seurat allows you to easily explore QC metrics and filter cells based on any use + We calculate mitochondrial QC metrics with the `PercentageFeatureSet()` function, which calculates the percentage of counts originating from a set of features + We use the set of all genes starting with `MT-` as a set of mitochondrial genes -```{r mito, fig.height=7, fig.width=13} +```{r mito} # The [[ operator can add columns to object metadata. This is a great place to stash QC stats pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") ``` @@ -96,7 +96,7 @@ pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") * The number of unique genes and total molecules are automatically calculated during `CreateSeuratObject()` + You can find them stored in the object meta data -```{r qc, fig.height=7, fig.width=13} +```{r qc} # Show QC metrics for the first 5 cells head(pbmc@meta.data, 5) ``` @@ -126,7 +126,9 @@ pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent # Normalizing the data -After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. Normalized values are stored in `pbmc[["RNA"]]@data`. +After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. In Seurat v5, Normalized values are stored in `pbmc[["RNA"]]$data`. + + ```{r normalize} pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 1e4) @@ -137,13 +139,16 @@ For clarity, in this previous line of code (and in future commands), we provide pbmc <- NormalizeData(pbmc) ``` +While this method of normalization is standard and widely used in scRNA-seq analysis, global-scaling relies on an assumption that each cell originally contains the same number of RNA molecules. We and others have developed alternative workflows for the single cell preprocessing that do not make these assumptions. For users who are interested, please check out our `SCTransform()` normalization workflow. The method is described in our[paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9), with a separate vignette using Seurat [here](https://satijalab.org/seurat/articles/sctransform_vignette.html). The use of `SCTransform` replaces the need to run `NormalizeData`, `FindVariableFeatures`, or `ScaleData` (described below.) + + # Identification of highly variable features (feature selection) We next calculate a subset of features that exhibit high cell-to-cell variation in the dataset (i.e, they are highly expressed in some cells, and lowly expressed in others). We and [others](https://www.nature.com/articles/nmeth.2645) have found that focusing on these genes in downstream analysis helps to highlight biological signal in single-cell datasets. Our procedure in Seurat is described in detail [here](https://doi.org/10.1016/j.cell.2019.05.031), and improves on previous versions by directly modeling the mean-variance relationship inherent in single-cell data, and is implemented in the `FindVariableFeatures()` function. By default, we return 2,000 features per dataset. These will be used in downstream analysis, like PCA. -```{r var_features, fig.height=5, fig.width=11} +```{r var_features, fig.height=5, fig.width=10} pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst', nfeatures = 2000) # Identify the 10 most highly variable genes @@ -164,38 +169,32 @@ Next, we apply a linear transformation ('scaling') that is a standard pre-proces * Shifts the expression of each gene, so that the mean expression across cells is 0 * Scales the expression of each gene, so that the variance across cells is 1 + This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate -* The results of this are stored in `pbmc[["RNA"]]@scale.data` +* The results of this are stored in `pbmc[["RNA"]]$scale.data` +* By default, only variable features are scaled. +* You can specify the `features` argument to scale additional features -```{r regress, fig.height=7, fig.width=11, results='hide'} +```{r regress, results='hide'} all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes) ``` -
- **This step takes too long! Can I make it faster?** -Scaling is an essential step in the Seurat workflow, but only on genes that will be used as input to PCA. Therefore, the default in `ScaleData()` is only to perform scaling on the previously identified variable features (2,000 by default). To do this, omit the `features` argument in the previous function call, i.e. -```{r regressvar, fig.height=7, fig.width=11, results='hide',eval = FALSE} -pbmc <- ScaleData(pbmc) -``` -Your PCA and clustering results will be unaffected. However, Seurat heatmaps (produced as shown below with `DoHeatmap()`) require genes in the heatmap to be scaled, to make sure highly-expressed genes don't dominate the heatmap. To make sure we don't leave any genes out of the heatmap later, we are scaling all genes in this tutorial. -
-\
- **How can I remove unwanted sources of variation, as in Seurat v2?** + **How can I remove unwanted sources of variation** -In `Seurat v2` we also use the `ScaleData()` function to remove unwanted sources of variation from a single-cell dataset. For example, we could 'regress out' heterogeneity associated with (for example) cell cycle stage, or mitochondrial contamination. These features are still supported in `ScaleData()` in `Seurat v3`, i.e.: -```{r regressvarmt, fig.height=7, fig.width=11, results='hide',eval = FALSE} +In Seurat, we also use the `ScaleData()` function to remove unwanted sources of variation from a single-cell dataset. For example, we could 'regress out' heterogeneity associated with (for example) [cell cycle stage](https://satijalab.org/seurat/articles/cell_cycle_vignette), or mitochondrial contamination i.e.: +```{r regressvarmt, eval = FALSE} pbmc <- ScaleData(pbmc, vars.to.regress = 'percent.mt') ``` -However, particularly for advanced users who would like to use this functionality, we strongly recommend the use of our new normalization workflow, `SCTransform()`. The method is described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), with a separate vignette using Seurat v3 [here](sctransform_vignette.html). As with `ScaleData()`, the function `SCTransform()` also includes a `vars.to.regress` parameter. +However, particularly for advanced users who would like to use this functionality, we strongly recommend the use of our new normalization workflow, `SCTransform()`. The method is described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9), with a separate vignette using Seurat [here](https://satijalab.org/seurat/articles/sctransform_vignette). As with `ScaleData()`, the function `SCTransform()` also includes a `vars.to.regress` parameter.
-\ *** # Perform linear dimensional reduction -Next we perform PCA on the scaled data. By default, only the previously determined variable features are used as input, but can be defined using `features` argument if you wish to choose a different subset. +Next we perform PCA on the scaled data. By default, only the previously determined variable features are used as input, but can be defined using `features` argument if you wish to choose a different subset (if you do want to use a custom subset of features, make sure you pass these to `ScaleData` first). + +For the first principal components, Seurat outputs a list of genes with the most positive and negative loadings, representing modules of genes that exhibit either correlation (or anti-correlation) across single-cells in the dataset. ```{r pca,results='hide'} pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) @@ -203,16 +202,16 @@ pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) Seurat provides several useful ways of visualizing both cells and features that define the PCA, including `VizDimReduction()`, `DimPlot()`, and `DimHeatmap()` -```{r pca_viz, message=TRUE} +```{r pca_viz, fig.height=8, fig.width=10, message=TRUE} # Examine and visualize PCA results a few different ways print(pbmc[['pca']], dims = 1:5, nfeatures = 5) VizDimLoadings(pbmc, dims = 1:2, reduction = 'pca') -DimPlot(pbmc, reduction = 'pca') +DimPlot(pbmc, reduction = 'pca') + NoLegend() ``` In particular `DimHeatmap()` allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and features are ordered according to their PCA scores. Setting `cells` to a number plots the 'extreme' cells on both ends of the spectrum, which dramatically speeds plotting for large datasets. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated feature sets. -```{r single-heatmap} +```{r single-heatmap, fig.height=6, fig.width=9} DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) ``` @@ -224,19 +223,8 @@ DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE) To overcome the extensive technical noise in any single feature for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a 'metafeature' that combines information across a correlated feature set. The top principal components therefore represent a robust compression of the dataset. However, how many components should we choose to include? 10? 20? 100? -In [Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8), we implemented a resampling test inspired by the JackStraw procedure. We randomly permute a subset of the data (1% by default) and rerun PCA, constructing a 'null distribution' of feature scores, and repeat this procedure. We identify 'significant' PCs as those who have a strong enrichment of low p-value features. +In [Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8), we implemented a resampling test inspired by the JackStraw procedure. While still available in Seurat [(see previous vignette)](https://satijalab.org/seurat/articles/pbmc3k_tutorial), this is a slow and computationally expensive procedure, and we is no longer routinely used in single cell analysis. -```{r jackstraw, fig.height=6, fig.width=10} -# NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in ElbowPlot() can be used to reduce computation time -pbmc <- JackStraw(pbmc, num.replicate = 100) -pbmc <- ScoreJackStraw(pbmc, dims = 1:20) -``` - -The `JackStrawPlot()` function provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (dashed line). 'Significant' PCs will show a strong enrichment of features with low p-values (solid curve above the dashed line). In this case it appears that there is a sharp drop-off in significance after the first 10-12 PCs. - -```{r jsplots, fig.height=6, fig.width=10} -JackStrawPlot(pbmc, dims = 1:15) -``` An alternative heuristic method generates an 'Elbow plot': a ranking of principle components based on the percentage of variance explained by each one (`ElbowPlot()` function). In this example, we can observe an 'elbow' around PC9-10, suggesting that the majority of true signal is captured in the first 10 PCs. @@ -244,7 +232,7 @@ An alternative heuristic method generates an 'Elbow plot': a ranking of principl ElbowPlot(pbmc) ``` -Identifying the true dimensionality of a dataset -- can be challenging/uncertain for the user. We therefore suggest these three approaches to consider. The first is more supervised, exploring PCs to determine relevant sources of heterogeneity, and could be used in conjunction with GSEA for example. The second implements a statistical test based on a random null model, but is time-consuming for large datasets, and may not return a clear PC cutoff. The third is a heuristic that is commonly used, and can be calculated instantly. In this example, all three approaches yielded similar results, but we might have been justified in choosing anything between PC 7-12 as a cutoff. +Identifying the true dimensionality of a dataset -- can be challenging/uncertain for the user. We therefore suggest these multiple approaches for users. The first is more supervised, exploring PCs to determine relevant sources of heterogeneity, and could be used in conjunction with GSEA for example. The second (`ElbowPlot`) The third is a heuristic that is commonly used, and can be calculated instantly. In this example, we might have been justified in choosing anything between PC 7-12 as a cutoff. We chose 10 here, but encourage users to consider the following: @@ -256,14 +244,14 @@ We chose 10 here, but encourage users to consider the following: # Cluster the cells -Seurat v3 applies a graph-based clustering approach, building upon initial strategies in ([Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8)). Importantly, the *distance metric* which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partitioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [[SNN-Cliq, Xu and Su, Bioinformatics, 2015]](http://bioinformatics.oxfordjournals.org/content/early/2015/02/10/bioinformatics.btv088.abstract) and CyTOF data [[PhenoGraph, Levine *et al*., Cell, 2015]](http://www.ncbi.nlm.nih.gov/pubmed/26095251). Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected 'quasi-cliques' or 'communities'. +Seurat applies a graph-based clustering approach, building upon initial strategies in ([Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8)). Importantly, the *distance metric* which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partitioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [[SNN-Cliq, Xu and Su, Bioinformatics, 2015]](http://bioinformatics.oxfordjournals.org/content/early/2015/02/10/bioinformatics.btv088.abstract) and CyTOF data [[PhenoGraph, Levine *et al*., Cell, 2015]](http://www.ncbi.nlm.nih.gov/pubmed/26095251). Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected 'quasi-cliques' or 'communities'. As in PhenoGraph, we first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the `FindNeighbors()` function, and takes as input the previously defined dimensionality of the dataset (first 10 PCs). To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [[SLM, Blondel *et al*., Journal of Statistical Mechanics]](http://dx.doi.org/10.1088/1742-5468/2008/10/P10008), to iteratively group cells together, with the goal of optimizing the standard modularity function. The `FindClusters()` function implements this procedure, and contains a resolution parameter that sets the 'granularity' of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters can be found using the `Idents()` function. -```{r cluster, fig.height=5, fig.width=7} +```{r cluster} pbmc <- FindNeighbors(pbmc, dims = 1:10) pbmc <- FindClusters(pbmc, resolution = 0.5) @@ -275,14 +263,17 @@ head(Idents(pbmc), 5) # Run non-linear dimensional reduction (UMAP/tSNE) -Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP, to visualize and explore these datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. As input to the UMAP and tSNE, we suggest using the same PCs as input to the clustering analysis. +Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP, to visualize and explore these datasets. The goal of these algorithms is to learn underlying structure in the dataset, in order to place similar cells together in low-dimensional space. Therefore, cells that are grouped together within graph-based clusters determined above should co-localize on these dimension reduction plots. + +While we and others have routinely found 2D visualization techniques like tSNE and UMAP to be valuable tools for exploring datasets, all visualization techniques have limitations, and cannot fully represent the complexity of the underlying data. In particular, these methods aim to preserve local distances in the dataset (i.e. ensuring that cells with very similar gene expression profiles co-localize), but often do not preserve more global relationships. We encourage users to leverage techniques like UMAP for visualization, but to avoid drawing biological conclusions solely on the basis of visualization techniques. + + -```{r tsne, fig.height=5, fig.width=7} -# If you haven't installed UMAP, you can do so via reticulate::py_install(packages = "umap-learn") +```{r umap} pbmc <- RunUMAP(pbmc, dims = 1:10) ``` -```{r tsneplot, fig.height=5, fig.width=7} +```{r umapplot, fig.height=8, fig.width=8} # note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters DimPlot(pbmc, reduction = 'umap') ``` @@ -297,41 +288,49 @@ saveRDS(pbmc, file = "../output/pbmc_tutorial.rds") # Finding differentially expressed features (cluster biomarkers) -Seurat can help you find markers that define clusters via differential expression. By default, it identifies positive and negative markers of a single cluster (specified in `ident.1`), compared to all other cells. `FindAllMarkers()` automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells. +Seurat can help you find markers that define clusters via differential expression (DE). By default, it identifies positive and negative markers of a single cluster (specified in `ident.1`), compared to all other cells. `FindAllMarkers()` automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells. -The `min.pct` argument requires a feature to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a feature to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of features that are unlikely to be highly discriminatory. As another option to speed up these computations, `max.cells.per.ident` can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significant and the most highly differentially expressed features will likely still rise to the top. +In Seurat v5, we use the presto package (as described [here](https://www.biorxiv.org/content/10.1101/653253v1) and available for installation [here](https://github.com/immunogenomics/presto)), to dramatically improve the speed of DE analysis, particularly for large datasets. For users who are not using presto, you can examine the documentation for this function (`?FindMarkers`) to explore the `min.pct` and `logfc.threshold` parameters, which can be increased in order to increase the speed of DE testing. -```{r markers1, fig.height=8, fig.width=15} +```{r markers1} # find all markers of cluster 2 -cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25) +cluster2.markers <- FindMarkers(pbmc, ident.1 = 2) head(cluster2.markers, n = 5) # find all markers distinguishing cluster 5 from clusters 0 and 3 -cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25) +cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3)) head(cluster5.markers, n = 5) # find markers for every cluster compared to all remaining cells, report only the positive ones -pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) -pbmc.markers %>% group_by(cluster) %>% slice_max(n = 2, order_by = avg_log2FC) +pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE) +pbmc.markers %>% + group_by(cluster) %>% + dplyr::filter(avg_log2FC > 1) ``` Seurat has several tests for differential expression which can be set with the test.use parameter (see our [DE vignette](de_vignette.html) for details). For example, the ROC test returns the 'classification power' for any individual marker (ranging from 0 - random, to 1 - perfect). -```{r markersroc, fig.height=8, fig.width=15} +```{r markersroc} cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) ``` We include several tools for visualizing marker expression. `VlnPlot()` (shows expression probability distributions across clusters), and `FeaturePlot()` (visualizes feature expression on a tSNE or PCA plot) are our most commonly used visualizations. We also suggest exploring `RidgePlot()`, `CellScatter()`, and `DotPlot()` as additional methods to view your dataset. -```{r markerplots, fig.height=10, fig.width=15} +```{r markerplots, fig.height=5, fig.width=10} VlnPlot(pbmc, features = c("MS4A1", "CD79A")) # you can plot raw counts as well VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = 'counts', log = TRUE) -FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) ``` +```{r, fig.height=8, fig.width=10} +FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) +``` `DoHeatmap()` generates an expression heatmap for given cells and features. In this case, we are plotting the top 20 markers (or all markers if less than 20) for each cluster. -```{r clusterHeatmap, fig.height=8, fig.width=15} -pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) -> top10 +```{r clusterHeatmap, fig.height=8, fig.width=10} +pbmc.markers %>% + group_by(cluster) %>% + dplyr::filter(avg_log2FC > 1) %>% + slice_head(n = 10) %>% + ungroup() -> top10 DoHeatmap(pbmc, features = top10$gene) + NoLegend() ``` @@ -360,7 +359,7 @@ pbmc <- RenameIdents(pbmc, new.cluster.ids) DimPlot(pbmc, reduction = 'umap', label = TRUE, pt.size = 0.5) + NoLegend() ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} library(ggplot2) plot <- DimPlot(pbmc, reduction = "umap", label = TRUE, label.size = 4.5) + xlab("UMAP 1") + ylab("UMAP 2") + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + @@ -372,10 +371,6 @@ ggsave(filename = "../output/images/pbmc3k_umap.jpg", height = 7, width = 12, pl saveRDS(pbmc, file = "../output/pbmc3k_final.rds") ``` -```{r save2, include = FALSE} -saveRDS(pbmc, file = "../data/pbmc3k_final.rds") -``` - ```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/pbmc3k_tutorial_times.csv") ``` diff --git a/vignettes/sctransform_v2_vignette.Rmd b/vignettes/sctransform_v2_vignette.Rmd index 42ecd064d..f4cf3d04e 100644 --- a/vignettes/sctransform_v2_vignette.Rmd +++ b/vignettes/sctransform_v2_vignette.Rmd @@ -26,16 +26,17 @@ knitr::opts_chunk$set( fig.width = 10, message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` ## TL;DR -We recently introduced [sctransform](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) to perform normalization and variance stabilization of scRNA-seq datasets. We now release an updated version ('v2'), based on [our broad analysis](https://www.biorxiv.org/content/10.1101/2021.07.07.451498v1) of 59 scRNA-seq datasets spanning a range of technologies, systems, and sequencing depths. This update improves speed and memory consumption, the stability of parameter estimates, the identification of variable features, and the the ability to perform downstream differential expression analyses. +We recently introduced [sctransform](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) to perform normalization and variance stabilization of scRNA-seq datasets. We now release an updated version ('v2'), based on [our broad analysis](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9) of 59 scRNA-seq datasets spanning a range of technologies, systems, and sequencing depths. This update improves speed and memory consumption, the stability of parameter estimates, the identification of variable features, and the the ability to perform downstream differential expression analyses. -Users can install sctransform v2 from CRAN (sctransform v0.3.3) and invoke the use of the updated method via the `vst.flavor` argument. +Users can install sctransform v2 from CRAN (sctransform v0.3.3+) and invoke the use of the updated method via the `vst.flavor` argument. ```{r tldr, eval=FALSE} # install sctransform >= 0.3.3 @@ -56,7 +57,7 @@ In this vignette, we use [sctransform v2](https://github.com/satijalab/sctransfo ## Install sctransform -We will install sctransform v2 from CRAN (v0.3.3). We will also install the [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) package which substantially improves the speed of the learning procedure. +We will install sctransform v2 from CRAN. We will also install the [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) package which substantially improves the speed of the learning procedure. ```{r results='hide', message=FALSE, warning=FALSE} # install glmGamPoi @@ -85,7 +86,7 @@ InstallData("ifnb") ```{r init, results='hide', message=FALSE, fig.keep='none'} # load dataset -LoadData("ifnb") +ifnb <- LoadData("ifnb") # split the dataset into a list of two seurat objects (stim and CTRL) ifnb.list <- SplitObject(ifnb, split.by = "stim") diff --git a/vignettes/sctransform_vignette.Rmd b/vignettes/sctransform_vignette.Rmd index b132586ec..8b9230432 100644 --- a/vignettes/sctransform_vignette.Rmd +++ b/vignettes/sctransform_vignette.Rmd @@ -1,6 +1,6 @@ --- title: "Using sctransform in Seurat" -author: Christoph Hafemeister & Rahul Satija +author: Saket Choudhary, Christoph Hafemeister & Rahul Satija output: html_document: theme: united @@ -26,16 +26,18 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` Biological heterogeneity in single-cell RNA-seq data is often confounded by technical factors including sequencing depth. The number of molecules detected in each cell can vary significantly between cells, even within the same celltype. Interpretation of scRNA-seq data requires effective pre-processing and normalization to remove this technical variability. -In [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) we introduce a modeling framework for the normalization and variance stabilization of molecular count data from scRNA-seq experiment. -This procedure omits the need for heuristic steps including pseudocount addition or log-transformation and improves common downstream analytical tasks such as variable gene selection, dimensional reduction, and differential expression. -In this vignette, we demonstrate how using [sctransform](https://github.com/ChristophH/sctransform/) based normalization enables recovering sharper biological distinction compared to log-normalization. +In [our manuscript](https://genomebiology-biomedcentral-com/articles/10.1186/s13059-021-02584-9) we introduce a modeling framework for the normalization and variance stabilization of molecular count data from scRNA-seq experiments. This procedure omits the need for heuristic steps including pseudocount addition or log-transformation and improves common downstream analytical tasks such as variable gene selection, dimensional reduction, and differential expression. We named this method `sctransform`. + +Inspired by important and rigorous work from [Lause et al](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02451-7), we released an [updated manuscript](https://link.springer.com/article/10.1186/s13059-021-02584-9) and updated the sctransform software to a v2 version, which is now the default in Seurat v5. + ```{r packages} library(Seurat) @@ -43,18 +45,20 @@ library(ggplot2) library(sctransform) ``` -Load data and create Seurat object +# Load data and create Seurat object ```{r load_data, warning=FALSE, message=FALSE} -pbmc_data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc_data <- Read10X(data.dir = "/brahms/shared/vignette-data/pbmc3k/filtered_gene_bc_matrices/hg19/") pbmc <- CreateSeuratObject(counts = pbmc_data) ``` -Apply sctransform normalization +# Apply sctransform normalization * Note that this single command replaces `NormalizeData()`, `ScaleData()`, and `FindVariableFeatures()`. * Transformed data will be available in the SCT assay, which is set as the default after running sctransform * During normalization, we can also remove confounding sources of variation, for example, mitochondrial mapping percentage + * In Seurat v5, SCT v2 is applied by default. You can revert to v1 by setting `vst.flavor = 'v1'` + * The [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) package substantially improves speed and is used by default if installed, with instructions [here](install.html) ```{r apply_sct, warning=FALSE, message=FALSE} # store mitochondrial percentage in object meta data @@ -64,19 +68,8 @@ pbmc <- PercentageFeatureSet(pbmc, pattern = "^MT-", col.name = 'percent.mt') pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) ``` -The latest version of `sctransform` also supports using [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) -package which substantially improves the speed of the learning procedure. It can be invoked by specifying -`method="glmGamPoi"`. - -```{r eval=FALSE} -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - -BiocManager::install("glmGamPoi") -pbmc <- SCTransform(pbmc, method="glmGamPoi", vars.to.regress = "percent.mt", verbose = FALSE) -``` +# Perform dimensionality reduction by PCA and UMAP embedding -Perform dimensionality reduction by PCA and UMAP embedding ```{r pca, fig.width=5, fig.height=5} # These are now standard steps in the Seurat workflow for visualization and clustering pbmc <- RunPCA(pbmc, verbose = FALSE) @@ -84,7 +77,7 @@ pbmc <- RunUMAP(pbmc, dims = 1:30, verbose = FALSE) pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = FALSE) pbmc <- FindClusters(pbmc, verbose = FALSE) -DimPlot(pbmc, label = TRUE) + NoLegend() +DimPlot(pbmc, label = TRUE) ```
@@ -109,28 +102,24 @@ pbmc <- CreateSeuratObject(pbmc_data) %>% PercentageFeatureSet(pattern = "^MT-",
**Where are normalized values stored for sctransform?** -As described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), sctransform calculates a model of technical noise in scRNA-seq data using 'regularized negative binomial regression'. The residuals for this model are normalized values, and can be positive or negative. Positive residuals for a given gene in a given cell indicate that we observed more UMIs than expected given the gene’s average expression in the population and cellular sequencing depth, while negative residuals indicate the converse. The results of sctransfrom are stored in the "SCT" assay. You can learn more about multi-assay data and commands in Seurat in our [vignette](multimodal_vignette.html), [command cheat sheet](essential_commands.html#multi-assay-features), or [developer guide](https://github.com/satijalab/seurat/wiki/Assay). -* `pbmc[["SCT"]]@scale.data` contains the residuals (normalized values), and is used directly as input to PCA. Please note that this matrix is non-sparse, and can therefore take up a lot of memory if stored for all genes. To save memory, we store these values only for variable genes, by setting the return.only.var.genes = TRUE by default in the `SCTransform()` function call. -* To assist with visualization and interpretation. we also convert Pearson residuals back to ‘corrected’ UMI counts. You can interpret these as the UMI counts we would expect to observe if all cells were sequenced to the same depth. If you want to see exactly how we do this, please look at the correct function [here](https://github.com/ChristophH/sctransform/blob/master/R/denoise.R). -* The 'corrected' UMI counts are stored in `pbmc[["SCT"]]@counts`. We store log-normalized versions of these corrected counts in `pbmc[["SCT"]]@data`, which are very helpful for visualization. -* You can use the corrected log-normalized counts for differential expression and integration. However, in principle, it would be most optimal to perform these calculations directly on the residuals (stored in the `scale.data` slot) themselves. This is not currently supported in Seurat v3, but will be soon. +* `pbmc[["SCT"]]$scale.data` contains the residuals (normalized values), and is used directly as input to PCA. Please note that this matrix is non-sparse, and can therefore take up a lot of memory if stored for all genes. To save memory, we store these values only for variable genes, by setting the return.only.var.genes = TRUE by default in the `SCTransform()` function call. +* To assist with visualization and interpretation, we also convert Pearson residuals back to ‘corrected’ UMI counts. You can interpret these as the UMI counts we would expect to observe if all cells were sequenced to the same depth. If you want to see exactly how we do this, please look at the correct function [here](https://github.com/ChristophH/sctransform/blob/master/R/denoise.R). +* The 'corrected' UMI counts are stored in `pbmc[["SCT"]]$counts`. We store log-normalized versions of these corrected counts in `pbmc[["SCT"]]$data`, which are very helpful for visualization. ------
\ - Users can individually annotate clusters based on canonical markers. However, the sctransform normalization reveals sharper biological distinctions compared to the [standard Seurat workflow](pbmc3k_tutorial.html), in a few ways: - * Clear separation of at least 3 CD8 T cell populations (naive, memory, effector), based on CD8A, GZMK, CCL5, GZMK expression + * Clear separation of at least 3 CD8 T cell populations (naive, memory, effector), based on CD8A, GZMK, CCL5, CCR7 expression * Clear separation of three CD4 T cell populations (naive, memory, IFN-activated) based on S100A4, CCR7, IL32, and ISG15 * Additional developmental sub-structure in B cell cluster, based on TCL1A, FCER2 * Additional separation of NK cells into CD56dim vs. bright clusters, based on XCL1 and FCGR3A - ```{r fplot, fig.width = 10, fig.height=6} # These are now standard steps in the Seurat workflow for visualization and clustering # Visualize canonical marker genes as violin plots. @@ -141,7 +130,6 @@ FeaturePlot(pbmc, features = c("CD8A", "GZMK", "CCL5", "S100A4", "ANXA1", "CCR7" FeaturePlot(pbmc, features = c("CD3D", "ISG15", "TCL1A", "FCER2", "XCL1", "FCGR3A"), pt.size = 0.2, ncol = 3) ``` - ```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/sctransform_vignette_times.csv") ``` diff --git a/vignettes/seurat5_archive.Rmd b/vignettes/seurat5_archive.Rmd new file mode 100644 index 000000000..00f8321d5 --- /dev/null +++ b/vignettes/seurat5_archive.Rmd @@ -0,0 +1,41 @@ +--- +title: "Documentation Archive" +output: + html_document: + theme: united + df_print: kable +--- + +```{r helper, include = FALSE} +make_vignette_button <- function(name, title, version) { + url <- paste0("../archive/", version, "/", name, ".html") + paste0('', version, '') +} +``` + +```{r yaml, include = FALSE} +library(yaml) +vdat <- read_yaml(file = "archive.yaml") +``` +In version 4, the Seurat documentation was transitioned to pkgdown. Here we provide access to all previous versions of the documentation. + +# Version 2-3 tutorials + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +for (i in 1:length(x = vdat$vignettes)) { + vignette <- vdat$vignettes[[i]] + cat('

', vignette$title, '

') + for(j in 1:length(x = vignette$versions)) { + cat(make_vignette_button(name = vignette$name, title = vignette$title, version = vignette$version[j])) + } +} +``` + +# Version 1.3-1.4 tutorials + +For versions 1.3-1.4, we provide access to the old documentation pages [here](../archive/v1.4/get_started_v1_4.html) + +# Version <=1.2 tutorials + +For versions <=1.2, we provide access to the old documentation pages [here](../archive/v1.2/get_started_v1_2.html) + diff --git a/vignettes/seurat5_atacseq_integration_vignette.Rmd b/vignettes/seurat5_atacseq_integration_vignette.Rmd new file mode 100644 index 000000000..7da517636 --- /dev/null +++ b/vignettes/seurat5_atacseq_integration_vignette.Rmd @@ -0,0 +1,231 @@ +--- +title: "Integrating scRNA-seq and scATAC-seq data" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r markdown.setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + fig.width = 12, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +options(SeuratData.repo.use = 'seurat.nygenome.org') +``` + +Single-cell transcriptomics has transformed our ability to characterize cell states, but deep biological understanding requires more than a taxonomic listing of clusters. As new methods arise to measure distinct cellular modalities, a key analytical challenge is to integrate these datasets to better understand cellular identity and function. For example, users may perform scRNA-seq and scATAC-seq experiments on the same biological system and to consistently annotate both datasets with the same set of cell type labels. This analysis is particularly challenging as scATAC-seq datasets are difficult to annotate, due to both the sparsity of genomic data collected at single-cell resolution, and the lack of interpretable gene markers in scRNA-seq data. + +In [Stuart\*, Butler\* et al, 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8), we introduce methods to integrate scRNA-seq and scATAC-seq datasets collected from the same biological system, and demonstrate these methods in this vignette. In particular, we demonstrate the following analyses: + +* How to use an annotated scRNA-seq dataset to label cells from an scATAC-seq experiment +* How to co-visualize (co-embed) cells from scRNA-seq and scATAC-seq +* How to project scATAC-seq cells onto a UMAP derived from an scRNA-seq experiment + +This vignette makes extensive use of the [Signac package](https://stuartlab.org/signac/), recently developed for the analysis of chromatin datasets collected at single-cell resolution, including scATAC-seq. Please see the Signac website for additional [vignettes](https://stuartlab.org/signac/articles/pbmc_vignette.html) and documentation for analyzing scATAC-seq data. + +We demonstrate these methods using a publicly available ~12,000 human PBMC 'multiome' dataset from 10x Genomics. In this dataset, scRNA-seq and scATAC-seq profiles were simultaneously collected in the same cells. For the purposes of this vignette, we treat the datasets as originating from two different experiments and integrate them together. Since they were originally measured in the same cells, this provides a ground truth that we can use to assess the accuracy of integration. We emphasize that our use of the multiome dataset here is for demonstration and evaluation purposes, and that users should apply these methods to scRNA-seq and scATAC-seq datasets that are collected separately. We provide a separate [weighted nearest neighbors vignette (WNN)](weighted_nearest_neighbor_analysis.html) that describes analysis strategies for multi-omic single-cell data. + +# Load in data and process each modality individually + +The PBMC multiome dataset is available from [10x genomics](https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets/1.0.0/pbmc_granulocyte_sorted_10k). To facilitate easy loading and exploration, it is also available as part of our SeuratData package. We load the RNA and ATAC data in separately, and pretend that these profiles were measured in separate experiments. We annotated these cells in our [WNN](weighted_nearest_neighbor_analysis.html) vignette, and the annotations are also included in SeuratData. + +```{r installdata} +library(SeuratData) +# install the dataset and load requirements +InstallData('pbmcMultiome') +``` + +```{r loadpkgs} +library(Seurat) +library(Signac) +library(EnsDb.Hsapiens.v86) +library(ggplot2) +library(cowplot) +``` + +```{r load_data} +# load both modalities +pbmc.rna <- LoadData("pbmcMultiome", "pbmc.rna") +pbmc.atac <- LoadData("pbmcMultiome", "pbmc.atac") + +pbmc.rna[['RNA']] <- as(pbmc.rna[['RNA']], Class = 'Assay5') +# repeat QC steps performed in the WNN vignette +pbmc.rna <- subset(pbmc.rna, seurat_annotations != 'filtered') +pbmc.atac <- subset(pbmc.atac, seurat_annotations != 'filtered') + +# Perform standard analysis of each modality independently +# RNA analysis +pbmc.rna <- NormalizeData(pbmc.rna) +pbmc.rna <- FindVariableFeatures(pbmc.rna) +pbmc.rna <- ScaleData(pbmc.rna) +pbmc.rna <- RunPCA(pbmc.rna) +pbmc.rna <- RunUMAP(pbmc.rna, dims = 1:30) + +# ATAC analysis +# add gene annotation information +annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) +seqlevelsStyle(annotations) <- 'UCSC' +genome(annotations) <- "hg38" +Annotation(pbmc.atac) <- annotations + +# We exclude the first dimension as this is typically correlated with sequencing depth +pbmc.atac <- RunTFIDF(pbmc.atac) +pbmc.atac <- FindTopFeatures(pbmc.atac, min.cutoff = 'q0') +pbmc.atac <- RunSVD(pbmc.atac) +pbmc.atac <- RunUMAP(pbmc.atac, reduction = 'lsi', dims = 2:30, reduction.name = "umap.atac", reduction.key = "atacUMAP_") +``` + +Now we plot the results from both modalities. Cells have been previously annotated based on transcriptomic state. We will predict annotations for the scATAC-seq cells. + +```{r viz1} +p1 <- DimPlot(pbmc.rna, group.by = 'seurat_annotations', label = TRUE) + NoLegend() + ggtitle("RNA") +p2 <- DimPlot(pbmc.atac, group.by = 'orig.ident', label = FALSE) + NoLegend() + ggtitle("ATAC") +p1 + p2 +``` + +```{r save.img, include = TRUE} +plot <- (p1 + p2) & + xlab("UMAP 1") & ylab("UMAP 2") & + theme(axis.title = element_text(size = 18)) +ggsave(filename = "../output/images/atacseq_integration_vignette.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +# Identifying anchors between scRNA-seq and scATAC-seq datasets + +In order to identify 'anchors' between scRNA-seq and scATAC-seq experiments, we first generate a rough estimate of the transcriptional activity of each gene by quantifying ATAC-seq counts in the 2 kb-upstream region and gene body, using the `GeneActivity()` function in the Signac package. The ensuing gene activity scores from the scATAC-seq data are then used as input for canonical correlation analysis, along with the gene expression quantifications from scRNA-seq. We perform this quantification for all genes identified as being highly variable from the scRNA-seq dataset. +```{r gene.activity} +# quantify gene activity +gene.activities <- GeneActivity(pbmc.atac, features = VariableFeatures(pbmc.rna)) + +# add gene activities as a new assay +pbmc.atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities) + +# normalize gene activities +DefaultAssay(pbmc.atac) <- "ACTIVITY" +pbmc.atac <- NormalizeData(pbmc.atac) +pbmc.atac <- ScaleData(pbmc.atac, features = rownames(pbmc.atac)) +``` + +```{r label.xfer} +# Identify anchors +transfer.anchors <- FindTransferAnchors( + reference = pbmc.rna, + query = pbmc.atac, + features = VariableFeatures(object = pbmc.rna), + reference.assay = 'RNA', + query.assay = 'ACTIVITY', + reduction = 'cca' +) +``` + +# Annotate scATAC-seq cells via label transfer + +After identifying anchors, we can transfer annotations from the scRNA-seq dataset onto the scATAC-seq cells. The annotations are stored in the `seurat_annotations` field, and are provided as input to the `refdata` parameter. The output will contain a matrix with predictions and confidence scores for each ATAC-seq cell. + +```{r transfer.data} +celltype.predictions <- TransferData( + anchorset = transfer.anchors, + refdata = pbmc.rna$seurat_annotations, + weight.reduction = pbmc.atac[['lsi']], + dims = 2:30 +) + +pbmc.atac <- AddMetaData(pbmc.atac, metadata = celltype.predictions) +``` + +
+ **Why do you choose different (non-default) values for reduction and weight.reduction?** + +In `FindTransferAnchors()`, we typically project the PCA structure from the reference onto the query when transferring between scRNA-seq datasets. However, when transferring across modalities we find that CCA better captures the shared feature correlation structure and therefore set `reduction = 'cca'` here. Additionally, by default in `TransferData()` we use the same projected PCA structure to compute the weights of the local neighborhood of anchors that influence each cell's prediction. In the case of scRNA-seq to scATAC-seq transfer, we use the low dimensional space learned by computing an LSI on the ATAC-seq data to compute these weights as this better captures the internal structure of the ATAC-seq data. + +
+\ + +After performing transfer, the ATAC-seq cells have predicted annotations (transferred from the scRNA-seq dataset) stored in the `predicted.id` field. Since these cells were measured with the multiome kit, we also have a ground-truth annotation that can be used for evaluation. You can see that the predicted and actual annotations are extremely similar. + + +```{r viz.label.accuracy} +pbmc.atac$annotation_correct <- pbmc.atac$predicted.id == pbmc.atac$seurat_annotations +p1 <- DimPlot(pbmc.atac, group.by = 'predicted.id', label = TRUE) + NoLegend() + ggtitle("Predicted annotation") +p2 <- DimPlot(pbmc.atac, group.by = 'seurat_annotations', label = TRUE) + NoLegend() + ggtitle("Ground-truth annotation") +p1 | p2 +``` + +In this example, the annotation for an scATAC-seq profile is correctly predicted via scRNA-seq integration ~90% of the time. In addition, the `prediction.score.max` field quantifies the uncertainty associated with our predicted annotations. We can see that cells that are correctly annotated are typically associated with high prediction scores (>90%), while cells that are incorrectly annotated are associated with sharply lower prediction scores (<50%). Incorrect assignments also tend to reflect closely related cell types (i.e. Intermediate vs. Naive B cells). + +```{r score.viz, fig.height = 5} +predictions <- table(pbmc.atac$seurat_annotations, pbmc.atac$predicted.id) +predictions <- predictions / rowSums(predictions) # normalize for number of cells in each cell type +predictions <- as.data.frame(predictions) +p1 <- ggplot(predictions, aes(Var1, Var2, fill = Freq)) + + geom_tile() + + scale_fill_gradient(name = "Fraction of cells", low = "#ffffc8", high = "#7d0025") + + xlab("Cell type annotation (RNA)") + + ylab("Predicted cell type label (ATAC)") + + theme_cowplot() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + +correct <- length(which(pbmc.atac$seurat_annotations == pbmc.atac$predicted.id)) +incorrect <- length(which(pbmc.atac$seurat_annotations != pbmc.atac$predicted.id)) +data <- FetchData(pbmc.atac, vars = c("prediction.score.max", "annotation_correct")) +p2 <- ggplot(data, aes(prediction.score.max, fill = annotation_correct, colour = annotation_correct)) + geom_density(alpha = 0.5) + theme_cowplot() + scale_fill_discrete(name = "Annotation Correct", labels = c(paste0("FALSE (n = ", incorrect, ")"), paste0("TRUE (n = ", correct, ")"))) + scale_color_discrete(name = "Annotation Correct", labels = c(paste0("FALSE (n = ", incorrect, ")"), paste0("TRUE (n = ", correct, ")"))) + xlab("Prediction Score") +p1 + p2 +``` + +# Co-embedding scRNA-seq and scATAC-seq datasets + +In addition to transferring labels across modalities, it is also possible to visualize scRNA-seq and scATAC-seq cells on the same plot. We emphasize that this step is primarily for visualization, and is an optional step. Typically, when we perform integrative analysis between scRNA-seq and scATAC-seq datasets, we focus primarily on label transfer as described above. We demonstrate our workflows for co-embedding below, and again highlight that this is for demonstration purposes, especially as in this particular case both the scRNA-seq profiles and scATAC-seq profiles were actually measured in the same cells. + +In order to perform co-embedding, we first 'impute' RNA expression into the scATAC-seq cells based on the previously computed anchors, and then merge the datasets. + +```{r coembed} +# note that we restrict the imputation to variable genes from scRNA-seq, but could impute the +# full transcriptome if we wanted to +genes.use <- VariableFeatures(pbmc.rna) +refdata <- GetAssayData(pbmc.rna, assay = "RNA", slot = "data")[genes.use, ] + +# refdata (input) contains a scRNA-seq expression matrix for the scRNA-seq cells. imputation +# (output) will contain an imputed scRNA-seq matrix for each of the ATAC cells +imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = pbmc.atac[["lsi"]], dims = 2:30) +pbmc.atac[["RNA"]] <- imputation + +coembed <- merge(x = pbmc.rna, y = pbmc.atac) + +# Finally, we run PCA and UMAP on this combined object, to visualize the co-embedding of both +# datasets +coembed <- ScaleData(coembed, features = genes.use, do.scale = FALSE) +coembed <- RunPCA(coembed, features = genes.use, verbose = FALSE) +coembed <- RunUMAP(coembed, dims = 1:30) + +DimPlot(coembed, group.by = c("orig.ident","seurat_annotations")) +``` + +```{r save.times, include = FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_atacseq_integration_vignette.csv") +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/seurat5_bpcells_interaction_vignette.Rmd b/vignettes/seurat5_bpcells_interaction_vignette.Rmd new file mode 100644 index 000000000..51e0004a7 --- /dev/null +++ b/vignettes/seurat5_bpcells_interaction_vignette.Rmd @@ -0,0 +1,212 @@ +--- +title: "Using BPCells with Seurat Objects" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- +*** +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = 'styler', + warning = FALSE, + error = TRUE, + message = FALSE, + fig.width = 8, + time_it = TRUE, + cache = TRUE +) +``` + +BPCells is an [R package](https://github.com/bnprks/BPCells) that allows for computationally efficient single-cell analysis. It utilizes bit-packing compression to store counts matrices on disk and C++ code to cache operations. + +We leverage the high performance capabilities of BPCells to work with Seurat objects in memory while accessing the counts on disk. In this vignette, we show how to use BPCells to load data, work with a Seurat objects in a more memory-efficient way, and write out Seurat objects with BPCells matrices. + +We will show the methods for interacting with both a single dataset in one file or multiple datasets across multiple files using BPCells. BPCells allows us to easily analyze these large datasets in memory, and we encourage users to check out some of our other vignettes [here]() and [here]() to see further applications. + +```{r, eval=FALSE} +devtools::install_github("bnprks/BPCells") +``` +```{r install, message = FALSE, warning = FALSE} +library(BPCells) +library(Seurat) +library(SeuratObject) +library(SeuratDisk) +library(Azimuth) +``` + +We use BPCells functionality to both load in our data and write the counts layers to bitpacked compressed binary files on disk to improve computation speeds. BPCells has multiple functions for reading in files. + +# Load Data + +## Load Data from one h5 file +In this section, we will load a dataset of mouse brain cells freely available from 10x Genomics. This includes 1.3 Million single cells that were sequenced on the Illumina NovaSeq 6000. The raw data can be found [here](https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons?). + +To read in the file, we will use open_matrix_10x_hdf5, a BPCells function written to read in feature matrices from 10x. We then write a matrix directory, load the matrix, and create a Seurat object. + +```{r} +brain.data <- open_matrix_10x_hdf5( + path = "/brahms/hartmana/vignette_data/1M_neurons_filtered_gene_bc_matrices_h5.h5") +# Write the matrix to a directory +write_matrix_dir( + mat = brain.data, + dir = '/brahms/hartmana/vignette_data/bpcells/brain_counts') +# Now that we have the matrix on disk, we can load it +brain.mat <- open_matrix_dir(dir = "/brahms/hartmana/vignette_data/bpcells/brain_counts") +brain.mat <- Azimuth:::ConvertEnsembleToSymbol(mat = brain.mat, species = "mouse") + +# Create Seurat Object +brain <- CreateSeuratObject(counts = brain.mat) +``` + +
+ **What if I already have a Seurat Object?** + +You can use BPCells to convert the matrices in your already created Seurat objects to on-disk matrices. Note, that this is only possible for V5 assays. As an example, if you'd like to convert the counts matrix of your RNA assay to a BPCells matrix, you can use the following: + +```{r, message=FALSE, warning=FALSE, eval=FALSE} +obj <- readRDS("/path/to/reference.rds") + +# Write the counts layer to a directory +write_matrix_dir(mat = obj[["RNA"]]$counts, dir = '/brahms/hartmana/vignette_data/bpcells/brain_counts') +counts.mat <- open_matrix_dir(dir = "/brahms/hartmana/vignette_data/bpcells/brain_counts") + +obj[["RNA"]]$counts <- counts.mat +``` + +
+ +### Example Analsyis + +Once this conversion is done, you can perform typical Seurat functions on the object. For example, we can normalize data and visualize features by automatically accessing the on-disk counts. + +```{r} +VlnPlot(brain, features = c("Sox10", "Slc17a7", "Aif1"), ncol = 3, layer = "counts", alpha = 0.1) + +# We then normalize and visualize again +brain <- NormalizeData(brain, normalization.method = "LogNormalize") +VlnPlot(brain, features = c("Sox10", "Slc17a7", "Aif1"), ncol = 3, layer = "data", alpha = 0.1) +``` + +### Saving Seurat objects with on-disk layers + +If you save your object and load it in in the future, Seurat will access the on-disk matrices by their path, which is stored in the assay level data. To make it easy to ensure these are saved in the same place, we provide new functionality to the SaveSeuratRds function. In this function, you specify your filename. The pointer to the path in the Seurat object will change to the current directory. + +This also makes it easy to share your Seurat objects with BPCells matrices by sharing a folder that contains both the object and the BPCells directory. + +```{r} +SaveSeuratRds( + object = brain, + file = "obj.Rds") +``` + + +If needed, a layer with an on-disk matrix can be converted to an in-memory matrix using the `as()` function. For the purposes of this demo, we'll subset the object so that it takes up less space in memory. +```{r} +brain <- subset(brain, downsample = 1000) +brain[["RNA"]]$counts <- as(object = brain[["RNA"]]$counts, Class = "dgCMatrix") +``` + + +## Load data from multiple h5ad files + +You can also download data from multiple matrices. In this section, we create a Seurat object using multiple peripheral blood mononuclear cell (PBMC) samples that are freely available for downlaod from CZI [here](https://cellxgene.cziscience.com/collections). We download data from [Ahern et al. (2022) Nature](https://cellxgene.cziscience.com/collections/8f126edf-5405-4731-8374-b5ce11f53e82), [Jin et al. (2021) Science](https://cellxgene.cziscience.com/collections/b9fc3d70-5a72-4479-a046-c2cc1ab19efc), and [Yoshida et al. (2022) Nature](https://cellxgene.cziscience.com/collections/03f821b4-87be-4ff4-b65a-b5fc00061da7). We use the BPCells function to read h5ad files. + + +```{r, warning=FALSE} +file.dir <- "/brahms/hartmana/vignette_data/h5ad_files/" +files.set <- c("ahern_pbmc.h5ad", "jin_pbmc.h5ad", "yoshida_pbmc.h5ad") + +# Loop through h5ad files and output BPCells matrices on-disk +data.list <- c() +metadata.list <- c() + +for (i in 1:length(files.set)) { + path <- paste0(file.dir, files.set[i]) + data <- open_matrix_anndata_hdf5(path) + write_matrix_dir( + mat = data, + dir = paste0(gsub(".h5ad", "", path), "_BP"), + overwrite = TRUE + ) + # Load in BP matrices + mat <- open_matrix_dir(dir = paste0(gsub(".h5ad", "", path), "_BP")) + mat <- Azimuth:::ConvertEnsembleToSymbol(mat = mat, species = "human") + # Get metadata + metadata.list[[i]] <- LoadH5ADobs(path = path) + data.list[[i]] <- mat +} +# Name layers +names(data.list) <- c("ahern", "jin", "yoshida") + +# Add Metadata +for (i in 1:length(metadata.list)){ + metadata.list[[i]]$publication <- names(data.list)[i] +} +metadata.list <- lapply(metadata.list, function(x) { + x <- x[, c("publication", "sex", "cell_type", "donor_id", "disease")] + return(x) +}) +metadata <- Reduce(rbind, metadata.list) +``` + +When we create the Seurat object with the list of matrices from each publication, we can then see that multiple counts layers exist that represent each dataset. This object contains over a million cells, yet only takes up minimal space in memory! + +```{r} +merged.object <- CreateSeuratObject(counts = data.list, meta.data = metadata) +merged.object +``` + +```{r save_merged, eval=FALSE} +SaveSeuratRds( + object = merged.object, + file = "obj.Rds") +``` + +## Parse Biosciences + +Here, we show how to load a 1 million cell data set from Parse Biosciences and create a Seurat Object. The data is available for download [here](https://support.parsebiosciences.com/hc/en-us/articles/7704577188500-How-to-analyze-a-1-million-cell-data-set-using-Scanpy-and-Harmony) + +```{r} +parse.data <- open_matrix_anndata_hdf5( + "/brahms/hartmana/vignette_data/h5ad_files/ParseBio_PBMC.h5ad") +``` + +```{r, eval=FALSE} +write_matrix_dir(mat = parse.data, dir = "/brahms/hartmana/vignette_data/bpcells/parse_1m_pbmc") +``` + +```{r} +parse.mat <- open_matrix_dir(dir = "/brahms/hartmana/vignette_data/bpcells/parse_1m_pbmc") +metadata <- readRDS("/brahms/hartmana/vignette_data/ParseBio_PBMC_meta.rds") +metadata$disease <- sapply(strsplit(x = metadata$sample, split = "_"), "[", 1) +parse.object <- CreateSeuratObject(counts = parse.mat, meta.data = metadata) +``` + +```{r save_parse, eval=FALSE} +SaveSeuratRds( + object = parse.object, + file = "obj.Rds") +``` + + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/seurat5_cell_cycle_vignette.Rmd b/vignettes/seurat5_cell_cycle_vignette.Rmd new file mode 100644 index 000000000..59e6a7fc8 --- /dev/null +++ b/vignettes/seurat5_cell_cycle_vignette.Rmd @@ -0,0 +1,150 @@ +--- +title: "Cell-Cycle Scoring and Regression" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + warning = FALSE, + error = TRUE, + message = FALSE, + fig.width = 8, + time_it = TRUE +) +``` + +We demonstrate how to mitigate the effects of cell cycle heterogeneity in scRNA-seq data by calculating cell cycle phase scores based on canonical markers, and regressing these out of the data during pre-processing. We demonstrate this on a dataset of murine hematopoietic progenitors ([Nestorowa *et al*., Blood 2016](http://www.bloodjournal.org/content/early/2016/06/30/blood-2016-05-716480?sso-checked=true)).You can download the files needed to run this vignette [here](https://www.dropbox.com/s/3dby3bjsaf5arrw/cell_cycle_vignette_files.zip?dl=1). + +```{r initialize_object, results='hide'} +library(Seurat) +options(Seurat.object.assay.version = "v5") + +# Read in the expression matrix +# The first row is a header row, the first column is rownames +exp.mat <- read.table(file = "../data/nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE, as.is = TRUE, row.names = 1) + +# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. +# We can segregate this list into markers of G2/M phase and markers of S phase +s.genes <- cc.genes$s.genes +g2m.genes <- cc.genes$g2m.genes + +# Create our Seurat object and complete the initalization steps +marrow <- CreateSeuratObject(counts = as.matrix(exp.mat)) +marrow <- NormalizeData(marrow) +marrow <- FindVariableFeatures(marrow, selection.method = 'vst') +marrow <- ScaleData(marrow, features = rownames(marrow)) +``` + +If we run a PCA on our object, using the variable genes we found in `FindVariableFeatures()` above, we see that while most of the variance can be explained by lineage, PC8 and PC10 are split on cell-cycle genes including *TOP2A* and *MKI67*. We will attempt to regress this signal from the data, so that cell-cycle heterogeneity does not contribute to PCA or downstream analysis. + +```{r justification, message=TRUE} +marrow <- RunPCA(marrow, features = VariableFeatures(marrow, layer = 'counts'), ndims.print = 6:10, + nfeatures.print = 10) +DimHeatmap(marrow, dims = c(8, 10)) +``` + +# Assign Cell-Cycle Scores + +First, we assign each cell a score, based on its expression of G2/M and S phase markers. These marker sets should be anticorrelated in their expression levels, and cells expressing neither are likely not cycling and in G1 phase. + +We assign scores in the `CellCycleScoring()` function, which stores S and G2/M scores in object meta data, along with the predicted classification of each cell in either G2M, S or G1 phase. `CellCycleScoring()` can also set the identity of the Seurat object to the cell-cycle phase by passing `set.ident = TRUE` (the original identities are stored as `old.ident`). Please note that Seurat does not use the discrete classifications (G2M/G1/S) in downstream cell cycle regression. Instead, it uses the quantitative scores for G2M and S phase. However, we provide our predicted classifications in case they are of interest. + +```{r cc_score} +marrow <- CellCycleScoring(marrow, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE) + +#view cell cycle scores and phase assignments +head(marrow[[]]) + +#Visualize the distribution of cell cycle markers across +RidgePlot(marrow, features = c("PCNA","TOP2A","MCM6","MKI67"), ncol = 2) + +#Running a PCA on cell cycle genes reveals, unsurprisingly, that cells separate entirely by phase +marrow <- RunPCA(marrow, features = c(s.genes, g2m.genes)) +DimPlot(marrow) +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- DimPlot(marrow) + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + + guides(colour = guide_legend(override.aes = list(size = 10))) +ggsave(filename = "../output/images/cell_cycle_vignette.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +We score single cells based on the scoring strategy described in [Tirosh *et al*. 2016](http://science.sciencemag.org/content/352/6282/189). See `?AddModuleScore()` in Seurat for more information, this function can be used to calculate supervised module scores for any gene list. + +# Regress out cell cycle scores during data scaling + +We now attempt to subtract ('regress out') this source of heterogeneity from the data. For users of Seurat v1.4, this was implemented in `RegressOut`. However, as the results of this procedure are stored in the scaled data slot (therefore overwriting the output of `ScaleData()`), we now merge this functionality into the `ScaleData()` function itself. + +For each gene, Seurat models the relationship between gene expression and the S and G2M cell cycle scores. The scaled residuals of this model represent a 'corrected' expression matrix, that can be used downstream for dimensional reduction. + +```{r regress, results='hide'} +marrow <- ScaleData(marrow, vars.to.regress = c('S.Score', 'G2M.Score'), features = rownames(marrow)) +``` + +```{r pca2, message=TRUE} +# Now, a PCA on the variable genes no longer returns components associated with cell cycle +marrow <- RunPCA(marrow, features = VariableFeatures(marrow, layer = 'counts'), nfeatures.print = 10) +``` + +```{r pca3} +#When running a PCA on only cell cycle genes, cells no longer separate by cell-cycle phase +marrow <- RunPCA(marrow, features = c(s.genes, g2m.genes)) +DimPlot(marrow) +``` + +As the best cell cycle markers are extremely well conserved across tissues and species, we have found this procedure to work robustly and reliably on diverse datasets. + +# Alternate Workflow + +The procedure above removes all signal associated with cell cycle. In some cases, we've found that this can negatively impact downstream analysis, particularly in differentiating processes (like murine hematopoiesis), where stem cells are quiescent and differentiated cells are proliferating (or vice versa). In this case, regressing out all cell cycle effects can blur the distinction between stem and progenitor cells as well. + +As an alternative, we suggest regressing out the **difference** between the G2M and S phase scores. This means that signals separating non-cycling cells and cycling cells will be maintained, but differences in cell cycle phase among proliferating cells (which are often uninteresting), will be regressed out of the data + +```{r regress_diff, results='hide'} +marrow$CC.Difference <- marrow$S.Score - marrow$G2M.Score +marrow <- ScaleData(marrow, vars.to.regress = 'CC.Difference', features = rownames(marrow)) +``` + +```{r pca4, message=TRUE} +#cell cycle effects strongly mitigated in PCA +marrow <- RunPCA(marrow, features = VariableFeatures(marrow, layer = 'counts'), nfeatures.print = 10) +``` + +```{r pca5} +#when running a PCA on cell cycle genes, actively proliferating cells remain distinct from G1 cells +#however, within actively proliferating cells, G2M and S phase cells group together +marrow <- RunPCA(marrow, features = c(s.genes, g2m.genes)) +DimPlot(marrow) +``` + +```{r save.times, include = FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_cell_cycle_vignette_times.csv") +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/seurat5_conversion_vignette.Rmd b/vignettes/seurat5_conversion_vignette.Rmd new file mode 100644 index 000000000..5e69fbb66 --- /dev/null +++ b/vignettes/seurat5_conversion_vignette.Rmd @@ -0,0 +1,132 @@ +--- +title: "Interoperability between single-cell object formats" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + fig.width = 10, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +```{r, include = FALSE, cache=FALSE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` +In this vignette, we demonstrate the ability to convert between Seurat objects, SingleCellExperiment objects, and anndata objects. + +```{r packages} +# install scater +# https://bioconductor.org/packages/release/bioc/html/scater.html +library(scater) +library(Seurat) +# install SeuratDisk from GitHub using the remotes package +# remotes::install_github(repo = 'mojaveazure/seurat-disk', ref = 'develop') +library(SeuratDisk) +library(SeuratData) +library(patchwork) +``` + +# Converting to/from `SingleCellExperiment` + +[`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) is a class for storing single-cell experiment data, created by Davide Risso, Aaron Lun, and Keegan Korthauer, and is used by many Bioconductor analysis packages. Here we demonstrate converting the Seurat object produced in our 3k PBMC tutorial to SingleCellExperiment for use with Davis McCarthy's [scater](https://bioconductor.org/packages/release/bioc/html/scater.html) package. + +```{r seurat_singlecell} +# Use PBMC3K from SeuratData +InstallData("pbmc3k") +pbmc <- LoadData(ds = "pbmc3k", type = "pbmc3k.final") +pbmc.sce <- as.SingleCellExperiment(pbmc) +p1 <- plotExpression(pbmc.sce, features = 'MS4A1', x = 'ident') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +p2 <- plotPCA(pbmc.sce, colour_by = 'ident') +p1 + p2 +``` + +Seurat also allows conversion from `SingleCellExperiment` objects to Seurat objects; we demonstrate this on some publicly available data downloaded from a repository maintained by [Martin Hemberg's group](http://www.sanger.ac.uk/science/groups/hemberg-group). + +```{r singlecell_seurat} +# download from hemberg lab +# https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/manno_human.rds +manno <- readRDS(file = '../data/manno_human.rds') +manno <- runPCA(manno) +manno.seurat <- as.Seurat(manno, counts = 'counts', data = 'logcounts') +# gives the same results; but omits defaults provided in the last line +manno.seurat <- as.Seurat(manno) +Idents(manno.seurat) <- 'cell_type1' +p1 <- DimPlot(manno.seurat, reduction = 'PCA', group.by = 'Source') + NoLegend() +p2 <- RidgePlot(manno.seurat, features = 'ACTB', group.by = 'Source') +p1 + p2 +``` + +# Converting to/from `loom` + +The [`loom`](http://loompy.org/) format is a file structure imposed on [HDF5 files](http://portal.hdfgroup.org/display/support) designed by [Sten Linnarsson's](http://linnarssonlab.org/) group. It is designed to efficiently hold large single-cell genomics datasets. The ability to save Seurat objects as `loom` files is implemented in [SeuratDisk](https://mojaveazure.github.io/seurat-disk) For more details about the `loom` format, please see the [`loom` file format specification](http://linnarssonlab.org/loompy/format/index.html). + +```{r prepare_loom, echo=FALSE} +if (file.exists('../output/pbmc3k.loom')) { + file.remove('../output/pbmc3k.loom') +} +``` + +```{r seruat_loom} +pbmc.loom <- as.loom(pbmc, filename = '../output/pbmc3k.loom', verbose = FALSE) +pbmc.loom +# Always remember to close loom files when done +pbmc.loom$close_all() +``` + +Seurat can also read in `loom` files connected via [SeuratDisk](https://github.com/mojaveazure/seurat-disk) into a Seurat object; we demonstrate this on a subset of the [Mouse Brain Atlas](http://mousebrain.org/) created by the Linnarsson lab. + +```{r loom_seurat, fig.height=10} +# download from linnarsson lab +# https://storage.googleapis.com/linnarsson-lab-loom/l6_r1_immune_cells.loom +l6.immune <- Connect(filename = '../data/l6_r1_immune_cells.loom', mode = 'r') +l6.immune +l6.seurat <- as.Seurat(l6.immune) +Idents(l6.seurat) <- "ClusterName" +VlnPlot(l6.seurat, features = c('Sparc', 'Ftl1', 'Junb', 'Ccl4'), ncol = 2) +# Always remember to close loom files when done +l6.immune$close_all() +``` + +For more details about interacting with loom files in R and Seurat, please see [loomR on GitHub](https://github.com/mojaveazure/loomR). + +# Converting to/from `AnnData` + +[`AnnData`](https://anndata.readthedocs.io/en/latest/) provides a Python class, created by Alex Wolf and Philipp Angerer, that can be used to store single-cell data. This data format is also use for storage in their [Scanpy](https://scanpy.readthedocs.io/en/latest/index.html) package for which we now support interoperability. Support for reading data from and saving data to `AnnData` files is provided by [SeuratDisk](https://mojaveazure.github.io/seurat-disk); please see their [vignette](https://mojaveazure.github.io/seurat-disk/articles/convert-anndata.html) showcasing the interoperability. + +# Acknowledgments + +Many thanks to [Davis McCarthy](https://twitter.com/davisjmcc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) and [Alex Wolf](https://twitter.com/falexwolf) for their help in drafting the conversion functions. + +```{r save.times, include = FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_conversion_vignette_times.csv") +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/seurat5_de_vignette.Rmd b/vignettes/seurat5_de_vignette.Rmd new file mode 100644 index 000000000..858331e09 --- /dev/null +++ b/vignettes/seurat5_de_vignette.Rmd @@ -0,0 +1,143 @@ +--- +title: "Differential expression testing" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +```{r, include = FALSE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` + +# Load in the data + +This vignette highlights some example workflows for performing differential expression in Seurat. For demonstration purposes, we will be using the 2,700 PBMC object that is available via the [SeuratData](https://github.com/satijalab/seurat-data) package). + +```{r load_data} +library(Seurat) +library(SeuratData) +pbmc <- LoadData("pbmc3k", type = "pbmc3k.final") +``` + +# Perform default differential expression tests + +The bulk of Seurat's differential expression features can be accessed through the `FindMarkers()` function. As a default, Seurat performs differential expression based on the non-parametric Wilcoxon rank sum test. This replaces the previous default test ('bimod'). To test for differential expression between two specific groups of cells, specify the `ident.1` and `ident.2` parameters. + +```{r basic_de} +# list options for groups to perform differential expression on +levels(pbmc) +# Find differentially expressed features between CD14+ and FCGR3A+ Monocytes +monocyte.de.markers <- FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono") +# view results +head(monocyte.de.markers) +``` + +The results data frame has the following columns : + + * p_val : p_val (unadjusted) + * avg_log2FC : log fold-change of the average expression between the two groups. Positive values indicate that the feature is more highly expressed in the first group. + * pct.1 : The percentage of cells where the feature is detected in the first group + * pct.2 : The percentage of cells where the feature is detected in the second group + * p_val_adj : Adjusted p-value, based on Bonferroni correction using all features in the dataset. + +If the `ident.2` parameter is omitted or set to NULL, `FindMarkers()` will test for differentially expressed features between the group specified by `ident.1` and all other cells. + +```{r basic_de_2} +# Find differentially expressed features between CD14+ Monocytes and all other cells, only search for positive markers +monocyte.de.markers <- FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = NULL, only.pos = TRUE) +# view results +head(monocyte.de.markers) +``` + +# Prefilter features or cells to increase the speed of DE testing + +To increase the speed of marker discovery, particularly for large datasets, Seurat allows for pre-filtering of features or cells. For example, features that are very infrequently detected in either group of cells, or features that are expressed at similar average levels, are unlikely to be differentially expressed. Example use cases of the `min.pct`, `logfc.threshold`, `min.diff.pct`, and `max.cells.per.ident` parameters are demonstrated below. + +```{r prefilter} +# Pre-filter features that are detected at <50% frequency in either CD14+ Monocytes or FCGR3A+ Monocytes +head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", min.pct = 0.5)) + +# Pre-filter features that have less than a two-fold change between the average expression of CD14+ Monocytes vs FCGR3A+ Monocytes +head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", logfc.threshold = log(2))) + +# Pre-filter features whose detection percentages across the two groups are similar (within 0.25) +head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", min.diff.pct = 0.25)) + +# Increasing min.pct, logfc.threshold, and min.diff.pct, will increase the speed of DE testing, but could also miss features that are prefiltered + +# Subsample each group to a maximum of 200 cells. Can be very useful for large clusters, or computationally-intensive DE tests +head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", max.cells.per.ident = 200)) +``` + +# Perform DE analysis using alternative tests + +The following differential expression tests are currently supported: + + * "wilcox" : Wilcoxon rank sum test (default) + * "bimod" : Likelihood-ratio test for single cell feature expression, [(McDavid et al., Bioinformatics, 2013)](https://www.ncbi.nlm.nih.gov/pubmed/23267174) + * "roc" : Standard AUC classifier + * "t" : Student's t-test + * "poisson" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets + * "negbinom" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets + * "LR" : Uses a logistic regression framework to determine differentially expressed genes. Constructs a logistic regression model predicting group membership based on each feature individually and compares this to a null model with a likelihood ratio test. + * "MAST" : GLM-framework that treates cellular detection rate as a covariate [(Finak et al, Genome Biology, 2015)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4676162/) ([Installation instructions](https://github.com/RGLab/MAST)) + * "DESeq2" : DE based on a model using the negative binomial distribution [(Love et al, Genome Biology, 2014)](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) ([Installation instructions](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)) + +For MAST and DESeq2, please ensure that these packages are installed separately in order to use them as part of Seurat. Once installed, use the `test.use` parameter can be used to specify which DE test to use. + +```{r include = FALSE} +# necessary to get MAST to work properly +library(SingleCellExperiment) +``` + +```{r multiple test} +# Test for DE features using the MAST package +head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", test.use = "MAST")) + +# Test for DE features using the DESeq2 package. Throws an error if DESeq2 has not already been installed +# Note that the DESeq2 workflows can be computationally intensive for large datasets, but are incompatible with some feature pre-filtering options +# We therefore suggest initially limiting the number of cells used for testing +head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", test.use = "DESeq2", max.cells.per.ident = 50)) +``` + +# Acknowledgements + +We thank the authors of the MAST and DESeq2 packages for their kind assistance and advice. We also point users to the following [study](https://www.nature.com/articles/nmeth.4612) by Charlotte Soneson and Mark Robinson, which performs careful and extensive evaluation of methods for single cell differential expression testing. + +```{r save.times, include = FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_de_vignette_times.csv") +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
+ diff --git a/vignettes/seurat5_dim_reduction_vignette.Rmd b/vignettes/seurat5_dim_reduction_vignette.Rmd new file mode 100644 index 000000000..85e9a663f --- /dev/null +++ b/vignettes/seurat5_dim_reduction_vignette.Rmd @@ -0,0 +1,119 @@ +--- +title: "Seurat - Dimensional Reduction Vignette" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +# Load in the data + +This vignette demonstrates how to store and interact with dimensional reduction information (such as the output from `RunPCA()`) in Seurat. For demonstration purposes, we will be using the 2,700 PBMC object that is available via the [SeuratData](https://github.com/satijalab/seurat-data) package. + +```{r load_data} +library(Seurat) +library(SeuratData) +pbmc <- LoadData("pbmc3k", type = "pbmc3k.final") +``` + +# Explore the new dimensional reduction structure + +In Seurat v3.0, storing and interacting with dimensional reduction information has been generalized and formalized into the `DimReduc` object. Each dimensional reduction procedure is stored as a `DimReduc` object in the `object@reductions` slot as an element of a named list. Accessing these reductions can be done with the `[[` operator, calling the name of the reduction desired. For example, after running a principle component analysis with `RunPCA()`, `object[['pca']]` will contain the results of the PCA. By adding new elements to the list, users can add additional, and custom, dimensional reductions. Each stored dimensional reduction contains the following slots: + +1. **cell.embeddings**: stores the coordinates for each cell in low-dimensional space. +2. **feature.loadings**: stores the weight for each feature along each dimension of the embedding +3. **feature.loadings.projected**: Seurat typically calculate the dimensional reduction on a subset of genes (for example, high-variance genes), and then project that structure onto the entire dataset (all genes). The results of that projection (calculated with `ProjectDim()`) are stored in this slot. Note that the cell loadings will remain unchanged after projection but there are now feature loadings for all feature +4. **stdev**: The standard deviations of each dimension. Most often used with PCA (storing the square roots of the eigenvalues of the covariance matrix) and can be useful when looking at the drop off in the amount of variance that is explained by each successive dimension. +5. **key**: Sets the column names for the cell.embeddings and feature.loadings matrices. For example, for PCA, the column names are PC1, PC2, etc., so the key is "PC". +6. **jackstraw**: Stores the results of the jackstraw procedure run using this dimensional reduction technique. Currently supported only for PCA. +7. **misc**: Bonus slot to store any other information you might want + +To access these slots, we provide the `Embeddings()`,`Loadings()`, and `Stdev()` functions + +```{r explore} +pbmc[['pca']] +head(Embeddings(pbmc, reduction = "pca")[, 1:5]) +head(Loadings(pbmc, reduction = "pca")[, 1:5]) +head(Stdev(pbmc, reduction = "pca")) +``` + +Seurat provides `RunPCA()` (pca), and `RunTSNE()` (tsne), and representing dimensional reduction techniques commonly applied to scRNA-seq data. When using these functions, all slots are filled automatically. + +We also allow users to add the results of a custom dimensional reduction technique (for example, multi-dimensional scaling (MDS), or [zero-inflated factor analysis](https://github.com/epierson9/ZIFA)), that is computed separately. All you need is a matrix with each cell's coordinates in low-dimensional space, as shown below. + +# Storing a custom dimensional reduction calculation + +Though not incorporated as part of the Seurat package, its easy to run multidimensional scaling (MDS) in R. If you were interested in running MDS and storing the output in your Seurat object: + +```{r mds} +# Before running MDS, we first calculate a distance matrix between all pairs of cells. +# Here we use a simple euclidean distance metric on all genes, using scale.data as input +d <- dist(t(GetAssayData(pbmc, slot = 'scale.data'))) +# Run the MDS procedure, k determines the number of dimensions +mds <- cmdscale(d = d, k = 2) +# cmdscale returns the cell embeddings, we first label the columns to ensure downstream consistency +colnames(mds) <- paste0("MDS_", 1:2) +# We will now store this as a custom dimensional reduction called "mds" +pbmc[['mds']] <- CreateDimReducObject(embeddings = mds, key = 'MDS_', assay = DefaultAssay(pbmc)) + +# We can now use this as you would any other dimensional reduction in all downstream functions +DimPlot(pbmc, reduction = "mds", pt.size = 0.5) + +# If you wold like to observe genes that are strongly correlated with the first MDS coordinate +pbmc <- ProjectDim(pbmc, reduction = "mds") + +# Display the results as a heatmap +DimHeatmap(pbmc, reduction = "mds", dims = 1, cells = 500, projected = TRUE, balanced = TRUE) + +# Explore how the first MDS dimension is distributed across clusters +VlnPlot(pbmc, features = "MDS_1") + +# See how the first MDS dimension is correlated with the first PC dimension +FeatureScatter(pbmc, feature1 = "MDS_1", feature2 = "PC_1") +``` + + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- DimPlot(pbmc, reduction = "mds", pt.size = 0.5) +ggsave(filename = "../output/images/pbmc_mds.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +```{r save.times, include = FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_dim_reduction_vignette_times.csv") +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
+ + + + diff --git a/vignettes/seurat5_essential_commands.Rmd b/vignettes/seurat5_essential_commands.Rmd new file mode 100644 index 000000000..885e13d71 --- /dev/null +++ b/vignettes/seurat5_essential_commands.Rmd @@ -0,0 +1,239 @@ +--- +title: "Seurat v5 Command Cheat Sheet" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- +*** +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = 'styler', + warning = FALSE, + eval = FALSE, + error = TRUE, + message = FALSE, + fig.width = 8, + time_it = TRUE +) +``` + +Here, we describe important commands and functions to store, access, and process data using Seurat v5. To demonstrate commamnds, we use a dataset of 3,000 PBMC (stored in-memory), and a dataset of 1.3M E18 mouse neurons (stored on-disk), which we constructed as described in the [BPCells vignette](seurat5_bpcells_interaction_vignette.html). + +```{r loaddata} +library(Seurat) +library(SeuratData) +library(BPCells) +library(dplyr) +options(Seurat.object.assay.version = "v5") +``` + +## Load datasets + +```{r matrix} +pbmc3k <- LoadData("pbmc3k") +mousebrain1m <- readRDS("/brahms/hartmana/vignette_data/1p3_million_mouse_brain.rds") + +# RNA assay is of the Assay5 class +class(pbmc3k[["RNA"]]) +class(mousebrain1m[["RNA"]]) +``` + +## Access and store expression data + +The `$` and double-bracket `[[]]` symbols can be used as efficient accessor functions for Seurat5 assays. + +```{r } + +# access the counts matrix from the RNA assay +counts_matrix <- pbmc3k[["RNA"]]$counts + +# Add a layer +# Equivalent to running pbmc3k <-NormalizeData(pbmc3k) +pbmc3k[["RNA"]]$data <- NormalizeData(pbmc3k[["RNA"]]$counts) + +# Delete a layer +pbmc3k[["RNA"]]$data <- NULL + +# pbmc3k counts matrix is stored in-memory +class(pbmc3k[["RNA"]]$counts) + +# 1.3M cell dataset counts matrix is stored on-disk +class(mousebrain1m[["RNA"]]$counts) +``` + +Despite the drastic difference in dataset size, the 1.3M cell dataset occupies a small memory footprint thanks to on-disk storage. + +```{r} +paste("PBMC 3k contains", length(colnames(pbmc3k)), "cells") +paste("Mouse brain 1.3M contains", length(colnames(mousebrain1m)), "cells") + +# Despite the mouse brain dataset containing 1.3 million cells, the assay is under 350Mbs in size due to on-disk storage +paste("PBMC 3k assay size:", format(object.size(pbmc3k[["RNA"]]), units = "Mb")) +paste("Mouse brain 1.3M assay size:", format(object.size(mousebrain1m[["RNA"]]), units = "Mb")) +``` + +## Access cell names and metadata + +Get cell names. Since Seurat v5 object doesn't require all assays have the same cells, `Cells()` is designed to get cell names of the default assay and `colnames()` is deigned to get cell names of the entire object + +```{r} +pbmc3k[["RNAsub"]] <- subset(pbmc3k[["RNA"]], cells = colnames(pbmc3k)[1:100]) +DefaultAssay(pbmc3k) <- 'RNAsub' +length(Cells(pbmc3k)) +length(colnames(pbmc3k)) +``` + +Access object metadata + +```{r meta} + +# get all object metadata +pbmc_metadata <- pbmc3k[[]] + +# get list of metadata columns +colnames(pbmc_metadata) + +# get annotations stored in metadata +annotations <- pbmc3k$seurat_annotations + +``` + +## Create Seurat or Assay objects + +By setting a global option (`Seurat.object.assay.version`), you can default to creating either Seurat v3 assays, or Seurat v5 assays. The use of v5 assays is set by default upon package loading, which ensures backwards compatibiltiy with existing workflows. + +```{r create} +# create v3 assays +options(Seurat.object.assay.version = "v3") +pbmc.counts <- Read10X(data.dir = "/brahms/hartmana/vignette_data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc <- CreateSeuratObject(counts = pbmc.counts) +class(pbmc[["RNA"]]) + +# create v5 assays +options(Seurat.object.assay.version = "v5") +pbmc.counts <- Read10X(data.dir = "/brahms/hartmana/vignette_data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc <- CreateSeuratObject(counts = pbmc.counts) +class(pbmc[["RNA"]]) +``` + +`CreateAssayObject()` and `CreateAssay5Object()` can be used to create v3 and v5 assay regardless of the setting in `Seurat.object.assay.version` + +```{r} +#create a v3 assay +assay.v3 <- CreateAssayObject(counts = pbmc.counts) + +#create a v5 assay +assay.v5 <- CreateAssay5Object(counts = pbmc.counts) + +class(assay.v3) +class(assay.v5) +``` + +Assay5 objects are more flexible, and can be used to store only a data layer, with no counts data. This can be used to create Seurat objects that require less space + +```{r} +# create an assay using only normalized data +assay.v5 <- CreateAssay5Object(data = log1p(pbmc.counts)) + +# create a Seurat object based on this assay +pbmc3k_slim <- CreateSeuratObject(assay.v5) +pbmc3k_slim +``` + +We can also convert (cast) between `Assay` and `Assay5` objects with `as()`. + +```{r} +# convert a v5 assay to a v3 assay +pbmc3k[["RNA3"]] <- as(object = pbmc3k[["RNA"]], Class = "Assay") + +# convert a v3 assay to a v5 assay +pbmc3k[["RNA5"]] <- as(object = pbmc3k[["RNA3"]], Class = "Assay5") +``` + +## Working with layers + +Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (`layer='counts'`), normalized data (`layer='data'`), or z-scored/variance-stabilized data (`layer='scale.data'`). + +```{r} +# by default, creates an RNA assay with a counts layer +obj <- CreateSeuratObject(counts = pbmc.counts) +obj + +# creates a normalized data layer +obj <- NormalizeData(obj,verbose = FALSE) +obj + +#extract only the layer names from an assay +Layers(obj[["RNA"]]) +``` + +Prior to performing integration analysis in Seurat v5, we can split the layers into groups. The `IntegrateLayers` function, described in [our vignette](seurat5_integration.html), will then align shared cell types across these layers. After performing integration, you can rejoin the layers. + +```{r joinsplit} +# create random batches +pbmc3k$batch <- sample(c("batchA","batchB","batchC"),ncol(pbmc3k),replace = TRUE) + +# split layers +pbmc3k[["RNA"]] <- split(pbmc3k[["RNA"]], f=pbmc3k$batch) +Layers(pbmc3k[["RNA"]]) + +# rejoin layers +pbmc3k[["RNA"]] <- JoinLayers(pbmc3k[["RNA"]]) +Layers(pbmc3k[["RNA"]]) +``` + +If you have multiple counts matrices, you can also create a Seurat object that is initialized with multiple layers. + +```{r multilayer} +batchA_counts <- pbmc.counts[,1:200] +batchB_counts <- pbmc.counts[,201:400] +batchC_counts <- pbmc.counts[,401:600] +count_list <- list(batchA_counts,batchB_counts,batchC_counts) +names(count_list) <- c('batchA','batchB','batchC') + +# create a Seurat object initialized with multiple layers +obj <- CreateSeuratObject(counts = count_list) +Layers(obj[["RNA"]]) +``` + +## Accessing additional data + +```{r} +pbmc3k <- FindVariableFeatures(pbmc3k, verbose = FALSE) +pbmc3k <- ScaleData(pbmc3k,verbose = FALSE) +pbmc3k <- RunPCA(pbmc3k,verbose = FALSE) + +# return variable features + +# returns information from both assay, cell embeddings and meta.data as a data.frame +fetch_df <- FetchData(object = pbmc3k, layer = "counts", vars = c("rna_MS4A1", "PC_1", "nCount_RNA")) +head(fetch_df) + +# get cell embeddings +head(Embeddings(object = pbmc3k[['pca']])[, 1:5]) + +# get feature loadings +head(Loadings(object = pbmc3k[['pca']])[, 1:5]) +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/seurat5_extensions.Rmd b/vignettes/seurat5_extensions.Rmd new file mode 100644 index 000000000..787bdcdb6 --- /dev/null +++ b/vignettes/seurat5_extensions.Rmd @@ -0,0 +1,35 @@ +--- +title: "Seurat Extension Packages" +output: html_document +--- + +In addition to the core Seurat package, we provide several extensions that enhance the functionality and utility of Seurat. A brief description of each is listed below with links to more complete documentation and examples. + +# Signac + +Signac is an R toolkit that extends Seurat for the analysis, interpretation, and exploration of single-cell chromatin datasets. The software supports the following features: + +* Calculating single-cell QC metrics +* Dimensional reduction, visualization, and clustering +* Identifying cell type-specific peaks +* Visualizing ‘pseudo-bulk’ coverage tracks +* Integration with single-cell RNA-seq datasets + +For documentation and vignettes, click [here](https://satijalab.org/signac/). + +# SeuratData + +SeuratData is a mechanism for distributing datasets in the form of Seurat objects using R’s internal package and data management systems. It represents an easy way for users to get access to datasets that are used in the Seurat vignettes. For more information, click [here](https://github.com/satijalab/seurat-data). + +# SeuratWrappers + +In order to facilitate the use of community tools with Seurat, we provide the SeuratWrappers package, which contains code to run other analysis tools on Seurat objects. For a full list of supported packages and vignettes, please see our vignettes page. + +# SeuratDisk + +The SeuratDisk package introduces the h5Seurat file format for the storage and analysis of multimodal single-cell and spatially-resolved expression experiments. The SeuratDisk package provides functions to save Seurat objects as h5Seurat files, and functions for rapid on-disk conversion between h5Seurat and AnnData formats with the goal of enhancing interoperability between Seurat and Scanpy. For more information, click [here](https://mojaveazure.github.io/seurat-disk/) + +# Azimuth + +Azimuth is a web application that uses an annotated reference dataset to automate the processing, analysis, and interpretation of a new single-cell RNA-seq experiment. Azimuth leverages a 'reference-based mapping' pipeline that inputs a counts matrix of gene expression in single cells, and performs normalization, visualization, cell annotation, and differential expression (biomarker discovery). All results can be explored within the app, and easily downloaded for additional downstream analysis. To use the Azimuth web app, visit the Azimuth website [here](https://azimuth.hubmapconsortium.org/). + diff --git a/vignettes/seurat5_future_vignette.Rmd b/vignettes/seurat5_future_vignette.Rmd new file mode 100644 index 000000000..694227bb8 --- /dev/null +++ b/vignettes/seurat5_future_vignette.Rmd @@ -0,0 +1,134 @@ +--- +title: "Parallelization in Seurat with future" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` +In Seurat, we have chosen to use the `future` framework for parallelization. In this vignette, we will demonstrate how you can take advantage of the `future` implementation of certain Seurat functions from a user's perspective. If you are interested in learning more about the `future` framework beyond what is described here, please see the package vignettes [here](https://cran.r-project.org/web/packages/future/index.html) for a comprehensive and detailed description. + +# How to use parallelization in Seurat + +To access the parallel version of functions in Seurat, you need to load the `future` package and set the `plan`. The `plan` will specify how the function is executed. The default behavior is to evaluate in a non-parallelized fashion (sequentially). To achieve parallel (asynchronous) behavior, we typically recommend the "multiprocess" strategy. By default, this uses all available cores but you can set the `workers` parameter to limit the number of concurrently active futures. + +```{r future.setup} +library(future) +# check the current active plan +plan() +# change the current plan to access parallelization +plan("multiprocess", workers = 4) +plan() +``` + +# 'Futurized' functions in Seurat + +The following functions have been written to take advantage of the future framework and will be parallelized if the current `plan` is set appropriately. Importantly, the way you call the function shouldn't change. + +* `NormalizeData()` +* `ScaleData()` +* `JackStraw()` +* `FindMarkers()` +* `FindIntegrationAnchors()` +* `FindClusters()` - if clustering over multiple resolutions + +For example, to run the parallel version of `FindMarkers()`, you simply need to set the plan and call the function as usual. + +```{r demo} +library(Seurat) +pbmc <- readRDS("../data/pbmc3k_final.rds") +pbmc <- UpdateSeuratObject(pbmc) +pbmc[["RNA"]] <- as(pbmc[["RNA"]], Class = "Assay5") + +# Enable parallelization +plan('multiprocess', workers = 4) +markers <- FindMarkers(pbmc, ident.1 = "NK", verbose = FALSE) +``` + +# Comparison of sequential vs. parallel + +Here we'll perform a brief comparison the runtimes for the same function calls with and without parallelization. Note that while we expect that using a parallelized strategy will decrease the runtimes of the functions listed above, the magnitude of that decrease will depend on many factors (e.g. the size of the dataset, the number of workers, specs of the system, the future strategy, etc). The following benchmarks were performed on a desktop computer running Ubuntu 16.04.5 LTS with an Intel(R) Core(TM) i7-6800K CPU @ 3.40GHz and 96 GB of RAM. + +
+ **Click to see bencharking code** + +```{r compare} +timing.comparisons <- data.frame(fxn = character(), time = numeric(), strategy = character()) +plan("sequential") +start <- Sys.time() +pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) +end <- Sys.time() +timing.comparisons <- rbind(timing.comparisons, data.frame(fxn = "ScaleData", time = as.numeric(end - start, units = "secs"), strategy = "sequential")) + +start <- Sys.time() +markers <- FindMarkers(pbmc, ident.1 = "NK", verbose = FALSE) +end <- Sys.time() +timing.comparisons <- rbind(timing.comparisons, data.frame(fxn = "FindMarkers", time = as.numeric(end - start, units = "secs"), strategy = "sequential")) + +plan("multiprocess", workers = 4) +start <- Sys.time() +pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) +end <- Sys.time() +timing.comparisons <- rbind(timing.comparisons, data.frame(fxn = "ScaleData", time = as.numeric(end - start, units = "secs"), strategy = "multiprocess")) + +start <- Sys.time() +markers <- FindMarkers(pbmc, ident.1 = "NK", verbose = FALSE) +end <- Sys.time() +timing.comparisons <- rbind(timing.comparisons, data.frame(fxn = "FindMarkers", time = as.numeric(end - start, units = "secs"), strategy = "multiprocess")) +``` + +
+ +```{r viz.compare} +library(ggplot2) +library(cowplot) +ggplot(timing.comparisons, aes(fxn, time)) + geom_bar(aes(fill = strategy), stat = "identity", position = "dodge") + ylab("Time(s)") + xlab("Function") + theme_cowplot() +``` + +# Frequently asked questions + +1. **Where did my progress bar go?** +
Unfortunately, the when running these functions in any of the parallel plan modes you will lose the progress bar. This is due to some technical limitations in the `future` framework and R generally. If you want to monitor function progress, you'll need to forgo parallelization and use `plan("sequential")`. + +2. **What should I do if I keep seeing the following error?** +``` +Error in getGlobalsAndPackages(expr, envir = envir, globals = TRUE) : + The total size of the X globals that need to be exported for the future expression ('FUN()') is X GiB. + This exceeds the maximum allowed size of 500.00 MiB (option 'future.globals.maxSize'). The X largest globals are ... +``` +For certain functions, each worker needs access to certain global variables. If these are larger than the default limit, you will see this error. To get around this, you can set `options(future.globals.maxSize = X)`, where X is the maximum allowed size in bytes. So to set it to 1GB, you would run `options(future.globals.maxSize = 1000 * 1024^2)`. Note that this will increase your RAM usage so set this number mindfully. + + +```{r save.times, include = FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_future_vignette_times.csv") +``` + +
+ **Session Info** +```{r} +sessionInfo() +``` +
diff --git a/vignettes/seurat5_get_started.Rmd b/vignettes/seurat5_get_started.Rmd new file mode 100644 index 000000000..998c6cc9d --- /dev/null +++ b/vignettes/seurat5_get_started.Rmd @@ -0,0 +1,159 @@ +--- +title: "Getting Started with Seurat" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +--- + +```{r fxns, include = FALSE} +library('htmlTable') +make_list <- function(items) { + paste0("
    ", sprintf('
  • %s
  • ', items), '
', collapse = '') +} +make_href <- function(url, text){ + paste0("") +} +make_href2 <- function(url, text){ + paste0("", text, "") +} +process_entry <- function(dat) { + if (grepl(pattern = "https://satijalab.org/img/vignette_images", x = dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else if (grepl(pattern = "assets/", x= dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else { + img <- paste0('![](', '../output/images/', dat$image, '){width=3000px}') + } + if (grepl(pattern = "https://satijalab.org/", x = dat$name)) { + link <- dat$name + } else { + link <- paste0(dat$name, ".html") + } + go.button <- paste0('GO') + data.frame( + title = make_href(url = link, text = dat$title), + img = img, + desc = dat$summary, + btn = go.button + ) +} +process_wrapper_entry <- function(dat) { + data.frame( + Package = dat$name, + Vignette = make_href2(url = dat$link, text = dat$title), + Reference = make_href2(url = dat$reference, text = dat$citation), + Source = make_href2(url = dat$source, text = dat$source) + ) +} +make_vignette_card_section <- function(vdat, cat) { + vignettes <- vdat[[cat]]$vignettes + dat <- data.frame(title = character(), img = character(), desc = character()) + for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_entry(vignettes[[v]])) + if(nrow(x = dat) == 3 | v == length(x = vignettes)){ + colnames(dat) <- NULL + dat <- t(dat) + if (ncol(x = dat) == 2) { + print(htmlTable( + dat, + align = '|l|l|', + css.cell = "padding-left: .75em; width: 50%", + css.class = "two-column-htmltable" + )) + } else if (ncol(x = dat) == 1){ + print(htmlTable( + dat, + align = '|l|', + css.cell = "padding-left: .75em; width: 100%", + css.class = "one-column-htmltable" + )) + } else { + print(htmlTable( + dat, + align = '|l|l|l|', + css.cell = "padding-left: .75em; width: 30%" + )) + } + dat <- data.frame(title = character(), img = character(), desc = character()) + } + } +} +``` + +```{r yaml, include = TRUE} +library(yaml) +vdat <- read_yaml(file = "vignettes.yaml") +``` + +```{=html} + +``` + +We provide a series of vignettes, tutorials, and analysis walkthroughs to help users get started with Seurat. You can also check out our [Reference page](../reference/index.html) which contains a full list of functions available to users. + +# Introductory Vignettes + +For new users of Seurat, we suggest starting with a guided walk through of a dataset of 2,700 Peripheral Blood Mononuclear Cells (PBMCs) made publicly available by 10X Genomics. This tutorial implements the major components of a standard unsupervised clustering workflow including QC and data filtration, calculation of high-variance genes, dimensional reduction, graph-based clustering, and the identification of cluster markers. + +We provide additional introductory vignettes for users who are interested in analyzing multimodal single-cell datasets (e.g. from CITE-seq, or the 10x mulitome kit), or spatial datasets (e.g. from 10x visium or SLIDE-seq). + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 1) +``` + +# Data Integration + +Recently, we have developed [computational methods](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) for integrated analysis of single-cell datasets generated across different conditions, technologies, or species. As an example, we provide a guided walk through for integrating and comparing PBMC datasets generated under different stimulation conditions. We provide additional vignettes demonstrating how to leverage an annotated scRNA-seq reference to map and label cells from a query, and to efficiently integrate large datasets. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 2) +``` + +# Additional New Methods + +Seurat also offers additional novel statistical methods for analyzing single-cell data. These include: + +* Weighted-nearest neighbor (WNN) analysis: to define cell state based on multiple modalities [[paper](https://doi.org/10.1016/j.cell.2021.04.048)] +* Mixscape: to analyze data from pooled single-cell CRISPR screens [[paper](https://doi.org/10.1038/s41588-021-00778-2)] +* SCTransform: Improved normalization for single-cell RNA-seq data [[paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1)]] +* SCTransform, v2 regularization [[paper](https://www.biorxiv.org/content/10.1101/2021.07.07.451498v1.full)]] + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 3) +``` + +# Other + +Here we provide a series of short vignettes to demonstrate a number of features that are commonly used in Seurat. We’ve focused the vignettes around questions that we frequently receive from users. Click on a vignette to get started. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 4) +``` + +# SeuratWrappers + +In order to facilitate the use of community tools with Seurat, we provide the Seurat Wrappers package, which contains code to run other analysis tools on Seurat objects. For the initial release, we provide wrappers for a few packages in the table below but would encourage other package developers interested in interfacing with Seurat to check out our contributor guide [here](https://github.com/satijalab/seurat.wrappers/wiki/Submission-Process). + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +library(knitr) +library(kableExtra) +cat <- 5 +vignettes <- vdat[[cat]]$vignettes +dat <- data.frame(Package = character(), Vignette = character(), Reference = character(), Source = character()) +for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_wrapper_entry(vignettes[[v]])) +} +dat %>% + kable(format = "html", escape = FALSE) %>% + kable_styling(bootstrap_options = c("striped", "hover")) +``` diff --git a/vignettes/seurat5_hashing_vignette.Rmd b/vignettes/seurat5_hashing_vignette.Rmd new file mode 100644 index 000000000..a1cc52d96 --- /dev/null +++ b/vignettes/seurat5_hashing_vignette.Rmd @@ -0,0 +1,291 @@ +--- +title: "Demultiplexing with hashtag oligos (HTOs)" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +Developed in collaboration with the Technology Innovation Group at NYGC, Cell Hashing uses oligo-tagged antibodies against ubiquitously expressed surface proteins to place a "sample barcode" on each single cell, enabling different samples to be multiplexed together and run in a single experiment. For more information, please refer to this [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1). + +This vignette will give a brief demonstration on how to work with data produced with Cell Hashing in Seurat. Applied to two datasets, we can successfully demultiplex cells to their the original sample-of-origin, and identify cross-sample doublets. + +
+
The demultiplexing function `HTODemux()` implements the following procedure:
    +
  • We perform a k-medoid clustering on the normalized HTO values, which initially separates cells into K(# of samples)+1 clusters.
  • +
  • We calculate a 'negative' distribution for HTO. For each HTO, we use the cluster with the lowest average value as the negative group.
  • +
  • For each HTO, we fit a negative binomial distribution to the negative cluster. We use the 0.99 quantile of this distribution as a threshold.
  • +
  • Based on these thresholds, each cell is classified as positive or negative for each HTO.
  • +
  • Cells that are positive for more than one HTOs are annotated as doublets.
  • +
+ + +# 8-HTO dataset from human PBMCs + +
+ +
Dataset description:
+
    +
  • Data represent peripheral blood mononuclear cells (PBMCs) from eight different donors.
  • +
  • Cells from each donor are uniquely labeled, using CD45 as a hashing antibody.
  • +
  • Samples were subsequently pooled, and run on a single lane of the the 10X Chromium v2 system. +
  • You can download the count matrices for RNA and HTO [here](https://www.dropbox.com/sh/ntc33ium7cg1za1/AAD_8XIDmu4F7lJ-5sp-rGFYa?dl=0), or the FASTQ files from [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE108313)
  • +
+ +
+ +## Basic setup + +Load packages + +```{r load_pacakges} +library(Seurat) +options(Seurat.object.assay.version = "v5") +``` + +Read in data + +```{r read_Data} +# Load in the UMI matrix +pbmc.umis <- readRDS("../data/pbmc_umi_mtx.rds") + +# For generating a hashtag count matrix from FASTQ files, please refer to https://github.com/Hoohm/CITE-seq-Count. +# Load in the HTO count matrix +pbmc.htos <- readRDS("../data/pbmc_hto_mtx.rds") + +# Select cell barcodes detected by both RNA and HTO +# In the example datasets we have already filtered the cells for you, but perform this step for clarity. +joint.bcs <- intersect(colnames(pbmc.umis), colnames(pbmc.htos)) + +# Subset RNA and HTO counts by joint cell barcodes +pbmc.umis <- pbmc.umis[, joint.bcs] +pbmc.htos <- as.matrix(pbmc.htos[, joint.bcs]) + +# Confirm that the HTO have the correct names +rownames(pbmc.htos) +``` + +Setup Seurat object and add in the HTO data + +```{r hashtag_setup} +# Setup Seurat object +pbmc.hashtag <- CreateSeuratObject(counts = pbmc.umis) + +# Normalize RNA data with log normalization +pbmc.hashtag <- NormalizeData(pbmc.hashtag) +# Find and scale variable features +pbmc.hashtag <- FindVariableFeatures(pbmc.hashtag, selection.method = 'mean.var.plot') +pbmc.hashtag <- ScaleData(pbmc.hashtag, features = VariableFeatures(pbmc.hashtag)) +``` + +## Adding HTO data as an independent assay + +You can read more about working with multi-modal data [here](multimodal_vignette.html) + +```{r hto_assay} +# Add HTO data as a new assay independent from RNA +pbmc.hashtag[['HTO']] <- CreateAssay5Object(counts = pbmc.htos) +# Normalize HTO data, here we use centered log-ratio (CLR) transformation +pbmc.hashtag <- NormalizeData(pbmc.hashtag, assay = 'HTO', normalization.method = 'CLR') +``` + +## Demultiplex cells based on HTO enrichment + +Here we use the Seurat function `HTODemux()` to assign single cells back to their sample origins. + +```{r hashtag_demux, results = FALSE} +# If you have a very large dataset we suggest using k_function = "clara". This is a k-medoid clustering function for large applications +# You can also play with additional parameters (see documentation for HTODemux()) to adjust the threshold for classification +# Here we are using the default settings +pbmc.hashtag <- HTODemux(pbmc.hashtag, assay = "HTO", positive.quantile = 0.99) +``` + +## Visualize demultiplexing results + +Output from running `HTODemux()` is saved in the object metadata. We can visualize how many cells are classified as singlets, doublets and negative/ambiguous cells. + +```{r demux_summary} +# Global classification results +table(pbmc.hashtag$HTO_classification.global) +``` + +Visualize enrichment for selected HTOs with ridge plots + +```{r hashtag_ridge, fig.width=9} +# Group cells based on the max HTO signal +Idents(pbmc.hashtag) <- 'HTO_maxID' +RidgePlot(pbmc.hashtag, assay = 'HTO', features = rownames(pbmc.hashtag[['HTO']])[1:2], ncol = 2) +``` + +Visualize pairs of HTO signals to confirm mutual exclusivity in singlets + +```{r hashtag_scatter1, fig.height=8, fig.width=9} +FeatureScatter(pbmc.hashtag, feature1 = 'hto_HTO-A', feature2 = 'hto_HTO-B') +``` + +Compare number of UMIs for singlets, doublets and negative cells +```{r hashtag_vln, fig.width=10} +Idents(pbmc.hashtag) <- 'HTO_classification.global' +VlnPlot(pbmc.hashtag, features = 'nCount_RNA', pt.size = 0.1, log = TRUE) +``` + +Generate a two dimensional tSNE embedding for HTOs.Here we are grouping cells by singlets and doublets for simplicity. + +```{r hashtag_sub_tsne, fig.width=9} +#First, we will remove negative cells from the object +pbmc.hashtag.subset <- subset(pbmc.hashtag, idents = 'Negative', invert = TRUE) + +# Calculate a tSNE embedding of the HTO data +DefaultAssay(pbmc.hashtag.subset) <- "HTO" +pbmc.hashtag.subset <- ScaleData(pbmc.hashtag.subset, features = rownames(pbmc.hashtag.subset), verbose = FALSE) +pbmc.hashtag.subset <- RunPCA(pbmc.hashtag.subset, features = rownames(pbmc.hashtag.subset), approx = FALSE) +pbmc.hashtag.subset <- RunTSNE(pbmc.hashtag.subset, dims = 1:8, perplexity = 100) +DimPlot(pbmc.hashtag.subset) +#You can also visualize the more detailed classification result by running Idents(object) <- 'HTO_classification' before plotting. Here, you can see that each of the small clouds on the tSNE plot corresponds to one of the 28 possible doublet combinations. +``` + +Create an HTO heatmap, based on Figure 1C in the Cell Hashing paper. + +```{r hashtag_heatmap, fig.width=12} +# To increase the efficiency of plotting, you can subsample cells using the num.cells argument +HTOHeatmap(pbmc.hashtag, assay = 'HTO', ncells = 5000) +``` + +Cluster and visualize cells using the usual scRNA-seq workflow, and examine for the potential presence of batch effects. + +```{r hastag_cluster} +# Extract the singlets +pbmc.singlet <- subset(pbmc.hashtag, idents = 'Singlet') + +# Select the top 1000 most variable features +pbmc.singlet <- FindVariableFeatures(pbmc.singlet, selection.method = 'mean.var.plot') + +# Scaling RNA data, we only scale the variable features here for efficiency +pbmc.singlet <- ScaleData(pbmc.singlet, features = VariableFeatures(pbmc.singlet)) + +# Run PCA +pbmc.singlet <- RunPCA(pbmc.singlet, features = VariableFeatures(pbmc.singlet)) +``` + +```{r hashtag_tsne, fig.width=9} +# We select the top 10 PCs for clustering and tSNE based on PCElbowPlot +pbmc.singlet <- FindNeighbors(pbmc.singlet, reduction = 'pca', dims = 1:10) +pbmc.singlet <- FindClusters(pbmc.singlet, resolution = 0.6, verbose = FALSE) +pbmc.singlet <- RunTSNE(pbmc.singlet, reduction = 'pca', dims = 1:10) + +# Projecting singlet identities on TSNE visualization +DimPlot(pbmc.singlet, group.by = "HTO_classification") +``` + +# 12-HTO dataset from four human cell lines + +
+ +
Dataset description:
+
    +
  • Data represent single cells collected from four cell lines: HEK, K562, KG1 and THP1
  • +
  • Each cell line was further split into three samples (12 samples in total).
  • +
  • Each sample was labeled with a hashing antibody mixture (CD29 and CD45), pooled, and run on a single lane of 10X.
  • +
  • Based on this design, we should be able to detect doublets both across and within cell types
  • +
  • You can download the count matrices for RNA and HTO [here](https://www.dropbox.com/sh/c5gcjm35nglmvcv/AABGz9VO6gX9bVr5R2qahTZha?dl=0), and are available on GEO [here](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE108313)
  • +
+
+ +## Create Seurat object, add HTO data and perform normalization + +```{r hto_setup} +library(Seurat) +options(Seurat.object.assay.version = "v5") + +# Read in UMI count matrix for RNA +hto12.umis <- readRDS("../data/hto12_umi_mtx.rds") + +# Read in HTO count matrix +hto12.htos <- readRDS("../data/hto12_hto_mtx.rds") + +# Select cell barcodes detected in both RNA and HTO +cells.use <- intersect(rownames(hto12.htos), colnames(hto12.umis)) + +# Create Seurat object and add HTO data +hto12 <- CreateSeuratObject(counts = as(hto12.umis[, cells.use], "dgCMatrix"), min.features = 300) +hto12[['HTO']] <- CreateAssay5Object(counts = t(x = hto12.htos[colnames(hto12), 1:12])) + +# Normalize data +hto12 <- NormalizeData(hto12) +hto12 <- NormalizeData(hto12, assay = "HTO", normalization.method = "CLR") +``` + +## Demultiplex data + +```{r demux, results = FALSE} +hto12 <- HTODemux(hto12, assay = "HTO", positive.quantile = 0.99) +``` + +## Visualize demultiplexing results + +Distribution of selected HTOs grouped by classification, displayed by ridge plots + +```{r ridgeplot, fig.height=10, fig.width=9} +RidgePlot(hto12, assay = 'HTO', features = c("HEK-A","K562-B","KG1-A","THP1-C"), ncol = 2) +``` + +Visualize HTO signals in a heatmap + +```{r heatmap, fig.width=12} +HTOHeatmap(hto12, assay = "HTO") +``` + +## Visualize RNA clustering + +
  • Below, we cluster the cells using our standard scRNA-seq workflow. As expected we see four major clusters, corresponding to the cell lines
  • +
  • In addition, we see small clusters in between, representing mixed transcriptomes that are correctly annotated as doublets.
  • +
  • We also see within-cell type doublets, that are (perhaps unsurprisingly) intermixed with singlets of the same cell type
  • + +```{r hto_sub_tsne, fig.width=9} +# Remove the negative cells +hto12 <- subset(hto12, idents = 'Negative', invert = TRUE) + +# Run PCA on most variable features +hto12 <- FindVariableFeatures(hto12, selection.method = 'mean.var.plot') +hto12 <- ScaleData(hto12, features = VariableFeatures(hto12)) +hto12 <- RunPCA(hto12) +hto12 <- RunTSNE(hto12, dims = 1:5, perplexity = 100) +DimPlot(hto12) + NoLegend() +``` + +```{r save.times, include = TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_hashing_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_integration.Rmd b/vignettes/seurat5_integration.Rmd new file mode 100644 index 000000000..b03134b6d --- /dev/null +++ b/vignettes/seurat5_integration.Rmd @@ -0,0 +1,219 @@ +--- +title: "Integrative analysis in Seurat v5" +output: + html_document: + theme: united + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = 'styler', + fig.width = 8, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +```{r init} +library(Seurat) +library(SeuratData) +library(SeuratWrappers) +library(Azimuth) +library(ggplot2) +library(patchwork) +options(future.globals.maxSize = 1e9) +``` + +## Introduction +Integration of single-cell sequencing datasets, for example across experimental batches, donors, or conditions, is often an important step in scRNA-seq workflows. Integrative analysis can help to match shared cell types and states across datasets, which can boost statistical power, and most importantly, facilitate accurate comparative analysis across datasets. In previous versions of Seurat we introduced methods for integrative analysis, including our ‘anchor-based’ integration workflow. Many labs have also published powerful and pioneering methods, including [Harmony](https://github.com/immunogenomics/harmony) and [scVI](https://yoseflab.github.io/software/scvi-tools/), for integrative analysis. +We recognize that while the goal of matching shared cell types across datasets may be important for many problems, users may also be concerned about which method to use, or that integration could result in a loss of biological resolution. In Seurat v5, we introduce more flexible and streamlined infrastructure to run different integration algorithms with a single line of code. This makes it easier to explore the results of different integration methods, and to compare these results to a workflow that excludes integration steps. +For this vignette, we use a [dataset of human PBMC profiled with seven different technologies](https://www.nature.com/articles/s41587-020-0465-8), profiled as part of a systematic comparative analysis (`pbmcsca`). The data is available as part of our [SeuratData](https://github.com/satijalab/seurat-data) package. + +## Layers in the Seurat v5 object +Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (`layer='counts'`), normalized data (`layer='data'`), or z-scored/variance-stabilized data (`layer='scale.data'`). We can load in the data, remove low-quality cells, and obtain predicted cell annotations (which will be useful for assessing integration later), using our [Azimuth pipeline](https://satijalab.github.io/azimuth/articles/run_azimuth_tutorial.html). + +```{r installdata, include=FALSE, eval=TRUE} +InstallData("pbmcref") +``` + +```{r loadannotate, message=FALSE} +# load in the pbmc systematic comparative analysis dataset +obj <- LoadData("pbmcsca") +obj <- subset(obj, nFeature_RNA > 1000) +obj <- RunAzimuth(obj, reference = "pbmcref") +# currently, the object has two layers in the RNA assay: counts, and data +obj +``` +The object contains data from nine different batches (stored in the `Method` column in the object metadata), representing seven different technologies. We will aim to integrate the different batches together. In previous versions of Seurat, we would require the data to be represented as nine different Seurat objects. When using Seurat v5 assays, we can instead keep all the data in one object, but simply split the layers. +After splitting, there are now 18 layers (a `counts` and `data` layer for each batch). We can also run a standard scRNA-seq analysis (i.e. without integration). Note that since the data is split into layers, normalization and variable feature identification is performed for each batch independently (a consensus set of variable features is automatically identified). +```{r splitassay} +obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +obj +obj <- NormalizeData(obj) +obj <- FindVariableFeatures(obj) +obj <- ScaleData(obj) +obj <- RunPCA(obj) +``` +We can now visualize the results of a standard analysis without integration. Note that cells are grouping both by cell type and by underlying method. While a UMAP analysis is just a visualization of this, clustering this dataset would return predominantly batch-specific clusters. Especially if previous cell-type annotations were not available, this would make downstream analysis extremely challenging. +```{r unintegratedUMAP, fig.height=5, fig.width=14} +obj <- FindNeighbors(obj, dims=1:30, reduction = 'pca') +obj <- FindClusters(obj, resolution = 2, cluster.name = "unintegrated_clusters") +obj <- RunUMAP(obj, dims = 1:30, reduction = 'pca', reduction.name = 'umap.unintegrated') +# visualize by batch and cell type annotation +# cell type annotations were previously added by Azimuth +DimPlot(obj, reduction = 'umap.unintegrated', group.by=c('Method','predicted.celltype.l2')) +``` + +## Perform streamlined (one-line) integrative analysis + +Seurat v5 enables streamlined integrative analysis using the `IntegrateLayers` function. The method currently supports five integration methods. Each of these methods performs integration in low-dimensional space, and returns a dimensional reduction (i.e. `integrated.rpca`) that aims to co-embed shared cell types across batches: + +* Anchor-based CCA integration (`method=CCAIntegration`) +* Anchor-based RPCA integration (`method=RPCAIntegration`) +* Harmony (`method=HarmonyIntegration`) +* FastMNN (`method= FastMNNIntegration`) +* scVI (`method=scVIIntegration`) + +Note that our anchor-based RPCA integration represents a faster and more conservative (less correction) method for integration. For interested users, we discuss this method in more detail in our [previous RPCA vignette](https://satijalab.org/seurat/articles/integration_rpca) + +You can find more detail on each method, and any installation prerequisites, in Seurat's documentation (for example, `?scVIIntegration`). For example, scVI integration requires `reticulate` which can be installed from CRAN (`install.packages("reticulate")`) as well as `scvi-tools` and its dependencies installed in a conda environment. Please see scVI installation instructions [here](https://docs.scvi-tools.org/en/stable/installation.html). + + +Each of the following lines perform a new integration using a single line of code: + +```{r integratelayerscca, results='hide'} +obj <- IntegrateLayers( + object = obj, method = CCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.cca', + verbose = FALSE) +``` + +```{r integratelayersrpca, results='hide'} +obj <- IntegrateLayers( + object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + verbose = FALSE) +``` + +```{r integratelayersharmony, results='hide'} +obj <- IntegrateLayers( + object = obj, method = HarmonyIntegration, + orig.reduction = "pca", new.reduction = 'harmony', + verbose = FALSE) +``` + +```{r integratelayersfastmnn, results='hide'} +obj <- IntegrateLayers( + object = obj, method = FastMNNIntegration, + new.reduction = 'integrated.mnn', + verbose = FALSE) +``` + +```{r integratelayersscvi, eval=FALSE, results='hide'} +obj <- IntegrateLayers( + object = obj, method = scVIIntegration, + new.reduction = 'integrated.scvi', + conda_env = '../miniconda3/envs/scvi-env', verbose = FALSE) +``` + +```{r addscvi, include=FALSE} +scvi.reduc <- readRDS("/brahms/haoy/seurat5/object/pbmcsca_scvi.dr.rds")@cell.embeddings +scvi.reduc <- scvi.reduc[Cells(obj),] +obj[["integrated.scvi"]] <- CreateDimReducObject(embeddings = scvi.reduc) +``` + +For any of the methods, we can now visualize and cluster the datasets. We show this for CCA integration and scVI, but you can do this for any method: + +```{r integratedprojections, fig.height=16, fig.width=16} +obj <- FindNeighbors(obj, reduction = 'integrated.cca', dims = 1:30) +obj <- FindClusters(obj,resolution = 2, cluster.name = 'cca_clusters') +obj <- RunUMAP(obj, reduction = "integrated.cca", dims = 1:30, reduction.name = 'umap.cca') +p1 <- DimPlot( + obj, reduction = "umap.cca", + group.by = c("Method", "predicted.celltype.l2", "cca_clusters"), + combine = FALSE, label.size = 2) + +obj <- FindNeighbors(obj, reduction = 'integrated.scvi', dims = 1:30) +obj <- FindClusters(obj,resolution = 2, cluster.name = 'scvi_clusters') +obj <- RunUMAP(obj, reduction = "integrated.scvi", dims = 1:30, reduction.name = 'umap.scvi') +p2 <- DimPlot( + obj, reduction = "umap.scvi", + group.by = c("Method", "predicted.celltype.l2", "scvi_clusters"), + combine = FALSE, label.size = 2) + +wrap_plots(c(p1, p2), ncol = 2, byrow = F) +``` + +We hope that by simplifying the process of performing integrative analysis, users can more carefully evaluate the biological information retained in the integrated dataset. For example, users can compare the expression of biological markers based on different clustering solutions, or visualize one method's clustering solution on different UMAP visualizations. + +```{r vlnplots, fig.height=5, fig.width=16, warning=FALSE} +p1 <- VlnPlot( + obj, features = "rna_CD8A", group.by = 'unintegrated_clusters' +) + NoLegend() + ggtitle("CD8A - Unintegrated Clusters") +p2 <- VlnPlot( + obj, "rna_CD8A", group.by = 'cca_clusters' +) + NoLegend() + ggtitle("CD8A - CCA Clusters") +p3 <- VlnPlot( + obj, "rna_CD8A", group.by = 'scvi_clusters' +) + NoLegend() + ggtitle("CD8A - scVI Clusters") +p1 | p2 | p3 +``` + +```{r umaps, fig.height=5, fig.width=16} +obj <- RunUMAP(obj, reduction = "integrated.rpca", dims = 1:30, reduction.name = 'umap.rpca') +p4 <- DimPlot(obj, reduction="umap.unintegrated", group.by=c("cca_clusters")) +p5 <- DimPlot(obj, reduction="umap.rpca", group.by=c("cca_clusters")) +p6 <- DimPlot(obj, reduction="umap.scvi", group.by=c("cca_clusters")) +p4 | p5 | p6 +``` + +Once integrative analysis is complete, you can rejoin the layers - which collapses the individual datasets together and recreates the original `counts` and `data` layers. You will need to do this before performing any differential expression analysis. However, you can always resplit the layers in case you would like to reperform integrative analysis. + +```{r joinlayers} +obj <- JoinLayers(obj) +obj +``` + +Lastly, users can also perform integration using sctransform-normalized data (see our [SCTransform vignette](https://satijalab.org/seurat/articles/sctransform_vignette) for more information), by first running SCTransform normalization, and then setting the `normalization.method` argument in `IntegrateLayers`. + +```{r, include=FALSE} +obj <- LoadData("pbmcsca") +obj <- subset(obj, nFeature_RNA > 1000) +obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +``` + +```{r sct} +options(future.globals.maxSize = 3e+09) +obj <- SCTransform(obj) +obj <- RunPCA(obj, npcs = 30, verbose = F) +obj <- IntegrateLayers(object = obj, + method = RPCAIntegration, + normalization.method="SCT", + verbose = F) +obj <- FindNeighbors(obj, dims = 1:30,reduction = 'integrated.dr') +obj <- FindClusters(obj, resolution = 2) +obj <- RunUMAP(obj, dims = 1:30,reduction = 'integrated.dr') +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_integration_bridge.Rmd b/vignettes/seurat5_integration_bridge.Rmd new file mode 100644 index 000000000..2453d060b --- /dev/null +++ b/vignettes/seurat5_integration_bridge.Rmd @@ -0,0 +1,281 @@ +--- +title: "Dictionary Learning for cross-modality integration" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- +*** +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = "styler", + tidy.opts = list(width.cutoff = 95), + warning = FALSE, + error = TRUE, + message = FALSE, + fig.width = 8, + time_it = TRUE +) +``` + +In the same way that read mapping tools have transformed genome sequence analysis, the ability to map new datasets to established references represents an exciting opportunity for the field of single-cell genomics. Along with others in the community, we have developed [tools to map and interpret query datasets](https://satijalab.org/seurat/articles/multimodal_reference_mapping.html), and have also constructed a [set of scRNA-seq datasets for diverse mammalian tissues](http://azimuth.hubmapconsortium.org). + +A key challenge is to extend this reference mapping framework to technologies that do not measure gene expression, even if the underlying reference is based on scRNA-seq. In [Hao et al, Nat Biotechnol 2023](https://www.nature.com/articles/s41587-023-01767-y), we introduce 'bridge integration', which enables the mapping of complementary technologies (like scATAC-seq, scDNAme, CyTOF), onto scRNA-seq references, using a 'multi-omic' dataset as a molecular bridge. In this vignette, we demonstrate how to map an scATAC-seq dataset of human PBMC, onto our previously constructed [PBMC reference](https://azimuth.hubmapconsortium.org/references/human_pbmc/). We use a publicly available 10x multiome dataset, which simultaneously measures gene expression and chromatin accessibility in the same cell, as a bridge dataset. + +In this vignette we demonstrate: + +* Loading in and pre-processing the scATAC-seq, multiome, and scRNA-seq reference datasets +* Mapping the scATAC-seq dataset via bridge integration +* Exploring and assessing the resulting annotations + +## Azimuth ATAC for Bridge Integration +Users can now automatically run bridge integration for PBMC and Bone Marrow scATAC-seq queries with the newly released Azimuth ATAC workflow on the [Azimuth website](https://azimuth.hubmapconsortium.org/) or in R. For more details on running locally in R, see the section on ATAC data in this [vignette](https://satijalab.github.io/azimuth/articles/run_azimuth_tutorial.html). + + +```{r, message=FALSE, warning=FALSE} +library(Seurat) +library(Signac) +library(EnsDb.Hsapiens.v86) +library(dplyr) +library(ggplot2) +``` + +## Load the bridge, query, and reference datasets + +We start by loading a 10x multiome dataset, consisting of ~12,000 PBMC from a healthy donor. The dataset measures RNA-seq and ATAC-seq in the same cell, and is available for download from 10x Genomics [here](https://www.10xgenomics.com/resources/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-10-k-1-standard-2-0-0). We follow the loading instructions from the [Signac package vignettes](https://stuartlab.org/signac/articles/pbmc_multiomic.html). Note that when using Signac, please make sure you are using the [latest version of Bioconductor](https://www.bioconductor.org/install/), as [users have reported errors](https://github.com/timoast/signac/issues/687) when using older BioC versions. + +
    + **Load and setup the 10x multiome object** + +```{r} +# the 10x hdf5 file contains both data types. +inputdata.10x <- Read10X_h5("/brahms/hartmana/vignette_data/pbmc_cellranger_arc_2/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5") +# extract RNA and ATAC data +rna_counts <- inputdata.10x$`Gene Expression` +atac_counts <- inputdata.10x$Peaks +# Create Seurat object +obj.multi <- CreateSeuratObject(counts = rna_counts) +# Get % of mitochondrial genes +obj.multi[["percent.mt"]] <- PercentageFeatureSet(obj.multi, pattern = "^MT-") + +# add the ATAC-seq assay +grange.counts <- StringToGRanges(rownames(atac_counts), sep = c(":", "-")) +grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts) +atac_counts <- atac_counts[as.vector(grange.use), ] +# Get gene annotations +annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) +# Change style to UCSC +seqlevelsStyle(annotations) <- 'UCSC' +genome(annotations) <- "hg38" +# File with ATAC per fragment information file +frag.file <- "/brahms/hartmana/vignette_data/pbmc_cellranger_arc_2/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz" +# Add in ATAC-seq data as ChromatinAssay object +chrom_assay <- CreateChromatinAssay( + counts = atac_counts, + sep = c(":", "-"), + genome = 'hg38', + fragments = frag.file, + min.cells = 10, + annotation = annotations +) +# Add the ATAC assay to the multiome object +obj.multi[["ATAC"]] <- chrom_assay +# Filter ATAC data based on QC metrics +obj.multi <- subset( + x = obj.multi, + subset = nCount_ATAC < 7e4 & + nCount_ATAC > 5e3 & + nCount_RNA < 25000 & + nCount_RNA > 1000 & + percent.mt < 20 +) +``` +
    + +--- + +The scATAC-seq query dataset represents ~10,000 PBMC from a healthy donor, and is available for download [here](https://www.10xgenomics.com/resources/datasets/10-k-human-pbm-cs-atac-v-1-1-chromium-x-1-1-standard-2-0-0). We load in the peak/cell matrix, store the path to the fragments file, and add gene annotations to the object, following the steps as with the ATAC data in the multiome experiment. + +We note that it is important to quantify the same set of genomic features in the query dataset as are quantified in the multi-omic bridge. We therefore requantify the set of scATAC-seq peaks using the `FeatureMatrix` command. This is also described in the [Signac vignettes](https://stuartlab.org/signac/articles/integrate_atac.html) and shown below. + +
    + **Load and setup the 10x scATAC-seq query** + +```{r, message=FALSE, warning=FALSE} +# Load ATAC dataset +atac_pbmc_data <- Read10X_h5(filename = "/brahms/hartmana/vignette_data/10k_PBMC_ATAC_nextgem_Chromium_X_filtered_peak_bc_matrix.h5") +fragpath <- "/brahms/hartmana/vignette_data/10k_PBMC_ATAC_nextgem_Chromium_X_fragments.tsv.gz" +# Get gene annotations +annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) +# Change to UCSC style +seqlevelsStyle(annotation) <- 'UCSC' +# Create ChromatinAssay for ATAC data +atac_pbmc_assay <- CreateChromatinAssay( + counts = atac_pbmc_data, + sep = c(":", "-"), + fragments = fragpath, + annotation = annotation +) +# Requantify query ATAC to have same features as multiome ATAC dataset +requant_multiome_ATAC <- FeatureMatrix( + fragments = Fragments(atac_pbmc_assay), + features = granges(obj.multi[['ATAC']]), + cells = Cells(atac_pbmc_assay) +) +# Create assay with requantified ATAC data +ATAC_assay <- CreateChromatinAssay( + counts = requant_multiome_ATAC, + fragments = fragpath, + annotation = annotation +) +# Create Seurat sbject +obj.atac <- CreateSeuratObject(counts = ATAC_assay,assay = 'ATAC') +obj.atac[['peak.orig']] <- atac_pbmc_assay +obj.atac <- subset(obj.atac, subset = nCount_ATAC < 7e4 & nCount_ATAC > 2000) +``` +
    + +--- + +We load the reference (download [here](https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat)) from our recent [paper](https://doi.org/10.1016/j.cell.2021.04.048). This reference is stored as an h5Seurat file, a format that enables on-disk storage of multimodal Seurat objects (more details on h5Seurat and `SeuratDisk` can be found [here](https://mojaveazure.github.io/seurat-disk/index.html)). + +```{r pbmc.ref} +obj.rna <- readRDS("/brahms/haoy/seurat4_pbmc/pbmc_multimodal_2023.rds") +``` +
    + **What if I want to use my own reference dataset?** + +As an alternative to using a pre-built reference, you can also use your own reference. To demonstrate, you can download a scRNA-seq dataset of 23,837 human PBMC [here](https://www.dropbox.com/s/x8mu9ye2w3a63hf/20k_PBMC_scRNA.rds?dl=0), which we have already annotated. +```{r, message=FALSE, warning=FALSE, eval=FALSE} +obj.rna = readRDS("/path/to/reference.rds") +obj.rna = SCTransform(object = obj.rna) %>% + RunPCA() %>% + RunUMAP(dims = 1:50, return.model = TRUE) +``` +When using your own reference, set `reference.reduction = "pca"` in the `PrepareBridgeReference` function. + +
    + +--- + +# Preprocessing/normalization for all datasets + +Prior to performing bridge integration, we normalize and pre-process each of the datasets (note that the reference has already been normalized). We normalize gene expression data using [sctransform](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), and ATAC data using TF-IDF. + +```{r, message=FALSE, warning=FALSE} +# normalize multiome RNA +DefaultAssay(obj.multi) <- "RNA" +obj.multi <- SCTransform(obj.multi, verbose = FALSE) +# normalize multiome ATAC +DefaultAssay(obj.multi) <- "ATAC" +obj.multi <- RunTFIDF(obj.multi) +obj.multi <- FindTopFeatures(obj.multi, min.cutoff = "q0") +# normalize query +obj.atac <- RunTFIDF(obj.atac) +``` + +## Map scATAC-seq dataset using bridge integration + +Now that we have the reference, query, and bridge datasets set up, we can begin integration. The bridge dataset enables translation between the scRNA-seq reference and the scATAC-seq query, effectively augmenting the reference so that it can map a new data type. We call this an extended reference, and first set it up. Note that you can save the results of this function and map multiple scATAC-seq datasets without having to rerun. + +First, we drop the first dimension of the ATAC reduction. +```{r, message=FALSE, warning=FALSE} +dims.atac <- 2:50 +dims.rna <- 1:50 +DefaultAssay(obj.multi) <- "RNA" +DefaultAssay(obj.rna) <- "SCT" +obj.rna.ext <- PrepareBridgeReference( + reference = obj.rna, bridge = obj.multi, + reference.reduction = "spca", reference.dims = dims.rna, + normalization.method = "SCT") +``` + +Now, we can directly find anchors between the extended reference and query objects. We use the `FindBridgeTransferAnchors` function, which translates the query dataset using the same dictionary as was used to translate the reference, and then identifies anchors in this space. The function is meant to mimic our `FindTransferAnchors` function, but to identify correspondences across modalities. + +```{r, message=FALSE, warning=FALSE} +bridge.anchor <- FindBridgeTransferAnchors( + extended.reference = obj.rna.ext, query = obj.atac, + reduction = "lsiproject", dims = dims.atac) +``` + + +Once we have identified anchors, we can map the query dataset onto the reference. The `MapQuery` function is the same as we have [previously introduced for reference mapping](https://satijalab.org/seurat/articles/multimodal_reference_mapping.html) . It transfers cell annotations from the reference dataset, and also visualizes the query dataset on a previously computed UMAP embedding. Since our reference dataset contains cell type annotations at three levels of resolution (l1 - l3), we can transfer each level to the query dataset. + + +```{r, message=FALSE, warning=FALSE} +obj.atac <- MapQuery( + anchorset = bridge.anchor, reference = obj.rna.ext, + query = obj.atac, + refdata = list( + l1 = "celltype.l1", + l2 = "celltype.l2", + l3 = "celltype.l3"), + reduction.model = "wnn.umap") +``` + +Now we can visualize the results, plotting the scATAC-seq cells based on their predicted annotations, on the reference UMAP embedding. You can see that each scATAC-seq cell has been assigned a cell name based on the scRNA-seq defined cell ontology. + +```{r, message=FALSE, warning=FALSE} +DimPlot( + obj.atac, group.by = "predicted.l2", + reduction = "ref.umap", label = TRUE +) + ggtitle("ATAC") + NoLegend() +``` + +## Assessing the mapping + +To assess the mapping and cell type predictions, we will first see if the predicted cell type labels are concordant with an unsupervised analysis of the scATAC-seq dataset. We follow the standard unsupervised processing workflow for scATAC-seq data: + +```{r, message=FALSE, warning=FALSE} +obj.atac <- FindTopFeatures(obj.atac, min.cutoff = "q0") +obj.atac <- RunSVD(obj.atac) +obj.atac <- RunUMAP(obj.atac, reduction = "lsi", dims = 2:50) +``` + +Now, we visualize the predicted cluster labels on the unsupervised UMAP emebdding. We can see that predicted cluster labels (from the scRNA-seq reference) are concordant with the structure of the scATAC-seq data. However, there are some cell types (i.e. Treg), that do not appear to separate in unsupervised analysis. These may be prediction errors, or cases where the reference mapping provides additional resolution. + +```{r, pbmcdimplots, message=FALSE, warning=FALSE} +DimPlot(obj.atac, group.by = "predicted.l2", reduction = "umap", label = FALSE) +``` + +Lastly, we validate the predicted cell types for the scATAC-seq data by examining their chromatin accessibility profiles at canonical loci. We use the `CoveragePlot` function to visualize accessibility patterns at the CD8A, FOXP3, and RORC, after grouping cells by their predicted labels. We see expected patterns in each case. For example, the PAX5 locus exhibits peaks that are accessible exclusively in B cells, and the CD8A locus shows the same in CD8 T cell subsets. Similarly, the accessibility of FOXP3, a canonical marker of regulatory T cells (Tregs), in predicted Tregs provides strong support for the accuracy of our prediction. + +```{r, message=FALSE, warning=FALSE} +CoveragePlot( + obj.atac, region = "PAX5", group.by = "predicted.l1", + idents = c("B", "CD4 T", "Mono", "NK"), window = 200, + extend.upstream = -150000) +CoveragePlot( + obj.atac, region = "CD8A", group.by = "predicted.l2", + idents = c("CD8 Naive", "CD4 Naive", "CD4 TCM", "CD8 TCM"), + extend.downstream = 5000, extend.upstream = 5000) +CoveragePlot( + obj.atac, region = "FOXP3", group.by = "predicted.l2", + idents = c( "CD4 Naive", "CD4 TCM", "CD4 TEM", "Treg"), + extend.downstream = 0, extend.upstream = 0) +CoveragePlot( + obj.atac, region = "RORC", group.by = "predicted.l2", + idents = c("CD8 Naive", "CD8 TEM", "CD8 TCM", "MAIT"), + extend.downstream = 5000, extend.upstream = 5000) +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_integration_introduction.Rmd b/vignettes/seurat5_integration_introduction.Rmd new file mode 100644 index 000000000..2342c7c6e --- /dev/null +++ b/vignettes/seurat5_integration_introduction.Rmd @@ -0,0 +1,273 @@ +--- +title: 'Introduction to scRNA-seq integration' +output: + html_document: + theme: united + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + fig.width = 10, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +# Introduction to scRNA-seq integration + +The joint analysis of two or more single-cell datasets poses unique challenges. In particular, identifying cell populations that are present across multiple datasets can be problematic under standard workflows. Seurat v4 includes a set of methods to match (or ‘align’) shared cell populations across datasets. These methods first identify cross-dataset pairs of cells that are in a matched biological state (‘anchors’), can be used both to correct for technical differences between datasets (i.e. batch effect correction), and to perform comparative scRNA-seq analysis of across experimental conditions. + +Below, we demonstrate methods for scRNA-seq integration as described in [Stuart\*, Butler\* et al, 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) to perform a comparative analysis of human immune cells (PBMC) in either a [resting or interferon-stimulated state](https://www.nature.com/articles/nbt.4042). + +## Integration goals + +The following tutorial is designed to give you an overview of the kinds of comparative analyses on complex cell types that are possible using the Seurat integration procedure. Here, we address a few key goals: + +* Create an 'integrated' data assay for downstream analysis +* Identify cell types that are present in both datasets +* Obtain cell type markers that are conserved in both control and stimulated cells +* Compare the datasets to find cell-type specific responses to stimulation + +## Setup the Seurat objects + +For convenience, we distribute this dataset through our [SeuratData](https://github.com/satijalab/seurat-data) package. + +```{r, include = TRUE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` + +```{r data} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(patchwork) +``` + +```{r installdata, eval=FALSE} +# install dataset +InstallData('ifnb') +``` + +```{r init, results='hide', message=FALSE, fig.keep='none'} +# load dataset +ifnb <- LoadData("ifnb") +ifnb[["RNA"]] <- as(ifnb[["RNA"]], Class = "Assay5") + +# split the dataset into layers (stim and CTRL) +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) + +# normalize and identify variable features for each dataset independently +ifnb <- NormalizeData(ifnb) +ifnb <- FindVariableFeatures(ifnb, selection.method = "vst", nfeatures = 2000) + +# these two now are run before +ifnb <- ScaleData(ifnb) +ifnb <- RunPCA(ifnb) +# # select features that are repeatedly variable across datasets for integration +# features <- SelectIntegrationFeatures(object.list = ifnb.list) +ifnb +``` + +## Perform integration + +We then identify anchors using the `FindIntegrationAnchors()` function (not any more), which takes a list of Seurat objects as input, and use these anchors to integrate the two layers together with `IntegrateLayers()`. +```{r} +ifnb <- IntegrateLayers(object = ifnb, + method = CCAIntegration, + verbose = F) +``` + + +## Perform an integrated analysis + +Now we can run a single integrated analysis on all cells! + +```{r clustering, results='hide', message=FALSE} +# specify that we will perform downstream analysis on the corrected data +# note that the original unmodified data still resides in the 'RNA' assay + +# Run the standard workflow for visualization and clustering +# use integrated.dr here instead of pca +ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30) +ifnb <- FindNeighbors(ifnb, reduction = "integrated.dr", dims = 1:30) +ifnb <- FindClusters(ifnb, resolution = 0.5) +``` + +```{r viz, results='hide', message=FALSE} +# Visualization +p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim") +p2 <- DimPlot(ifnb, reduction = "umap", label = TRUE, repel = TRUE) +p1 + p2 +``` + +To visualize the two conditions side-by-side, we can use the `split.by` argument to show each condition colored by cluster. + +```{r split.dim} +DimPlot(ifnb, reduction = "umap", split.by = "stim") +``` + +## Identify conserved cell type markers + +To identify canonical cell type marker genes that are conserved across conditions, we provide the `FindConservedMarkers()` function. This function performs differential gene expression testing for each dataset/group and combines the p-values using meta-analysis methods from the MetaDE R package. For example, we can calculated the genes that are conserved markers irrespective of stimulation condition in cluster 6 (NK cells). + +```{r conserved.markers, warning=FALSE} +# For performing differential expression after integration, we switch back to the original data +DefaultAssay(ifnb) <- "RNA" +# Join Data Layers across stimualtions +ifnb[['RNA']] <- JoinLayers(ifnb[["RNA"]], layers = "data", new = "data") +nk.markers <- FindConservedMarkers(ifnb, ident.1 = 6, grouping.var = "stim", verbose = FALSE) +head(nk.markers) +``` + +We can explore these marker genes for each cluster and use them to annotate our clusters as specific cell types. + +```{r annotate, results = 'hide', message=FALSE, fig.height = 8} +FeaturePlot(ifnb, features = c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), min.cutoff = "q9") +ifnb <- RenameIdents(ifnb, `0` = "CD14 Mono", `1` = "CD4 Naive T", `2` = "CD4 Memory T", + `3` = "CD16 Mono", `4` = "B", `5` = "CD8 T", `6` = "NK", `7` = "T activated", `8` = "DC", `9` = "B Activated", + `10` = "Mk", `11` = "pDC", `12` = "Eryth", `13` = "Mono/Mk Doublets", `14` = "HSPC") +DimPlot(ifnb, label = TRUE) +``` + +The `DotPlot()` function with the `split.by` parameter can be useful for viewing conserved cell type markers across conditions, showing both the expression level and the percentage of cells in a cluster expressing any given gene. Here we plot 2-3 strong marker genes for each of our 14 clusters. + + +```{r splitdotplot, fig.height = 10} +Idents(ifnb) <- factor( + Idents(ifnb), + levels = c("HSPC", "Mono/Mk Doublets", "pDC", "Eryth","Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", "CD4 Naive T", "CD4 Memory T")) +markers.to.plot <- c("CD3D","CREM","HSPH1","SELL","GIMAP5","CACYBP","GNLY","NKG7","CCL5","CD8A","MS4A1","CD79A","MIR155HG","NME1","FCGR3A","VMO1","CCL2","S100A9","HLA-DQA1","GPR183","PPBP","GNG11","HBA2","HBB","TSPAN13","IL3RA","IGJ","PRSS57") +DotPlot(ifnb, features = markers.to.plot, cols = c('blue', 'red'), dot.scale = 8, split.by = "stim") + RotatedAxis() +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- DotPlot(ifnb, features = markers.to.plot, cols = c('blue', 'red'), + dot.scale = 6, split.by = "stim") + RotatedAxis() +#ggsave(filename = "../output/images/pbmc_alignment.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +### Identify differential expressed genes across conditions + +Now that we've aligned the stimulated and control cells, we can start to do comparative analyses and look at the differences induced by stimulation. One way to look broadly at these changes is to plot the average expression of both the stimulated and control cells and look for genes that are visual outliers on a scatter plot. Here, we take the average expression of both the stimulated and control naive T cells and CD14 monocyte populations and generate the scatter plots, highlighting genes that exhibit dramatic responses to interferon stimulation. + +```{r scatterplots, results = 'hide', message=FALSE} +library(ggplot2) +library(cowplot) +theme_set(theme_cowplot()) +t.cells <- subset(ifnb, idents = "CD4 Naive T") +Idents(t.cells) <- "stim" +t.cells.pseudo <- AggregateExpression(t.cells, return.seurat = TRUE, verbose = FALSE) +avg.t.cells <- as.data.frame(t.cells.pseudo[['RNA']]$data) +avg.t.cells$gene <- rownames(avg.t.cells) + +cd14.mono <- subset(ifnb, idents = "CD14 Mono") +Idents(cd14.mono) <- "stim" +cd14.mono.pseudo <- AggregateExpression(cd14.mono, return.seurat = TRUE, verbose = FALSE) +avg.cd14.mono <- as.data.frame(cd14.mono.pseudo[['RNA']]$data) +avg.cd14.mono$gene <- rownames(avg.cd14.mono) + +genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8") +p1 <- ggplot(avg.t.cells, aes(CTRL, STIM)) + geom_point() + ggtitle("CD4 Naive T Cells") +p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE) +p2 <- ggplot(avg.cd14.mono, aes(CTRL, STIM)) + geom_point() + ggtitle("CD14 Monocytes") +p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE) +p1 + p2 +``` + +As you can see, many of the same genes are upregulated in both of these cell types and likely represent a conserved interferon response pathway. + +Because we are confident in having identified common cell types across condition, we can ask what genes change in different conditions for cells of the same type. First, we create a column in the meta.data slot to hold both the cell type and stimulation information and switch the current ident to that column. Then we use `FindMarkers()` to find the genes that are different between stimulated and control B cells. Notice that many of the top genes that show up here are the same as the ones we plotted earlier as core interferon response genes. Additionally, genes like CXCL10 which we saw were specific to monocyte and B cell interferon response show up as highly significant in this list as well. + +```{r de.genes} +ifnb$celltype.stim <- paste(Idents(ifnb), ifnb$stim, sep = "_") +ifnb$celltype <- Idents(ifnb) +Idents(ifnb) <- "celltype.stim" +b.interferon.response <- FindMarkers(ifnb, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) +head(b.interferon.response, n = 15) +``` + +Another useful way to visualize these changes in gene expression is with the `split.by` option to the `FeaturePlot()` or `VlnPlot()` function. This will display FeaturePlots of the list of given genes, split by a grouping variable (stimulation condition here). Genes such as CD3D and GNLY are canonical cell type markers (for T cells and NK/CD8 T cells) that are virtually unaffected by interferon stimulation and display similar gene expression patterns in the control and stimulated group. IFI6 and ISG15, on the other hand, are core interferon response genes and are upregulated accordingly in all cell types. Finally, CD14 and CXCL10 are genes that show a cell type specific interferon response. CD14 expression decreases after stimulation in CD14 monocytes, which could lead to misclassification in a supervised analysis framework, underscoring the value of integrated analysis. CXCL10 shows a distinct upregulation in monocytes and B cells after interferon stimulation but not in other cell types. + +```{r feature.heatmaps, fig.height = 14} +FeaturePlot(ifnb, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 3, cols = c("grey", "red")) +``` + +```{r splitvln, fig.height = 12} +plots <- VlnPlot(ifnb, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "celltype", pt.size = 0, combine = FALSE) +wrap_plots(plots = plots, ncol = 1) +``` + +```{r save, include=TRUE} +#saveRDS(ifnb, file = "../output/ifnb.rds") +``` + +# Performing integration on datasets normalized with SCTransform + +In [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), we introduced an improved method for the normalization of scRNA-seq, based on regularized negative binomial regression. The method is named 'sctransform', and avoids some of the pitfalls of standard normalization workflows, including the addition of a pseudocount, and log-transformation. You can read more about sctransform in the [manuscript](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) or our [SCTransform vignette](sctransform_vignette.html). + +Below, we demonstrate how to modify the Seurat integration workflow for datasets that have been normalized with the sctransform workflow. The commands are largely similar, with a few key differences: + +* Normalize datasets individually by `SCTransform()`, instead of `NormalizeData()` prior to integration +* As discussed further in our [SCTransform vignette](sctransform_vignette.html), we typically use 3,000 or more features for analysis downstream of sctransform. +* Run the `PrepSCTIntegration()` function prior to identifying anchors +* When running `FindIntegrationAnchors()`, and `IntegrateData()`, set the `normalization.method` parameter to the value `SCT`. +* When running sctransform-based workflows, including integration, do not run the `ScaleData()` function + + +```{r panc8.cca.sct.init, results='hide', message=FALSE, fig.keep='none'} +ifnb <- LoadData("ifnb") +ifnb[["RNA"]] <- as(ifnb[["RNA"]], Class = "Assay5") +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) + +ifnb <- SCTransform(ifnb) +ifnb <- RunPCA(ifnb) +``` + +```{r ifnb.cca.sct.anchors} +ifnb <- IntegrateLayers(object = ifnb, + method = CCAIntegration, + normalization.method = "SCT", + verbose = F) +``` + +```{r ifnb.cca.sct.clustering, results='hide', message=FALSE} +ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30) +``` + +```{r immunesca.cca.sct.split.dims} +p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim") +p2 <- DimPlot(ifnb, reduction = "umap", group.by = 'seurat_annotations',label = TRUE, repel = TRUE) +p1 + p2 +``` + +Now that the datasets have been integrated, you can follow the previous steps in this vignette identify cell types and cell type-specific responses. + +```{r save.times, include=TRUE} +#write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_integration_introduction.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_integration_large_datasets.Rmd b/vignettes/seurat5_integration_large_datasets.Rmd new file mode 100644 index 000000000..176a4ca14 --- /dev/null +++ b/vignettes/seurat5_integration_large_datasets.Rmd @@ -0,0 +1,125 @@ +--- +title: "Tips for integrating large datasets" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 85), + fig.width = 10, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +```{r, include=TRUE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(future.globals.maxSize = 8e9) +``` + +For very large datasets, the standard integration workflow can sometimes be prohibitively computationally expensive. In this workflow, we employ two options that can improve efficiency and runtimes: + +1. Reciprocal PCA (RPCA) +2. Reference-based integration + +The main efficiency improvements are gained in `FindIntegrationAnchors()`. First, we use reciprocal PCA (RPCA) instead of CCA, to identify an effective space in which to find anchors. When determining anchors between any two datasets using reciprocal PCA, we project each dataset into the others PCA space and constrain the anchors by the same mutual neighborhood requirement. All downstream integration steps remain the same and we are able to 'correct' (or harmonize) the datasets. + +Additionally, we use reference-based integration. In the standard workflow, we identify anchors between all pairs of datasets. While this gives datasets equal weight in downstream integration, it can also become computationally intensive. For example when integrating 10 different datasets, we perform 45 different pairwise comparisons. As an alternative, we introduce here the possibility of specifying one or more of the datasets as the 'reference' for integrated analysis, with the remainder designated as 'query' datasets. In this workflow, we do not identify anchors between pairs of query datasets, reducing the number of comparisons. For example, when integrating 10 datasets with one specified as a reference, we perform only 9 comparisons. Reference-based integration can be applied to either log-normalized or SCTransform-normalized datasets. + +This alternative workflow consists of the following steps: + +* Create a list of Seurat objects to integrate +* Perform normalization, feature selection, and scaling separately for each dataset +* Run PCA on each object in the list +* Integrate datasets, and proceed with joint analysis + +In general, we observe strikingly similar results between the standard workflow and the one demonstrated here, with substantial reduction in compute time and memory. However, if the datasets are highly divergent (for example, cross-modality mapping or cross-species mapping), where only a small subset of features can be used to facilitate integration, and you may observe superior results using CCA. + +For this example, we will be using the "Immune Cell Atlas" data from the Human Cell Atlas which can be found [here](https://data.humancellatlas.org/explore/projects?filter=%5B%7B%22facetName%22:%22organ%22,%22terms%22:%5B%22immune%20system%22%5D%7D%5D&catalog=dcp1). + +```{r libs} +library(Seurat) +options(Seurat.object.assay.version = "v5") +``` + +After acquiring the data, we first perform standard normalization and variable feature selection. + +```{r hca.full.1} + +bm280k.data <- Read10X_h5("../data/ica_bone_marrow_h5.h5") +bm280k <- CreateSeuratObject(counts = bm280k.data, min.cells = 100, min.features = 500) +bm280k[["RNA"]] <- split(bm280k[["RNA"]], f = bm280k$orig.ident) + +# Preprocessing +bm280k <- NormalizeData(bm280k, verbose = FALSE) +bm280k <- FindVariableFeatures(bm280k, verbose = FALSE) +``` + +Next, select features for downstream integration, and run PCA on each object in the list, which is required for running the alternative reciprocal PCA workflow. + +```{r hca.full.2} +features <- VariableFeatures(bm280k) +bm280k <- ScaleData(bm280k, features = features, verbose = FALSE) +bm280k <- RunPCA(bm280k, features = features, verbose = FALSE) + +``` + +Since this dataset contains both men and women, we will chose one male and one female (BM1 and BM2) to use in a reference-based workflow. We determined donor sex by examining the expression of the XIST gene. + +```{r integration.hca.full} +bm280k <- IntegrateLayers(object = bm280k, + method = RPCAIntegration, + reference = c(1, 2), + dims = 1:50, + verbose = F) + + +``` + + +```{r downstream.hca.full} +bm280k <- RunUMAP(bm280k, dims = 1:50) +``` + +```{r viz.hca.full, fig.height = 9, fig.width = 16} +DimPlot(bm280k, group.by = "orig.ident") +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- DimPlot(bm280k, group.by = "orig.ident") + xlab("UMAP 1") + ylab("UMAP 2") + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + + guides(colour = guide_legend(override.aes = list(size = 10))) +ggsave(filename = "../output/images/bm280k_integrated.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_integration_large_datasets.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_integration_mapping.Rmd b/vignettes/seurat5_integration_mapping.Rmd new file mode 100644 index 000000000..d7bea6a73 --- /dev/null +++ b/vignettes/seurat5_integration_mapping.Rmd @@ -0,0 +1,211 @@ +--- +title: "Mapping and annotating query datasets" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- +*** +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + warning = FALSE, + error = TRUE, + message = FALSE, + fig.width = 8, + time_it = TRUE +) +``` + +# Introduction to single-cell reference mapping + +In this vignette, we first build an integrated reference and then demonstrate how to leverage this reference to annotate new query datasets. Generating an integrated reference follows the same workflow described in more detail in the integration introduction [vignette](integration_introduction.html). Once generated, this reference can be used to analyze additional query datasets through tasks like cell type label transfer and projecting query cells onto reference UMAPs. Notably, this does not require correction of the underlying raw query data and can therefore be an efficient strategy if a high quality reference is available. + +# Dataset preprocessing + +For the purposes of this example, we've chosen human pancreatic islet cell datasets produced across four technologies, CelSeq (GSE81076) CelSeq2 (GSE85241), Fluidigm C1 (GSE86469), and SMART-Seq2 (E-MTAB-5061). For convenience, we distribute this dataset through our [SeuratData](https://github.com/satijalab/seurat-data) package. The metadata contains the technology (`tech` column) and cell type annotations (`celltype` column) for each cell in the four datasets. + +```{r libraries, message=FALSE, warning=FALSE} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +``` + +```{r install.data, eval=FALSE} +InstallData("panc8") +``` + +To construct a reference, we will identify 'anchors' between the individual datasets. First, we split the combined object into a list, with each dataset as an element (this is only necessary because the data was bundled together for easy distribution). + +```{r preprocessing1} +panc8 <- LoadData("panc8") +panc8[["RNA"]] <- as(panc8[["RNA"]], Class = "Assay5") + +# split the dataset into layers by technology +panc8[["RNA"]] <- split(panc8[["RNA"]], f = panc8$tech) +``` + +Prior to finding anchors, we perform standard preprocessing (log-normalization), and identify variable features individually for each. Note that Seurat implements an improved method for variable feature selection based on a variance stabilizing transformation (`"vst"`) + +```{r preprocessing3} +panc8 <- NormalizeData(panc8, verbose = FALSE) +panc8 <- FindVariableFeatures(panc8, selection.method = "vst", + nfeatures = 2000, verbose = FALSE) +``` + +# Integration of 3 pancreatic islet cell datasets + +Next, we identify anchors using the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input. Here, we integrate three of the objects into a reference (we will use the fourth later in this vignette as a query dataset to demonstrate mapping). + +* We use all default parameters here for identifying anchors, including the 'dimensionality' of the dataset (30; feel free to try varying this parameter over a broad range, for example between 10 and 50). + +```{r integration.anchors, warning = FALSE, message = FALSE} +pancreas.ref <- subset(panc8, subset = tech %in% c("celseq", "celseq2", "smartseq2")) +pancreas.ref <- ScaleData(pancreas.ref) +pancreas.ref <- RunPCA(pancreas.ref) +``` + +We then pass these anchors to the `IntegrateData()` function, which returns a Seurat object. + +* The returned object will contain a new `Assay`, which holds an integrated (or 'batch-corrected') expression matrix for all cells, enabling them to be jointly analyzed. + +```{r data.integration, warning = FALSE, message = FALSE} +pancreas.ref <- IntegrateLayers(object = pancreas.ref, + method = CCAIntegration, + verbose = FALSE) +``` + +After running `IntegrateData()`, the `Seurat` object will contain a new `Assay` with the integrated expression matrix. Note that the original (uncorrected values) are still stored in the object in the "RNA" assay, so you can switch back and forth. + +We can then use this new integrated matrix for downstream analysis and visualization. Here we scale the integrated data, run PCA, and visualize the results with UMAP. The integrated datasets cluster by cell type, instead of by technology. + +```{r analysis, message = FALSE, warning=FALSE, fig.width=10} +library(ggplot2) +library(cowplot) +library(patchwork) +# Run the standard workflow for visualization and clustering +pancreas.ref <- RunUMAP(pancreas.ref, reduction = "integrated.dr", dims = 1:30, + verbose = FALSE) +p1 <- DimPlot(pancreas.ref, reduction = "umap", group.by = "tech") +p2 <- DimPlot(pancreas.ref, reduction = "umap", group.by = "celltype", + label = TRUE, repel = TRUE) + NoLegend() +p1 + p2 +``` + +```{r save.img, include=TRUE} +plot <- DimPlot(pancreas.ref, reduction = "umap", label = TRUE, label.size = 4.5) + xlab("UMAP 1") + ylab("UMAP 2") + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + + guides(colour = guide_legend(override.aes = list(size = 10))) +#ggsave(filename = "pancreas_integrated_umap.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +# Cell type classification using an integrated reference + +Seurat also supports the projection of reference data (or meta data) onto a query object. While many of the methods are conserved (both procedures begin by identifying anchors), there are two important distinctions between data transfer and integration: + +1. In data transfer, Seurat does not correct or modify the query expression data. +2. In data transfer, Seurat has an option (set by default) to project the PCA structure of a reference onto the query, instead of learning a joint structure with CCA. We generally suggest using this option when projecting data between scRNA-seq datasets. + +After finding anchors, we use the `TransferData()` function to classify the query cells based on reference data (a vector of reference cell type labels). `TransferData()` returns a matrix with predicted IDs and prediction scores, which we can add to the query metadata. + +```{r label.transfer, warning = FALSE, message = FALSE} +pancreas.query <- subset(panc8, subset = tech == "fluidigmc1") +pancreas.anchors <- FindTransferAnchors(reference = pancreas.ref, query = pancreas.query, dims = 1:30, reference.reduction = "integrated.dr", k.filter = NA) + +predictions <- TransferData(anchorset = pancreas.anchors, refdata = pancreas.ref$celltype, dims = 1:30) +pancreas.query <- AddMetaData(pancreas.query, metadata = predictions) +``` + +Because we have the original label annotations from our full integrated analysis, we can evaluate how well our predicted cell type annotations match the full reference. In this example, we find that there is a high agreement in cell type classification, with over 96% of cells being labeled correctly. + +```{r analysis2} +pancreas.query$prediction.match <- pancreas.query$predicted.id == pancreas.query$celltype +table(pancreas.query$prediction.match) +``` + +To verify this further, we can examine some canonical cell type markers for specific pancreatic islet cell populations. Note that even though some of these cell types are only represented by one or two cells (e.g. epsilon cells), we are still able to classify them correctly. + +```{r vlnplots, fig.height=8} +table(pancreas.query$predicted.id) +VlnPlot(pancreas.query, c("REG1A", "PPY", "SST", "GHRL", "VWF", "SOX10"), group.by = "predicted.id") +``` + +# Unimodal UMAP Projection + +In Seurat v4, we also enable projection of a query onto the reference UMAP structure. This can be achieved by computing the reference UMAP model and then calling `MapQuery()` instead of `TransferData()`. + +```{r label.transfer.v4, warning = FALSE, message = FALSE} +pancreas.ref <- RunUMAP(pancreas.ref, dims = 1:30, reduction = "integrated.dr", return.model = TRUE) +pancreas.query <- MapQuery( + anchorset = pancreas.anchors, + reference = pancreas.ref, + query = pancreas.query, + refdata = list(celltype = 'celltype'), + reference.reduction = 'integrated.dr', + reduction.model = 'umap' +) +``` + +
    + **What is `MapQuery` doing?** + +`MapQuery()` is a wrapper around three functions: `TransferData()`, `IntegrateEmbeddings()`, and `ProjectUMAP()`. `TransferData()` is used to transfer cell type labels and impute the ADT values; `IntegrateEmbeddings()` is used to integrate reference with query by correcting the query's projected low-dimensional embeddings; and finally `ProjectUMAP()` is used to project the query data onto the UMAP structure of the reference. The equivalent code for doing this with the intermediate functions is below: + +```{r, eval=FALSE} +pancreas.query <- TransferData( + anchorset = pancreas.anchors, + reference = pancreas.ref, + query = pancreas.query, + refdata = list(celltype = "celltype") +) +pancreas.query <- IntegrateEmbeddings( + anchorset = pancreas.anchors, + reference = pancreas.ref, + query = pancreas.query, + new.reduction.name = "ref.pca" +) +pancreas.query <- ProjectUMAP( + query = pancreas.query, + query.reduction = "ref.pca", + reference = pancreas.ref, + reference.reduction = "integrated.dr", + reduction.model = "umap" +) +``` +
    + +We can now visualize the query cells alongside our reference. + +```{r panc.refdimplots, fig.width=10} +p1 <- DimPlot(pancreas.ref, reduction = "umap", group.by = "celltype", label = TRUE, + label.size = 3 ,repel = TRUE) + NoLegend() + ggtitle("Reference annotations") +p2 <- DimPlot(pancreas.query, reduction = "ref.umap", group.by = "predicted.celltype", label = TRUE, + label.size = 3 ,repel = TRUE) + NoLegend() + ggtitle("Query transferred labels") +p1 + p2 +``` + +```{r save.times, include=TRUE} +#write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_integration_reference_mapping.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_integration_rpca.Rmd b/vignettes/seurat5_integration_rpca.Rmd new file mode 100644 index 000000000..a59463944 --- /dev/null +++ b/vignettes/seurat5_integration_rpca.Rmd @@ -0,0 +1,191 @@ +--- +title: 'Fast integration using reciprocal PCA (RPCA)' +output: + html_document: + theme: united + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + fig.width = 10, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +In this vignette, we present a slightly modified workflow for the integration of scRNA-seq datasets. Instead of utilizing canonical correlation analysis (‘CCA’) to identify anchors, we instead utilize reciprocal PCA (‘RPCA’). When determining anchors between any two datasets using RPCA, we project each dataset into the others PCA space and constrain the anchors by the same mutual neighborhood requirement. The commands for both workflows are largely identical, but the two methods may be applied in different context. + +By identifying shared sources of variation between datasets, CCA is well-suited for identifying anchors when cell types are conserved, but there are very substantial differences in gene expression across experiments. CCA-based integration therefore enables integrative analysis when experimental conditions or disease states introduce very strong expression shifts, or when integrating datasets across modalities and species. However, CCA-based integration may also lead to overcorrection, especially when a large proportion of cells are non-overlapping across datasets. + +RPCA-based integration runs significantly faster, and also represents a more conservative approach where cells in different biological states are less likely to 'align' after integration. We therefore,recommend RPCA during integrative analysis where: +* A substantial fraction of cells in one dataset have no matching type in the other +* Datasets originate from the same platform (i.e. multiple lanes of 10x genomics) +* There are a large number of datasets or cells to integrate (see [here](integration_large_datasets.html) for more tips on integrating large datasets) + +Below, we demonstrate the use of reciprocal PCA to align the same stimulated and resting datasets first analyzed in our [introduction to scRNA-seq integration](integration_introduction.html) vignette. While the list of commands is nearly identical, this workflow requires users to run principal components analysis (PCA) individually on each dataset prior to integration. Users should also set the 'reduction' argument to 'rpca', when running `FindIntegrationAnchors()`. + +```{r, include=TRUE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(future.globals.maxSize = 1e9) +``` + +```{r installdata} +library(Seurat) +library(SeuratData) +# install dataset +InstallData("ifnb") +``` + +```{r init, results='hide', message=FALSE, fig.keep='none'} +# load dataset +ifnb <- LoadData("ifnb") +ifnb[["RNA"]] <- as(ifnb[["RNA"]], Class = "Assay5") + +# split the dataset into layers (stim and CTRL) +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) + +# normalize and identify variable features for each dataset independently +ifnb <- NormalizeData(ifnb) + +# select features that are repeatedly variable across datasets for integration +# run PCA on each dataset using these features + +ifnb <- FindVariableFeatures(ifnb, selection.method = "vst", nfeatures = 2000) +#features <- SelectIntegrationFeatures(ifnb.list) # this is in the vignette +features <- VariableFeatures(ifnb) + +ifnb <- ScaleData(ifnb, features = features, verbose = FALSE) +ifnb <- RunPCA(ifnb, features = features, verbose = FALSE) +ifnb +``` + +# Perform integration + +We then identify anchors using the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with `IntegrateData()`. + +```{r integrate.data} +ifnb <- IntegrateLayers(object = ifnb, + method = RPCAIntegration, + features = features, + verbose = F) +``` + +Now we can run a single integrated analysis on all cells! + +```{r clustering, results='hide', message=FALSE} +# specify that we will perform downstream analysis on the corrected data +# note that the original unmodified data still resides in the 'RNA' assay + +# Run the standard workflow for visualization and clustering +ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30) +ifnb <- FindNeighbors(ifnb, reduction = "integrated.dr", dims = 1:30) +ifnb <- FindClusters(ifnb, resolution = 0.5) +``` + +```{r viz, results='hide', message=FALSE} +# Visualization +p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim") +p2 <- DimPlot(ifnb, reduction = "umap", group.by = 'seurat_annotations',label = TRUE, repel = TRUE) +p1 + p2 +``` + +# Modifying the strength of integration + +The results show that rpca-based integration is more conservative, and in this case, do not perfectly align a subset of cells (which are naive and memory T cells) across experiments. You can increase the strength of alignment by increasing the `k.anchor` parameter, which is set to 5 by default. Increasing this parameter to 20 will assist in aligning these populations. + +```{r split.dim} +#immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features,reduction = 'rpca', k.anchor = 20) +#ifnb <- IntegrateData(anchorset = immune.anchors) + +ifnb <- IntegrateLayers(object = ifnb, + k.anchor = 20, + method = RPCAIntegration, + features = features, + verbose = F) + +ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30) +ifnb <- FindNeighbors(ifnb, reduction = "integrated.dr", dims = 1:30) +ifnb <- FindClusters(ifnb, resolution = 0.5) +``` + +```{r viz2, results='hide', message=FALSE} +# Visualization +p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim") +p2 <- DimPlot(ifnb, reduction = "umap", group.by = 'seurat_annotations', label = TRUE, repel = TRUE) +p1 + p2 +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- DimPlot(ifnb, group.by = "stim") + + xlab("UMAP 1") + ylab("UMAP 2") + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + + guides(colour = guide_legend(override.aes = list(size = 10))) +ggsave(filename = "../output/images/rpca_integration.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +Now that the datasets have been integrated, you can follow the previous steps in the [introduction to scRNA-seq integration vignette](integration_introduction.html) to identify cell types and cell type-specific responses. + +# Performing integration on datasets normalized with SCTransform + +As an additional example, we repeat the analyses performed above, but normalize the datasets using [SCTransform](sctransform_vignette.html). We may choose to set the `method` parameter to `glmGamPoi` (install [here](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html)) in order to enable faster estimation of regression parameters in `SCTransform()`. + +```{r panc8.cca.sct.init, results='hide', message=FALSE, fig.keep='none'} +ifnb <- LoadData("ifnb") +ifnb[["RNA"]] <- as(ifnb[["RNA"]], Class = "Assay5") +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) + +ifnb <- SCTransform(ifnb, method = "glmGamPoi") +features <- VariableFeatures(ifnb) +ifnb <- RunPCA(ifnb, features = features) +``` + +```{r ifnb.cca.sct.anchors} +ifnb <- IntegrateLayers(object = ifnb, + method = RPCAIntegration, + normalization.method = "SCT", + features = features, + k.anchor = 20, + verbose = F) +``` + +```{r ifnb.cca.sct.clustering, results='hide', message=FALSE} +ifnb <- RunUMAP(ifnb, reduction = "integrated.dr", dims = 1:30) +``` + +```{r immunesca.cca.sct.split.dims} +# Visualization +p1 <- DimPlot(ifnb, reduction = "umap", group.by = "stim") +p2 <- DimPlot(ifnb, reduction = "umap", group.by = 'seurat_annotations',label = TRUE, repel = TRUE) +p1 + p2 +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_integration_rpca.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    + diff --git a/vignettes/seurat5_interaction_vignette.Rmd b/vignettes/seurat5_interaction_vignette.Rmd new file mode 100644 index 000000000..966230343 --- /dev/null +++ b/vignettes/seurat5_interaction_vignette.Rmd @@ -0,0 +1,146 @@ +--- +title: "Seurat - Interaction Tips" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +```{r, include=TRUE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` + +# Load in the data + +This vignette demonstrates some useful features for interacting with the Seurat object. For demonstration purposes, we will be using the 2,700 PBMC object that is created in the first guided tutorial. You can load the data from our [SeuratData](https://github.com/satijalab/seurat-data) package. To simulate the scenario where we have two replicates, we will randomly assign half the cells in each cluster to be from "rep1" and other half from "rep2". + +```{r load_data} +library(Seurat) +library(SeuratData) +InstallData("pbmc3k") +pbmc <- LoadData("pbmc3k", type = "pbmc3k.final") + +# pretend that cells were originally assigned to one of two replicates (we assign randomly here) +# if your cells do belong to multiple replicates, and you want to add this info to the Seurat object +# create a data frame with this information (similar to replicate.info below) +set.seed(42) +pbmc$replicate <- sample(c('rep1', 'rep2'), size = ncol(pbmc), replace = TRUE) +``` + +# Switch identity class between cluster ID and replicate + +```{r swap.idents} +# Plot UMAP, coloring cells by cell type (currently stored in object@ident) +DimPlot(pbmc, reduction = 'umap') +# How do I create a UMAP plot where cells are colored by replicate? +# First, store the current identities in a new column of meta.data called CellType +pbmc$CellType <- Idents(pbmc) +# Next, switch the identity class of all cells to reflect replicate ID +Idents(pbmc) <- 'replicate' +DimPlot(pbmc, reduction = 'umap') +# alternately : DimPlot(pbmc, reduction = 'umap', group.by = "replicate") +# you can pass the shape.by to label points by both replicate and cell type + +# Switch back to cell type labels +Idents(pbmc) <- 'CellType' +``` + +# Tabulate cells by cluster ID, replicate, or both + +```{r counting} +# How many cells are in each cluster +table(Idents(pbmc)) + +# How many cells are in each replicate? +table(pbmc$replicate) + +# What proportion of cells are in each cluster? +prop.table(table(Idents(pbmc))) + +# How does cluster membership vary by replicate? +table(Idents(pbmc), pbmc$replicate) +prop.table(table(Idents(pbmc), pbmc$replicate), margin = 2) +``` + +# Selecting particular cells and subsetting the Seurat object + +```{r subset} +# What are the cell names of all NK cells? +WhichCells(pbmc, idents = "NK") + +# How can I extract expression matrix for all NK cells (perhaps, to load into another package) +nk.raw.data <- as.matrix(GetAssayData(pbmc, slot = 'counts')[, WhichCells(pbmc, ident = "NK")]) + +# Can I create a Seurat object based on expression of a feature or value in object metadata? +subset(pbmc, subset = MS4A1 > 1) +subset(pbmc, subset = replicate == 'rep2') + +# Can I create a Seurat object of just the NK cells and B cells? +subset(pbmc, idents = c('NK', 'B')) + +# Can I create a Seurat object of all cells except the NK cells and B cells? +subset(pbmc, idents = c('NK', 'B'), invert = TRUE) + +# note that if you wish to perform additional rounds of clustering after subsetting +# we recommend re-running FindVariableFeatures() and ScaleData() +``` + +# Calculating the average gene expression within a cluster + +```{r avg.exp, fig.height=8} +# How can I pseudobulk cells within a cluster? +# First, replace spaces with underscores '_' so ggplot2 doesn't fail +pbmc$CellType <- gsub(" ", "_", pbmc$CellType) +Idents(pbmc) <- pbmc$CellType +# Return this information as a Seurat object (enables downstream plotting and analysis) +# The summed counts are stored in the counts layer and normalized value are stored in the data layer +cluster.pseudobulk <- AggregateExpression(pbmc, return.seurat=TRUE) +cluster.pseudobulk +head(cluster.pseudobulk[['RNA']]$data[1:5, ]) + +# How can I plot the average expression of NK cells vs. CD8 T cells? +# Pass do.hover = T for an interactive plot to identify gene outliers +CellScatter(cluster.pseudobulk, cell1 = "NK", cell2 = "CD8-T") + +# How can I calculate pseudobulked expression values separately for each replicate? +cluster.pseudobulk <- AggregateExpression(pbmc, return.seurat = TRUE, group.by = c("CellType", "replicate")) +CellScatter(cluster.pseudobulk, cell1 = "CD8-T_rep1", cell2 = "CD8-T_rep2") + +# You can also plot heatmaps of these 'in silico' bulk datasets to visualize agreement between replicates +DoHeatmap(cluster.pseudobulk, features = unlist(TopFeatures(pbmc[['pca']], balanced = TRUE)), size = 3, draw.lines = FALSE) +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_interaction_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_merge_vignette.Rmd b/vignettes/seurat5_merge_vignette.Rmd new file mode 100644 index 000000000..037614972 --- /dev/null +++ b/vignettes/seurat5_merge_vignette.Rmd @@ -0,0 +1,109 @@ +--- +title: "Seurat - Combining Two 10X Runs" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +```{r, include=TRUE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` + +In this vignette, we will combine two 10X PBMC datasets: one containing 4K cells and one containing 8K cells. The datasets can be found [here](https://support.10xgenomics.com/single-cell-gene-expression/datasets). + +To start, we read in the data and create two `Seurat` objects. + +```{r load_data} +library(Seurat) +pbmc4k.data <- Read10X(data.dir = "../data/pbmc4k/filtered_gene_bc_matrices/GRCh38/") +pbmc4k <- CreateSeuratObject(counts = pbmc4k.data, project = "PBMC4K") +pbmc4k + +pbmc8k.data <- Read10X(data.dir = "../data/pbmc8k/filtered_gene_bc_matrices/GRCh38/") +pbmc8k <- CreateSeuratObject(counts = pbmc8k.data, project = "PBMC8K") +pbmc8k +``` + + +# Merging Two `Seurat` Objects + +`merge()` merges the raw count matrices of two `Seurat` objects and creates a new `Seurat` object with the resulting combined raw count matrix. To easily tell which original object any particular cell came from, you can set the `add.cell.ids` parameter with an `c(x, y)` vector, which will prepend the given identifier to the beginning of each cell name. The original project ID will remain stored in object meta data under `orig.ident` + +```{r merge.objects} +pbmc.combined <- merge(pbmc4k, y = pbmc8k, add.cell.ids = c('4K', '8K'), project = 'PBMC12K') +pbmc.combined +``` + +```{r inspect.merge} +# notice the cell names now have an added identifier +head(colnames(pbmc.combined)) +table(pbmc.combined$orig.ident) +``` + +# Merging More Than Two `Seurat` Objects + +To merge more than two `Seurat` objects, simply pass a vector of multiple `Seurat` objects to the `y` parameter for `merge`; we'll demonstrate this using the 4K and 8K PBMC datasets as well as our previously computed Seurat object from the 2,700 PBMC tutorial (loaded via the [SeuratData](https://github.com/satijalab/seurat-data) package). + +```{r merge_three} +library(SeuratData) +InstallData("pbmc3k") +pbmc3k <- LoadData("pbmc3k", type = "pbmc3k.final") +pbmc3k + +pbmc.big <- merge(pbmc3k, y = c(pbmc4k, pbmc8k), add.cell.ids = c('3K', '4K', '8K'), project = 'PBMC15K') +pbmc.big + +head(colnames(pbmc.big)) +tail(colnames(pbmc.big)) +unique(sapply(X = strsplit(colnames(pbmc.big), split = '_'), FUN = '[', 1)) +table(pbmc.big$orig.ident) +``` + +# Merge Based on Normalized Data + +By default, `merge()` will combine the `Seurat` objects based on the raw count matrices, erasing any previously normalized and scaled data matrices. If you want to merge the normalized data matrices as well as the raw count matrices, simply pass `merge.data = TRUE`. This should be done if the same normalization approach was applied to all objects. + +```{r normalize} +pbmc4k <- NormalizeData(pbmc4k) +pbmc8k <- NormalizeData(pbmc8k) +pbmc.normalized <- merge(pbmc4k, y = pbmc8k, add.cell.ids = c('4K', '8K'), project = 'PBMC12K', merge.data = TRUE) +GetAssayData(pbmc.combined)[1:10, 1:15] +GetAssayData(pbmc.normalized)[1:10, 1:15] +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_merge_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_mixscape_vignette.Rmd b/vignettes/seurat5_mixscape_vignette.Rmd new file mode 100644 index 000000000..5d229c54d --- /dev/null +++ b/vignettes/seurat5_mixscape_vignette.Rmd @@ -0,0 +1,372 @@ +--- +title: "Mixscape Vignette" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now) + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +options(SeuratData.repo.use = 'satijalab04.nygenome.org') +``` + +# Overview + +This tutorial demonstrates how to use mixscape for the analyses of single-cell pooled CRSIPR screens. We introduce new Seurat functions for: + +1. Calculating the perturbation-specific signature of every cell. +2. Identifying and removing cells that have 'escaped' CRISPR perturbation. +3. Visualizing similarities/differences across different perturbations. + +# Loading required packages + +```{r pkgs1} +# Load packages. +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(ggplot2) +library(patchwork) +library(scales) +library(dplyr) +library(reshape2) + +# Download dataset using SeuratData. +InstallData(ds = "thp1.eccite") + +# Setup custom theme for plotting. +custom_theme <- theme( + plot.title = element_text(size = 16, hjust = 0.5), + legend.key.size = unit(0.7, "cm"), + legend.text = element_text(size = 14)) +``` + +# Loading Seurat object containing ECCITE-seq dataset + +We use a 111 gRNA ECCITE-seq dataset generated from stimulated THP-1 cells that was recently published from our lab in bioRxiv [Papalexi et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.28.175596v1). This dataset can be easily downloaded from the [SeuratData](https://github.com/satijalab/seurat-data) package. + +```{r eccite.load} +# Load object. +eccite <- LoadData(ds = "thp1.eccite") + +eccite[["RNA"]] <- as(eccite[["RNA"]], Class = "Assay5") +eccite[["ADT"]] <- as(eccite[["ADT"]], Class = "Assay5") +eccite[["HTO"]] <- as(eccite[["HTO"]], Class = "Assay5") +eccite[["GDO"]] <- as(eccite[["GDO"]], Class = "Assay5") + +# Normalize protein. +eccite <- NormalizeData( + object = eccite, + assay = "ADT", + normalization.method = "CLR", + margin = 2) +``` + +# RNA-based clustering is driven by confounding sources of variation + +Here, we follow the standard Seurat workflow to cluster cells based on their gene expression profiles. We expected to obtain perturbation-specific clusters however we saw that clustering is primarily driven by cell cycle phase and replicate ID. We only observed one perturbation-specific cluster containing cells expression IFNgamma pathway gRNAs. + +```{r eccite.pp, fig.height = 10, fig.width = 15} +# Prepare RNA assay for dimensionality reduction: +# Normalize data, find variable features and scale data. +DefaultAssay(object = eccite) <- 'RNA' +eccite <- NormalizeData(object = eccite) %>% FindVariableFeatures() %>% ScaleData() + +# Run Principle Component Analysis (PCA) to reduce the dimensionality of the data. +eccite <- RunPCA(object = eccite) + +# Run Uniform Manifold Approximation and Projection (UMAP) to visualize clustering in 2-D. +eccite <- RunUMAP(object = eccite, dims = 1:40) + +# Generate plots to check if clustering is driven by biological replicate ID, +# cell cycle phase or target gene class. +p1 <- DimPlot( + object = eccite, + group.by = "replicate", + label = F, + pt.size = 0.2, + reduction = "umap", cols = "Dark2", repel = T) + + scale_color_brewer(palette = "Dark2") + + ggtitle("Biological Replicate") + + xlab("UMAP 1") + + ylab("UMAP 2") + +p2 <- DimPlot( + object = eccite, + group.by = "Phase", + label = F, pt.size = 0.2, + reduction = "umap", repel = T) + + ggtitle("Cell Cycle Phase") + + ylab("UMAP 2") + + xlab("UMAP 1") + +p3 <- DimPlot( + object = eccite, + group.by = "crispr", + pt.size = 0.2, + reduction = "umap", + split.by = "crispr", + ncol = 1, + cols = c("grey39", "goldenrod3")) + + ggtitle("Perturbation Status") + + ylab("UMAP 2") + + xlab("UMAP 1") + +# Visualize plots. +((p1 / p2 + plot_layout(guides = "auto")) | p3 ) +``` + +# Calculating local perturbation signatures mitigates confounding effects + +To calculate local perturbation signatures we set the number of non-targeting Nearest Neighbors (NNs) equal to k=20 and we recommend that the user picks a k from the following range: 20 < k < 30. Intuitively, the user does not want to set k to a very small or large number as this will most likely not remove the technical variation from the dataset. Using the PRTB signature to cluster cells removes all technical variation and reveals one additional perturbation-specific cluster. + +```{r eccite.cps, fig.height = 10, fig.width = 15} +# Calculate perturbation signature (PRTB). +eccite <- CalcPerturbSig( + object = eccite, + assay = "RNA", + slot = "data", + gd.class ="gene", + nt.cell.class = "NT", + reduction = "pca", + ndims = 40, + num.neighbors = 20, + split.by = "replicate", + new.assay.name = "PRTB") + +# Prepare PRTB assay for dimensionality reduction: +# Normalize data, find variable features and center data. +DefaultAssay(object = eccite) <- "PRTB" + +# Use variable features from RNA assay. +VariableFeatures(object = eccite) <- VariableFeatures(object = eccite[["RNA"]]) +eccite <- ScaleData(object = eccite, do.scale = FALSE, do.center = TRUE) + +# Run PCA to reduce the dimensionality of the data. +eccite <- RunPCA(object = eccite, reduction.key = "prtbpca", reduction.name = "prtbpca") + +# Run UMAP to visualize clustering in 2-D. +eccite <- RunUMAP( + object = eccite, + dims = 1:40, + reduction = "prtbpca", + reduction.key = "prtbumap", + reduction.name = "prtbumap") + +# Generate plots to check if clustering is driven by biological replicate ID, +# cell cycle phase or target gene class. +q1 <- DimPlot( + object = eccite, + group.by = "replicate", + reduction = "prtbumap", + pt.size = 0.2, cols = "Dark2", label = FALSE, repel = TRUE) + + scale_color_brewer(palette = "Dark2") + + ggtitle("Biological Replicate") + + ylab("UMAP 2") + + xlab("UMAP 1") + +q2 <- DimPlot( + object = eccite, + group.by = "Phase", + reduction = "prtbumap", + pt.size = 0.2, label = F, repel = T) + + ggtitle("Cell Cycle Phase") + + ylab("UMAP 2") + + xlab("UMAP 1") + +q3 <- DimPlot( + object = eccite, + group.by = "crispr", + reduction = "prtbumap", + split.by = "crispr", + ncol = 1, + pt.size = 0.2, + cols = c("grey39", "goldenrod3")) + + ggtitle("Perturbation Status") + + ylab("UMAP 2") + + xlab("UMAP 1") + +# Visualize plots. +(q1 / q2 + plot_layout(guides = "auto") | q3) +``` + +# Mixscape identifies cells with no detectable perturbation + +Here, we are assuming each target gene class is a mixture of two Gaussian distributions one representing the knockout (KO) and the other the non-perturbed (NP) cells. We further assume that the distribution of the NP cells is identical to that of cells expressing non-targeting gRNAs (NT) and we try to estimate the distribution of KO cells using the function `normalmixEM()` from the mixtools package. Next, we calculate the posterior probability that a cell belongs to the KO distribution and classify cells with a probability higher than 0.5 as KOs. Applying this method we identify KOs in 11 target gene classes and detect variation in gRNA targeting efficiency within each class. + +```{r eccite.mixscape, fig.height = 20, fig.width = 20, results="hide"} +# Run mixscape. +eccite <- RunMixscape( + object = eccite, + assay = "PRTB", + slot = "scale.data", + labels = "gene", + nt.class.name = "NT", + min.de.genes = 5, + iter.num = 10, + de.assay = "RNA", + verbose = FALSE, + prtb.type = "KO") + +# Calculate percentage of KO cells for all target gene classes. +df <- prop.table(table(eccite$mixscape_class.global, eccite$NT),2) + +df2 <- reshape2::melt(df) +df2$Var2 <- as.character(df2$Var2) +test <- df2[which(df2$Var1 == "KO"),] +test <- test[order(test$value, decreasing = TRUE),] +new.levels <- test$Var2 +df2$Var2 <- factor(df2$Var2, levels = new.levels) +df2$Var1 <- factor(df2$Var1, levels = c("NT", "NP", "KO")) +df2$gene <- sapply(as.character(df2$Var2), function(x) strsplit(x, split = "g")[[1]][1]) +df2$guide_number <- sapply(as.character(df2$Var2), + function(x) strsplit(x, split = "g")[[1]][2]) +df3 <- df2[-c(which(df2$gene == "NT")),] + +p1 <- ggplot(df3, aes(x = guide_number, y = value * 100, fill = Var1)) + + geom_bar(stat = "identity") + + theme_classic() + + scale_fill_manual(values = c("grey49", "grey79", "coral1")) + + ylab("% of cells") + + xlab("sgRNA") + +p1 + theme(axis.text.x = element_text(size = 18, hjust = 1), + axis.text.y = element_text(size = 18), + axis.title = element_text(size = 16), + strip.text = element_text(size = 16, face = "bold")) + + facet_wrap(vars(gene), ncol = 5, scales = "free") + + labs(fill = "mixscape class") + theme(legend.title = element_text(size = 14), + legend.text = element_text(size = 12)) +``` + +# Inspecting mixscape results + +To ensure mixscape is assigning the correct perturbation status to cells we can use the functions below to look at the perturbation score distributions and the posterior probabilities of cells within a target gene class (for example IFNGR2) and compare it to those of the NT cells. In addition, we can perform differential expression (DE) analyses and show that only IFNGR2 KO cells have reduced expression of the IFNG-pathway genes. Finally, as an independent check, we can look at the PD-L1 protein expression values in NP and KO cells for target genes known to be PD-L1 regulators. + +```{r eccite.plots, fig.height = 10, fig.width = 15, results="hide"} +# Explore the perturbation scores of cells. +PlotPerturbScore(object = eccite, + target.gene.ident = "IFNGR2", + mixscape.class = "mixscape_class", + col = "coral2") + labs(fill = "mixscape class") + +# Inspect the posterior probability values in NP and KO cells. +VlnPlot(eccite, "mixscape_class_p_ko", idents = c("NT", "IFNGR2 KO", "IFNGR2 NP")) + + theme(axis.text.x = element_text(angle = 0, hjust = 0.5),axis.text = element_text(size = 16) ,plot.title = element_text(size = 20)) + + NoLegend() + + ggtitle("mixscape posterior probabilities") + +# Run DE analysis and visualize results on a heatmap ordering cells by their posterior +# probability values. +Idents(object = eccite) <- "gene" +MixscapeHeatmap(object = eccite, + ident.1 = "NT", + ident.2 = "IFNGR2", + balanced = FALSE, + assay = "RNA", + max.genes = 20, angle = 0, + group.by = "mixscape_class", + max.cells.group = 300, + size=6.5) + NoLegend() +theme(axis.text.y = element_text(size = 16)) + +# Show that only IFNG pathway KO cells have a reduction in PD-L1 protein expression. +VlnPlot( + object = eccite, + features = "adt_PDL1", + idents = c("NT","JAK2", "STAT1", "IFNGR1", "IFNGR2", "IRF1"), + group.by = "gene", + pt.size = 0.2, + sort = TRUE, + split.by = "mixscape_class.global", + cols = c("coral3", "grey79", "grey39")) + + ggtitle("PD-L1 protein") + + theme(axis.text.x = element_text(angle = 0, hjust = 0.5), plot.title = element_text(size = 20), axis.text = element_text(size = 16)) +``` + +```{r save.img, include=TRUE} +p <- VlnPlot(object = eccite, features = "adt_PDL1", idents = c("NT","JAK2","STAT1","IFNGR1","IFNGR2", "IRF1"), group.by = "gene", pt.size = 0.2, sort = T, split.by = "mixscape_class.global", cols = c("coral3","grey79","grey39")) +ggtitle("PD-L1 protein") +theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) +ggsave(filename = "../output/images/mixscape_vignette.jpg", height = 7, width = 12, plot = p, quality = 50) +``` + +# Visualizing perturbation responses with Linear Discriminant Analysis (LDA) + +We use LDA as a dimensionality reduction method to visualize perturbation-specific clusters. LDA is trying to maximize the separability of known labels (mixscape classes) using both gene expression and the labels as input. + +```{r eccite.lda, fig.height = 7, fig.width = 10, results="hide"} +# Remove non-perturbed cells and run LDA to reduce the dimensionality of the data. +Idents(eccite) <- "mixscape_class.global" +sub <- subset(eccite, idents = c("KO", "NT")) + +# Run LDA. +sub <- MixscapeLDA( + object = sub, + assay = "RNA", + pc.assay = "PRTB", + labels = "gene", + nt.label = "NT", + npcs = 10, + logfc.threshold = 0.25, + verbose = FALSE) + +# Use LDA results to run UMAP and visualize cells on 2-D. +# Here, we note that the number of the dimensions to be used is equal to the number of +# labels minus one (to account for NT cells). +sub <- RunUMAP( + object = sub, + dims = 1:11, + reduction = "lda", + reduction.key = "ldaumap", + reduction.name = "ldaumap") + +# Visualize UMAP clustering results. +Idents(sub) <- "mixscape_class" +sub$mixscape_class <- as.factor(sub$mixscape_class) + +# Set colors for each perturbation. +col = setNames(object = hue_pal()(12), nm = levels(sub$mixscape_class)) +names(col) <- c(names(col)[1:7], "NT", names(col)[9:12]) +col[8] <- "grey39" + +p <- DimPlot(object = sub, + reduction = "ldaumap", + repel = TRUE, + label.size = 5, + label = TRUE, + cols = col) + NoLegend() + +p2 <- p + + scale_color_manual(values = col, drop = FALSE) + + ylab("UMAP 2") + + xlab("UMAP 1") +p2 +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_mixscape_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_multimodal_reference_mapping.Rmd b/vignettes/seurat5_multimodal_reference_mapping.Rmd new file mode 100644 index 000000000..b2e1c8305 --- /dev/null +++ b/vignettes/seurat5_multimodal_reference_mapping.Rmd @@ -0,0 +1,396 @@ +--- +title: "Multimodal reference mapping" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now) + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +# Intro: Seurat v4 Reference Mapping + +This vignette introduces the process of mapping query datasets to annotated references in Seurat. In this example, we map one of the first scRNA-seq datasets released by 10X Genomics of 2,700 PBMC to our [recently described CITE-seq reference of 162,000 PBMC measured with 228 antibodies](https://doi.org/10.1016/j.cell.2021.04.048). We chose this example to demonstrate how supervised analysis guided by a reference dataset can help to enumerate cell states that would be challenging to find with [unsupervised analysis](pbmc3k_tutorial.html). In a second example, we demonstrate how to serially map Human Cell Atlas datasets of human BMNC profiled from different individuals onto a consistent reference. + +We have [previously demonstrated](integration_mapping.html) how to use reference-mapping approach to annotate cell labels in a query dataset . In Seurat v4, we have substantially improved the speed and memory requirements for integrative tasks including reference mapping, and also include new functionality to project query cells onto a previously computed UMAP visualization. + +In this vignette, we demonstrate how to use a previously established reference to interpret an scRNA-seq query: + +* Annotate each query cell based on a set of reference-defined cell states +* Project each query cell onto a previously computed UMAP visualization +* Impute the predicted levels of surface proteins that were measured in the CITE-seq reference + +To run this vignette please install Seurat v4, available on CRAN. Additionally, you will need to install the `SeuratDisk` package. + +```{r install, eval = FALSE} +install.packages("Seurat") +remotes::install_github("mojaveazure/seurat-disk") +``` + +```{r packages, cache=FALSE} +library(Seurat) +library(SeuratDisk) +library(ggplot2) +library(patchwork) +``` + +```{r, include=TRUE, cache=FALSE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` + +# Example 1: Mapping human peripheral blood cells + +## A Multimodal PBMC Reference Dataset + +We load the reference (download [here](https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat)) from our recent [paper](https://doi.org/10.1016/j.cell.2021.04.048), and visualize the pre-computed UMAP. This reference is stored as an h5Seurat file, a format that enables on-disk storage of multimodal Seurat objects (more details on h5Seurat and `SeuratDisk` can be found [here](https://mojaveazure.github.io/seurat-disk/index.html)). + +```{r pbmc.ref} +reference <- readRDS("../data/pbmc_multimodal_2023.rds") +``` + +```{r ref.dimplot} +DimPlot(object = reference, reduction = "wnn.umap", group.by = "celltype.l2", label = TRUE, label.size = 3, repel = TRUE) + NoLegend() +``` + +## Mapping + +To demonstrate mapping to this multimodal reference, we will use a dataset of 2,700 PBMCs generated by 10x Genomics and available via `SeuratData`. + +```{r 3k.load} +library(SeuratData) +pbmc3k <- LoadData('pbmc3k') +pbmc3k[['RNA']] <- as(pbmc3k[['RNA']], Class = 'Assay5') +``` + +The reference was normalized using `SCTransform()`, so we use the same approach to normalize the query here. + +```{r 3k.preprocess, results="hide"} +pbmc3k <- SCTransform(pbmc3k, verbose = FALSE) +``` + +We then find anchors between reference and query. As described in the [manuscript](https://doi.org/10.1016/j.cell.2021.04.048), we used a precomputed supervised PCA (spca) transformation for this example. We recommend the use of supervised PCA for CITE-seq datasets, and demonstrate how to compute this transformation on the next tab of this vignette. However, you can also use a standard PCA transformation. + +```{r transfer.anchors} +anchors <- FindTransferAnchors( + reference = reference, + query = pbmc3k, + normalization.method = "SCT", + reference.reduction = "spca", + dims = 1:50 +) +``` + +We then transfer cell type labels and protein data from the reference to the query. Additionally, we project the query data onto the UMAP structure of the reference. + +```{r transfer} +pbmc3k <- MapQuery( + anchorset = anchors, + query = pbmc3k, + reference = reference, + refdata = list( + celltype.l1 = "celltype.l1", + celltype.l2 = "celltype.l2", + predicted_ADT = "ADT" + ), + reference.reduction = "spca", + reduction.model = "wnn.umap" +) +``` + +
    + **What is `MapQuery` doing?** + + `MapQuery()` is a wrapper around three functions: `TransferData()`, `IntegrateEmbeddings()`, and `ProjectUMAP()`. `TransferData()` is used to transfer cell type labels and impute the ADT values. `IntegrateEmbeddings()` and `ProjectUMAP()` are used to project the query data onto the UMAP structure of the reference. The equivalent code for doing this with the intermediate functions is below: + + +```{r, eval=FALSE} +pbmc3k <- TransferData( + anchorset = anchors, + reference = reference, + query = pbmc3k, + refdata = list( + celltype.l1 = "celltype.l1", + celltype.l2 = "celltype.l2", + predicted_ADT = "ADT") +) +pbmc3k <- IntegrateEmbeddings( + anchorset = anchors, + reference = reference, + query = pbmc3k, + new.reduction.name = "ref.spca" +) +pbmc3k <- ProjectUMAP( + query = pbmc3k, + query.reduction = "ref.spca", + reference = reference, + reference.reduction = "spca", + reduction.model = "wnn.umap" +) +``` +
    + +## Explore the mapping results + +We can now visualize the 2,700 query cells. They have been projected into a UMAP visualization defined by the reference, and each has received annotations at two levels of granularity (level 1, and level 2). + +```{r 3k.refdimplots, fig.width=10} +p1 = DimPlot(pbmc3k, reduction = "ref.umap", group.by = "predicted.celltype.l1", label = TRUE, label.size = 3, repel = TRUE) + NoLegend() +p2 = DimPlot(pbmc3k, reduction = "ref.umap", group.by = "predicted.celltype.l2", label = TRUE, label.size = 3 ,repel = TRUE) + NoLegend() +p1 + p2 +``` + +The reference-mapped dataset helps us identify cell types that were previously blended in an [unsupervised analysis of the query dataset](pbmc3k_tutorial.html). Just a few examples include plasmacytoid dendritic cells (pDC), hematopoietic stem and progenitor cells (HSPC), regulatory T cells (Treg), CD8 Naive T cells, cells, CD56+ NK cells, memory, and naive B cells, and plasmablasts. + +Each prediction is assigned a score between 0 and 1. + +```{r 3k.featureplots1, fig.width = 10, fig.height =4} +FeaturePlot(pbmc3k, features = c("pDC", "CD16 Mono", "Treg"), reduction = "ref.umap", cols = c("lightgrey", "darkred"), ncol = 3) & theme(plot.title = element_text(size = 10)) +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- FeaturePlot(pbmc3k, features = "CD16 Mono", reduction = "ref.umap", cols = c("lightgrey", "darkred")) + ggtitle("CD16 Mono") + theme(plot.title = element_text(hjust = 0.5, size = 30)) + labs(color = "Prediction Score") + xlab("UMAP 1") + ylab("UMAP 2") + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18), legend.title = element_text(size = 25)) +ggsave(filename = "../output/images/multimodal_reference_mapping.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +We can verify our predictions by exploring the expression of canonical marker genes. For example, CLEC4C and LIRA4 have been [reported](https://pubmed.ncbi.nlm.nih.gov/30395816/) as markers of pDC identity, consistent with our predictions. Similarly, if we perform differential expression to identify markers of Tregs, we identify a set of [canonical markers](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4761514/) including RTKN2, CTLA4, FOXP3, and IL2RA. + +```{r 3k.VlnPlots, fig.height=6, fig.width = 10, fig.height =5} +Idents(pbmc3k) <- 'predicted.celltype.l2' +VlnPlot(pbmc3k, features = c("CLEC4C", "LILRA4"), sort = TRUE) + NoLegend() + +treg_markers <- FindMarkers(pbmc3k, ident.1 = "Treg", only.pos = TRUE, logfc.threshold = 0.1) +print(head(treg_markers)) +``` + +Finally, we can visualize the imputed levels of surface protein, which were inferred based on the CITE-seq reference. + +```{r 3k.featureplots2, fig.width=10, fig.height =4} +DefaultAssay(pbmc3k) <- 'predicted_ADT' +# see a list of proteins: rownames(pbmc3k) +FeaturePlot(pbmc3k, features = c("CD3-1", "CD45RA", "IgD"), reduction = "ref.umap", cols = c("lightgrey", "darkgreen"), ncol = 3) +``` + +## Computing a new UMAP visualiztion + +In the previous examples, we visualize the query cells after mapping to the reference-derived UMAP. Keeping a consistent visualization can assist with the interpretation of new datasets. However, if there are cell states that are present in the query dataset that are not represented in the reference, they will project to the most similar cell in the reference. This is the expected behavior and functionality as established by the UMAP package, but can potentially mask the presence of new cell types in the query which may be of interest. + +In our [manuscript](https://doi.org/10.1016/j.cell.2021.04.048), we map a query dataset containing developing and differentiated neutrophils, which are not included in our reference. We find that computing a new UMAP ('de novo visualization') after merging the reference and query can help to identify these populations, as demonstrated in Supplementary Figure 8. In the 'de novo' visualization, unique cell states in the query remain separated. In this example, the 2,700 PBMC does not contain unique cell states, but we demonstrate how to compute this visualization below. + +We emphasize that if users are attempting to map datasets where the underlying samples are not PBMC, or contain cell types that are not present in the reference, computing a 'de novo' visualization is an important step in interpreting their dataset. + +```{r hiddendiet, include=TRUE} +reference <- DietSeurat(reference, counts = FALSE, dimreducs = "spca") +pbmc3k <- DietSeurat(pbmc3k, counts = FALSE, dimreducs = "ref.spca") +``` + +```{r denovoumap} +#merge reference and query +reference$id <- 'reference' +pbmc3k$id <- 'query' +refquery <- merge(reference, pbmc3k) +refquery[["spca"]] <- merge(reference[["spca"]], pbmc3k[["ref.spca"]]) +refquery <- RunUMAP(refquery, reduction = 'spca', dims = 1:50) +DimPlot(refquery, group.by = 'id', shuffle = TRUE) +``` + +# Example 2: Mapping human bone marrow cells + +## A Multimodal BMNC Reference Dataset + +As a second example, we map a dataset of human bone marrow mononuclear (BMNC) cells from eight individual donors, produced by the Human Cell Atlas. As a reference, we use the CITE-seq reference of human BMNC that we analyzed using [weighted-nearest neighbor analysis (WNN)](weighted_nearest_neighbor_analysis.html). + +This vignette exhibits the same reference-mapping functionality as the PBMC example on the previous tab. In addition, we also demonstrate: + +* How to construct a supervised PCA (sPCA) transformation +* How to serially map multiple datasets to the same reference +* Optimization steps to further enhance to speed of mapping + +```{r bmref.seuratdata} +# Both datasets are available through SeuratData +library(SeuratData) + +#load reference data +InstallData("bmcite") +bm <- LoadData("bmcite") +bm[['RNA']] <- as(bm[['RNA']], Class = 'Assay5') + +#load query data +InstallData('hcabm40k') +hcabm40k <- LoadData("hcabm40k") +hcabm40k[['RNA']] <- as(hcabm40k[['RNA']], Class = 'Assay5') +``` + +The reference dataset contains a [WNN graph](weighted_nearest_neighbor_analysis.html), reflecting a weighted combination of the RNA and protein data in this CITE-seq experiment. + +We can compute a UMAP visualization based on this graph. We set `return.model = TRUE`, which will enable us to project query datasets onto this visualization. + +```{r bm.refdimplot, fig.width=8} +bm <- RunUMAP(bm, nn.name = "weighted.nn", reduction.name = "wnn.umap", + reduction.key = "wnnUMAP_", return.model = TRUE) +DimPlot(bm, group.by = "celltype.l2", reduction = "wnn.umap") +``` + +## Computing an sPCA transformation + +As described in our [manuscript](https://doi.org/10.1016/j.cell.2021.04.048), we first compute a 'supervised' PCA. This identifies the transformation of the transcriptome data that best encapsulates the structure of the WNN graph. This allows a weighted combination of the protein and RNA measurements to 'supervise' the PCA, and highlight the most relevant sources of variation. After computing this transformation, we can project it onto a query dataset. We can also compute and project a PCA projection, but recommend the use of sPCA when working with multimodal references that have been constructed with WNN analysis. + +The sPCA calculation is performed once, and then can be rapidly projected onto each query dataset. + +```{r bm.spca} +bm <- ScaleData(bm, assay = 'RNA') +bm <- RunSPCA(bm, assay = 'RNA', graph = 'wsnn') +``` + +## Computing a cached neighbor index + +Since we will be mapping multiple query samples to the same reference, we can cache particular steps that only involve the reference. This step is optional but will improve speed when mapping multiple samples. + +We compute the first 50 neighbors in the sPCA space of the reference. We store this information in the `spca.annoy.neighbors` object within the reference Seurat object and also cache the annoy index data structure (via `cache.index = TRUE`). + +```{r bm.nn, cache = FALSE} +bm <- FindNeighbors( + object = bm, + reduction = "spca", + dims = 1:50, + graph.name = "spca.annoy.neighbors", + k.param = 50, + cache.index = TRUE, + return.neighbor = TRUE, + l2.norm = TRUE +) +``` + +
    + **How can I save and load a cached annoy index?** + +If you want to save and load a cached index for a `Neighbor` object generated with `method = "annoy"` and `cache.index = TRUE`, use the `SaveAnnoyIndex()`/`LoadAnnoyIndex()` functions. Importantly, this index cannot be saved normally to an RDS or RDA file, so it will not persist correctly across R session restarts or `saveRDS`/`readRDS` for the Seurat object containing it. Instead, use `LoadAnnoyIndex()` to add the Annoy index to the `Neighbor` object every time R restarts or you load the reference Seurat object from RDS. The file created by `SaveAnnoyIndex()` can be distributed along with a reference Seurat object, and added to the `Neighbor` object in the reference. + +```{r neighbor.demo} +bm[["spca.annoy.neighbors"]] +SaveAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "../data/reftmp.idx") +bm[["spca.annoy.neighbors"]] <- LoadAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "../data/reftmp.idx") +``` +
    + +## Query dataset preprocessing + +Here we will demonstrate mapping multiple donor bone marrow samples to the multimodal bone marrow reference. These query datasets are derived from the Human Cell Atlas (HCA) Immune Cell Atlas Bone marrow dataset and are available through SeuratData. This dataset is provided as a single merged object with 8 donors. We first split the data back into 8 separate Seurat objects, one for each original donor to map individually. + +```{r bm40k.load} +library(dplyr) +library(SeuratData) +hcabm40k <- LoadData('hcabm40k') +hcabm40k[['RNA']] <- as(hcabm40k[['RNA']], Class = 'Assay5') +hcabm40k.batches <- SplitObject(hcabm40k, split.by = "orig.ident") +``` + +We then normalize the query in the same manner as the reference. Here, the reference was normalized using log-normalization via `NormalizeData()`. If the reference had been normalized using `SCTransform()`, the query must be normalized with `SCTransform()` as well. + +```{r 40k.norm} +hcabm40k.batches <- lapply(X = hcabm40k.batches, FUN = NormalizeData, verbose = FALSE) +``` + +## Mapping + +We then find anchors between each donor query dataset and the multimodal reference. This command is optimized to minimize mapping time, by passing in a pre-computed set of reference neighbors, and turning off anchor filtration. + +```{r bm.anchors} +anchors <- list() +for (i in 1:length(hcabm40k.batches)) { + anchors[[i]] <- FindTransferAnchors( + reference = bm, + query = hcabm40k.batches[[i]], + k.filter = NA, + reference.reduction = "spca", + reference.neighbors = "spca.annoy.neighbors", + dims = 1:50 + ) +} +``` + +We then individually map each of the datasets. + +```{r bm.map} +for (i in 1:length(hcabm40k.batches)) { + hcabm40k.batches[[i]] <- MapQuery( + anchorset = anchors[[i]], + query = hcabm40k.batches[[i]], + reference = bm, + refdata = list( + celltype = "celltype.l2", + predicted_ADT = "ADT"), + reference.reduction = "spca", + reduction.model = "wnn.umap" + ) +} +``` + +## Explore the mapping results + +Now that mapping is complete, we can visualize the results for individual objects + +```{r bm.umap.separate, fig.width=10} +p1 <- DimPlot(hcabm40k.batches[[1]], reduction = 'ref.umap', group.by = 'predicted.celltype', label.size = 3) +p2 <- DimPlot(hcabm40k.batches[[2]], reduction = 'ref.umap', group.by = 'predicted.celltype', label.size = 3) +p1 + p2 + plot_layout(guides = "collect") +``` + +We can also merge all the objects into one dataset. Note that they have all been integrated into a common space, defined by the reference. We can then visualize the results together. + +```{r bm.umap.combine} +# Merge the batches +hcabm40k <- merge(hcabm40k.batches[[1]], hcabm40k.batches[2:length(hcabm40k.batches)], merge.dr = "ref.umap") +DimPlot(hcabm40k, reduction = "ref.umap", group.by = "predicted.celltype", label = TRUE, repel = TRUE, label.size = 3) + NoLegend() +``` + +We can visualize gene expression, cluster prediction scores, and (imputed) surface protein levels in the query cells: + +```{r ftplot, fig.height = 10, fig.width=10} +p3 <- FeaturePlot(hcabm40k, features = c("rna_TRDC", "rna_MPO", "rna_AVP"), reduction = 'ref.umap', + max.cutoff = 3, ncol = 3) + +# cell type prediction scores +DefaultAssay(hcabm40k) <- 'prediction.score.celltype' +p4 <- FeaturePlot(hcabm40k, features = c("CD16 Mono", "HSC", "Prog-RBC"), ncol = 3, + cols = c("lightgrey", "darkred")) + +# imputed protein levels +DefaultAssay(hcabm40k) <- 'predicted_ADT' +p5 <- FeaturePlot(hcabm40k, features = c("CD45RA", "CD16", "CD161"), reduction = 'ref.umap', + min.cutoff = 'q10', max.cutoff = 'q99', cols = c("lightgrey", "darkgreen") , + ncol = 3) +p3 / p4 / p5 +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_reference_mapping_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_multimodal_vignette.Rmd b/vignettes/seurat5_multimodal_vignette.Rmd new file mode 100644 index 000000000..6a086618d --- /dev/null +++ b/vignettes/seurat5_multimodal_vignette.Rmd @@ -0,0 +1,240 @@ +--- +title: "Using Seurat with multimodal data" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- +*** + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + fig.width = 10, + error = TRUE +) +``` + +# Load in the data + +The ability to make simultaneous measurements of multiple data types from the same cell, known as multimodal analysis, represents a new and exciting frontier for single-cell genomics. For example, [CITE-seq](http://www.nature.com/nmeth/journal/v14/n9/full/nmeth.4380.html) enables the simultaneous measurements of transcriptomes and cell-surface proteins from the same cell. Other exciting multimodal technologies, such as the [10x multiome kit](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression) allow for the paired measurements of cellular transcriptome and chromatin accessibility (i.e scRNA-seq+scATAC-seq). Other modalities that can be measured alongside cellular transcriptomes include genetic perturbations, cellular methylomes, and hashtag oligos from [Cell Hashing](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1). We have designed Seurat4 to enable for the seamless storage, analysis, and exploration of diverse multimodal single-cell datasets. + +In this vignette, we present an introductory workflow for creating a multimodal Seurat object and performing an initial analysis. For example, we demonstrate how to cluster a CITE-seq dataset on the basis of the measured cellular transcriptomes, and subsequently discover cell surface proteins that are enriched in each cluster. We note that Seurat4 also enables more advanced techniques for the analysis of multimodal data, in particular the application of our [Weighted Nearest Neighbors (WNN) approach](https://doi.org/10.1016/j.cell.2021.04.048) that enables simultaneous clustering of cells based on a weighted combination of both modalities, and you can explore this functionality [here](weighted_nearest_neighbor_analysis.html). + +Here, we analyze a dataset of 8,617 cord blood mononuclear cells (CBMCs), where transcriptomic measurements are paired with abundance estimates for 11 surface proteins, whose levels are quantified with DNA-barcoded antibodies. First, we load in two count matrices : one for the RNA measurements, and one for the antibody-derived tags (ADT). You can download the ADT file [here](ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz) and the RNA file [here](ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz) + +```{r load_packages} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(ggplot2) +library(patchwork) +``` + +```{r load_data} +# Load in the RNA UMI matrix + +# Note that this dataset also contains ~5% of mouse cells, which we can use as negative controls for the protein measurements. For this reason, the gene expression matrix has HUMAN_ or MOUSE_ appended to the beginning of each gene. +cbmc.rna <- as.sparse(read.csv(file = '../data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) + +# To make life a bit easier going forward, we're going to discard all but the top 100 most highly expressed mouse genes, and remove the "HUMAN_" from the CITE-seq prefix +cbmc.rna <- CollapseSpeciesExpressionMatrix(cbmc.rna) + +# Load in the ADT UMI matrix +cbmc.adt <- as.sparse(read.csv(file = '../data/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) + +# Note that since measurements were made in the same cells, the two matrices have identical column names +all.equal(colnames(cbmc.rna),colnames(cbmc.adt)) +``` + +# Setup a Seurat object, add the RNA and protein data + +Now we create a Seurat object, and add the ADT data as a second assay + +```{r basic_de} +# creates a Seurat object based on the scRNA-seq data +cbmc <- CreateSeuratObject(counts = cbmc.rna) + +# We can see that by default, the cbmc object contains an assay storing RNA measurement +Assays(cbmc) + +# create a new assay to store ADT information +adt_assay <- CreateAssay5Object(counts = cbmc.adt) + +# add this assay to the previously created Seurat object +cbmc[["ADT"]] <- adt_assay + +# Validate that the object now contains multiple assays +Assays(cbmc) + +# Extract a list of features measured in the ADT assay +rownames(cbmc[["ADT"]]) + +# Note that we can easily switch back and forth between the two assays to specify the default for visualization and analysis + +# List the current default assay +DefaultAssay(cbmc) + +# Switch the default to ADT +DefaultAssay(cbmc) <- 'ADT' +DefaultAssay(cbmc) +``` + +# Cluster cells on the basis of their scRNA-seq profiles + +The steps below represent a quick clustering of the PBMCs based on the scRNA-seq data. For more detail on individual steps or more advanced options, see our PBMC clustering guided tutorial [here](pbmc3k_tutorial.html) + +```{r cluster1} +# Note that all operations below are performed on the RNA assay +# Set and verify that the default assay is RNA +DefaultAssay(cbmc) <- "RNA" +DefaultAssay(cbmc) + +# perform visualization and clustering steps +cbmc <- NormalizeData(cbmc) +DefaultLayer(cbmc[["RNA"]]) <- "counts" +cbmc <- FindVariableFeatures(cbmc) +DefaultLayer(cbmc[["RNA"]]) <- "data" +cbmc <- ScaleData(cbmc) +DefaultLayer(cbmc[["RNA"]]) <- "scale.data" +cbmc <- RunPCA(cbmc, verbose = FALSE) +cbmc <- FindNeighbors(cbmc, dims = 1:30) +cbmc <- FindClusters(cbmc, resolution = 0.8, verbose = FALSE) +cbmc <- RunUMAP(cbmc, dims = 1:30) +DimPlot(cbmc, label = TRUE) +``` + +# Visualize multiple modalities side-by-side + +Now that we have obtained clusters from scRNA-seq profiles, we can visualize the expression of either protein or RNA molecules in our dataset. Importantly, Seurat provides a couple ways to switch between modalities, and specify which modality you are interested in analyzing or visualizing. This is particularly important as, in some cases, the same feature can be present in multiple modalities - for example this dataset contains independent measurements of the B cell marker CD19 (both protein and RNA levels). + +```{r vis} +# Normalize ADT data, +DefaultAssay(cbmc) <- 'ADT' +DefaultLayer(cbmc[["ADT"]]) <- "counts" +cbmc <- NormalizeData(cbmc, normalization.method = 'CLR', margin = 2) +DefaultAssay(cbmc) <- 'RNA' + +# Note that the following command is an alternative but returns the same result +DefaultLayer(cbmc[["ADT"]]) <- "counts" +cbmc <- NormalizeData(cbmc, normalization.method = 'CLR', margin = 2, assay = 'ADT') + +# Now, we will visualize CD14 levels for RNA and protein +# By setting the default assay, we can visualize one or the other +DefaultAssay(cbmc) <- 'ADT' +p1 <- FeaturePlot(cbmc, "CD19",cols = c("lightgrey","darkgreen")) + ggtitle("CD19 protein") +DefaultAssay(cbmc) <- 'RNA' +p2 <- FeaturePlot(cbmc, "CD19") + ggtitle("CD19 RNA") + +# place plots side-by-side +p1 | p2 + +# Alternately, we can use specific assay keys to specify a specific modality +# Identify the key for the RNA and protein assays +Key(cbmc[["RNA"]]) +Key(cbmc[["ADT"]]) + +# Now, we can include the key in the feature name, which overrides the default assay +p1 <- FeaturePlot(cbmc, "adt_CD19",cols = c("lightgrey","darkgreen")) + ggtitle("CD19 protein") +p2 <- FeaturePlot(cbmc, "rna_CD19") + ggtitle("CD19 RNA") +p1 | p2 +``` + +# Identify cell surface markers for scRNA-seq clusters + +We can leverage our paired CITE-seq measurements to help annotate clusters derived from scRNA-seq, and to identify both protein and RNA markers. + +```{r markers} +# as we know that CD19 is a B cell marker, we can identify cluster 6 as expressing CD19 on the surface +VlnPlot(cbmc, "adt_CD19") + +# we can also identify alternative protein and RNA markers for this cluster through differential expression +adt_markers <- FindMarkers(cbmc,ident.1 = 6, assay = 'ADT') +rna_markers <- FindMarkers(cbmc,ident.1 = 6, assay = 'RNA') + +head(adt_markers) +head(rna_markers) +``` + +# Additional visualizations of multimodal data + +```{r viz.cite.two, fig.height=4.5, fig.width=10} +# Draw ADT scatter plots (like biaxial plots for FACS). Note that you can even 'gate' cells if desired by using HoverLocator and FeatureLocator +FeatureScatter(cbmc, feature1 = 'adt_CD19', feature2 = 'adt_CD3') + +# view relationship between protein and RNA +FeatureScatter(cbmc, feature1 = 'adt_CD3', feature2 = 'rna_CD3E') + +FeatureScatter(cbmc, feature1 = 'adt_CD4', feature2 = 'adt_CD8') + +# Let's look at the raw (non-normalized) ADT counts. You can see the values are quite high, particularly in comparison to RNA values. This is due to the significantly higher protein copy number in cells, which significantly reduces 'drop-out' in ADT data +FeatureScatter(cbmc, feature1 = 'adt_CD4', feature2 = 'adt_CD8', slot = 'counts') +``` + +# Loading data from 10X multi-modal experiments + +Seurat is also able to analyze data from multimodal 10X experiments processed using CellRanger v3; as an example, we recreate the plots above using a dataset of 7,900 peripheral blood mononuclear cells (PBMC), freely available from 10X Genomics [here](https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_10k_protein_v3). + +```{r pbmc10x, fig.height=4.5, fig.width=10} +pbmc10k.data <- Read10X(data.dir = '../data/pbmc10k/filtered_feature_bc_matrix/') +rownames(x = pbmc10k.data[['Antibody Capture']]) <- gsub( + pattern = '_[control_]*TotalSeqB', + replacement = '', + x = rownames(x = pbmc10k.data[['Antibody Capture']]) +) + +pbmc10k <- CreateSeuratObject(counts = pbmc10k.data[['Gene Expression']], min.cells = 3, min.features = 200) +pbmc10k <- NormalizeData(pbmc10k) +pbmc10k[['ADT']] <- CreateAssay5Object(pbmc10k.data[['Antibody Capture']][, colnames(x = pbmc10k)]) +pbmc10k <- NormalizeData(pbmc10k, assay = 'ADT', normalization.method = 'CLR') + +plot1 <- FeatureScatter(pbmc10k, feature1 = 'adt_CD19', feature2 = 'adt_CD3', pt.size = 1) +plot2 <- FeatureScatter(pbmc10k, feature1 = 'adt_CD4', feature2 = 'adt_CD8a', pt.size = 1) +plot3 <- FeatureScatter(pbmc10k, feature1 = 'adt_CD3', feature2 = 'CD3E', pt.size = 1) +(plot1 + plot2 + plot3) & NoLegend() +``` + +```{r save.img, include=TRUE} +plot <- FeatureScatter(cbmc, feature1 = "adt_CD19", feature2 = "adt_CD3") + NoLegend() + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) +ggsave(filename = "../output/images/citeseq_plot.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_multimodal_vignette_times.csv") +``` + +# Additional functionality for multimodal data in Seurat + +Seurat v4 also includes additional functionality for the analysis, visualization, and integration of multimodal datasets. For more information, please explore the resources below: + +* Defining cellular identity from multimodal data using WNN analysis in Seurat v4 [vignette](weighted_nearest_neighbor_analysis.html) +* Mapping scRNA-seq data onto CITE-seq references [[vignette](reference_mapping.html)] +* Introduction to the analysis of spatial transcriptomics analysis [[vignette](spatial_vignette.html)] +* Analysis of 10x multiome (paired scRNA-seq + ATAC) using WNN analysis [[vignette](weighted_nearest_neighbor_analysis.html)] +* Signac: Analysis, interpretation, and exploration of single-cell chromatin datasets [[package](https://satijalab.org/signac/)] +* Mixscape: an analytical toolkit for pooled single-cell genetic screens [[vignette](mixscape_vignette.html)] + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_pbmc3k_tutorial.Rmd b/vignettes/seurat5_pbmc3k_tutorial.Rmd new file mode 100644 index 000000000..d6d73c31a --- /dev/null +++ b/vignettes/seurat5_pbmc3k_tutorial.Rmd @@ -0,0 +1,395 @@ +--- +title: "Seurat - Guided Clustering Tutorial" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- +*** + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +# Setup the Seurat Object + +For this tutorial, we will be analyzing the a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10X Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. The raw data can be found [here](https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz). + +We start by reading in the data. The `Read10X()` function reads in the output of the [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline from 10X, returning a unique molecular identified (UMI) count matrix. The values in this matrix represent the number of molecules for each feature (i.e. gene; row) that are detected in each cell (column). + +We next use the count matrix to create a `Seurat` object. The object serves as a container that contains both data (like the count matrix) and analysis (like PCA, or clustering results) for a single-cell dataset. For a technical discussion of the `Seurat` object structure, check out our [GitHub Wiki](https://github.com/satijalab/seurat/wiki). For example, the count matrix is stored in `pbmc[["RNA"]]@counts`. + +```{r init, error=TRUE} +library(dplyr) +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(patchwork) + +# Load the PBMC dataset +pbmc.data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") +# Initialize the Seurat object with the raw (non-normalized data). +pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) +pbmc +``` +
    + **What does data in a count matrix look like?** + +```{r, error=TRUE} +# Lets examine a few genes in the first thirty cells +pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30] +``` + +The `.` values in the matrix represent 0s (no molecules detected). Since most values in an scRNA-seq matrix are 0, Seurat uses a sparse-matrix representation whenever possible. This results in significant memory and speed savings for Drop-seq/inDrop/10x data. + +```{r, error=TRUE} +dense.size <- object.size(as.matrix(pbmc.data)) +dense.size +sparse.size <- object.size(pbmc.data) +sparse.size +dense.size / sparse.size +``` +
    +\ + +# Standard pre-processing workflow + +The steps below encompass the standard pre-processing workflow for scRNA-seq data in Seurat. These represent the selection and filtration of cells based on QC metrics, data normalization and scaling, and the detection of highly variable features. + +## QC and selecting cells for further analysis + +Seurat allows you to easily explore QC metrics and filter cells based on any user-defined criteria. A few QC metrics [commonly used](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4758103/) by the community include + +* The number of unique genes detected in each cell. + + Low-quality cells or empty droplets will often have very few genes + + Cell doublets or multiplets may exhibit an aberrantly high gene count +* Similarly, the total number of molecules detected within a cell (correlates strongly with unique genes) +* The percentage of reads that map to the mitochondrial genome + + Low-quality / dying cells often exhibit extensive mitochondrial contamination + + We calculate mitochondrial QC metrics with the `PercentageFeatureSet()` function, which calculates the percentage of counts originating from a set of features + + We use the set of all genes starting with `MT-` as a set of mitochondrial genes + +```{r mito, fig.height=7, fig.width=13, error=TRUE} +# The [[ operator can add columns to object metadata. This is a great place to stash QC stats +pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") +``` + +
    + **Where are QC metrics stored in Seurat?** + +* The number of unique genes and total molecules are automatically calculated during `CreateSeuratObject()` + + You can find them stored in the object meta data +```{r qc, fig.height=7, fig.width=13, error=TRUE} +# Show QC metrics for the first 5 cells +head(pbmc@meta.data, 5) +``` +
    +\ + +In the example below, we visualize QC metrics, and use these to filter cells. + +* We filter cells that have unique feature counts over 2,500 or less than 200 +* We filter cells that have >5% mitochondrial counts + +```{r qc2, fig.height=7, fig.width=13, error=TRUE} + +#Visualize QC metrics as a violin plot +VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) + +# FeatureScatter is typically used to visualize feature-feature relationships, but can be used for anything calculated by the object, i.e. columns in object metadata, PC scores etc. + +plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt") +plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +plot1 + plot2 + +pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) +``` + +*** + +# Normalizing the data + +After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. Normalized values are stored in `pbmc[["RNA"]]@data`. + +```{r normalize, error=TRUE} +pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 1e4) +``` +For clarity, in this previous line of code (and in future commands), we provide the default values for certain parameters in the function call. However, this isn't required and the same behavior can be achieved with: + +```{r normalize.default, eval = FALSE} +pbmc <- NormalizeData(pbmc) +``` + +# Identification of highly variable features (feature selection) + +We next calculate a subset of features that exhibit high cell-to-cell variation in the dataset (i.e, they are highly expressed in some cells, and lowly expressed in others). We and [others](https://www.nature.com/articles/nmeth.2645) have found that focusing on these genes in downstream analysis helps to highlight biological signal in single-cell datasets. + +Our procedure in Seurat is described in detail [here](https://doi.org/10.1016/j.cell.2019.05.031), and improves on previous versions by directly modeling the mean-variance relationship inherent in single-cell data, and is implemented in the `FindVariableFeatures()` function. By default, we return 2,000 features per dataset. These will be used in downstream analysis, like PCA. + +```{r var_features, fig.height=5, fig.width=11} +DefaultLayer(pbmc[["RNA"]]) <- "counts" +pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst', nfeatures = 2000) + +# Identify the 10 most highly variable genes +top10 <- head(VariableFeatures(pbmc), 10) + +# plot variable features with and without labels +plot1 <- VariableFeaturePlot(pbmc) +plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) +plot1 + plot2 +``` + +*** + +# Scaling the data + +Next, we apply a linear transformation ('scaling') that is a standard pre-processing step prior to dimensional reduction techniques like PCA. The `ScaleData()` function: + +* Shifts the expression of each gene, so that the mean expression across cells is 0 +* Scales the expression of each gene, so that the variance across cells is 1 + + This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate +* The results of this are stored in `pbmc[["RNA"]]@scale.data` + +```{r regress, fig.height=7, fig.width=11, results='hide'} +DefaultLayer(pbmc[["RNA"]]) <- "data" +all.genes <- rownames(pbmc) +pbmc <- ScaleData(pbmc, features = all.genes) +``` +
    + **This step takes too long! Can I make it faster?** + +Scaling is an essential step in the Seurat workflow, but only on genes that will be used as input to PCA. Therefore, the default in `ScaleData()` is only to perform scaling on the previously identified variable features (2,000 by default). To do this, omit the `features` argument in the previous function call, i.e. +```{r regressvar, fig.height=7, fig.width=11, results='hide', eval = FALSE} +pbmc <- ScaleData(pbmc) +``` +Your PCA and clustering results will be unaffected. However, Seurat heatmaps (produced as shown below with `DoHeatmap()`) require genes in the heatmap to be scaled, to make sure highly-expressed genes don't dominate the heatmap. To make sure we don't leave any genes out of the heatmap later, we are scaling all genes in this tutorial. +
    +\ +
    + **How can I remove unwanted sources of variation, as in Seurat v2?** + +In `Seurat v2` we also use the `ScaleData()` function to remove unwanted sources of variation from a single-cell dataset. For example, we could 'regress out' heterogeneity associated with (for example) cell cycle stage, or mitochondrial contamination. These features are still supported in `ScaleData()` in `Seurat v3`, i.e.: +```{r regressvarmt, fig.height=7, fig.width=11, results='hide', eval = FALSE} +pbmc <- ScaleData(pbmc, vars.to.regress = 'percent.mt') +``` +However, particularly for advanced users who would like to use this functionality, we strongly recommend the use of our new normalization workflow, `SCTransform()`. The method is described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), with a separate vignette using Seurat v3 [here](sctransform_vignette.html). As with `ScaleData()`, the function `SCTransform()` also includes a `vars.to.regress` parameter. +
    +\ + +*** + +# Perform linear dimensional reduction + +Next we perform PCA on the scaled data. By default, only the previously determined variable features are used as input, but can be defined using `features` argument if you wish to choose a different subset. + +```{r pca} +pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) +``` + +Seurat provides several useful ways of visualizing both cells and features that define the PCA, including `VizDimReduction()`, `DimPlot()`, and `DimHeatmap()` + +```{r pca_viz, message=TRUE} +# Examine and visualize PCA results a few different ways +print(pbmc[['pca']], dims = 1:5, nfeatures = 5) +VizDimLoadings(pbmc, dims = 1:2, reduction = 'pca') +DimPlot(pbmc, reduction = 'pca') +``` + +In particular `DimHeatmap()` allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and features are ordered according to their PCA scores. Setting `cells` to a number plots the 'extreme' cells on both ends of the spectrum, which dramatically speeds plotting for large datasets. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated feature sets. + +```{r single-heatmap} +DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) +``` + +```{r multi-heatmap, fig.height=15, fig.width=9} +DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE) +``` + +# Determine the 'dimensionality' of the dataset + +To overcome the extensive technical noise in any single feature for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a 'metafeature' that combines information across a correlated feature set. The top principal components therefore represent a robust compression of the dataset. However, how many components should we choose to include? 10? 20? 100? + +In [Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8), we implemented a resampling test inspired by the JackStraw procedure. We randomly permute a subset of the data (1% by default) and rerun PCA, constructing a 'null distribution' of feature scores, and repeat this procedure. We identify 'significant' PCs as those who have a strong enrichment of low p-value features. + +```{r jackstraw, fig.height=6, fig.width=10} +# NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in ElbowPlot() can be used to reduce computation time +pbmc <- JackStraw(pbmc, num.replicate = 100) +pbmc <- ScoreJackStraw(pbmc, dims = 1:20) +``` + +The `JackStrawPlot()` function provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (dashed line). 'Significant' PCs will show a strong enrichment of features with low p-values (solid curve above the dashed line). In this case it appears that there is a sharp drop-off in significance after the first 10-12 PCs. + +```{r jsplots, fig.height=6, fig.width=10} +JackStrawPlot(pbmc, dims = 1:15) +``` + +An alternative heuristic method generates an 'Elbow plot': a ranking of principle components based on the percentage of variance explained by each one (`ElbowPlot()` function). In this example, we can observe an 'elbow' around PC9-10, suggesting that the majority of true signal is captured in the first 10 PCs. + +```{r elbow_plot, fig.height=6, fig.width=10} +ElbowPlot(pbmc) +``` + +Identifying the true dimensionality of a dataset -- can be challenging/uncertain for the user. We therefore suggest these three approaches to consider. The first is more supervised, exploring PCs to determine relevant sources of heterogeneity, and could be used in conjunction with GSEA for example. The second implements a statistical test based on a random null model, but is time-consuming for large datasets, and may not return a clear PC cutoff. The third is a heuristic that is commonly used, and can be calculated instantly. In this example, all three approaches yielded similar results, but we might have been justified in choosing anything between PC 7-12 as a cutoff. + +We chose 10 here, but encourage users to consider the following: + +* Dendritic cell and NK aficionados may recognize that genes strongly associated with PCs 12 and 13 define rare immune subsets (i.e. MZB1 is a marker for plasmacytoid DCs). However, these groups are so rare, they are difficult to distinguish from background noise for a dataset of this size without prior knowledge. +* We encourage users to repeat downstream analyses with a different number of PCs (10, 15, or even 50!). As you will observe, the results often do not differ dramatically. +* We advise users to err on the higher side when choosing this parameter. For example, performing downstream analyses with only 5 PCs does significantly and adversely affect results. + +*** + +# Cluster the cells + +Seurat v3 applies a graph-based clustering approach, building upon initial strategies in ([Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8)). Importantly, the *distance metric* which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partitioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [[SNN-Cliq, Xu and Su, Bioinformatics, 2015]](http://bioinformatics.oxfordjournals.org/content/early/2015/02/10/bioinformatics.btv088.abstract) and CyTOF data [[PhenoGraph, Levine *et al*., Cell, 2015]](http://www.ncbi.nlm.nih.gov/pubmed/26095251). Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected 'quasi-cliques' or 'communities'. + +As in PhenoGraph, we first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the `FindNeighbors()` function, and takes as input the previously defined dimensionality of the dataset (first 10 PCs). + +To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [[SLM, Blondel *et al*., Journal of Statistical Mechanics]](http://dx.doi.org/10.1088/1742-5468/2008/10/P10008), to iteratively group cells together, with the goal of optimizing the standard modularity function. The `FindClusters()` function implements this procedure, and contains a resolution parameter that sets the 'granularity' of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters can be found using the `Idents()` function. + + +```{r cluster, fig.height=5, fig.width=7} +pbmc <- FindNeighbors(pbmc, dims = 1:10) +pbmc <- FindClusters(pbmc, resolution = 0.5) + +# Look at cluster IDs of the first 5 cells +head(Idents(pbmc), 5) +``` + +*** + +# Run non-linear dimensional reduction (UMAP/tSNE) + +Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP, to visualize and explore these datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. As input to the UMAP and tSNE, we suggest using the same PCs as input to the clustering analysis. + +```{r tsne, fig.height=5, fig.width=7} +# If you haven't installed UMAP, you can do so via reticulate::py_install(packages = "umap-learn") +pbmc <- RunUMAP(pbmc, dims = 1:10) +``` + +```{r tsneplot, fig.height=5, fig.width=7} +# note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters +DimPlot(pbmc, reduction = 'umap') +``` + +You can save the object at this point so that it can easily be loaded back in without having to rerun the computationally intensive steps performed above, or easily shared with collaborators. + +```{r saveobject, eval=FALSE} +saveRDS(pbmc, file = "../output/pbmc_tutorial.rds") +``` + +*** + +# Finding differentially expressed features (cluster biomarkers) + +Seurat can help you find markers that define clusters via differential expression. By default, it identifies positive and negative markers of a single cluster (specified in `ident.1`), compared to all other cells. `FindAllMarkers()` automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells. + +The `min.pct` argument requires a feature to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a feature to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of features that are unlikely to be highly discriminatory. As another option to speed up these computations, `max.cells.per.ident` can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significant and the most highly differentially expressed features will likely still rise to the top. + +```{r markers1, fig.height=8, fig.width=15} +# find all markers of cluster 2 +cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25) +head(cluster2.markers, n = 5) +# find all markers distinguishing cluster 5 from clusters 0 and 3 +cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25) +head(cluster5.markers, n = 5) +# find markers for every cluster compared to all remaining cells, report only the positive ones +pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) +pbmc.markers %>% group_by(cluster) %>% slice_max(n = 2, order_by = avg_log2FC) +``` + +Seurat has several tests for differential expression which can be set with the test.use parameter (see our [DE vignette](de_vignette.html) for details). For example, the ROC test returns the 'classification power' for any individual marker (ranging from 0 - random, to 1 - perfect). + +```{r markersroc, fig.height=8, fig.width=15} +cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) +``` + +We include several tools for visualizing marker expression. `VlnPlot()` (shows expression probability distributions across clusters), and `FeaturePlot()` (visualizes feature expression on a tSNE or PCA plot) are our most commonly used visualizations. We also suggest exploring `RidgePlot()`, `CellScatter()`, and `DotPlot()` as additional methods to view your dataset. + +```{r markerplots, fig.height=10, fig.width=15} +VlnPlot(pbmc, features = c("MS4A1", "CD79A")) +# you can plot raw counts as well +VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = 'counts', log = TRUE) +FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) +``` + +`DoHeatmap()` generates an expression heatmap for given cells and features. In this case, we are plotting the top 20 markers (or all markers if less than 20) for each cluster. + +```{r clusterHeatmap, fig.height=8, fig.width=15} +pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) -> top10 + +# DoHeatmap doesn't automatically use the correct slot/layer anymore. need to figure out to make this occur automatically. +DefaultLayer(pbmc[["RNA"]]) <- "scale.data" +DoHeatmap(pbmc, features = top10$gene) + NoLegend() +``` + +*** +# Assigning cell type identity to clusters + +Fortunately in the case of this dataset, we can use canonical markers to easily match the unbiased clustering to known cell types: + +Cluster ID | Markers | Cell Type +-----------|---------------|---------- +0 | IL7R, CCR7 | Naive CD4+ T +1 | CD14, LYZ | CD14+ Mono +2 | IL7R, S100A4 | Memory CD4+ +3 | MS4A1 | B +4 | CD8A | CD8+ T +5 | FCGR3A, MS4A7 | FCGR3A+ Mono +6 | GNLY, NKG7 | NK +7 | FCER1A, CST3 | DC +8 | PPBP | Platelet + + +```{r labelplot, fig.height=5, fig.width=9} +new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet") +names(new.cluster.ids) <- levels(pbmc) +pbmc <- RenameIdents(pbmc, new.cluster.ids) +DimPlot(pbmc, reduction = 'umap', label = TRUE, pt.size = 0.5) + NoLegend() +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- DimPlot(pbmc, reduction = "umap", label = TRUE, label.size = 4.5) + xlab("UMAP 1") + ylab("UMAP 2") + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + + guides(colour = guide_legend(override.aes = list(size = 10))) +ggsave(filename = "../output/images/pbmc3k_umap.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +```{r save.rds, eval=FALSE} +saveRDS(pbmc, file = "../output/pbmc3k_final.rds") +``` + +```{r save2, include=TRUE} +saveRDS(pbmc, file = "../data/pbmc3k_final.rds") +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_pbmc3k_tutorial_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_sctransform_integration.Rmd b/vignettes/seurat5_sctransform_integration.Rmd new file mode 100644 index 000000000..1cfb38050 --- /dev/null +++ b/vignettes/seurat5_sctransform_integration.Rmd @@ -0,0 +1,90 @@ +--- +title: 'Integration of SCTransform normalized datasets' +output: + html_document: + theme: united + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + fig.width = 10, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` +## Setup the Seurat objects + + + +```{r data} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(patchwork) +``` + + +```{r installdata, eval=FALSE} +# install dataset +InstallData('pbmcsca') +``` + +```{r init, results='hide', message=FALSE, fig.keep='none'} +# load dataset +pbmcsca <- LoadData("pbmcsca") +pbmcsca <- UpdateSeuratObject(object = pbmcsca) +pbmcsca[["RNA"]] <- as(pbmcsca[["RNA"]], Class = "Assay5") + +# split the dataset into layers +pbmcsca[["RNA"]] <- split(pbmcsca[["RNA"]], f = pbmcsca$Method) +``` + +## Run SCTransform + +```{r} +pbmcsca <- SCTransform(pbmcsca) +pbmcsca <- RunPCA(pbmcsca, npcs = 30, verbose = FALSE) +``` + +## Perform integration + +We then integrate all the layers using the `IntegrateLayers()` function. + +```{r} +pbmcsca <- IntegrateLayers(object = pbmcsca, + method = RPCAIntegration, + normalization.method="SCT", + verbose = F) +``` + + +```{r} +pbmcsca <- FindNeighbors(pbmcsca, dims = 1:30) +pbmcsca <- FindClusters(pbmcsca, resolution = 2) +pbmcsca <- RunUMAP(pbmcsca, dims = 1:30) +``` + +```{r viz, results='hide', message=FALSE} +# Visualization +p1 <- DimPlot(pbmcsca, reduction = "umap", group.by = "Method") +p2 <- DimPlot(pbmcsca, reduction = "umap", group.by = "CellType", label = TRUE, repel = TRUE) +p1 + p2 +``` diff --git a/vignettes/seurat5_sctransform_v2_vignette.Rmd b/vignettes/seurat5_sctransform_v2_vignette.Rmd new file mode 100644 index 000000000..21cd4d43b --- /dev/null +++ b/vignettes/seurat5_sctransform_v2_vignette.Rmd @@ -0,0 +1,231 @@ +--- +title: 'Introduction to SCTransform, v2 regularization' +output: + html_document: + theme: united + pdf_document: default +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + fig.width = 10, + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) + +``` + +## TL;DR + +We recently introduced [sctransform](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) to perform normalization and variance stabilization of scRNA-seq datasets. We now release an updated version ('v2'), based on [our broad analysis](https://www.biorxiv.org/content/10.1101/2021.07.07.451498v1) of 59 scRNA-seq datasets spanning a range of technologies, systems, and sequencing depths. This update improves speed and memory consumption, the stability of parameter estimates, the identification of variable features, and the the ability to perform downstream differential expression analyses. + +Users can install sctransform v2 from CRAN (sctransform v0.3.3) and invoke the use of the updated method via the `vst.flavor` argument (This is the default in SeuratV5). + +```{r tldr, eval=FALSE} +# install sctransform >= 0.3.3 +install.packages("sctransform") +# invoke sctransform - requires Seurat>=4.1 +object <- SCTransform(object, vst.flavor = "v2") +``` + +## Introduction + +Heterogeneity in single-cell RNA-seq (scRNA-seq) data is driven by multiple sources, including biological variation in cellular state as well as technical variation introduced during experimental processing. In [Choudhary and Satija, 2021](https://www.biorxiv.org/content/10.1101/2021.07.07.451498v1) we provide a set of recommendations for modeling variation in scRNA-seq data, particularly when using generalized linear models or likelihood-based approaches for preprocessing and downstream analysis. + +In this vignette, we use [sctransform v2](https://github.com/satijalab/sctransform/) based workflow to perform a comparative analysis of human immune cells (PBMC) in either a [resting or interferon-stimulated state](https://www.nature.com/articles/nbt.4042). In this vignette we apply sctransform-v2 based normalization to perform the following tasks: + +* Create an 'integrated' data assay for downstream analysis +* Compare the datasets to find cell-type specific responses to stimulation +* Obtain cell type markers that are conserved in both control and stimulated cells + +## Install dependencies + +We will install the [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) package which substantially improves the speed of the learning procedure. + +```{r results='hide', message=FALSE, warning=FALSE, eval=FALSE} +# install glmGamPoi +if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") +BiocManager::install("glmGamPoi") +``` + +## Setup the Seurat objects + + +```{r data} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(patchwork) +library(dplyr) +library(ggplot2) +``` +The dataset is available through our [SeuratData](https://github.com/satijalab/seurat-data) package. + +```{r installdata, eval=FALSE} +# install dataset +InstallData("ifnb") +``` + +```{r init, results='hide', message=FALSE, fig.keep='none'} +# load dataset +ifnb <- LoadData("ifnb") +ifnb <- UpdateSeuratObject(object = ifnb) +ifnb[["RNA"]] <- as(ifnb[["RNA"]], Class = "Assay5") + +# split the dataset into a list of two seurat objects (stim and CTRL) +ifnb.list <- SplitObject(ifnb, split.by = "stim") + +ctrl <- ifnb.list[["CTRL"]] +stim <- ifnb.list[["STIM"]] +``` + +## Perform normalization and dimensionality reduction + +To perform normalization, we invoke `SCTransform` with an additional flag `vst.flavor="v2"` to invoke +the v2 regularization. This provides some improvements over our original approach first introduced in [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1). + +* We fix the slope parameter of the GLM to $\ln(10)$ with $\log_{10}(\text{total UMI})$ used as the predictor as proposed by [Lause et al.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02451-7) +* We utilize an improved parameter estimation procedure that alleviates uncertainty and bias that result from fitting GLM models for very lowly expressed genes. +* We place a lower bound on gene-level standard deviation when calculating Pearson residuals. This prevents genes with extremely low expression (only 1-2 detected UMIs) from having a high pearson residual. + + +```{r ctrldimreduc, fig.width=10, fig.height=4} +# normalize and run dimensionality reduction on control dataset +ctrl <- SCTransform(ctrl, vst.flavor = "v2", verbose = FALSE) %>% + RunPCA(npcs = 30, verbose = FALSE) %>% + RunUMAP(reduction = "pca", dims = 1:30, verbose = FALSE) %>% + FindNeighbors(reduction = "pca", dims = 1:30, verbose = FALSE) %>% + FindClusters(resolution = 0.7, verbose = FALSE) + +p1 <- DimPlot(ctrl, label = T, repel = T) + ggtitle("Unsupervised clustering") +p2 <- DimPlot(ctrl, label = T, repel = T, group.by = "seurat_annotations") + ggtitle("Annotated celltypes") + +p1 | p2 +``` + +## Perform integration using pearson residuals + +To perform integration using the pearson residuals calculated above, we use the `PrepSCTIntegration()` function after selecting a list of informative features using `SelectIntegrationFeatures()`: + +```{r prepinteg} +stim <- SCTransform(stim, vst.flavor = "v2", verbose = FALSE) %>% RunPCA(npcs = 30, verbose = FALSE) +ifnb.list <- list(ctrl = ctrl, stim = stim) +features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000) +ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features) +``` + +To integrate the two datasets, we use the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with `IntegrateData()`. + +```{r ifnb.cca.sct.anchors} +immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, + normalization.method = "SCT", anchor.features = features) +immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = "SCT") +``` + +## Perform an integrated analysis + +Now we can run a single integrated analysis on all cells: + +```{r ifnb.cca.sct.clustering, results='hide', message=FALSE} +immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE) +immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30, verbose = FALSE) +immune.combined.sct <- FindNeighbors(immune.combined.sct, reduction = "pca", dims = 1:30) +immune.combined.sct <- FindClusters(immune.combined.sct, resolution = 0.3) +``` + +To visualize the two conditions side-by-side, we can use the `split.by` argument to show each condition colored by cluster. + +```{r split.dim} +DimPlot(immune.combined.sct, reduction = "umap", split.by = "stim") +``` + +We can also visualize the distribution of annotated celltypes across control and stimulated datasets: + +```{r immunesca.cca.sct.split.dims, fig.width=13, fig.height=4} +p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim") +p2 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "seurat_clusters", label = TRUE, repel = TRUE) +p3 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "seurat_annotations", label = TRUE, repel = TRUE) +p1 | p2 | p3 +``` + + +## Identify differential expressed genes across conditions + +Using the normalized datasets with known celltype annotation, we can ask what genes change in different conditions for cells of the same type. First, we create a column in the meta.data slot to hold both the cell type and stimulation information and switch the current ident to that column. + +```{r de.genes} +immune.combined.sct$celltype.stim <- paste(immune.combined.sct$seurat_annotations, + immune.combined.sct$stim, sep = "_") +Idents(immune.combined.sct) <- "celltype.stim" +``` + +To run differential expression, we make use of 'corrected counts' that are stored in the `data` slot of the the `SCT` assay. Corrected counts are obtained by setting the sequencing depth for all the cells to a fixed value and reversing the learned regularized negative-binomial regression model. Prior to performing differential expression, we first run `PrepSCTFindMarkers`, which ensures that the fixed value is set properly. Then we use `FindMarkers(assay="SCT")` to find differentially expressed genes. Here, we aim to identify genes that are differently expressed between stimulated and control B cells. + +```{r runde} +immune.combined.sct <- PrepSCTFindMarkers(immune.combined.sct) + +b.interferon.response <- FindMarkers(immune.combined.sct, assay = "SCT", + ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) +head(b.interferon.response, n = 15) +``` + +If running on a subset of the original object after running `PrepSCTFindMarkers()`, `FindMarkers()` should be invoked with `recorrect_umi = FALSE` to use the existing corrected counts: + +```{r runde2} +immune.combined.sct.subset <- subset(immune.combined.sct, idents = c("B_STIM", "B_CTRL")) +b.interferon.response.subset <- FindMarkers(immune.combined.sct.subset, assay = "SCT", + ident.1 = "B_STIM", ident.2 = "B_CTRL", + verbose = FALSE, recorrect_umi = FALSE) +``` + +We can also use the corrected counts for visualization: + +```{r feature.heatmaps, fig.height = 14} +Idents(immune.combined.sct) <- "seurat_annotations" +DefaultAssay(immune.combined.sct) <- "SCT" +FeaturePlot(immune.combined.sct, features = c("CD3D", "GNLY", "IFI6"), + split.by = "stim", max.cutoff = 3, cols = c("grey", "red")) +``` + +```{r splitvln, fig.height = 12} +plots <- VlnPlot(immune.combined.sct, features = c("LYZ", "ISG15", "CXCL10"), + split.by = "stim", group.by = "seurat_annotations", pt.size = 0, combine = FALSE) +wrap_plots(plots = plots, ncol = 1) +``` + +### Identify conserved cell type markers + +To identify canonical cell type marker genes that are conserved across conditions, we provide the `FindConservedMarkers()` function. This function performs differential gene expression testing for each dataset/group and combines the p-values using meta-analysis methods from the MetaDE R package. For example, we can identify genes that are conserved markers irrespective of stimulation condition in NK cells. Note that the `PrepSCTFindMarkers` command does not to be rerun here. + +```{r conserved.markers, warning=FALSE} +nk.markers <- FindConservedMarkers(immune.combined.sct, assay = "SCT", ident.1 = "NK", grouping.var = "stim", verbose = FALSE) +head(nk.markers) +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_sctransform2.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_sctransform_vignette.Rmd b/vignettes/seurat5_sctransform_vignette.Rmd new file mode 100644 index 000000000..1fd6af556 --- /dev/null +++ b/vignettes/seurat5_sctransform_vignette.Rmd @@ -0,0 +1,156 @@ +--- +title: "Using sctransform in Seurat" +author: Christoph Hafemeister & Rahul Satija +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +Biological heterogeneity in single-cell RNA-seq data is often confounded by technical factors including sequencing depth. The number of molecules detected in each cell can vary significantly between cells, even within the same celltype. +Interpretation of scRNA-seq data requires effective pre-processing and normalization to remove this technical variability. +In [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) we introduce a modeling framework for the normalization and variance stabilization of molecular count data from scRNA-seq experiment. +This procedure omits the need for heuristic steps including pseudocount addition or log-transformation and improves common downstream analytical tasks such as variable gene selection, dimensional reduction, and differential expression. + +In this vignette, we demonstrate how using [sctransform](https://github.com/ChristophH/sctransform/) based normalization enables recovering sharper biological distinction compared to log-normalization. + +```{r packages} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(ggplot2) +library(sctransform) +``` + +Load data and create Seurat object + +```{r load_data, warning=FALSE, message=FALSE} +pbmc_data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc <- CreateSeuratObject(counts = pbmc_data) +``` + +Apply sctransform normalization + + * Note that this single command replaces `NormalizeData()`, `ScaleData()`, and `FindVariableFeatures()`. + * Transformed data will be available in the SCT assay, which is set as the default after running sctransform + * During normalization, we can also remove confounding sources of variation, for example, mitochondrial mapping percentage + +```{r apply_sct, warning=FALSE, message=FALSE} +# store mitochondrial percentage in object meta data +pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") + +# run sctransform +pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) +``` + +The latest version of `sctransform` also supports using [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) +package which substantially improves the speed of the learning procedure. It can be invoked by specifying +`method="glmGamPoi"`. + +```{r eval=FALSE} +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("glmGamPoi") +pbmc <- SCTransform(pbmc, method="glmGamPoi", vars.to.regress = "percent.mt", verbose = FALSE) +``` + +Perform dimensionality reduction by PCA and UMAP embedding +```{r pca, fig.width=5, fig.height=5} +# These are now standard steps in the Seurat workflow for visualization and clustering +pbmc <- RunPCA(pbmc, verbose = FALSE) +pbmc <- RunUMAP(pbmc, dims = 1:30, verbose = FALSE) + +pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = FALSE) +pbmc <- FindClusters(pbmc, verbose = FALSE) +DimPlot(pbmc, label = TRUE) + NoLegend() +``` + +
    + **Why can we choose more PCs when using sctransform?** + +In the [standard Seurat workflow](pbmc3k_tutorial.html) we focus on 10 PCs for this dataset, though we highlight that the results are similar with higher settings for this parameter. Interestingly, we've found that when using sctransform, we often benefit by pushing this parameter even higher. We believe this is because the sctransform workflow performs more effective normalization, strongly removing technical effects from the data. + +Even after standard log-normalization, variation in sequencing depth is still a confounding factor (see [Figure 1](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1)), and this effect can subtly influence higher PCs. In sctransform, this effect is substantially mitigated (see [Figure 3](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1)). This means that higher PCs are more likely to represent subtle, but biologically relevant, sources of heterogeneity -- so including them may improve downstream analysis. + +In addition, sctransform returns 3,000 variable features by default, instead of 2,000. The rationale is similar, the additional variable features are less likely to be driven by technical differences across cells, and instead may represent more subtle biological fluctuations. In general, we find that results produced with sctransform are less dependent on these parameters (indeed, we achieve nearly identical results when using all genes in the transcriptome, though this does reduce computational efficiency). This can help users generate more robust results, and in addition, enables the application of standard analysis pipelines with identical parameter settings that can quickly be applied to new datasets: + +For example, the following code replicates the full end-to-end workflow, in a single command: + +```{r oneliner, eval=FALSE} +pbmc <- CreateSeuratObject(pbmc_data) %>% PercentageFeatureSet(pattern = "^MT-",col.name = 'percent.mt') %>% SCTransform(vars.to.regress = 'percent.mt') %>% + RunPCA() %>% FindNeighbors(dims = 1:30) %>% RunUMAP(dims = 1:30) %>% FindClusters() + +``` + +
    + +
    + **Where are normalized values stored for sctransform?** + +As described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), sctransform calculates a model of technical noise in scRNA-seq data using 'regularized negative binomial regression'. The residuals for this model are normalized values, and can be positive or negative. Positive residuals for a given gene in a given cell indicate that we observed more UMIs than expected given the gene’s average expression in the population and cellular sequencing depth, while negative residuals indicate the converse. + +The results of sctransfrom are stored in the "SCT" assay. You can learn more about multi-assay data and commands in Seurat in our [vignette](multimodal_vignette.html), [command cheat sheet](essential_commands.html#multi-assay-features), or [developer guide](https://github.com/satijalab/seurat/wiki/Assay). + +* `pbmc[["SCT"]]@scale.data` contains the residuals (normalized values), and is used directly as input to PCA. Please note that this matrix is non-sparse, and can therefore take up a lot of memory if stored for all genes. To save memory, we store these values only for variable genes, by setting the return.only.var.genes = TRUE by default in the `SCTransform()` function call. +* To assist with visualization and interpretation. we also convert Pearson residuals back to ‘corrected’ UMI counts. You can interpret these as the UMI counts we would expect to observe if all cells were sequenced to the same depth. If you want to see exactly how we do this, please look at the correct function [here](https://github.com/ChristophH/sctransform/blob/master/R/denoise.R). +* The 'corrected' UMI counts are stored in `pbmc[["SCT"]]@counts`. We store log-normalized versions of these corrected counts in `pbmc[["SCT"]]@data`, which are very helpful for visualization. +* You can use the corrected log-normalized counts for differential expression and integration. However, in principle, it would be most optimal to perform these calculations directly on the residuals (stored in the `scale.data` slot) themselves. This is not currently supported in Seurat v3, but will be soon. + +------ +
    +\ + + +Users can individually annotate clusters based on canonical markers. However, the sctransform normalization reveals sharper biological distinctions compared to the [standard Seurat workflow](pbmc3k_tutorial.html), in a few ways: + + * Clear separation of at least 3 CD8 T cell populations (naive, memory, effector), based on CD8A, GZMK, CCL5, GZMK expression + * Clear separation of three CD4 T cell populations (naive, memory, IFN-activated) based on S100A4, CCR7, IL32, and ISG15 + * Additional developmental sub-structure in B cell cluster, based on TCL1A, FCER2 + * Additional separation of NK cells into CD56dim vs. bright clusters, based on XCL1 and FCGR3A + + +```{r fplot, fig.width = 10, fig.height=6} +# These are now standard steps in the Seurat workflow for visualization and clustering +# Visualize canonical marker genes as violin plots. +VlnPlot(pbmc, features = c("CD8A", "GZMK", "CCL5", "S100A4", "ANXA1", "CCR7", "ISG15", "CD3D"), pt.size = 0.2, ncol = 4) + +# Visualize canonical marker genes on the sctransform embedding. +FeaturePlot(pbmc, features = c("CD8A", "GZMK", "CCL5", "S100A4", "ANXA1", "CCR7"), pt.size = 0.2, ncol = 3) +FeaturePlot(pbmc, features = c("CD3D", "ISG15", "TCL1A", "FCER2", "XCL1", "FCGR3A"), pt.size = 0.2, ncol = 3) +``` + + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_sctransform_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_sketch_analysis.Rmd b/vignettes/seurat5_sketch_analysis.Rmd new file mode 100644 index 000000000..ccbdfefe6 --- /dev/null +++ b/vignettes/seurat5_sketch_analysis.Rmd @@ -0,0 +1,214 @@ +--- +title: "Sketch-based analysis in Seurat v5" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = 'styler', + message = FALSE, + warning = FALSE, + fig.width = 10, + time_it = TRUE, + error = TRUE +) +``` + +## Intro: Sketch-based analysis in Seurat v5 +As single-cell sequencing technologies continue to improve in scalability in throughput, the generation of datasets spanning a million or more cells is becoming increasingly routine. In Seurat v5, we introduce new infrastructure and methods to analyze, interpret, and explore these exciting datasets. + +In this vignette, we introduce a sketch-based analysis workflow to analyze a 1.3 million cell dataset of the developing mouse brain, freely available from 10x Genomics. Analyzing datasets of this size with standard workflows can be challenging, slow, and memory-intensive. Here we introduce an alternative workflow that is highly scalable, even to datasets ranging beyond 10 million cells in size. +Our 'sketch-based' workflow involves three new features in Seurat v5: + +* Infrastructure for on-disk storage of large single-cell datasets + +Storing expression matrices in memory can be challenging for extremely large scRNA-seq datasets. In Seurat v5, we introduce support for multiple on-disk storage formats. + +* 'Sketching' methods to subsample cells from large datasets while preserving rare populations + +As introduced in [Hie et al, 2019](https://www.sciencedirect.com/science/article/pii/S2405471219301528), cell sketching methods aim to compactly summarize large single-cell datasets in a small number of cells, while preserving the presence of both abundant and rare cell types. In Seurat v5, we leverage this idea to select subsamples ('sketches') of cells from large datasets that are stored on-disk. However, after sketching, the subsampled cells can be stored in-memory, allowing for interactive and rapid visualization and exploration. +We store sketched cells (in-memory) and the full dataset (on-disk) as two assays in the same Seurat object. Users can then easily switch between the two versions, providing the flexibility to perform quick analyses on a subset of cells in-memory, while retaining access to the full dataset on-disk. + +* Support for 'bit-packing' compression and infrastructure + +We demonstrate the on-disk capabilities in Seurat v5 using the [BPCells package](https://github.com/bnprks/BPCells) developed by Ben Parks in the Greenleaf Lab. This package utilizes bit-packing compression and optimized, streaming-compatible C++ code to substantially improve I/O and computational performance when working with on-disk data. +To run this vignette please install Seurat v5, using the installation instructions found [here](install.html). Additionally, you will need to install the `BPcells` package, using the installation instructions found [here](https://bnprks.github.io/BPCells/#installation). + +```{r, warning=FALSE, message=FALSE} +library(Seurat) +library(BPCells) +library(ggplot2) +# needs to be set for large dataset analysis +options(future.globals.maxSize = 1e9) +``` + +## Create a Seurat object with a v5 assay for on-disk storage + +We start by loading the 1.3M dataset from 10x Genomics using the `open_matrix_dir` function from `BPCells`. Note that this function does not load the dataset into memory, but instead, creates a connection to the data stored on-disk. We then store this on-disk representation in the Seurat object. Note that in our [Introduction to on-disk storage vignette](seurat5_bpcells_interaction_vignette.html), we demonstrate how to create this on-disk representation. + +```{r load.obj} +# Read the Seurat object, which contains 1.3M cells stored on-disk as part of the 'RNA' assay +obj <- readRDS("/brahms/hartmana/vignette_data/1p3_million_mouse_brain.rds") +obj + +# Note that since the data is stored on-disk, the object size easily fits in-memory (<1GB) +format(object.size(obj), units = 'Mb') +``` + +## 'Sketch' a subset of cells, and load these into memory +We select a subset ('sketch') of 50,000 cells (out of 1.3M). Rather than sampling all cells with uniform probability, we compute and sample based off a 'leverage score' for each cell, which reflects the magnitude of its contribution to the gene-covariance matrix, and its importance to the overall dataset. In [Hao et al, 2022](https://www.biorxiv.org/content/10.1101/2022.02.24.481684v1.full), we demonstrate that the leverage score is highest for rare populations in a dataset. Therefore, our sketched set of 50,000 cells will oversample rare populations, retaining the biological complexity of the sample while drastically compressing the dataset. + +The function `SketchData` takes a normalized single-cell dataset (stored either on-disk or in-memory), and a set of variable features. It returns a Seurat object with a new assay (`sketch`), consisting of 50,000 cells, but these cells are now stored in-memory. Users can now easily switch between the in-memory and on-disk representation just by changing the default assay. + +```{r, warning=FALSE, message=FALSE} +obj <- NormalizeData(obj) +obj <- FindVariableFeatures(obj) +obj <- SketchData( + object = obj, + ncells = 50000, + method = 'LeverageScore', + sketched.assay = 'sketch') +obj +# switch to analyzing the full dataset (on-disk) +DefaultAssay(obj) <- 'RNA' +# switch to analyzing the sketched dataset (in-memory) +DefaultAssay(obj) <- 'sketch' +``` + +## Perform clustering on the sketched dataset + +Now that we have compressed the dataset, we can perform standard clustering and visualization of a 50,000 cell dataset. +After clustering, we can see groups of cells that clearly correspond to precursors of distinct lineages, including endothelial cells (Igfbp7), Excitatory (Neurod6) and Inhibitory (Dlx2) neurons, Intermediate Progenitors (Eomes), Radial Glia (Vim), Cajal-Retzius cells (Reln), Oligodendroytes (Olig1), and extremely rare populations of macrophages (C1qa) that were oversampled in our sketched data. + +```{r, warning=FALSE, message=FALSE, fig.width=5, fig.height=5} +DefaultAssay(obj) <- 'sketch' +obj <- FindVariableFeatures(obj) +obj <- ScaleData(obj) +obj <- RunPCA(obj) +obj <- FindNeighbors(obj, dims = 1:50) +obj <- FindClusters(obj, resolution = 2) +obj <- RunUMAP(obj, dims = 1:50, return.model = T) +DimPlot(obj, label = T, label.size = 3, reduction = 'umap') + NoLegend() +``` + +```{r,fig.height = 7, fig.width = 10} +FeaturePlot( + object = obj, + features = c( + 'Igfbp7', 'Neurod6', 'Dlx2', 'Gad2', + 'Eomes', 'Vim', 'Reln', 'Olig1', 'C1qa'), + ncol = 3) +``` + +## Extend results to the full datasets +We can now extend the cluster labels and dimensional reductions learned on the sketched cells to the full dataset. The `ProjectData` function projects the on-disk data, onto the `sketch` assay. It returns a Seurat object that includes a + +* Dimensional reduction (PCA): The `pca.full` dimensional reduction extends the `pca` reduction on the sketched cells to all cells in the dataset +* Dimensional reduction (UMAP): The `full.umap` dimensional reduction extends the `umap` reduction on the sketched cells to all cells in the dataset +* Cluster labels: The `cluster_full` column in the object metadata now labels all cells in the dataset with one of the cluster labels derived from the sketched cells + +```{r, warning=FALSE, message=FALSE} +obj <- ProjectData( + object = obj, + assay = 'RNA', + full.reduction = 'pca.full', + sketched.assay = 'sketch', + sketched.reduction = 'pca', + umap.model = 'umap', + dims = 1:50, + refdata = list(cluster_full = 'seurat_clusters')) +# now that we have projected the full dataset, switch back to analyzing all cells +DefaultAssay(obj) <- 'RNA' +``` + +```{r save.img, include = FALSE, eval = FALSE} +p <- DimPlot(obj, label = T, label.size = 3, reduction = "full.umap", group.by = "cluster_full", alpha = 0.1) + NoLegend() +ggsave(filename = "../output/images/MouseBrain_sketch_clustering.jpg", height = 7, width = 7, plot = p, quality = 50) +``` + +```{r, fig.width=5, fig.height=5} +DimPlot(obj, label = T, label.size = 3, reduction = 'full.umap', group.by = 'cluster_full', alpha = 0.1) + NoLegend() +``` + +```{r, fig.width=10, fig.height=5} +# visualize gene expression on the sketched cells (fast) and the full dataset (slower) +DefaultAssay(obj) <- 'sketch' +x1 <- FeaturePlot(obj, 'C1qa') +DefaultAssay(obj) <- 'RNA' +x2 <- FeaturePlot(obj, 'C1qa') +x1 | x2 +``` + +## Perform iterative sub-clustering + +Now that we have performed an initial analysis of the dataset, we can iteratively 'zoom-in' on a cell subtype of interest, extract all cells of this type, and perform iterative sub-clustering. For example, we can see that Dlx2+ interneuron precursors are defined by clusters 2, 15, 18, 28 and 40. + +```{r} +DefaultAssay(obj) <- 'sketch' +VlnPlot(obj, 'Dlx2') +``` + +We therefore extract all cells from the full on-disk dataset that are present in these clusters. There are 200,892 of them. Since this is a manageable number, we can convert these data from on-disk storage into in-memory storage. We can then proceed with standard clustering. + +```{r} +# subset cells in these clusters. Note that the data remains on-disk after subsetting +obj.sub <- subset(obj, subset = cluster_full %in% c(2, 15, 18, 28, 40)) +DefaultAssay(obj.sub) <- 'RNA' + +# now convert the RNA assay (previously on-disk) into an in-memory representation (sparse Matrix) +# we only convert the data layer, and keep the counts on-disk +obj.sub[['RNA']]$data <- as(obj.sub[['RNA']]$data, Class = 'dgCMatrix') + +# recluster the cells +obj.sub <- FindVariableFeatures(obj.sub) +obj.sub <- ScaleData(obj.sub) +obj.sub <- RunPCA(obj.sub) +obj.sub <- RunUMAP(obj.sub, dims = 1:30) +obj.sub <- FindNeighbors(obj.sub, dims = 1:30) +obj.sub <- FindClusters(obj.sub) +``` + +```{r, fig.width=5, fig.height=5} +DimPlot(obj.sub, label = T, label.size = 3) + NoLegend() +``` + +Note that we can start to see distinct interneuron lineages emerging in this dataset. We can see a clear separation of interneuron precursors that originated from the medial ganglionic eminence (Lhx6) or caudal ganglionic eminence (Nr2f2). We can further see the emergence of Sst (Sst) and Pvalb (Mef2c)-committed interneurons, and a CGE-derived Meis2-expressing progenitor population. +These results closely mirror our findings from [Mayer*, Hafemeister*, Bandler* et al, Nature 2018](https://www.nature.com/articles/nature25999), where we enriched for interneuron precursors using a Dlx6a-cre fate-mapping strategy. Here, we obtain similar results using only computational enrichment, enabled by the large size of the original dataset. + +```{r,fig.height = 7, fig.width = 10} +FeaturePlot( + object = obj.sub, + features = c( + 'Dlx2', 'Gad2', 'Lhx6', 'Nr2f2', 'Sst', + 'Mef2c', 'Meis2', 'Id2', 'Dlx6os1'), + ncol = 3) +``` + +```{r save.times, include=FALSE, eval=FALSE} +print(as.data.frame(all_times)) +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_sketch_analysis.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    \ No newline at end of file diff --git a/vignettes/seurat5_spatial_vignette.Rmd b/vignettes/seurat5_spatial_vignette.Rmd new file mode 100644 index 000000000..a0d88d5ab --- /dev/null +++ b/vignettes/seurat5_spatial_vignette.Rmd @@ -0,0 +1,524 @@ +--- +title: "Analysis, visualization, and integration of spatial datasets with Seurat" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + fig.width = 10, + time_it = TRUE, + error = TRUE +) +``` + +# Overview + +This tutorial demonstrates how to use Seurat (>=3.2) to analyze spatially-resolved RNA-seq data. While the analytical pipelines are similar to the Seurat workflow for [single-cell RNA-seq analysis](pbmc3k_tutorial.html), we introduce updated interaction and visualization tools, with a particular emphasis on the integration of spatial and molecular information. This tutorial will cover the following tasks, which we believe will be common for many spatial analyses: + +* Normalization +* Dimensional reduction and clustering +* Detecting spatially-variable features +* Interactive visualization +* Integration with single-cell RNA-seq data +* Working with multiple slices + +For our first vignette, we analyze a dataset generated with the [Visium technology](https://www.10xgenomics.com/spatial-transcriptomics/) from 10x Genomics. We will be extending Seurat to work with additional data types in the near-future, including [SLIDE-Seq](https://science.sciencemag.org/content/363/6434/1463), [STARmap](https://science.sciencemag.org/content/361/6400/eaat5691), and [MERFISH](https://science.sciencemag.org/content/362/6416/eaau5324). + +First, we load Seurat and the other packages necessary for this vignette. + +```{r install} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(ggplot2) +library(patchwork) +library(dplyr) +``` + +```{r libraries.for.rmd, echo = FALSE} +library("htmltools") +library("vembedr") +``` + + +# 10x Visium + +## Dataset + +Here, we will be using a recently released dataset of sagital mouse brain slices generated using the Visium v1 chemistry. There are two serial anterior sections, and two (matched) serial posterior sections. + +You can download the data [here](https://support.10xgenomics.com/spatial-gene-expression/datasets), and load it into Seurat using the `Load10X_Spatial()` function. This reads in the output of the [spaceranger](https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/what-is-space-ranger) pipeline, and returns a Seurat object that contains both the spot-level expression data along with the associated image of the tissue slice. You can also use our [SeuratData package](https://github.com/satijalab/seurat-data) for easy data access, as demonstrated below. After installing the dataset, you can type `?stxBrain` to learn more. + +```{r data.install, eval = FALSE} +InstallData("stxBrain") +``` + +```{r data} +brain <- LoadData('stxBrain', type = 'anterior1') +brain[['Spatial']] <- as(brain[['Spatial']], Class = 'Assay5') +``` + +
    + **How is the spatial data stored within Seurat? ** +The visium data from 10x consists of the following data types: + +* A spot by gene expression matrix +* An image of the tissue slice (obtained from H&E staining during data acquisition) +* Scaling factors that relate the original high resolution image to the lower resolution image used here for visualization. + +In the Seurat object, the spot by gene expression matrix is similar to a typical "RNA" `Assay` but contains spot level, not single-cell level data. The image itself is stored in a new `images` slot in the Seurat object. The `images` slot also stores the information necessary to associate spots with their physical position on the tissue image. +
    + +## Data preprocessing + +The initial preprocessing steps that we perform on the spot by gene expression data are similar to a typical scRNA-seq experiment. We first need to normalize the data in order to account for variance in sequencing depth across data points. We note that the variance in molecular counts / spot can be substantial for spatial datasets, particularly if there are differences in cell density across the tissue. We see substantial heterogeneity here, which requires effective normalization. + +```{r qc, fig.height=5} +plot1 <- VlnPlot(brain, features = 'nCount_Spatial', pt.size = 0.1) + NoLegend() +plot2 <- SpatialFeaturePlot(brain, features = 'nCount_Spatial') + theme(legend.position = "right") +wrap_plots(plot1, plot2) +``` + +These plots demonstrate that the variance in molecular counts across spots is not just technical in nature, but also is dependent on the tissue anatomy. For example, regions of the tissue that are depleted for neurons (such as the cortical white matter), reproducibly exhibit lower molecular counts. As a result, standard approaches (such as the `LogNormalize()` function), which force each data point to have the same underlying 'size' after normalization, can be problematic. + +As an alternative, we recommend using sctransform (Hafemeister and Satija, Genome Biology 2019), which which builds regularized negative binomial models of gene expression in order to account for technical artifacts while preserving biological variance. For more details on sctransform, please see the paper [here](https://doi.org/10.1186/s13059-019-1874-1) and the Seurat vignette [here](sctransform_vignette.html). sctransform normalizes the data, detects high-variance features, and stores the data in the `SCT` assay. + +```{r preprocess} +brain <- SCTransform(brain, assay = "Spatial", verbose = FALSE) +``` + +
    + **How do results compare to log-normalization?** +To explore the differences in normalization methods, we examine how both the sctransform and log normalization results correlate with the number of UMIs. For this comparison, we first rerun sctransform to store values for all genes and run a log-normalization procedure via `NormalizeData()`. + +```{r norm.test} +# rerun normalization to store sctransform residuals for all genes +brain <- SCTransform(brain, assay = "Spatial", return.only.var.genes = FALSE, verbose = FALSE) +# also run standard log normalization for comparison +brain <- NormalizeData(brain, verbose = FALSE, assay = "Spatial") +``` + +```{r norm.test2} +# Computes the correlation of the log normalized data and sctransform residuals with the number of UMIs +brain <- GroupCorrelation(brain, group.assay = "Spatial", assay = "Spatial", slot = "data", do.plot = FALSE) +brain <- GroupCorrelation(brain, group.assay = "Spatial", assay = "SCT", slot = "scale.data", do.plot = FALSE) +``` + +```{r norm.test3} +p1 <- GroupCorrelationPlot(brain, assay = "Spatial", cor = "nCount_Spatial_cor") + ggtitle("Log Normalization") + theme(plot.title = element_text(hjust = 0.5)) +p2 <- GroupCorrelationPlot(brain, assay = "SCT", cor = "nCount_Spatial_cor") + ggtitle("SCTransform Normalization") + theme(plot.title = element_text(hjust = 0.5)) +p1 + p2 +``` + +For the boxplots above, we calculate the correlation of each feature (gene) with the number of UMIs (the `nCount_Spatial` variable here). We then place genes into groups based on their mean expression, and generate boxplots of these correlations. You can see that log-normalization fails to adequately normalize genes in the first three groups, suggesting that technical factors continue to influence normalized expression estimates for highly expressed genes. In contrast, sctransform normalization substantially mitigates this effect. +
    + +## Gene expression visualization + +In Seurat, we have functionality to explore and interact with the inherently visual nature of spatial data. The `SpatialFeaturePlot()` function in Seurat extends `FeaturePlot()`, and can overlay molecular data on top of tissue histology. For example, in this data set of the mouse brain, the gene Hpca is a strong hippocampus marker and Ttr is a marker of the choroid plexus. + +```{r featureplot} +SpatialFeaturePlot(brain, features = c("Hpca", "Ttr")) +``` + + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- SpatialFeaturePlot(brain, features = c("Ttr")) + + theme(legend.text = element_text(size = 0), legend.title = element_text(size = 20), legend.key.size = unit(1, "cm")) +jpeg(filename = "../output/images/spatial_vignette_ttr.jpg", height = 700, width = 1200, quality = 50) +print(plot) +dev.off() +``` + +The default parameters in Seurat emphasize the visualization of molecular data. However, you can also adjust the size of the spots (and their transparency) to improve the visualization of the histology image, by changing the following parameters: + +* `pt.size.factor`- This will scale the size of the spots. Default is 1.6 +* `alpha` - minimum and maximum transparency. Default is c(1, 1). +* Try setting to `alpha` c(0.1, 1), to downweight the transparency of points with lower expression + +```{r fpe1} +p1 <- SpatialFeaturePlot(brain, features = "Ttr", pt.size.factor = 1) +p2 <- SpatialFeaturePlot(brain, features = "Ttr", alpha = c(0.1, 1)) +p1 + p2 +``` + +## Dimensionality reduction, clustering, and visualization + +We can then proceed to run dimensionality reduction and clustering on the RNA expression data, using the same workflow as we use for scRNA-seq analysis. + +```{r dim.cluster} +brain <- RunPCA(brain, assay = "SCT", verbose = FALSE) +brain <- FindNeighbors(brain, reduction = "pca", dims = 1:30) +brain <- FindClusters(brain, verbose = FALSE) +brain <- RunUMAP(brain, reduction = "pca", dims = 1:30) +``` + +We can then visualize the results of the clustering either in UMAP space (with `DimPlot()`) or overlaid on the image with `SpatialDimPlot()`. + +```{r dim.plots,fig.height=5} +p1 <- DimPlot(brain, reduction = "umap", label = TRUE) +p2 <- SpatialDimPlot(brain, label = TRUE, label.size = 3) +p1 + p2 +``` +As there are many colors, it can be challenging to visualize which voxel belongs to which cluster. We have a few strategies to help with this. Setting the `label` parameter places a colored box at the median of each cluster (see the plot above). + +You can also use the `cells.highlight` parameter to demarcate particular cells of interest on a `SpatialDimPlot()`. This can be very useful for distinguishing the spatial localization of individual clusters, as we show below: + +```{r facetdim} +SpatialDimPlot(brain, cells.highlight = CellsByIdentities(object = brain,idents = c(2, 1, 4, 3, 5, 8)), facet.highlight = TRUE, ncol = 3) +``` + +## Interactive plotting + +We have also built in a number of interactive plotting capabilities. Both `SpatialDimPlot()` and `SpatialFeaturePlot()` now have an `interactive` parameter, that when set to `TRUE`, will open up the Rstudio viewer pane with an interactive Shiny plot. The example below demonstrates an interactive `SpatialDimPlot()` in which you can hover over spots and view the cell name and current identity class (analogous to the previous `do.hover` behavior). + +```{r ispatialdimplot, eval = FALSE} +SpatialDimPlot(brain, interactive = TRUE) +``` + +```{r, echo = FALSE} +embed_url("https://youtu.be/E1aZjmG1neQ") +``` + +For `SpatialFeaturePlot()`, setting interactive to `TRUE` brings up an interactive pane in which you can adjust the transparency of the spots, the point size, as well as the `Assay` and feature being plotted. After exploring the data, selecting the done button will return the last active plot as a ggplot object. + +```{r ispatialfeatureplot, eval = FALSE} +SpatialFeaturePlot(brain, features = "Ttr", interactive = TRUE) +``` + +```{r, echo = FALSE} +embed_url("https://youtu.be/ILmb8XNlgEM") +``` + +The `LinkedDimPlot()` function links the UMAP representation to the tissue image representation and allows for interactive selection. For example, you can select a region in the UMAP plot and the corresponding spots in the image representation will be highlighted. + +```{r linkedplot, eval=FALSE} +LinkedDimPlot(brain) +``` + +```{r, echo = FALSE} +embed_url("https://youtu.be/10PZqjcSKrg") +``` + +## Identification of Spatially Variable Features + +Seurat offers two workflows to identify molecular features that correlate with spatial location within a tissue. The first is to perform differential expression based on pre-annotated anatomical regions within the tissue, which may be determined either from unsupervised clustering or prior knowledge. This strategy works will in this case, as the clusters above exhibit clear spatial restriction. + +```{r de, fig.height = 4} +de_markers <- FindMarkers(brain, ident.1 = 5, ident.2 = 6) +SpatialFeaturePlot(object = brain, features = rownames(de_markers)[1:3], alpha = c(0.1, 1), ncol = 3) +``` + +An alternative approach, implemented in `FindSpatiallyVariables()`, is to search for features exhibiting spatial patterning in the absence of pre-annotation. The default method (`method = 'markvariogram`), is inspired by the [Trendsceek](https://www.nature.com/articles/nmeth.4634), which models spatial transcriptomics data as a mark point process and computes a 'variogram', which identifies genes whose expression level is dependent on their spatial location. More specifically, this process calculates gamma(r) values measuring the dependence between two spots a certain "r" distance apart. By default, we use an r-value of '5' in these analyses, and only compute these values for variable genes (where variation is calculated independently of spatial location) to save time. + +We note that there are multiple methods in the literature to accomplish this task, including [SpatialDE](https://www.nature.com/articles/nmeth.4636), and [Splotch](https://www.biorxiv.org/content/10.1101/757096v1.article-metrics). We encourage interested users to explore these methods, and hope to add support for them in the near future. + +```{r spatial.vf} +brain <- FindSpatiallyVariableFeatures(brain, assay = 'SCT', features = VariableFeatures(brain)[1:1000], selection.method = 'moransi') +``` + +Now we visualize the expression of the top 6 features identified by this measure. +```{r spatial.vf.plot, fig.height=8} +top.features <- head(SpatiallyVariableFeatures(brain, selection.method = 'moransi'),6) +SpatialFeaturePlot(brain, features = top.features, ncol = 3, alpha = c(0.1, 1)) +``` + +## Subset out anatomical regions + +As with single-cell objects, you can subset the object to focus on a subset of data. Here, we approximately subset the frontal cortex. This process also facilitates the integration of these data with a cortical scRNA-seq dataset in the next section. First, we take a subset of clusters, and then further segment based on exact positions. After subsetting, we can visualize the cortical cells either on the full image, or a cropped image. + +```{r subset1} +cortex <- subset(brain, idents = c(1, 2, 3, 4, 6, 7)) +# now remove additional cells, use SpatialDimPlots to visualize what to remove +# SpatialDimPlot(cortex,cells.highlight = WhichCells(cortex, expression = image_imagerow > 400 | image_imagecol < 150)) +cortex <- subset(cortex, anterior1_imagerow > 400 | anterior1_imagecol < 150, invert = TRUE) +cortex <- subset(cortex, anterior1_imagerow > 275 & anterior1_imagecol > 370, invert = TRUE) +cortex <- subset(cortex, anterior1_imagerow > 250 & anterior1_imagecol > 440, invert = TRUE) +``` + +```{r subset1.plot, fig.height = 4} +p1 <- SpatialDimPlot(cortex, crop = TRUE, label = TRUE) +p2 <- SpatialDimPlot(cortex, crop = FALSE, label = TRUE, pt.size.factor = 1, label.size = 3) +p1 + p2 +``` + +## Integration with single-cell data + +At ~50um, spots from the visium assay will encompass the expression profiles of multiple cells. For the growing list of systems where scRNA-seq data is available, users may be interested to 'deconvolute' each of the spatial voxels to predict the underlying composition of cell types. In preparing this vignette, we tested a wide variety of decovonlution and integration methods, using a [reference scRNA-seq dataset](https://www.nature.com/articles/nn.4216) of ~14,000 adult mouse cortical cell taxonomy from the Allen Institute, generated with the SMART-Seq2 protocol. +We consistently found superior performance using integration methods (as opposed to deconvolution methods), likely because of substantially different noise models that characterize spatial and single-cell datasets, and integration methods are specifiically designed to be robust to these differences. We therefore apply the 'anchor'-based integration workflow introduced in [Seurat v3](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8), that enables the probabilistic transfer of annotations from a reference to a query set. We therefore follow the label transfer workflow introduced [here](reference_mapping.html), taking advantage of sctransform normalization, but anticipate new methods to be developed to accomplish this task. + +We first load the data (download available [here](https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1)), pre-process the scRNA-seq reference, and then perform label transfer. The procedure outputs, for each spot, a probabilistic classification for each of the scRNA-seq derived classes. We add these predictions as a new assay in the Seurat object. + +```{r sc.data} +allen_reference <- readRDS("../data/allen_cortex.rds") +allen_reference <- UpdateSeuratObject(allen_reference) +``` + +```{r sc.data2} +# note that setting ncells=3000 normalizes the full dataset but learns noise models on 3k cells +# this speeds up SCTransform dramatically with no loss in performance +library(dplyr) +allen_reference <- SCTransform(allen_reference, ncells = 3000, verbose = FALSE) %>% RunPCA(verbose = FALSE) %>% RunUMAP(dims = 1:30) +``` + +```{r sc.data3, fig.width=8, fig.align="center"} +# After subsetting, we renormalize cortex +cortex <- SCTransform(cortex, assay = 'Spatial', verbose = FALSE) %>% RunPCA(verbose = FALSE) +# the annotation is stored in the 'subclass' column of object metadata +DimPlot(allen_reference, group.by = 'subclass', label = TRUE) +``` + +```{r sc.data5} +anchors <- FindTransferAnchors(reference = allen_reference, query = cortex, normalization.method = "SCT") +predictions.assay <- TransferData(anchorset = anchors, refdata = allen_reference$subclass, prediction.assay = TRUE, weight.reduction = cortex[["pca"]], dims = 1:30) +cortex[["predictions"]] <- predictions.assay +``` + +Now we get prediction scores for each spot for each class. Of particular interest in the frontal cortex region are the laminar excitatory neurons. Here we can distinguish between distinct sequential layers of these neuronal subtypes, for example: + +```{r sc.data7} +DefaultAssay(cortex) <- "predictions" +SpatialFeaturePlot(cortex, features = c("L2/3 IT", "L4"), pt.size.factor = 1.6, ncol = 2, crop = TRUE) +``` + +Based on these prediction scores, we can also predict *cell types* whose location is spatially restricted. We use the same methods based on marked point processes to define spatially variable features, but use the cell type prediction scores as the "marks" rather than gene expression. + +```{r sc.data8, fig.height = 10} +cortex <- FindSpatiallyVariableFeatures(cortex, assay = "predictions", selection.method = "moransi", features = rownames(cortex), r.metric = 5, slot = "data") +top.clusters <- head(SpatiallyVariableFeatures(cortex, selection.method = "moransi"), 4) +SpatialPlot(object = cortex, features = top.clusters, ncol = 2) +``` + +Finally, we show that our integrative procedure is capable of recovering the known spatial localization patterns of both neuronal and non-neuronal subsets, including laminar excitatory, layer-1 astrocytes, and the cortical grey matter. + +```{r sc.data9,fig.height=20,fig.width=10} +SpatialFeaturePlot(cortex, features = c("Astro", "L2/3 IT", "L4", "L5 PT", "L5 IT", "L6 CT", "L6 IT", "L6b", "Oligo"), pt.size.factor = 1, ncol = 2, crop = FALSE, alpha = c(0.1, 1)) +``` + + +## Working with multiple slices in Seurat + +This dataset of the mouse brain contains another slice corresponding to the other half of the brain. Here we read it in and perform the same initial normalization. + +```{r brain2data} +brain2 <- LoadData('stxBrain', type = 'posterior1') +brain2[['Spatial']] <- as(brain2[['Spatial']], Class = 'Assay5') +brain2 <- SCTransform(brain2, assay = 'Spatial', verbose = FALSE) +``` + +In order to work with multiple slices in the same Seurat object, we provide the `merge` function. + +```{r merge} +brain.merge <- merge(brain, brain2) +``` + +This then enables joint dimensional reduction and clustering on the underlying RNA expression data. + +```{r joint.analysis} +DefaultAssay(brain.merge) <- "SCT" +VariableFeatures(brain.merge) <- c(VariableFeatures(brain), VariableFeatures(brain2)) +brain.merge <- RunPCA(brain.merge, verbose = FALSE) +brain.merge <- FindNeighbors(brain.merge, dims = 1:30) +brain.merge <- FindClusters(brain.merge, verbose = FALSE) +brain.merge <- RunUMAP(brain.merge, dims = 1:30) +``` + +Finally, the data can be jointly visualized in a single UMAP plot. `SpatialDimPlot()` and `SpatialFeaturePlot()` will by default plot all slices as columns and groupings/features as rows. + +```{r joint.viz, fig.height = 4} +DimPlot(brain.merge, reduction = "umap", group.by = c("ident", "orig.ident")) +``` + +```{r joint.viz2} +SpatialDimPlot(brain.merge) +``` + +```{r joint.viz3, fig.height = 10} +SpatialFeaturePlot(brain.merge, features = c('Hpca', 'Plp1')) +``` + +## Acknowledgments + +We would like to thank Nigel Delaney and Stephen Williams for their helpful feedback and contributions to the new spatial Seurat code. + +# Slide-seq + +## Dataset + +Here, we will be analyzing a dataset generated using [Slide-seq v2](https://www.biorxiv.org/content/10.1101/2020.03.12.989806v1) of the mouse hippocampus. This tutorial will follow much of the same structure as the spatial vignette for 10x Visium data but is tailored to give a demonstration specific to Slide-seq data. + +You can use our [SeuratData package](https://github.com/satijalab/seurat-data) for easy data access, as demonstrated below. After installing the dataset, you can type `?ssHippo` to see the commands used to create the Seurat object. + +```{r data.ss.install, eval = FALSE} +InstallData("ssHippo") +``` + +```{r data.ss} +slide.seq <- LoadData('ssHippo') +``` + +## Data preprocessing + +The initial preprocessing steps for the bead by gene expression data are similar to other spatial Seurat analyses and to typical scRNA-seq experiments. Here, we note that many beads contain particularly low UMI counts but choose to keep all detected beads for downstream analysis. + +```{r qc.ss, fig.height=5} +plot1 <- VlnPlot(slide.seq, features = 'nCount_Spatial', pt.size = 0, log = TRUE) + NoLegend() +slide.seq$log_nCount_Spatial <- log(slide.seq$nCount_Spatial) +plot2 <- SpatialFeaturePlot(slide.seq, features = 'log_nCount_Spatial') + theme(legend.position = "right") +wrap_plots(plot1, plot2) +``` + +We then normalize the data using [sctransform](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) and perform a standard scRNA-seq dimensionality reduction and clustering workflow. + +```{r preprocess.ss} +slide.seq <- SCTransform(slide.seq, assay = "Spatial", ncells = 3000, n_genes=nrow(slide.seq[["Spatial"]]), verbose = FALSE) +slide.seq <- RunPCA(slide.seq, assay = "SCT") +slide.seq <- RunUMAP(slide.seq, dims = 1:30) +slide.seq <- FindNeighbors(slide.seq, dims = 1:30) +slide.seq <- FindClusters(slide.seq, resolution = 0.3, verbose = FALSE) +``` + +We can then visualize the results of the clustering either in UMAP space (with `DimPlot()`) or in the bead coordinate space with `SpatialDimPlot()`. + +```{r dim.plots.ss,fig.height=5} +plot1 <- DimPlot(slide.seq, reduction = "umap", label = TRUE) +plot2 <- SpatialDimPlot(slide.seq, stroke = 0) +plot1 + plot2 +SpatialDimPlot(slide.seq, cells.highlight = CellsByIdentities(object = slide.seq, idents = c(1, 6, 13)), facet.highlight = TRUE) +``` + +## Integration with a scRNA-seq reference + +To facilitate cell-type annotation of the Slide-seq dataset, we are leveraging an existing mouse single-cell RNA-seq hippocampus dataset, produced in [Saunders\*, Macosko\*, et al. 2018](https://doi.org/10.1016/j.cell.2018.07.028). The data is available for download as a processed Seurat object [here](https://www.dropbox.com/s/cs6pii5my4p3ke3/mouse_hippocampus_reference.rds?dl=0), with the raw count matrices available on the [DropViz website](http://dropviz.org/). + +```{r ref.saunders} +ref <- readRDS("../data/mouse_hippocampus_reference.rds") +ref <- UpdateSeuratObject(ref) +``` + +The original annotations from the paper are provided in the cell metadata of the Seurat object. These annotations are provided at several "resolutions", from broad classes (`ref$class`) to subclusters within celltypes (`ref$subcluster`). For the purposes of this vignette, we'll work off of a modification of the celltype annotations (`ref$celltype`) which we felt struck a good balance. + +We'll start by running the Seurat label transfer method to predict the major celltype for each bead. + +```{r ref.preprocessing.ss} +anchors <- FindTransferAnchors(reference = ref, query = slide.seq, normalization.method = "SCT", npcs = 50) +predictions.assay <- TransferData(anchorset = anchors, refdata = ref$celltype, prediction.assay = TRUE, weight.reduction = slide.seq[["pca"]], dims = 1:50) +slide.seq[["predictions"]] <- predictions.assay +``` + +We can then visualize the prediction scores for some of the major expected classes. + +```{r transfer.viz.ss, fig.height = 8} +DefaultAssay(slide.seq) <- 'predictions' +SpatialFeaturePlot(slide.seq, features = c("Dentate Principal cells", "CA3 Principal cells", "Entorhinal cortex", "Endothelial tip", "Ependymal", "Oligodendrocyte"), alpha = c(0.1, 1)) +``` + +```{r max.idents.ss} +slide.seq$predicted.id <- GetTransferPredictions(slide.seq) +Idents(slide.seq) <- "predicted.id" +SpatialDimPlot(slide.seq, cells.highlight = CellsByIdentities(object = slide.seq, idents = c("CA3 Principal cells", "Dentate Principal cells", "Endothelial tip")), facet.highlight = TRUE) +``` + +## Identification of Spatially Variable Features + +As mentioned in the Visium vignette, we can identify spatially variable features in two general ways: differential expression testing between pre-annotated anatomical regions or statistics that measure the dependence of a feature on spatial location. + +Here, we demonstrate the latter with an implementation of Moran's I available via `FindSpatiallyVariableFeatures()` by setting `method = 'moransi'`. Moran's I computes an overall spatial autocorrelation and gives a statistic (similar to a correlation coefficient) that measures the dependence of a feature on spatial location. This allows us to rank features based on how spatially variable their expression is. In order to facilitate quick estimation of this statistic, we implemented a basic binning strategy that will draw a rectangular grid over Slide-seq puck and average the feature and location within each bin. The number of bins in the x and y direction are controlled by the `x.cuts` and `y.cuts` parameters respectively. Additionally, while not required, installing the optional `Rfast2` package(`install.packages('Rfast2')`), will significantly decrease the runtime via a more efficient implementation. + +```{r spatial.vf.ss} +DefaultAssay(slide.seq) <- "SCT" +slide.seq <- FindSpatiallyVariableFeatures(slide.seq, assay = 'SCT', slot = "scale.data", features = VariableFeatures(slide.seq)[1:1000], selection.method = 'moransi', x.cuts = 100, y.cuts = 100) +``` + +Now we visualize the expression of the top 6 features identified by Moran's I. + +```{r spatial.vf.plot.ss, fig.height = 8} +SpatialFeaturePlot(slide.seq, features = head(SpatiallyVariableFeatures(slide.seq, selection.method = "moransi"), 6), ncol = 3, alpha = c(0.1, 1), max.cutoff = "q95") +``` + + +## Spatial deconvolution using RCTD + +While `FindTransferAnchors` can be used to integrate spot-level data from spatial transcriptomic datasets, Seurat v5 also includes support for the [Robust Cell Type Decomposition](https://www.nature.com/articles/s41587-021-00830-w), a computational approach to deconvolve spot-level data from spatial datasets, when provided with an scRNA-seq reference. RCTD has been shown to accurately annotate spatial data from a variety of technologies, including SLIDE-seq, Visium, and the 10x Xenium in-situ spatial platform. + +To run RCTD, we first install the `spacexr` package from GitHub which implements RCTD. + +```{r, eval=FALSE} +devtools::install_github("dmcable/spacexr", build_vignettes = FALSE) +``` + +Counts, cluster, and spot information is extracted from the Seurat query and reference objects to construct `Reference` and `SpatialRNA` objects used by RCTD for annotation. + +```{r rctd.setup, warning=FALSE, results=FALSE} +library(spacexr) + +# set up reference +ref <- readRDS("../data/mouse_hippocampus_reference.rds") +ref <- UpdateSeuratObject(ref) +Idents(ref) <- "celltype" + +# extract information to pass to the RCTD Reference function +counts <- ref[["RNA"]]$counts +cluster <- as.factor(ref$celltype) +names(cluster) <- colnames(ref) +nUMI <- ref$nCount_RNA +names(nUMI) <- colnames(ref) +reference <- Reference(counts, cluster, nUMI) + +# set up query with the RCTD function SpatialRNA +slide.seq <- SeuratData::LoadData("ssHippo") +counts <- slide.seq[["Spatial"]]$counts +coords <- GetTissueCoordinates(slide.seq) +colnames(coords) <- c("x", "y") +coords[is.na(colnames(coords))] <- NULL +query <- SpatialRNA(coords, counts, colSums(counts)) +``` + +Using the `reference` and `query` object, we annotate the dataset and add the cell type labels to the query Seurat object. RCTD parallelizes well, so multiple cores can be specified for faster performance. + +```{r run.rctd, warning=FALSE, results=FALSE} +RCTD <- create.RCTD(query, reference, max_cores = 8) +RCTD <- run.RCTD(RCTD, doublet_mode = 'doublet') +slide.seq <- AddMetaData(slide.seq, metadata = RCTD@results$results_df) +``` + +Next, plot the RCTD annotations. Because we ran RCTD in doublet mode, the algorithm assigns a `first_type` and `second_type` for each barcode or spot. + +```{r rctd_results, fig.height=8, fig.width=14} +p1 <- SpatialDimPlot(slide.seq, group.by = "first_type") +p2 <- SpatialDimPlot(slide.seq, group.by = "second_type") +p1 | p2 +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_spatial_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_spatial_vignette_2.Rmd b/vignettes/seurat5_spatial_vignette_2.Rmd new file mode 100644 index 000000000..d334aaec3 --- /dev/null +++ b/vignettes/seurat5_spatial_vignette_2.Rmd @@ -0,0 +1,655 @@ +--- +title: "Analysis of Image-based Spatial Data in Seurat" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +------------------------------------------------------------------------ + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` + +# Overview + +In this vignette, we introduce a Seurat extension to analyze new types of spatially-resolved data. We have [previously introduced a spatial framework](https://satijalab.org/seurat/articles/spatial_vignette.html) which is compatible with sequencing-based technologies, like the 10x Genomics Visium system, or SLIDE-seq. Here, we extend this framework to analyze new data types that are captured via highly multiplexed imaging. In contrast to sequencing-based technologies, these datasets are often targeted (i.e. they profile a pre-selected set of genes). However they can resolve individual molecules - retaining single-cell (and subcellular) resolution. These approaches also often capture cellular boundaries (segmentations). + +We update the Seurat infrastructure to enable the analysis, visualization, and exploration of these exciting datasets. In this vignette, we focus on three datasets produced by different multiplexed imaging technologies, each of which is publicly available. We will be adding support for additional imaging-based technologies in the coming months. + +- Vizgen MERSCOPE (Mouse Brain) +- Nanostring CosMx Spatial Molecular Imager (FFPE Human Lung) +- Akoya CODEX (Human Lymph Node) + +First, we load the packages necessary for this vignette. + +```{r init, message=FALSE, warning=FALSE} +library(Seurat) +library(future) +plan("multisession", workers = 10) +library(ggplot2) +``` + +# Mouse Brain: Vizgen MERSCOPE + +This dataset was produced using the Vizgen MERSCOPE system, which utilizes the MERFISH technology. The total dataset is available for [public download](https://info.vizgen.com/mouse-brain-data), and contains nine samples (three full coronal slices of the mouse brain, with three biological replicates per slice). The gene panel consists of 483 gene targets, representing known anonical cell type markers, nonsensory G-Protein coupled receptors (GPCRs), and Receptor Tyrosine Kinases (RTKs). In this vignette, we analyze one of the samples - slice 2, replicate 1. The median number of transcripts detected in each cell is 206. + +First, we read in the dataset and create a Seurat object. + +We use the `LoadVizgen()` function, which we have written to read in the output of the Vizgen analysis pipeline. The resulting Seurat object contains the following information: + +- A count matrix, indicating the number of observed molecules for each of the 483 transcripts in each cell. This matrix is analogous to a count matrix in scRNA-seq, and is stored by default in the RNA assay of the Seurat object + +```{r, message=FALSE, warning=FALSE} +# Loading segmentations is a slow process and multi processing with the future pacakge is recommended +vizgen.obj <- LoadVizgen(data.dir = "/brahms/hartmana/vignette_data/vizgen/s2r1/", fov = "s2r1") +``` + +The next pieces of information are specific to imaging assays, and is stored in the images slot of the resulting Seurat object: + +
    + +**Cell Centroids: The spatial coordinates marking the centroid for each cell being profiled** + +```{r} +# Get the center position of each centroid. There is one row per cell in this dataframe. +head(GetTissueCoordinates(vizgen.obj[["s2r1"]][["centroids"]])) +``` + +
    + +
    + +**Cell Segmentation Boundaries: The spatial coordinates that describe the polygon segmentation of each single cell** + +```{r} +# Get the coordinates for each segmentation vertice. Each cell will have a variable number of vertices describing its shape. +head(GetTissueCoordinates(vizgen.obj[["s2r1"]][["segmentation"]])) +``` + +
    + +
    + +**Molecule positions: The spatial coordinates for each individual molecule that was detected during the multiplexed smFISH experiment.** + +```{r} +# Fetch molecules positions for Chrm1 +head(FetchData(vizgen.obj[["s2r1"]][["molecules"]], vars="Chrm1")) +``` + +
    + +\ + +## Preprocessing and unsupervised analysis + +We start by performing a standard unsupervised clustering analysis, essentially first treating the dataset as an scRNA-seq experiment. We use SCTransform-based normalization, though we slightly modify the default clipping parameters to mitigate the effect of outliers that we occasionally observe in smFISH experiments. After normalization, we can run dimensional reduction and clustering. + +```{r analysis, results='hide'} +vizgen.obj <- SCTransform(vizgen.obj, assay = "Vizgen", clip.range = c(-10,10)) +vizgen.obj <- RunPCA(vizgen.obj, npcs = 30, features = rownames(vizgen.obj)) +vizgen.obj <- RunUMAP(vizgen.obj, dims = 1:30) +vizgen.obj <- FindNeighbors(vizgen.obj, reduction = "pca", dims = 1:30) +vizgen.obj <- FindClusters(vizgen.obj, resolution = 0.3) +``` + +We can then visualize the results of the clustering either in UMAP space (with `DimPlot()`) or overlaid on the image with `ImageDimPlot()`. + +```{r umap} +DimPlot(vizgen.obj, reduction = "umap") +``` + +```{r spatial.plot, fig.height=6, fig.width=6} +ImageDimPlot(vizgen.obj, fov = "s2r1", cols = "polychrome", axes = TRUE) +``` + +You can also customize multiple aspect of the plot, including the color scheme, cell border widths, and size (see below). + +
    + +**Customizing spatial plots in Seurat** + +The `ImageDimPlot()` and `ImageFeaturePlot()` functions have a few parameters which you can customize individual visualizations. These include: + +- alpha: Ranges from 0 to 1. Sets the transparency of within-cell coloring. +- size: determines the size of points representing cells, if centroids are being plotted +- cols: Sets the color scheme for the internal shading of each cell. Examples settings are `polychrome`, `glasbey`, `Paired`, `Set3`, and `parade`. Default is the ggplot2 color palette +- shuffle.cols: In some cases the selection of `cols` is more effective when the same colors are assigned to different clusters. Set `shuffle.cols = TRUE` to randomly shuffle the colors in the palette. +- border.size: Sets the width of the cell segmentation borders. By default, segmentations are plotted with a border size of 0.3 and centroids are plotted without border. +- border.color: Sets the color of the cell segmentation borders +- dark.background: Sets a black background color (TRUE by default) +- axes: Display + +
    + +Since it can be difficult to visualize the spatial localization patterns of an individual cluster when viewing them all together, we can highlight all cells that belong to a particular cluster: + +```{r, fig.height=8, fig.width=12} +p1 <- ImageDimPlot(vizgen.obj, fov = "s2r1", cols = "red", cells = WhichCells(vizgen.obj, idents = 1)) +p2 <- ImageDimPlot(vizgen.obj, fov = "s2r1", cols = "red", cells = WhichCells(vizgen.obj, idents = 15)) +p1 + p2 +``` + +We can find markers of individual clusters and visualize their spatial expression pattern. We can color cells based on their quantified expression of an individual gene, using `ImageFeaturePlot()`, which is analagous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. Since MERFISH images individual molecules, we can also visualize the location of individual *molecules*. + +```{r, fig.height=7, fig.width=12} +p1 <- ImageFeaturePlot(vizgen.obj, features = "Slc17a7") +p2 <- ImageDimPlot(vizgen.obj, molecules = "Slc17a7", nmols = 10000, alpha = 0.3, mols.cols = "red") +p1 + p2 +``` + +Note that the `nmols` parameter can be used to reduce the total number of molecules shown to reduce overplotting. You can also use the `mols.size`, `mols.cols`, and `mols.alpha` parameter to further optimize. + +Plotting molecules is especially useful for visualizing co-expression of multiple genes on the same plot. + +```{r, fig.height=7, fig.width=12} +p1 <- ImageDimPlot(vizgen.obj, fov = "s2r1", alpha = 0.3, molecules = c("Slc17a7", "Olig1"), nmols = 10000) +markers.14 <- FindMarkers(vizgen.obj, ident.1 = "14") +p2 <- ImageDimPlot(vizgen.obj, fov = "s2r1", alpha = 0.3, molecules = rownames(markers.14)[1:4], nmols = 10000) +p1 + p2 +``` + +The updated Seurat spatial framework has the option to treat cells as individual points, or also to visualize cell boundaries (segmentations). By default, Seurat ignores cell segmentations and treats each cell as a point ('centroids'). This speeds up plotting, especially when looking at large areas, where cell boundaries are too small to visualize. + +We can zoom into a region of tissue, creating a new field of view. For example, we can zoom into a region that contains the hippocampus. Once zoomed-in, we can set `DefaultBoundary()` to show cell segmentations. You can also 'simplify' the cell segmentations, reducing the number of edges in each polygon to speed up plotting. + +```{r, fig.height=5, fig.width=14} +# create a Crop +cropped.coords <- Crop(vizgen.obj[["s2r1"]], x = c(1750, 3000), y = c(3750, 5250), coords = "plot") +# set a new field of view (fov) +vizgen.obj[["hippo"]] <- cropped.coords + +# visualize FOV using default settings (no cell boundaries) +p1 <- ImageDimPlot(vizgen.obj, fov = "hippo", axes = TRUE, size = 0.7, border.color = "white", cols = "polychrome", coord.fixed = FALSE) + +# visualize FOV with full cell segmentations +DefaultBoundary(vizgen.obj[["hippo"]]) <- "segmentation" +p2 <- ImageDimPlot(vizgen.obj, fov = "hippo", axes = TRUE, border.color = "white", border.size = 0.1, cols = "polychrome", coord.fixed = FALSE) + +# simplify cell segmentations +vizgen.obj[["hippo"]][["simplified.segmentations"]] <- Simplify(coords = vizgen.obj[["hippo"]][["segmentation"]], tol = 3) +DefaultBoundary(vizgen.obj[["hippo"]]) <- "simplified.segmentations" + +# visualize FOV with simplified cell segmentations +DefaultBoundary(vizgen.obj[["hippo"]]) <- "simplified.segmentations" +p3 <- ImageDimPlot(vizgen.obj, fov = "hippo", axes = TRUE, border.color = "white", border.size = 0.1, cols = "polychrome", coord.fixed = FALSE) + +p1 + p2 + p3 +``` + +
    + +**What is the tol parameter?** + +The tol parameter determines how simplified the resulting segmentations are. A higher value of tol will reduce the number of vertices more drastically which will speed up plotting, but some segmentation detail will be lost. See for examples using different values for tol. + +
    + +We can visualize individual molecules plotted at higher resolution after zooming-in + +```{r, fig.height=8, fig.width=8} +# Since there is nothing behind the segmentations, alpha will slightly mute colors +ImageDimPlot(vizgen.obj, fov = "hippo", molecules = rownames(markers.14)[1:4], cols = "polychrome", mols.size = 1, alpha = 0.5, mols.cols = c("red", "blue", "yellow", "green")) +``` + +# Human Lung: 10x Genomics Xenium In Situ + +This dataset is a preview of the Xenium multimodal cell segmentation solution using a development version of the assay user guide and analysis software. It uses the [Xenium Human Multi-Tissue and Cancer Panel](https://www.10xgenomics.com/support/in-situ-gene-expression/documentation/steps/panel-design/pre-designed-xenium-gene-expression-panels) (377 genes) which was pre-designed by 10x Genomics. In this vignette, we will demonstrate how to load Xenium data for analysis and visualization using Seurat and, in particular, how to parse and visualize cell metadata. + +This uses the full Xenium output bundle available from the [FFPE Human Lung Cancer with Xenium Multimodal Cell Segmentation Preview Data](https://www.10xgenomics.com/datasets/preview-data-ffpe-human-lung-cancer-with-xenium-multimodal-cell-segmentation-1-standard) page, which can be downloaded as described below (note that this file is \~7 GB). + +```{bash, eval=FALSE} +wget https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_humanLung_Cancer_FFPE/Xenium_V1_humanLung_Cancer_FFPE_outs.zip +unzip Xenium_V1_humanLung_Cancer_FFPE_outs.zip +``` + +We will first load in the dataset and create the Seurat object. We will flip the x/y coordinates for more convenient plotting. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. Installing `arrow` will permit you to load the data from Parquet files, which is much more efficient than from csv. + +By default, the subcellular coordinates of each Q20 transcript will be loaded, as well as the cell centroids, which can commonly take up more than 1 GB of RAM. + +```{r, results='hide'} +path <- "/brahms/hartmana/vignette_data/Xenium_V1_humanLung_Cancer_FFPE_outs" +# Load the Xenium data, including cell segmentations +xenium.obj <- LoadXenium(path, fov = "fov", segmentations = "cell", flip.xy = T) +# remove cells with 0 counts +xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) +``` + +This dataset uses Xenium multimodal segmentation, which involves custom deep learning models trained on Xenium In Situ data. After nuclei segmentation with DAPI, the algorithm uses three methods to segment cells. The segmentation results for each cell are prioritized in this order: + +1. **Cell boundary stain:** This is the most reliable method. Antibodies target epithelial markers (CD45) and immune markers (pan-lymphocyte: ATP1A1, E-Cadherin). It can split nuclei and define cells missing a nucleus. Nuclei that overlap with anucleate cells are assigned to the cell + +2. **Expansion from the nucleus to the cell interior stain edge:** This method requires both segmented nuclei and the interior stain (18S rRNA marker) + +3. **Nuclear expansion:** For cases where cells that do not have boundary or interior stains, segment cells with a nuclear (DAPI) expansion distance of 5 µm or until another cell boundary is encountered + +We can directly visualize cells which were segmented according to each method. + +```{r} +ImageDimPlot(xenium.obj, fov = "fov", dark.background = F, group.by = "segmentation_method", cols = c('#ffabc3', '#a9a900', '#a9ceff')) +``` + +It is also possible to load and visualize the unsupervised cluster annotations computed by the Xenium Onboard Analysis pipeline, which are stored in the `analysis` folder of an output bundle. + +```{r} +where <- tempdir() +untar(file.path(data.dir, 'analysis.tar.gz'), exdir = where) + +graph_clusters <- read.csv(file.path(where, 'analysis', 'clustering', 'gene_expression_graphclust', 'clusters.csv'), row.names = 'Barcode') + +# Store the graph-based clusters in the metadata +xenium.obj <- AddMetaData(xenium.obj, graph_clusters) + +ImageDimPlot(xenium.obj, fov = "fov", dark.background = F, group.by = "Cluster") +``` + +Differential expression results from Xenium Onboard Analysis can also be loaded in a similar fashion. + +```{r} +diff_exp <- read.csv(file.path(where, 'analysis', 'diffexp', 'gene_expression_graphclust', 'differential_expression.csv')) + +diff_exp <- melt(diff_exp, id.vars = c("Feature.ID", "Feature.Name")) + +colnames(diff_exp)[1:2] <- c('ensembl_id', 'gene_name') +diff_exp$cluster <- unlist(lapply(strsplit(as.character(diff_exp$variable), '.', fixed = T), '[[', 2)) +diff_exp$measure <- factor(gsub('Cluster\\.\\d+\\.', '', as.character(diff_exp$variable)), c('Mean.Counts', 'Log2.fold.change', 'Adjusted.p.value'), c('mean_count', 'log2_fc', 'p_adj')) +diff_exp$variable <- NULL + +diff_exp <- dcast(diff_exp, ensembl_id + gene_name + cluster ~ measure) + +significant_de <- subset(diff_exp, p_adj <= 0.05) +significant_de <- significant_de[order(significant_de$mean_count, decreasing = T), ] +significant_de[!duplicated(significant_de$cluster), ] +``` + +We will zoom in to visualize cell segmentations and expression of a select few marker genes. + +```{r} +cropped.coords <- Crop(xenium.obj[["fov"]], x = c(6700, 7400), y = c(1500, 2000), coords = "plot") +xenium.obj[["zoom"]] <- cropped.coords +# visualize cropped area with cell segmentations & selected molecules +DefaultBoundary(xenium.obj[["zoom"]]) <- "segmentation" +ImageDimPlot(xenium.obj, fov = "zoom", group.by = "Cluster", + axes = TRUE, border.color = "white", border.size = 0.1, + cols = "polychrome", coord.fixed = FALSE, + molecules = c("SNTN", "MALL", "MS4A1", "IL7R", "CYP2B6"), nmols = 10000, mols.cols = RColorBrewer::brewer.pal(5, "Set3"), alpha = 0.4) +``` + +Lots of valuable data is output directly in each run, allowing for rapid interrogation of the biology. In the following vignette, we will see how we can use standard Seurat workflows to do more involved secondary analysis on Xenium data. + +# Mouse Brain: 10x Genomics Xenium In Situ + +In this section we'll analyze data produced by the Xenium platform. The vignette demonstrates how to load the per-transcript location data, cell x gene matrix, cell segmentation, and cell centroid information available in the Xenium outputs. The resulting Seurat object will contain the gene expression profile of each cell, the centroid and boundary of each cell, and the location of each individual detected transcript. The per-cell gene expression profiles are similar to standard single-cell RNA-seq and can be analyzed using the same tools. + +This uses the `Tiny subset` dataset from 10x Genomics provided in the [Fresh Frozen Mouse Brain for Xenium Explorer Demo](https://www.10xgenomics.com/resources/datasets/fresh-frozen-mouse-brain-for-xenium-explorer-demo-1-standard) which can be downloaded as described below. These analysis steps are also compatible with the larger `Full coronal section`, but will take longer to execute. + +```{bash, eval=FALSE} +wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP/Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip +unzip Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip +``` + +First we read in the dataset and create a Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. This step should take about a minute (you can improve this by installing `arrow` and `hdf5r`). + +```{r load.xenium, results='hide'} +path <- "/brahms/hartmana/vignette_data/xenium_tiny_subset" +# Load the Xenium data +xenium.obj <- LoadXenium(path, fov = "fov", segmentations = "cell") +# remove cells with 0 counts +xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) +``` + +Spatial information is loaded into slots of the Seurat object, labelled by the name of "field of view" (FOV) being loaded. Initially all the data is loaded into the FOV named `fov`. Later, we will make a cropped FOV that zooms into a region of interest. + +Standard QC plots provided by Seurat are available via the `Xenium` assay. Here are violin plots of genes per cell (`nFeature_Xenium`) and transcript counts per cell (`nCount_Xenium`) + +```{r vlnplot.xenium} +VlnPlot(xenium.obj, features = c("nFeature_Xenium", "nCount_Xenium"), ncol = 2, pt.size = 0) +``` + +Next, we plot the positions of the pan-inhibitory neuron marker Gad1, inhibitory neuron sub-type markers Pvalb, and Sst, and astrocyte marker Gfap on the tissue using `ImageDimPlot()`. + +```{r p2.xenium, fig.width=10, fig.height=8} +ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Sst", "Pvalb", "Gfap"), nmols = 20000) +``` + +```{r save.img, include=FALSE} +plot <- ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Gfap"), nmols = 40000, alpha=0.01, dark.background = F, mols.alpha = 0.6) + coord_flip() + scale_x_reverse() + NoLegend() +ggsave(filename = "../output/images/spatial_vignette_2.jpg", height = 5, width = 9, plot = plot) +``` + +Here we visualize the expression level of some key layer marker genes at the per-cell level using `ImageFeaturePlot()` which is analogous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. We manually adjust the `max.cutoff` for each gene to roughly the 90th percentile (which can be specified with `max.cutoff='q90'`) of it's count distribution to improve contrast. + +```{r mat.xenium, message=FALSE, warning=FALSE, fig.width=12, fig.height=12} +ImageFeaturePlot(xenium.obj, features = c("Cux2", "Rorb", "Bcl11b", "Foxp2"), max.cutoff = c(25, 35, 12, 10), size = 0.75, cols = c("white", "red")) +``` + +We can zoom in on a chosen area with the `Crop()` function. Once zoomed-in, we can visualize cell segmentation boundaries along with individual molecules. + +```{r cropping.xenium, message=FALSE, warning=FALSE, fig.width=10, fig.height=8} +cropped.coords <- Crop(xenium.obj[["fov"]], x = c(1200, 2900), y = c(3750, 4550), coords = "plot") +xenium.obj[["zoom"]] <- cropped.coords +# visualize cropped area with cell segmentations & selected molecules +DefaultBoundary(xenium.obj[["zoom"]]) <- "segmentation" +ImageDimPlot(xenium.obj, fov = "zoom", + axes = TRUE, border.color = "white", border.size = 0.1, + cols = "polychrome", coord.fixed = FALSE, + molecules = c("Gad1", "Sst", "Npy2r", "Pvalb", "Nrn1"), nmols = 10000) +``` + +Next, we use SCTransform for normalization followed by standard dimensionality reduction and clustering. This step takes about 5 minutes from start to finish. + +```{r unsupervised.xenium, results='hide'} +xenium.obj <- SCTransform(xenium.obj, assay = "Xenium") +xenium.obj <- RunPCA(xenium.obj, npcs = 30, features = rownames(xenium.obj)) +xenium.obj <- RunUMAP(xenium.obj, dims = 1:30) +xenium.obj <- FindNeighbors(xenium.obj, reduction = "pca", dims = 1:30) +xenium.obj <- FindClusters(xenium.obj, resolution = 0.3) +``` + +We can then visualize the results of the clustering by coloring each cell according to its cluster either in UMAP space with `DimPlot()` or overlaid on the image with `ImageDimPlot()`. + +```{r umap.xenium, fig.width=10, fig.height=7} +DimPlot(xenium.obj) +``` + +We can visualize the expression level of the markers we looked at earlier on the UMAP coordinates. + +```{r features.xenium, fig.width=8, fig.height=10} +FeaturePlot(xenium.obj, features = c("Cux2", "Bcl11b", "Foxp2", "Gad1", "Sst", "Gfap")) +``` + +We can now use `ImageDimPlot()` to color the cell positions colored by the cluster labels determined in the previous step. + +```{r clusters.xenium, fig.width=13, fig.height=13} +ImageDimPlot(xenium.obj, cols = "polychrome", size = 0.75) +``` + +Using the positional information of each cell, we compute spatial niches. We use a cortex reference from the the Allen Brain Institute to annotate cells, so we first crop the dataset to the cortex. The Allen Brain reference can be installed [here](https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1). + +Below, we use Slc17a7 expression to help determine the cortical region. + +```{r, fig.width=5, fig.height=5, warning=FALSE} +xenium.obj <- LoadXenium("/brahms/hartmana/vignette_data/xenium_tiny_subset") +p1 <- ImageFeaturePlot(xenium.obj, features = "Slc17a7", axes = TRUE, max.cutoff = "q90") +p1 +``` + +```{r resolve.crop, fig.width=5, fig.height=7, warning=FALSE} +crop <- Crop(xenium.obj[["fov"]], x = c(600, 2100), y = c(900, 4700)) +xenium.obj[["crop"]] <- crop +p2 <- ImageFeaturePlot( + xenium.obj, + fov = "crop", + features = "Slc17a7", + size = 1, + axes = TRUE, + max.cutoff = "q90") +p2 +``` + +While `FindTransferAnchors` can be used to integrate spot-level data from spatial transcriptomic datasets, Seurat v5 also includes support for the [Robust Cell Type Decomposition](https://www.nature.com/articles/s41587-021-00830-w), a computational approach to deconvolve spot-level data from spatial datasets, when provided with an scRNA-seq reference. RCTD has been shown to accurately annotate spatial data from a variety of technologies, including SLIDE-seq, Visium, and the 10x Xenium in-situ spatial platform. + +To run RCTD, we first install the `spacexr` package from GitHub which implements RCTD. + +```{r, rctd.install, eval=FALSE} +devtools::install_github("dmcable/spacexr", build_vignettes = FALSE) +``` + +Counts, cluster, and spot information is extracted from the Seurat query and reference objects to construct `Reference` and `SpatialRNA` objects used by RCTD for annotation. The output of the annotation is then added to the Seurat object. + +```{r rctd.qeury, warning=FALSE} +library(spacexr) + +query.counts <- GetAssayData(xenium.obj, assay = "Xenium", slot = "counts")[, Cells(xenium.obj[["crop"]])] +coords <- GetTissueCoordinates(xenium.obj[["crop"]], which = "centroids") +rownames(coords) <- coords$cell +coords$cell <- NULL +query <- SpatialRNA(coords, query.counts, colSums(query.counts)) +``` + +```{r rctd.reference, eval=FALSE} +# allen.corted.ref can be downloaded here: https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1 +allen.cortex.ref <- readRDS("/brahms/shared/vignette-data/allen_cortex.rds") +allen.cortex.ref <- UpdateSeuratObject(allen.cortex.ref) + +Idents(allen.cortex.ref) <- "subclass" +# remove CR cells because there aren't enough of them for annotation +allen.cortex.ref <- subset(allen.cortex.ref, subset = subclass != "CR") +counts <- GetAssayData(allen.cortex.ref, assay = "RNA", slot = "counts") +cluster <- as.factor(allen.cortex.ref$subclass) +names(cluster) <- colnames(allen.cortex.ref) +nUMI <- allen.cortex.ref$nCount_RNA +names(nUMI) <- colnames(allen.cortex.ref) +nUMI <- colSums(counts) +levels(cluster) <- gsub("/", "-", levels(cluster)) +reference <- Reference(counts, cluster, nUMI) +``` + +```{r niche.run.rctd, warning=FALSE, results=FALSE, eval=FALSE} +# run RCTD with many cores +RCTD <- create.RCTD(query, reference, max_cores = 8) +RCTD <- run.RCTD(RCTD, doublet_mode = "doublet") +``` + +```{r niche.add.annotations, eval=FALSE} +annotations.df <- RCTD@results$results_df +annotations <- annotations.df$first_type +names(annotations) <- rownames(annotations.df) +xenium.obj$predicted.celltype <- annotations +keep.cells <- Cells(xenium.obj)[!is.na(xenium.obj$predicted.celltype)] +xenium.obj <- subset(xenium.obj, cells = keep.cells) +``` + +While the previous analyses consider each cell independently, spatial data enables cells to be defined not just by their neighborhood, but also by their broader spatial context. In Seurat v5, we introduce support for 'niche' analysis of spatial data, which demarcates regions of tissue ('niches'), each of which is defined by a different composition of spatially adjacent cell types. Inspired by methods in [Goltsev et al, Cell 2018](https://www.sciencedirect.com/science/article/pii/S0092867418309048) and [He et al, NBT 2022](https://www.nature.com/articles/s41587-022-01483-z), we consider the 'local neighborhood' for each cell - consisting of its `k.neighbor` spatially closest neighbors, and count the occurrences of each cell type present in this neighborhood. We then use k-means clustering to group cells that have similar neighborhoods together, into spatial niches. + +We call the `BuildNicheAssay` function from within Seurat to construct a new assay called `niche` containing the cell type composition spatially neighboring each cell. A metadata column called `niches` is also returned, which contains cluster assignments based on the niche assay. + +```{r build.niche.assay, eval=FALSE} +xenium.obj <- BuildNicheAssay( + object = xenium.obj, + fov = "crop", + group.by = "predicted.celltype", + niches.k = 5, + neighbors.k = 30 +) +``` + +```{r load.niche.results, eval=TRUE, include=FALSE} +xenium.obj <- readRDS("/brahms/hartmana/vignette_data/xenium_niches_presaved.rds") +``` + +We can then group cells either by their cell type identity, or their niche identity. The niches identified clearly demarcate the neuronal layers in the cortex. + +```{r, niche.dimplots, fig.width=8, fig.height=6, warning=FALSE} +celltype.plot <- ImageDimPlot( + xenium.obj, + group.by = "predicted.celltype", + size = 1.5, + cols = "polychrome", + dark.background = F) + + ggtitle("Cell type") +niche.plot <- ImageDimPlot( + xenium.obj, + group.by = "niches", + size = 1.5, + dark.background = F) + + ggtitle("Niches") + + scale_fill_manual( + values = c("#442288", "#6CA2EA", "#B5D33D", "#FED23F", "#EB7D5B")) +celltype.plot | niche.plot +``` + +Further, we observe that the composition of each niche is enriched for distinct cell types. + +```{r niche.composition} +table(xenium.obj$predicted.celltype, xenium.obj$niches) +``` + +# Human Lung: Nanostring CosMx Spatial Molecular Imager + +This dataset was produced using Nanostring CosMx Spatial Molecular Imager (SMI). The CosMX SMI performs multiplexed single molecule profiling, can profile both RNA and protein targets, and can be applied directly to FFPE tissues. The dataset represents 8 FFPE samples taken from 5 non-small-cell lung cancer (NSCLC) tissues, and is available for [public download](https://www.nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/). The gene panel consists of 960 transcripts. + +In this vignette, we load one of 8 samples (lung 5, replicate 1). We use the `LoadNanostring()` function, which parses the outputs available on the public download site. Note that the coordinates for the cell boundaries were provided by Nanostring by request, and are available for download [here](https://www.dropbox.com/s/hl3peavrx92bluy/Lung5_Rep1-polygons.csv?dl=0). + +For this dataset, instead of performing unsupervised analysis, we map the Nanostring profiles to our Azimuth Healthy Human Lung reference, which was defined by scRNA-seq. We used Azimuth version 0.4.3 with the [human lung](https://azimuth.hubmapconsortium.org/references/#Human%20-%20Lung%20v1) reference version 1.0.0. You can download the precomputed results [here](https://seurat.nygenome.org/vignette_data/spatial_vignette_2/nanostring_data.Rds), which include annotations, prediction scores, and a UMAP visualization. The median number of detected transcripts/cell is 249, which does create uncertainty for the annotation process. + +```{r load} +nano.obj <- LoadNanostring(data.dir = "/brahms/hartmana/vignette_data/nanostring/lung5_rep1", fov="lung5.rep1") +``` + +```{r integration} +# add in precomputed Azimuth annotations +azimuth.data <- readRDS("/brahms/hartmana/vignette_data/nanostring_data.Rds") +nano.obj <- AddMetaData(nano.obj, metadata = azimuth.data$annotations) +nano.obj[["proj.umap"]] <- azimuth.data$umap +Idents(nano.obj) <- nano.obj$predicted.annotation.l1 + +# set to avoid error exceeding max allowed size of globals +options(future.globals.maxSize = 8000 * 1024^2) +nano.obj <- SCTransform(nano.obj, assay = "Nanostring", clip.range = c(-10, 10), verbose = FALSE) + +# text display of annotations and prediction scores +head(slot(object = nano.obj, name = "meta.data")[2:5]) +``` + +We can visualize the Nanostring cells and annotations, projected onto the reference-defined UMAP. Note that for this NSCLC sample, tumor samples are annotated as 'basal', which is the closest cell type match in the healthy reference. + +```{r, fig.width=9, fig.height=4} +DimPlot(nano.obj) +``` + +## Visualization of cell type and expression localization patterns + +As in the previous example, `ImageDimPlot()` plots c ells based on their spatial locations, and colors them based on their assigned cell type. Notice that the basal cell population (tumor cells) is tightly spatially organized, as expected. + +```{r, fig.width=11, fig.height=7} +ImageDimPlot(nano.obj, fov = "lung5.rep1", axes = TRUE, cols = "glasbey") +``` + +Since there are many cell types present, we can highlight the localization of a few select groups. + +```{r, fig.width=10, fig.height=7} +ImageDimPlot(nano.obj, fov = "lung5.rep1", cells = WhichCells(nano.obj, idents=c("Basal", "Macrophage", "Smooth Muscle", "CD4 T")), cols=c("red", "green", "blue", "orange"), size = 0.6) +``` + +We can also visualize gene expression markers a few different ways: + +```{r, fig.width=10, fig.height=5} +VlnPlot(nano.obj, features = "KRT17", assay = "Nanostring", layer = "counts", pt.size = 0.1, y.max = 30) + NoLegend() +``` + +```{r, fig.width=5, fig.height=4} +FeaturePlot(nano.obj, features = "KRT17", max.cutoff = "q95") +``` + +```{r, fig.height=4, fig.width=8} +p1 <- ImageFeaturePlot(nano.obj, fov = "lung5.rep1", features = "KRT17", max.cutoff = "q95") +p2 <- ImageDimPlot(nano.obj, fov = "lung5.rep1", alpha = 0.3, molecules = "KRT17", nmols = 10000) + NoLegend() +p1 + p2 +``` + +We can plot molecules in order to co-visualize the expression of multiple markers, including KRT17 (basal cells), C1QA (macrophages), IL7R (T cells), and TAGLN (Smooth muscle cells). + +```{r, fig.width=10, fig.height=7} +# Plot some of the molecules which seem to display spatial correlation with each other +ImageDimPlot(nano.obj, fov = "lung5.rep1", group.by = NA, alpha = 0.3, molecules = c("KRT17", "C1QA", "IL7R", "TAGLN"), nmols = 20000) +``` + +We zoom in on one basal-rich region using the `Crop()` function. Once zoomed-in, we can visualize individual cell boundaries as well in all visualizations. + +```{r} +basal.crop <- Crop(nano.obj[["lung5.rep1"]], x = c(159500, 164000), y = c(8700, 10500)) +nano.obj[["zoom1"]] <- basal.crop +DefaultBoundary(nano.obj[["zoom1"]]) <- "segmentation" +``` + +```{r, fig.width=11, fig.height=7} +ImageDimPlot(nano.obj, fov = "zoom1", cols = "polychrome", coord.fixed = FALSE) +``` + +```{r, fig.width=11, fig.height=7} +# note the clouds of TPSAB1 molecules denoting mast cells +ImageDimPlot(nano.obj, fov = "zoom1", cols = "polychrome", alpha = 0.3, molecules = c("KRT17", "IL7R", "TPSAB1"), mols.size = 0.3, nmols = 20000, border.color = "black", coord.fixed = FALSE) +``` + +# Human Lymph Node: Akoya CODEX system + +This dataset was produced using Akoya CODEX system. The CODEX system performs multiplexed spatially-resolved protein profiling, iteratively visualizing antibody-binding events. The dataset here represents a tissue section from a human lymph node, and was generated by the University of Florida as part of the Human Biomolecular Atlas Program (HuBMAP). More information about the sample and experiment is available [here](https://portal.hubmapconsortium.org/browse/dataset/c95d9373d698faf60a66ffdc27499fe1). The protein panel in this dataset consists of 28 markers, and protein intensities were quantified as part of the Akoya processor pipeline, which outputs a CSV file providing the intensity of each marker in each cell, as well as the cell coordinates. The file is available for public download via Globus [here](https://app.globus.org/file-manager?origin_id=af603d86-eab9-4eec-bb1d-9d26556741bb&origin_path=%2Fc95d9373d698faf60a66ffdc27499fe1%2Fdrv_CX_20-008_lymphnode_n10_reg001%2Fprocessed_2020-12-2320-008LNn10r001%2Fsegm%2Fsegm-1%2Ffcs%2Fcompensated%2F). + +First, we load in the data of a HuBMAP dataset using the `LoadAkoya()` function in Seurat: + +```{r} +codex.obj <- LoadAkoya( + filename = "/brahms/hartmana/vignette_data/LN7910_20_008_11022020_reg001_compensated.csv", + type = "processor", + fov = "HBM754.WKLP.262" +) +``` + +We can now run unsupervised analysis to identify cell clusters. To normalize the protein data, we use centered log-ratio based normalization, as we typically apply to the protein modality of CITE-seq data. We then run dimensional reduction and graph-based clustering. + +```{r} +codex.obj <- NormalizeData(object = codex.obj, normalization.method = "CLR", margin = 2) +codex.obj <- ScaleData(codex.obj) +VariableFeatures(codex.obj) <- rownames(codex.obj) # since the panel is small, treat all features as variable. +codex.obj <- RunPCA(object = codex.obj, npcs = 20, verbose = FALSE) +codex.obj <- RunUMAP(object = codex.obj, dims = 1:20, verbose = FALSE) +codex.obj <- FindNeighbors(object = codex.obj, dims = 1:20, verbose = FALSE) +codex.obj <- FindClusters(object = codex.obj, verbose = FALSE, resolution = 0.4, n.start = 1) +``` + +We can visualize the cell clusters based on a protein intensity-based UMAP embedding, or also based on their spatial location. + +```{r} +DimPlot(codex.obj, label = TRUE, label.box = TRUE) + NoLegend() +``` + +```{r, fig.width=6, fig.height=5} +ImageDimPlot(codex.obj, cols = "parade") +``` + +The expression patters of individual markers clearly denote different cell types and spatial structures, including Lyve1 (lymphatic endothelial cells), CD34 (blood endothelial cells), and CD21 (B cells). As expected, endothelial cells group together into vessels, and B cells are key components of specialized microstructures known as germinal zones. You can read more about protein markers in this dataset, as well as cellular networks in human lynmphatic tissues, in this [preprint](https://www.biorxiv.org/content/10.1101/2021.10.20.465151v1.full). + +```{r, fig.width=9, fig.height=8} +p1 <- ImageFeaturePlot(codex.obj, fov = "HBM754.WKLP.262", features = c("CD34", "CD21", "Lyve1"), min.cutoff = "q10", max.cutoff = "q90") +p2 <- ImageDimPlot(codex.obj, fov = "HBM754.WKLP.262", cols = "parade") +p1 + p2 +``` + +Each of these datasets represents an opportunity to learn organizing principles that govern the spatial localization of different cell types. Stay tuned for future updates to Seurat enabling further exploration and characterization of the relationship between spatial position and molecular state. + +
    + +**Session Info** + +```{r} +sessionInfo() +``` + +
    + +```{r save.times, include=FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_spatial_vignette_2.csv") +``` diff --git a/vignettes/seurat5_v4_changes.Rmd b/vignettes/seurat5_v4_changes.Rmd new file mode 100644 index 000000000..d7020c930 --- /dev/null +++ b/vignettes/seurat5_v4_changes.Rmd @@ -0,0 +1,38 @@ +--- +title: "Changes in Seurat v4" +output: + html_document: + theme: united + df_print: kable +--- + +# Changes in Seurat v4 + +We have made minor changes in v4, primarily to improve the performance of Seurat v4 on large datasets. This includes minor changes to default parameter settings, and the use of newly available packages for tasks such as the identification of k-nearest neighbors, and graph-based clustering. These changes do not adversely impact downstream results, and we provide a detailed description of key changes below. Users who wish to continue using Seurat v3, or those interested in reproducing results produced by previous versions, may continue to install previous versions [here](articles/install.html#previous). + +# Changes to parameter defaults + +## FindNeighbors + +* The default method for identifying k-nearest neighbors has been set to [annoy](https://github.com/spotify/annoy). This is an approximate nearest-neighbor approach that is widely used for high-dimensional analysis in many fields, including single-cell analysis. Extensive [community benchmarking](http://ann-benchmarks.com/) has shown that annoy substantially improves the speed and memory requirements of neighbor discovery, with negligible impact to downstream results, and is consistent with our analysis and testing. Users may switch back to using the previous default setting using `nn.method="rann"`. + +## FindMarkers + +* We have restructured the code of the `FindMarkers()` function to be easier to understand, interpret, and debug. The results of differential expression are unchanged. However, by default we now prefilter genes and report fold change using base 2, as is commonly done in other differential expression packages, instead of natural log. If the default option is set, the output of `FindMarkers()` will include the column avg_log2FC, instead of avg_logFC. Users can restore the previous behavior (natural log), by specifying `base = exp(1)`. + +## IntegrateData/TransferData + +* We have made minor changes to the exact calculation of the anchor weight matrix, for example, in cases where a query cell participates in multiple anchor pairs. These changes reflect an improved workflow, but do not result in meaningful differences for downstream analysis (for example, see you can compare the results of our integration vignettes using [Seurat v3]() and [Seurat v4](articles/immune_alignment.html). + +## SCTransform + +* In `SCTransform()`, we slightly modify default parameters to improve scalability of parameter estimation for large datasets. For example, when estimating the regularized relationship between mu and theta, we compute this on a subset of the data by setting the `ncells` parameter to 5,000. The `vst()` function in sctransform v0.3 (available on CRAN [here](https://cran.r-project.org/web/packages/sctransform/index.html)) also introduces minor changes to the process of regularization. We have tested these changes extensively and found a substantial improvement in speed and memory, particularly for large dataset, with no adverse impact to performance. User can compare the results of the SCTransform vignette computed using [Seurat v3]() and [Seurat v4](articles/sctransform_vignette.html), or set `ncells=NULL` on larger datasets to compare results. + +## Removed functions + +The following functions have been removed in Seurat v4: + +- `CreateGeneActivityMatrix` replaced by `GeneActivity` in [Signac](https://satijalab.org/signac/reference/GeneActivity.html) +- `RunLSI` replaced by `RunTFIDF` and `RunSVD` in [Signac](https://satijalab.org/signac/reference/RunTFIDF.html) +- `ReadAlevin` and `ReadAlevinCsv` moved to [SeuratWrappers](https://github.com/satijalab/seurat-wrappers), see details [here](https://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/alevin.html) +- `ExportToCellbrowser` and `StopCellbrowser` moved to [SeuratWrappers](https://github.com/satijalab/seurat-wrappers), see details [here](https://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cellbrowser.html) diff --git a/vignettes/seurat5_visualization_vignette.Rmd b/vignettes/seurat5_visualization_vignette.Rmd new file mode 100644 index 000000000..4724b0151 --- /dev/null +++ b/vignettes/seurat5_visualization_vignette.Rmd @@ -0,0 +1,249 @@ +--- +title: 'Data visualization methods in Seurat' +output: + html_document: + theme: united + df_print: kable + pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + fig.cap = '', + fig.width = 9, + fig.height = 7, + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +options(SeuratData.repo.use = 'satijalab04.nygenome.org') +``` + +We'll demonstrate visualization techniques in Seurat using our previously computed Seurat object from the 2,700 PBMC tutorial. You can download this dataset from [SeuratData](https://github.com/satijalab/seurat-data) + +```{r data, eval = FALSE} +SeuratData::InstallData("pbmc3k") +``` + +```{r seed, include=TRUE} +set.seed(seed = 42) +``` + +```{r initialize_object} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(ggplot2) +library(patchwork) + +pbmc3k.final <- LoadData("pbmc3k", type = "pbmc3k.final") +pbmc3k.final[["RNA"]] <- as(pbmc3k.final[["RNA"]], Class = "Assay5") +pbmc3k.final <- NormalizeData(pbmc3k.final) +pbmc3k.final <- FindVariableFeatures(pbmc3k.final) +pbmc3k.final <- ScaleData(pbmc3k.final) +pbmc3k.final$groups <- sample(c("group1", "group2"), size = ncol(pbmc3k.final), replace = TRUE) +features <- c("LYZ", "CCL5", "IL32", "PTPRCAP", "FCGR3A", "PF4") +pbmc3k.final +``` + + +# Five visualizations of marker feature expression + +```{r visualization_smorgasbord, fig.height=11} +# Ridge plots - from ggridges. Visualize single cell expression distributions in each cluster +RidgePlot(pbmc3k.final, features = features, ncol = 2) + +# Violin plot - Visualize single cell expression distributions in each cluster +VlnPlot(pbmc3k.final, features = features) + +# Feature plot - visualize feature expression in low-dimensional space +FeaturePlot(pbmc3k.final, features = features) +``` + +```{r visualization_smorgasbord2, fig.height = 5} +# Dot plots - the size of the dot corresponds to the percentage of cells expressing the feature in each cluster. The color represents the average expression level +DotPlot(pbmc3k.final, features = features) + RotatedAxis() +# Single cell heatmap of feature expression +DoHeatmap(subset(pbmc3k.final, downsample = 100), features = features, size = 3) +``` + +# New additions to `FeaturePlot` + +```{r featureplot} +# Plot a legend to map colors to expression levels +FeaturePlot(pbmc3k.final, features = "MS4A1") + +# Adjust the contrast in the plot +FeaturePlot(pbmc3k.final, features = "MS4A1", min.cutoff = 1, max.cutoff = 3) +``` + +```{r featureplot2, fig.height = 4} +# Calculate feature-specific contrast levels based on quantiles of non-zero expression. Particularly useful when plotting multiple markers +FeaturePlot(pbmc3k.final, features = c("MS4A1", "PTPRCAP"), min.cutoff = "q10", max.cutoff = "q90") + +# Visualize co-expression of two features simultaneously +FeaturePlot(pbmc3k.final, features = c("MS4A1", "CD79A"), blend = TRUE) +``` + +```{r featureplot.split} +# Split visualization to view expression by groups (replaces FeatureHeatmap) +FeaturePlot(pbmc3k.final, features = c("MS4A1", "CD79A"), split.by = "groups") +``` + +# Updated and expanded visualization functions + +In addition to changes to `FeaturePlot()`, several other plotting functions have been updated and expanded with new features and taking over the role of now-deprecated functions + +```{r new_functions} +# Violin plots can also be split on some variable. Simply add the splitting variable to object metadata and pass it to the split.by argument +VlnPlot(pbmc3k.final, features = 'percent.mt', split.by = 'groups') + +# SplitDotPlotGG has been replaced with the `split.by` parameter for DotPlot +DotPlot(pbmc3k.final, features = features, split.by = 'groups') + RotatedAxis() + +# DimPlot replaces TSNEPlot, PCAPlot, etc. In addition, it will plot either "umap", "tsne", or "pca" by default, in that order +DimPlot(pbmc3k.final) +pbmc3k.final.no.umap <- pbmc3k.final +pbmc3k.final.no.umap[['umap']] <- NULL +DimPlot(pbmc3k.final.no.umap) + RotatedAxis() +``` + +```{r new2, fig.width=11, fig.height = 15} +# DoHeatmap now shows a grouping bar, splitting the heatmap into groups or clusters. This can be changed with the `group.by` parameter +DoHeatmap(pbmc3k.final, features = VariableFeatures(pbmc3k.final)[1:100], cells = 1:500, size = 4, angle = 90) + NoLegend() +``` + +# Applying themes to plots + +With Seurat, all plotting functions return ggplot2-based plots by default, allowing one to easily capture and manipulate plots just like any other ggplot2-based plot. + +```{r themeing, fig.height=6} +baseplot <- DimPlot(pbmc3k.final, reduction = 'umap') +# Add custom labels and titles +baseplot + labs(title = 'Clustering of 2,700 PBMCs') +# Use community-created themes, overwriting the default Seurat-applied theme +# Install ggmin with remotes::install_github("sjessa/ggmin") +baseplot + ggmin::theme_powerpoint() +# Seurat also provides several built-in themes, such as DarkTheme; for more details see ?SeuratTheme +baseplot + DarkTheme() +# Chain themes together +baseplot + FontSize(x.title = 20, y.title = 20) + NoLegend() +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- baseplot + DarkTheme() + + theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + + guides(colour = guide_legend(override.aes = list(size = 10))) +ggsave(filename = "../output/images/visualization_vignette.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +# Interactive plotting features + +Seurat utilizes R's plotly graphing library to create interactive plots. This interactive plotting feature works with any ggplot2-based scatter plots (requires a `geom_point` layer). To use, simply make a ggplot2-based scatter plot (such as `DimPlot()` or `FeaturePlot()`) and pass the resulting plot to `HoverLocator()` + +```{r hover} +# Include additional data to display alongside cell names by passing in a data frame of information +# Works well when using FetchData +plot <- FeaturePlot(pbmc3k.final, features = 'MS4A1') +HoverLocator(plot = plot, information = FetchData(pbmc3k.final, vars = c('ident', 'PC_1', 'nFeature_RNA'))) +``` + +Another interactive feature provided by Seurat is being able to manually select cells for further investigation. We have found this particularly useful for small clusters that do not always separate using unbiased clustering, but which look tantalizingly distinct. You can now select these cells by creating a ggplot2-based scatter plot (such as with `DimPlot()` or `FeaturePlot()`, and passing the returned plot to `CellSelector()`. `CellSelector()` will return a vector with the names of the points selected, so that you can then set them to a new identity class and perform differential expression. + +For example, lets pretend that DCs had merged with monocytes in the clustering, but we wanted to see what was unique about them based on their position in the tSNE plot. + +```{r identify, eval=FALSE} +pbmc3k.final <- RenameIdents(pbmc3k.final, "DC" = "CD14+ Mono") +plot <- DimPlot(pbmc3k.final, reduction = 'umap') +select.cells <- CellSelector(plot = plot) +``` + +![](./assets/pbmc_select.gif) + +```{r load_cells, echo=FALSE} +select.cells <- readLines(con = './assets/pbmc_dcs_cells.txt') +# select.cells <- paste0(select.cells, "-1") +``` + +We can then change the identity of these cells to turn them into their own mini-cluster. + +```{r ident} +head(select.cells) +Idents(pbmc3k.final, cells = select.cells) <- 'NewCells' + +# Now, we find markers that are specific to the new cells, and find clear DC markers +newcells.markers <- FindMarkers(pbmc3k.final, ident.1 = "NewCells", ident.2 = "CD14+ Mono", min.diff.pct = 0.3, only.pos = TRUE) +head(newcells.markers) +``` + +
    + Using `CellSelector` to Automatically Assign Cell Identities + + In addition to returning a vector of cell names, `CellSelector()` can also take the selected cells and assign a new identity to them, returning a Seurat object with the identity classes already set. This is done by passing the Seurat object used to make the plot into `CellSelector()`, as well as an identity class. As an example, we're going to select the same set of cells as before, and set their identity class to "selected" + +```{r ident2, eval=FALSE} +pbmc3k.final <- CellSelector(plot = plot, object = pbmc3k.final, ident = 'selected') +``` + +![](./assets/pbmc_select.gif) + +```{r ident2_hidden, echo=FALSE} +pbmc3k.final <- RenameIdents(pbmc3k.final, 'NewCells' = 'selected') +``` + +```{r ident2_levels} +levels(pbmc3k.final) +``` + +
    + +# Plotting Accessories + +Along with new functions add interactive functionality to plots, Seurat provides new accessory functions for manipulating and combining plots. + +```{r labelling} +# LabelClusters and LabelPoints will label clusters (a coloring variable) or individual points on a ggplot2-based scatter plot +plot <- DimPlot(pbmc3k.final, reduction = 'pca') + NoLegend() +LabelClusters(plot = plot, id = 'ident') +# Both functions support `repel`, which will intelligently stagger labels and draw connecting lines from the labels to the points or clusters +LabelPoints(plot = plot, points = TopCells(object = pbmc3k.final[['pca']]), repel = TRUE) +``` + +Plotting multiple plots was previously achieved with the `CombinePlot()` function. We are deprecating this functionality in favor of the [patchwork](https://patchwork.data-imaginist.com/) system. Below is a brief demonstration but please see the patchwork package website [here](https://patchwork.data-imaginist.com/) for more details and examples. + +```{r combining_plots, fig.height = 5} +plot1 <- DimPlot(pbmc3k.final) +plot2 <- FeatureScatter(pbmc3k.final, feature1 = 'LYZ', feature2 = 'CCL5') +# Combine two plots +plot1 + plot2 +# Remove the legend from all plots +(plot1 + plot2) & NoLegend() +``` + +```{r save.times, include=TRUE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_visualization_vignette_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/seurat5_weighted_nearest_neighbor_analysis.Rmd b/vignettes/seurat5_weighted_nearest_neighbor_analysis.Rmd new file mode 100644 index 000000000..64eb316a9 --- /dev/null +++ b/vignettes/seurat5_weighted_nearest_neighbor_analysis.Rmd @@ -0,0 +1,440 @@ +--- +title: "Weighted Nearest Neighbor Analysis" +output: + html_document: + theme: united + df_print: kable +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +--- + +```{r setup, include=TRUE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now) + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + message = FALSE, + warning = FALSE, + time_it = TRUE, + error = TRUE +) +``` +The simultaneous measurement of multiple modalities, known as multimodal analysis, represents an exciting frontier for single-cell genomics and necessitates new computational methods that can define cellular states based on multiple data types. The varying information content of each modality, even across cells in the same dataset, represents a pressing challenge for the analysis and integration of multimodal datasets. In ([Hao\*, Hao\* et al, Cell 2021](https://doi.org/10.1016/j.cell.2021.04.048)), we introduce 'weighted-nearest neighbor' (WNN) analysis, an unsupervised framework to learn the relative utility of each data type in each cell, enabling an integrative analysis of multiple modalities. + +This vignette introduces the WNN workflow for the analysis of multimodal single-cell datasets. The workflow consists of three steps + +* Independent preprocessing and dimensional reduction of each modality individually +* Learning cell-specific modality 'weights', and constructing a WNN graph that integrates the modalities +* Downstream analysis (i.e. visualization, clustering, etc.) of the WNN graph + +We demonstrate the use of WNN analysis to two single-cell multimodal technologies: CITE-seq and 10x multiome. We define the cellular states based on both modalities, instead of either individual modality. + + +# WNN analysis of CITE-seq, RNA + ADT +We use the CITE-seq dataset from ([Stuart\*, Butler\* et al, Cell 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8)), which consists of 30,672 scRNA-seq profiles measured alongside a panel of 25 antibodies from bone marrow. The object contains two assays, RNA and antibody-derived tags (ADT). + +To run this vignette please install Seurat v4, available on [CRAN](https://cran.r-project.org/web/packages/Seurat/index.html), and SeuratData, available on [GitHub](https://github.com/satijalab/seurat-data). + +```{r install, eval = FALSE} +install.packages("Seurat") +``` + +```{r, include=TRUE, cache=FALSE} +options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +``` + +```{r packages, cache=FALSE} +library(Seurat) +options(Seurat.object.assay.version = "v5") +library(SeuratData) +library(cowplot) +library(dplyr) +``` + +```{r} +InstallData("bmcite") +bm <- LoadData(ds = "bmcite") +bm[["ADT"]] <- CreateAssay5Object(bm[["ADT"]]$counts) +bm[["RNA"]] <- CreateAssay5Object(bm[["RNA"]]$counts) +``` + +We first perform pre-processing and dimensional reduction on both assays independently. We use standard normalization, but you can also use SCTransform or any alternative method. + +```{r pp.rna} +DefaultAssay(bm) <- 'RNA' +bm <- NormalizeData(bm) +bm <- FindVariableFeatures(bm) +bm <- ScaleData(bm) +bm <- RunPCA(bm) + +DefaultAssay(bm) <- 'ADT' +# we will use all ADT features for dimensional reduction +# we set a dimensional reduction name to avoid overwriting the +VariableFeatures(bm) <- rownames(bm[["ADT"]]) +bm <- NormalizeData(bm, normalization.method = 'CLR', margin = 2) %>% + ScaleData() %>% RunPCA(reduction.name = 'apca') +``` +For each cell, we calculate its closest neighbors in the dataset based on a weighted combination of RNA and protein similarities. The cell-specific modality weights and multimodal neighbors are calculated in a single function, which takes ~2 minutes to run on this dataset. We specify the dimensionality of each modality (similar to specifying the number of PCs to include in scRNA-seq clustering), but you can vary these settings to see that small changes have minimal effect on the overall results. + +```{r jc} +# Identify multimodal neighbors. These will be stored in the neighbors slot, +# and can be accessed using bm[['weighted.nn']] +# The WNN graph can be accessed at bm[["wknn"]], +# and the SNN graph used for clustering at bm[["wsnn"]] +# Cell-specific modality weights can be accessed at bm$RNA.weight +bm <- FindMultiModalNeighbors( + bm, reduction.list = list("pca", "apca"), + dims.list = list(1:30, 1:18), modality.weight.name = "RNA.weight" +) +``` + +We can now use these results for downstream analysis, such as visualization and clustering. For example, we can create a UMAP visualization of the data based on a weighted combination of RNA and protein data We can also perform graph-based clustering and visualize these results on the UMAP, alongside a set of cell annotations. + +```{r wumap, fig.width=10} +bm <- RunUMAP(bm, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_") +bm <- FindClusters(bm, graph.name = "wsnn", algorithm = 3, resolution = 2, verbose = FALSE) +``` + +```{r wumap.plot, fig.width = 10} +p1 <- DimPlot(bm, reduction = 'wnn.umap', label = TRUE, repel = TRUE, label.size = 2.5) + NoLegend() +p2 <- DimPlot(bm, reduction = 'wnn.umap', group.by = 'celltype.l2', label = TRUE, repel = TRUE, label.size = 2.5) + NoLegend() +p1 + p2 +``` + +We can also compute UMAP visualization based on only the RNA and protein data and compare. We find that the RNA analysis is more informative than the ADT analysis in identifying progenitor states (the ADT panel contains markers for differentiated cells), while the converse is true of T cell states (where the ADT analysis outperforms RNA). + +```{r compumap} +bm <- RunUMAP(bm, reduction = 'pca', dims = 1:30, assay = 'RNA', + reduction.name = 'rna.umap', reduction.key = 'rnaUMAP_') +bm <- RunUMAP(bm, reduction = 'apca', dims = 1:18, assay = 'ADT', + reduction.name = 'adt.umap', reduction.key = 'adtUMAP_') +``` + +```{r umapplot2, fig.width=10} +p3 <- DimPlot(bm, reduction = 'rna.umap', group.by = 'celltype.l2', label = TRUE, + repel = TRUE, label.size = 2.5) + NoLegend() +p4 <- DimPlot(bm, reduction = 'adt.umap', group.by = 'celltype.l2', label = TRUE, + repel = TRUE, label.size = 2.5) + NoLegend() +p3 + p4 +``` + +We can visualize the expression of canonical marker genes and proteins on the multimodal UMAP, which can assist in verifying the provided annotations: +```{r ftplot, fig.width = 10, fig.height = 7} +p5 <- FeaturePlot(bm, features = c("adt_CD45RA","adt_CD16","adt_CD161"), + reduction = 'wnn.umap', max.cutoff = 2, + cols = c("lightgrey","darkgreen"), ncol = 3) +p6 <- FeaturePlot(bm, features = c("rna_TRDC","rna_MPO","rna_AVP"), + reduction = 'wnn.umap', max.cutoff = 3, ncol = 3) +p5 / p6 +``` + +Finally, we can visualize the modality weights that were learned for each cell. Each of the populations with the highest RNA weights represent progenitor cells, while the populations with the highest protein weights represent T cells. This is in line with our biological expectations, as the antibody panel does not contain markers that can distinguish between different progenitor populations. + +```{r plotwts, fig.width=10} + VlnPlot(bm, features = "RNA.weight", group.by = 'celltype.l2', sort = TRUE, pt.size = 0.1) + + NoLegend() +``` + +```{r save.img, include=TRUE} +library(ggplot2) +plot <- VlnPlot(bm, features = "RNA.weight", group.by = 'celltype.l2', sort = TRUE, pt.size = 0.1) + + NoLegend() + xlab("") + ggtitle("RNA Modality Weights") + theme(plot.title = element_text(hjust = 0.5, size = 30), axis.text = element_text(size = 20)) + +ggsave(filename = "../output/images/weighted_nearest_neighbor_analysis.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +# WNN analysis of 10x Multiome, RNA + ATAC + +Here, we demonstrate the use of WNN analysis to a second multimodal technology, the 10x multiome RNA+ATAC kit. We use a dataset that is publicly available on the 10x website, where paired transcriptomes and ATAC-seq profiles are measured in 10,412 PBMCs. + +We use the same WNN methods as we use in the previous tab, where we apply integrated multimodal analysis to a CITE-seq dataset. In this example we will demonstrate how to: + +* Create a multimodal Seurat object with paired transcriptome and ATAC-seq profiles +* Perform weighted neighbor clustering on RNA+ATAC data in single cells +* Leverage both modalities to identify putative regulators of different cell types and states + +You can download the dataset from the 10x Genomics website [here](https://support.10xgenomics.com/single-cell-multiome-atac-gex/datasets/1.0.0/pbmc_granulocyte_sorted_10k). Please make sure to download the following files: + +* Filtered feature barcode matrix (HDF5) +* ATAC Per fragment information file (TSV.GZ) +* ATAC Per fragment information index (TSV.GZ index) + +Finally, in order to run the vignette, make sure the following packages are installed: + +* [Seurat v4](install.html) +* [Signac](https://satijalab.org/signac/) for the analysis of single-cell chromatin datasets +* [EnsDb.Hsapiens.v86](https://bioconductor.org/packages/release/data/annotation/html/EnsDb.Hsapiens.v86.html) for a set of annotations for hg38 +* [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) to help manipulate data tables + +```{r pkgs} +library(Seurat) +library(Signac) +library(EnsDb.Hsapiens.v86) +library(dplyr) +library(ggplot2) +``` + + + +We'll create a Seurat object based on the gene expression data, and then add in the ATAC-seq data as a second assay. You can explore the [Signac getting started vignette](https://satijalab.org/signac/articles/pbmc_vignette.html) for more information on the creation and processing of a ChromatinAssay object. + +```{r CreateObject} +# the 10x hdf5 file contains both data types. +inputdata.10x <- Read10X_h5("../data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5") + +# extract RNA and ATAC data +rna_counts <- inputdata.10x$`Gene Expression` +atac_counts <- inputdata.10x$Peaks + +# Create Seurat object +pbmc <- CreateSeuratObject(counts = rna_counts) +pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")$nCount_RNA +pbmc@meta.data[["percent.mt"]] <- as.numeric(pbmc@meta.data[["percent.mt"]]) + +# Now add in the ATAC-seq data +# we'll only use peaks in standard chromosomes +grange.counts <- StringToGRanges(rownames(atac_counts), sep = c(":", "-")) +grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts) +atac_counts <- atac_counts[as.vector(grange.use), ] +annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) +seqlevelsStyle(annotations) <- 'UCSC' +genome(annotations) <- "hg38" + +frag.file <- "../data/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz" +chrom_assay <- CreateChromatinAssay( + counts = atac_counts, + sep = c(":", "-"), + genome = 'hg38', + fragments = frag.file, + min.cells = 10, + annotation = annotations + ) +pbmc[["ATAC"]] <- chrom_assay +``` + +We perform basic QC based on the number of detected molecules for each modality as well as mitochondrial percentage. + +```{r QCObject, fig.width=10} +VlnPlot(pbmc, features = c("nCount_ATAC", "nCount_RNA","percent.mt"), ncol = 3, + log = TRUE, pt.size = 0) + NoLegend() + +pbmc <- subset( + x = pbmc, + subset = nCount_ATAC < 7e4 & + nCount_ATAC > 5e3 & + nCount_RNA < 25000 & + nCount_RNA > 1000 & + percent.mt < 20 +) +``` + +We next perform pre-processing and dimensional reduction on both assays independently, using standard approaches for RNA and ATAC-seq data. + +```{r IndependentAnalysis} +# RNA analysis +DefaultAssay(pbmc) <- "RNA" +pbmc <- SCTransform(pbmc, verbose = FALSE) %>% RunPCA() %>% RunUMAP(dims = 1:50, reduction.name = 'umap.rna', reduction.key = 'rnaUMAP_') + +# ATAC analysis +# We exclude the first dimension as this is typically correlated with sequencing depth +DefaultAssay(pbmc) <- "ATAC" +pbmc <- RunTFIDF(pbmc) +pbmc <- FindTopFeatures(pbmc, min.cutoff = 'q0') +pbmc <- RunSVD(pbmc) +pbmc <- RunUMAP(pbmc, reduction = 'lsi', dims = 2:50, reduction.name = "umap.atac", reduction.key = "atacUMAP_") +``` + +We calculate a WNN graph, representing a weighted combination of RNA and ATAC-seq modalities. We use this graph for UMAP visualization and clustering + +```{r MultiModalAnalysis} +pbmc <- FindMultiModalNeighbors(pbmc, reduction.list = list("pca", "lsi"), dims.list = list(1:50, 2:50)) +pbmc <- RunUMAP(pbmc, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_") +pbmc <- FindClusters(pbmc, graph.name = "wsnn", algorithm = 3, verbose = FALSE) +``` + +We annotate the clusters below. Note that you could also annotate the dataset using our supervised mapping pipelines, using either our [vignette](multimodal_reference_mapping.html), or [automated web tool, Azimuth](www.satijalab.org/azimuth). + +```{r Annotate, results = 'hide'} +# perform sub-clustering on cluster 6 to find additional structure +pbmc <- FindSubCluster(pbmc, cluster = 6, graph.name = "wsnn", algorithm = 3) +Idents(pbmc) <- "sub.cluster" +``` + +```{r Annotate2} +# add annotations +pbmc <- RenameIdents(pbmc, '19' = 'pDC','20' = 'HSPC','15' = 'cDC') +pbmc <- RenameIdents(pbmc, '0' = 'CD14 Mono', '9' ='CD14 Mono', '5' = 'CD16 Mono') +pbmc <- RenameIdents(pbmc, '10' = 'Naive B', '11' = 'Intermediate B', '17' = 'Memory B', '21' = 'Plasma') +pbmc <- RenameIdents(pbmc, '7' = 'NK') +pbmc <- RenameIdents(pbmc, '4' = 'CD4 TCM', '13'= "CD4 TEM", '3' = "CD4 TCM", '16' ="Treg", '1' ="CD4 Naive", '14' = "CD4 Naive") +pbmc <- RenameIdents(pbmc, '2' = 'CD8 Naive', '8'= "CD8 Naive", '12' = 'CD8 TEM_1', '6_0' = 'CD8 TEM_2', '6_1' ='CD8 TEM_2', '6_4' ='CD8 TEM_2') +pbmc <- RenameIdents(pbmc, '18' = 'MAIT') +pbmc <- RenameIdents(pbmc, '6_2' ='gdT', '6_3' = 'gdT') +pbmc$celltype <- Idents(pbmc) +``` + +We can visualize clustering based on gene expression, ATAC-seq, or WNN analysis. The differences are more subtle than in the previous analysis (you can explore the weights, which are more evenly split than in our CITE-seq example), but we find that WNN provides the clearest separation of cell states. + +```{r UMAPs, fig.width=10} +p1 <- DimPlot(pbmc, reduction = "umap.rna", group.by = "celltype", label = TRUE, label.size = 2.5, repel = TRUE) + ggtitle("RNA") +p2 <- DimPlot(pbmc, reduction = "umap.atac", group.by = "celltype", label = TRUE, label.size = 2.5, repel = TRUE) + ggtitle("ATAC") +p3 <- DimPlot(pbmc, reduction = "wnn.umap", group.by = "celltype", label = TRUE, label.size = 2.5, repel = TRUE) + ggtitle("WNN") +p1 + p2 + p3 & NoLegend() & theme(plot.title = element_text(hjust = 0.5)) +``` + +For example, the ATAC-seq data assists in the separation of CD4 and CD8 T cell states. This is due to the presence of multiple loci that exhibit differential accessibility between different T cell subtypes. For example, we can visualize 'pseudobulk' tracks of the CD8A locus alongside violin plots of gene expression levels, using tools in the [Signac visualization vignette](https://satijalab.org/signac/articles/visualization.html). + +```{r coverageplotcd8, fig.width=10} +## to make the visualization easier, subset T cell clusters +celltype.names <- levels(pbmc) +tcell.names <- grep("CD4|CD8|Treg", celltype.names,value = TRUE) +tcells <- subset(pbmc, idents = tcell.names) +CoveragePlot(tcells, region = 'CD8A', features = 'CD8A', assay = 'ATAC', expression.assay = 'SCT', peaks = FALSE) +``` + +Next, we will examine the accessible regions of each cell to determine enriched motifs. As described in the [Signac motifs vignette](https://satijalab.org/signac/articles/motif_vignette.html), there are a few ways to do this, but we will use the [chromVAR](https://www.nature.com/articles/nmeth.4401) package from the Greenleaf lab. This calculates a per-cell accessibility score for known motifs, and adds these scores as a third assay (`chromvar`) in the Seurat object. + +To continue, please make sure you have the following packages installed. + +* [chromVAR](https://bioconductor.org/packages/release/bioc/html/chromVAR.html) for the analysis of motif accessibility in scATAC-seq +* [presto](https://github.com/immunogenomics/presto) for fast differential expression analyses. +* [TFBSTools](http://www.bioconductor.org/packages/release/bioc/html/TFBSTools.html) for TFBS analysis +* [JASPAR2020](https://bioconductor.org/packages/release/data/annotation/html/JASPAR2020.html) for JASPAR motif models +* [motifmatchr](https://www.bioconductor.org/packages/release/bioc/html/motifmatchr.html) for motif matching +* [BSgenome.Hsapiens.UCSC.hg38](https://bioconductor.org/packages/release/data/annotation/html/BSgenome.Hsapiens.UCSC.hg38.html) for chromVAR + +
    + **Install command for all dependencies** + +```{r install.deps, eval=FALSE} +remotes::install_github("immunogenomics/presto") +BiocManager::install(c("chromVAR", "TFBSTools", "JASPAR2020", "motifmatchr", "BSgenome.Hsapiens.UCSC.hg38")) +``` + +
    + +```{r chromVar} +library(chromVAR) +library(JASPAR2020) +library(TFBSTools) +library(motifmatchr) +library(BSgenome.Hsapiens.UCSC.hg38) + +# Scan the DNA sequence of each peak for the presence of each motif, and create a Motif object +DefaultAssay(pbmc) <- "ATAC" +pwm_set <- getMatrixSet(x = JASPAR2020, opts = list(species = 9606, all_versions = FALSE)) +motif.matrix <- CreateMotifMatrix(features = granges(pbmc), pwm = pwm_set, genome = 'hg38', use.counts = FALSE) +motif.object <- CreateMotifObject(data = motif.matrix, pwm = pwm_set) +pbmc <- SetAssayData(pbmc, assay = 'ATAC', slot = 'motifs', new.data = motif.object) + +# Note that this step can take 30-60 minutes +pbmc <- RunChromVAR( + object = pbmc, + genome = BSgenome.Hsapiens.UCSC.hg38 +) +``` + +Finally, we explore the multimodal dataset to identify key regulators of each cell state. Paired data provides a unique opportunity to identify transcription factors (TFs) that satisfy multiple criteria, helping to narrow down the list of putative regulators to the most likely candidates. We aim to identify TFs whose expression is enriched in multiple cell types in the RNA measurements, but *also* have enriched accessibility for their motifs in the ATAC measurements. + +As an example and positive control, the CCAAT Enhancer Binding Protein (CEBP) family of proteins, including the TF CEBPB, have been repeatedly shown to play important roles in the differentiation and function of myeloid cells including monocytes and dendritic cells. We can see that both the expression of the CEBPB, and the accessibility of the MA0466.2.4 motif (which encodes the binding site for CEBPB), are both enriched in monocytes. + +```{r CEBPB, fig.width=10} +#returns MA0466.2 +motif.name <- ConvertMotifID(pbmc, name = 'CEBPB') +gene_plot <- FeaturePlot(pbmc, features = "sct_CEBPB", reduction = 'wnn.umap') +motif_plot <- FeaturePlot(pbmc, features = motif.name, min.cutoff = 0, cols = c("lightgrey", "darkred"), reduction = 'wnn.umap') +gene_plot | motif_plot +``` + +We'd like to quantify this relationship, and search across all cell types to find similar examples. To do so, we will use the `presto` package to perform fast differential expression. We run two tests: one using gene expression data, and the other using chromVAR motif accessibilities. `presto` calculates a p-value based on the Wilcox rank sum test, which is also the default test in Seurat, and we restrict our search to TFs that return significant results in both tests. + +`presto` also calculates an "AUC" statistic, which reflects the power of each gene (or motif) to serve as a marker of cell type. A maximum AUC value of 1 indicates a perfect marker. Since the AUC statistic is on the same scale for both genes and motifs, we take the average of the AUC values from the two tests, and use this to rank TFs for each cell type: + +```{r presto} +markers_rna <- presto:::wilcoxauc.Seurat(X = pbmc, group_by = 'celltype', assay = 'data', seurat_assay = 'SCT') +markers_motifs <- presto:::wilcoxauc.Seurat(X = pbmc, group_by = 'celltype', assay = 'data', seurat_assay = 'chromvar') +motif.names <- markers_motifs$feature +colnames(markers_rna) <- paste0("RNA.", colnames(markers_rna)) +colnames(markers_motifs) <- paste0("motif.", colnames(markers_motifs)) +markers_rna$gene <- markers_rna$RNA.feature +markers_motifs$gene <- ConvertMotifID(pbmc, id = motif.names) +``` + +```{r defineTests} +# a simple function to implement the procedure above +topTFs <- function(celltype, padj.cutoff = 1e-2) { + ctmarkers_rna <- dplyr::filter( + markers_rna, RNA.group == celltype, RNA.padj < padj.cutoff, RNA.logFC > 0) %>% + arrange(-RNA.auc) + ctmarkers_motif <- dplyr::filter( + markers_motifs, motif.group == celltype, motif.padj < padj.cutoff, motif.logFC > 0) %>% + arrange(-motif.auc) + top_tfs <- inner_join( + x = ctmarkers_rna[, c(2, 11, 6, 7)], + y = ctmarkers_motif[, c(2, 1, 11, 6, 7)], by = "gene" + ) + top_tfs$avg_auc <- (top_tfs$RNA.auc + top_tfs$motif.auc) / 2 + top_tfs <- arrange(top_tfs, -avg_auc) + return(top_tfs) +} +``` + +We can now compute, and visualize, putative regulators for any cell type. We recover well-established regulators, including [TBX21 for NK cells](https://www.sciencedirect.com/science/article/pii/S1074761304000767), [IRF4 for plasma cells](https://pubmed.ncbi.nlm.nih.gov/16767092/), [SOX4 for hematopoietic progenitors](https://ashpublications.org/blood/article/124/21/1577/88774/Sox4-Is-Required-for-the-Formation-and-Maintenance), [EBF1 and PAX5 for B cells](https://www.nature.com/articles/ni.2641), [IRF8 and TCF4 for pDC](https://www.nature.com/articles/s41590-018-0136-9). We believe that similar strategies can be used to help focus on a set of putative regulators in diverse systems. + +```{r NK, fig.width=10} +# identify top markers in NK and visualize +head(topTFs("NK"), 3) + +motif.name <- ConvertMotifID(pbmc, name = 'TBX21') +gene_plot <- FeaturePlot(pbmc, features = "sct_TBX21", reduction = 'wnn.umap') +motif_plot <- FeaturePlot(pbmc, features = motif.name, min.cutoff = 0, cols = c("lightgrey", "darkred"), reduction = 'wnn.umap') +gene_plot | motif_plot +``` + +```{r pDC, fig.width=10} +# identify top markers in pDC and visualize +head(topTFs("pDC"), 3) + +motif.name <- ConvertMotifID(pbmc, name = 'TCF4') +gene_plot <- FeaturePlot(pbmc, features = "sct_TCF4", reduction = 'wnn.umap') +motif_plot <- FeaturePlot(pbmc, features = motif.name, min.cutoff = 0, cols = c("lightgrey", "darkred"), reduction = 'wnn.umap') +gene_plot | motif_plot +``` + +```{r CD16Mono, fig.width=10} +# identify top markers in HSPC and visualize +head(topTFs("CD16 Mono"),3) + +motif.name <- ConvertMotifID(pbmc, name = 'SPI1') +gene_plot <- FeaturePlot(pbmc, features = "sct_SPI1", reduction = 'wnn.umap') +motif_plot <- FeaturePlot(pbmc, features = motif.name, min.cutoff = 0, cols = c("lightgrey", "darkred"), reduction = 'wnn.umap') +gene_plot | motif_plot +``` + +```{r moreTFS, fig.width=10} +# identify top markers in other cell types +head(topTFs("Naive B"), 3) +head(topTFs("HSPC"), 3) +head(topTFs("Plasma"), 3) +``` + + +```{r save.times, include=TRUE} +# write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_weighted_nearest_neighbor_analysis_times.csv") +print("done") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/spatial_vignette.Rmd b/vignettes/spatial_vignette.Rmd index cad2ef62e..7edb15b34 100644 --- a/vignettes/spatial_vignette.Rmd +++ b/vignettes/spatial_vignette.Rmd @@ -27,7 +27,8 @@ knitr::opts_chunk$set( message = FALSE, warning = FALSE, fig.width = 10, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -99,7 +100,7 @@ wrap_plots(plot1, plot2) These plots demonstrate that the variance in molecular counts across spots is not just technical in nature, but also is dependent on the tissue anatomy. For example, regions of the tissue that are depleted for neurons (such as the cortical white matter), reproducibly exhibit lower molecular counts. As a result, standard approaches (such as the `LogNormalize()` function), which force each data point to have the same underlying 'size' after normalization, can be problematic. -As an alternative, we recommend using sctransform (Hafemeister and Satija, Genome Biology 2019), which which builds regularized negative binomial models of gene expression in order to account for technical artifacts while preserving biological variance. For more details on sctransform, please see the paper [here](https://doi.org/10.1186/s13059-019-1874-1) and the Seurat vignette [here](sctransform_vignette.html). sctransform normalizes the data, detects high-variance features, and stores the data in the `SCT` assay. +As an alternative, we recommend using sctransform (Hafemeister and Satija, Genome Biology 2019), which which builds regularized negative binomial models of gene expression in order to account for technical artifacts while preserving biological variance. For more details on sctransform, please see the paper [here](https://genomebiology-biomedcentral-com/articles/10.1186/s13059-021-02584-9) and the Seurat vignette [here](sctransform_vignette.html). sctransform normalizes the data, detects high-variance features, and stores the data in the `SCT` assay. ```{r preprocess} brain <- SCTransform(brain, assay = "Spatial", verbose = FALSE) @@ -140,7 +141,7 @@ SpatialFeaturePlot(brain, features = c("Hpca", "Ttr")) ``` -```{r save.img, include=FALSE} +```{r save.img, include=TRUE} library(ggplot2) plot <- SpatialFeaturePlot(brain, features = c("Ttr")) + theme(legend.text = element_text(size = 0), legend.title = element_text(size = 20), legend.key.size = unit(1, "cm")) @@ -233,12 +234,12 @@ An alternative approach, implemented in `FindSpatiallyVariables()`, is to search We note that there are multiple methods in the literature to accomplish this task, including [SpatialDE](https://www.nature.com/articles/nmeth.4636), and [Splotch](https://www.biorxiv.org/content/10.1101/757096v1.article-metrics). We encourage interested users to explore these methods, and hope to add support for them in the near future. ```{r spatial.vf} -brain <- FindSpatiallyVariableFeatures(brain, assay = 'SCT', features = VariableFeatures(brain)[1:1000], selection.method = 'markvariogram') +brain <- FindSpatiallyVariableFeatures(brain, assay = 'SCT', features = VariableFeatures(brain)[1:1000], selection.method = 'moransi') ``` Now we visualize the expression of the top 6 features identified by this measure. ```{r spatial.vf.plot, fig.height=8} -top.features <- head(SpatiallyVariableFeatures(brain, selection.method = 'markvariogram'),6) +top.features <- head(SpatiallyVariableFeatures(brain, selection.method = 'moransi'),6) SpatialFeaturePlot(brain, features = top.features, ncol = 3, alpha = c(0.1, 1)) ``` @@ -264,12 +265,12 @@ p1 + p2 ## Integration with single-cell data At ~50um, spots from the visium assay will encompass the expression profiles of multiple cells. For the growing list of systems where scRNA-seq data is available, users may be interested to 'deconvolute' each of the spatial voxels to predict the underlying composition of cell types. In preparing this vignette, we tested a wide variety of decovonlution and integration methods, using a [reference scRNA-seq dataset](https://www.nature.com/articles/nn.4216) of ~14,000 adult mouse cortical cell taxonomy from the Allen Institute, generated with the SMART-Seq2 protocol. -We consistently found superior performance using integration methods (as opposed to deconvolution methods), likely because of substantially different noise models that characterize spatial and single-cell datasets, and integration methods are specifiically designed to be robust to these differences. We therefore apply the 'anchor'-based integration workflow introduced in [Seurat v3](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8), that enables the probabilistic transfer of annotations from a reference to a query set. We therefore follow the label transfer workflow introduced [here](reference_mapping.html), taking advantage of sctransform normalization, but anticipate new methods to be developed to accomplish this task. +We consistently found superior performance using integration methods (as opposed to deconvolution methods), likely because of substantially different noise models that characterize spatial and single-cell datasets, and integration methods are specifiically designed to be robust to these differences. We therefore apply the 'anchor'-based integration workflow introduced in [Seurat v3](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8), that enables the probabilistic transfer of annotations from a reference to a query set. We therefore follow the label transfer workflow introduced [here](integration_mapping.html), taking advantage of sctransform normalization, but anticipate new methods to be developed to accomplish this task. We first load the data (download available [here](https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1)), pre-process the scRNA-seq reference, and then perform label transfer. The procedure outputs, for each spot, a probabilistic classification for each of the scRNA-seq derived classes. We add these predictions as a new assay in the Seurat object. ```{r sc.data} -allen_reference <- readRDS("../data/allen_cortex.rds") +allen_reference <- readRDS("/brahms/shared/vignette-data/allen_cortex.rds") ``` ```{r sc.data2} @@ -302,8 +303,8 @@ SpatialFeaturePlot(cortex, features = c("L2/3 IT", "L4"), pt.size.factor = 1.6, Based on these prediction scores, we can also predict *cell types* whose location is spatially restricted. We use the same methods based on marked point processes to define spatially variable features, but use the cell type prediction scores as the "marks" rather than gene expression. ```{r sc.data8, fig.height = 10} -cortex <- FindSpatiallyVariableFeatures(cortex, assay = "predictions", selection.method = "markvariogram", features = rownames(cortex), r.metric = 5, slot = "data") -top.clusters <- head(SpatiallyVariableFeatures(cortex), 4) +cortex <- FindSpatiallyVariableFeatures(cortex, assay = "predictions", selection.method = "moransi", features = rownames(cortex), r.metric = 5, slot = "data") +top.clusters <- head(SpatiallyVariableFeatures(cortex, selection.method = "moransi"), 4) SpatialPlot(object = cortex, features = top.clusters, ncol = 2) ``` @@ -398,8 +399,8 @@ slide.seq <- FindClusters(slide.seq, resolution = 0.3, verbose = FALSE) We can then visualize the results of the clustering either in UMAP space (with `DimPlot()`) or in the bead coordinate space with `SpatialDimPlot()`. ```{r dim.plots.ss,fig.height=5} -plot1 <- DimPlot(slide.seq, reduction = "umap", label = TRUE) -plot2 <- SpatialDimPlot(slide.seq, stroke = 0) +plot1 <- DimPlot(slide.seq, reduction = "umap", label = TRUE) +plot2 <- SpatialDimPlot(slide.seq, stroke = 0) plot1 + plot2 SpatialDimPlot(slide.seq, cells.highlight = CellsByIdentities(object = slide.seq, idents = c(1, 6, 13)), facet.highlight = TRUE) ``` @@ -409,7 +410,8 @@ SpatialDimPlot(slide.seq, cells.highlight = CellsByIdentities(object = slide.seq To facilitate cell-type annotation of the Slide-seq dataset, we are leveraging an existing mouse single-cell RNA-seq hippocampus dataset, produced in [Saunders\*, Macosko\*, et al. 2018](https://doi.org/10.1016/j.cell.2018.07.028). The data is available for download as a processed Seurat object [here](https://www.dropbox.com/s/cs6pii5my4p3ke3/mouse_hippocampus_reference.rds?dl=0), with the raw count matrices available on the [DropViz website](http://dropviz.org/). ```{r ref.saunders} -ref <- readRDS("../data/mouse_hippocampus_reference.rds") +ref <- readRDS("/brahms/shared/vignette-data/mouse_hippocampus_reference.rds") +ref <- UpdateSeuratObject(ref) ``` The original annotations from the paper are provided in the cell metadata of the Seurat object. These annotations are provided at several "resolutions", from broad classes (`ref$class`) to subclusters within celltypes (`ref$subcluster`). For the purposes of this vignette, we'll work off of a modification of the celltype annotations (`ref$celltype`) which we felt struck a good balance. @@ -452,6 +454,59 @@ Now we visualize the expression of the top 6 features identified by Moran's I. SpatialFeaturePlot(slide.seq, features = head(SpatiallyVariableFeatures(slide.seq, selection.method = "moransi"), 6), ncol = 3, alpha = c(0.1, 1), max.cutoff = "q95") ``` +## Spatial deconvolution using RCTD + +While `FindTransferAnchors` can be used to integrate spot-level data from spatial transcriptomic datasets, Seurat v5 also includes support for the [Robust Cell Type Decomposition](https://www.nature.com/articles/s41587-021-00830-w), a computational approach to deconvolve spot-level data from spatial datasets, when provided with an scRNA-seq reference. RCTD has been shown to accurately annotate spatial data from a variety of technologies, including SLIDE-seq, Visium, and the 10x Xenium in-situ spatial platform. + +To run RCTD, we first install the `spacexr` package from GitHub which implements RCTD. + +```{r, eval=FALSE} +devtools::install_github("dmcable/spacexr", build_vignettes = FALSE) +``` + +Counts, cluster, and spot information is extracted from the Seurat query and reference objects to construct `Reference` and `SpatialRNA` objects used by RCTD for annotation. + +```{r rctd.setup, warning=FALSE, results=FALSE} +library(spacexr) + +# set up reference +ref <- readRDS("../data/mouse_hippocampus_reference.rds") +ref <- UpdateSeuratObject(ref) +Idents(ref) <- "celltype" + +# extract information to pass to the RCTD Reference function +counts <- ref[["RNA"]]$counts +cluster <- as.factor(ref$celltype) +names(cluster) <- colnames(ref) +nUMI <- ref$nCount_RNA +names(nUMI) <- colnames(ref) +reference <- Reference(counts, cluster, nUMI) + +# set up query with the RCTD function SpatialRNA +slide.seq <- SeuratData::LoadData("ssHippo") +counts <- slide.seq[["Spatial"]]$counts +coords <- GetTissueCoordinates(slide.seq) +colnames(coords) <- c("x", "y") +coords[is.na(colnames(coords))] <- NULL +query <- SpatialRNA(coords, counts, colSums(counts)) +``` + +Using the `reference` and `query` object, we annotate the dataset and add the cell type labels to the query Seurat object. RCTD parallelizes well, so multiple cores can be specified for faster performance. + +```{r run.rctd, warning=FALSE, results=FALSE} +RCTD <- create.RCTD(query, reference, max_cores = 8) +RCTD <- run.RCTD(RCTD, doublet_mode = 'doublet') +slide.seq <- AddMetaData(slide.seq, metadata = RCTD@results$results_df) +``` + +Next, plot the RCTD annotations. Because we ran RCTD in doublet mode, the algorithm assigns a `first_type` and `second_type` for each barcode or spot. + +```{r rctd_results, fig.height=8, fig.width=14} +p1 <- SpatialDimPlot(slide.seq, group.by = "first_type") +p2 <- SpatialDimPlot(slide.seq, group.by = "second_type") +p1 | p2 +``` + ```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/spatial_vignette_times.csv") ``` diff --git a/vignettes/spatial_vignette_2.Rmd b/vignettes/spatial_vignette_2.Rmd index 90976ede8..f6d1524b1 100644 --- a/vignettes/spatial_vignette_2.Rmd +++ b/vignettes/spatial_vignette_2.Rmd @@ -26,7 +26,8 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` @@ -41,7 +42,7 @@ We update the Seurat infrastructure to enable the analysis, visualization, and e * Nanostring CosMx Spatial Molecular Imager (FFPE Human Lung) * Akoya CODEX (Human Lymph Node) -First, we install the updated versions of Seurat and SeuratObject that support this infrastructure, as well as other packages necessary for this vignette. +First, we load the packages necessary for this vignette. ```{r init, message=FALSE, warning=FALSE} library(Seurat) @@ -61,7 +62,7 @@ We use the `LoadVizgen()` function, which we have written to read in the output ```{r, message=FALSE, warning=FALSE} # Loading segmentations is a slow process and multi processing with the future pacakge is recommended -vizgen.obj <- LoadVizgen(data.dir = "/brahms/hartmana/spatial_vignette_data/vizgen/s2r1/", fov = "s2r1") +vizgen.obj <- LoadVizgen(data.dir = "../data/vizgen/s2r1/", fov = "s2r1") ``` The next pieces of information are specific to imaging assays, and is stored in the images slot of the resulting Seurat object: @@ -282,6 +283,309 @@ We can now use `ImageDimPlot()` to color the cell positions colored by the clust ImageDimPlot(xenium.obj, cols = "polychrome", size = 0.75) ``` +# Mouse Brain: 10x Genomics Xenium In Situ + +In this section we'll analyze data produced by the Xenium platform. The vignette demonstrates how to load the per-transcript location data, cell x gene matrix, cell segmentation, and cell centroid information available in the Xenium outputs. The resulting Seurat object will contain the gene expression profile of each cell, the centroid and boundary of each cell, and the location of each individual detected transcript. The per-cell gene expression profiles are similar to standard single-cell RNA-seq and can be analyzed using the same tools. + +This uses the `Tiny subset` dataset from 10x Genomics provided in the [Fresh Frozen Mouse Brain for Xenium Explorer Demo](https://www.10xgenomics.com/resources/datasets/fresh-frozen-mouse-brain-for-xenium-explorer-demo-1-standard) which can be downloaded as described below. These analysis steps are also compatible with the larger `Full coronal section`, but will take longer to execute. + +```{bash, eval=FALSE} +wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP/Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip +unzip Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip +``` + +First we read in the dataset and create a Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. This step should take about a minute. + +```{r load.xenium, results='hide'} +path <- "/brahms/hartmana/spatial_sensitivity_comparison/10x_xenium/xenium_tiny_subset" + +# Load the Xenium data +xenium.obj <- LoadXenium(path, fov = "fov") + +# remove cells with 0 counts +xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) +``` + +Spatial information is loaded into slots of the Seurat object, labelled by the name of "field of view" (FOV) being loaded. Initially all the data is loaded into the FOV named `fov`. Later, we will make a cropped FOV that zooms into a region of interest. + +Standard QC plots provided by Seurat are available via the `Xenium` assay. Here are violin plots of genes per cell (`nFeature_Xenium`) and transcript counts per cell (`nCount_Xenium`) +```{r vlnplot.xenium} +VlnPlot(xenium.obj, features = c("nFeature_Xenium", "nCount_Xenium"), ncol = 2, pt.size = 0) +``` + +Next, we plot the positions of the pan-inhibitory neuron marker Gad1, inhibitory neuron sub-type markers Pvalb, and Sst, and astrocyte marker Gfap on the tissue using `ImageDimPlot()`. +```{r p2.xenium, fig.width=10, fig.height=8} +ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Sst", "Pvalb", "Gfap"), nmols = 20000) +``` + +```{r save.img, include=FALSE} +library(ggplot2) +plot <- ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Gfap"), nmols = 40000, alpha=0.01, dark.background = F, mols.alpha = 0.6) + coord_flip() + scale_x_reverse() + NoLegend() +ggsave(filename = "../output/images/spatial_vignette_2.jpg", height = 7, width = 12, plot = plot, quality = 50) +``` + +Here we visualize the expression level of some key layer marker genes at the per-cell level using `ImageFeaturePlot()` which is analogous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. We manually adjust the `max.cutoff` for each gene to roughly the 90th percentile (which can be specified with `max.cutoff='q90'`) of it's count distribution to improve contrast. +```{r mat.xenium, message=FALSE, warning=FALSE, fig.width=12, fig.height=12} +ImageFeaturePlot(xenium.obj, features = c("Cux2", "Rorb", "Bcl11b", "Foxp2"), max.cutoff = c(25, 35, 12, 10), size = 0.75, cols = c("white", "red")) +``` + +We can zoom in on a chosen area with the `Crop()` function. Once zoomed-in, we can visualize cell segmentation boundaries along with individual molecules. +```{r cropping.xenium, message=FALSE, warning=FALSE, fig.width=10, fig.height=8} +cropped.coords <- Crop(xenium.obj[["fov"]], x = c(1200, 2900), y = c(3750, 4550), coords = "plot") + +xenium.obj[["zoom"]] <- cropped.coords + +# visualize cropped area with cell segmentations & selected molecules +DefaultBoundary(xenium.obj[["zoom"]]) <- "segmentation" +ImageDimPlot(xenium.obj, fov = "zoom", + axes = TRUE, border.color = "white", border.size = 0.1, + cols = "polychrome", coord.fixed = FALSE, + molecules = c("Gad1", "Sst", "Npy2r", "Pvalb", "Nrn1"), nmols = 10000) +``` + +Next, we use SCTransform for normalization followed by standard dimensionality reduction and clustering. This step takes about 5 minutes from start to finish. +```{r unsupervised.xenium, results='hide'} +xenium.obj <- SCTransform(xenium.obj, assay = "Xenium") +xenium.obj <- RunPCA(xenium.obj, npcs = 30, features = rownames(xenium.obj)) +xenium.obj <- RunUMAP(xenium.obj, dims = 1:30) +xenium.obj <- FindNeighbors(xenium.obj, reduction = "pca", dims = 1:30) +xenium.obj <- FindClusters(xenium.obj, resolution = 0.3) +``` + +We can then visualize the results of the clustering by coloring each cell according to its cluster either in UMAP space with `DimPlot()` or overlaid on the image with `ImageDimPlot()`. +```{r umap.xenium, fig.width=10, fig.height=7} +DimPlot(xenium.obj) +``` + +We can visualize the expression level of the markers we looked at earlier on the UMAP coordinates. +```{r features.xenium, fig.width=8, fig.height=10} +FeaturePlot(xenium.obj, features = c("Cux2", "Bcl11b", "Foxp2", "Gad1", "Sst", "Gfap")) +``` + +We can now use `ImageDimPlot()` to color the cell positions colored by the cluster labels determined in the previous step. +```{r clusters.xenium, fig.width=13, fig.height=13} +ImageDimPlot(xenium.obj, cols = "polychrome", size = 0.75) +``` + +# Mouse Brain: 10x Genomics Xenium In Situ + +In this section we'll analyze data produced by the Xenium platform. The vignette demonstrates how to load the per-transcript location data, cell x gene matrix, cell segmentation, and cell centroid information available in the Xenium outputs. The resulting Seurat object will contain the gene expression profile of each cell, the centroid and boundary of each cell, and the location of each individual detected transcript. The per-cell gene expression profiles are similar to standard single-cell RNA-seq and can be analyzed using the same tools. + +This uses the `Tiny subset` dataset from 10x Genomics provided in the [Fresh Frozen Mouse Brain for Xenium Explorer Demo](https://www.10xgenomics.com/resources/datasets/fresh-frozen-mouse-brain-for-xenium-explorer-demo-1-standard) which can be downloaded as described below. These analysis steps are also compatible with the larger `Full coronal section`, but will take longer to execute. + +```{bash, eval=FALSE} +wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP/Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip +unzip Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip +``` + +First we read in the dataset and create a Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. This step should take about a minute. + +```{r load.xenium, results='hide'} +path <- "../data/xenium_tiny_subset" +# Load the Xenium data +xenium.obj <- LoadXenium(path, fov = "fov") +# remove cells with 0 counts +xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) +``` + +Spatial information is loaded into slots of the Seurat object, labelled by the name of "field of view" (FOV) being loaded. Initially all the data is loaded into the FOV named `fov`. Later, we will make a cropped FOV that zooms into a region of interest. + +Standard QC plots provided by Seurat are available via the `Xenium` assay. Here are violin plots of genes per cell (`nFeature_Xenium`) and transcript counts per cell (`nCount_Xenium`) +```{r vlnplot.xenium} +VlnPlot(xenium.obj, features = c("nFeature_Xenium", "nCount_Xenium"), ncol = 2, pt.size = 0) +``` + +Next, we plot the positions of the pan-inhibitory neuron marker Gad1, inhibitory neuron sub-type markers Pvalb, and Sst, and astrocyte marker Gfap on the tissue using `ImageDimPlot()`. +```{r p2.xenium, fig.width=10, fig.height=8} +ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Sst", "Pvalb", "Gfap"), nmols = 20000) +``` + +```{r save.img, include=FALSE} +library(ggplot2) +plot <- ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Gfap"), nmols = 40000, alpha=0.01, dark.background = F, mols.alpha = 0.6) + coord_flip() + scale_x_reverse() + NoLegend() +ggsave(filename = "../output/images/spatial_vignette_2.jpg", height = 5, width = 9, plot = plot) +``` + +Here we visualize the expression level of some key layer marker genes at the per-cell level using `ImageFeaturePlot()` which is analogous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. We manually adjust the `max.cutoff` for each gene to roughly the 90th percentile (which can be specified with `max.cutoff='q90'`) of it's count distribution to improve contrast. +```{r mat.xenium, message=FALSE, warning=FALSE, fig.width=12, fig.height=12} +ImageFeaturePlot(xenium.obj, features = c("Cux2", "Rorb", "Bcl11b", "Foxp2"), max.cutoff = c(25, 35, 12, 10), size = 0.75, cols = c("white", "red")) +``` + +We can zoom in on a chosen area with the `Crop()` function. Once zoomed-in, we can visualize cell segmentation boundaries along with individual molecules. +```{r cropping.xenium, message=FALSE, warning=FALSE, fig.width=10, fig.height=8} +cropped.coords <- Crop(xenium.obj[["fov"]], x = c(1200, 2900), y = c(3750, 4550), coords = "plot") +xenium.obj[["zoom"]] <- cropped.coords +# visualize cropped area with cell segmentations & selected molecules +DefaultBoundary(xenium.obj[["zoom"]]) <- "segmentation" +ImageDimPlot(xenium.obj, fov = "zoom", + axes = TRUE, border.color = "white", border.size = 0.1, + cols = "polychrome", coord.fixed = FALSE, + molecules = c("Gad1", "Sst", "Npy2r", "Pvalb", "Nrn1"), nmols = 10000) +``` + +Next, we use SCTransform for normalization followed by standard dimensionality reduction and clustering. This step takes about 5 minutes from start to finish. +```{r unsupervised.xenium, results='hide'} +xenium.obj <- SCTransform(xenium.obj, assay = "Xenium") +xenium.obj <- RunPCA(xenium.obj, npcs = 30, features = rownames(xenium.obj)) +xenium.obj <- RunUMAP(xenium.obj, dims = 1:30) +xenium.obj <- FindNeighbors(xenium.obj, reduction = "pca", dims = 1:30) +xenium.obj <- FindClusters(xenium.obj, resolution = 0.3) +``` + +We can then visualize the results of the clustering by coloring each cell according to its cluster either in UMAP space with `DimPlot()` or overlaid on the image with `ImageDimPlot()`. +```{r umap.xenium, fig.width=10, fig.height=7} +DimPlot(xenium.obj) +``` + +We can visualize the expression level of the markers we looked at earlier on the UMAP coordinates. +```{r features.xenium, fig.width=8, fig.height=10} +FeaturePlot(xenium.obj, features = c("Cux2", "Bcl11b", "Foxp2", "Gad1", "Sst", "Gfap")) +``` + +We can now use `ImageDimPlot()` to color the cell positions colored by the cluster labels determined in the previous step. +```{r clusters.xenium, fig.width=13, fig.height=13} +ImageDimPlot(xenium.obj, cols = "polychrome", size = 0.75) +``` + +Using the positional information of each cell, we compute spatial niches. +We use a cortex reference from the the Allen Brain Institute to annotate cells, so we first crop the dataset to the cortex. +Below, we use Slc17a7 expression to help determine the cortical region. + +```{r, fig.width=5, fig.height=5, warning=FALSE} +xenium.obj <- LoadXenium("../data/xenium_tiny_subset") +p1 <- ImageFeaturePlot(xenium.obj, features = "Slc17a7", axes = TRUE, max.cutoff = "q90") +p1 +``` + +```{r resolve.crop, fig.width=5, fig.height=7, warning=FALSE} +crop <- Crop(xenium.obj[["fov"]], x=c(600, 2100), y=c(900, 4700)) +xenium.obj[["crop"]] <- crop +p2 <- ImageFeaturePlot( + xenium.obj, + fov = "crop", + features = "Slc17a7", + size = 1, + axes = TRUE, + max.cutoff = "q90") +p2 +``` + +Annotation of spatial datasets can be tricky, and single cell methods are not always effective. Here, we use RCTD, which directly accounts for cell type mixing at each spot or segmentation, to annotate cells. For more details on RCTD, please see the [paper](https://doi.org/10.1038/s41587-021-00830-w). + +First, we install the `spacexr` package from GitHub which implements RCTD. + +```{r, rctd.install, eval=FALSE} +devtools::install_github("dmcable/spacexr", build_vignettes = FALSE) +``` + +Counts, cluster, and spot information is extracted from the Seurat query and reference objects to construct `Reference` and `SpatialRNA` objects used by RCTD for annotation. + +```{r rctd.qeury, warning=FALSE} +library(spacexr) + +query.counts <- GetAssayData(xenium.obj, assay = "Xenium", slot = "counts")[, Cells(xenium.obj[["crop"]])] +coords <- GetTissueCoordinates(xenium.obj[["crop"]], which = "centroids") +rownames(coords) <- coords$cell +coords$cell <- NULL +query <- SpatialRNA(coords, query.counts, colSums(query.counts)) +``` + +```{r rctd.reference, eval=FALSE} +allen.cortex.ref <- readRDS("../data/allen_cortex.rds") +allen.cortex.ref <- UpdateSeuratObject(allen.cortex.ref) + +Idents(allen.cortex.ref) <- "subclass" +# remove CR cells because there aren't enough of them for annotation +allen.cortex.ref <- subset(allen.cortex.ref, subset = subclass != "CR") +counts <- GetAssayData(allen.cortex.ref, assay = "RNA", slot = "counts") +cluster <- as.factor(allen.cortex.ref$subclass) +names(cluster) <- colnames(allen.cortex.ref) +nUMI <- allen.cortex.ref$nCount_RNA +names(nUMI) <- colnames(allen.cortex.ref) +nUMI <- colSums(counts) +levels(cluster) <- gsub("/", "-", levels(cluster)) +reference <- Reference(counts, cluster, nUMI) +``` + +```{r niche.run.rctd, warning=FALSE, results=FALSE, eval=FALSE} +# run RCTD with many cores +RCTD <- create.RCTD(query, reference, max_cores = 8) +RCTD <- run.RCTD(RCTD, doublet_mode = "doublet") +``` + +Many spot annotations contain multiple cell type markers, so we filter only to singlets and add the annotations to the Seurat object for downstream analysis. + +```{r niche.add.annotations, eval=FALSE} +annotations.df <- RCTD@results$results_df +annotations.df <- annotations.df[annotations.df$spot_class == "singlet", ] +annotations <- annotations.df$first_type +names(annotations) <- rownames(annotations.df) +xenium.obj$predicted.celltype <- annotations +keep.cells <- Cells(xenium.obj)[!is.na(xenium.obj$predicted.celltype)] +xenium.obj <- subset(xenium.obj, cells = keep.cells) +``` + +We call the `BuildNicheAssay` function from within Seurat to construct a new assay called `niche` containing the cell type composition spatially neighboring each cell. A metadata column called `niches` is also returned, which contains cluster assignments based on the niche assay. + +```{r build.niche.assay, eval=FALSE} +xenium.obj <- BuildNicheAssay( + object = xenium.obj, + fov = "crop", + group.by = "predicted.celltype", + niches.k = 5, + neighbors.k = 30 +) +``` + +```{r load.niche.results, eval=TRUE, include=FALSE} +xenium.obj <- readRDS("../data/xenium_niches_presaved.rds") +``` + +After clustering the cell type composition nearby each cell, the neuronal layers in the cortex are visually demarcated. + +```{r, niche.dimplots, fig.width=8, fig.height=6, warning=FALSE} +celltype.plot <- ImageDimPlot( + xenium.obj, + group.by = "predicted.celltype", + size = 1.5, + cols = "polychrome", + dark.background = F) + + ggtitle("Cell type") +niche.plot <- ImageDimPlot( + xenium.obj, + group.by = "niches", + size = 1.5, + dark.background = F) + + ggtitle("Niches") + + scale_fill_manual( + values = c("#442288", "#6CA2EA", "#B5D33D", "#FED23F", "#EB7D5B")) +celltype.plot | niche.plot +``` + +Further, we observe that the composition of each niche is enriched for distinct cell types. + +```{r niche.composition} +table(xenium.obj$predicted.celltype, xenium.obj$niches) +``` + +Next, we perform DE between atrocytes from two of the niches. + +Note: I think this style of analysis is very risky - most of the DEGs comparing a cell type across niches are cell type markers of the other cell types enriched in one of the niches likely due to incorrect molecular assignment to cells. + +```{r niche.de} +xenium.obj$celltype.niches <- paste0(xenium.obj$predicted.celltype, "_", xenium.obj$niches) +Idents(xenium.obj) <- "celltype.niches" +niche.markers <- FindMarkers(xenium.obj, assay = "Xenium", ident.1 = "Astro_1", ident.2 = "Astro_5") +``` + +```{r niche.vln} +VlnPlot( + xenium.obj, + idents = c("Astro_1", "Astro_5"), + assay = "Xenium", + features = rownames(niche.markers)[1:6]) +``` + # Human Lung: Nanostring CosMx Spatial Molecular Imager This dataset was produced using Nanostring CosMx Spatial Molecular Imager (SMI). The CosMX SMI performs multiplexed single molecule profiling, can profile both RNA and protein targets, and can be applied directly to FFPE tissues. The dataset represents 8 FFPE samples taken from 5 non-small-cell lung cancer (NSCLC) tissues, and is available for [public download](https://www.nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/). The gene panel consists of 960 transcripts. @@ -291,12 +595,12 @@ In this vignette, we load one of 8 samples (lung 5, replicate 1). We use the `Lo For this dataset, instead of performing unsupervised analysis, we map the Nanostring profiles to our Azimuth Healthy Human Lung reference, which was defined by scRNA-seq. We used Azimuth version 0.4.3 with the [human lung](https://azimuth.hubmapconsortium.org/references/#Human%20-%20Lung%20v1) reference version 1.0.0. You can download the precomputed results [here](https://seurat.nygenome.org/vignette_data/spatial_vignette_2/nanostring_data.Rds), which include annotations, prediction scores, and a UMAP visualization. The median number of detected transcripts/cell is 249, which does create uncertainty for the annotation process. ```{r load} -nano.obj <- LoadNanostring(data.dir = "/brahms/hartmana/spatial_vignette_data/nanostring/lung5_rep1", fov="lung5.rep1") +nano.obj <- LoadNanostring(data.dir = "../data/nanostring/lung5_rep1", fov="lung5.rep1") ``` ```{r integration} # add in precomputed Azimuth annotations -azimuth.data <- readRDS("/brahms/hartmana/spatial_vignette_data/nanostring_data.Rds") +azimuth.data <- readRDS("../data/nanostring_data.Rds") nano.obj <- AddMetaData(nano.obj, metadata = azimuth.data$annotations) nano.obj[["proj.umap"]] <- azimuth.data$umap Idents(nano.obj) <- nano.obj$predicted.annotation.l1 @@ -332,7 +636,7 @@ ImageDimPlot(nano.obj, fov = "lung5.rep1", cells = WhichCells(nano.obj, idents=c We can also visualize gene expression markers a few different ways: ```{r, fig.width=10, fig.height=5} -VlnPlot(nano.obj, features = "KRT17", slot = "counts", pt.size = 0.1, y.max = 30) + NoLegend() +VlnPlot(nano.obj, features = "KRT17", layer = "counts", pt.size = 0.1, y.max = 30) + NoLegend() ``` ```{r, fig.width=5, fig.height=4} @@ -378,7 +682,7 @@ First, we load in the data of a HuBMAP dataset using the `LoadAkoya()` function ```{r} codex.obj <- LoadAkoya( - filename = "/brahms/hartmana/spatial_vignette_data/LN7910_20_008_11022020_reg001_compensated.csv", + filename = "../data/LN7910_20_008_11022020_reg001_compensated.csv", type = "processor", fov = "HBM754.WKLP.262" ) @@ -422,3 +726,9 @@ Each of these datasets represents an opportunity to learn organizing principles sessionInfo() ```
    + +```{r save.times, include=FALSE} +write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/spatial_vignette_2.csv") +``` + + diff --git a/vignettes/vignettes.yaml b/vignettes/vignettes.yaml index a45dfde70..81a60a7c6 100644 --- a/vignettes/vignettes.yaml +++ b/vignettes/vignettes.yaml @@ -87,6 +87,7 @@ summary: | Examples of how to perform normalization, feature selection, integration, and differential expression with an updated version of sctransform. image: assets/sctransform_v2.png + - category: Other vignettes: - title: Visualization diff --git a/vignettes/vignettes_v5.yaml b/vignettes/vignettes_v5.yaml new file mode 100644 index 000000000..7167da282 --- /dev/null +++ b/vignettes/vignettes_v5.yaml @@ -0,0 +1,67 @@ +- category: Spatial analysis + vignettes: + - title: Analysis of spatial datasets (Imaging-based) + name: seurat5_spatial_vignette_2 + summary: | + Learn to explore spatially-resolved data from multiplexed imaging technologies, including MERSCOPE, Xenium, CosMx SMI, and CODEX. + image: spatial_vignette_2.jpg + + - title: Analysis of spatial datasets (Sequencing-based) + name: spatial_vignette + summary: | + Learn to explore spatially-resolved transcriptomic data with examples from 10x Visium and Slide-seq v2. + image: spatial_vignette_ttr.jpg + +- category: Streamlined and multimodal integration + vignettes: + - title: scRNA-seq Integration + name: seurat5_integration + summary: | + Integrate scRNA-seq datasets using a variety of computational methods. + image: integration_seurat5.jpg + + - title: Cross-modality Bridge Integration + name: seurat5_integration_bridge + summary: | + Map scATAC-seq onto an scRNA-seq reference using a multi-omic bridge dataset. + image: bridge_integration.png + +- category: Flexible analysis of massively scalable datasets + vignettes: + - title: Unsupervised clustering of 1.3M neurons + name: seurat5_sketch_analysis + summary: | + Analyze a 1.3 million cell mouse brain dataset using on-disk capabilities powered by BPCells. + image: sketch_1p3.png + + - title: Integrating/comparing healthy and diabetic samples + name: ParseBio_sketch_integration + summary: | + Perform sketch integration on a large dataset from Parse Biosciences. + image: sketch.png + + - title: Supervised mapping of 1.5M immune cells + name: COVID_SCTMapping + summary: | + Map PBMC datasets from COVID-19 patients to a healthy PBMC reference. + image: COVID_SCTMapping.png + +- category: References and additional documentation + vignettes: + - title: Seurat v5 Command Cheat Sheet + name: seurat5_essential_commands + summary: | + Explore the new assay structure introduced in Seurat v5. + image: assay.png + + - title: BPCells Interaction + name: seurat5_bpcells_interaction_vignette + summary: | + Load and save large on-disk matrices using BPCells. + image: bpcells.png + + - title: Seurat v5 Installation + name: install + summary: | + Install Seurat v5 and the required dependencies. + image: SeuratV5.png diff --git a/vignettes/vignettes_v5_new.yaml b/vignettes/vignettes_v5_new.yaml new file mode 100644 index 000000000..057e08af4 --- /dev/null +++ b/vignettes/vignettes_v5_new.yaml @@ -0,0 +1,267 @@ +- category: Introduction + vignettes: + - title: Guided tutorial --- 2,700 PBMCs + name: pbmc3k_tutorial + summary: | + A basic overview of Seurat that includes an introduction to common analytical workflows. + image: pbmc3k_umap.jpg + + - title: Multimodal analysis + name: multimodal_vignette + summary: | + An introduction to working with multi-modal datasets in Seurat. + image: citeseq_plot.jpg + + - title: Visualization + name: visualization_vignette + summary: | + An overview of the visualization capabilities within Seurat. + image: visualization_vignette.jpg + + - title: SCTransform + name: sctransform_vignette + summary: | + Examples of how to perform normalization, feature selection, integration, and differential expression with an updated version of sctransform. + image: assets/sctransform_v2_new.png + + - title: Essential Commands Cheat Sheet + name: essential_commands + summary: | + Reference list of commonly used commands to store, access, explore, and analyze datasets. + image: commands.png + + +- category: scRNA-seq Data Integration + vignettes: + - title: Introduction to scRNA-seq integration + name: integration_introduction + summary: | + An introduction to integrating scRNA-seq datasets in order to identify and compare shared cell types across experiments. + image: pbmc_alignment.jpg + + - title: scRNA-seq Integration + name: seurat5_integration + summary: | + Integrate scRNA-seq datasets using a variety of computational methods. + image: integration_seurat5.jpg + + - title: Mapping and annotating query datasets + name: integration_mapping + summary: | + Learn how to map a query scRNA-seq dataset onto a reference in order to automate the annotation and visualization of query cells. + image: assets/anchorsb_2018.png + +- category: Multi-assay data + vignettes: + - title: Cross-modality Bridge Integration + name: seurat5_integration_bridge + summary: | + Map scATAC-seq onto an scRNA-seq reference using a multi-omic bridge dataset in Seurat v5. + image: bridge_integration.png + + - title: Weighted Nearest Neighbor Analysis + name: weighted_nearest_neighbor_analysis + summary: | + Analyze multimodal single-cell data with weighted nearest neighbor analysis in Seurat v4. + image: weighted_nearest_neighbor_analysis.jpg + + - title: Integrating scRNA-seq and scATAC-seq data + name: atacseq_integration_vignette + summary: | + Annotate, visualize, and interpret an scATAC-seq experiment using scRNA-seq data from the same biological system in Seurat v3. + image: atacseq_integration_vignette.jpg + + - title: Reference Mapping for Multimodal Data + name: multimodal_reference_mapping + summary: | + Analyze query data in the context of multimodal reference atlases. + image: multimodal_reference_mapping.jpg + + - title: Mixscape + name: mixscape_vignette + summary: | + Explore new methods to analyze pooled single-celled perturbation screens. + image: mixscape_vignette.jpg + +- category: Flexible analysis of massively scalable datasets + vignettes: + - title: Unsupervised clustering of 1.3M neurons + name: seurat5_sketch_analysis + summary: | + Analyze a 1.3 million cell mouse brain dataset using on-disk capabilities powered by BPCells. + image: sketch_1p3.png + + - title: Integrating/comparing healthy and diabetic samples + name: ParseBio_sketch_integration + summary: | + Perform sketch integration on a large dataset from Parse Biosciences. + image: sketch.png + + - title: Supervised mapping of 1.5M immune cells + name: COVID_SCTMapping + summary: | + Map PBMC datasets from COVID-19 patients to a healthy PBMC reference. + image: COVID_SCTMapping.png + + - title: BPCells Interaction + name: seurat5_bpcells_interaction_vignette + summary: | + Load and save large on-disk matrices using BPCells. + image: bpcells.png + +- category: Spatial analysis + vignettes: + - title: Analysis of spatial datasets (Imaging-based) + name: seurat5_spatial_vignette_2 + summary: | + Learn to explore spatially-resolved data from multiplexed imaging technologies, including MERSCOPE, Xenium, CosMx SMI, and CODEX. + image: spatial_vignette_2.jpg + + - title: Analysis of spatial datasets (Sequencing-based) + name: spatial_vignette + summary: | + Learn to explore spatially-resolved transcriptomic data with examples from 10x Visium and Slide-seq v2. + image: spatial_vignette_ttr.jpg + + - title: Analysis of Visium HD spatial datasets + name: visiumhd_analysis_vignette + summary: | + Learn to explore spatially-resolved transcriptomic data in high-definition from 10x Visium HD. + image: spatial_vignette_hd.jpg + +- category: Other + vignettes: + - title: Cell Cycle Regression + name: cell_cycle_vignette + summary: | + Mitigate the effects of cell cycle heterogeneity by computing cell cycle phase scores based on marker genes. + image: cell_cycle_vignette.jpg + + - title: Differential Expression Testing + name: de_vignette + summary: | + Perform differential expression (DE) testing in Seurat using a number of frameworks. + image: assets/de_vignette.png + + - title: Demultiplex Cell Hashing data + name: hashing_vignette + summary: | + Learn how to work with data produced with Cell Hashing. + image: assets/hashing_vignette.png + +- category: Seurat Wrappers + hash: seurat-wrappers + description: | + In order to facilitate the use of community tools with Seurat, we provide the Seurat Wrappers package, which contains code to run other analysis tools on Seurat objects. For the initial release, we provide wrappers for a few packages in the table below but would encourage other package developers interested in interfacing with Seurat to check out our contributor guide [here](https://github.com/satijalab/seurat.wrappers/wiki/Submission-Process).

    + vignettes: + - name: alevin + title: Import alevin counts into Seurat + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/alevin.html + reference: https://doi.org/10.1186/s13059-019-1670-y + citation: Srivastava et. al., Genome Biology 2019 + source: https://github.com/k3yavi/alevin-Rtools + + - name: ALRA + title: Zero-preserving imputation with ALRA + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/alra.html + reference: https://doi.org/10.1101/397588 + citation: Linderman et al, bioRxiv 2018 + source: https://github.com/KlugerLab/ALRA + + - name: CoGAPS + title: Running CoGAPS on Seurat Objects + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cogaps.html + reference: https://doi.org/10.1016/j.cels.2019.04.004 + citation: Stein-O’Brien et al, Cell Systems 2019 + source: https://www.bioconductor.org/packages/release/bioc/html/CoGAPS.html + + - name: Conos + title: Integration of datasets using Conos + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/conos.html + reference: https://doi.org/10.1038/s41592-019-0466-z + citation: Barkas et al, Nature Methods 2019 + source: https://github.com/hms-dbmi/conos + + - name: fastMNN + title: Running fastMNN on Seurat Objects + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/fast_mnn.html + reference: https://doi.org/10.1038/nbt.4091 + citation: Haghverdi et al, Nature Biotechnology 2018 + source: https://bioconductor.org/packages/release/bioc/html/scran.html + + - name: glmpca + title: Running GLM-PCA on a Seurat Object + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/glmpca.html + reference: https://doi.org/10.1186/s13059-019-1861-6 + citation: Townes et al, Genome Biology 2019 + source: https://github.com/willtownes/glmpca + + - name: Harmony + title: Integration of datasets using Harmony + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/harmony.html + reference: https://doi.org/10.1038/s41592-019-0619-0 + citation: Korsunsky et al, Nature Methods 2019 + source: https://github.com/immunogenomics/harmony + + - name: LIGER + title: Integrating Seurat objects using LIGER + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/liger.html + reference: https://doi.org/10.1016/j.cell.2019.05.006 + citation: Welch et al, Cell 2019 + source: https://github.com/MacoskoLab/liger + + - name: Monocle3 + title: Calculating Trajectories with Monocle 3 and Seurat + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/monocle3.html + reference: https://doi.org/10.1038/s41586-019-0969-x + citation: Cao et al, Nature 2019 + source: https://cole-trapnell-lab.github.io/monocle3 + + - name: Nebulosa + title: Visualization of gene expression with Nebulosa + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/nebulosa.html + reference: https://github.com/powellgenomicslab/Nebulosa + citation: Jose Alquicira-Hernandez and Joseph E. Powell, Under Review + source: https://github.com/powellgenomicslab/Nebulosa + + - name: schex + title: Using schex with Seurat + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/schex.html + reference: https://doi.org/0.1242/dev.173807 + citation: Freytag, R package 2019 + source: https://github.com/SaskiaFreytag/schex + + - name: scVelo + title: Estimating RNA Velocity using Seurat and scVelo + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html + reference: https://doi.org/10.1101/820936 + citation: Bergen et al, bioRxiv 2019 + source: https://scvelo.readthedocs.io/ + + - name: Velocity + title: Estimating RNA Velocity using Seurat + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/velocity.html + reference: 10.1038/s41586-018-0414-6 + citation: La Manno et al, Nature 2018 + source: https://velocyto.org + + - name: CIPR + title: Using CIPR with human PBMC data + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cipr.html + reference: https://doi.org/10.1186/s12859-020-3538-2 + citation: Ekiz et. al., BMC Bioinformatics 2020 + source: https://github.com/atakanekiz/CIPR-Package + + - name: miQC + title: Running miQC on Seurat objects + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/miQC.html + reference: https://www.biorxiv.org/content/10.1101/2021.03.03.433798v1 + citation: Hippen et. al., bioRxiv 2021 + source: https://github.com/greenelab/miQC + + - name: tricycle + title: Running estimate_cycle_position from tricycle on Seurat Objects + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/tricycle.html + reference: https://doi.org/10.1101/2021.04.06.438463 + citation: Zheng et. al., bioRxiv 2021 + source: https://www.bioconductor.org/packages/release/bioc/html/tricycle.html \ No newline at end of file diff --git a/vignettes/visiumhd_analysis_vignette.Rmd b/vignettes/visiumhd_analysis_vignette.Rmd new file mode 100644 index 000000000..a2c88651c --- /dev/null +++ b/vignettes/visiumhd_analysis_vignette.Rmd @@ -0,0 +1,563 @@ +--- +title: "Analysis, visualization, and integration of Visium HD spatial datasets with Seurat" +output: + html_document: + theme: united +df_print: kable +pdf_document: default +date: 'Compiled: `r Sys.Date()`' +--- + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = 'styler', + message = FALSE, + warning = FALSE, + fig.width = 10, + time_it = TRUE, + error = TRUE +) +``` + +# Visium HD support in Seurat + +We have [previously released support](https://satijalab.org/seurat/articles/spatial_vignette) Seurat for sequencing-based spatial transcriptomic (ST) technologies, including 10x visium and SLIDE-seq. We have now updated Seurat to be compatible with the Visium HD technology, which performs profiling at substantially higher spatial resolution than previous versions. + +Users can install the Visium HD-compatible release from Github. Existing Seurat workflows for [clustering, visualization, and downstream analysis](https://satijalab.org/seurat/articles/pbmc3k_tutorial) have been updated to support both Visium and Visium HD data. + +We note that Visium HD data is generated from spatially patterned olignocleotides labeled in 2um x 2um bins. However, since the data from this resolution is sparse, adjacent bins are pooled together to create 8um and 16um resolutions. 10x recommends the use of 8um binned data for analysis, but Seurat supports in the simultaneous loading of multiple binnings - and stores them in a single object as multiple assays. + +In this vignette, we provide an overview of some of the spatial workflows that Seurat supports for analyzing Visium HD data, in particular: + +* Unsupervised clustering +* Identification of spatial tissue domains +* Subsetting spatial regions +* Integration with scRNA-seq data +* Comparing the spatial localization of different cell types + +Please note that Visium HD is a new data type, and we expect to update this vignette as we test additional methods for spatial data analysis. We strongly encourage users to explore how different parameter settings affect their results, to analyze data iteratively (and in collaboration with biological experts), and to orthogonally validate unexpected or surprising biological findings. + +We focus our analysis on a Visium HD dataset from the mouse brain, available to download [here](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-mouse-brain-he) but also run clustering workflow on a dataset from the mouse intestine. + +## Install Seurat Update +```{r, eval=FALSE} +# packages required for Visium HD +install.packages("hdf5r") +install.packages("arrow") +``` + +```{r install, eval = FALSE} +library(Seurat) +library(ggplot2) +library(patchwork) +library(dplyr) +``` + +## Load Visium HD data + +* Visium HD mouse brain dataset is available for download [here](https://support.10xgenomics.com/spatial-gene-expression/datasets) +* The Seurat can store multiple binnings/resolutions in different assays +* `bin.size` parameter specifies resolutions to load (8 and 16um are loaded by default) +* Users can switch between resolutions by [changing the assay](https://satijalab.org/seurat/articles/multimodal_vignette) + +```{r data} +localdir <- "/brahms/lis/visium_hd/mouse/new_mousebrain/" +object <- Load10X_Spatial(data.dir = localdir, bin.size = c(8, 16)) + +# Setting default assay changes between 8um and 16um binning +Assays(object) +DefaultAssay(object) <- "Spatial.008um" +``` +```{r qc, fig.height=5} +vln.plot <- VlnPlot(object, features = 'nCount_Spatial.008um', pt.size = 0) + theme(axis.text = element_text(size = 4)) + NoLegend() +count.plot <- SpatialFeaturePlot(object, features = 'nCount_Spatial.008um') + theme(legend.position = "right") + +# note that many spots have very few counts, in-part +# due to low cellular density in certain tissue regions +vln.plot | count.plot +``` + +## Normalize datasets + +In this vignette we use standard log-normalization for spatial data. We note that the best normalization methods for spatial data are still being developed and evaluated, and encourage users to read manuscripts from the [Phipson/Davis](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03241-7) and [Fan](https://www.biorxiv.org/content/10.1101/2023.08.30.555624v2) labs to learn more about potential caveats for spatial normalization. + +```{r normalize} +# normalize both 8um and 16um bins +DefaultAssay(object) <- "Spatial.008um" +object <- NormalizeData(object) + +DefaultAssay(object) <- "Spatial.016um" +object <- NormalizeData(object) +``` + +## Visualize gene expression + +* Adjusting `pt.size.factor` (set to 1.2 by default) helps to visualize molecular and histological info in this HD dataset +* You can also adjust the `shape`, and `stroke` (outline) parameters for visualization + +```{r feature.plots} +# switch spatial resolution to 16um from 8um +DefaultAssay(object) <- "Spatial.016um" +p1 <- SpatialFeaturePlot(object, features = "Rorb") + ggtitle("Rorb expression (16um)") + +# switch back to 8um +DefaultAssay(object) <- "Spatial.008um" +p2 <- SpatialFeaturePlot(object, features = "Hpca") + ggtitle("Hpca expression (8um)") + +p1 | p2 +``` + +## Unsupervised clustering + +While the standard scRNA-seq clustering workflow can also be applied to spatial datasets - we have observed that when working with Visium HD datasets, the [Seurat v5 sketch clustering workflow](https://satijalab.org/seurat/articles/seurat5_sketch_analysis) exhibits improved performance, especially for identifying rare and spatially restricted groups. + +As described in [Hao et al, Nature Biotechnology 2023](https://www.nature.com/articles/s41587-023-01767-y) and [Hie et al](https://www.sciencedirect.com/science/article/pii/S2405471219301528), sketch-based analyses aim to 'subsample' large datasets in a way that preserves rare populations. Here, we sketch the Visium HD dataset, perform clustering on the subsampled cells, and then project the cluster labels back to the full dataset. + +Details of the sketching procedure and workflow are described in [Hao et al, Nature Biotechnology 2023](https://www.nature.com/articles/s41587-023-01767-y) and the [Seurat v5 sketch clustering vignette](https://satijalab.org/seurat/articles/seurat5_sketch_analysis). Since the full Visium HD dataset fits in memory, we do not use any of the on-disk capabilities of Seurat v5 in this vignette. + +```{r sketching} +# note that data is already normalized +DefaultAssay(object) <- "Spatial.008um" +object <- FindVariableFeatures(object) +object <- ScaleData(object) +# we select 50,0000 cells and create a new 'sketch' assay +object <- SketchData( + object = object, + ncells = 50000, + method = "LeverageScore", + sketched.assay = "sketch" +) +``` + +```{r sketch.preprocess} +# switch analysis to sketched cells +DefaultAssay(object) <- "sketch" + +# perform clustering workflow +object <- FindVariableFeatures(object) +object <- ScaleData(object) +object <- RunPCA(object, assay="sketch", reduction.name = "pca.sketch") +object <- FindNeighbors(object, assay="sketch", reduction = "pca.sketch", dims = 1:50) +object <- FindClusters(object, cluster.name="seurat_cluster.sketched", resolution = 3) +object <- RunUMAP(object, reduction = "pca.sketch", reduction.name = "umap.sketch", return.model = T, dims = 1:50) +``` + +Now we can project the cluster labels, and dimensional reductions (PCA and UMAP) that we learned from the 50,000 sketched cells - to the entire dataset, using the `ProjectData` function. + +In the resulting object, for all cells: + +* cluster labels will be stored in `object$seurat_cluster.projected` +* Projected PCA embeddings will be stored in `object[["pca.008um"]]` +* Projected UMAP embeddings will be stored in `object[["umap.sketch"]]` + +```{r project} +object <- ProjectData( + object = object, + assay = "Spatial.008um", + full.reduction = "full.pca.sketch", + sketched.assay = "sketch", + sketched.reduction = "pca.sketch", + umap.model = "umap.sketch", + dims = 1:50, + refdata = list(seurat_cluster.projected = "seurat_cluster.sketched") +) +``` + +We can visualize the clustering results for the sketched cells, as well as the projected clustering results for the full dataset: + +```{r project.plots, fig.height = 8} +DefaultAssay(object) <- "sketch" +Idents(object) <- "seurat_cluster.sketched" +p1 <- DimPlot(object, reduction = "umap.sketch", label=F) + ggtitle("Sketched clustering (50,000 cells)") + theme(legend.position = "bottom") + +# switch to full dataset +DefaultAssay(object) <- "Spatial.008um" +Idents(object) <- "seurat_cluster.projected" +p2 <- DimPlot(object, reduction = "full.umap.sketch", label=F) + ggtitle("Projected clustering (full dataset)") + theme(legend.position = "bottom") + +p1 | p2 +``` + +Of course, we can now also visualize the unsupervised clusters based on their spatial location. Note that running `SpatialDimPlot(object, interactive = TRUE)`, also enables interactive visualization and exploration. + +```{r dim.plot} +SpatialDimPlot(object, label=T, repel=T, label.size = 4) +``` + +When there are many different clusters (some of which are spatially restricted and others are mixed), plotting the spatial location of all clusters can be challenging to interpret. We find it helpful to plot the spatial location of different clusters individually. For example, we highlight the spatial localization of a few clusters below, which happen to correspond to different cortical layers: + +```{r cluster.plot, fig.height = 7} +Idents(object) <- "seurat_cluster.projected" +cells <- CellsByIdentities(object, idents=c(0,4,32,34,35)) +p <- SpatialDimPlot(object, cells.highlight = cells[setdiff(names(cells), "NA")], + cols.highlight = c("#FFFF00","grey50"), facet.highlight = T, combine=T) + NoLegend() +p + +Idents(object) <- "seurat_cluster.projected" +cells <- CellsByIdentities(object, idents=c(0,4,32,34,35)) +p <- SpatialDimPlot(object, cells.highlight = cells[setdiff(names(cells), "NA")], + cols.highlight = c("#FFFF00","grey50"), facet.highlight = T, combine=T) + NoLegend() +p +``` + +We can also find and visualize the top gene expression markers for each cluster: + +```{r heatmap, fig.height = 12} +# Crete downsampled object to make visualization either +DefaultAssay(object) <- "Spatial.008um" +Idents(object) <- "seurat_cluster.projected" +object_subset <- subset(object, cells = Cells(object[['Spatial.008um']]), downsample=1000) + +# Order clusters by similarity +DefaultAssay(object_subset) <- "Spatial.008um" +Idents(object_subset) <- "seurat_cluster.projected" +object_subset <- BuildClusterTree(object_subset, assay = "Spatial.008um", reduction = "full.pca.sketch", reorder = T) + +markers <- FindAllMarkers(object_subset, assay = 'Spatial.008um', only.pos = TRUE) +markers %>% + group_by(cluster) %>% + dplyr::filter(avg_log2FC > 1) %>% + slice_head(n = 5) %>% + ungroup() -> top5 + +object_subset <- ScaleData(object_subset, assay = "Spatial.008um", features = top5$gene) +p <- DoHeatmap(object_subset, assay = "Spatial.008um", features = top5$gene, size = 2.5) + theme(axis.text = element_text(size = 5.5)) + NoLegend() +p +``` + +## Identifying spatially-defined tissue domains + +While the previous analyses consider each bin independently, spatial data enables cells to be defined not just by their neighborhood, but also by their broader spatial context. + +In [Singhal et al.](https://www.nature.com/articles/s41588-024-01664-3), the authors introduce BANKSY, Building Aggregates with a Neighborhood Kernel and Spatial Yardstick (BANKSY). BANKSY performs multiple tasks, but we find it particularly valuable for identifying and segmenting tissue domains. When performing clustering, BANKSY augments a spot's expression pattern with both the mean and the gradient of gene expression levels in a spot's broader neighborhood. + +We thank the authors for enabling BANKSY to be compatible with Seurat via the [`SeuratWrappers`](https://github.com/satijalab/seurat-wrappers) framework, which requires separate installation of the BANKSY package: + +```{r banksy} +if (!requireNamespace("Banksy", quietly = TRUE)) { + remotes::install_github("prabhakarlab/Banksy@devel") +} +library(SeuratWrappers) +library(Banksy) +``` + +Before running BANKSY, there are two important model parameters that users should consider: + +* `k_geom` : Local neighborhood size. Larger values will yield larger domains +* `lambda` : Influence of the neighborhood. Larger values yield more spatially coherent domains + +The `RunBanksy` function creates a new `BANKSY` assay, which can be used for dimensional reduction and clustering: + +```{r banksy.run} +object <- RunBanksy(object, lambda = 0.8, verbose=TRUE, + assay = 'Spatial.008um', slot = 'data', features = 'variable', + k_geom = 50) +``` +```{r banksy.preprocess} +DefaultAssay(object) <- "BANKSY" +object <- RunPCA(object, assay = 'BANKSY', reduction.name = "pca.banksy", features = rownames(object), npcs = 30) +object <- FindNeighbors(object, reduction = "pca.banksy", dims = 1:30) +object <- FindClusters(object, cluster.name = "banksy_cluster", resolution = 0.5) +``` + +```{r banksy.plot} +Idents(object) <- "banksy_cluster" +p <- SpatialDimPlot(object, group.by="banksy_cluster",label=T, repel=T, label.size = 4) +p +``` + +As with unsupervised clustering, we can highlight the spatial location of each tissue domain individually: + +```{r banksy.cluster.plot, fig.height = 10} +banksy_cells <- CellsByIdentities(object) +p <- SpatialDimPlot(object, cells.highlight = banksy_cells[setdiff(names(banksy_cells), "NA")], cols.highlight = c("#FFFF00","grey50"),facet.highlight = T, combine=T) + NoLegend() +p +``` + +## Subset out anatomical regions + +Users may wish to segment or subset out a restricted region for further downstream analysis. For example, here we create a coordinate-defined segmentation mask marking cortical and hippocampal regions from the entire dataset using the `CreateSegmentation` function, and then identify cells that fall into this region with the `Overlay` function. + +The list of coordinates is available for download [here](https://www.dropbox.com/scl/fi/qbs3j1alq33f0qz892ub3/cortex-hippocampus_coordinates.csv?rlkey=lsxglb15jhjdrircy9lb6n0rd&dl=0), and users can identify these boundaries when exploring their own datasets using the `interactive=TRUE` argument to `SpatialDimPlot`. + +```{r subset1} +cortex.coordinates <- as.data.frame(read.csv('/brahms/lis/visium_hd/final_mouse/cortex-hippocampus_coordinates.csv')) +cortex <- CreateSegmentation(cortex.coordinates) + +object[["cortex"]] <- Overlay(object[["slice1.008um"]], cortex) +cortex <- subset(object, cells=Cells(object[['cortex']])) +``` + +## Integration with scRNA-seq data (deconvolution) + +Seurat v5 also includes support for [Robust Cell Type Decomposition](https://www.nature.com/articles/s41587-021-00830-w), a computational approach to deconvolve spot-level data from spatial datasets, when provided with an scRNA-seq reference. RCTD has been shown to accurately annotate spatial data from a variety of technologies, including SLIDE-seq, Visium, and the 10x Xenium in-situ spatial platform. We observe good performance with Visium HD as well. + +To run RCTD, we first install the `spacexr` package from GitHub which implements RCTD. When running RCTD, we follow the instructions from the [RCTD vignette](https://raw.githack.com/dmcable/spacexr/master/vignettes/spatial-transcriptomics.html). + +```{r load.spacexr} +if (!requireNamespace("spacexr", quietly = TRUE)) { + devtools::install_github("dmcable/spacexr", build_vignettes = FALSE) +} +library(spacexr) +``` + +RCTD takes an scRNA-seq dataset as a reference, and a spatial dataset as a query. For a reference, we use a mouse scRNA-seq dataset from the Allen Brain Atlas, available for download [here](https://www.dropbox.com/scl/fi/r1mixf4eof2cot891n215/allen_scRNAseq_ref.Rds?rlkey=ynr6s6wu1efqsjsu3h40vitt7&dl=0). The reference scRNAs-eq dataset has been reduced to 200,000 cells (and rare cell types <25 cells have been removed). + +We use the cortex Visium HD object as the spatial query. For computational efficiency, we sketch the spatial query dataset, apply RCTD to deconvolute the 'sketched' cortical cells and annotate them, and then project these annotations to the full cortical dataset. + +```{r sketch.cortex} +#sketch the cortical subset of the Visium HD dataset +DefaultAssay(cortex) <- "Spatial.008um" +cortex <- FindVariableFeatures(cortex) +cortex <- SketchData( + object = cortex, + ncells = 50000, + method = "LeverageScore", + sketched.assay = "sketch" +) + +DefaultAssay(cortex) <- "sketch" +cortex <- ScaleData(cortex) +cortex <- RunPCA(cortex, assay="sketch", reduction.name = "pca.cortex.sketch", verbose = T) +cortex <- FindNeighbors(cortex, reduction = "pca.cortex.sketch", dims = 1:50) +cortex <- RunUMAP(cortex, reduction = "pca.cortex.sketch", reduction.name = "umap.cortex.sketch", return.model = T, dims = 1:50, verbose = T) +``` + +```{r sc.data} +# load in the reference scRNA-seq dataset +ref <- readRDS("/brahms/satijar/allen_scRNAseq_ref.Rds") +``` + +```{r rctd.setup, warning=FALSE} +Idents(ref) <- "subclass_label" +counts <- ref[["RNA"]]$counts +cluster <- as.factor(ref$subclass_label) +nUMI <- ref$nCount_RNA +levels(cluster) <- gsub("/", "-", levels(cluster)) +cluster <- droplevels(cluster) + +# create the RCTD reference object +reference <- Reference(counts, cluster, nUMI) + +counts_hd <- cortex[["sketch"]]$counts +cortex_cells_hd <- colnames(cortex[["sketch"]]) +coords <- GetTissueCoordinates(cortex)[cortex_cells_hd,1:2] + +# create the RCTD query object +query <- SpatialRNA(coords, counts_hd, colSums(counts_hd)) +``` + +```{r run.rctd, warning=FALSE, results=FALSE} +# run RCTD +RCTD <- create.RCTD(query, reference, max_cores = 28) +RCTD <- run.RCTD(RCTD, doublet_mode = "doublet") +# add results back to Seurat object +cortex <- AddMetaData(cortex, metadata = RCTD@results$results_df) +``` + +```{r project.rctd} +# project RCTD labels from sketched cortical cells to all cortical cells +cortex$first_type <- as.character(cortex$first_type) +cortex$first_type[is.na(cortex$first_type)] <- 'Unknown' +cortex <- ProjectData( + object = cortex, + assay = "Spatial.008um", + full.reduction = "pca.cortex", + sketched.assay = "sketch", + sketched.reduction = "pca.cortex.sketch", + umap.model = "umap.cortex.sketch", + dims = 1:50, + refdata = list(full_first_type = "first_type") +) +``` + +```{r rctd_results, fig.height=5} +DefaultAssay(object) <- "Spatial.008um" + +# we only ran RCTD on the cortical cells +# set labels to all other cells as "Unknown" +object[[]][, "full_first_type"] <- "Unknown" +object$full_first_type[Cells(cortex)] <- cortex$full_first_type[Cells(cortex)] +Idents(object) <- 'full_first_type' + +# now we can spatially map the location of any scRNA-seq cell type +# start with Layered (starts with L), excitatory neurons in the cortex +cells <- CellsByIdentities(object) +excitatory_names <- sort(grep("^L.* CTX",names(cells),value = TRUE)) +p <- SpatialDimPlot(object, cells.highlight = cells[excitatory_names], cols.highlight = c("#FFFF00","grey50"), facet.highlight = T, combine=T, ncol=4) +p +``` + +We can now look for associations between the scRNA-seq labels of individual bins, and their tissue domain identity (as assigned by BANKSY). By asking which domains the excitatory neuron cells fall in, we can rename the BANKSY clusters as neuronal layers: + +```{r celltype.plot, fig.height = 12} +plot_cell_types <- function(data, label) { + p <- ggplot(data, aes(x = get(label), y = n, fill = full_first_type)) + + geom_bar(stat = "identity", position = "stack") + + geom_text(aes(label = ifelse(n >= min_count_to_show_label, full_first_type, "")), position = position_stack(vjust = 0.5), size = 2) + + xlab(label) + + ylab("# of Spots") + + ggtitle(paste0("Distribution of Cell Types across ", label)) + + theme_minimal() +} + +cell_type_banksy_counts <- object[[]] %>% + dplyr::filter(full_first_type %in% excitatory_names) %>% + dplyr::count(full_first_type, banksy_cluster) + +min_count_to_show_label <- 20 + +p <- plot_cell_types(cell_type_banksy_counts, "banksy_cluster") +p +``` + +Based on this plot, we can now assign cells (even if they are not excitatory neurons) to individual neuronal layers. + +```{r layer.label} +Idents(object) <- 'banksy_cluster' +object$layer_id <- 'Unknown' +object$layer_id[WhichCells(object,idents = c(7))] <- "Layer 2/3" +object$layer_id[WhichCells(object,idents = c(15))] <- "Layer 4" +object$layer_id[WhichCells(object,idents = c(5))] <- "Layer 5" +object$layer_id[WhichCells(object,idents = c(1))] <- "Layer 6" +``` + +Finally, we can visualize the spatial distribution of other cell types, and ask which cortical layers they fall in. For example, in contrast to excitatory neurons, inhibitory (GABAergic) interneurons in the cortex are not spatially restricted to individual layers - but they do show biases. + +In our [previous analysis of STARmap data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6687398/), and [consistent with previous work](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6456269/), we found that SST and PV interneuron classes tend to be restricted to layers 4-6, while VIP and Lamp5 interneurons tend to be located in layers 2/3. These results were based on an in-situ imaging technology which captures single-cell profiles - and here we ask whether we can find the same result in Visium HD spot-based data. + +```{r layer.dim.plot, fig.height=3} +# set ID to RCTD label +Idents(object) <- 'full_first_type' + +# Visualize distribution of 4 interneuron subtypes +inhibitory_names <- c("Sst","Pvalb","Vip","Lamp5") +cell_ids <- CellsByIdentities(object, idents = inhibitory_names) +p <- SpatialDimPlot(object, cells.highlight = cell_ids, cols.highlight = c("#FFFF00","grey50"), facet.highlight = T, combine=T, ncol=4) +p +``` +```{r prop.plot} +# create barplot to show proportions of cell types of interest +layer_table <- table(object$full_first_type, object$layer_id)[inhibitory_names,1:4] + +neuron_props <- reshape2::melt(prop.table(layer_table), margin = 1) +ggplot(neuron_props, aes(x = Var1, y = value, fill = Var2)) + + geom_bar(stat = "identity", position = "fill") + + labs(x = "Cell type", y = "Proportion", fill = "Layer") + + theme_classic() +``` +```{r save.brain, include=F, eval=T} +saveRDS(object, file='/brahms/lis/visium_hd/final_mouse/vig/0506final.rds') +``` + +We recapitulate the same findings, previously identified in in-situ imaging data, in the Visium HD dataset. This highlights that the 8um binning of Visium HD, even though it does not represent true single cell resolution, is capable of accurately localizing scRNA-seq-defined cell types, although we strongly encourage users to orthogonally validate unexpected or surprising biological findings. + +## Unsupervised clustering: mouse intestine + +We briefly demonstrate our sketch-clustering workflow on a second Visium HD dataset, from the Mouse Small Intestine (FFPE), available for download [here](https://www.10xgenomics.com/datasets/visium-hd-cytassist-gene-expression-libraries-of-mouse-intestine). We identify clusters, visualize their spatial locations, and report their top gene expression markers: +```{r intestine} +localdir <- "/brahms/lis/visium_hd/Visium_HD_Public_Data/HD_public_data/Visium_HD_Mouse_Small_Intestine/outs" +object <- Load10X_Spatial(data.dir = localdir, bin.size = 8) +``` +```{r intestine.process} +DefaultAssay(object) <- "Spatial.008um" +object <- NormalizeData(object) +object <- FindVariableFeatures(object) +object <- ScaleData(object) +``` + +```{r intestine.sketching} +object <- SketchData( + object = object, + ncells = 50000, + method = "LeverageScore", + sketched.assay = "sketch" +) +``` + +```{r intestine.sketch.preprocess} +DefaultAssay(object) <- "sketch" +object <- FindVariableFeatures(object) +object <- ScaleData(object) +object <- RunPCA(object, assay="sketch", reduction.name = "pca.sketch") +object <- FindNeighbors(object, assay="sketch", reduction = "pca.sketch", dims = 1:50) +object <- FindClusters(object, cluster.name="seurat_cluster.sketched", resolution = 3) +object <- RunUMAP(object, reduction = "pca.sketch", reduction.name = "umap.sketch", return.model = T, dims = 1:50) +``` + +```{r intestine.project} +object <- ProjectData( + object = object, + assay = "Spatial.008um", + full.reduction = "full.pca.sketch", + sketched.assay = "sketch", + sketched.reduction = "pca.sketch", + umap.model = "umap.sketch", + dims = 1:50, + refdata = list(seurat_cluster.projected = "seurat_cluster.sketched") +) +``` +```{r intestine.project.plots, fig.height = 8} +Idents(object) <- "seurat_cluster.projected" +DefaultAssay(object) <- "Spatial.008um" + +p1 <- DimPlot(object, reduction = "umap.sketch", label=F) + theme(legend.position = "bottom") +p2 <- SpatialDimPlot(object, label=F) + theme(legend.position = "bottom") +p1 | p2 +``` + +We visualize the location of each cluster individually: + +```{r cluster.plot.intestine, fig.height = 8} +Idents(object) <- "seurat_cluster.projected" +cells <- CellsByIdentities(object, idents=c(1,5,18,26)) +p <- SpatialDimPlot(object, cells.highlight = cells[setdiff(names(cells), "NA")], cols.highlight = c("#FFFF00","grey50"), facet.highlight = T, combine=T) + NoLegend() +p +``` + +```{r heatmap.compare, fig.height = 8} +DefaultAssay(object) <- "Spatial.008um" +Idents(object) <- "seurat_cluster.projected" +object_subset <- subset(object, cells = Cells(object[['Spatial.008um']]), downsample=1000) + +DefaultAssay(object_subset) <- "Spatial.008um" +Idents(object_subset) <- "seurat_cluster.projected" +object_subset <- BuildClusterTree(object_subset, assay = "Spatial.008um", reduction = "full.pca.sketch", reorder = T) + +markers <- FindAllMarkers(object_subset, assay = "Spatial.008um", only.pos = TRUE) +markers %>% + group_by(cluster) %>% + dplyr::filter(avg_log2FC > 1) %>% + slice_head(n = 5) %>% + ungroup() -> top5 + +object_subset <- ScaleData(object_subset, assay = "Spatial.008um", features=top5$gene) +p <- DoHeatmap(object_subset, assay = "Spatial.008um", features = top5$gene, size = 2.5) + theme(axis.text = element_text(size = 5.5)) + NoLegend() +p +``` +```{r save.intestine, include=F, eval=T} +saveRDS(object, file='/brahms/lis/visium_hd/intestine/vig/0506final.rds') +``` + +```{r save.times, include = FALSE, eval = T} +write.csv(x = t(as.data.frame(all_times)), file = "/home/lis/seurat-private/output/timings/visiumhd_analysis_vignette_final_times.csv") +``` + +
    + **Session Info** +```{r} +sessionInfo() +``` +
    diff --git a/vignettes/visualization_vignette.Rmd b/vignettes/visualization_vignette.Rmd index 08f5118f5..cf1808f27 100644 --- a/vignettes/visualization_vignette.Rmd +++ b/vignettes/visualization_vignette.Rmd @@ -29,15 +29,16 @@ knitr::opts_chunk$set( tidy.opts = list(width.cutoff = 95), message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) -options(SeuratData.repo.use = 'satijalab04.nygenome.org') +options(SeuratData.repo.use = 'seurat.nygenome.org') ``` We'll demonstrate visualization techniques in Seurat using our previously computed Seurat object from the 2,700 PBMC tutorial. You can download this dataset from [SeuratData](https://github.com/satijalab/seurat-data) ```{r data, eval = FALSE} -SeuratData::InstallData('pbmc3k') +SeuratData::InstallData("pbmc3k") ``` ```{r seed, include=FALSE} @@ -49,8 +50,8 @@ library(Seurat) library(SeuratData) library(ggplot2) library(patchwork) -data("pbmc3k.final") -pbmc3k.final$groups <- sample(c('group1', 'group2'), size = ncol(pbmc3k.final), replace = TRUE) +pbmc3k.final <- LoadData("pbmc3k", type = "pbmc3k.final") +pbmc3k.final$groups <- sample(c("group1", "group2"), size = ncol(pbmc3k.final), replace = TRUE) features <- c("LYZ", "CCL5", "IL32", "PTPRCAP", "FCGR3A", "PF4") pbmc3k.final ``` @@ -152,7 +153,7 @@ ggsave(filename = "../output/images/visualization_vignette.jpg", height = 7, wid Seurat utilizes R's plotly graphing library to create interactive plots. This interactive plotting feature works with any ggplot2-based scatter plots (requires a `geom_point` layer). To use, simply make a ggplot2-based scatter plot (such as `DimPlot()` or `FeaturePlot()`) and pass the resulting plot to `HoverLocator()` ```{r hover} -# Include additional data to display alongside cell names by passing in a data frame of information +# Include additional data to display alongside cell names by passing in a data frame of information. # Works well when using FetchData plot <- FeaturePlot(pbmc3k.final, features = 'MS4A1') HoverLocator(plot = plot, information = FetchData(pbmc3k.final, vars = c('ident', 'PC_1', 'nFeature_RNA'))) @@ -160,7 +161,7 @@ HoverLocator(plot = plot, information = FetchData(pbmc3k.final, vars = c('ident' Another interactive feature provided by Seurat is being able to manually select cells for further investigation. We have found this particularly useful for small clusters that do not always separate using unbiased clustering, but which look tantalizingly distinct. You can now select these cells by creating a ggplot2-based scatter plot (such as with `DimPlot()` or `FeaturePlot()`, and passing the returned plot to `CellSelector()`. `CellSelector()` will return a vector with the names of the points selected, so that you can then set them to a new identity class and perform differential expression. -For example, lets pretend that DCs had merged with monocytes in the clustering, but we wanted to see what was unique about them based on their position in the tSNE plot. +For example, let's pretend that DCs had merged with monocytes in the clustering, but we wanted to see what was unique about them based on their position in the tSNE plot. ```{r identify, eval=FALSE} pbmc3k.final <- RenameIdents(pbmc3k.final, "DC" = "CD14+ Mono") @@ -223,6 +224,7 @@ Plotting multiple plots was previously achieved with the `CombinePlot()` functio ```{r combining_plots, fig.height = 5} plot1 <- DimPlot(pbmc3k.final) +# Create scatter plot with the Pearson correlation value as the title plot2 <- FeatureScatter(pbmc3k.final, feature1 = 'LYZ', feature2 = 'CCL5') # Combine two plots plot1 + plot2 diff --git a/vignettes/weighted_nearest_neighbor_analysis.Rmd b/vignettes/weighted_nearest_neighbor_analysis.Rmd index 4c207c7d0..aa1e98997 100644 --- a/vignettes/weighted_nearest_neighbor_analysis.Rmd +++ b/vignettes/weighted_nearest_neighbor_analysis.Rmd @@ -23,7 +23,8 @@ knitr::knit_hooks$set(time_it = local({ knitr::opts_chunk$set( message = FALSE, warning = FALSE, - time_it = TRUE + time_it = TRUE, + error = TRUE ) ``` The simultaneous measurement of multiple modalities, known as multimodal analysis, represents an exciting frontier for single-cell genomics and necessitates new computational methods that can define cellular states based on multiple data types. The varying information content of each modality, even across cells in the same dataset, represents a pressing challenge for the analysis and integration of multimodal datasets. In ([Hao\*, Hao\* et al, Cell 2021](https://doi.org/10.1016/j.cell.2021.04.048)), we introduce 'weighted-nearest neighbor' (WNN) analysis, an unsupervised framework to learn the relative utility of each data type in each cell, enabling an integrative analysis of multiple modalities. @@ -36,18 +37,13 @@ This vignette introduces the WNN workflow for the analysis of multimodal single- We demonstrate the use of WNN analysis to two single-cell multimodal technologies: CITE-seq and 10x multiome. We define the cellular states based on both modalities, instead of either individual modality. - # WNN analysis of CITE-seq, RNA + ADT We use the CITE-seq dataset from ([Stuart\*, Butler\* et al, Cell 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8)), which consists of 30,672 scRNA-seq profiles measured alongside a panel of 25 antibodies from bone marrow. The object contains two assays, RNA and antibody-derived tags (ADT). -To run this vignette please install Seurat v4, available on [CRAN](https://cran.r-project.org/web/packages/Seurat/index.html), and SeuratData, available on [GitHub](https://github.com/satijalab/seurat-data). - -```{r install, eval = FALSE} -install.packages("Seurat") -``` +To run this vignette please install SeuratData, available on [GitHub](https://github.com/satijalab/seurat-data). ```{r, include = FALSE, cache=FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(SeuratData.repo.use = "http://seurat.nygenome.org") ``` ```{r packages, cache=FALSE} @@ -162,8 +158,8 @@ You can download the dataset from the 10x Genomics website [here](https://suppor Finally, in order to run the vignette, make sure the following packages are installed: -* [Seurat v4](install.html) -* [Signac](https://satijalab.org/signac/) for the analysis of single-cell chromatin datasets +* [Seurat](install.html) +* [Signac](https://stuartlab.org/signac/) for the analysis of single-cell chromatin datasets * [EnsDb.Hsapiens.v86](https://bioconductor.org/packages/release/data/annotation/html/EnsDb.Hsapiens.v86.html) for a set of annotations for hg38 * [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) to help manipulate data tables @@ -177,7 +173,7 @@ library(ggplot2) -We'll create a Seurat object based on the gene expression data, and then add in the ATAC-seq data as a second assay. You can explore the [Signac getting started vignette](https://satijalab.org/signac/articles/pbmc_vignette.html) for more information on the creation and processing of a ChromatinAssay object. +We'll create a Seurat object based on the gene expression data, and then add in the ATAC-seq data as a second assay. You can explore the [Signac getting started vignette](https://stuartlab.org/signac/articles/pbmc_vignette.html) for more information on the creation and processing of a ChromatinAssay object. ```{r CreateObject} # the 10x hdf5 file contains both data types. @@ -215,7 +211,7 @@ pbmc[["ATAC"]] <- chrom_assay We perform basic QC based on the number of detected molecules for each modality as well as mitochondrial percentage. ```{r QCObject, fig.width=10} -VlnPlot(pbmc, features = c("nCount_ATAC", "nCount_RNA","percent.mt"), ncol = 3, +VlnPlot(pbmc, features = c("nCount_ATAC", "nCount_RNA", "percent.mt"), ncol = 3, log = TRUE, pt.size = 0) + NoLegend() pbmc <- subset( @@ -252,7 +248,7 @@ pbmc <- RunUMAP(pbmc, nn.name = "weighted.nn", reduction.name = "wnn.umap", redu pbmc <- FindClusters(pbmc, graph.name = "wsnn", algorithm = 3, verbose = FALSE) ``` -We annotate the clusters below. Note that you could also annotate the dataset using our supervised mapping pipelines, using either our [vignette](multimodal_reference_mapping.html), or [automated web tool, Azimuth](www.satijalab.org/azimuth). +We annotate the clusters below. Note that you could also annotate the dataset using our supervised mapping pipelines, using either our [vignette](multimodal_reference_mapping.html), or [automated web tool, Azimuth](https://satijalab.org/azimuth). ```{r Annotate, results = 'hide'} # perform sub-clustering on cluster 6 to find additional structure @@ -282,7 +278,7 @@ p3 <- DimPlot(pbmc, reduction = "wnn.umap", group.by = "celltype", label = TRUE, p1 + p2 + p3 & NoLegend() & theme(plot.title = element_text(hjust = 0.5)) ``` -For example, the ATAC-seq data assists in the separation of CD4 and CD8 T cell states. This is due to the presence of multiple loci that exhibit differential accessibility between different T cell subtypes. For example, we can visualize 'pseudobulk' tracks of the CD8A locus alongside violin plots of gene expression levels, using tools in the [Signac visualization vignette](https://satijalab.org/signac/articles/visualization.html). +For example, the ATAC-seq data assists in the separation of CD4 and CD8 T cell states. This is due to the presence of multiple loci that exhibit differential accessibility between different T cell subtypes. For example, we can visualize 'pseudobulk' tracks of the CD8A locus alongside violin plots of gene expression levels, using tools in the [Signac visualization vignette](https://stuartlab.org/signac/articles/visualization.html). ```{r coverageplotcd8, fig.width=10} ## to make the visualization easier, subset T cell clusters @@ -292,7 +288,7 @@ tcells <- subset(pbmc, idents = tcell.names) CoveragePlot(tcells, region = 'CD8A', features = 'CD8A', assay = 'ATAC', expression.assay = 'SCT', peaks = FALSE) ``` -Next, we will examine the accessible regions of each cell to determine enriched motifs. As described in the [Signac motifs vignette](https://satijalab.org/signac/articles/motif_vignette.html), there are a few ways to do this, but we will use the [chromVAR](https://www.nature.com/articles/nmeth.4401) package from the Greenleaf lab. This calculates a per-cell accessibility score for known motifs, and adds these scores as a third assay (`chromvar`) in the Seurat object. +Next, we will examine the accessible regions of each cell to determine enriched motifs. As described in the [Signac motifs vignette](https://stuartlab.org/signac/articles/motif_vignette.html), there are a few ways to do this, but we will use the [chromVAR](https://www.nature.com/articles/nmeth.4401) package from the Greenleaf lab. This calculates a per-cell accessibility score for known motifs, and adds these scores as a third assay (`chromvar`) in the Seurat object. To continue, please make sure you have the following packages installed.