Add native reader support for AnnData v0.8.0

theislab · Mar 7, 2023 · 843fbc2 · 843fbc2
1 parent fa22a44
commit 843fbc2
Show file tree

Hide file tree

Showing 7 changed files with 238 additions and 26 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -12,7 +12,12 @@ Authors@R:
              family = "Lun",
              role = c("aut"),
              email = "[email protected]",
-             comment = c(ORCID = "0000-0002-3564-4813"))
+             comment = c(ORCID = "0000-0002-3564-4813")),
+      person(given = "Jack",
+             family = "Kamm",
+             role = c("ctb"),
+             email = "[email protected]",
+             comment = c(ORCID = "0000-0003-2412-756X"))
     )
 Description:
     Provides methods to convert between Python AnnData objects and
@@ -63,7 +68,7 @@ BugReports: https://github.com/theislab/zellkonverter/issues
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
 Language: en-GB
 StagedInstall: no
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,8 +29,10 @@ importFrom(SingleCellExperiment,"colPairs<-")
 importFrom(SingleCellExperiment,"reducedDims<-")
 importFrom(SingleCellExperiment,"rowPairs<-")
 importFrom(SingleCellExperiment,SingleCellExperiment)
+importFrom(SummarizedExperiment,"assays<-")
 importFrom(SummarizedExperiment,"colData<-")
 importFrom(SummarizedExperiment,"rowData<-")
+importFrom(SummarizedExperiment,assays)
 importFrom(SummarizedExperiment,colData)
 importFrom(SummarizedExperiment,rowData)
 importFrom(basilisk,basiliskRun)
@@ -43,3 +45,4 @@ importFrom(reticulate,import_builtins)
 importFrom(reticulate,py_to_r)
 importFrom(reticulate,r_to_py)
 importFrom(utils,capture.output)
+importFrom(utils,compareVersion)
diff --git a/R/read.R b/R/read.R
@@ -79,7 +79,7 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
         )
 
     } else if (reader == "R") {
-        sce <- .native_reader(file, backed = use_hdf5, verbose = verbose)
+        sce <- .native_reader(file, backed = use_hdf5, verbose = verbose, version=version)
     }
 
     return(sce)
@@ -99,22 +99,26 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
                 ...)
 }
 
+#' @importFrom utils compareVersion
 #' @importFrom S4Vectors I DataFrame wmsg
-#' @importFrom SummarizedExperiment rowData colData rowData<- colData<-
+#' @importFrom SummarizedExperiment assays assays<- rowData colData rowData<- colData<-
 #' @importFrom SingleCellExperiment SingleCellExperiment reducedDims<- colPairs<- rowPairs<-
-.native_reader <- function(file, backed = FALSE, verbose = FALSE) {
+.native_reader <- function(file, backed = FALSE, verbose = FALSE, version = NULL) {
     .ui_info("Using the {.field R} reader")
     .ui_step("Reading {.file {file}}", spinner = TRUE)
 
+    if (is.null(version)) {
+        version <- .AnnDataVersions[1]
+    }
+
     contents <- .list_contents(file)
 
+    all.assays <- list()
+
     # Let's read in the X matrix first... if it's there.
-    if (!"X" %in% names(contents)) {
-        stop("missing an 'X' entry in '", file, "'")
+    if ("X" %in% names(contents)) {
+        all.assays[["X"]] <- .read_matrix(file, "X", contents[["X"]], backed = backed)
     }
-    all.assays <- list(
-        X = .read_matrix(file, "X", contents[["X"]], backed = backed)
-    )
 
     for (layer in names(contents[["layers"]])) {
         tryCatch(
@@ -140,7 +144,7 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
     # Adding the various pieces of data.
     tryCatch(
         {
-            col_data <- .read_dim_data(file, "obs", contents[["obs"]])
+            col_data <- .read_dim_data(file, "obs", contents[["obs"]], version)
             if (!is.null(col_data)) {
                 colData(sce) <- col_data
             }
@@ -155,9 +159,13 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
 
     tryCatch(
         {
-            row_data <- .read_dim_data(file, "var", contents[["var"]])
+            row_data <- .read_dim_data(file, "var", contents[["var"]], version)
             if (!is.null(row_data)) {
                 rowData(sce) <- row_data
+                # Manually set SCE rownames, because setting rowData
+                # doesn't seem to set them. (Even tho setting colData
+                # does set the colnames)
+                rownames(sce) <- rownames(row_data)
             }
         },
         error = function(e) {
@@ -225,7 +233,15 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
     if ("uns" %in% names(contents)) {
         tryCatch(
             {
-                metadata(sce) <- rhdf5::h5read(file, "uns")
+                uns <- rhdf5::h5read(file, "uns")
+
+                if (compareVersion(version, "0.8") >= 0) {
+                    uns <- .convert_element(
+                        uns, "uns", file, recursive=TRUE
+                    )
+                }
+
+                metadata(sce) <- uns
             },
             error = function(e) {
                 warning(wmsg(
@@ -236,6 +252,12 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
         )
     }
 
+    if (("X_name" %in% names(metadata(sce))) && ("X" %in% names(contents))) {
+        stopifnot(names(assays(sce))[1] == "X") #should be true b/c X is read 1st
+        names(assays(sce))[1] <- metadata(sce)[["X_name"]]
+        metadata(sce)[["X_name"]] <- NULL
+    }
+
     sce
 }
 
@@ -289,23 +311,87 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
     mat
 }
 
+.convert_element <- function(obj, path, file, recursive=FALSE) {
+    element_attrs <- rhdf5::h5readAttributes(file, path)
+
+    # Convert categorical element
+    if (identical(element_attrs[["encoding-type"]], "categorical")) {
+        codes <- obj[["codes"]] + 1
+        codes[codes == 0] <- NA
+        levels <- obj[["categories"]]
+
+        # HACK rhdf5 doesn't yet support enums in attrs
+        # (h5readAttributes() above may warn about this)
+        #ord <- as.logical(element_attrs[["ordered"]])
+        ord <- FALSE
+
+        obj <- factor(levels[codes], levels=levels, ordered=ord)
+        return(obj)
+    }
+
+    # Handle nullable booleans/integers
+    if (element_attrs[["encoding-type"]] %in% c("nullable-boolean",
+                                                "nullable-integer")) {
+        mask <- as.logical(obj[["mask"]]) # convert enum to bool
+        obj <- obj[["values"]]
+        obj[mask] <- NA
+    }
+
+    # Handle booleans. Non-nullable booleans have encoding-type
+    # "array", so we have to infer the type from the enum levels
+    if (is.factor(obj) && identical(levels(obj), c("FALSE", "TRUE"))) {
+        obj <- as.logical(obj)
+        return(obj)
+    }
+
+    # Recursively convert element members
+    if (recursive && is.list(obj) && !is.null(names(obj))) {
+        for (k in names(obj)) {
+            obj[[k]] <- .convert_element(
+                obj[[k]], file.path(path, k),
+                file, recursive=TRUE
+            )
+        }
+    }
+
+    if (is.list(obj) && !is.null(names(obj))) {
+        names(obj) <- make.names(names(obj))
+    }
+
+    obj
+}
+
+#' @importFrom utils compareVersion
 #' @importFrom S4Vectors DataFrame
-.read_dim_data <- function(file, path, fields) {
+.read_dim_data <- function(file, path, fields, version) {
     col_names <- setdiff(names(fields), c("__categories", "_index"))
     out_cols <- list()
     for (col_name in col_names) {
-        out_cols[[col_name]] <- as.vector(
-            rhdf5::h5read(file, file.path(path, col_name))
-        )
+        vec <- rhdf5::h5read(file, file.path(path, col_name))
+
+        if (compareVersion(version, "0.8") >= 0) {
+            vec <- .convert_element(
+                vec, file.path(path, col_name),
+                file, recursive=FALSE
+            )
+        }
+
+        if (!is.factor(vec)) {
+            vec <- as.vector(vec)
+        }
+
+        out_cols[[col_name]] <- vec
     }
 
-    cat_names <- names(fields[["__categories"]])
-    for (cat_name in cat_names) {
-        levels <- as.vector(
-            rhdf5::h5read(file, file.path(path, "__categories", cat_name))
-        )
-        out_cols[[cat_name]] <- factor(out_cols[[cat_name]])
-        levels(out_cols[[cat_name]]) <- levels
+    if (compareVersion(version, "0.8") < 0) {
+        cat_names <- names(fields[["__categories"]])
+        for (cat_name in cat_names) {
+            levels <- as.vector(
+                rhdf5::h5read(file, file.path(path, "__categories", cat_name))
+            )
+            out_cols[[cat_name]] <- factor(out_cols[[cat_name]])
+            levels(out_cols[[cat_name]]) <- levels
+        }
     }
 
     if (!is.null(fields[["_index"]])) {
@@ -314,11 +400,16 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
         indices <- NULL
     }
 
+    column_order <- rhdf5::h5readAttributes(file, path)[["column-order"]]
+    if (!is.null(column_order)) {
+        out_cols <- out_cols[column_order]
+    }
+
     if (length(out_cols)) {
         df <- do.call(DataFrame, out_cols)
         rownames(df) <- indices
     } else if (!is.null(indices)) {
-        df <- DataFrame(row_names = indices)
+        df <- DataFrame(row.names = indices)
     } else {
         df <- NULL
     }

diff --git a/inst/extdata/krumsiek11_augmented_v0-8.h5ad b/inst/extdata/krumsiek11_augmented_v0-8.h5ad
diff --git a/inst/scripts/krumsiek11_augmented.py b/inst/scripts/krumsiek11_augmented.py
@@ -0,0 +1,54 @@
+# This script was used to create the `krumsiek11_augmented_v0-8.h5ad`
+# file. It adds some extra data to the previous `krumsiek11.h5ad`
+# dataset to cover some additional cases for testing (NAs, booleans,
+# etc). The data was saved in AnnData=0.8.0 format.
+#
+# Key package versions:
+#  - anndata=0.8.0
+#  - h5py=3.8.0
+#  - hdf5=1.14.0
+#  - numpy=1.23.5
+#  - pandas=1.5.3
+#  - python=3.9.16
+#  - scanpy=1.9.2
+
+import numpy as np
+import pandas as pd
+import anndata as ad
+
+adata = ad.read_h5ad("krumsiek11.h5ad")
+
+# add string column to rowData/var. Make the entries unique so it's
+# saved as str instead of factor
+adata.var["dummy_str"] = [f"row{i}" for i in range(adata.shape[1])]
+
+# add float column to colData/obs
+adata.obs["dummy_num"] = 42.42
+
+# float column with NA
+adata.obs["dummy_num2"] = adata.obs["dummy_num"]
+adata.obs["dummy_num2"][0] = float("nan")
+
+# int column
+adata.obs["dummy_int"] = np.arange(adata.shape[0])
+
+# int column with NA
+adata.obs["dummy_int2"] = pd.array([None] + [42] * (adata.shape[0] - 1))
+
+# bool column
+adata.obs["dummy_bool"] = True
+adata.obs["dummy_bool"][0] = False
+
+# bool column with NA
+adata.obs["dummy_bool2"] = pd.array([False, None] + [True] * (adata.shape[0] - 2))
+
+# also add some entries to the metadata/uns
+adata.uns["dummy_category"] = pd.array(["a", "b", None], dtype="category")
+
+adata.uns["dummy_bool"] = [True, True, False]
+adata.uns["dummy_bool2"] = pd.array([True, False, None])
+
+adata.uns["dummy_int"] = [1,2,3]
+adata.uns["dummy_int2"] = pd.array([1,2,None])
+
+adata.write("krumsiek11_augmented_v0-8.h5ad")
diff --git a/man/zellkonverter-package.Rd b/man/zellkonverter-package.Rd
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
@@ -1,6 +1,7 @@
 # This tests the readH5AD function (and by implication, SCE2AnnData).
 library(SummarizedExperiment)
 file <- system.file("extdata", "krumsiek11.h5ad", package = "zellkonverter")
+file_v08 <- system.file("extdata", "krumsiek11_augmented_v0-8.h5ad", package = "zellkonverter")
 
 test_that("Reading H5AD works", {
     sce <- readH5AD(file)
@@ -67,13 +68,66 @@ test_that("readH5AD works in a separate process", {
 })
 
 test_that("Reading H5AD works with native reader", {
-    sce <- readH5AD(file, reader = "R")
+    sce <- readH5AD(file, reader = "R", version="0.7")
     expect_s4_class(sce, "SingleCellExperiment")
 
     expect_identical(assayNames(sce), "X")
     expect_identical(colnames(colData(sce)), "cell_type")
 })
 
+test_that("Reading v0.8 H5AD works with native reader", {
+    sce_py <- readH5AD(file_v08)
+    sce_r <- readH5AD(file_v08, reader="R")
+
+    expect_identical(rownames(sce_py), rownames(sce_r))
+    expect_identical(colnames(sce_py), colnames(sce_r))
+
+    expect_identical(rowData(sce_py), rowData(sce_r))
+
+    expect_identical(colnames(colData(sce_py)), colnames(colData(sce_r)))
+
+    # check colData columns that Python reader is able to handle
+    good_coldat_columns <- c("cell_type", "dummy_bool", "dummy_int",
+                             "dummy_num", "dummy_num2")
+
+    expect_equal(colData(sce_py)[,good_coldat_columns],
+                 colData(sce_r)[,good_coldat_columns])
+
+    # Manually check colData columns that Python reader doesn't handle
+    # right (it reads them as environment objects)
+    expect_equal(colData(sce_r)$dummy_bool2,
+                 c(FALSE, NA, rep(TRUE, 638)))
+
+    expect_equal(colData(sce_r)$dummy_int2,
+                 c(NA, rep(42, 639)))
+
+    # check the X assay
+    expect_identical(assays(sce_py), assays(sce_r))
+
+    # check the easy metadata columns
+    for (key in c("highlights", "iroot", "dummy_int")) {
+        expect_equal(metadata(sce_py)[[key]], metadata(sce_r)[[key]])
+    }
+
+    # python reads uns[dummy_bool] as an array, so convert it (is that
+    # a bug in the python reader?)
+    expect_equal(
+        as.vector(metadata(sce_py)[["dummy_bool"]]),
+        metadata(sce_r)[["dummy_bool"]]
+    )
+
+    # python reader doesn't parse these metadata, so check manually
+    # (the factor is skipped outright; the bool/int are returned as environments).
+    expect_identical(metadata(sce_r)[["dummy_bool2"]],
+                     c(TRUE, FALSE, NA))
+
+    expect_equal(metadata(sce_r)[["dummy_int2"]],
+                 as.array(c(1, 2, NA)))
+
+    expect_equal(metadata(sce_r)[["dummy_category"]],
+                 factor(c("a", "b", NA)))
+})
+
 test_that("Skipping slot conversion works", {
     sce <- readH5AD(file, layers = FALSE, uns = FALSE, var = FALSE, obs = FALSE,
                     varm = FALSE, obsm = FALSE, varp = FALSE, obsp = FALSE)