Skip to content

Commit

Permalink
Add native reader support for AnnData v0.8.0
Browse files Browse the repository at this point in the history
  • Loading branch information
jackkamm committed Mar 7, 2023
1 parent fa22a44 commit 843fbc2
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 26 deletions.
9 changes: 7 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@ Authors@R:
family = "Lun",
role = c("aut"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-3564-4813"))
comment = c(ORCID = "0000-0002-3564-4813")),
person(given = "Jack",
family = "Kamm",
role = c("ctb"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-2412-756X"))
)
Description:
Provides methods to convert between Python AnnData objects and
Expand Down Expand Up @@ -63,7 +68,7 @@ BugReports: https://github.com/theislab/zellkonverter/issues
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.2.3
Language: en-GB
StagedInstall: no
VignetteBuilder: knitr
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@ importFrom(SingleCellExperiment,"colPairs<-")
importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,"rowPairs<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
importFrom(SummarizedExperiment,"rowData<-")
importFrom(SummarizedExperiment,assays)
importFrom(SummarizedExperiment,colData)
importFrom(SummarizedExperiment,rowData)
importFrom(basilisk,basiliskRun)
Expand All @@ -43,3 +45,4 @@ importFrom(reticulate,import_builtins)
importFrom(reticulate,py_to_r)
importFrom(reticulate,r_to_py)
importFrom(utils,capture.output)
importFrom(utils,compareVersion)
137 changes: 114 additions & 23 deletions R/read.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
)

} else if (reader == "R") {
sce <- .native_reader(file, backed = use_hdf5, verbose = verbose)
sce <- .native_reader(file, backed = use_hdf5, verbose = verbose, version=version)
}

return(sce)
Expand All @@ -99,22 +99,26 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
...)
}

#' @importFrom utils compareVersion
#' @importFrom S4Vectors I DataFrame wmsg
#' @importFrom SummarizedExperiment rowData colData rowData<- colData<-
#' @importFrom SummarizedExperiment assays assays<- rowData colData rowData<- colData<-
#' @importFrom SingleCellExperiment SingleCellExperiment reducedDims<- colPairs<- rowPairs<-
.native_reader <- function(file, backed = FALSE, verbose = FALSE) {
.native_reader <- function(file, backed = FALSE, verbose = FALSE, version = NULL) {
.ui_info("Using the {.field R} reader")
.ui_step("Reading {.file {file}}", spinner = TRUE)

if (is.null(version)) {
version <- .AnnDataVersions[1]
}

contents <- .list_contents(file)

all.assays <- list()

# Let's read in the X matrix first... if it's there.
if (!"X" %in% names(contents)) {
stop("missing an 'X' entry in '", file, "'")
if ("X" %in% names(contents)) {
all.assays[["X"]] <- .read_matrix(file, "X", contents[["X"]], backed = backed)
}
all.assays <- list(
X = .read_matrix(file, "X", contents[["X"]], backed = backed)
)

for (layer in names(contents[["layers"]])) {
tryCatch(
Expand All @@ -140,7 +144,7 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
# Adding the various pieces of data.
tryCatch(
{
col_data <- .read_dim_data(file, "obs", contents[["obs"]])
col_data <- .read_dim_data(file, "obs", contents[["obs"]], version)
if (!is.null(col_data)) {
colData(sce) <- col_data
}
Expand All @@ -155,9 +159,13 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,

tryCatch(
{
row_data <- .read_dim_data(file, "var", contents[["var"]])
row_data <- .read_dim_data(file, "var", contents[["var"]], version)
if (!is.null(row_data)) {
rowData(sce) <- row_data
# Manually set SCE rownames, because setting rowData
# doesn't seem to set them. (Even tho setting colData
# does set the colnames)
rownames(sce) <- rownames(row_data)
}
},
error = function(e) {
Expand Down Expand Up @@ -225,7 +233,15 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
if ("uns" %in% names(contents)) {
tryCatch(
{
metadata(sce) <- rhdf5::h5read(file, "uns")
uns <- rhdf5::h5read(file, "uns")

if (compareVersion(version, "0.8") >= 0) {
uns <- .convert_element(
uns, "uns", file, recursive=TRUE
)
}

metadata(sce) <- uns
},
error = function(e) {
warning(wmsg(
Expand All @@ -236,6 +252,12 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
)
}

if (("X_name" %in% names(metadata(sce))) && ("X" %in% names(contents))) {
stopifnot(names(assays(sce))[1] == "X") #should be true b/c X is read 1st
names(assays(sce))[1] <- metadata(sce)[["X_name"]]
metadata(sce)[["X_name"]] <- NULL
}

sce
}

Expand Down Expand Up @@ -289,23 +311,87 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
mat
}

.convert_element <- function(obj, path, file, recursive=FALSE) {
element_attrs <- rhdf5::h5readAttributes(file, path)

# Convert categorical element
if (identical(element_attrs[["encoding-type"]], "categorical")) {
codes <- obj[["codes"]] + 1
codes[codes == 0] <- NA
levels <- obj[["categories"]]

# HACK rhdf5 doesn't yet support enums in attrs
# (h5readAttributes() above may warn about this)
#ord <- as.logical(element_attrs[["ordered"]])
ord <- FALSE

obj <- factor(levels[codes], levels=levels, ordered=ord)
return(obj)
}

# Handle nullable booleans/integers
if (element_attrs[["encoding-type"]] %in% c("nullable-boolean",
"nullable-integer")) {
mask <- as.logical(obj[["mask"]]) # convert enum to bool
obj <- obj[["values"]]
obj[mask] <- NA
}

# Handle booleans. Non-nullable booleans have encoding-type
# "array", so we have to infer the type from the enum levels
if (is.factor(obj) && identical(levels(obj), c("FALSE", "TRUE"))) {
obj <- as.logical(obj)
return(obj)
}

# Recursively convert element members
if (recursive && is.list(obj) && !is.null(names(obj))) {
for (k in names(obj)) {
obj[[k]] <- .convert_element(
obj[[k]], file.path(path, k),
file, recursive=TRUE
)
}
}

if (is.list(obj) && !is.null(names(obj))) {
names(obj) <- make.names(names(obj))
}

obj
}

#' @importFrom utils compareVersion
#' @importFrom S4Vectors DataFrame
.read_dim_data <- function(file, path, fields) {
.read_dim_data <- function(file, path, fields, version) {
col_names <- setdiff(names(fields), c("__categories", "_index"))
out_cols <- list()
for (col_name in col_names) {
out_cols[[col_name]] <- as.vector(
rhdf5::h5read(file, file.path(path, col_name))
)
vec <- rhdf5::h5read(file, file.path(path, col_name))

if (compareVersion(version, "0.8") >= 0) {
vec <- .convert_element(
vec, file.path(path, col_name),
file, recursive=FALSE
)
}

if (!is.factor(vec)) {
vec <- as.vector(vec)
}

out_cols[[col_name]] <- vec
}

cat_names <- names(fields[["__categories"]])
for (cat_name in cat_names) {
levels <- as.vector(
rhdf5::h5read(file, file.path(path, "__categories", cat_name))
)
out_cols[[cat_name]] <- factor(out_cols[[cat_name]])
levels(out_cols[[cat_name]]) <- levels
if (compareVersion(version, "0.8") < 0) {
cat_names <- names(fields[["__categories"]])
for (cat_name in cat_names) {
levels <- as.vector(
rhdf5::h5read(file, file.path(path, "__categories", cat_name))
)
out_cols[[cat_name]] <- factor(out_cols[[cat_name]])
levels(out_cols[[cat_name]]) <- levels
}
}

if (!is.null(fields[["_index"]])) {
Expand All @@ -314,11 +400,16 @@ readH5AD <- function(file, X_name = NULL, use_hdf5 = FALSE,
indices <- NULL
}

column_order <- rhdf5::h5readAttributes(file, path)[["column-order"]]
if (!is.null(column_order)) {
out_cols <- out_cols[column_order]
}

if (length(out_cols)) {
df <- do.call(DataFrame, out_cols)
rownames(df) <- indices
} else if (!is.null(indices)) {
df <- DataFrame(row_names = indices)
df <- DataFrame(row.names = indices)
} else {
df <- NULL
}
Expand Down
Binary file added inst/extdata/krumsiek11_augmented_v0-8.h5ad
Binary file not shown.
54 changes: 54 additions & 0 deletions inst/scripts/krumsiek11_augmented.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# This script was used to create the `krumsiek11_augmented_v0-8.h5ad`
# file. It adds some extra data to the previous `krumsiek11.h5ad`
# dataset to cover some additional cases for testing (NAs, booleans,
# etc). The data was saved in AnnData=0.8.0 format.
#
# Key package versions:
# - anndata=0.8.0
# - h5py=3.8.0
# - hdf5=1.14.0
# - numpy=1.23.5
# - pandas=1.5.3
# - python=3.9.16
# - scanpy=1.9.2

import numpy as np
import pandas as pd
import anndata as ad

adata = ad.read_h5ad("krumsiek11.h5ad")

# add string column to rowData/var. Make the entries unique so it's
# saved as str instead of factor
adata.var["dummy_str"] = [f"row{i}" for i in range(adata.shape[1])]

# add float column to colData/obs
adata.obs["dummy_num"] = 42.42

# float column with NA
adata.obs["dummy_num2"] = adata.obs["dummy_num"]
adata.obs["dummy_num2"][0] = float("nan")

# int column
adata.obs["dummy_int"] = np.arange(adata.shape[0])

# int column with NA
adata.obs["dummy_int2"] = pd.array([None] + [42] * (adata.shape[0] - 1))

# bool column
adata.obs["dummy_bool"] = True
adata.obs["dummy_bool"][0] = False

# bool column with NA
adata.obs["dummy_bool2"] = pd.array([False, None] + [True] * (adata.shape[0] - 2))

# also add some entries to the metadata/uns
adata.uns["dummy_category"] = pd.array(["a", "b", None], dtype="category")

adata.uns["dummy_bool"] = [True, True, False]
adata.uns["dummy_bool2"] = pd.array([True, False, None])

adata.uns["dummy_int"] = [1,2,3]
adata.uns["dummy_int2"] = pd.array([1,2,None])

adata.write("krumsiek11_augmented_v0-8.h5ad")
5 changes: 5 additions & 0 deletions man/zellkonverter-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

56 changes: 55 additions & 1 deletion tests/testthat/test-read.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# This tests the readH5AD function (and by implication, SCE2AnnData).
library(SummarizedExperiment)
file <- system.file("extdata", "krumsiek11.h5ad", package = "zellkonverter")
file_v08 <- system.file("extdata", "krumsiek11_augmented_v0-8.h5ad", package = "zellkonverter")

test_that("Reading H5AD works", {
sce <- readH5AD(file)
Expand Down Expand Up @@ -67,13 +68,66 @@ test_that("readH5AD works in a separate process", {
})

test_that("Reading H5AD works with native reader", {
sce <- readH5AD(file, reader = "R")
sce <- readH5AD(file, reader = "R", version="0.7")
expect_s4_class(sce, "SingleCellExperiment")

expect_identical(assayNames(sce), "X")
expect_identical(colnames(colData(sce)), "cell_type")
})

test_that("Reading v0.8 H5AD works with native reader", {
sce_py <- readH5AD(file_v08)
sce_r <- readH5AD(file_v08, reader="R")

expect_identical(rownames(sce_py), rownames(sce_r))
expect_identical(colnames(sce_py), colnames(sce_r))

expect_identical(rowData(sce_py), rowData(sce_r))

expect_identical(colnames(colData(sce_py)), colnames(colData(sce_r)))

# check colData columns that Python reader is able to handle
good_coldat_columns <- c("cell_type", "dummy_bool", "dummy_int",
"dummy_num", "dummy_num2")

expect_equal(colData(sce_py)[,good_coldat_columns],
colData(sce_r)[,good_coldat_columns])

# Manually check colData columns that Python reader doesn't handle
# right (it reads them as environment objects)
expect_equal(colData(sce_r)$dummy_bool2,
c(FALSE, NA, rep(TRUE, 638)))

expect_equal(colData(sce_r)$dummy_int2,
c(NA, rep(42, 639)))

# check the X assay
expect_identical(assays(sce_py), assays(sce_r))

# check the easy metadata columns
for (key in c("highlights", "iroot", "dummy_int")) {
expect_equal(metadata(sce_py)[[key]], metadata(sce_r)[[key]])
}

# python reads uns[dummy_bool] as an array, so convert it (is that
# a bug in the python reader?)
expect_equal(
as.vector(metadata(sce_py)[["dummy_bool"]]),
metadata(sce_r)[["dummy_bool"]]
)

# python reader doesn't parse these metadata, so check manually
# (the factor is skipped outright; the bool/int are returned as environments).
expect_identical(metadata(sce_r)[["dummy_bool2"]],
c(TRUE, FALSE, NA))

expect_equal(metadata(sce_r)[["dummy_int2"]],
as.array(c(1, 2, NA)))

expect_equal(metadata(sce_r)[["dummy_category"]],
factor(c("a", "b", NA)))
})

test_that("Skipping slot conversion works", {
sce <- readH5AD(file, layers = FALSE, uns = FALSE, var = FALSE, obs = FALSE,
varm = FALSE, obsm = FALSE, varp = FALSE, obsp = FALSE)
Expand Down

0 comments on commit 843fbc2

Please sign in to comment.