Skip to content

Commit

Permalink
Merge branch 'main' into de/sc-33875/use_rc_in_ci
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 15, 2023
2 parents 0e802b9 + 84e53e5 commit af37773
Show file tree
Hide file tree
Showing 17 changed files with 335 additions and 109 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/r-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ jobs:
- name: Install BioConductor package SingleCellExperiment
run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment

- name: Install rc version of tiledb-r (macOS)
- name: Install r-universe build of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install rc version of tiledb-r (linux)
- name: Install r-universe build of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"

Expand Down
27 changes: 21 additions & 6 deletions .github/workflows/r-python-interop-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ name: TileDB-SOMA R-Python interop testing

on:
pull_request:
paths:
- "apis/python/**"
- "apis/r/**"
- "apis/system/**"
# TODO: leave this enabled for pre-merge signal for now. At some point we may want to go back to
# only having this signal post-merge.
#paths:
# - "apis/python/**"
# - "apis/r/**"
# - "apis/system/**"
push:
branches:
- main
Expand Down Expand Up @@ -38,6 +40,14 @@ jobs:
- name: MkVars
run: mkdir ~/.R && echo "CXX17FLAGS=-Wno-deprecated-declarations -Wno-deprecated" > ~/.R/Makevars

- name: Install r-universe build of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install r-universe build of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"

- name: Build and install libtiledbsoma
run: sudo scripts/bld --prefix=/usr/local && sudo ldconfig

Expand All @@ -48,6 +58,9 @@ jobs:
FILE=$(ls -1t *.tar.gz | head -n 1)
R CMD INSTALL $FILE
- name: Show R package versions
run: Rscript -e 'tiledbsoma::show_package_versions()'

- name: Install testing prereqs
run: python -m pip -v install -U pip pytest-cov 'typeguard<3.0' types-setuptools

Expand All @@ -61,8 +74,10 @@ jobs:
- name: Install tiledbsoma
run: python -m pip -v install -e apis/python

- name: Show package versions
run: python scripts/show-versions.py
- name: Show Python package versions
run: |
python -c 'import tiledbsoma; tiledbsoma.show_package_versions()'
python scripts/show-versions.py
- name: Interop Tests
run: python -m pytest apis/system/tests/
50 changes: 46 additions & 4 deletions apis/python/src/tiledbsoma/_arrow_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ def tiledb_type_from_arrow_type(
Raises:
TypeError: if the type is unsupported.
"""
if pa.types.is_dictionary(t):
t = t.index_type

arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
if t in arrow_to_tdb:
arrow_type = arrow_to_tdb[t]
Expand Down Expand Up @@ -142,7 +145,9 @@ def arrow_type_from_tiledb_dtype(
return pa.from_numpy_dtype(tiledb_dtype)


def tiledb_schema_to_arrow(tdb_schema: tiledb.ArraySchema) -> pa.Schema:
def tiledb_schema_to_arrow(
tdb_schema: tiledb.ArraySchema, uri: str, ctx: tiledb.ctx.Ctx
) -> pa.Schema:
arrow_schema_dict = {}
dom = tdb_schema.domain
for i in range(dom.ndim):
Expand All @@ -152,12 +157,32 @@ def tiledb_schema_to_arrow(tdb_schema: tiledb.ArraySchema) -> pa.Schema:
name = "unnamed"
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(dim.dtype)

# If there are any enumerated-type columns, we'll need to open the array once to get
# some information from there. If not, we'll need to open the array zero times.
# Open the array only if we'll need it for enum infos.
A = None
for i in range(tdb_schema.nattr):
attr = tdb_schema.attr(i)
name = attr.name
if name == "":
name = "unnamed"
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(attr.dtype, attr.isascii)
if attr.enum_label is not None: # enumerated
if A is None:
A = tiledb.open(uri, ctx=ctx)
info = A.enum(name)
arrow_schema_dict[name] = pa.dictionary(
index_type=arrow_type_from_tiledb_dtype(attr.dtype),
value_type=arrow_type_from_tiledb_dtype(
tiledb.datatypes.DataType.from_tiledb(info.type).np_dtype
),
ordered=info.ordered,
)
else: # non-enumerated
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(
attr.dtype, attr.isascii
)
if A is not None:
A.close()

return pa.schema(arrow_schema_dict)

Expand All @@ -170,8 +195,6 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
null_fields = set()
# Not for name, col in df.items() since we need df[k] on the left-hand sides
for k in df:
if df[k].dtype == "category":
df[k] = df[k].astype(df[k].cat.categories.dtype)
if df[k].isnull().any():
if df[k].isnull().all():
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
Expand All @@ -182,6 +205,25 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
inplace=True,
)
null_fields.add(k)

# For categoricals, it's possible to get
# TypeError: Object of type bool_ is not JSON serializable
# deep within library functions. Debugging reveals that this happens when
# the df[key].values.ordered is of type np.bool_ rather than Python bool.
# So, we cast and reconstruct.
for key in df:
column = df[key]
if isinstance(column.dtype, pd.CategoricalDtype):
if hasattr(column.values, "categories"):
categories = column.values.categories

if hasattr(column.values, "ordered"):
ordered = bool(column.values.ordered)

df[key] = pd.Categorical(
values=column, categories=categories, ordered=ordered
)

arrow_table = pa.Table.from_pandas(df)
if null_fields:
md = arrow_table.schema.metadata
Expand Down
59 changes: 51 additions & 8 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
"""
Implementation of a SOMA DataFrame
"""
from typing import Any, Optional, Sequence, Tuple, Type, Union, cast
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
import pyarrow as pa
import somacore
import tiledb
from numpy.typing import NDArray
from somacore import options
from typing_extensions import Self

Expand Down Expand Up @@ -133,6 +135,9 @@ def create(
platform_config: Optional[options.PlatformConfig] = None,
context: Optional[SOMATileDBContext] = None,
tiledb_timestamp: Optional[OpenTimestamp] = None,
enumerations: Optional[Dict[str, Union[Sequence[Any], NDArray[Any]]]] = None,
ordered_enumerations: Optional[Sequence[str]] = None,
column_to_enumerations: Optional[Dict[str, str]] = None,
) -> "DataFrame":
"""Creates the data structure on disk/S3/cloud.
Expand Down Expand Up @@ -168,6 +173,9 @@ def create(
If specified, overrides the default timestamp
used to open this object. If unset, uses the timestamp provided by
the context.
enumerations:
    If specified, a mapping from enumeration name to the sequence of
    values allowed for that enumeration.
Returns:
The DataFrame.
Expand Down Expand Up @@ -208,6 +216,9 @@ def create(
schema,
index_column_names,
domain,
enumerations or {},
ordered_enumerations or [],
column_to_enumerations or {},
TileDBCreateOptions.from_platform_config(platform_config),
context,
)
Expand Down Expand Up @@ -261,6 +272,17 @@ def count(self) -> int:
self._check_open_read()
return cast(int, self._soma_reader().nnz())

def enumeration(self, name: str) -> Tuple[Any, ...]:
    """Return the values of the named enumeration.

    Fetches the enumeration from the underlying SOMA reader and returns
    its values as a tuple.

    Args:
        name:
            Name of the enumeration to look up.

    Returns:
        Tuple[Any, ...]: the enumeration's values.
    """
    return tuple(self._soma_reader().get_enum(name))

def column_to_enumeration(self, name: str) -> str:
    """Return the name of the enumeration associated with the named column.

    Looks up the enumeration label on the attribute via the underlying
    SOMA reader and returns it as a string.
    """
    return str(self._soma_reader().get_enum_label_on_attr(name))

def __len__(self) -> int:
    """Return the number of rows in the dataframe (equivalent to ``df.count``)."""
    row_count: int = self.count
    return row_count
Expand Down Expand Up @@ -380,17 +402,19 @@ def write(
_util.check_type("values", values, (pa.Table,))

del platform_config # unused
dim_cols_map = {}
attr_cols_map = {}
dim_cols_map: Dict[str, pd.DataFrame] = {}
attr_cols_map: Dict[str, pd.DataFrame] = {}
dim_names_set = self.index_column_names
n = None

for name in values.schema.names:
n = len(values.column(name))
if name in dim_names_set:
dim_cols_map[name] = values.column(name).to_pandas()
col = values.column(name)
n = len(col)
cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
attr_cols_map[name] = values.column(name).to_pandas()
cols_map[name] = col.to_pandas()
if n is None:
raise ValueError(f"did not find any column names in {values.schema.names}")

Expand Down Expand Up @@ -634,7 +658,8 @@ def _canonicalize_schema(
raise ValueError(
f"All index names must be defined in the dataframe schema: '{index_column_name}' not in {schema_names_string}"
)
if schema.field(index_column_name).type not in [
dtype = schema.field(index_column_name).type
if not pa.types.is_dictionary(dtype) and dtype not in [
pa.int8(),
pa.uint8(),
pa.int16(),
Expand Down Expand Up @@ -665,6 +690,9 @@ def _build_tiledb_schema(
schema: pa.Schema,
index_column_names: Sequence[str],
domain: Optional[Sequence[Optional[Tuple[Any, Any]]]],
enumerations: Dict[str, Any],
ordered_enumerations: Sequence[str],
column_to_enumerations: Dict[str, str],
tiledb_create_options: TileDBCreateOptions,
context: SOMATileDBContext,
) -> tiledb.ArraySchema:
Expand Down Expand Up @@ -714,6 +742,17 @@ def _build_tiledb_schema(

dom = tiledb.Domain(dims, ctx=context.tiledb_ctx)

enums = []
if enumerations is not None:
for enum_name in enumerations:
enums.append(
tiledb.Enumeration(
enum_name,
enum_name in ordered_enumerations,
np.array(enumerations[enum_name]),
)
)

attrs = []
metadata = schema.metadata or {}
for attr_name in schema.names:
Expand All @@ -728,6 +767,9 @@ def _build_tiledb_schema(
filters=tiledb_create_options.attr_filters_tiledb(
attr_name, ["ZstdFilter"]
),
enum_label=column_to_enumerations[attr_name]
if attr_name in column_to_enumerations
else None,
ctx=context.tiledb_ctx,
)
attrs.append(attr)
Expand All @@ -737,6 +779,7 @@ def _build_tiledb_schema(
return tiledb.ArraySchema(
domain=dom,
attrs=attrs,
enums=enums,
sparse=True,
allows_duplicates=tiledb_create_options.allows_duplicates,
offsets_filters=tiledb_create_options.offsets_filters_tiledb(),
Expand Down
17 changes: 10 additions & 7 deletions apis/python/src/tiledbsoma/_query_condition.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ def __attrs_post_init__(self):
"(Is this an empty expression?)"
)

def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str]):
qctree = QueryConditionTree(schema, query_attrs)
def init_query_condition(self, uri: str, query_attrs: List[str]):
qctree = QueryConditionTree(tiledb.open(uri), query_attrs)
self.c_obj = qctree.visit(self.tree.body)

if not isinstance(self.c_obj, clib.PyQueryCondition):
Expand All @@ -143,7 +143,7 @@ def init_query_condition(self, schema: tiledb.ArraySchema, query_attrs: List[str

@attrs.define
class QueryConditionTree(ast.NodeVisitor):
schema: tiledb.ArraySchema
array: tiledb.Array
query_attrs: List[str]

def visit_BitOr(self, node):
Expand Down Expand Up @@ -237,8 +237,11 @@ def aux_visit_Compare(

att = self.get_att_from_node(att)
val = self.get_val_from_node(val)

dt = self.schema.attr(att).dtype
enum_label = self.array.attr(att).enum_label
if enum_label is not None:
dt = self.array.enum(enum_label).dtype
else:
dt = self.array.attr(att).dtype
dtype = "string" if dt.kind in "SUa" else dt.name
val = self.cast_val_to_dtype(val, dtype)

Expand Down Expand Up @@ -318,8 +321,8 @@ def get_att_from_node(self, node: QueryConditionNodeElem) -> Any:
f"Incorrect type for attribute name: {ast.dump(node)}"
)

if not self.schema.has_attr(att):
if self.schema.domain.has_dim(att):
if not self.array.schema.has_attr(att):
if self.array.schema.domain.has_dim(att):
raise tiledb.TileDBError(
f"`{att}` is a dimension. QueryConditions currently only "
"work on attributes."
Expand Down
6 changes: 3 additions & 3 deletions apis/python/src/tiledbsoma/_tiledb_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def schema(self) -> pa.Schema:
Lifecycle:
Experimental.
"""
return tiledb_schema_to_arrow(self._tiledb_array_schema())
return tiledb_schema_to_arrow(self._tiledb_array_schema(), self.uri, self._ctx)

def _tiledb_array_schema(self) -> tiledb.ArraySchema:
"""Returns the TileDB array schema, for internal use."""
Expand Down Expand Up @@ -101,8 +101,8 @@ def _soma_reader(
# Leave empty arguments out of kwargs to allow C++ constructor defaults to apply, as
# they're not all wrapped in std::optional<>.
kwargs: Dict[str, object] = {}
if schema:
kwargs["schema"] = schema
# if schema:
# kwargs["schema"] = schema
if column_names:
kwargs["column_names"] = column_names
if query_condition:
Expand Down
9 changes: 6 additions & 3 deletions apis/python/src/tiledbsoma/io/_registration/signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ def _string_dict_from_arrow_schema(schema: pa.Schema) -> Dict[str, str]:
Converts an Arrow schema to a string/string dict, which is easier on the eyes,
easier to convert from/to JSON for distributed logging, and easier to do del-key on.
"""

retval = {name: _stringify_type(schema.field(name).type) for name in schema.names}

retval = {}
for name in schema.names:
arrow_type = schema.field(name).type
if pa.types.is_dictionary(arrow_type):
arrow_type = arrow_type.index_type
retval[name] = _stringify_type(arrow_type)
# The soma_joinid field is specific to SOMA data but does not exist in AnnData/H5AD. When we
# pre-check an AnnData/H5AD input to see if it's appendable to an existing SOMA experiment, we
# must not punish the AnnData/H5AD input for it not having a soma_joinid column in its obs and
Expand Down
Loading

0 comments on commit af37773

Please sign in to comment.