No PR #1730

Closed · wants to merge 9 commits
4 changes: 4 additions & 0 deletions .github/workflows/python-ci-minimal.yml
@@ -23,12 +23,16 @@ jobs:
- runs-on: ubuntu-22.04
cc: gcc-11
cxx: g++-11
- runs-on: macos-12
cc: clang
cxx: clang++
uses: ./.github/workflows/python-ci-single.yml
with:
os: ${{ matrix.os }}
python_version: ${{ matrix.python-version }}
cc: ${{ matrix.cc }}
cxx: ${{ matrix.cxx }}
is_mac: ${{ contains(matrix.os, 'macos') }}
report_codecov: ${{ matrix.python-version == '3.10' }}
run_lint: ${{ matrix.python-version == '3.10' }}
secrets: inherit
10 changes: 1 addition & 9 deletions .github/workflows/r-ci.yml
@@ -40,15 +40,7 @@ jobs:

- name: Install BioConductor package SingleCellExperiment
run: cd apis/r && tools/r-ci.sh install_bioc SingleCellExperiment

- name: Install r-universe build of tiledb-r (macOS)
if: ${{ matrix.os == 'macOS-latest' }}
run: cd apis/r && Rscript -e "install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev', 'https://cloud.r-project.org'))"

- name: Install r-universe build of tiledb-r (linux)
if: ${{ matrix.os != 'macOS-latest' }}
run: cd apis/r && Rscript -e "options(bspm.version.check=TRUE); install.packages('tiledb', repos = c('https://eddelbuettel.r-universe.dev/bin/linux/jammy/4.3/', 'https://cloud.r-project.org'))"


- name: Dependencies
run: cd apis/r && tools/r-ci.sh install_all

22 changes: 7 additions & 15 deletions apis/python/setup.py
@@ -271,21 +271,13 @@ def run(self):
"anndata < 0.9; python_version<'3.8'",
"anndata; python_version>='3.8'",
"attrs>=22.2",
# Pinning numba & its particular numpy constraints:
# The old pip solver (<=2020) doesn't deal with the transitive
# requirements (scanpy -> numba -> numpy) properly, resulting in broken
# installation of incompatible numpy>=1.24. Issue #1051
# These pins can be removed either when there's a new numba release
# with less-particular numpy version constraints, or if we decide we no
# longer need to support the old pip solver (default on ubuntu 20.04).
#
# Also: numba doesn't support Python 3.11 until 0.57.0rc1.
# It's not preferable to pin to an RC dependency, so we only do this
# when we must, which is for 3.11.
"numba==0.56.4; python_version<'3.11'",
"numba==0.57; python_version=='3.11'",
"numpy>=1.18,<1.24; python_version<'3.11'",
"numpy>=1.18,<1.25; python_version=='3.11'",
"numba~=0.58.0; python_version>='3.8'",
# Older numba version needed for Python 3.7.
# This older numba version was also incompatible with newer numpy
# versions, and the old pip solver (<=2020) required us to state
# that constraint explicitly here (issue #1051).
"numba==0.56.4; python_version<'3.8'",
"numpy>=1.18,<1.24; python_version<'3.8'",
"pandas",
"pyarrow>=9.0.0",
"scanpy>=1.9.2",
30 changes: 19 additions & 11 deletions apis/python/src/tiledbsoma/_dataframe.py
@@ -401,7 +400,6 @@ def write(
"""
_util.check_type("values", values, (pa.Table,))

del platform_config # unused
dim_cols_map: Dict[str, pd.DataFrame] = {}
attr_cols_map: Dict[str, pd.DataFrame] = {}
dim_names_set = self.index_column_names
@@ -410,18 +409,25 @@
for name in values.schema.names:
col = values.column(name)
n = len(col)

cols_map = dim_cols_map if name in dim_names_set else attr_cols_map
if pa.types.is_dictionary(col.type) and col.num_chunks != 0:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
if name in dim_names_set:
# Dims are never categorical. Decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
attr = self._handle.schema.attr(name)
if attr.enum_label is not None:
# Normal case: writing categorical data to categorical schema.
cols_map[name] = col.chunk(0).indices.to_pandas()
else:
# Schema is non-categorical but the user is writing categorical.
# Simply decategoricalize for them.
cols_map[name] = pa.chunked_array(
[chunk.dictionary_decode() for chunk in col.chunks]
)
else:
cols_map[name] = col.to_pandas()

@@ -437,14 +443,17 @@
dim_cols_list = [dim_cols_map[name] for name in self.index_column_names]
dim_cols_tuple = tuple(dim_cols_list)
self._handle.writer[dim_cols_tuple] = attr_cols_map
self._consolidate_and_vacuum_fragment_metadata()
tiledb_create_options = TileDBCreateOptions.from_platform_config(
platform_config
)
if tiledb_create_options.consolidate_and_vacuum:
self._consolidate_and_vacuum()

return self

def _set_reader_coord(
self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: object
) -> bool:

if coord is None:
return True # No constraint; select all in this dimension

@@ -582,7 +591,6 @@ def _set_reader_coord_by_py_seq_or_np_array(
def _set_reader_coord_by_numeric_slice(
self, sr: clib.SOMAArray, dim_idx: int, dim: tiledb.Dim, coord: Slice[Any]
) -> bool:

try:
lo_hi = _util.slice_to_numeric_range(coord, dim.domain)
except _util.NonNumericDimensionError:
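To make the new dim branch concrete: it relies on pyarrow's `dictionary_decode()`. A minimal standalone sketch of that step (the column and its values are illustrative, not taken from the PR):

```python
import pyarrow as pa

# A dictionary-encoded (i.e. categorical) column, such as
# pa.Table.from_pandas() produces for a pd.Categorical column.
col = pa.chunked_array([pa.array([0, 1, 2, 3]).dictionary_encode()])

# Dims are never categorical, so the write path flattens each chunk
# back to plain values before handing it to the TileDB writer.
decoded = pa.chunked_array([chunk.dictionary_decode() for chunk in col.chunks])

assert decoded.type == pa.int64()  # the dictionary encoding is gone
```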
7 changes: 5 additions & 2 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
@@ -172,9 +172,12 @@ def write(
"""
_util.check_type("values", values, (pa.Tensor,))

del platform_config # Currently unused.
self._handle.writer[coords] = values.to_numpy()
self._consolidate_and_vacuum_fragment_metadata()
tiledb_create_options = TileDBCreateOptions.from_platform_config(
platform_config
)
if tiledb_create_options.consolidate_and_vacuum:
self._consolidate_and_vacuum()
return self

@classmethod
19 changes: 12 additions & 7 deletions apis/python/src/tiledbsoma/_sparse_nd_array.py
@@ -183,9 +183,11 @@
Lifecycle:
Experimental.
"""
del platform_config # Currently unused.

arr = self._handle.writer
tiledb_create_options = TileDBCreateOptions.from_platform_config(
platform_config
)

if isinstance(values, pa.SparseCOOTensor):
# Write bulk data
@@ -197,8 +199,9 @@
bounding_box = self._compute_bounding_box_metadata(maxes)
self._set_bounding_box_metadata(bounding_box)

# Consolidate non-bulk data
self._consolidate_and_vacuum_fragment_metadata()
if tiledb_create_options.consolidate_and_vacuum:
# Consolidate non-bulk data
self._consolidate_and_vacuum()
return self

if isinstance(values, (pa.SparseCSCMatrix, pa.SparseCSRMatrix)):
@@ -216,8 +219,9 @@
bounding_box = self._compute_bounding_box_metadata([nr - 1, nc - 1])
self._set_bounding_box_metadata(bounding_box)

# Consolidate non-bulk data
self._consolidate_and_vacuum_fragment_metadata()
if tiledb_create_options.consolidate_and_vacuum:
# Consolidate non-bulk data
self._consolidate_and_vacuum()
return self

if isinstance(values, pa.Table):
@@ -241,8 +245,9 @@
bounding_box = self._compute_bounding_box_metadata(maxes)
self._set_bounding_box_metadata(bounding_box)

# Consolidate non-bulk data
self._consolidate_and_vacuum_fragment_metadata()
if tiledb_create_options.consolidate_and_vacuum:
# Consolidate non-bulk data
self._consolidate_and_vacuum()
return self

raise TypeError(
29 changes: 25 additions & 4 deletions apis/python/src/tiledbsoma/_tiledb_array.py
@@ -6,7 +6,7 @@
import ctypes
import os
import sys
from typing import Any, Dict, Optional, Sequence, Tuple
from typing import Any, Dict, List, Optional, Sequence, Tuple

import pyarrow as pa
import tiledb
@@ -194,20 +194,41 @@ def _create_internal(
cls._set_create_metadata(handle)
return handle

def _consolidate_and_vacuum_fragment_metadata(self) -> None:
def _consolidate_and_vacuum(
self, modes: List[str] = ["fragment_meta", "commits"]
) -> None:
"""
This post-ingestion helper consolidates and vacuums fragment metadata and commit files --
this is quick to do, and positively impacts query performance. It does _not_ consolidate
bulk array data, which is more time-consuming and should be done at the user's opt-in
discretion.
"""

for mode in ["fragment_meta", "commits"]:
for mode in modes:
self._consolidate(modes=[mode])
self._vacuum(modes=[mode])

def _consolidate(self, modes: List[str] = ["fragment_meta", "commits"]) -> None:
"""
This post-ingestion helper consolidates by default fragment metadata and commit files --
this is quick to do, and positively impacts query performance.
"""

for mode in modes:
cfg = self._ctx.config()
cfg["sm.consolidation.mode"] = mode
cfg["sm.vacuum.mode"] = mode
ctx = tiledb.Ctx(cfg)

tiledb.consolidate(self.uri, ctx=ctx)

def _vacuum(self, modes: List[str] = ["fragment_meta", "commits"]) -> None:
"""
This post-ingestion helper vacuums by default fragment metadata and commit files. Vacuuming is not multi-process safe and requires coordination that nothing is currently reading the files that will be vacuumed.
"""

for mode in modes:
cfg = self._ctx.config()
cfg["sm.vacuum.mode"] = mode
ctx = tiledb.Ctx(cfg)

tiledb.vacuum(self.uri, ctx=ctx)
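Splitting `_consolidate` from `_vacuum` lets callers run the reader-safe half on its own. As a sketch of what the two helpers do under the hood in plain tiledb-py (assuming `uri` names an existing array):

```python
import tiledb

def consolidate_then_vacuum(uri: str, base_ctx: tiledb.Ctx) -> None:
    for mode in ["fragment_meta", "commits"]:
        cfg = base_ctx.config()
        cfg["sm.consolidation.mode"] = mode
        cfg["sm.vacuum.mode"] = mode
        ctx = tiledb.Ctx(cfg)
        # Consolidation can run while readers are active ...
        tiledb.consolidate(uri, ctx=ctx)
        # ... but vacuuming deletes files, so it should only run once
        # nothing can still be reading them.
        tiledb.vacuum(uri, ctx=ctx)
```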
3 changes: 3 additions & 0 deletions apis/python/src/tiledbsoma/options/_tiledb_create_options.py
@@ -143,6 +143,9 @@ class TileDBCreateOptions:
attrs: Mapping[str, _ColumnConfig] = attrs_.field(
factory=dict, converter=_normalize_columns
)
consolidate_and_vacuum: bool = attrs_.field(
validator=vld.instance_of(bool), default=False
)

@classmethod
def from_platform_config(
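With the flag defaulting to `False`, post-write consolidation is now opt-in across the `write()` methods above. A hedged usage sketch (the URI and table are illustrative and assume a dataframe already created with a matching schema; the `{"tiledb": {"create": ...}}` nesting is the shape `from_platform_config` reads):

```python
import pyarrow as pa
import tiledbsoma as soma

# Opt in to post-write consolidation and vacuuming of fragment
# metadata and commit files.
cfg = {"tiledb": {"create": {"consolidate_and_vacuum": True}}}

with soma.DataFrame.open("file:///tmp/sdf", "w") as sdf:  # illustrative URI
    table = pa.Table.from_pydict({"soma_joinid": [0, 1], "value": [1.0, 2.0]})
    sdf.write(table, platform_config=cfg)
```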
35 changes: 35 additions & 0 deletions apis/python/tests/test_dataframe.py
@@ -914,6 +914,41 @@ def test_write_categorical_types(tmp_path):
assert (df == sdf.read().concat().to_pandas()).all().all()


def test_write_categorical_dims(tmp_path):
"""
Categories are not supported as dims. Here we test our handling of what we
do when we are given them as input.
"""
schema = pa.schema(
[
("soma_joinid", pa.int64()),
("string", pa.dictionary(pa.int8(), pa.large_string())),
]
)
with soma.DataFrame.create(
tmp_path.as_posix(),
schema=schema,
index_column_names=["soma_joinid"],
enumerations={
"enum-string": ["b", "a"],
},
ordered_enumerations=[],
column_to_enumerations={
"string": "enum-string",
},
) as sdf:
df = pd.DataFrame(
data={
"soma_joinid": pd.Categorical([0, 1, 2, 3], categories=[0, 1, 2, 3]),
"string": pd.Categorical(["a", "b", "a", "b"], categories=["b", "a"]),
}
)
sdf.write(pa.Table.from_pandas(df))

with soma.DataFrame.open(tmp_path.as_posix()) as sdf:
assert (df == sdf.read().concat().to_pandas()).all().all()


def test_result_order(tmp_path):
# cf. https://docs.tiledb.com/main/background/key-concepts-and-data-format#data-layout
schema = pa.schema(
29 changes: 29 additions & 0 deletions apis/python/tests/test_sparse_nd_array.py
@@ -1100,3 +1100,32 @@ def test_timestamped_ops(tmp_path):
[0, 0],
]
assert a.nnz == 1


def test_empty_indexed_read(tmp_path):
"""
Verify that queries expected to return empty results actually
work. There are edge cases around SparseTensors, which are unable
to represent empty arrays.
"""
shape = (10, 100)
soma.SparseNDArray.create(
tmp_path.as_posix(), type=pa.uint16(), shape=shape
).close()

data = create_random_tensor("coo", shape, np.float64, 1.0)
with soma.SparseNDArray.open(tmp_path.as_posix(), "w") as a:
a.write(data)

with soma.SparseNDArray.open(tmp_path.as_posix()) as a:
coords = [slice(None), slice(None)]
assert sum(len(t) for t in a.read(coords).tables()) == 1000

coords = [[3], [4]]
assert sum(len(t) for t in a.read(coords).tables()) == 1

coords = [[3], []]
assert sum(len(t) for t in a.read(coords).tables()) == 0

coords = [[], [4]]
assert sum(len(t) for t in a.read(coords).tables()) == 0
3 changes: 3 additions & 0 deletions apis/python/version.py
@@ -85,6 +85,9 @@


def readGitVersion():
# NOTE: this will fail on a fork whose tags are out of sync with upstream.
# Use `git fetch --tags upstream` and then `git push --tags <your fork>`
# to synchronize them.
try:
proc = subprocess.Popen(
("git", "describe", "--long", "--tags", "--match", "[0-9]*.*"),
8 changes: 5 additions & 3 deletions apis/r/DESCRIPTION
@@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification is available at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.4.3.1
Version: 1.4.3.2
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "[email protected]",
@@ -44,10 +44,12 @@ Imports:
data.table,
spdl,
rlang,
tools
tools,
tibble
LinkingTo:
Rcpp,
RcppSpdlog
RcppSpdlog,
RcppInt64
Additional_repositories: https://ghrr.github.io/drat
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
1 change: 1 addition & 0 deletions apis/r/NAMESPACE
@@ -98,6 +98,7 @@ importFrom(spdl,debug)
importFrom(spdl,info)
importFrom(spdl,setup)
importFrom(stats,setNames)
importFrom(tibble,as_tibble)
importFrom(tools,R_user_dir)
importFrom(tools,file_path_sans_ext)
importFrom(urltools,url_compose)
1 change: 1 addition & 0 deletions apis/r/NEWS.md
@@ -4,6 +4,7 @@

* Add support for writing `SummarizedExperiment` and `SingleCellExperiment` objects to SOMAs
* Add support for bounding boxes for sparse arrays
* Add support for creating `SOMADataFrames` with `ordered()` columns


# tiledbsoma 1.4.0
Expand Down