update test read write on disk #515

Merged
41 changes: 30 additions & 11 deletions src/spatialdata/_core/spatialdata.py
@@ -1059,27 +1059,34 @@ def _validate_can_safely_write_to_path(
"Overwriting non-Zarr stores is not supported to prevent accidental data loss."
)
if not overwrite:
raise ValueError("The Zarr store already exists. Use `overwrite=True` to try overwriting the store.")
raise ValueError(
"The Zarr store already exists. Use `overwrite=True` to try overwriting the store."
"Please note that only Zarr stores not currently in used by the current SpatialData object can be "
"overwritten."
)
ERROR_MSG = (
"Cannot overwrite. The target path of the write operation is in use. Please save the data to a "
"different location. "
)
WORKAROUND = (
"\nWorkaround: please see discussion here https://github.com/scverse/spatialdata/discussions/520."
)
if any(_backed_elements_contained_in_path(path=file_path, object=self)):
raise ValueError(
"The file path specified is a parent directory of one or more files used for backing for one or "
"more elements in the SpatialData object. Please save the data to a different location."
ERROR_MSG + "\nDetails: the target path contains one or more files that Dask use for "
"backing elements in the SpatialData object." + WORKAROUND
)
if self.path is not None and (
_is_subfolder(parent=self.path, child=file_path) or _is_subfolder(parent=file_path, child=self.path)
):
if saving_an_element and _is_subfolder(parent=self.path, child=file_path):
raise ValueError(
"Currently, overwriting existing elements is not supported. It is recommended to manually save "
"the element in a different location and then eventually copy it to the original desired "
"location. Deleting the old element and saving the new element to the same location is not "
"advised because data loss may occur if the execution is interrupted during writing."
ERROR_MSG + "\nDetails: the target path in which to save an element is a subfolder "
"of the current Zarr store." + WORKAROUND
)
raise ValueError(
"The file path specified either contains either is contained in the one used for backing. "
"Currently, overwriting the backing Zarr store is not supported to prevent accidental data loss "
"that could occur if the execution is interrupted during writing. Please save the data to a "
"different location."
ERROR_MSG + "\nDetails: the target path either contains, coincides or is contained in"
" the current Zarr store." + WORKAROUND
)
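
For illustration, a minimal sketch of how this validation surfaces to a user; the paths are hypothetical, and blobs and read_zarr are the public helpers already used in the tests below.

from spatialdata import read_zarr
from spatialdata.datasets import blobs

sdata = blobs()
sdata.write("data.zarr")  # first write to a fresh location succeeds
sdata = read_zarr("data.zarr")  # elements are now dask-backed by data.zarr
try:
    # the target path coincides with the backing store, so the checks above raise
    sdata.write("data.zarr", overwrite=True)
except ValueError as e:
    print(e)  # "Cannot overwrite. The target path of the write operation is in use. ..."
sdata.write("data_copy.zarr")  # saving to a different location works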

def write(
@@ -2080,6 +2087,18 @@ def __setitem__(self, key: str, value: SpatialElement | AnnData) -> None:
else:
raise TypeError(f"Unknown element type with schema: {schema!r}.")

def __delitem__(self, key: str) -> None:
"""
Delete the element from the SpatialData object.

Parameters
----------
key
The name of the element to delete.
"""
element_type, _, _ = self._find_element(key)
getattr(self, element_type).__delitem__(key)
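
Usage is symmetric with __setitem__; a minimal sketch, using an element name from the blobs demo dataset. The deletion affects only the in-memory object, not any Zarr store on disk.

from spatialdata.datasets import blobs

sdata = blobs()
del sdata["blobs_circles"]  # removes the element from the in-memory object only
# deleting a missing name raises KeyError (via self._find_element)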


class QueryManager:
"""Perform queries on SpatialData objects."""
9 changes: 9 additions & 0 deletions tests/core/operations/test_spatialdata_operations.py
@@ -316,6 +316,15 @@ def test_set_item(full_sdata: SpatialData) -> None:
full_sdata[name] = full_sdata[name]


def test_del_item(full_sdata: SpatialData) -> None:
for name in ["image2d", "labels2d", "points_0", "circles", "poly"]:
del full_sdata[name]
with pytest.raises(KeyError):
del full_sdata[name]
with pytest.raises(KeyError, match="Could not find element with name"):
_ = full_sdata["not_present"]


def test_no_shared_transformations() -> None:
"""Test transformation dictionary copy for transformations not to be shared."""
sdata = blobs()
89 changes: 66 additions & 23 deletions tests/io/test_readwrite.py
@@ -8,7 +8,7 @@
import pytest
from anndata import AnnData
from numpy.random import default_rng
from spatialdata import SpatialData
from spatialdata import SpatialData, read_zarr
from spatialdata._io._utils import _are_directories_identical, get_dask_backing_files
from spatialdata.datasets import blobs
from spatialdata.models import Image2DModel
@@ -131,7 +131,20 @@ def test_incremental_io_in_memory(
with pytest.raises(KeyError, match="Key `table` already exists."):
sdata["table"] = v

def test_incremental_io_on_disk(self, tmp_path: str, full_sdata: SpatialData):
@pytest.mark.parametrize("dask_backed", [True, False])
@pytest.mark.parametrize("workaround", [1, 2])
def test_incremental_io_on_disk(
self, tmp_path: str, full_sdata: SpatialData, dask_backed: bool, workaround: int
) -> None:
"""
This test shows workarounds for rewriting existing data on disk.

Users are encouraged to study them, understand the implications, and adapt them to their use case.
We are working on simpler workarounds and on a more robust solution to this problem, but they are not
yet available.

In particular, the complex "dask-backed" case for workaround 1 could be simplified once
"""
tmpdir = Path(tmp_path) / "incremental_io.zarr"
sdata = SpatialData()
sdata.write(tmpdir)
@@ -147,32 +147,63 @@ def test_incremental_io_on_disk(self, tmp_path: str, full_sdata: SpatialData):
]:
sdata[name] = full_sdata[name]
sdata.write_element(name)
if dask_backed:
# this forces the element to be dask-backed from disk. In this case, overwriting the data is
# more laborious because we would be writing to the same location that backs the data!
sdata = read_zarr(sdata.path)

with pytest.raises(
ValueError, match="The Zarr store already exists. Use `overwrite=True` to try overwriting the store."
):
sdata.write_element(name)

with pytest.raises(ValueError, match="Currently, overwriting existing elements is not supported."):
with pytest.raises(ValueError, match="Cannot overwrite."):
sdata.write_element(name, overwrite=True)

# workaround 1, mostly safe (untested for Windows platform, network drives, multi-threaded
# setups, ...)
new_name = f"{name}_new_place"
# write a copy of the data
sdata[new_name] = sdata[name]
sdata.write_element(new_name)
# rewrite the original data
sdata.delete_element_from_disk(name)
sdata.write_element(name)
# remove the copy
sdata.delete_element_from_disk(new_name)
element_type = sdata._element_type_from_element_name(new_name)
del getattr(sdata, element_type)[new_name]

# workaround 2, unsafe but sometimes acceptable depending on the user's workflow
sdata.delete_element_from_disk(name)
sdata.write_element(name)
if workaround == 1:
new_name = f"{name}_new_place"
# workaround 1, mostly safe (untested on Windows, network drives, multi-threaded
# setups, ...). If this scenario matches your use case, proceed with caution.

if not dask_backed: # easier case
# a. write a backup copy of the data
sdata[new_name] = sdata[name]
sdata.write_element(new_name)
# b. rewrite the original data
sdata.delete_element_from_disk(name)
sdata.write_element(name)
# c. remove the backup copy
del sdata[new_name]
sdata.delete_element_from_disk(new_name)
else: # dask-backed case, more complex
# a. write a backup copy of the data
sdata[new_name] = sdata[name]
sdata.write_element(new_name)
# a2. remove the in-memory copy from the SpatialData object (note,
# at this point the backup copy still exists on-disk)
del sdata[new_name]
del sdata[name]
# a3. load the backup copy into memory
sdata_copy = read_zarr(sdata.path)
# b1. rewrite the original data
sdata.delete_element_from_disk(name)
sdata[name] = sdata_copy[new_name]
sdata.write_element(name)
# b2. reload the new data into memory (it has been written, but the in-memory object still points
# to the backup location)
sdata = read_zarr(sdata.path)
# c. remove the backup copy
del sdata[new_name]
sdata.delete_element_from_disk(new_name)
elif workaround == 2:
# workaround 2, unsafe but sometimes acceptable depending on the user's workflow.

# this works only if the data is not dask-backed, otherwise an exception will be raised because the code
# would be trying to delete the data that the Dask object is pointing to!
if not dask_backed:
# a. rewrite the original data (risky!)
sdata.delete_element_from_disk(name)
sdata.write_element(name)
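
For reference, workaround 2 condenses into a small helper. This is a sketch, not an official API, and it carries the same risk: if execution is interrupted between the delete and the rewrite, the element is lost. It must not be used on elements that are dask-backed from the target store.

def unsafe_rewrite_element(sdata: SpatialData, name: str) -> None:
    # risky: the on-disk element is gone until write_element() completes
    sdata.delete_element_from_disk(name)
    sdata.write_element(name)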

def test_incremental_io_table_legacy(self, table_single_annotation: SpatialData) -> None:
s = table_single_annotation
Expand Down Expand Up @@ -291,8 +335,7 @@ def test_overwrite_fails_when_no_zarr_store_bug_dask_backed_data(self, full_sdat

with pytest.raises(
ValueError,
match="The file path specified is a parent directory of one or more files used for backing for one "
"or ",
match="Cannot overwrite.",
):
full_sdata.write(f, overwrite=True)

@@ -310,7 +353,7 @@ def test_overwrite_fails_when_zarr_store_present(self, full_sdata):

with pytest.raises(
ValueError,
match="The file path specified either contains either is contained in the one used for backing.",
match="Cannot overwrite.",
):
full_sdata.write(f, overwrite=True)
