Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce reader memory consumption #228

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 119 additions & 18 deletions src/spatialdata_io/readers/xenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from datatree.datatree import DataTree
from geopandas import GeoDataFrame
from joblib import Parallel, delayed
from ome_zarr.io import parse_url
from pyarrow import Table
from shapely import Polygon
from spatialdata import SpatialData
Expand Down Expand Up @@ -68,6 +69,7 @@ def xenium(
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
output_path: Path | None = None,
) -> SpatialData:
"""
Read a *10X Genomics Xenium* dataset into a SpatialData object.
Expand Down Expand Up @@ -124,10 +126,13 @@ def xenium(
Keyword arguments to pass to the image models.
labels_models_kwargs
Keyword arguments to pass to the labels models.
output_path
Path to directly write the output to a zarr file. This can decrease the memory requirement. If not provided, the
function will return a :class:`spatialdata.SpatialData` object.

Returns
-------
:class:`spatialdata.SpatialData`
If `output_path` is provided, the function will return `None`. Otherwise, it will return a :class:`spatialdata.SpatialData` object.

Notes
-----
Expand Down Expand Up @@ -160,6 +165,8 @@ def xenium(
image_models_kwargs, labels_models_kwargs
)
path = Path(path)
output_path = Path(output_path) if output_path is not None else None

with open(path / XeniumKeys.XENIUM_SPECS) as f:
specs = json.load(f)
# to trigger the warning if the version cannot be parsed
Expand Down Expand Up @@ -204,11 +211,14 @@ def xenium(
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]

polygons = {}
labels = {}
tables = {}
points = {}
images = {}
sdata = SpatialData()

if output_path is not None:
sdata.path = output_path
sdata._validate_can_safely_write_to_path(output_path, overwrite=False)
store = parse_url(output_path, mode="w").store
_ = zarr.group(store=store, overwrite=False)
store.close()

# From the public release notes here:
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
Expand All @@ -217,23 +227,41 @@ def xenium(
# nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
# labels.
if nucleus_labels:
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
sdata.labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
path,
XeniumKeys.CELLS_ZARR,
specs,
mask_index=0,
labels_name="nucleus_labels",
labels_models_kwargs=labels_models_kwargs,
)
if output_path is not None:
sdata._write_element(
element=sdata.labels["nucleus_labels"],
zarr_container_path=output_path,
element_type="labels",
element_name="nucleus_labels",
overwrite=False,
)
del sdata.labels["nucleus_labels"]
if cells_labels:
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
sdata.labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
path,
XeniumKeys.CELLS_ZARR,
specs,
mask_index=1,
labels_name="cell_labels",
labels_models_kwargs=labels_models_kwargs,
)
if output_path is not None:
sdata._write_element(
element=sdata.labels["cell_labels"],
zarr_container_path=output_path,
element_type="labels",
element_name="cell_labels",
overwrite=False,
)
del sdata.labels["cell_labels"]
if cell_labels_indices_mapping is not None and table is not None:
if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
warnings.warn(
Expand All @@ -249,41 +277,86 @@ def xenium(
table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"

if nucleus_boundaries:
polygons["nucleus_boundaries"] = _get_polygons(
sdata.shapes["nucleus_boundaries"] = _get_polygons(
path,
XeniumKeys.NUCLEUS_BOUNDARIES_FILE,
specs,
n_jobs,
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
)
if output_path is not None:
sdata._write_element(
element=sdata.shapes["nucleus_boundaries"],
zarr_container_path=output_path,
element_type="shapes",
element_name="nucleus_boundaries",
overwrite=False,
)
del sdata.shapes["nucleus_boundaries"]

if cells_boundaries:
polygons["cell_boundaries"] = _get_polygons(
sdata.shapes["cell_boundaries"] = _get_polygons(
path,
XeniumKeys.CELL_BOUNDARIES_FILE,
specs,
n_jobs,
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
)
if output_path is not None:
sdata._write_element(
element=sdata.shapes["cell_boundaries"],
zarr_container_path=output_path,
element_type="shapes",
element_name="cell_boundaries",
overwrite=False,
)
del sdata.shapes["cell_boundaries"]

if transcripts:
points["transcripts"] = _get_points(path, specs)
sdata.points["transcripts"] = _get_points(path, specs)
if output_path is not None:
sdata._write_element(
element=sdata.points["transcripts"],
zarr_container_path=output_path,
element_type="points",
element_name="transcripts",
overwrite=False,
)
del sdata.points["transcripts"]

if version is None or version < packaging.version.parse("2.0.0"):
if morphology_mip:
images["morphology_mip"] = _get_images(
sdata.images["morphology_mip"] = _get_images(
path,
XeniumKeys.MORPHOLOGY_MIP_FILE,
imread_kwargs,
image_models_kwargs,
)
if output_path is not None:
sdata._write_element(
element=sdata.images["morphology_mip"],
zarr_container_path=output_path,
element_type="images",
element_name="morphology_mip",
overwrite=False,
)
del sdata.images["morphology_mip"]
if morphology_focus:
images["morphology_focus"] = _get_images(
sdata.images["morphology_focus"] = _get_images(
path,
XeniumKeys.MORPHOLOGY_FOCUS_FILE,
imread_kwargs,
image_models_kwargs,
)
if output_path is not None:
sdata._write_element(
element=sdata.images["morphology_focus"],
zarr_container_path=output_path,
element_type="images",
element_name="morphology_focus",
overwrite=False,
)
del sdata.images["morphology_focus"]
else:
if morphology_focus:
morphology_focus_dir = path / XeniumKeys.MORPHOLOGY_FOCUS_DIR
Expand Down Expand Up @@ -331,28 +404,56 @@ def filter(self, record: logging.LogRecord) -> bool:
"c_coords" not in image_models_kwargs
), "The channel names for the morphology focus images are handled internally"
image_models_kwargs["c_coords"] = list(channel_names.values())
images["morphology_focus"] = _get_images(
sdata.images["morphology_focus"] = _get_images(
morphology_focus_dir,
XeniumKeys.MORPHOLOGY_FOCUS_CHANNEL_IMAGE.format(0),
imread_kwargs,
image_models_kwargs,
)
del image_models_kwargs["c_coords"]
if output_path is not None:
sdata._write_element(
element=sdata.images["morphology_focus"],
zarr_container_path=output_path,
element_type="images",
element_name="morphology_focus",
overwrite=False,
)
del sdata.images["morphology_focus"]
logger.removeFilter(IgnoreSpecificMessage())

if table is not None:
tables["table"] = table
sdata.tables["table"] = table
if output_path is not None:
sdata._write_element(
element=sdata.tables["table"],
zarr_container_path=output_path,
element_type="tables",
element_name="table",
overwrite=False,
)
del sdata.tables["table"]

elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons}
if cells_as_circles:
elements_dict["shapes"][specs["region"]] = circles
sdata = SpatialData(**elements_dict)
sdata.shapes[specs["region"]] = circles

# find and add additional aligned images
if aligned_images:
extra_images = _add_aligned_images(path, imread_kwargs, image_models_kwargs)
for key, value in extra_images.items():
sdata.images[key] = value
if output_path is not None:
sdata._write_element(
element=sdata.images[key],
zarr_container_path=output_path,
element_type="images",
element_name=key,
overwrite=False,
)
del sdata.images[key]

if output_path is not None:
sdata.write_consolidated_metadata()

return sdata

Expand Down
Loading