Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update validation for obs['is_primary_data'] to enforce False if uns['spatial']['is_single'] is False #865

Merged
merged 42 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
ab3c971
feat(schema 5.1.0): validate uns[spatial]
MillenniumFalconMechanic Apr 24, 2024
b6c93ad
Linting
MillenniumFalconMechanic Apr 24, 2024
9010444
Updated error messages
MillenniumFalconMechanic Apr 25, 2024
ec8fb63
Review updates
MillenniumFalconMechanic Apr 25, 2024
832b297
Review updates
MillenniumFalconMechanic Apr 25, 2024
faec600
Reverted update
MillenniumFalconMechanic Apr 25, 2024
4c8a739
feat(schema 5.1.0): validate obs[array_col], obs[array_row] and obs[i…
MillenniumFalconMechanic Apr 26, 2024
476b869
Corrected test name
MillenniumFalconMechanic Apr 26, 2024
b6020a8
Linting
MillenniumFalconMechanic Apr 26, 2024
74496f5
Updated forbidden tests
MillenniumFalconMechanic Apr 26, 2024
b37c52c
Update cellxgene_schema_cli/cellxgene_schema/validate.py
MillenniumFalconMechanic Apr 29, 2024
126cdb1
Updated tests
MillenniumFalconMechanic Apr 29, 2024
3b01478
Review updates
MillenniumFalconMechanic Apr 29, 2024
1fea0cd
feat: update validation for obs['is_primary_data'] to enforce False i…
nayib-jose-gloria Apr 30, 2024
f518647
Merge branch 'main' into nayib/834-primary-data
nayib-jose-gloria Apr 30, 2024
6f25b11
unit test refactor
nayib-jose-gloria Apr 30, 2024
7fe05b2
feat(schema 5.1.0): validate uns[spatial]
MillenniumFalconMechanic Apr 24, 2024
dc30afb
Linting
MillenniumFalconMechanic Apr 24, 2024
869c29f
Updated error messages
MillenniumFalconMechanic Apr 25, 2024
60cdf04
Review updates
MillenniumFalconMechanic Apr 25, 2024
a2935a5
Review updates
MillenniumFalconMechanic Apr 25, 2024
76d62c9
Reverted update
MillenniumFalconMechanic Apr 25, 2024
abb1ca4
feat(schema 5.1.0): validate obs[array_col], obs[array_row] and obs[i…
MillenniumFalconMechanic Apr 26, 2024
e10eafe
Corrected test name
MillenniumFalconMechanic Apr 26, 2024
4208307
Linting
MillenniumFalconMechanic Apr 26, 2024
1cb1463
Updated forbidden tests
MillenniumFalconMechanic Apr 26, 2024
b2a676c
Update cellxgene_schema_cli/cellxgene_schema/validate.py
MillenniumFalconMechanic Apr 29, 2024
7d06a3f
Updated tests
MillenniumFalconMechanic Apr 29, 2024
796010b
Review updates
MillenniumFalconMechanic Apr 29, 2024
0e5dfd7
Updated tissue position validation to be row-specific
MillenniumFalconMechanic Apr 30, 2024
d202136
Linting
MillenniumFalconMechanic Apr 30, 2024
07ae9d9
Corrected test
MillenniumFalconMechanic Apr 30, 2024
4d4fb2b
Updated tests post-rebase
MillenniumFalconMechanic Apr 30, 2024
7be5f4b
Merge branch 'mim/829-spatial-obs' into nayib/834-primary-data
nayib-jose-gloria May 1, 2024
6ccd093
fix test
nayib-jose-gloria May 1, 2024
5a3c660
lint
nayib-jose-gloria May 1, 2024
01bea67
Merge branch 'main' into nayib/834-primary-data
nayib-jose-gloria May 1, 2024
eb31ce3
tests
nayib-jose-gloria May 1, 2024
580f8e0
refactor + linting
nayib-jose-gloria May 1, 2024
8bc61a9
fix test
nayib-jose-gloria May 2, 2024
db7801d
Merge branch 'main' into nayib/834-primary-data
nayib-jose-gloria May 2, 2024
680e62f
lint
nayib-jose-gloria May 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 120 additions & 11 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
ASSAY_VISIUM = "EFO:0010961"
ASSAY_SLIDE_SEQV2 = "EFO:0030062"

ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = "obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True."
ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN = f"is only allowed for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}"
ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED = f"is required for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}"


class Validator:
"""Handles validation of AnnData"""
Expand All @@ -46,6 +50,8 @@ def reset(self):
self._raw_layer_exists = None
self.is_seurat_convertible: bool = True
self.is_spatial = None
self.is_visium = None
self.is_visium_and_is_single_true = None

# Matrix (e.g., X, raw.X, ...) number non-zero cache
self.number_non_zero = dict()
Expand Down Expand Up @@ -76,6 +82,20 @@ def _is_supported_spatial_assay(self) -> bool:
self.is_spatial = False
return self.is_spatial

def _is_visium_and_is_single_true(self) -> bool:
"""
Determine if the assay_ontology_term_id is Visium (EFO:0010961) and uns.spatial.is_single is True.

:return True if assay_ontology_term_id is Visium and is_single_cell is True, False otherwise.
:rtype bool
"""
if self.is_visium_and_is_single_true is None:
try:
self.is_visium_and_is_single_true = self._is_visium() and self.adata.uns["spatial"]["is_single"]
except AttributeError:
self.is_visium_and_is_single_true = False
return self.is_visium_and_is_single_true

def _validate_encoding_version(self):
import h5py

Expand Down Expand Up @@ -1331,7 +1351,99 @@ def _check_column_availability(self):

def _check_spatial(self):
"""
Validate spatial-related values of the AnnData object. Validation is not defined in schema definition yaml.
Sequence validation of spatial-related values of the AnnData object.

:rtype none
"""
self._check_spatial_uns()
self._check_spatial_obs()

def _check_spatial_obs(self):
"""
Validate obs spatial-related values of the AnnData object. Validation is not defined in schema definition yaml.
Errors are added to self.errors.

:rtype none
"""

# Exit if obs is not specified. Error is reported in core validate functionality.
obs_component = getattr_anndata(self.adata, "obs")
if obs_component is None:
return

# obs spatial validation is dependent on uns.spatial.is_single; exit if not specified. Errors are
# reported downstream.
uns_component = getattr_anndata(self.adata, "uns")
if uns_component is None or "spatial" not in uns_component or "is_single" not in uns_component["spatial"]:
return

# Validate tissue positions.
self._validate_spatial_tissue_position("array_col", 0, 127)
self._validate_spatial_tissue_position("array_row", 0, 77)
self._validate_spatial_tissue_position("in_tissue", 0, 1)

self._validate_spatial_is_primary_data()

def _validate_spatial_is_primary_data(self):
"""
Validate is_primary_data for spatial datasets.
"""
is_single = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we use _is_single() here once this is rebased on main?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah yes good call

self.adata.uns["spatial"]["is_single"]
if "spatial" in self.adata.uns and "is_single" in self.adata.uns["spatial"]
else None
)
obs = getattr_anndata(self.adata, "obs")
if obs is None or "is_primary_data" not in obs:
return
if is_single is False and obs["is_primary_data"].any():
self.errors.append(
"When uns['spatial']['is_single'] is False, "
"obs['is_primary_data'] must be False for all rows."
)

def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int, max: int):
"""
Validate tissue position is allowed and required, and are integers within the given range. Validation is not defined in
schema definition yaml.

:rtype none
"""
# Tissue position is foribidden if assay is not Visium or is_single is False.
is_visium_and_uns_is_single = self._is_visium_and_is_single_true()
obs_tissue_position = self.adata.obs.get(tissue_position_name)
if obs_tissue_position is not None and not is_visium_and_uns_is_single:
self.errors.append(f"obs['{tissue_position_name}'] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN}")
return

# Exit if we're not dealing with Visium and _is_single True as no further checks are necessary.
if not is_visium_and_uns_is_single:
return

# Tissue position is required.
if obs_tissue_position is None:
self.errors.append(f"obs['{tissue_position_name}'] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED}")
return

# Tissue position must be an int.
if not np.issubdtype(obs_tissue_position.dtype, np.integer):
self.errors.append(f"obs['{tissue_position_name}'] must be of int type, it is {obs_tissue_position.dtype}.")
return

# Tissue position must be within the given range.
if not ((obs_tissue_position >= min) & (obs_tissue_position <= max)).all():
if tissue_position_name == "in_tissue":
error_message_token = f"{min} or {max}"
else:
error_message_token = f"between {min} and {max}"
self.errors.append(
f"obs['{tissue_position_name}'] must be {error_message_token}, the min and max are {obs_tissue_position.min()} and {obs_tissue_position.max()}. "
f"This must be the value of the column tissue_positions_in_tissue from the tissue_positions_list.csv or tissue_positions.csv."
)

def _check_spatial_uns(self):
"""
Validate uns spatial-related values of the AnnData object. Validation is not defined in schema definition yaml.
Errors are added to self.errors.

:rtype none
Expand Down Expand Up @@ -1395,13 +1507,9 @@ def _check_spatial(self):
return

# library_id is forbidden if assay is not Visium or is_single is false.
is_visium = self._is_visium()
is_visium_and_uns_is_single = is_visium and uns_is_single
is_visium_and_uns_is_single = self._is_visium_and_is_single_true()
if len(library_ids) > 0 and not is_visium_and_uns_is_single:
self.errors.append(
"uns['spatial'][library_id] is only allowed for obs['assay_ontology_term_id'] "
"'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True."
)
self.errors.append(f"uns['spatial'][library_id] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN}")
# Exit as library_id is not allowed.
return

Expand All @@ -1412,8 +1520,7 @@ def _check_spatial(self):
# library_id is required if assay is Visium and is_single is True.
if len(library_ids) == 0:
self.errors.append(
"uns['spatial'] must contain at least one key representing the library_id when obs['assay_ontology_term_id'] "
"'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True."
f"uns['spatial'] must contain at least one key representing the library_id when {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}"
)
# Exit as library_id is missing.
return
Expand Down Expand Up @@ -1531,8 +1638,10 @@ def _is_visium(self) -> bool:
:return True if assay_ontology_term_id is Visium, False otherwise.
:rtype bool
"""
assay_ontology_term_id = self.adata.obs.get("assay_ontology_term_id")
return assay_ontology_term_id is not None and (assay_ontology_term_id == ASSAY_VISIUM).any()
if self.is_visium is None:
assay_ontology_term_id = self.adata.obs.get("assay_ontology_term_id")
self.is_visium = assay_ontology_term_id is not None and (assay_ontology_term_id == ASSAY_VISIUM).any()
return self.is_visium

def _validate_spatial_image_shape(self, image_name: str, image: np.ndarray, max_dimension: int = None):
"""
Expand Down
73 changes: 73 additions & 0 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@
good_obs_visium = pd.DataFrame(
[
[
1,
1,
"CL:0000066",
"EFO:0010961",
"MONDO:0100096",
Expand All @@ -141,8 +143,11 @@
"HsapDv:0000003",
"donor_1",
"na",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

diff is from #827 merged into main

0,
],
[
2,
2,
"CL:0000192",
"EFO:0010961",
"PATO:0000461",
Expand All @@ -155,10 +160,13 @@
"MmusDv:0000003",
"donor_2",
"na",
1,
],
],
index=["X", "Y"],
columns=[
"array_col",
"array_row",
"cell_type_ontology_term_id",
"assay_ontology_term_id",
"disease_ontology_term_id",
Expand All @@ -171,6 +179,7 @@
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
"in_tissue",
],
)

Expand Down Expand Up @@ -231,6 +240,58 @@
good_obs_slide_seqv2.loc[:, ["suspension_type"]] = good_obs_slide_seqv2.astype("category")
good_obs_slide_seqv2.loc[:, ["tissue_type"]] = good_obs.astype("category")

good_obs_visium_is_single_false = pd.DataFrame(
[
[
"CL:0000066",
"EFO:0010961",
"MONDO:0100096",
"NCBITaxon:9606",
"PATO:0000383",
"UBERON:0002048",
"tissue",
False,
"HANCESTRO:0575",
"HsapDv:0000003",
"donor_1",
"na",
],
[
"CL:0000192",
"EFO:0010961",
"PATO:0000461",
"NCBITaxon:10090",
"unknown",
"CL:0000192",
"cell culture",
False,
"na",
"MmusDv:0000003",
"donor_2",
"na",
],
],
index=["X", "Y"],
columns=[
"cell_type_ontology_term_id",
"assay_ontology_term_id",
"disease_ontology_term_id",
"organism_ontology_term_id",
"sex_ontology_term_id",
"tissue_ontology_term_id",
"tissue_type",
"is_primary_data",
"self_reported_ethnicity_ontology_term_id",
"development_stage_ontology_term_id",
"donor_id",
"suspension_type",
],
)

good_obs_visium_is_single_false.loc[:, ["donor_id"]] = good_obs_visium_is_single_false.astype("category")
good_obs_visium_is_single_false.loc[:, ["suspension_type"]] = good_obs_visium_is_single_false.astype("category")
good_obs_visium_is_single_false.loc[:, ["tissue_type"]] = good_obs_visium_is_single_false.astype("category")

# ---
# 2. Creating individual var components: valid object and valid object and with labels

Expand Down Expand Up @@ -318,6 +379,14 @@
},
}

good_uns_with_is_single_false = {
"title": "A title",
"default_embedding": "X_umap",
"X_approximate_distribution": "normal",
"batch_condition": ["is_primary_data"],
"spatial": {"is_single": False},
}

good_uns_with_slide_seqV2_spatial = {
"title": "A title",
"default_embedding": "X_umap",
Expand Down Expand Up @@ -399,6 +468,10 @@
var=good_var,
)

adata_spatial_is_single_false = anndata.AnnData(
X=sparse.csr_matrix(X), obs=good_obs_visium_is_single_false, uns=good_uns_with_is_single_false, obsm=good_obsm_spatial, var=good_var
)

# anndata for testing migration
unmigrated_obs = pd.DataFrame(
[
Expand Down
55 changes: 28 additions & 27 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ def validator_with_adata_missing_raw(validator) -> Validator:
validator.adata = examples.adata_non_raw.copy()
return validator

@pytest.fixture
def validator_with_spatial_and_is_single_false(validator) -> Validator:
validator.adata = examples.adata_spatial_is_single_false.copy()
return validator

@pytest.fixture
def validator_with_visium_assay(validator) -> Validator:
Expand Down Expand Up @@ -1101,6 +1105,17 @@ def test_is_primary_data(self, validator_with_adata):
"ERROR: Column 'is_primary_data' in dataframe 'obs' " "must be boolean, not 'object'."
]

def test_is_primary_data__spatial(self, validator_with_spatial_and_is_single_false):
"""
is_primary_data bool. This MUST be False if dataset has uns['spatial']['is_single'] == False
"""
validator = validator_with_spatial_and_is_single_false
validator.adata.obs["is_primary_data"][0] = True
validator.validate_adata()
assert validator.errors == [
"ERROR: When uns['spatial']['is_single'] is False, obs['is_primary_data'] must be False for all rows."
]

def test_donor_id_must_be_categorical(self, validator_with_adata):
"""
donor_id categorical with str categories. This MUST be free-text that identifies
Expand Down Expand Up @@ -1993,25 +2008,18 @@ def test_obsm_values_no_X_embedding__non_spatial_dataset(self, validator_with_ad
"WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.",
]

@pytest.mark.parametrize(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

split this test up into two for simplicity in the set-up

"assay_ontology_term_id, uns_spatial",
[
("EFO:0010961", examples.good_uns_with_visium_spatial["spatial"]),
("EFO:0030062", examples.good_uns_with_slide_seqV2_spatial["spatial"]),
],
)
def test_obsm_values_no_X_embedding__spatial_dataset(
self, validator_with_adata, assay_ontology_term_id, uns_spatial
):
validator = validator_with_adata
validator.adata.obsm["harmony"] = validator.adata.obsm["X_umap"]
validator.adata.uns["default_embedding"] = "harmony"
validator.adata.uns["spatial"] = uns_spatial
validator.adata.obsm["spatial"] = validator.adata.obsm["X_umap"]
def test_obsm_values_no_X_embedding__visium_dataset(self, validator_with_visium_assay):
validator = validator_with_visium_assay
validator.adata.uns["default_embedding"] = "spatial"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not familiar with the complete, required shape for adata here but I see there are a couple of differences between the previous version and validator_with_visium_assay (e.g. validator.adata.obsm["harmony"] is no longer defined, validator.adata.uns["default_embedding"] is now spatial). Are these updates expected/OK?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, its expected

for non-spatial datasets, not having an X-prefix embedding like X_umap should result in an error (see test_obsm_values_no_X_embedding__non_spatial_dataset).

However, for spatial datasets, we do not require an X-prefix embedding, just 'spatial' embedding. This test case was originally modeled after test_obsm_values_no_X_embedding__non_spatial_dataset, which used 'harmony' as its non-X-prefix embedding to trigger the error. But I realized spatial is already a non-X-prefix embedding so we didn't need to add another one to test.

del validator.adata.obsm["X_umap"]
validator.validate_adata()
assert validator.errors == []
assert validator.is_spatial is True

def test_obsm_values_no_X_embedding__slide_seq_v2_dataset(self, validator_with_slide_seq_v2_assay):
validator = validator_with_slide_seq_v2_assay
validator.adata.uns["default_embedding"] = "spatial"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this test is missing validator.validate_adata() and the corresponding assertions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sad, it must've gotten lost along the way during a merge resolution

del validator.adata.obsm["X_umap"]
validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id
validator.adata.obs["suspension_type"] = "na"
validator.adata.obs.loc[:, ["suspension_type"]] = validator.adata.obs.astype("category")
validator.validate_adata()
assert validator.errors == []
assert validator.is_spatial is True
Expand All @@ -2024,19 +2032,12 @@ def test_obsm_values_spatial_embedding_missing__is_single_true(self, validator_w
"ERROR: 'spatial' embedding is required in 'adata.obsm' if adata.uns['spatial']['is_single'] is True."
]

def test_obsm_values_spatial_embedding_missing__is_single_false(self, validator_with_visium_assay):
validator = validator_with_visium_assay
validator.adata.uns["spatial"] = {"is_single": False}
def test_obsm_values_spatial_embedding_missing__is_single_false(self, validator_with_spatial_and_is_single_false):
validator = validator_with_spatial_and_is_single_false
del validator.adata.obsm["spatial"]
validator.validate_adata()
assert validator.errors == []

def test_obsm_values_spatial_embedding_present__is_single_false(self, validator_with_visium_assay):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved this test into test_validate.py as test__validate_spatial_is_single_false_ok and renamed now that we have the fixture adata_spatial_is_single_false with spatial embedding present

validator = validator_with_visium_assay
validator.adata.uns["spatial"] = {"is_single": False}
validator.validate_adata()
assert validator.errors == []

def test_obsm_values_spatial_embedding_present__is_single_none(self, validator_with_adata):
validator = validator_with_adata
validator.adata.obsm["spatial"] = validator.adata.obsm["X_umap"]
Expand Down
Loading