Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(schema 5.1.0): throw error if obsm is not type np.ndarray #859

Merged
merged 17 commits into from
May 6, 2024
Merged
28 changes: 24 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,10 +968,14 @@ def _validate_obsm(self):

obsm_with_x_prefix = 0
for key, value in self.adata.obsm.items():
if not isinstance(key, str):
# no validation on non-string OBSM key types.
continue
issue_list = self.errors

regex_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$"

unknown_key = False # an unknown key does not match 'spatial' or 'X_{suffix}'
if key.startswith("X_"):
obsm_with_x_prefix += 1
if key.lower() == "x_spatial":
Expand All @@ -990,6 +994,7 @@ def _validate_obsm(self):
f"not be available in Explorer"
)
issue_list = self.warnings
unknown_key = True

if not isinstance(value, np.ndarray):
self.errors.append(
Expand All @@ -998,11 +1003,26 @@ def _validate_obsm(self):
# Skip over the subsequent checks that require the value to be an array
continue

if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2:
issue_list.append(
f"All embeddings must have as many rows as cells, and at least two columns."
f" 'adata.obsm['{key}']' has shape of '{value.shape}'."
if len(value.shape) < 2:
self.errors.append(
f"All embeddings must at least two dimensions. 'adata.obsm['{key}']' has a shape length of '{len(value.shape)}'."
Copy link
Contributor

@nayib-jose-gloria nayib-jose-gloria May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EDIT: nvm, previous statement was incorrect

)
else:
if value.shape[0] != self.adata.n_obs:
self.errors.append(
f"All embeddings must have as many rows as cells. 'adata.obsm['{key}']' has rows='{value.shape[0]}'."
)

if unknown_key and value.shape[1] < 1:
self.errors.append(
f"All other embeddings must have at least one column. 'adata.obsm['{key}']' has columns='{value.shape[1]}'."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: in this context, I don't think a curator reading this would know what "other embeddings" means. Maybe "any embeddings not specified in the schema reference"?

)

if not unknown_key and value.shape[1] < 2:
self.errors.append(
f"All 'X_' and 'spatial' embeddings must have at least two columns. 'adata.obsm['{key}']' has columns='{value.shape[1]}'."
)

if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
issue_list.append(
f"adata.obsm['{key}'] has an invalid data type. It should be "
Expand Down
35 changes: 25 additions & 10 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1988,10 +1988,9 @@ def test_obsm_values_no_X_embedding__non_spatial_dataset(self, validator_with_ad
assert validator.is_spatial is False
assert validator.warnings == [
"WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
"WARNING: Embedding key in 'adata.obsm' harmony does not start with X_ and thus will not be available in "
"Explorer",
"WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current "
"errors.",
"WARNING: Embedding key in 'adata.obsm' harmony is not 'spatial' nor does it start with 'X_'. "
"Thus, it will not be available in Explorer",
"WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.",
]

@pytest.mark.parametrize(
Expand Down Expand Up @@ -2053,7 +2052,8 @@ def test_obsm_values_warn_start_with_X(self, validator_with_adata):
validator.validate_adata()
assert validator.warnings == [
"WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
"WARNING: Embedding key in 'adata.obsm' harmony does not start with X_ and thus will not be available in Explorer",
"WARNING: Embedding key in 'adata.obsm' harmony is not 'spatial' nor does it start with 'X_'. "
"Thus, it will not be available in Explorer",
"WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.",
]
assert validator.errors == [
Expand Down Expand Up @@ -2086,7 +2086,8 @@ def test_obsm_values_key_start_with_number(self, validator_with_adata):
]
assert validator.warnings == [
"WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.",
"WARNING: Embedding key in 'adata.obsm' 3D does not start with X_ and thus will not be available in Explorer",
"WARNING: Embedding key in 'adata.obsm' 3D is not 'spatial' nor does it start with 'X_'. "
"Thus, it will not be available in Explorer",
"WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.",
]

Expand Down Expand Up @@ -2136,9 +2137,22 @@ def test_obsm_shape_one_column(self, validator_with_visium_assay, key):
validator.adata.obsm[key] = numpy.delete(validator.adata.obsm[key], 0, 1)
validator.validate_adata()
assert validator.errors == [
"ERROR: All embeddings must have as many rows as cells, and "
f"at least two columns. 'adata.obsm['{key}']' has shape "
"of '(2, 1)'."
"ERROR: All 'X_' and 'spatial' embeddings must have at least two columns. "
f"'adata.obsm['{key}']' has columns='1'."
]

def test_obsm_shape_zero_column_with_unknown_key(self, validator_with_adata):
    """
    An ndarray stored under an obsm key that is neither 'spatial' nor prefixed
    with 'X_' is reported as an error when it has zero columns.
    """
    validator = validator_with_adata

    # Attach an embedding with one row per observation and no columns at all.
    row_count = validator.adata.n_obs
    empty_embedding = numpy.zeros((row_count, 0))
    validator.adata.obsm["unknown"] = empty_embedding

    validator.validate_adata()

    # Both the zero-size check and the column-count check are expected to fire.
    expected_errors = [
        "ERROR: The size of the ndarray stored for a 'adata.obsm['unknown']' MUST NOT be zero.",
        "ERROR: All other embeddings must have at least one column. 'adata.obsm['unknown']' has columns='0'.",
    ]
    assert validator.errors == expected_errors

def test_obsm_shape_same_rows_and_columns(self, validator_with_adata):
Expand Down Expand Up @@ -2166,7 +2180,8 @@ def test_obsm_size_zero(self, validator_with_adata):
validator.adata = save_and_read_adata(adata)
validator.validate_adata()
assert validator.errors == [
"ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT be zero.",
"ERROR: The size of the ndarray stored for a 'adata.obsm['badsize']' MUST NOT " "be zero.",
"ERROR: All other embeddings must have at least one column. " "'adata.obsm['badsize']' has columns='0'.",
]


Expand Down
Loading