"""Provides an object model for JSON Schema."""

from dataclasses import dataclass
from typing import Any, List, Optional, Union

from singer import Schema

# These are keys defined in the JSON Schema spec that do not themselves contain
# schemas (or lists of schemas)
STANDARD_KEYS = [
    "title",
    "description",
    "minimum",
    "maximum",
    "exclusiveMinimum",
    "exclusiveMaximum",
    "multipleOf",
    "maxLength",
    "minLength",
    "format",
    "type",
    "required",
    "enum",
    # These are NOT simple keys (they can contain schemas themselves). We could
    # consider adding extra handling to them.
    "additionalProperties",
    "anyOf",
    "patternProperties",
]


@dataclass
class SchemaPlus(Schema):
    """Object model for JSON Schema.

    Tap and Target authors may find this to be more convenient than
    working directly with JSON Schema data structures.

    This is based on, and overrides,
    https://github.com/transferwise/pipelinewise-singer-python/blob/master/singer/schema.py.
    It exists so that the set of handled keys (see ``STANDARD_KEYS``) can be
    extended.

    ``properties`` and ``items`` are parsed recursively into nested schema
    objects; every key listed in ``STANDARD_KEYS`` is stored verbatim, even the
    ones that may themselves contain schemas (``additionalProperties``,
    ``anyOf``, ``patternProperties``).
    """

    type: Optional[Union[str, List[str]]] = None
    properties: Optional[dict] = None
    items: Optional[Any] = None
    description: Optional[str] = None
    minimum: Optional[float] = None
    maximum: Optional[float] = None
    exclusiveMinimum: Optional[float] = None
    exclusiveMaximum: Optional[float] = None
    multipleOf: Optional[float] = None
    maxLength: Optional[int] = None
    minLength: Optional[int] = None
    anyOf: Optional[Any] = None
    format: Optional[str] = None
    additionalProperties: Optional[Any] = None
    patternProperties: Optional[Any] = None
    required: Optional[List[str]] = None
    enum: Optional[List[Any]] = None
    title: Optional[str] = None

    @staticmethod
    def _dump_subschema(value):
        """Convert one nested schema value back to raw JSON Schema data."""
        if isinstance(value, list):
            # Tuple-form ``items``: a list of schemas, serialized element-wise.
            return [SchemaPlus._dump_subschema(member) for member in value]
        if isinstance(value, bool):
            # Boolean schemas (``true`` / ``false``) are already raw JSON.
            return value
        return value.to_dict()

    @classmethod
    def _load_subschema(cls, value, **schema_defaults):
        """Parse one nested raw schema value into its object form."""
        if isinstance(value, list):
            # Tuple-form ``items``: parse every member schema individually.
            return [
                cls._load_subschema(member, **schema_defaults) for member in value
            ]
        if isinstance(value, bool):
            # Boolean schemas carry no keywords; keep them as-is so that the
            # round trip through ``to_dict`` stays lossless.
            return value
        return cls.from_dict(value, **schema_defaults)

    def to_dict(self) -> dict:
        """Return the raw JSON Schema as a (possibly nested) dict.

        Attributes whose value is ``None`` are omitted from the result.

        :return: A dict holding ``properties`` and ``items`` (serialized
            recursively) plus every non-``None`` key from ``STANDARD_KEYS``.
        """
        result = {}

        if self.properties is not None:
            result["properties"] = {
                name: self._dump_subschema(subschema)
                for name, subschema in self.properties.items()
            }

        if self.items is not None:
            result["items"] = self._dump_subschema(self.items)

        for key in STANDARD_KEYS:
            value = self.__dict__.get(key)
            if value is not None:
                result[key] = value

        return result

    @classmethod
    def from_dict(cls, data, **schema_defaults) -> "SchemaPlus":
        """Initialize a Schema object based on the JSON Schema structure.

        Keys of ``data`` other than ``properties``, ``items`` and those listed
        in ``STANDARD_KEYS`` are ignored.

        :param data: The raw JSON Schema, as a dict.
        :param schema_defaults: The default values to the Schema constructor.
            They are applied to this schema and to every nested schema, and
            are overridden by matching keys found in ``data``.
        :return: An instance of ``cls`` (so subclasses build instances of
            themselves, including for nested schemas).
        """
        kwargs = schema_defaults.copy()
        properties = data.get("properties")
        items = data.get("items")

        if properties is not None:
            kwargs["properties"] = {
                name: cls._load_subschema(subschema, **schema_defaults)
                for name, subschema in properties.items()
            }
        if items is not None:
            kwargs["items"] = cls._load_subschema(items, **schema_defaults)
        for key in STANDARD_KEYS:
            if key in data:
                kwargs[key] = data[key]
        return cls(**kwargs)
InvalidStreamSortException, MaxRecordsLimitException from singer_sdk.helpers._catalog import pop_deselected_record_properties from singer_sdk.helpers._compat import final from singer_sdk.helpers._flattening import get_flattening_options +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.helpers._singer import ( Catalog, CatalogEntry, @@ -530,7 +530,7 @@ def _singer_catalog_entry(self) -> CatalogEntry: return CatalogEntry( tap_stream_id=self.tap_stream_id, stream=self.name, - schema=Schema.from_dict(self.schema), + schema=SchemaPlus.from_dict(self.schema), metadata=self.metadata, key_properties=self.primary_keys or [], replication_key=self.replication_key, diff --git a/singer_sdk/streams/sql.py b/singer_sdk/streams/sql.py index dfafa2e3e..abfb88fd6 100644 --- a/singer_sdk/streams/sql.py +++ b/singer_sdk/streams/sql.py @@ -6,13 +6,13 @@ from functools import lru_cache from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union, cast -import singer import sqlalchemy from sqlalchemy.engine import Engine from sqlalchemy.engine.reflection import Inspector from singer_sdk import typing as th from singer_sdk.exceptions import ConfigValidationError +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.helpers._singer import CatalogEntry, MetadataMapping from singer_sdk.plugin_base import PluginBase as TapBaseClass from singer_sdk.streams.core import Stream @@ -401,7 +401,7 @@ def discover_catalog_entry( stream=unique_stream_id, table=table_name, key_properties=key_properties, - schema=singer.Schema.from_dict(schema), + schema=SchemaPlus.from_dict(schema), is_view=is_view, replication_method=replication_method, metadata=MetadataMapping.get_standard_metadata( diff --git a/tests/core/test_catalog_selection.py b/tests/core/test_catalog_selection.py index f0019ba40..2f203be5d 100644 --- a/tests/core/test_catalog_selection.py +++ b/tests/core/test_catalog_selection.py @@ -10,6 +10,7 @@ get_selected_schema, 
pop_deselected_record_properties, ) +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.typing import ObjectType, PropertiesList, Property, StringType @@ -151,7 +152,7 @@ def catalog_entry_obj(schema, stream_name, selection_metadata) -> singer.Catalog return singer.CatalogEntry( tap_stream_id=stream_name, stream=stream_name, - schema=singer.Schema.from_dict(schema), + schema=SchemaPlus.from_dict(schema), metadata=singer.MetadataMapping.from_iterable(selection_metadata), ) diff --git a/tests/core/test_schema.py b/tests/core/test_schema.py new file mode 100644 index 000000000..87b7ec03b --- /dev/null +++ b/tests/core/test_schema.py @@ -0,0 +1,66 @@ +""" +Testing that SchemaPlus can losslessly convert schemas to and from dicts. + +Schemas are taken from these examples: https://json-schema.org/learn/miscellaneous-examples.html + +NOTE: The following properties are not currently supported: +pattern +unevaluatedProperties +propertyNames +minProperties +maxProperties +prefixItems +contains +minContains +maxContains +minItems +maxItems +uniqueItems +enum +const +contentMediaType +contentEncoding +allOf +oneOf +not + +Some of these could be trivially added (if they are SIMPLE_PROPERTIES). +Some might need more thinking if they can contain schemas (though, note that we also treat 'additionalProperties', +'anyOf' and 'patternProperties' as SIMPLE even though they can contain schemas). 
from singer_sdk.helpers._schema import SchemaPlus


def test_simple_schema():
    """Round-trip a flat object schema and inspect the parsed attributes."""
    coordinate_properties = {
        "latitude": {"type": "number", "minimum": -90, "maximum": 90},
        "longitude": {"type": "number", "minimum": -180, "maximum": 180},
    }
    simple_schema = {
        "title": "Longitude and Latitude Values",
        "description": "A geographical coordinate.",
        "required": ["latitude", "longitude"],
        "type": "object",
        "properties": coordinate_properties,
    }

    parsed = SchemaPlus.from_dict(simple_schema)

    # Serializing the parsed object must reproduce the input exactly.
    assert parsed.to_dict() == simple_schema
    assert parsed.required == ["latitude", "longitude"]

    # Nested property schemas are parsed into SchemaPlus objects as well.
    assert isinstance(parsed.properties["latitude"], SchemaPlus)
    latitude_schema = parsed.properties["latitude"]
    assert latitude_schema.type == "number"


def test_schema_with_items():
    """Round-trip a schema whose property is an array with an ``items`` schema."""
    fruits_definition = {"type": "array", "items": {"type": "string"}}
    schema = {
        "description": "A representation of a person, company, organization, or place",
        "type": "object",
        "properties": {"fruits": fruits_definition},
    }

    parsed = SchemaPlus.from_dict(schema)

    # Serializing the parsed object must reproduce the input exactly.
    assert parsed.to_dict() == schema

    # Both the array property and its element schema become SchemaPlus objects.
    assert isinstance(parsed.properties["fruits"], SchemaPlus)
    fruits_schema = parsed.properties["fruits"]
    assert isinstance(fruits_schema.items, SchemaPlus)
    assert fruits_schema.items.type == "string"