From 891782b10827dfed55503f99bba08de4d442f66e Mon Sep 17 00:00:00 2001 From: Jack Burnett Date: Fri, 19 Aug 2022 12:22:04 +0100 Subject: [PATCH 1/9] Overwrite schema to allow required+enum keys --- singer_sdk/helpers/_schema.py | 136 +++++++++++++++++++++++++++ singer_sdk/helpers/_singer.py | 7 +- singer_sdk/streams/core.py | 4 +- singer_sdk/streams/sql.py | 4 +- tests/core/test_catalog_selection.py | 3 +- tests/core/test_schema.py | 66 +++++++++++++ 6 files changed, 212 insertions(+), 8 deletions(-) create mode 100644 singer_sdk/helpers/_schema.py create mode 100644 tests/core/test_schema.py diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py new file mode 100644 index 000000000..e33018bc7 --- /dev/null +++ b/singer_sdk/helpers/_schema.py @@ -0,0 +1,136 @@ +# pylint: disable=redefined-builtin, too-many-arguments, invalid-name +"""Provides an object model for JSON Schema.""" + +import json + +from singer import Schema + +# These are keys defined in the JSON Schema spec that do not themselves contain +# schemas (or lists of schemas) +STANDARD_KEYS = [ + "title", + "description", + "minimum", + "maximum", + "exclusiveMinimum", + "exclusiveMaximum", + "multipleOf", + "maxLength", + "minLength", + "format", + "type", + "required", + "enum", + # These are NOT simple keys (they can contain schemas themselves). We could + # consider adding extra handling to them. + "additionalProperties", + "anyOf", + "patternProperties", +] + + +class SchemaPlus(Schema): # pylint: disable=too-many-instance-attributes + """Object model for JSON Schema. + + Tap and Target authors may find this to be more convenient than + working directly with JSON Schema data structures. + + This is based on, and overwrites + https://github.com/transferwise/pipelinewise-singer-python/blob/master/singer/schema.py. + This is because we wanted to expand it with extra STANDARD_KEYS. 
+ + """ + + # pylint: disable=too-many-locals + def __init__( + self, + type=None, + format=None, + properties=None, + items=None, + description=None, + minimum=None, + maximum=None, + exclusiveMinimum=None, + exclusiveMaximum=None, + multipleOf=None, + maxLength=None, + minLength=None, + additionalProperties=None, + anyOf=None, + patternProperties=None, + required=None, + enum=None, + title=None, + ): + """Creates a SchemaPlus with the given json-schema keys.""" + self.type = type + self.properties = properties + self.items = items + self.description = description + self.minimum = minimum + self.maximum = maximum + self.exclusiveMinimum = exclusiveMinimum + self.exclusiveMaximum = exclusiveMaximum + self.multipleOf = multipleOf + self.maxLength = maxLength + self.minLength = minLength + self.anyOf = anyOf + self.format = format + self.additionalProperties = additionalProperties + self.patternProperties = patternProperties + self.required = required + self.enum = enum + self.title = title + + def __str__(self): + return json.dumps(self.to_dict()) + + def __repr__(self): + pairs = [k + "=" + repr(v) for k, v in self.__dict__.items()] + args = ", ".join(pairs) + return "SchemaPlus(" + args + ")" + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def to_dict(self): + """Return the raw JSON Schema as a (possibly nested) dict.""" + result = {} + + if self.properties is not None: + result["properties"] = { + k: v.to_dict() + for k, v in self.properties.items() # pylint: disable=no-member + } + + if self.items is not None: + result["items"] = self.items.to_dict() # pylint: disable=no-member + + for key in STANDARD_KEYS: + if self.__dict__.get(key) is not None: + result[key] = self.__dict__[key] + + return result + + @classmethod + def from_dict(cls, data, **schema_defaults): + """Initialize a Schema object based on the JSON Schema structure. + + :param schema_defaults: The default values to the Schema constructor. 
+ """ + kwargs = schema_defaults.copy() + properties = data.get("properties") + items = data.get("items") + + if properties is not None: + kwargs["properties"] = { + k: SchemaPlus.from_dict(v, **schema_defaults) + for k, v in properties.items() + } + if items is not None: + kwargs["items"] = SchemaPlus.from_dict(items, **schema_defaults) + for key in STANDARD_KEYS: + if key in data: + kwargs[key] = data[key] + return SchemaPlus(**kwargs) diff --git a/singer_sdk/helpers/_singer.py b/singer_sdk/helpers/_singer.py index 465e58fd9..8b74643d3 100644 --- a/singer_sdk/helpers/_singer.py +++ b/singer_sdk/helpers/_singer.py @@ -5,7 +5,8 @@ from singer.catalog import Catalog as BaseCatalog from singer.catalog import CatalogEntry as BaseCatalogEntry -from singer.schema import Schema + +from singer_sdk.helpers._schema import SchemaPlus Breadcrumb = Tuple[str, ...] @@ -210,7 +211,7 @@ class CatalogEntry(BaseCatalogEntry): tap_stream_id: str metadata: MetadataMapping - schema: Schema + schema: SchemaPlus stream: Optional[str] = None key_properties: Optional[List[str]] = None replication_key: Optional[str] = None @@ -231,7 +232,7 @@ def from_dict(cls, stream: Dict[str, Any]): key_properties=stream.get("key_properties"), database=stream.get("database_name"), table=stream.get("table_name"), - schema=Schema.from_dict(stream.get("schema", {})), + schema=SchemaPlus.from_dict(stream.get("schema", {})), is_view=stream.get("is_view"), stream_alias=stream.get("stream_alias"), metadata=MetadataMapping.from_iterable(stream.get("metadata", [])), diff --git a/singer_sdk/streams/core.py b/singer_sdk/streams/core.py index cbf46a360..6be8c41a0 100644 --- a/singer_sdk/streams/core.py +++ b/singer_sdk/streams/core.py @@ -28,7 +28,7 @@ import requests import singer from singer import RecordMessage, SchemaMessage, StateMessage -from singer.schema import Schema +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.exceptions import InvalidStreamSortException, MaxRecordsLimitException 
from singer_sdk.helpers._catalog import pop_deselected_record_properties @@ -502,7 +502,7 @@ def _singer_catalog_entry(self) -> CatalogEntry: return CatalogEntry( tap_stream_id=self.tap_stream_id, stream=self.name, - schema=Schema.from_dict(self.schema), + schema=SchemaPlus.from_dict(self.schema), metadata=self.metadata, key_properties=self.primary_keys or [], replication_key=self.replication_key, diff --git a/singer_sdk/streams/sql.py b/singer_sdk/streams/sql.py index dfafa2e3e..abfb88fd6 100644 --- a/singer_sdk/streams/sql.py +++ b/singer_sdk/streams/sql.py @@ -6,13 +6,13 @@ from functools import lru_cache from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union, cast -import singer import sqlalchemy from sqlalchemy.engine import Engine from sqlalchemy.engine.reflection import Inspector from singer_sdk import typing as th from singer_sdk.exceptions import ConfigValidationError +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.helpers._singer import CatalogEntry, MetadataMapping from singer_sdk.plugin_base import PluginBase as TapBaseClass from singer_sdk.streams.core import Stream @@ -401,7 +401,7 @@ def discover_catalog_entry( stream=unique_stream_id, table=table_name, key_properties=key_properties, - schema=singer.Schema.from_dict(schema), + schema=SchemaPlus.from_dict(schema), is_view=is_view, replication_method=replication_method, metadata=MetadataMapping.get_standard_metadata( diff --git a/tests/core/test_catalog_selection.py b/tests/core/test_catalog_selection.py index f0019ba40..2f203be5d 100644 --- a/tests/core/test_catalog_selection.py +++ b/tests/core/test_catalog_selection.py @@ -10,6 +10,7 @@ get_selected_schema, pop_deselected_record_properties, ) +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.typing import ObjectType, PropertiesList, Property, StringType @@ -151,7 +152,7 @@ def catalog_entry_obj(schema, stream_name, selection_metadata) -> singer.Catalog return singer.CatalogEntry( 
tap_stream_id=stream_name, stream=stream_name, - schema=singer.Schema.from_dict(schema), + schema=SchemaPlus.from_dict(schema), metadata=singer.MetadataMapping.from_iterable(selection_metadata), ) diff --git a/tests/core/test_schema.py b/tests/core/test_schema.py new file mode 100644 index 000000000..87b7ec03b --- /dev/null +++ b/tests/core/test_schema.py @@ -0,0 +1,66 @@ +""" +Testing that SchemaPlus can convert schemas losslessly from and to dicts. + +Schemas are taken from these examples: https://json-schema.org/learn/miscellaneous-examples.html + +NOTE: The following properties are not currently supported: +pattern +unevaluatedProperties +propertyNames +minProperties +maxProperties +prefixItems +contains +minContains +maxContains +minItems +maxItems +uniqueItems +enum +const +contentMediaType +contentEncoding +allOf +oneOf +not + +Some of these could be trivially added (if they are simple keys like those in STANDARD_KEYS). +Some might need more thinking if they can contain schemas (though, note that we also treat 'additionalProperties', +'anyOf' and 'patternProperties' as simple keys even though they can contain schemas). 
+""" + +from singer_sdk.helpers._schema import SchemaPlus + + +def test_simple_schema(): + simple_schema = { + "title": "Longitude and Latitude Values", + "description": "A geographical coordinate.", + "required": ["latitude", "longitude"], + "type": "object", + "properties": { + "latitude": {"type": "number", "minimum": -90, "maximum": 90}, + "longitude": {"type": "number", "minimum": -180, "maximum": 180}, + }, + } + + schema_plus = SchemaPlus.from_dict(simple_schema) + assert schema_plus.to_dict() == simple_schema + assert schema_plus.required == ["latitude", "longitude"] + assert isinstance(schema_plus.properties["latitude"], SchemaPlus) + latitude = schema_plus.properties["latitude"] + assert latitude.type == "number" + + +def test_schema_with_items(): + schema = { + "description": "A representation of a person, company, organization, or place", + "type": "object", + "properties": {"fruits": {"type": "array", "items": {"type": "string"}}}, + } + schema_plus = SchemaPlus.from_dict(schema) + assert schema_plus.to_dict() == schema + assert isinstance(schema_plus.properties["fruits"], SchemaPlus) + fruits = schema_plus.properties["fruits"] + assert isinstance(fruits.items, SchemaPlus) + assert fruits.items.type == "string" From 6c8dd701364f2c59ae76f646c41e7ffe35731ede Mon Sep 17 00:00:00 2001 From: Jack Burnett Date: Fri, 19 Aug 2022 12:30:04 +0100 Subject: [PATCH 2/9] Fixing missing import --- singer_sdk/streams/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer_sdk/streams/core.py b/singer_sdk/streams/core.py index 6be8c41a0..4efc183d5 100644 --- a/singer_sdk/streams/core.py +++ b/singer_sdk/streams/core.py @@ -27,7 +27,7 @@ import pendulum import requests import singer -from singer import RecordMessage, SchemaMessage, StateMessage +from singer import RecordMessage, SchemaMessage, StateMessage, Schema from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.exceptions import InvalidStreamSortException, 
MaxRecordsLimitException From 7938da85175f34e6696c1718642b17b1099fe912 Mon Sep 17 00:00:00 2001 From: Jack Burnett Date: Fri, 19 Aug 2022 12:41:12 +0100 Subject: [PATCH 3/9] isort --- singer_sdk/streams/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singer_sdk/streams/core.py b/singer_sdk/streams/core.py index 4efc183d5..fa6203c4e 100644 --- a/singer_sdk/streams/core.py +++ b/singer_sdk/streams/core.py @@ -27,13 +27,13 @@ import pendulum import requests import singer -from singer import RecordMessage, SchemaMessage, StateMessage, Schema -from singer_sdk.helpers._schema import SchemaPlus +from singer import RecordMessage, Schema, SchemaMessage, StateMessage from singer_sdk.exceptions import InvalidStreamSortException, MaxRecordsLimitException from singer_sdk.helpers._catalog import pop_deselected_record_properties from singer_sdk.helpers._compat import final from singer_sdk.helpers._flattening import get_flattening_options +from singer_sdk.helpers._schema import SchemaPlus from singer_sdk.helpers._singer import ( Catalog, CatalogEntry, From 8b64daabfc7cad6042bfafbec46500260d6e92a6 Mon Sep 17 00:00:00 2001 From: Jack Burnett Date: Wed, 24 Aug 2022 09:41:01 +0100 Subject: [PATCH 4/9] Address comments --- singer_sdk/helpers/_schema.py | 74 ++++++++++------------------------- 1 file changed, 21 insertions(+), 53 deletions(-) diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py index e33018bc7..5c4b006ba 100644 --- a/singer_sdk/helpers/_schema.py +++ b/singer_sdk/helpers/_schema.py @@ -1,7 +1,8 @@ # pylint: disable=redefined-builtin, too-many-arguments, invalid-name """Provides an object model for JSON Schema.""" -import json +from dataclasses import dataclass +from typing import Any, List, Optional, Union from singer import Schema @@ -29,6 +30,7 @@ ] +@dataclass class SchemaPlus(Schema): # pylint: disable=too-many-instance-attributes """Object model for JSON Schema. 
@@ -41,58 +43,24 @@ class SchemaPlus(Schema): # pylint: disable=too-many-instance-attributes """ - # pylint: disable=too-many-locals - def __init__( - self, - type=None, - format=None, - properties=None, - items=None, - description=None, - minimum=None, - maximum=None, - exclusiveMinimum=None, - exclusiveMaximum=None, - multipleOf=None, - maxLength=None, - minLength=None, - additionalProperties=None, - anyOf=None, - patternProperties=None, - required=None, - enum=None, - title=None, - ): - """Creates a SchemaPlus with the given json-schema keys.""" - self.type = type - self.properties = properties - self.items = items - self.description = description - self.minimum = minimum - self.maximum = maximum - self.exclusiveMinimum = exclusiveMinimum - self.exclusiveMaximum = exclusiveMaximum - self.multipleOf = multipleOf - self.maxLength = maxLength - self.minLength = minLength - self.anyOf = anyOf - self.format = format - self.additionalProperties = additionalProperties - self.patternProperties = patternProperties - self.required = required - self.enum = enum - self.title = title - - def __str__(self): - return json.dumps(self.to_dict()) - - def __repr__(self): - pairs = [k + "=" + repr(v) for k, v in self.__dict__.items()] - args = ", ".join(pairs) - return "SchemaPlus(" + args + ")" - - def __eq__(self, other): - return self.__dict__ == other.__dict__ + type: Optional[Union[str, List[str]]] = None + properties: Optional[dict] = None + items: Optional[Any] = None + description: Optional[str] = None + minimum: Optional[float] = None + maximum: Optional[float] = None + exclusiveMinimum: Optional[float] = None + exclusiveMaximum: Optional[float] = None + multipleOf: Optional[float] = None + maxLength: Optional[int] = None + minLength: Optional[int] = None + anyOf: Optional[Any] = None + format: Optional[str] = None + additionalProperties: Optional[Any] = None + patternProperties: Optional[Any] = None + required: Optional[List[str]] = None + enum: Optional[List[Any]] = None 
+ title: Optional[str] = None def to_dict(self): """Return the raw JSON Schema as a (possibly nested) dict.""" From a9edc67e1b10b78bde2f39ce3ac61361ae1a6f14 Mon Sep 17 00:00:00 2001 From: Jack-Burnett <55663412+Jack-Burnett@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:47:39 +0100 Subject: [PATCH 5/9] Update singer_sdk/helpers/_schema.py Co-authored-by: Edgar R. M. --- singer_sdk/helpers/_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py index 5c4b006ba..3d5d91229 100644 --- a/singer_sdk/helpers/_schema.py +++ b/singer_sdk/helpers/_schema.py @@ -73,7 +73,7 @@ def to_dict(self): } if self.items is not None: - result["items"] = self.items.to_dict() # pylint: disable=no-member + result["items"] = self.items.to_dict() for key in STANDARD_KEYS: if self.__dict__.get(key) is not None: From 66f36a9b96f9a86872d1244700f6228c8ee85680 Mon Sep 17 00:00:00 2001 From: Jack-Burnett <55663412+Jack-Burnett@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:47:45 +0100 Subject: [PATCH 6/9] Update singer_sdk/helpers/_schema.py Co-authored-by: Edgar R. M. --- singer_sdk/helpers/_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py index 3d5d91229..b0dbac6aa 100644 --- a/singer_sdk/helpers/_schema.py +++ b/singer_sdk/helpers/_schema.py @@ -31,7 +31,7 @@ @dataclass -class SchemaPlus(Schema): # pylint: disable=too-many-instance-attributes +class SchemaPlus(Schema): """Object model for JSON Schema. Tap and Target authors may find this to be more convenient than From 57f05e8c639e1e4910c9659bb5f227a4295cfdfe Mon Sep 17 00:00:00 2001 From: Jack-Burnett <55663412+Jack-Burnett@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:47:57 +0100 Subject: [PATCH 7/9] Update singer_sdk/helpers/_schema.py Co-authored-by: Edgar R. M. 
--- singer_sdk/helpers/_schema.py | 1 - 1 file changed, 1 deletion(-) diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py index b0dbac6aa..99d138e8a 100644 --- a/singer_sdk/helpers/_schema.py +++ b/singer_sdk/helpers/_schema.py @@ -1,4 +1,3 @@ -# pylint: disable=redefined-builtin, too-many-arguments, invalid-name """Provides an object model for JSON Schema.""" from dataclasses import dataclass From 7828aa120fc2ce48e9630575d8dce2d6f2a9da1d Mon Sep 17 00:00:00 2001 From: Jack-Burnett <55663412+Jack-Burnett@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:48:02 +0100 Subject: [PATCH 8/9] Update singer_sdk/helpers/_schema.py Co-authored-by: Edgar R. M. --- singer_sdk/helpers/_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py index 99d138e8a..dfd0e4ab9 100644 --- a/singer_sdk/helpers/_schema.py +++ b/singer_sdk/helpers/_schema.py @@ -68,7 +68,7 @@ def to_dict(self): if self.properties is not None: result["properties"] = { k: v.to_dict() - for k, v in self.properties.items() # pylint: disable=no-member + for k, v in self.properties.items() } if self.items is not None: From 6689f4e3a578cf1dd94f01f4420bb4b6192d3cda Mon Sep 17 00:00:00 2001 From: Jack Burnett Date: Wed, 24 Aug 2022 17:54:28 +0100 Subject: [PATCH 9/9] black --- singer_sdk/helpers/_schema.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/singer_sdk/helpers/_schema.py b/singer_sdk/helpers/_schema.py index dfd0e4ab9..742c6c34f 100644 --- a/singer_sdk/helpers/_schema.py +++ b/singer_sdk/helpers/_schema.py @@ -66,10 +66,7 @@ def to_dict(self): result = {} if self.properties is not None: - result["properties"] = { - k: v.to_dict() - for k, v in self.properties.items() - } + result["properties"] = {k: v.to_dict() for k, v in self.properties.items()} if self.items is not None: result["items"] = self.items.to_dict()