Skip to content

Commit

Permalink
fix: Allow Singer schemas to include the required and enum fields (
Browse files Browse the repository at this point in the history
…#917)

* Overwrite schema to allow required+enum keys

* Fixing missing import

* isort

* Address comments

* Update singer_sdk/helpers/_schema.py

Co-authored-by: Edgar R. M. <[email protected]>

* Update singer_sdk/helpers/_schema.py

Co-authored-by: Edgar R. M. <[email protected]>

* Update singer_sdk/helpers/_schema.py

Co-authored-by: Edgar R. M. <[email protected]>

* Update singer_sdk/helpers/_schema.py

Co-authored-by: Edgar R. M. <[email protected]>

* black

Co-authored-by: Edgar R. M. <[email protected]>

Closes #901
  • Loading branch information
Jack-Burnett authored Aug 24, 2022
1 parent 52db6ec commit a3f570c
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 9 deletions.
100 changes: 100 additions & 0 deletions singer_sdk/helpers/_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Provides an object model for JSON Schema."""

from dataclasses import dataclass
from typing import Any, List, Optional, Union

from singer import Schema

# Keys defined in the JSON Schema spec whose values do not themselves contain
# schemas (or lists of schemas).
_PLAIN_VALUE_KEYS = [
    "title",
    "description",
    "minimum",
    "maximum",
    "exclusiveMinimum",
    "exclusiveMaximum",
    "multipleOf",
    "maxLength",
    "minLength",
    "format",
    "type",
    "required",
    "enum",
]

# These are NOT simple keys (they can contain schemas themselves), but they are
# listed alongside the simple ones for now. We could consider adding extra
# handling to them.
_SCHEMA_VALUED_KEYS = [
    "additionalProperties",
    "anyOf",
    "patternProperties",
]

# Plain keys first, schema-valued keys last; consumers iterate in this order.
STANDARD_KEYS = [*_PLAIN_VALUE_KEYS, *_SCHEMA_VALUED_KEYS]


@dataclass
class SchemaPlus(Schema):
    """Object model for JSON Schema.

    Tap and Target authors may find this to be more convenient than
    working directly with JSON Schema data structures.

    This is based on, and overwrites
    https://github.com/transferwise/pipelinewise-singer-python/blob/master/singer/schema.py.
    This is because we wanted to expand it with extra STANDARD_KEYS.

    ``properties`` and ``items`` hold nested schema objects; every key listed
    in ``STANDARD_KEYS`` is stored exactly as it appears in the source dict.
    """

    type: Optional[Union[str, List[str]]] = None
    properties: Optional[dict] = None
    items: Optional[Any] = None
    description: Optional[str] = None
    minimum: Optional[float] = None
    maximum: Optional[float] = None
    exclusiveMinimum: Optional[float] = None
    exclusiveMaximum: Optional[float] = None
    multipleOf: Optional[float] = None
    maxLength: Optional[int] = None
    minLength: Optional[int] = None
    anyOf: Optional[Any] = None
    format: Optional[str] = None
    additionalProperties: Optional[Any] = None
    patternProperties: Optional[Any] = None
    required: Optional[List[str]] = None
    enum: Optional[List[Any]] = None
    title: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the raw JSON Schema as a (possibly nested) dict.

        Nested ``properties`` and ``items`` are serialized through their own
        ``to_dict``. Any attribute whose value is ``None`` is left out of the
        result.
        """
        result: dict = {}

        if self.properties is not None:
            result["properties"] = {
                name: child.to_dict() for name, child in self.properties.items()
            }

        if self.items is not None:
            result["items"] = self.items.to_dict()

        for key in STANDARD_KEYS:
            # Single lookup per key; None means "not set" and is skipped.
            value = self.__dict__.get(key)
            if value is not None:
                result[key] = value

        return result

    @classmethod
    def from_dict(cls, data: dict, **schema_defaults: Any) -> "SchemaPlus":
        """Initialize a Schema object based on the JSON Schema structure.

        Nested schemas under ``properties`` and ``items`` are built
        recursively with the same ``schema_defaults``. ``items`` is read with
        ``.get`` during recursion, so it must be a single schema dict; the
        list form of ``items`` is not handled.

        :param data: The JSON Schema as a dict.
        :param schema_defaults: The default values to the Schema constructor.
        :return: An instance of the class this method was called on.
        """
        kwargs = schema_defaults.copy()
        properties = data.get("properties")
        items = data.get("items")

        # Build through ``cls`` rather than a hard-coded class name so that
        # subclasses get instances of themselves at every nesting level.
        if properties is not None:
            kwargs["properties"] = {
                name: cls.from_dict(sub_schema, **schema_defaults)
                for name, sub_schema in properties.items()
            }
        if items is not None:
            kwargs["items"] = cls.from_dict(items, **schema_defaults)

        # Keys present in ``data`` take precedence over ``schema_defaults``.
        for key in STANDARD_KEYS:
            if key in data:
                kwargs[key] = data[key]

        return cls(**kwargs)
7 changes: 4 additions & 3 deletions singer_sdk/helpers/_singer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

from singer.catalog import Catalog as BaseCatalog
from singer.catalog import CatalogEntry as BaseCatalogEntry
from singer.schema import Schema

from singer_sdk.helpers._schema import SchemaPlus

Breadcrumb = Tuple[str, ...]

Expand Down Expand Up @@ -210,7 +211,7 @@ class CatalogEntry(BaseCatalogEntry):

tap_stream_id: str
metadata: MetadataMapping
schema: Schema
schema: SchemaPlus
stream: Optional[str] = None
key_properties: Optional[List[str]] = None
replication_key: Optional[str] = None
Expand All @@ -231,7 +232,7 @@ def from_dict(cls, stream: Dict[str, Any]):
key_properties=stream.get("key_properties"),
database=stream.get("database_name"),
table=stream.get("table_name"),
schema=Schema.from_dict(stream.get("schema", {})),
schema=SchemaPlus.from_dict(stream.get("schema", {})),
is_view=stream.get("is_view"),
stream_alias=stream.get("stream_alias"),
metadata=MetadataMapping.from_iterable(stream.get("metadata", [])),
Expand Down
6 changes: 3 additions & 3 deletions singer_sdk/streams/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@
import pendulum
import requests
import singer
from singer import RecordMessage, SchemaMessage, StateMessage
from singer.schema import Schema
from singer import RecordMessage, Schema, SchemaMessage, StateMessage

from singer_sdk.exceptions import InvalidStreamSortException, MaxRecordsLimitException
from singer_sdk.helpers._catalog import pop_deselected_record_properties
from singer_sdk.helpers._compat import final
from singer_sdk.helpers._flattening import get_flattening_options
from singer_sdk.helpers._schema import SchemaPlus
from singer_sdk.helpers._singer import (
Catalog,
CatalogEntry,
Expand Down Expand Up @@ -530,7 +530,7 @@ def _singer_catalog_entry(self) -> CatalogEntry:
return CatalogEntry(
tap_stream_id=self.tap_stream_id,
stream=self.name,
schema=Schema.from_dict(self.schema),
schema=SchemaPlus.from_dict(self.schema),
metadata=self.metadata,
key_properties=self.primary_keys or [],
replication_key=self.replication_key,
Expand Down
4 changes: 2 additions & 2 deletions singer_sdk/streams/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
from functools import lru_cache
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union, cast

import singer
import sqlalchemy
from sqlalchemy.engine import Engine
from sqlalchemy.engine.reflection import Inspector

from singer_sdk import typing as th
from singer_sdk.exceptions import ConfigValidationError
from singer_sdk.helpers._schema import SchemaPlus
from singer_sdk.helpers._singer import CatalogEntry, MetadataMapping
from singer_sdk.plugin_base import PluginBase as TapBaseClass
from singer_sdk.streams.core import Stream
Expand Down Expand Up @@ -401,7 +401,7 @@ def discover_catalog_entry(
stream=unique_stream_id,
table=table_name,
key_properties=key_properties,
schema=singer.Schema.from_dict(schema),
schema=SchemaPlus.from_dict(schema),
is_view=is_view,
replication_method=replication_method,
metadata=MetadataMapping.get_standard_metadata(
Expand Down
3 changes: 2 additions & 1 deletion tests/core/test_catalog_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
get_selected_schema,
pop_deselected_record_properties,
)
from singer_sdk.helpers._schema import SchemaPlus
from singer_sdk.typing import ObjectType, PropertiesList, Property, StringType


Expand Down Expand Up @@ -151,7 +152,7 @@ def catalog_entry_obj(schema, stream_name, selection_metadata) -> singer.Catalog
return singer.CatalogEntry(
tap_stream_id=stream_name,
stream=stream_name,
schema=singer.Schema.from_dict(schema),
schema=SchemaPlus.from_dict(schema),
metadata=singer.MetadataMapping.from_iterable(selection_metadata),
)

Expand Down
66 changes: 66 additions & 0 deletions tests/core/test_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
Test that SchemaPlus can convert schemas losslessly to and from dicts.
Schemas are taken from these examples: https://json-schema.org/learn/miscellaneous-examples.html
NOTE: The following properties are not currently supported:
pattern
unevaluatedProperties
propertyNames
minProperties
maxProperties
prefixItems
contains
minContains
maxContains
minItems
maxItems
uniqueItems
const
contentMediaType
contentEncoding
allOf
oneOf
not
Some of these could be trivially added (if they are simple keys that can be listed in STANDARD_KEYS).
Some might need more thinking if they can contain schemas (though note that we also treat 'additionalProperties',
'anyOf' and 'patternProperties' as simple keys even though they can contain schemas).
"""

from singer_sdk.helpers._schema import SchemaPlus


def test_simple_schema():
    """Round-trip a flat object schema with ``required`` through SchemaPlus."""
    expected = {
        "title": "Longitude and Latitude Values",
        "description": "A geographical coordinate.",
        "required": ["latitude", "longitude"],
        "type": "object",
        "properties": {
            "latitude": {"type": "number", "minimum": -90, "maximum": 90},
            "longitude": {"type": "number", "minimum": -180, "maximum": 180},
        },
    }

    parsed = SchemaPlus.from_dict(expected)

    # Serializing back must reproduce the input exactly.
    assert parsed.to_dict() == expected
    assert parsed.required == ["latitude", "longitude"]

    # Nested properties are parsed into schema objects, not left as dicts.
    latitude = parsed.properties["latitude"]
    assert isinstance(latitude, SchemaPlus)
    assert latitude.type == "number"


def test_schema_with_items():
    """Round-trip a schema whose property is an array with an ``items`` schema."""
    expected = {
        "description": "A representation of a person, company, organization, or place",
        "type": "object",
        "properties": {"fruits": {"type": "array", "items": {"type": "string"}}},
    }

    parsed = SchemaPlus.from_dict(expected)

    # Serializing back must reproduce the input exactly.
    assert parsed.to_dict() == expected

    # Both the array property and its element schema become schema objects.
    fruits = parsed.properties["fruits"]
    assert isinstance(fruits, SchemaPlus)
    element_schema = fruits.items
    assert isinstance(element_schema, SchemaPlus)
    assert element_schema.type == "string"

0 comments on commit a3f570c

Please sign in to comment.