Skip to content

Commit

Permalink
Add SemanticModel Node Type (#7769)
Browse files Browse the repository at this point in the history
* Add dbt-semantic-interfaces as a dependency

With the integration with MetricFlow we're taking a dependency on
`dbt-semantic-interfaces` which acts as the source of truth for
protocols which MetricFlow and dbt-core need to agree on. Additionally
we're hard pinning to 0.1.0.dev3 for now. We plan on having a less
restrictive specification when dbt-core 1.6 hits GA.

* Add implementations of DSI Metadata protocol to nodes.py

* CT-2521: Initial work on adding new SemanticModel node

* CT-2521: Second rough draft of SemanticModels

* CT-2521: Update schema v10

* CT-2521: Update unit tests for new SemanticModel collection in manifest

* CT-2521: Add changelog entry

* CT-2521: Final touches on initial implementation of SemanticModel parsing

* Change name of Metadata class to reduce potential for confusion

* Remove "Replaceable" inheritance, per review

* CT-2521: Rename internal variables from semantic_models to semantic_nodes

* CT-2521: Update manifest schema to reflect change

---------

Co-authored-by: Quigley Malcolm <[email protected]>
  • Loading branch information
peterallenwebb and QMalcolm authored Jun 8, 2023
1 parent 2d23782 commit a89da7c
Show file tree
Hide file tree
Showing 18 changed files with 649 additions and 46 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20230606-165351.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Added support for parsing and serializing semantic models
time: 2023-06-06T16:53:51.117429-04:00
custom:
Author: peterallenwebb
Issue: 7499 7503
1 change: 1 addition & 0 deletions core/dbt/contracts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ class SchemaSourceFile(BaseSourceFile):
groups: List[str] = field(default_factory=list)
# node patches contain models, seeds, snapshots, analyses
ndp: List[str] = field(default_factory=list)
semantic_nodes: List[str] = field(default_factory=list)
# any macro patches in this file by macro unique_id.
mcp: Dict[str, str] = field(default_factory=dict)
# any source patches in this file. The entries are package, name pairs
Expand Down
29 changes: 20 additions & 9 deletions core/dbt/contracts/graph/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,22 @@
from dbt.contracts.publication import ProjectDependencies, PublicationConfig, PublicModel

from dbt.contracts.graph.nodes import (
Macro,
BaseNode,
Documentation,
SourceDefinition,
GenericTestNode,
Exposure,
Metric,
GenericTestNode,
GraphMemberNode,
Group,
UnpatchedSourceDefinition,
Macro,
ManifestNode,
GraphMemberNode,
ResultNode,
BaseNode,
ManifestOrPublicNode,
Metric,
ModelNode,
RelationalNode,
ResultNode,
SemanticModel,
SourceDefinition,
UnpatchedSourceDefinition,
)
from dbt.contracts.graph.unparsed import SourcePatch, NodeVersion, UnparsedVersion
from dbt.contracts.graph.manifest_upgrade import upgrade_manifest_json
Expand Down Expand Up @@ -706,6 +707,7 @@ class Manifest(MacroMethods, DataClassMessagePackMixin, dbtClassMixin):
public_nodes: MutableMapping[str, PublicModel] = field(default_factory=dict)
project_dependencies: Optional[ProjectDependencies] = None
publications: MutableMapping[str, PublicationConfig] = field(default_factory=dict)
semantic_nodes: MutableMapping[str, SemanticModel] = field(default_factory=dict)

_doc_lookup: Optional[DocLookup] = field(
default=None, metadata={"serialize": lambda x: None, "deserialize": lambda x: None}
Expand Down Expand Up @@ -894,7 +896,7 @@ def build_group_map(self):
group_map[node.group].append(node.unique_id)
self.group_map = group_map

def writable_manifest(self):
def writable_manifest(self) -> "WritableManifest":
self.build_parent_and_child_maps()
self.build_group_map()
return WritableManifest(
Expand All @@ -912,6 +914,7 @@ def writable_manifest(self):
child_map=self.child_map,
parent_map=self.parent_map,
group_map=self.group_map,
semantic_nodes=self.semantic_nodes,
)

def write(self, path):
Expand Down Expand Up @@ -1246,6 +1249,11 @@ def add_doc(self, source_file: SourceFile, doc: Documentation):
self.docs[doc.unique_id] = doc
source_file.docs.append(doc.unique_id)

def add_semantic_model(self, source_file: SchemaSourceFile, semantic_model: SemanticModel):
    """Register a semantic model in the manifest and on its source file.

    The model is first run through `_check_duplicates` against the models
    already held in `self.semantic_nodes`; it is then stored under its
    `unique_id`, and that id is appended to `source_file.semantic_nodes`.
    """
    # The duplicate check must run before the insert below.
    _check_duplicates(semantic_model, self.semantic_nodes)

    node_id = semantic_model.unique_id
    self.semantic_nodes[node_id] = semantic_model
    source_file.semantic_nodes.append(node_id)

# end of methods formerly in ParseResult

# Provide support for copy.deepcopy() - we just need to avoid the lock!
Expand Down Expand Up @@ -1345,6 +1353,9 @@ class WritableManifest(ArtifactMixin):
public_nodes: Mapping[UniqueID, PublicModel] = field(
metadata=dict(description=("The public models used in the dbt project"))
)
semantic_nodes: Mapping[UniqueID, SemanticModel] = field(
metadata=dict(description=("The semantic models defined in the dbt project"))
)
metadata: ManifestMetadata = field(
metadata=dict(
description="Metadata about the manifest",
Expand Down
2 changes: 2 additions & 0 deletions core/dbt/contracts/graph/manifest_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,6 @@ def upgrade_manifest_json(manifest: dict) -> dict:
if "root_path" in doc_content:
del doc_content["root_path"]
doc_content["resource_type"] = "doc"
if "semantic_nodes" not in manifest:
manifest["semantic_nodes"] = {}
return manifest
66 changes: 50 additions & 16 deletions core/dbt/contracts/graph/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,23 @@
import hashlib

from mashumaro.types import SerializableType
from typing import (
Optional,
Union,
List,
Dict,
Any,
Sequence,
Tuple,
Iterator,
)
from typing import Optional, Union, List, Dict, Any, Sequence, Tuple, Iterator, Protocol

from dbt.dataclass_schema import dbtClassMixin, ExtensibleDbtClassMixin

from dbt.clients.system import write_file
from dbt.contracts.files import FileHash
from dbt.contracts.graph.unparsed import (
Dimension,
Docs,
Entity,
ExposureType,
ExternalTable,
FreshnessThreshold,
HasYamlMetadata,
MacroArgument,
MaturityType,
Measure,
MetricFilter,
MetricTime,
Owner,
Expand Down Expand Up @@ -62,12 +56,6 @@
EmptySnapshotConfig,
SnapshotConfig,
)
import sys

if sys.version_info >= (3, 8):
from typing import Protocol
else:
from typing_extensions import Protocol


# =====================================================================
Expand Down Expand Up @@ -564,6 +552,30 @@ def depends_on_macros(self):
return self.depends_on.macros


@dataclass
class FileSlice(dbtClassMixin, Replaceable):
    """Provides file slice level context about what something was created from.
    Implementation of the dbt-semantic-interfaces `FileSlice` protocol
    """

    # presumably the name of the file the slice was taken from — TODO confirm
    filename: str
    # presumably the text of the slice itself — TODO confirm
    content: str
    # NOTE(review): whether the line numbers are 0- or 1-based, and whether the
    # end is inclusive, is not established here — confirm against the
    # dbt-semantic-interfaces protocol.
    start_line_number: int
    end_line_number: int


@dataclass
class SourceFileMetadata(dbtClassMixin, Replaceable):
    """Provides file context about what something was created from.
    Implementation of the dbt-semantic-interfaces `Metadata` protocol
    """

    # presumably a path relative to the repository root — TODO confirm
    repo_file_path: str
    # The slice of the file this metadata refers to (see FileSlice).
    file_slice: FileSlice


# ====================================
# CompiledNode subclasses
# ====================================
Expand Down Expand Up @@ -1411,6 +1423,28 @@ class Group(BaseNode):
resource_type: NodeType = field(metadata={"restrict": [NodeType.Group]})


# ====================================
# SemanticModel and related classes
# ====================================


@dataclass
class NodeRelation(dbtClassMixin):
    """Alias, schema and (optional) database of a relation.

    Used as the type of `SemanticModel.node_relation`.
    """

    alias: str
    schema_name: str  # TODO: Could this be called simply "schema" so we could reuse StateRelation?
    database: Optional[str] = None


@dataclass
class SemanticModel(GraphNode):
    """Parsed semantic model node.

    All fields declared here are required (no defaults); `description` and
    `node_relation` accept None.
    """

    description: Optional[str]
    # The model reference as a string.
    model: str
    # Relation (alias / schema / database) associated with this semantic model;
    # None when no relation has been attached.
    node_relation: Optional[NodeRelation]
    entities: Sequence[Entity]
    measures: Sequence[Measure]
    dimensions: Sequence[Dimension]


# ====================================
# Patches
# ====================================
Expand Down
54 changes: 54 additions & 0 deletions core/dbt/contracts/graph/unparsed.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,60 @@ def validate(cls, data):
raise ValidationError("Group owner must have at least one of 'name' or 'email'.")


#
# semantic interfaces unparsed objects
#


@dataclass
class Entity(dbtClassMixin):
    """Entity entry of a semantic model.

    Only `name` and `type` are required; the remaining fields default to None.
    """

    name: str
    type: str  # actually an enum
    description: Optional[str] = None
    role: Optional[str] = None
    # presumably an expression used in place of the name — TODO confirm
    expr: Optional[str] = None


@dataclass
class MeasureAggregationParameters(dbtClassMixin):
    """Optional parameters for a measure's aggregation.

    Every field has a default, so an empty instance is valid.
    """

    # NOTE(review): valid range (0-1 vs 0-100) is not established here — confirm.
    percentile: Optional[float] = None
    use_discrete_percentile: bool = False
    use_approximate_percentile: bool = False


@dataclass
class Measure(dbtClassMixin):
    """Measure entry of a semantic model.

    Only `name` and `agg` are required; the remaining fields default to None.
    """

    name: str
    agg: str  # actually an enum
    description: Optional[str] = None
    create_metric: Optional[bool] = None
    expr: Optional[str] = None
    # Extra aggregation settings (see MeasureAggregationParameters).
    agg_params: Optional[MeasureAggregationParameters] = None
    # Free-form dictionary; its expected keys are not defined in this class.
    non_additive_dimension: Optional[Dict[str, Any]] = None
    # presumably the name of a time dimension — TODO confirm
    agg_time_dimension: Optional[str] = None


@dataclass
class Dimension(dbtClassMixin):
    """Dimension entry of a semantic model.

    Only `name` and `type` are required.
    """

    name: str
    type: str  # actually an enum
    description: Optional[str] = None
    # Defaults to False (not None), unlike the other optional fields here.
    is_partition: Optional[bool] = False
    # Free-form dictionary; its expected keys are not defined in this class.
    type_params: Optional[Dict[str, Any]] = None
    expr: Optional[str] = None
    # TODO metadata: Optional[Metadata] (this would actually be the YML for the dimension)


@dataclass
class UnparsedSemanticModel(dbtClassMixin):
    """Unparsed form of a semantic model definition.

    `name`, `description` and `model` have no defaults; the three lists
    default to empty.
    """

    name: str
    # No default is declared, although None is an accepted value.
    description: Optional[str]
    model: str  # looks like "ref(...)"
    entities: List[Entity] = field(default_factory=list)
    measures: List[Measure] = field(default_factory=list)
    dimensions: List[Dimension] = field(default_factory=list)


def normalize_date(d: Optional[datetime.date]) -> Optional[datetime.datetime]:
"""Convert date to datetime (at midnight), and add local time zone if naive"""
if d is None:
Expand Down
1 change: 1 addition & 0 deletions core/dbt/node_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class NodeType(StrEnum):
Exposure = "exposure"
Metric = "metric"
Group = "group"
SemanticModel = "semantic model"

@classmethod
def executable(cls) -> List["NodeType"]:
Expand Down
25 changes: 25 additions & 0 deletions core/dbt/parser/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
DeprecatedReference,
UpcomingReferenceDeprecation,
)
from dbt_extractor import py_extract_from_source # type: ignore
from dbt.logger import DbtProcessState
from dbt.node_types import NodeType, AccessType
from dbt.clients.jinja import get_rendered, MacroStack
Expand Down Expand Up @@ -99,6 +100,7 @@
ManifestNode,
ResultNode,
ModelNode,
NodeRelation,
)
from dbt.contracts.graph.unparsed import NodeVersion
from dbt.contracts.util import Writable
Expand Down Expand Up @@ -529,6 +531,7 @@ def load(self):
self.process_refs(self.root_project.project_name)
self.process_docs(self.root_project)
self.process_metrics(self.root_project)
self.process_semantic_models()
self.check_valid_group_config()
self.check_valid_access_property()

Expand Down Expand Up @@ -1180,6 +1183,28 @@ def process_metrics(self, config: RuntimeConfig):
continue
_process_metrics_for_node(self.manifest, current_project, exposure)

def process_semantic_models(self) -> None:
    """Attach a NodeRelation to each semantic model whose `model` resolves.

    For every semantic model in the manifest, the `model` string is wrapped
    in `{{ ... }}` and statically parsed for refs. Only the first ref found
    is used. When the lookup of that ref yields a ModelNode, the semantic
    model's `node_relation` is set from that node's alias, schema and
    database. In every other case the semantic model is left untouched.
    """
    for semantic_model in self.manifest.semantic_nodes.values():
        if not semantic_model.model:
            continue

        extracted = py_extract_from_source(f"{{{{ {semantic_model.model} }}}}")
        found_refs = extracted["refs"]
        if not found_refs:
            continue

        # A two-element ref is (package, model); otherwise the first element
        # is taken as the model name and no package is given.
        first_ref = found_refs[0]
        has_package = len(first_ref) == 2
        input_package_name = first_ref[0] if has_package else None
        input_model_name = first_ref[1] if has_package else first_ref[0]

        # NOTE(review): the third positional argument is passed as None —
        # presumably the version; confirm against ref_lookup.find.
        target = self.manifest.ref_lookup.find(
            input_model_name, input_package_name, None, self.manifest
        )
        if not isinstance(target, ModelNode):
            continue

        semantic_model.node_relation = NodeRelation(
            alias=target.alias,
            schema_name=target.schema,
            database=target.database,
        )

# nodes: node and column descriptions
# sources: source and table descriptions, column descriptions
# macros: macro argument descriptions
Expand Down
52 changes: 50 additions & 2 deletions core/dbt/parser/schema_yaml_readers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from dbt.parser.schemas import YamlReader, SchemaParser
from dbt.parser.common import YamlBlock
from dbt.node_types import NodeType
from dbt.contracts.graph.unparsed import UnparsedExposure, UnparsedMetric, UnparsedGroup
from dbt.contracts.graph.nodes import Exposure, Metric, Group
from dbt.contracts.graph.unparsed import (
UnparsedExposure,
UnparsedGroup,
UnparsedMetric,
UnparsedSemanticModel,
)
from dbt.contracts.graph.nodes import Exposure, Group, Metric, SemanticModel
from dbt.exceptions import DbtInternalError, YamlParseDictError, JSONValidationError
from dbt.context.providers import generate_parse_exposure, generate_parse_metrics
from dbt.contracts.graph.model_config import MetricConfig, ExposureConfig
Expand Down Expand Up @@ -269,3 +274,46 @@ def parse(self):
raise YamlParseDictError(self.yaml.path, self.key, data, exc)

self.parse_group(unparsed)


class SemanticModelParser(YamlReader):
    """Reads the `semantic_models` key of a schema YAML block and adds one
    SemanticModel node to the manifest per entry."""

    def __init__(self, schema_parser: SchemaParser, yaml: YamlBlock):
        super().__init__(schema_parser, yaml, "semantic_models")
        self.schema_parser = schema_parser
        self.yaml = yaml

    def parse_semantic_model(self, unparsed: UnparsedSemanticModel):
        """Build a SemanticModel from `unparsed` and add it to the manifest.

        `node_relation` is left as None here; entities, measures and
        dimensions are passed through unchanged.
        """
        package_name = self.project.project_name
        relative_path = self.yaml.path.relative_path

        # fqn is the prefix derived from the file path, followed by the name.
        fqn = self.schema_parser.get_fqn_prefix(relative_path)
        fqn.append(unparsed.name)

        node = SemanticModel(
            name=unparsed.name,
            description=unparsed.description,
            model=unparsed.model,
            node_relation=None,  # Resolved from the value of "model" after parsing
            entities=unparsed.entities,
            measures=unparsed.measures,
            dimensions=unparsed.dimensions,
            resource_type=NodeType.SemanticModel,
            package_name=package_name,
            unique_id=f"{NodeType.SemanticModel}.{package_name}.{unparsed.name}",
            fqn=fqn,
            path=relative_path,
            original_file_path=self.yaml.path.original_file_path,
        )

        self.manifest.add_semantic_model(self.yaml.file, node)

    def parse(self):
        """Validate and parse every entry under this reader's key.

        Raises:
            YamlParseDictError: when an entry fails validation or cannot be
                converted into an UnparsedSemanticModel.
        """
        for entry in self.get_key_dicts():
            try:
                UnparsedSemanticModel.validate(entry)
                unparsed = UnparsedSemanticModel.from_dict(entry)
            except (ValidationError, JSONValidationError) as exc:
                raise YamlParseDictError(self.yaml.path, self.key, entry, exc)
            else:
                # Kept outside the try body so errors raised while building
                # the node are not re-wrapped as YAML parse errors.
                self.parse_semantic_model(unparsed)
Loading

0 comments on commit a89da7c

Please sign in to comment.