Add SemanticModel Node Type #7769
---

New changelog entry, `@@ -0,0 +1,6 @@`:

```yaml
kind: Features
body: Added support for parsing and serializing semantic models
time: 2023-06-06T16:53:51.117429-04:00
custom:
  Author: peterallenwebb
  Issue: 7499 7503
```
---

`@@ -25,21 +25,22 @@` — the import block from `dbt.contracts.graph.nodes` is alphabetized, with `RelationalNode` and `SemanticModel` newly imported:

```python
from dbt.contracts.publication import ProjectDependencies, PublicationConfig, PublicModel

from dbt.contracts.graph.nodes import (
    BaseNode,
    Documentation,
    Exposure,
    GenericTestNode,
    GraphMemberNode,
    Group,
    Macro,
    ManifestNode,
    ManifestOrPublicNode,
    Metric,
    ModelNode,
    RelationalNode,
    ResultNode,
    SemanticModel,
    SourceDefinition,
    UnpatchedSourceDefinition,
)
from dbt.contracts.graph.unparsed import SourcePatch, NodeVersion, UnparsedVersion
from dbt.contracts.graph.manifest_upgrade import upgrade_manifest_json
```
```diff
@@ -706,6 +707,7 @@ class Manifest(MacroMethods, DataClassMessagePackMixin, dbtClassMixin):
     public_nodes: MutableMapping[str, PublicModel] = field(default_factory=dict)
     project_dependencies: Optional[ProjectDependencies] = None
     publications: MutableMapping[str, PublicationConfig] = field(default_factory=dict)
+    semantic_models: MutableMapping[str, SemanticModel] = field(default_factory=dict)

     _doc_lookup: Optional[DocLookup] = field(
         default=None, metadata={"serialize": lambda x: None, "deserialize": lambda x: None}
```

Review thread on `semantic_models`:

> **Reviewer:** The existing pattern is that the node objects use "Model" and the dictionaries use "nodes" — `Model` nodes are found in the `nodes` dictionary, and `PublicModel` nodes in `public_nodes`. Could this be `semantic_nodes` to preserve that pattern?
>
> **Author:** In that case, do you think we should just rely on the existing `nodes` dictionary and not have a special case for `SemanticModel` objects? Would it matter that the `SemanticModel`s are not yet linked into the compiled graph?
>
> **Reviewer:** The original metrics went in a `metrics` dictionary... This has been a bit ad hoc and not closely thought through, but the existing pattern seems to be that SQL things go in `nodes` (plus seeds...), and yaml-generated things go in their own dictionaries. There are assumptions in the rest of the code that match, such as the links in the file objects. So I think it still makes sense to put semantic models in their own dictionary. If anything, I'd be tempted to separate some of the existing things in the `nodes` dictionary into their own dictionaries and make some combined "indexes" for cases where we don't want to loop over `nodes` all the time.
>
> **Reviewer:** Just wanted to mention that one of the reasons for going in the direction of more individual dictionaries rather than fewer is that jsonschema and the deserializers currently can't always correctly guess the classes of the objects in the `nodes` dictionary, leading to other hacky things like the big deserialization if-statement in the [...]
>
> **Author:** This is all really good to know. It seems like the right direction to me.
```diff
@@ -894,7 +896,7 @@ def build_group_map(self):
             group_map[node.group].append(node.unique_id)
         self.group_map = group_map

-    def writable_manifest(self):
+    def writable_manifest(self) -> "WritableManifest":
         self.build_parent_and_child_maps()
         self.build_group_map()
         return WritableManifest(
```

```diff
@@ -912,6 +914,7 @@ def writable_manifest(self):
             child_map=self.child_map,
             parent_map=self.parent_map,
             group_map=self.group_map,
+            semantic_models=self.semantic_models,
         )

     def write(self, path):
```
```diff
@@ -1246,6 +1249,11 @@ def add_doc(self, source_file: SourceFile, doc: Documentation):
         self.docs[doc.unique_id] = doc
         source_file.docs.append(doc.unique_id)

+    def add_semantic_model(self, source_file: SchemaSourceFile, semantic_model: SemanticModel):
+        _check_duplicates(semantic_model, self.semantic_models)
+        self.semantic_models[semantic_model.unique_id] = semantic_model
+        source_file.semantic_models.append(semantic_model.unique_id)
+
     # end of methods formerly in ParseResult

     # Provide support for copy.deepcopy() - we just need to avoid the lock!
```
```diff
@@ -1345,6 +1353,9 @@ class WritableManifest(ArtifactMixin):
     public_nodes: Mapping[UniqueID, PublicModel] = field(
         metadata=dict(description=("The public models used in the dbt project"))
     )
+    semantic_models: Mapping[UniqueID, SemanticModel] = field(
+        metadata=dict(description=("The semantic models defined in the dbt project"))
+    )
     metadata: ManifestMetadata = field(
         metadata=dict(
             description="Metadata about the manifest",
```
---

```diff
@@ -6,29 +6,23 @@
 import hashlib

 from mashumaro.types import SerializableType
-from typing import (
-    Optional,
-    Union,
-    List,
-    Dict,
-    Any,
-    Sequence,
-    Tuple,
-    Iterator,
-)
+from typing import Optional, Union, List, Dict, Any, Sequence, Tuple, Iterator, Protocol

 from dbt.dataclass_schema import dbtClassMixin, ExtensibleDbtClassMixin

 from dbt.clients.system import write_file
 from dbt.contracts.files import FileHash
 from dbt.contracts.graph.unparsed import (
+    Dimension,
     Docs,
+    Entity,
     ExposureType,
     ExternalTable,
     FreshnessThreshold,
     HasYamlMetadata,
     MacroArgument,
     MaturityType,
+    Measure,
     MetricFilter,
     MetricTime,
     Owner,
```
```diff
@@ -62,12 +56,6 @@
     EmptySnapshotConfig,
     SnapshotConfig,
 )
-import sys
-
-if sys.version_info >= (3, 8):
-    from typing import Protocol
-else:
-    from typing_extensions import Protocol


 # =====================================================================
```
```diff
@@ -564,6 +552,30 @@ def depends_on_macros(self):
         return self.depends_on.macros


+@dataclass
+class FileSlice(dbtClassMixin, Replaceable):
+    """Provides file slice level context about what something was created from.
+
+    Implementation of the dbt-semantic-interfaces `FileSlice` protocol
+    """
+
+    filename: str
+    content: str
+    start_line_number: int
+    end_line_number: int
+
+
+@dataclass
+class Metadata(dbtClassMixin, Replaceable):
+    """Provides file context about what something was created from.
+
+    Implementation of the dbt-semantic-interfaces `Metadata` protocol
+    """
+
+    repo_file_path: str
+    file_slice: FileSlice
+
+
 # ====================================
 # CompiledNode subclasses
 # ====================================
```
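A rough illustration of how the two classes above fit together — plain dataclasses stand in for the real classes, which also mix in `dbtClassMixin`/`Replaceable`, and the file names and yaml content here are made up:

```python
from dataclasses import dataclass

# Plain-dataclass stand-ins for the FileSlice/Metadata classes above.
@dataclass
class FileSlice:
    filename: str
    content: str
    start_line_number: int
    end_line_number: int

@dataclass
class Metadata:
    repo_file_path: str
    file_slice: FileSlice

# Hypothetical example: record where a semantic model's yaml came from.
yaml_text = "semantic_models:\n  - name: orders\n"
meta = Metadata(
    repo_file_path="models/semantic.yml",
    file_slice=FileSlice(
        filename="semantic.yml",
        content=yaml_text,
        start_line_number=1,
        end_line_number=2,
    ),
)
```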
```diff
@@ -1411,6 +1423,28 @@ class Group(BaseNode):
     resource_type: NodeType = field(metadata={"restrict": [NodeType.Group]})


+# ====================================
+# SemanticModel and related classes
+# ====================================
+
+
+@dataclass
+class NodeRelation(dbtClassMixin):
+    alias: str
+    schema_name: str  # TODO: Could this be called simply "schema" so we could reuse StateRelation?
+    database: Optional[str] = None
+
+
+@dataclass
+class SemanticModel(GraphNode):
+    description: Optional[str]
+    model: str
+    node_relation: Optional[NodeRelation]
+    entities: Sequence[Entity]
+    measures: Sequence[Measure]
+    dimensions: Sequence[Dimension]
+
+
 # ====================================
 # Patches
 # ====================================
```

Review thread on `NodeRelation`:

> **Reviewer:** Stu added in a [...]
>
> **Author:** That one doesn't actually have the `relation_name` in it, and I think they want that. Settling on a common relation class is the "larger issue" that I mentioned we didn't need to solve at this particular moment.

Review thread on the `database` field:

> **Reviewer:** From this comment — does this still need to include a unified `relation_name` that respects quoting + include policies?
>
> **Reviewer:** If we do, we'll probably need `adapter.Relation` to do something like this during compilation (from https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/compilation.py#L486-L488):
>
> ```python
> adapter = get_adapter(self.config)
> relation_cls = adapter.Relation
> relation_name = str(relation_cls.create_from(self.config, node))
> ```
>
> **Reviewer:** I don't think the `SemanticModel`s actually get compiled, since they're yaml-only. There is a question of whether they need the individual pieces (identifier/schema/database) or just the `relation_name`...
>
> **Author:** @QMalcolm I'm not sure of the answer to this, but I suspect the answer may be yes. What makes sense from the perspective of MetricFlow integration?
>
> **Reviewer:** @MichelleArk Yep! We need to include the unified `relation_name`, and it's desirable that it respects quoting + include policies. As for whether we need both the individual pieces as well as the [...]
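The "unified `relation_name` that respects quoting + include policies" idea from the thread can be sketched roughly as follows. `QuotePolicy` and `render_relation_name` are hypothetical names introduced here for illustration; dbt's real `adapter.Relation.create_from` handles far more cases (adapter-specific quote characters, include policies per component, etc.).

```python
from dataclasses import dataclass
from typing import List, Optional

# Hypothetical quoting policy: one flag per relation component.
@dataclass
class QuotePolicy:
    database: bool = True
    schema: bool = True
    identifier: bool = True

def render_relation_name(
    alias: str,
    schema_name: str,
    database: Optional[str],
    policy: QuotePolicy,
    quote_char: str = '"',
) -> str:
    # Quote a component only if the policy asks for it.
    def q(part: str, quoted: bool) -> str:
        return f"{quote_char}{part}{quote_char}" if quoted else part

    parts: List[str] = []
    if database is not None:  # "include policy": database may be omitted entirely
        parts.append(q(database, policy.database))
    parts.append(q(schema_name, policy.schema))
    parts.append(q(alias, policy.identifier))
    return ".".join(parts)
```

Under these assumptions, a fully quoted relation renders as `"db"."schema"."alias"`, while dropping the database and disabling quoting yields `schema.alias`.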
---

```diff
@@ -57,6 +57,7 @@
     DeprecatedReference,
     UpcomingReferenceDeprecation,
 )
+from dbt_extractor import py_extract_from_source  # type: ignore
 from dbt.logger import DbtProcessState
 from dbt.node_types import NodeType, AccessType
 from dbt.clients.jinja import get_rendered, MacroStack
```

```diff
@@ -99,6 +100,7 @@
     ManifestNode,
     ResultNode,
     ModelNode,
+    NodeRelation,
 )
 from dbt.contracts.graph.unparsed import NodeVersion
 from dbt.contracts.util import Writable
```
```diff
@@ -528,6 +530,7 @@ def load(self):
             self.process_refs(self.root_project.project_name)
             self.process_docs(self.root_project)
             self.process_metrics(self.root_project)
+            self.process_semantic_models()
             self.check_valid_group_config()

             # update tracking data
```
```diff
@@ -1176,6 +1179,28 @@ def process_metrics(self, config: RuntimeConfig):
                 continue
             _process_metrics_for_node(self.manifest, current_project, exposure)

+    def process_semantic_models(self) -> None:
+        for semantic_model in self.manifest.semantic_models.values():
+            if semantic_model.model:
+                statically_parsed = py_extract_from_source(f"{{{{ {semantic_model.model} }}}}")
+                if statically_parsed["refs"]:
+                    ref = statically_parsed["refs"][0]
+                    if len(ref) == 2:
+                        input_package_name, input_model_name = ref
+                    else:
+                        input_package_name, input_model_name = None, ref[0]
+
+                    refd_node = self.manifest.ref_lookup.find(
+                        input_model_name, input_package_name, None, self.manifest
+                    )
+                    if isinstance(refd_node, ModelNode):
+                        semantic_model.node_relation = NodeRelation(
+                            alias=refd_node.alias,
+                            schema_name=refd_node.schema,
+                            database=refd_node.database,
+                        )

 # nodes: node and column descriptions
 # sources: source and table descriptions, column descriptions
 # macros: macro argument descriptions
```

Review thread on the `py_extract_from_source` call:

> **Reviewer:** This call can throw an [...]
>
> **Author:** Agree, and we'll follow up in the current sprint with #7822.

Review thread on the `ref_lookup.find` call:

> **Reviewer:** Could we add a `# TODO` comment here that links to #7688? Refs with the `version` keyword won't be supported until 7688 is closed. At that point we can pass in the parsed version instead of always passing [...]
>
> **Author:** Agree, and we'll follow up in the current sprint with #7822.
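The ref normalization inside `process_semantic_models` can be isolated as a small helper. This sketch only mirrors the two-branch logic in the hunk above, assuming (as the code does) that the static parser returns each ref as a one-element list (`["model_name"]`) or a two-element list (`["package", "model_name"]`):

```python
from typing import List, Optional, Tuple

def split_ref(ref: List[str]) -> Tuple[Optional[str], str]:
    """Normalize a statically parsed ref into (package_name, model_name).

    A two-element ref carries an explicit package; a one-element ref
    has no package, so the package component is None.
    """
    if len(ref) == 2:
        package_name, model_name = ref
    else:
        package_name, model_name = None, ref[0]
    return package_name, model_name
```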
---

```diff
@@ -1,8 +1,13 @@
 from dbt.parser.schemas import YamlReader, SchemaParser
 from dbt.parser.common import YamlBlock
 from dbt.node_types import NodeType
-from dbt.contracts.graph.unparsed import UnparsedExposure, UnparsedMetric, UnparsedGroup
-from dbt.contracts.graph.nodes import Exposure, Metric, Group
+from dbt.contracts.graph.unparsed import (
+    UnparsedExposure,
+    UnparsedGroup,
+    UnparsedMetric,
+    UnparsedSemanticModel,
+)
+from dbt.contracts.graph.nodes import Exposure, Group, Metric, SemanticModel
 from dbt.exceptions import DbtInternalError, YamlParseDictError, JSONValidationError
 from dbt.context.providers import generate_parse_exposure, generate_parse_metrics
 from dbt.contracts.graph.model_config import MetricConfig, ExposureConfig
```
```diff
@@ -269,3 +274,46 @@ def parse(self):
                 raise YamlParseDictError(self.yaml.path, self.key, data, exc)

             self.parse_group(unparsed)
+
+
+class SemanticModelParser(YamlReader):
+    def __init__(self, schema_parser: SchemaParser, yaml: YamlBlock):
+        super().__init__(schema_parser, yaml, "semantic_models")
+        self.schema_parser = schema_parser
+        self.yaml = yaml
+
+    def parse_semantic_model(self, unparsed: UnparsedSemanticModel):
+        package_name = self.project.project_name
+        unique_id = f"{NodeType.SemanticModel}.{package_name}.{unparsed.name}"
+        path = self.yaml.path.relative_path
+
+        fqn = self.schema_parser.get_fqn_prefix(path)
+        fqn.append(unparsed.name)
+
+        parsed = SemanticModel(
+            description=unparsed.description,
+            fqn=fqn,
+            model=unparsed.model,
+            name=unparsed.name,
+            node_relation=None,  # Resolved from the value of "model" after parsing
+            original_file_path=self.yaml.path.original_file_path,
+            package_name=package_name,
+            path=path,
+            resource_type=NodeType.SemanticModel,
+            unique_id=unique_id,
+            entities=unparsed.entities,
+            measures=unparsed.measures,
+            dimensions=unparsed.dimensions,
+        )
+
+        self.manifest.add_semantic_model(self.yaml.file, parsed)
+
+    def parse(self):
+        for data in self.get_key_dicts():
+            try:
+                UnparsedSemanticModel.validate(data)
+                unparsed = UnparsedSemanticModel.from_dict(data)
+            except (ValidationError, JSONValidationError) as exc:
+                raise YamlParseDictError(self.yaml.path, self.key, data, exc)
+
+            self.parse_semantic_model(unparsed)
```

Review thread on `node_relation=None`:

> **Reviewer:** For my own understanding — we need to resolve the relation later on because not all model nodes are available in the manifest for lookup at this point?
>
> **Reviewer:** Should we make the node relation evaluate lazily, so we can put its value in here?
>
> **Reviewer:** `SemanticModel`s come from yaml only, and the referenced models should always be SQL-based and so already resolvable when this is parsed.
>
> **Author:** @MichelleArk Yes, I discussed this with Gerda and she thought we could possibly handle it immediately, since models would likely have been created, but doing it at the end would be most likely to succeed without complications. @aranke I can follow up on the idea of lazy evaluation in the current sprint. If you have a specific method of doing it in mind, can you add it as a comment to #7822?
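The `unique_id` in `parse_semantic_model` comes from string-formatting the `NodeType` enum member. A minimal stand-in shows the resulting shape — this is not dbt's real `NodeType`, and the `"semantic_model"` value is an assumption made for illustration:

```python
from enum import Enum

# Minimal stand-in for dbt's NodeType (the real one is a str-valued enum
# with many more members).
class NodeType(str, Enum):
    SemanticModel = "semantic_model"

    def __str__(self) -> str:
        return self.value

def make_unique_id(package_name: str, name: str) -> str:
    # Mirrors the f-string in parse_semantic_model above.
    return f"{NodeType.SemanticModel}.{package_name}.{name}"
```

So a semantic model named `orders` in project `my_project` would get a unique id of the form `semantic_model.my_project.orders` under this assumed enum value.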
Review thread on the `semantic_models` key:

> **gshank:** `semantic_nodes`? (see other comment)
>
> **Author:** @gshank To clarify, do you think we should change the name in the yml file, or just our internal property names?
>
> **gshank:** Just the internal property names. I think "semantic_models" in the yaml files is fine — it matches the "models" section we already have. Naming is so hard.