From 32dfd04a52134d9260be3939f488547cc42cce37 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Thu, 25 Jan 2024 15:41:40 -0500 Subject: [PATCH 1/7] fix seed selection: apply selected_ids to catalog construction --- core/dbt/task/docs/generate.py | 25 +++++++++++++++++-------- tests/functional/docs/test_generate.py | 21 ++++++++++++++++----- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/core/dbt/task/docs/generate.py b/core/dbt/task/docs/generate.py index acd68cb37ee..2d296b225e5 100644 --- a/core/dbt/task/docs/generate.py +++ b/core/dbt/task/docs/generate.py @@ -13,6 +13,8 @@ from dbt.task.compile import CompileTask from dbt.adapters.factory import get_adapter + +from dbt.graph.graph import UniqueId from dbt.contracts.graph.nodes import ResultNode from dbt.contracts.graph.manifest import Manifest from dbt.artifacts.schemas.results import NodeStatus @@ -112,7 +114,7 @@ def add_column(self, data: PrimitiveDict): table.columns[column.name] = column def make_unique_id_map( - self, manifest: Manifest + self, manifest: Manifest, selected_node_ids: Set[UniqueId] ) -> Tuple[Dict[str, CatalogTable], Dict[str, CatalogTable]]: nodes: Dict[str, CatalogTable] = {} sources: Dict[str, CatalogTable] = {} @@ -123,7 +125,8 @@ def make_unique_id_map( key = table.key() if key in node_map: unique_id = node_map[key] - nodes[unique_id] = table.replace(unique_id=unique_id) + if unique_id in selected_node_ids: + nodes[unique_id] = table.replace(unique_id=unique_id) unique_ids = source_map.get(table.key(), set()) for unique_id in unique_ids: @@ -134,7 +137,8 @@ def make_unique_id_map( table.to_dict(omit_none=True), ) else: - sources[unique_id] = table.replace(unique_id=unique_id) + if unique_id in selected_node_ids: + sources[unique_id] = table.replace(unique_id=unique_id) return nodes, sources @@ -238,6 +242,7 @@ def run(self) -> CatalogArtifact: if self.manifest is None: raise DbtInternalError("self.manifest was None in run!") + selected_node_ids: Set[UniqueId] = set() if self.args.empty_catalog: catalog_table: agate.Table = agate.Table([]) exceptions: List[Exception] = [] @@ -251,14 +256,18 @@ def run(self) -> CatalogArtifact: selected_node_ids = self.job_queue.get_selected_nodes() selected_nodes = self._get_nodes_from_ids(self.manifest, selected_node_ids) - source_ids = self._get_nodes_from_ids( + source_nodes = self._get_nodes_from_ids( self.manifest, self.manifest.sources.keys() ) - selected_nodes.extend(source_ids) + + selected_node_ids.update( + {UniqueId(unique_id) for unique_id in self.manifest.sources.keys()} + ) + selected_nodes.extend(source_nodes) relations = { - adapter.Relation.create_from(adapter.config, node_id) - for node_id in selected_nodes + adapter.Relation.create_from(adapter.config, node) + for node in selected_nodes } # This generates the catalog as an agate.Table @@ -285,7 +294,7 @@ def run(self) -> CatalogArtifact: if exceptions: errors = [str(e) for e in exceptions] - nodes, sources = catalog.make_unique_id_map(self.manifest) + nodes, sources = catalog.make_unique_id_map(self.manifest, selected_node_ids) results = self.get_catalog_results( nodes=nodes, sources=sources, diff --git a/tests/functional/docs/test_generate.py b/tests/functional/docs/test_generate.py index 1047600a820..e1880689e72 100644 --- a/tests/functional/docs/test_generate.py +++ b/tests/functional/docs/test_generate.py @@ -74,6 +74,7 @@ def test_select_limits_no_match(self, project): run_dbt(["run"]) catalog = run_dbt(["docs", "generate", "--select", "my_missing_model"]) assert len(catalog.nodes) == 0 + assert len(catalog.sources) == 0 class TestGenerateCatalogWithSources(TestBaseGenerate): @@ -90,11 +91,21 @@ def test_catalog_with_sources(self, project): class TestGenerateSelectSource(TestBaseGenerate): def test_select_source(self, project): run_dbt(["build"]) - catalog = run_dbt(["docs", "generate", "--select", "source:test.my_seed.sample_seed"]) - # 2 seeds - # TODO: Filtering doesn't work for seeds - assert len(catalog.nodes) == 2 # 2 sources - # TODO: Filtering doesn't work for sources + catalog = run_dbt(["docs", "generate", "--select", "source:test.my_seed.sample_seed"]) + # TODO: Filtering doesn't work for sources (should be 1) + assert len(catalog.sources) == 2 + + +class TestGenerateSelectSeed(TestBaseGenerate): + def test_select_seed(self, project): + run_dbt(["build"]) + + # 2 seeds, 1 selected + catalog = run_dbt(["docs", "generate", "--select", "sample_seed"]) + assert len(catalog.nodes) == 1 + assert "seed.test.sample_seed" in catalog.nodes + + # TODO: Filtering doesn't work for sources (should be 0) assert len(catalog.sources) == 2 From 5149cc4cbe8125ac654de432fd1725cc00e93f54 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Thu, 25 Jan 2024 15:54:49 -0500 Subject: [PATCH 2/7] fix source selection in docs generate: apply source selection --- core/dbt/task/docs/generate.py | 56 ++++++++++++++++---------- tests/functional/docs/test_generate.py | 13 +++--- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/core/dbt/task/docs/generate.py b/core/dbt/task/docs/generate.py index 2d296b225e5..efc6b0bde78 100644 --- a/core/dbt/task/docs/generate.py +++ b/core/dbt/task/docs/generate.py @@ -32,7 +32,7 @@ from dbt_common.exceptions import DbtInternalError from dbt.exceptions import AmbiguousCatalogMatchError from dbt.graph import ResourceTypeSelector -from dbt.node_types import EXECUTABLE_NODE_TYPES +from dbt.node_types import EXECUTABLE_NODE_TYPES, NodeType from dbt_common.events.functions import fire_event from dbt.adapters.events.types import ( WriteCatalogFailure, @@ -256,14 +256,15 @@ def run(self) -> CatalogArtifact: selected_node_ids = self.job_queue.get_selected_nodes() selected_nodes = self._get_nodes_from_ids(self.manifest, selected_node_ids) - source_nodes = self._get_nodes_from_ids( - self.manifest, self.manifest.sources.keys() + # Source selection is handled separately from main job_queue selection because + # SourceDefinition nodes cannot be safely compiled / run by the CompileRunner / CompileTask, + # but can still be included in the catalog + selected_source_ids = self._get_selected_source_ids() + selected_source_nodes = self._get_nodes_from_ids( + self.manifest, selected_source_ids ) - - selected_node_ids.update( - {UniqueId(unique_id) for unique_id in self.manifest.sources.keys()} - ) - selected_nodes.extend(source_nodes) + selected_node_ids.update(selected_source_ids) + selected_nodes.extend(selected_source_nodes) relations = { adapter.Relation.create_from(adapter.config, node) @@ -331,19 +332,6 @@ def run(self) -> CatalogArtifact: fire_event(CatalogWritten(path=os.path.abspath(catalog_path))) return results - @staticmethod - def _get_nodes_from_ids(manifest: Manifest, node_ids: Iterable[str]) -> List[ResultNode]: - selected: List[ResultNode] = [] - for unique_id in node_ids: - if unique_id in manifest.nodes: - node = manifest.nodes[unique_id] - if node.is_relational and not node.is_ephemeral_model: - selected.append(node) - elif unique_id in manifest.sources: - source = manifest.sources[unique_id] - selected.append(source) - return selected - def get_node_selector(self) -> ResourceTypeSelector: if self.manifest is None or self.graph is None: raise DbtInternalError("manifest and graph must be set to perform node selection") @@ -381,3 +369,29 @@ def interpret_results(self, results: Optional[CatalogResults]) -> bool: return True return super().interpret_results(compile_results) + + @staticmethod + def _get_nodes_from_ids(manifest: Manifest, node_ids: Iterable[str]) -> List[ResultNode]: + selected: List[ResultNode] = [] + for unique_id in node_ids: + if unique_id in manifest.nodes: + node = manifest.nodes[unique_id] + if node.is_relational and not node.is_ephemeral_model: + selected.append(node) + elif unique_id in manifest.sources: + source = manifest.sources[unique_id] + selected.append(source) + return selected + + def _get_selected_source_ids(self) -> Set[UniqueId]: + if self.manifest is None or self.graph is None: + raise DbtInternalError("manifest and graph must be set to perform node selection") + + source_selector = ResourceTypeSelector( + graph=self.graph, + manifest=self.manifest, + previous_state=self.previous_state, + resource_types=[NodeType.Source], + ) + + return source_selector.get_graph_queue(self.get_selection_spec()).get_selected_nodes() diff --git a/tests/functional/docs/test_generate.py b/tests/functional/docs/test_generate.py index e1880689e72..a62fab73f82 100644 --- a/tests/functional/docs/test_generate.py +++ b/tests/functional/docs/test_generate.py @@ -92,10 +92,12 @@ class TestGenerateSelectSource(TestBaseGenerate): def test_select_source(self, project): run_dbt(["build"]) - # 2 sources + # 2 sources, 1 selected catalog = run_dbt(["docs", "generate", "--select", "source:test.my_seed.sample_seed"]) - # TODO: Filtering doesn't work for sources (should be 1) - assert len(catalog.sources) == 2 + assert len(catalog.sources) == 1 + assert "test.my_seed.sample_seed" in catalog.sources + # no nodes selected + assert len(catalog.nodes) == 0 class TestGenerateSelectSeed(TestBaseGenerate): @@ -106,6 +108,5 @@ def test_select_seed(self, project): catalog = run_dbt(["docs", "generate", "--select", "sample_seed"]) assert len(catalog.nodes) == 1 assert "seed.test.sample_seed" in catalog.nodes - - # TODO: Filtering doesn't work for sources (should be 0) - assert len(catalog.sources) == 2 + # no sources selected + assert len(catalog.sources) == 0 From 518e3bcb21febc534a7fa7715ff64d16ed2e8af6 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Thu, 25 Jan 2024 15:56:48 -0500 Subject: [PATCH 3/7] changelog entry --- .changes/unreleased/Fixes-20240125-155641.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Fixes-20240125-155641.yaml diff --git a/.changes/unreleased/Fixes-20240125-155641.yaml b/.changes/unreleased/Fixes-20240125-155641.yaml new file mode 100644 index 00000000000..c5e3b45d06f --- /dev/null +++ b/.changes/unreleased/Fixes-20240125-155641.yaml @@ -0,0 +1,6 @@ +kind: Fixes +body: Fix seed and source selection in `dbt docs generate` +time: 2024-01-25T15:56:41.557934-05:00 +custom: + Author: michelleark + Issue: "9161" From cd2bafc82fdc65e015cc1873b96e1d9e4f647b36 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Thu, 25 Jan 2024 16:07:28 -0500 Subject: [PATCH 4/7] more precise assertion in TestGenerateSelectSource.test_select_source --- tests/functional/docs/test_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/docs/test_generate.py b/tests/functional/docs/test_generate.py index a62fab73f82..3ba7c8929e0 100644 --- a/tests/functional/docs/test_generate.py +++ b/tests/functional/docs/test_generate.py @@ -95,7 +95,7 @@ def test_select_source(self, project): # 2 sources, 1 selected catalog = run_dbt(["docs", "generate", "--select", "source:test.my_seed.sample_seed"]) assert len(catalog.sources) == 1 - assert "test.my_seed.sample_seed" in catalog.sources + assert "source.test.my_seed.sample_seed" in catalog.sources # no nodes selected assert len(catalog.nodes) == 0 From 5634453018853d12530b6167609448e0c699eba5 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Thu, 25 Jan 2024 16:10:54 -0500 Subject: [PATCH 5/7] restore default behaviour for make_unique_id_map --- core/dbt/task/docs/generate.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/core/dbt/task/docs/generate.py b/core/dbt/task/docs/generate.py index efc6b0bde78..f56868093cb 100644 --- a/core/dbt/task/docs/generate.py +++ b/core/dbt/task/docs/generate.py @@ -114,8 +114,13 @@ def add_column(self, data: PrimitiveDict): table.columns[column.name] = column def make_unique_id_map( - self, manifest: Manifest, selected_node_ids: Set[UniqueId] + self, manifest: Manifest, selected_node_ids: Optional[Set[UniqueId]] = None ) -> Tuple[Dict[str, CatalogTable], Dict[str, CatalogTable]]: + """ + Create mappings between CatalogKeys and CatalogTables for nodes and sources, filteded by selected_node_ids. + + By default, selected_node_ids is None and all nodes and sources defined in the manifest are included in the mappings. + """ nodes: Dict[str, CatalogTable] = {} sources: Dict[str, CatalogTable] = {} @@ -125,7 +130,7 @@ def make_unique_id_map( key = table.key() if key in node_map: unique_id = node_map[key] - if unique_id in selected_node_ids: + if selected_node_ids is None or unique_id in selected_node_ids: nodes[unique_id] = table.replace(unique_id=unique_id) unique_ids = source_map.get(table.key(), set()) @@ -137,7 +142,7 @@ def make_unique_id_map( table.to_dict(omit_none=True), ) else: - if unique_id in selected_node_ids: + if selected_node_ids is None or unique_id in selected_node_ids: sources[unique_id] = table.replace(unique_id=unique_id) return nodes, sources @@ -242,10 +247,11 @@ def run(self) -> CatalogArtifact: if self.manifest is None: raise DbtInternalError("self.manifest was None in run!") - selected_node_ids: Set[UniqueId] = set() + selected_node_ids: Optional[Set[UniqueId]] = None if self.args.empty_catalog: catalog_table: agate.Table = agate.Table([]) exceptions: List[Exception] = [] + selected_node_ids = set() else: adapter = get_adapter(self.config) with adapter.connection_named("generate_catalog"): From e6ffbfd68be34fa57241c678986139f856a11fc9 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Mon, 29 Jan 2024 17:59:31 -0500 Subject: [PATCH 6/7] typos --- core/dbt/task/docs/generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/dbt/task/docs/generate.py b/core/dbt/task/docs/generate.py index f56868093cb..1712c17a9d0 100644 --- a/core/dbt/task/docs/generate.py +++ b/core/dbt/task/docs/generate.py @@ -117,7 +117,7 @@ def make_unique_id_map( self, manifest: Manifest, selected_node_ids: Optional[Set[UniqueId]] = None ) -> Tuple[Dict[str, CatalogTable], Dict[str, CatalogTable]]: """ - Create mappings between CatalogKeys and CatalogTables for nodes and sources, filteded by selected_node_ids. + Create mappings between CatalogKeys and CatalogTables for nodes and sources, filtered by selected_node_ids. By default, selected_node_ids is None and all nodes and sources defined in the manifest are included in the mappings. """ @@ -264,7 +264,7 @@ def run(self) -> CatalogArtifact: # Source selection is handled separately from main job_queue selection because # SourceDefinition nodes cannot be safely compiled / run by the CompileRunner / CompileTask, - # but can still be included in the catalog + # but should still be included in the catalog based on the selection spec selected_source_ids = self._get_selected_source_ids() selected_source_nodes = self._get_nodes_from_ids( self.manifest, selected_source_ids From 2b4b810db680f8807fdab86ef1886ada91c76576 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Mon, 29 Jan 2024 18:16:26 -0500 Subject: [PATCH 7/7] improve test setup to decouple seeds & sources + explicit testing for node and source w same relation --- core/dbt/task/docs/generate.py | 5 +-- tests/functional/docs/test_generate.py | 62 ++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/core/dbt/task/docs/generate.py b/core/dbt/task/docs/generate.py index 5043a1d54cb..36fa394d041 100644 --- a/core/dbt/task/docs/generate.py +++ b/core/dbt/task/docs/generate.py @@ -141,9 +141,8 @@ def make_unique_id_map( sources[unique_id].to_dict(omit_none=True), table.to_dict(omit_none=True), ) - else: - if selected_node_ids is None or unique_id in selected_node_ids: - sources[unique_id] = table.replace(unique_id=unique_id) + elif selected_node_ids is None or unique_id in selected_node_ids: + sources[unique_id] = table.replace(unique_id=unique_id) return nodes, sources diff --git a/tests/functional/docs/test_generate.py b/tests/functional/docs/test_generate.py index de9163ca743..4129fc0abb5 100644 --- a/tests/functional/docs/test_generate.py +++ b/tests/functional/docs/test_generate.py @@ -18,12 +18,13 @@ sample_config = """ sources: - - name: my_seed + - name: my_source_schema schema: "{{ target.schema }}" tables: - - name: sample_seed - - name: second_seed - - name: fake_seed + - name: sample_source + - name: second_source + - name: non_existent_source + - name: source_from_seed """ @@ -81,7 +82,13 @@ def test_select_limits_no_match(self, project): class TestGenerateCatalogWithSources(TestBaseGenerate): def test_catalog_with_sources(self, project): + # populate sources other than non_existent_source + project.run_sql("create table {}.sample_source (id int)".format(project.test_schema)) + project.run_sql("create table {}.second_source (id int)".format(project.test_schema)) + + # build nodes run_dbt(["build"]) + catalog = run_dbt(["docs", "generate"]) # 2 seeds + 2 models @@ -92,7 +99,7 @@ def test_catalog_with_sources(self, project): class TestGenerateCatalogWithExternalNodes(TestBaseGenerate): @mock.patch("dbt.plugins.get_plugin_manager") - def test_catalog_with_sources(self, get_plugin_manager, project): + def test_catalog_with_external_node(self, get_plugin_manager, project): project.run_sql("create table {}.external_model (id int)".format(project.test_schema)) run_dbt(["build"]) @@ -113,24 +120,61 @@ def test_catalog_with_sources(self, get_plugin_manager, project): class TestGenerateSelectSource(TestBaseGenerate): + @pytest.fixture(scope="class") + def seeds(self): + return { + "sample_seed.csv": sample_seed, + "second_seed.csv": sample_seed, + "source_from_seed.csv": sample_seed, + } + def test_select_source(self, project): run_dbt(["build"]) - # 2 sources, 1 selected - catalog = run_dbt(["docs", "generate", "--select", "source:test.my_seed.sample_seed"]) + project.run_sql("create table {}.sample_source (id int)".format(project.test_schema)) + project.run_sql("create table {}.second_source (id int)".format(project.test_schema)) + + # 2 existing sources, 1 selected + catalog = run_dbt( + ["docs", "generate", "--select", "source:test.my_source_schema.sample_source"] + ) assert len(catalog.sources) == 1 - assert "source.test.my_seed.sample_seed" in catalog.sources + assert "source.test.my_source_schema.sample_source" in catalog.sources # no nodes selected assert len(catalog.nodes) == 0 + # 2 existing sources sources, 1 selected that has relation as a seed + catalog = run_dbt( + ["docs", "generate", "--select", "source:test.my_source_schema.source_from_seed"] + ) + assert len(catalog.sources) == 1 + assert "source.test.my_source_schema.source_from_seed" in catalog.sources + # seed with same relation that was not selected not in catalog + assert len(catalog.nodes) == 0 + class TestGenerateSelectSeed(TestBaseGenerate): + @pytest.fixture(scope="class") + def seeds(self): + return { + "sample_seed.csv": sample_seed, + "second_seed.csv": sample_seed, + "source_from_seed.csv": sample_seed, + } + def test_select_seed(self, project): run_dbt(["build"]) - # 2 seeds, 1 selected + # 3 seeds, 1 selected catalog = run_dbt(["docs", "generate", "--select", "sample_seed"]) assert len(catalog.nodes) == 1 assert "seed.test.sample_seed" in catalog.nodes # no sources selected assert len(catalog.sources) == 0 + + # 3 seeds, 1 selected that has same relation as a source + catalog = run_dbt(["docs", "generate", "--select", "source_from_seed"]) + assert len(catalog.nodes) == 1 + assert "seed.test.source_from_seed" in catalog.nodes + # source with same relation that was not selected not in catalog + assert len(catalog.sources) == 0