fix: Catalog init introduces significant overhead (#1270)

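Fixes #1268. Moved the function-cost instrumentation out of the per-batch path: each executor now persists function expression stats to the catalog after it has finished producing batches, instead of creating a catalog instance and upserting on every batch. <img width="1840" alt="image" src="https://github.com/georgia-tech-db/evadb/assets/12206234/3d770689-deff-4408-bb64-9df320a95fa1"> <img width="1832" alt="image" src="https://github.com/georgia-tech-db/evadb/assets/12206234/969ed19b-0985-4feb-ac87-045542a7b485">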
georgia-tech-db · Oct 9, 2023 · 18bc547 · 18bc547
1 parent 3ea2f8a
commit 18bc547
Show file tree

Hide file tree

Showing 11 changed files with 89 additions and 109 deletions.
diff --git a/evadb/executor/abstract_executor.py b/evadb/executor/abstract_executor.py
@@ -39,6 +39,7 @@ def __init__(self, db: EvaDBDatabase, node: AbstractPlan):
         self._config: ConfigurationManager = db.config if db else None
         self._children = []
 
+    # @lru_cache(maxsize=None)
     def catalog(self) -> "CatalogManager":
         """The object is intentionally generated on demand to prevent serialization issues. Having a SQLAlchemy object as a member variable can cause problems with multiprocessing. See get_catalog_instance()"""
         return self._db.catalog() if self._db else None

diff --git a/evadb/executor/apply_and_merge_executor.py b/evadb/executor/apply_and_merge_executor.py
@@ -16,6 +16,7 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
+from evadb.executor.executor_utils import instrument_function_expression_cost
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.apply_and_merge_plan import ApplyAndMergePlan
 
@@ -42,19 +43,13 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]:
         for batch in child_executor.exec(**kwargs):
             func_result = self.func_expr.evaluate(batch)
 
-            # persist stats of function expression
-            if self.func_expr.function_obj and self.func_expr._stats:
-                function_id = self.func_expr.function_obj.row_id
-                self.catalog().upsert_function_cost_catalog_entry(
-                    function_id,
-                    self.func_expr.function_obj.name,
-                    self.func_expr._stats.prev_cost,
-                )
-
             output = Batch.merge_column_wise([batch, func_result])
             if self.do_unnest:
                 output.unnest(func_result.columns)
                 # we reset the index as after unnest there can be duplicate index
                 output.reset_index()
 
             yield output
+
+        # persist stats of function expression
+        instrument_function_expression_cost(self.func_expr, self.catalog())
diff --git a/evadb/executor/executor_utils.py b/evadb/executor/executor_utils.py
@@ -15,7 +15,7 @@
 import glob
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Generator, List
+from typing import TYPE_CHECKING, Generator, List, Union
 
 from evadb.catalog.catalog_utils import xform_column_definitions_to_catalog_entries
 from evadb.catalog.models.utils import TableCatalogEntry
@@ -39,41 +39,45 @@ class ExecutorError(Exception):
     pass
 
 
-def apply_project(
-    batch: Batch, project_list: List[AbstractExpression], catalog: "CatalogManager"
+def instrument_function_expression_cost(
+    expr: Union[AbstractExpression, List[AbstractExpression]],
+    catalog: "CatalogManager",
 ):
+    """We are expecting an instance of a catalog. An optimization can be to avoid creating a catalog instance if there is no function expression. An easy fix is to pass the function handler and create the catalog instance only if there is a function expression. In the past, this was problematic because of Ray. We can revisit it again."""
+
+    if expr is None:
+        return
+
+    list_expr = expr
+    if not isinstance(expr, list):
+        list_expr = [expr]
+
+    # persist stats of function expression
+    for expr in list_expr:
+        for func_expr in expr.find_all(FunctionExpression):
+            if func_expr.function_obj and func_expr._stats:
+                function_id = func_expr.function_obj.row_id
+                catalog.upsert_function_cost_catalog_entry(
+                    function_id,
+                    func_expr.function_obj.name,
+                    func_expr._stats.prev_cost,
+                )
+
+
+def apply_project(batch: Batch, project_list: List[AbstractExpression]):
     if not batch.empty() and project_list:
         batches = [expr.evaluate(batch) for expr in project_list]
         batch = Batch.merge_column_wise(batches)
 
-        # persist stats of function expression
-        for expr in project_list:
-            for func_expr in expr.find_all(FunctionExpression):
-                if func_expr.function_obj and func_expr._stats:
-                    function_id = func_expr.function_obj.row_id
-                    catalog.upsert_function_cost_catalog_entry(
-                        function_id,
-                        func_expr.function_obj.name,
-                        func_expr._stats.prev_cost,
-                    )
     return batch
 
 
-def apply_predicate(
-    batch: Batch, predicate: AbstractExpression, catalog: "CatalogManager"
-) -> Batch:
+def apply_predicate(batch: Batch, predicate: AbstractExpression) -> Batch:
     if not batch.empty() and predicate is not None:
         outcomes = predicate.evaluate(batch)
         batch.drop_zero(outcomes)
         batch.reset_index()
 
-        # persist stats of function expression
-        for func_expr in predicate.find_all(FunctionExpression):
-            if func_expr.function_obj and func_expr._stats:
-                function_id = func_expr.function_obj.row_id
-                catalog.upsert_function_cost_catalog_entry(
-                    function_id, func_expr.function_obj.name, func_expr._stats.prev_cost
-                )
     return batch
 
 

diff --git a/evadb/executor/function_scan_executor.py b/evadb/executor/function_scan_executor.py
@@ -16,6 +16,7 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
+from evadb.executor.executor_utils import instrument_function_expression_cost
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.function_scan_plan import FunctionScanPlan
 
@@ -41,17 +42,11 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]:
         if not lateral_input.empty():
             res = self.func_expr.evaluate(lateral_input)
 
-            # persist stats of function expression
-            if self.func_expr.function_obj and self.func_expr._stats:
-                function_id = self.func_expr.function_obj.row_id
-                self.catalog().upsert_function_cost_catalog_entry(
-                    function_id,
-                    self.func_expr.function_obj.name,
-                    self.func_expr._stats.prev_cost,
-                )
-
             if not res.empty():
                 if self.do_unnest:
                     res.unnest(res.columns)
 
                 yield res
+
+            # persist stats of function expression
+            instrument_function_expression_cost(self.func_expr, self.catalog())
diff --git a/evadb/executor/hash_join_executor.py b/evadb/executor/hash_join_executor.py
@@ -16,7 +16,11 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
-from evadb.executor.executor_utils import apply_predicate, apply_project
+from evadb.executor.executor_utils import (
+    apply_predicate,
+    apply_project,
+    instrument_function_expression_cost,
+)
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.hash_join_probe_plan import HashJoinProbePlan
 
@@ -38,8 +42,12 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]:
                 probe_batch.reassign_indices_to_hash(hash_keys)
                 join_batch = Batch.join(probe_batch, build_batch)
                 join_batch.reset_index()
-                join_batch = apply_predicate(join_batch, self.predicate, self.catalog())
-                join_batch = apply_project(
-                    join_batch, self.join_project, self.catalog()
-                )
+                join_batch = apply_predicate(join_batch, self.predicate)
+                join_batch = apply_project(join_batch, self.join_project)
                 yield join_batch
+
+        # instrument required stats
+        if self.predicate or self.join_project:
+            catalog = self.catalog()
+            instrument_function_expression_cost(self.predicate, catalog)
+            instrument_function_expression_cost(self.join_project, catalog)
diff --git a/evadb/executor/lateral_join_executor.py b/evadb/executor/lateral_join_executor.py
diff --git a/evadb/executor/nested_loop_join_executor.py b/evadb/executor/nested_loop_join_executor.py
@@ -16,7 +16,10 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
-from evadb.executor.executor_utils import apply_predicate
+from evadb.executor.executor_utils import (
+    apply_predicate,
+    instrument_function_expression_cost,
+)
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.nested_loop_join_plan import NestedLoopJoinPlan
 
@@ -33,8 +36,10 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]:
             for row2 in inner.exec(**kwargs):
                 result_batch = Batch.join(row1, row2)
                 result_batch.reset_index()
-                result_batch = apply_predicate(
-                    result_batch, self.predicate, self.catalog()
-                )
+                result_batch = apply_predicate(result_batch, self.predicate)
                 if not result_batch.empty():
                     yield result_batch
+
+        # instrument required stats
+        if self.predicate:
+            instrument_function_expression_cost(self.predicate, self.catalog())
diff --git a/evadb/executor/plan_executor.py b/evadb/executor/plan_executor.py
@@ -31,7 +31,6 @@
 from evadb.executor.hash_join_executor import HashJoinExecutor
 from evadb.executor.insert_executor import InsertExecutor
 from evadb.executor.join_build_executor import BuildJoinExecutor
-from evadb.executor.lateral_join_executor import LateralJoinExecutor
 from evadb.executor.limit_executor import LimitExecutor
 from evadb.executor.load_executor import LoadDataExecutor
 from evadb.executor.nested_loop_join_executor import NestedLoopJoinExecutor
@@ -128,12 +127,6 @@ def _build_execution_tree(
             executor_node = SampleExecutor(db=self._db, node=plan)
         elif plan_opr_type == PlanOprType.NESTED_LOOP_JOIN:
             executor_node = NestedLoopJoinExecutor(db=self._db, node=plan)
-        elif plan_opr_type == PlanOprType.LATERAL_JOIN:
-            logger.warn(
-                "LateralJoin Executor should not be part of the execution plan."
-                "Please raise an issue with the current query. Thanks!"
-            )
-            executor_node = LateralJoinExecutor(db=self._db, node=plan)
         elif plan_opr_type == PlanOprType.HASH_JOIN:
             executor_node = HashJoinExecutor(db=self._db, node=plan)
         elif plan_opr_type == PlanOprType.HASH_BUILD:

diff --git a/evadb/executor/predicate_executor.py b/evadb/executor/predicate_executor.py
@@ -16,7 +16,10 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
-from evadb.executor.executor_utils import apply_predicate
+from evadb.executor.executor_utils import (
+    apply_predicate,
+    instrument_function_expression_cost,
+)
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.predicate_plan import PredicatePlan
 
@@ -31,6 +34,9 @@ def __init__(self, db: EvaDBDatabase, node: PredicatePlan):
     def exec(self, *args, **kwargs) -> Iterator[Batch]:
         child_executor = self.children[0]
         for batch in child_executor.exec(**kwargs):
-            batch = apply_predicate(batch, self.predicate, self.catalog())
+            batch = apply_predicate(batch, self.predicate)
             if not batch.empty():
                 yield batch
+
+        # perform any required instrumentation before we return
+        instrument_function_expression_cost(self.predicate, self.catalog())
diff --git a/evadb/executor/project_executor.py b/evadb/executor/project_executor.py
@@ -18,7 +18,11 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
-from evadb.executor.executor_utils import ExecutorError, apply_project
+from evadb.executor.executor_utils import (
+    ExecutorError,
+    apply_project,
+    instrument_function_expression_cost,
+)
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.project_plan import ProjectPlan
 
@@ -35,15 +39,18 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]:
         if len(self.children) == 0:
             # Create a dummy batch with size 1
             dummy_batch = Batch(pd.DataFrame([0]))
-            batch = apply_project(dummy_batch, self.target_list, self.catalog())
+            batch = apply_project(dummy_batch, self.target_list)
             if not batch.empty():
                 yield batch
         # SELECT expr FROM table;
         elif len(self.children) == 1:
             child_executor = self.children[0]
             for batch in child_executor.exec(**kwargs):
-                batch = apply_project(batch, self.target_list, self.catalog())
+                batch = apply_project(batch, self.target_list)
                 if not batch.empty():
                     yield batch
         else:
             raise ExecutorError("ProjectExecutor has more than 1 children.")
+
+        # instrument required stats
+        instrument_function_expression_cost(self.target_list, self.catalog())
diff --git a/evadb/executor/seq_scan_executor.py b/evadb/executor/seq_scan_executor.py
@@ -16,7 +16,11 @@
 
 from evadb.database import EvaDBDatabase
 from evadb.executor.abstract_executor import AbstractExecutor
-from evadb.executor.executor_utils import apply_predicate, apply_project
+from evadb.executor.executor_utils import (
+    apply_predicate,
+    apply_project,
+    instrument_function_expression_cost,
+)
 from evadb.models.storage.batch import Batch
 from evadb.plan_nodes.seq_scan_plan import SeqScanPlan
 
@@ -44,9 +48,15 @@ def exec(self, *args, **kwargs) -> Iterator[Batch]:
                 batch.modify_column_alias(self.alias)
 
             # We do the predicate first
-            batch = apply_predicate(batch, self.predicate, self.catalog())
+            batch = apply_predicate(batch, self.predicate)
             # Then do project
-            batch = apply_project(batch, self.project_expr, self.catalog())
+            batch = apply_project(batch, self.project_expr)
 
             if not batch.empty():
                 yield batch
+
+        # instrument required stats
+        if self.predicate or self.project_expr:
+            catalog = self.catalog()
+            instrument_function_expression_cost(self.predicate, catalog)
+            instrument_function_expression_cost(self.project_expr, catalog)