perf: Utilize ORDER BY LIMIT over ROW_NUMBER where possible #1077

Merged (11 commits, Oct 18, 2024)
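For context, the optimization targets plans like a sorted head(): previously such plans compiled to a ROW_NUMBER() window function plus a WHERE filter, and after this change they can compile to a plain ORDER BY ... LIMIT. A minimal usage sketch (the project, table, and column names are illustrative, not from this PR):

```python
import bigframes.pandas as bpd

# An ordered head() is the canonical shape this PR speeds up.
df = bpd.read_gbq("my-project.my_dataset.events")  # illustrative table name
top5 = df.sort_values("ts").head(5)
top5.to_pandas()  # can now compile to `ORDER BY ts LIMIT 5` instead of a window filter
```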
11 changes: 9 additions & 2 deletions bigframes/core/compile/api.py
@@ -18,6 +18,7 @@
import google.cloud.bigquery as bigquery

import bigframes.core.compile.compiler as compiler
import bigframes.core.rewrite as rewrites

if TYPE_CHECKING:
import bigframes.core.nodes
@@ -42,6 +43,11 @@ def compile_unordered(
col_id_overrides: Mapping[str, str] = {},
) -> str:
"""Compile node into sql where rows are unsorted, and no ordering information is preserved."""
new_node, limit = rewrites.pullup_limit_from_slice(node)
if limit is not None:
return self._compiler.compile_ordered_ir(new_node).to_sql(
col_id_overrides=col_id_overrides, ordered=True, limit=limit
)
return self._compiler.compile_unordered_ir(node).to_sql(
col_id_overrides=col_id_overrides
)
@@ -53,8 +59,9 @@ def compile_ordered(
col_id_overrides: Mapping[str, str] = {},
) -> str:
"""Compile node into sql where rows are sorted with ORDER BY."""
return self._compiler.compile_ordered_ir(node).to_sql(
col_id_overrides=col_id_overrides, ordered=True
new_node, limit = rewrites.pullup_limit_from_slice(node)
return self._compiler.compile_ordered_ir(new_node).to_sql(
col_id_overrides=col_id_overrides, ordered=True, limit=limit
)

def compile_raw(
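Note the asymmetry in compile_unordered above: once a limit has been pulled above the slice, which rows survive depends on ordering, so the method falls back to the ordered compile path and threads the extracted limit through. The two SQL shapes being traded look roughly like this (hand-written SQL for illustration, not actual compiler output):

```python
# Hand-written BigQuery SQL sketching the two forms; real compiler output differs in detail.
row_number_form = """
SELECT * EXCEPT (rn)
FROM (
  SELECT *, ROW_NUMBER() OVER (ORDER BY ts) AS rn
  FROM dataset.t
)
WHERE rn <= 5
"""

order_by_limit_form = """
SELECT *
FROM dataset.t
ORDER BY ts
LIMIT 5
"""
```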
8 changes: 6 additions & 2 deletions bigframes/core/compile/compiled.py
@@ -943,8 +943,9 @@ def to_sql(
self,
col_id_overrides: typing.Mapping[str, str] = {},
ordered: bool = False,
limit: Optional[int] = None,
) -> str:
if ordered:
if ordered or limit:
# Need to bake ordering expressions into the selected column in order for our ordering clause builder to work.
baked_ir = self._bake_ordering()
sql = ibis_bigquery.Backend().compile(
@@ -969,7 +970,10 @@
order_by_clause = bigframes.core.sql.ordering_clause(
baked_ir._ordering.all_ordering_columns
)
sql += f"{order_by_clause}\n"
sql += f"\n{order_by_clause}"
if limit is not None:
assert isinstance(limit, int)
Collaborator: Should we raise a TypeError here instead of an assertion error?

I don't know how well our public APIs are validating the type of parameters. Not everyone is going to use a type checker.

Contributor (author): Yeah, in general it is probably best to assume that wrong types can sometimes reach even the lower levels of the code base. In particular, we want to be safe about what we are putting into the SQL strings. Switched to TypeError here.

sql += f"\nLIMIT {limit}"
else:
sql = ibis_bigquery.Backend().compile(
self._to_ibis_expr(
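The two string-building changes above (the relocated newline and the appended LIMIT) are easiest to see in isolation. A standalone sketch of the final assembly, including the TypeError guard the review above settled on (plain Python, not the bigframes internals; the helper name is made up):

```python
from typing import Optional

def append_ordering(sql: str, order_by_clause: str, limit: Optional[int]) -> str:
    # Newline goes *before* the clause so ORDER BY starts on its own line,
    # regardless of how the compiled SELECT ended.
    sql += f"\n{order_by_clause}"
    if limit is not None:
        # Validate the value before interpolating it into the SQL string.
        if not isinstance(limit, int):
            raise TypeError(f"limit must be an int, got {type(limit)}")
        sql += f"\nLIMIT {limit}"
    return sql

print(append_ordering("SELECT * FROM t", "ORDER BY ts", 5))
# SELECT * FROM t
# ORDER BY ts
# LIMIT 5
```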
19 changes: 16 additions & 3 deletions bigframes/core/compile/compiler.py
@@ -36,6 +36,7 @@
import bigframes.core.identifiers as ids
import bigframes.core.nodes as nodes
import bigframes.core.ordering as bf_ordering
import bigframes.core.rewrite as rewrites

if typing.TYPE_CHECKING:
import bigframes.core
@@ -48,20 +49,32 @@ class Compiler:
# In unstrict mode, ordering from ReadTable or after joins may be left ambiguous to improve query performance.
strict: bool = True
scalar_op_compiler = compile_scalar.ScalarOpCompiler()
enable_pruning: bool = False

def _preprocess(self, node: nodes.BigFrameNode):
if self.enable_pruning:
used_fields = frozenset(field.id for field in node.fields)
node = node.prune(used_fields)
node = functools.cache(rewrites.replace_slice_ops)(node)
return node

def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR:
ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True))
ir = typing.cast(
compiled.OrderedIR, self.compile_node(self._preprocess(node), True)
)
if self.strict:
assert ir.has_total_order
return ir

def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR:
return typing.cast(compiled.UnorderedIR, self.compile_node(node, False))
return typing.cast(
compiled.UnorderedIR, self.compile_node(self._preprocess(node), False)
)

def compile_peak_sql(
self, node: nodes.BigFrameNode, n_rows: int
) -> typing.Optional[str]:
return self.compile_unordered_ir(node).peek_sql(n_rows)
return self.compile_unordered_ir(self._preprocess(node)).peek_sql(n_rows)

# TODO: Remove cache when schema no longer requires compilation to derive schema (and therefore only compiles for execution)
@functools.lru_cache(maxsize=5000)
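The new _preprocess hook centralizes the plan rewrites that previously ran in the executor (see the bigframes/session/executor.py hunk below): optional column pruning, then slice replacement, applied uniformly to the ordered, unordered, and peek entry points. A toy model of that pipeline with stand-in types (not bigframes internals):

```python
from dataclasses import dataclass, replace

@dataclass(frozen=True)
class ToyPlan:
    columns: frozenset
    has_slice: bool = False

def prune(plan: ToyPlan, used: frozenset) -> ToyPlan:
    # Drop columns nothing downstream reads.
    return replace(plan, columns=plan.columns & used)

def replace_slice_ops(plan: ToyPlan) -> ToyPlan:
    # Stand-in for the ROW_NUMBER()-based slice rewrite.
    return replace(plan, has_slice=False)

def preprocess(plan: ToyPlan, enable_pruning: bool = False) -> ToyPlan:
    if enable_pruning:
        plan = prune(plan, plan.columns)  # the real code derives `used` from the plan
    return replace_slice_ops(plan)

print(preprocess(ToyPlan(frozenset({"a", "b"}), has_slice=True), enable_pruning=True))
```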
38 changes: 38 additions & 0 deletions bigframes/core/rewrite.py
@@ -385,6 +385,44 @@ def common_selection_root(
return None


Contributor (inline review on pullup_limit_from_slice): Are we able to have a test to verify this process?

def pullup_limit_from_slice(
    root: nodes.BigFrameNode,
) -> Tuple[nodes.BigFrameNode, Optional[int]]:
"""
This is a BQ-sql specific optimization that can be helpful as ORDER BY LIMIT is more efficient than ROW_NUMBER() + WHERE.
"""
    if isinstance(root, nodes.SliceNode):
        # head case: slice(None, n, 1) with n > 0 is equivalent to LIMIT n
        if (
            (not root.start)
            and ((root.stop is not None) and root.stop > 0)
            and (root.step == 1)
        ):
            limit = root.stop
            new_root, prior_limit = pullup_limit_from_slice(root.child)
            if prior_limit is not None and prior_limit < limit:
                limit = prior_limit
            return new_root, limit
        # tail case: slice(None, -n, -1) is equivalent to a reversal, then LIMIT n
        if (
            (root.start in [None, -1])
            and ((root.stop is not None) and root.stop < 0)
            and (root.step == -1)
        ):
            limit = -root.stop
            new_root, prior_limit = pullup_limit_from_slice(root.child)
            if prior_limit is not None and prior_limit < limit:
                limit = prior_limit
            return nodes.ReversedNode(new_root), limit
    elif isinstance(root, nodes.UnaryNode) and root.row_preserving:
        new_child, limit = pullup_limit_from_slice(root.child)
        if limit is not None:
            return root.transform_children(lambda _: new_child), limit
    # Many ops don't support pulling the limit up through them: filter, agg, join, etc.
    return root, None
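Picking up the review question above: the rewrite can be verified at the plan-node level, without compiling to SQL at all. A rough sketch of such a test (the fixture and the exact SliceNode constructor arguments are assumptions, not from this PR):

```python
import bigframes.core.nodes as nodes
import bigframes.core.rewrite as rewrites

def test_pullup_limit_from_slice(leaf):  # `leaf` is a hypothetical plan fixture
    # Head case, df[:5]: the slice disappears and a limit of 5 is extracted.
    head = nodes.SliceNode(leaf, start=None, stop=5, step=1)
    new_root, limit = rewrites.pullup_limit_from_slice(head)
    assert limit == 5
    assert new_root == leaf

    # Negative-step slice: the rewrite keeps a reversal and extracts limit 3.
    tail = nodes.SliceNode(leaf, start=None, stop=-3, step=-1)
    new_root, limit = rewrites.pullup_limit_from_slice(tail)
    assert limit == 3
    assert new_root == nodes.ReversedNode(leaf)
```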


def replace_slice_ops(root: nodes.BigFrameNode) -> nodes.BigFrameNode:
# TODO: we want to pull up some slices into limit op if near root.
if isinstance(root, nodes.SliceNode):
2 changes: 0 additions & 2 deletions bigframes/session/executor.py
@@ -45,7 +45,6 @@
import bigframes.core.identifiers
import bigframes.core.nodes as nodes
import bigframes.core.ordering as order
import bigframes.core.rewrite as rewrites
import bigframes.core.schema
import bigframes.core.tree_properties as tree_properties
import bigframes.features
@@ -437,7 +436,6 @@ def _get_optimized_plan(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode:
if ENABLE_PRUNING:
used_fields = frozenset(field.id for field in optimized_plan.fields)
optimized_plan = optimized_plan.prune(used_fields)
optimized_plan = rewrites.replace_slice_ops(optimized_plan)
return optimized_plan

def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue):