diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index afa13375b1..11899eef11 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -41,6 +41,7 @@ import bigframes.core.guid as guid import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils import bigframes.dtypes @@ -443,8 +444,10 @@ def to_pandas( df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job - def try_peek(self, n: int = 20) -> typing.Optional[pd.DataFrame]: - if self.expr.node.peekable: + def try_peek( + self, n: int = 20, force: bool = False + ) -> typing.Optional[pd.DataFrame]: + if force or tree_properties.peekable(self.expr.node): iterator, _ = self.session._peek(self.expr, n) df = self._to_dataframe(iterator) self._copy_index_to_pandas(df) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index c1ceeebffe..8f646ac4bb 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -90,11 +90,6 @@ def session(self): def _node_hash(self): return hash(tuple(hash(getattr(self, field.name)) for field in fields(self))) - @property - def peekable(self) -> bool: - """Indicates whether the node can be sampled efficiently""" - return all(child.peekable for child in self.child_nodes) - @property def roots(self) -> typing.Set[BigFrameNode]: roots = itertools.chain.from_iterable( @@ -143,12 +138,6 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - children_peekable = all(child.peekable for child in self.child_nodes) - single_root = len(self.roots) == 1 - return children_peekable and single_root - @functools.cached_property def schema(self) -> schemata.ArraySchema: def join_mapping_to_schema_item(mapping: JoinColumnMapping): @@ -204,10 +193,6 @@ class ReadLocalNode(BigFrameNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return True - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -233,10 +218,6 @@ def session(self): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return True - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -261,13 +242,9 @@ class PromoteOffsetsNode(UnaryNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: - return False + return True @property def schema(self) -> schemata.ArraySchema: @@ -371,10 +348,6 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: return True @@ -407,10 +380,6 @@ class WindowOpNode(UnaryNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: return True @@ -459,10 +428,6 @@ def row_preserving(self) -> bool: def non_local(self) -> bool: return True - @property - def peekable(self) -> bool: - return False - @functools.cached_property def schema(self) -> schemata.ArraySchema: def infer_dtype( diff --git a/bigframes/core/traversal.py b/bigframes/core/tree_properties.py similarity index 72% rename from bigframes/core/traversal.py rename to bigframes/core/tree_properties.py index b038ee6599..bc29f115f6 100644 --- a/bigframes/core/traversal.py +++ b/bigframes/core/tree_properties.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. + import bigframes.core.nodes as nodes +# TODO: Convert these functions to iterative or enforce hard limit on tree depth. The below algorithms can cause stack to exceed limit. + def is_trivially_executable(node: nodes.BigFrameNode) -> bool: if local_only(node): @@ -25,3 +28,11 @@ def is_trivially_executable(node: nodes.BigFrameNode) -> bool: def local_only(node: nodes.BigFrameNode) -> bool: return all(isinstance(node, nodes.ReadLocalNode) for node in node.roots) + + +def peekable(node: nodes.BigFrameNode) -> bool: + if local_only(node): + return True + children_peekable = all(peekable(child) for child in node.child_nodes) + self_peekable = not node.non_local + return children_peekable and self_peekable diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 599546284b..066b082490 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1125,7 +1125,7 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: if maybe_result is None: if force: self._cached() - maybe_result = self._block.try_peek(n) + maybe_result = self._block.try_peek(n, force=True) assert maybe_result is not None else: raise ValueError( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 479b3a7bac..ddb2646ce6 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -82,7 +82,8 @@ import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order -import bigframes.core.traversal as traversals +import bigframes.core.tree_properties as traversals +import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers @@ -1848,8 +1849,8 @@ def _peek( self, array_value: core.ArrayValue, n_rows: int ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """A 'peek' efficiently accesses a small number of rows in the dataframe.""" - if not array_value.node.peekable: - raise NotImplementedError("cannot efficient peek this dataframe") + if not tree_properties.peekable(array_value.node): + warnings.warn("Peeking this value cannot be done efficiently.") sql = self._compile_unordered(array_value).peek_sql(n_rows) return self._start_query( sql=sql, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 355849538e..ebb6f7d52c 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -494,6 +494,17 @@ def test_df_peek_force_default(scalars_dfs): assert len(peek_result) == 3 +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + def test_repr_w_all_rows(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs