From 2cd64891170dcd4f2a709024a2993e36db210976 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 27 Dec 2023 16:36:15 -0800 Subject: [PATCH 1/4] feat: Add dataframe.to_html (#259) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/296945119 --- bigframes/dataframe.py | 52 ++++++++ tests/system/small/test_dataframe.py | 9 ++ .../bigframes_vendored/pandas/core/frame.py | 124 ++++++++++++++++++ 3 files changed, 185 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 423c2bcaac..ab0006ea20 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2682,6 +2682,58 @@ def to_string( encoding, ) + def to_html( + self, + buf=None, + columns: Sequence[str] | None = None, + col_space=None, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + formatters=None, + float_format=None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + bold_rows: bool = True, + classes: str | list | tuple | None = None, + escape: bool = True, + notebook: bool = False, + border: int | None = None, + table_id: str | None = None, + render_links: bool = False, + encoding: str | None = None, + ) -> str: + return self.to_pandas().to_html( + buf, + columns, # type: ignore + col_space, + header, + index, + na_rep, + formatters, + float_format, + sparsify, + index_names, + 
justify, # type: ignore + max_rows, + max_cols, + show_dimensions, + decimal, + bold_rows, + classes, + escape, + notebook, + border, + table_id, + render_links, + encoding, + ) + def to_markdown( self, buf=None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 86b8cfbe66..cb2e4f94fa 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3463,6 +3463,15 @@ def test_df_to_string(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_df_to_html(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_html() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html() + + assert bf_result == pd_result + + def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 bf_result = scalars_df_index.dropna().to_markdown() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d7ecae102b..f2de8fcb6a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -685,6 +685,130 @@ def to_string( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_html( + self, + buf=None, + columns: Sequence[str] | None = None, + col_space=None, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + formatters=None, + float_format=None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + bold_rows: bool = True, + classes: str | list | tuple | None = None, + escape: bool = True, + notebook: bool = False, + border: int | None = None, + table_id: 
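str | None = None, + render_links: bool = False, + encoding: str | None = None, + ): + """Render a DataFrame as an HTML table. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> print(df.to_html()) + <table border="1" class="dataframe"> + <thead> + <tr style="text-align: right;"> + <th></th> + <th>col1</th> + <th>col2</th> + </tr> + </thead> + <tbody> + <tr> + <th>0</th> + <td>1</td> + <td>3</td> + </tr> + <tr> + <th>1</th> + <td>2</td> + <td>4</td> + </tr> + </tbody> + </table>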
+ + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. + columns (sequence, optional, default None): + The subset of columns to write. Writes all columns by default. + col_space (str or int, list or dict of int or str, optional): + The minimum width of each column in CSS length units. An int is + assumed to be px units. + header (bool, optional): + Whether to print column labels, default True. + index (bool, optional, default True): + Whether to print index (row) labels. + na_rep (str, optional, default 'NaN'): + String representation of NAN to use. + formatters (list, tuple or dict of one-param. functions, optional): + Formatter functions to apply to columns' elements by position or + name. + The result of each function must be a unicode string. + List/tuple must be of length equal to the number of columns. + float_format (one-parameter function, optional, default None): + Formatter function to apply to columns' elements if they are + floats. This function must return a unicode string and will + be applied only to the non-NaN elements, with NaN being + handled by na_rep. + sparsify (bool, optional, default True): + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. + index_names (bool, optional, default True): + Prints the names of the indexes. + justify (str, default None): + How to justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. Valid values are, 'left', 'right', 'center', 'justify', + 'justify-all', 'start', 'end', 'inherit', 'match-parent', 'initial', + 'unset'. + max_rows (int, optional): + Maximum number of rows to display in the console. + max_cols (int, optional): + Maximum number of columns to display in the console. + show_dimensions (bool, default False): + Display DataFrame dimensions (number of rows by number of columns). 
+ decimal (str, default '.'): + Character recognized as decimal separator, e.g. ',' in Europe. + bold_rows (bool, default True): + Make the row labels bold in the output. + classes (str or list or tuple, default None): + CSS class(es) to apply to the resulting html table. + escape (bool, default True): + Convert the characters <, >, and & to HTML-safe sequences. + notebook (bool, default False): + Whether the generated HTML is for IPython Notebook. + border (int): + A border=border attribute is included in the opening + <table> tag. Default pd.options.display.html.border. + table_id (str, optional): + A css id is included in the opening <table>
tag if specified. + render_links (bool, default False): + Convert URLs to HTML links. + encoding (str, default "utf-8"): + Set character encoding. + + Returns: + str or None: If buf is None, returns the result as a string. Otherwise + returns None. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_markdown( self, buf=None, From ac1a745ddce9865f4585777b43c2234b9bf2841d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 28 Dec 2023 18:00:22 +0000 Subject: [PATCH 2/4] fix: exclude pandas 2.2.0rc0 to unblock prerelease tests (#292) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 317908521 🦕 --- noxfile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index c4bbd7a65a..1d3624005a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -518,9 +518,13 @@ def prerelease(session: nox.sessions.Session, tests_path): "--prefer-binary", "--pre", "--upgrade", - # TODO(shobs): Remove tying to version 2.1.3 after - # https://github.com/pandas-dev/pandas/issues/56463 is resolved - "pandas!=2.1.4", + # TODO(shobs): Remove excluding version 2.1.4 after + # https://github.com/pandas-dev/pandas/issues/56463 is resolved. + # + # TODO(shobs): Remove excluding version 2.2.0rc0 after + # https://github.com/pandas-dev/pandas/issues/56646 and + # https://github.com/pandas-dev/pandas/issues/56651 are resolved. 
+ "pandas!=2.1.4,!=2.2.0rc0", ) already_installed.add("pandas") From 252f3a2a0e1296c7d786acdc0bdebe9e4a9ae1be Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 28 Dec 2023 10:54:15 -0800 Subject: [PATCH 3/4] docs: fix the rendering for `get_dummies` (#291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - docs: https://screenshot.googleplex.com/8X53mhLdQb2dQsd Fixes internal issue 317915956 🦕 --- third_party/bigframes_vendored/pandas/core/reshape/encoding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py index da92b58f50..b7f67473ea 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py @@ -25,6 +25,7 @@ def get_dummies( prepended to the value. **Examples:** + >>> import bigframes.pandas as pd >>> pd.options.display.progress_bar = None >>> s = pd.Series(list('abca')) From 746115d5564c95bc3c4a5309c99e7a29e535e6fe Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Thu, 28 Dec 2023 12:42:15 -0800 Subject: [PATCH 4/4] Fix: Update dataframe.to_gbq to dedup column names. (#286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dataframe.py | 14 ++++---- tests/system/small/test_dataframe_io.py | 44 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ab0006ea20..595670b0b6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" array_value = self._block.expr + + new_col_labels, new_idx_labels = utils.get_standardized_ids( + self._block.column_labels, self.index.names + ) + columns = list(self._block.value_columns) - column_labels = list(self._block.column_labels) + column_labels = new_col_labels # This code drops unnamed indexes to keep consistent with the behavior of # most pandas write APIs. The exception is `pandas.to_csv`, which keeps # unnamed indexes as `Unnamed: 0`. # TODO(chelsealin): check if works for multiple indexes. if index and self.index.name is not None: columns.extend(self._block.index_columns) - column_labels.extend(self.index.names) + column_labels.extend(new_idx_labels) else: array_value = array_value.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. 
- # TODO(swast): Handle duplicate and NULL labels. id_overrides = { - col_id: col_label - for col_id, col_label in zip(columns, column_labels) - if col_label and isinstance(col_label, str) + col_id: col_label for col_id, col_label in zip(columns, column_labels) } if ordering_id is not None: diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 59864e483e..6f1b31b48e 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -273,6 +273,50 @@ def test_to_gbq_if_exists( ) +def test_to_gbq_w_duplicate_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """Test the `to_gbq` API when dealing with duplicate column names.""" + destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names" + + # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too' + # becoming 'int64_col_1' after deduplication. + scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"}) + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_col"], bf_result["int64_col"] + ) + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_too"], + bf_result["int64_col_1"], + check_names=False, + ) + + +def test_to_gbq_w_None_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """Test the `to_gbq` API with None as a column name.""" + destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names" + + scalars_df_index = scalars_df_index.rename(columns={"int64_too": None}) + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_col"], bf_result["int64_col"] + ) + pd.testing.assert_series_equal( + 
scalars_pandas_df_index["int64_too"], + bf_result["bigframes_unnamed_column"], + check_names=False, + ) + + def test_to_gbq_w_invalid_destination_table(scalars_df_index): with pytest.raises(ValueError): scalars_df_index.to_gbq("table_id")