From 2cd64891170dcd4f2a709024a2993e36db210976 Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Wed, 27 Dec 2023 16:36:15 -0800
Subject: [PATCH 1/4] feat: Add dataframe.to_html (#259)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)
Fixes b/296945119
---
bigframes/dataframe.py | 52 ++++++++
tests/system/small/test_dataframe.py | 9 ++
.../bigframes_vendored/pandas/core/frame.py | 124 ++++++++++++++++++
3 files changed, 185 insertions(+)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 423c2bcaac..ab0006ea20 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2682,6 +2682,58 @@ def to_string(
encoding,
)
+ def to_html(
+ self,
+ buf=None,
+ columns: Sequence[str] | None = None,
+ col_space=None,
+ header: bool = True,
+ index: bool = True,
+ na_rep: str = "NaN",
+ formatters=None,
+ float_format=None,
+ sparsify: bool | None = None,
+ index_names: bool = True,
+ justify: str | None = None,
+ max_rows: int | None = None,
+ max_cols: int | None = None,
+ show_dimensions: bool = False,
+ decimal: str = ".",
+ bold_rows: bool = True,
+ classes: str | list | tuple | None = None,
+ escape: bool = True,
+ notebook: bool = False,
+ border: int | None = None,
+ table_id: str | None = None,
+ render_links: bool = False,
+ encoding: str | None = None,
+ ) -> str:
+ return self.to_pandas().to_html(
+ buf,
+ columns, # type: ignore
+ col_space,
+ header,
+ index,
+ na_rep,
+ formatters,
+ float_format,
+ sparsify,
+ index_names,
+ justify, # type: ignore
+ max_rows,
+ max_cols,
+ show_dimensions,
+ decimal,
+ bold_rows,
+ classes,
+ escape,
+ notebook,
+ border,
+ table_id,
+ render_links,
+ encoding,
+ )
+
def to_markdown(
self,
buf=None,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 86b8cfbe66..cb2e4f94fa 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -3463,6 +3463,15 @@ def test_df_to_string(scalars_df_index, scalars_pandas_df_index):
assert bf_result == pd_result
+def test_df_to_html(scalars_df_index, scalars_pandas_df_index):
+ unsupported = ["numeric_col"] # formatted differently
+
+ bf_result = scalars_df_index.drop(columns=unsupported).to_html()
+ pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html()
+
+ assert bf_result == pd_result
+
+
def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index):
# Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231
bf_result = scalars_df_index.dropna().to_markdown()
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index d7ecae102b..f2de8fcb6a 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -685,6 +685,130 @@ def to_string(
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def to_html(
+ self,
+ buf=None,
+ columns: Sequence[str] | None = None,
+ col_space=None,
+ header: bool = True,
+ index: bool = True,
+ na_rep: str = "NaN",
+ formatters=None,
+ float_format=None,
+ sparsify: bool | None = None,
+ index_names: bool = True,
+ justify: str | None = None,
+ max_rows: int | None = None,
+ max_cols: int | None = None,
+ show_dimensions: bool = False,
+ decimal: str = ".",
+ bold_rows: bool = True,
+ classes: str | list | tuple | None = None,
+ escape: bool = True,
+ notebook: bool = False,
+ border: int | None = None,
+ table_id: str | None = None,
+ render_links: bool = False,
+ encoding: str | None = None,
+ ):
+ """Render a DataFrame as an HTML table.
+
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> bpd.options.display.progress_bar = None
+
+ >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+ >>> print(df.to_html())
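+ <table border="1" class="dataframe">
+   <thead>
+     <tr style="text-align: right;">
+       <th></th>
+       <th>col1</th>
+       <th>col2</th>
+     </tr>
+   </thead>
+   <tbody>
+     <tr>
+       <th>0</th>
+       <td>1</td>
+       <td>3</td>
+     </tr>
+     <tr>
+       <th>1</th>
+       <td>2</td>
+       <td>4</td>
+     </tr>
+   </tbody>
+ </table>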
+
+ Args:
+ buf (str, Path or StringIO-like, optional, default None):
+ Buffer to write to. If None, the output is returned as a string.
+ columns (sequence, optional, default None):
+ The subset of columns to write. Writes all columns by default.
+ col_space (str or int, list or dict of int or str, optional):
+ The minimum width of each column in CSS length units. An int is
+ assumed to be px units.
+ header (bool, optional):
+ Whether to print column labels, default True.
+ index (bool, optional, default True):
+ Whether to print index (row) labels.
+ na_rep (str, optional, default 'NaN'):
+ String representation of NAN to use.
+ formatters (list, tuple or dict of one-param. functions, optional):
+ Formatter functions to apply to columns' elements by position or
+ name.
+ The result of each function must be a unicode string.
+ List/tuple must be of length equal to the number of columns.
+ float_format (one-parameter function, optional, default None):
+ Formatter function to apply to columns' elements if they are
+ floats. This function must return a unicode string and will
+ be applied only to the non-NaN elements, with NaN being
+ handled by na_rep.
+ sparsify (bool, optional, default True):
+ Set to False for a DataFrame with a hierarchical index to print
+ every multiindex key at each row.
+ index_names (bool, optional, default True):
+ Prints the names of the indexes.
+ justify (str, default None):
+ How to justify the column labels. If None uses the option from
+ the print configuration (controlled by set_option), 'right' out
+ of the box. Valid values are, 'left', 'right', 'center', 'justify',
+ 'justify-all', 'start', 'end', 'inherit', 'match-parent', 'initial',
+ 'unset'.
+ max_rows (int, optional):
+ Maximum number of rows to display in the console.
+ max_cols (int, optional):
+ Maximum number of columns to display in the console.
+ show_dimensions (bool, default False):
+ Display DataFrame dimensions (number of rows by number of columns).
+ decimal (str, default '.'):
+ Character recognized as decimal separator, e.g. ',' in Europe.
+ bold_rows (bool, default True):
+ Make the row labels bold in the output.
+ classes (str or list or tuple, default None):
+ CSS class(es) to apply to the resulting html table.
+ escape (bool, default True):
+ Convert the characters <, >, and & to HTML-safe sequences.
+ notebook (bool, default False):
+ Whether the generated HTML is for IPython Notebook.
+ border (int, optional):
+ A border=border attribute is included in the opening
+ <table> tag. Default pd.options.display.html.border.
+ table_id (str, optional):
+ A css id is included in the opening <table> tag if specified.
+ render_links (bool, default False):
+ Convert URLs to HTML links.
+ encoding (str, default "utf-8"):
+ Set character encoding.
+
+ Returns:
+ str or None: If buf is None, returns the result as a string. Otherwise
+ returns None.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
def to_markdown(
self,
buf=None,
From ac1a745ddce9865f4585777b43c2234b9bf2841d Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Thu, 28 Dec 2023 18:00:22 +0000
Subject: [PATCH 2/4] fix: exclude pandas 2.2.0rc0 to unblock prerelease tests
(#292)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)
Fixes internal issue 317908521 🦕
---
noxfile.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/noxfile.py b/noxfile.py
index c4bbd7a65a..1d3624005a 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -518,9 +518,13 @@ def prerelease(session: nox.sessions.Session, tests_path):
"--prefer-binary",
"--pre",
"--upgrade",
- # TODO(shobs): Remove tying to version 2.1.3 after
- # https://github.com/pandas-dev/pandas/issues/56463 is resolved
- "pandas!=2.1.4",
+ # TODO(shobs): Stop excluding version 2.1.4 once
+ # https://github.com/pandas-dev/pandas/issues/56463 is resolved.
+ #
+ # TODO(shobs): Stop excluding version 2.2.0rc0 once
+ # https://github.com/pandas-dev/pandas/issues/56646 and
+ # https://github.com/pandas-dev/pandas/issues/56651 are resolved.
+ "pandas!=2.1.4,!=2.2.0rc0",
)
already_installed.add("pandas")
From 252f3a2a0e1296c7d786acdc0bdebe9e4a9ae1be Mon Sep 17 00:00:00 2001
From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com>
Date: Thu, 28 Dec 2023 10:54:15 -0800
Subject: [PATCH 3/4] docs: fix the rendering for `get_dummies` (#291)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)
- docs: https://screenshot.googleplex.com/8X53mhLdQb2dQsd
Fixes internal issue 317915956 🦕
---
third_party/bigframes_vendored/pandas/core/reshape/encoding.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py
index da92b58f50..b7f67473ea 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py
@@ -25,6 +25,7 @@ def get_dummies(
prepended to the value.
**Examples:**
+
>>> import bigframes.pandas as pd
>>> pd.options.display.progress_bar = None
>>> s = pd.Series(list('abca'))
From 746115d5564c95bc3c4a5309c99e7a29e535e6fe Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Thu, 28 Dec 2023 12:42:15 -0800
Subject: [PATCH 4/4] Fix: Update dataframe.to_gbq to dedup column names.
(#286)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)
Fixes # 🦕
---
bigframes/dataframe.py | 14 ++++----
tests/system/small/test_dataframe_io.py | 44 +++++++++++++++++++++++++
2 files changed, 52 insertions(+), 6 deletions(-)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index ab0006ea20..595670b0b6 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
"""Create query text representing this dataframe for I/O."""
array_value = self._block.expr
+
+ new_col_labels, new_idx_labels = utils.get_standardized_ids(
+ self._block.column_labels, self.index.names
+ )
+
columns = list(self._block.value_columns)
- column_labels = list(self._block.column_labels)
+ column_labels = new_col_labels
# This code drops unnamed indexes to keep consistent with the behavior of
# most pandas write APIs. The exception is `pandas.to_csv`, which keeps
# unnamed indexes as `Unnamed: 0`.
# TODO(chelsealin): check if works for multiple indexes.
if index and self.index.name is not None:
columns.extend(self._block.index_columns)
- column_labels.extend(self.index.names)
+ column_labels.extend(new_idx_labels)
else:
array_value = array_value.drop_columns(self._block.index_columns)
# Make columns in SQL reflect _labels_ not _ids_. Note: This may use
# the arbitrary unicode column labels feature in BigQuery, which is
# currently (June 2023) in preview.
- # TODO(swast): Handle duplicate and NULL labels.
id_overrides = {
- col_id: col_label
- for col_id, col_label in zip(columns, column_labels)
- if col_label and isinstance(col_label, str)
+ col_id: col_label for col_id, col_label in zip(columns, column_labels)
}
if ordering_id is not None:
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 59864e483e..6f1b31b48e 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -273,6 +273,50 @@ def test_to_gbq_if_exists(
)
+def test_to_gbq_w_duplicate_column_names(
+ scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+ """Test the `to_gbq` API when dealing with duplicate column names."""
+ destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+
+ # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
+ # becoming 'int64_col_1' after deduplication.
+ scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
+ scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+ bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+ pd.testing.assert_series_equal(
+ scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+ )
+ pd.testing.assert_series_equal(
+ scalars_pandas_df_index["int64_too"],
+ bf_result["int64_col_1"],
+ check_names=False,
+ )
+
+
+def test_to_gbq_w_None_column_names(
+ scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+ """Test the `to_gbq` API with None as a column name."""
+ destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
+
+ scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
+ scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+ bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+ pd.testing.assert_series_equal(
+ scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+ )
+ pd.testing.assert_series_equal(
+ scalars_pandas_df_index["int64_too"],
+ bf_result["bigframes_unnamed_column"],
+ check_names=False,
+ )
+
+
def test_to_gbq_w_invalid_destination_table(scalars_df_index):
with pytest.raises(ValueError):
scalars_df_index.to_gbq("table_id")