From 2cd64891170dcd4f2a709024a2993e36db210976 Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Wed, 27 Dec 2023 16:36:15 -0800
Subject: [PATCH 1/4] feat: Add dataframe.to_html (#259)

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes b/296945119
---
 bigframes/dataframe.py                        |  52 ++++++++
 tests/system/small/test_dataframe.py          |   9 ++
 .../bigframes_vendored/pandas/core/frame.py   | 124 ++++++++++++++++++
 3 files changed, 185 insertions(+)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 423c2bcaac..ab0006ea20 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2682,6 +2682,58 @@ def to_string(
             encoding,
         )
 
+    def to_html(
+        self,
+        buf=None,
+        columns: Sequence[str] | None = None,
+        col_space=None,
+        header: bool = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters=None,
+        float_format=None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        justify: str | None = None,
+        max_rows: int | None = None,
+        max_cols: int | None = None,
+        show_dimensions: bool = False,
+        decimal: str = ".",
+        bold_rows: bool = True,
+        classes: str | list | tuple | None = None,
+        escape: bool = True,
+        notebook: bool = False,
+        border: int | None = None,
+        table_id: str | None = None,
+        render_links: bool = False,
+        encoding: str | None = None,
+    ) -> str:
+        return self.to_pandas().to_html(
+            buf,  # only `buf` stays positional; newer pandas rejects the rest
+            columns=columns,  # type: ignore
+            col_space=col_space,
+            header=header,
+            index=index,
+            na_rep=na_rep,
+            formatters=formatters,
+            float_format=float_format,
+            sparsify=sparsify,
+            index_names=index_names,
+            justify=justify,  # type: ignore
+            max_rows=max_rows,
+            max_cols=max_cols,
+            show_dimensions=show_dimensions,
+            decimal=decimal,
+            bold_rows=bold_rows,
+            classes=classes,
+            escape=escape,
+            notebook=notebook,
+            border=border,
+            table_id=table_id,
+            render_links=render_links,
+            encoding=encoding,
+        )
+
     def to_markdown(
         self,
         buf=None,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 86b8cfbe66..cb2e4f94fa 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -3463,6 +3463,15 @@ def test_df_to_string(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result
 
 
+def test_df_to_html(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+    # Render the same columns through both fixtures and compare the HTML.
+    bf_result = scalars_df_index.drop(columns=unsupported).to_html()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html()
+
+    assert bf_result == pd_result
+
+
 def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index):
     # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231
     bf_result = scalars_df_index.dropna().to_markdown()
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index d7ecae102b..f2de8fcb6a 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -685,6 +685,130 @@ def to_string(
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def to_html(
+        self,
+        buf=None,
+        columns: Sequence[str] | None = None,
+        col_space=None,
+        header: bool = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters=None,
+        float_format=None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        justify: str | None = None,
+        max_rows: int | None = None,
+        max_cols: int | None = None,
+        show_dimensions: bool = False,
+        decimal: str = ".",
+        bold_rows: bool = True,
+        classes: str | list | tuple | None = None,
+        escape: bool = True,
+        notebook: bool = False,
+        border: int | None = None,
+        table_id: str | None = None,
+        render_links: bool = False,
+        encoding: str | None = None,
+    ) -> str | None:
+        """Render a DataFrame as an HTML table.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+            >>> print(df.to_html())
+            <table border="1" class="dataframe">
+            <thead>
+                <tr style="text-align: right;">
+                <th></th>
+                <th>col1</th>
+                <th>col2</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                <th>0</th>
+                <td>1</td>
+                <td>3</td>
+                </tr>
+                <tr>
+                <th>1</th>
+                <td>2</td>
+                <td>4</td>
+                </tr>
+            </tbody>
+            </table>
+
+        Args:
+            buf (str, Path or StringIO-like, optional, default None):
+                Buffer to write to. If None, the output is returned as a string.
+            columns (sequence, optional, default None):
+                The subset of columns to write. Writes all columns by default.
+            col_space (str or int, list or dict of int or str, optional):
+                The minimum width of each column in CSS length units. An int is
+                assumed to be px units.
+            header (bool, optional):
+                Whether to print column labels, default True.
+            index (bool, optional, default True):
+                Whether to print index (row) labels.
+            na_rep (str, optional, default 'NaN'):
+                String representation of NaN to use.
+            formatters (list, tuple or dict of one-param. functions, optional):
+                Formatter functions to apply to columns' elements by position or
+                name.
+                The result of each function must be a unicode string.
+                List/tuple must be of length equal to the number of columns.
+            float_format (one-parameter function, optional, default None):
+                Formatter function to apply to columns' elements if they are
+                floats. This function must return a unicode string and will
+                be applied only to the non-NaN elements, with NaN being
+                handled by na_rep.
+            sparsify (bool, optional, default True):
+                Set to False for a DataFrame with a hierarchical index to print
+                every multiindex key at each row.
+            index_names (bool, optional, default True):
+                Prints the names of the indexes.
+            justify (str, default None):
+                How to justify the column labels. If None uses the option from
+                the print configuration (controlled by set_option), 'right' out
+                of the box. Valid values are, 'left', 'right', 'center', 'justify',
+                'justify-all', 'start', 'end', 'inherit', 'match-parent', 'initial',
+                'unset'.
+            max_rows (int, optional):
+                Maximum number of rows to display in the console.
+            max_cols (int, optional):
+                Maximum number of columns to display in the console.
+            show_dimensions (bool, default False):
+                Display DataFrame dimensions (number of rows by number of columns).
+            decimal (str, default '.'):
+                Character recognized as decimal separator, e.g. ',' in Europe.
+            bold_rows (bool, default True):
+                Make the row labels bold in the output.
+            classes (str or list or tuple, default None):
+                CSS class(es) to apply to the resulting html table.
+            escape (bool, default True):
+                Convert the characters <, >, and & to HTML-safe sequences.
+            notebook (bool, default False):
+                Whether the generated HTML is for IPython Notebook.
+            border (int, optional):
+                A border=border attribute is included in the opening <table>
+                tag. Default pd.options.display.html.border.
+            table_id (str, optional):
+                A css id is included in the opening <table> tag if specified.
+            render_links (bool, default False):
+                Convert URLs to HTML links.
+            encoding (str, default "utf-8"):
+                Set character encoding.
+
+        Returns:
+            str or None: If buf is None, returns the result as a string. Otherwise
+            returns None.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def to_markdown(
         self,
         buf=None,

From ac1a745ddce9865f4585777b43c2234b9bf2841d Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Thu, 28 Dec 2023 18:00:22 +0000
Subject: [PATCH 2/4] fix: exclude pandas 2.2.0rc0 to unblock prerelease tests
 (#292)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes internal issue 317908521 🦕
---
 noxfile.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index c4bbd7a65a..1d3624005a 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -518,9 +518,13 @@ def prerelease(session: nox.sessions.Session, tests_path):
         "--prefer-binary",
         "--pre",
         "--upgrade",
-        # TODO(shobs): Remove tying to version 2.1.3 after
-        # https://github.com/pandas-dev/pandas/issues/56463 is resolved
-        "pandas!=2.1.4",
+        # TODO(shobs): Remove excluding version 2.1.4 after
+        # https://github.com/pandas-dev/pandas/issues/56463 is resolved.
+        #
+        # TODO(shobs): Remove excluding version 2.2.0rc0 after
+        # https://github.com/pandas-dev/pandas/issues/56646 and
+        # https://github.com/pandas-dev/pandas/issues/56651 are resolved.
+        "pandas!=2.1.4,!=2.2.0rc0",
     )
     already_installed.add("pandas")
 

From 252f3a2a0e1296c7d786acdc0bdebe9e4a9ae1be Mon Sep 17 00:00:00 2001
From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com>
Date: Thu, 28 Dec 2023 10:54:15 -0800
Subject: [PATCH 3/4] docs: fix the rendering for `get_dummies` (#291)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)
        - docs:  https://screenshot.googleplex.com/8X53mhLdQb2dQsd
Fixes internal issue 317915956 🦕
---
 third_party/bigframes_vendored/pandas/core/reshape/encoding.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py
index da92b58f50..b7f67473ea 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py
@@ -25,6 +25,7 @@ def get_dummies(
     prepended to the value.
 
     **Examples:**
+
         >>> import bigframes.pandas as pd
         >>> pd.options.display.progress_bar = None
         >>> s = pd.Series(list('abca'))

From 746115d5564c95bc3c4a5309c99e7a29e535e6fe Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Thu, 28 Dec 2023 12:42:15 -0800
Subject: [PATCH 4/4] fix: Update dataframe.to_gbq to dedup column names.
 (#286)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
---
 bigframes/dataframe.py                  | 14 ++++----
 tests/system/small/test_dataframe_io.py | 44 +++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index ab0006ea20..595670b0b6 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
     def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
         """Create query text representing this dataframe for I/O."""
         array_value = self._block.expr
+
+        new_col_labels, new_idx_labels = utils.get_standardized_ids(
+            self._block.column_labels, self.index.names
+        )
+
         columns = list(self._block.value_columns)
-        column_labels = list(self._block.column_labels)
+        column_labels = new_col_labels
         # This code drops unnamed indexes to keep consistent with the behavior of
         # most pandas write APIs. The exception is `pandas.to_csv`, which keeps
         # unnamed indexes as `Unnamed: 0`.
         # TODO(chelsealin): check if works for multiple indexes.
         if index and self.index.name is not None:
             columns.extend(self._block.index_columns)
-            column_labels.extend(self.index.names)
+            column_labels.extend(new_idx_labels)
         else:
             array_value = array_value.drop_columns(self._block.index_columns)
 
         # Make columns in SQL reflect _labels_ not _ids_. Note: This may use
         # the arbitrary unicode column labels feature in BigQuery, which is
         # currently (June 2023) in preview.
-        # TODO(swast): Handle duplicate and NULL labels.
         id_overrides = {
-            col_id: col_label
-            for col_id, col_label in zip(columns, column_labels)
-            if col_label and isinstance(col_label, str)
+            col_id: col_label for col_id, col_label in zip(columns, column_labels)
         }
 
         if ordering_id is not None:
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 59864e483e..6f1b31b48e 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -273,6 +273,50 @@ def test_to_gbq_if_exists(
     )
 
 
+def test_to_gbq_w_duplicate_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API when dealing with duplicate column names."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"
+
+    # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too'
+    # becoming 'int64_col_1' after deduplication.
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+    # Read the table back to check the column names that were written.
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["int64_col_1"],
+        check_names=False,
+    )
+
+
+def test_to_gbq_w_None_column_names(
+    scalars_df_index, scalars_pandas_df_index, dataset_id
+):
+    """Test the `to_gbq` API with None as a column name."""
+    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"
+    # A None label is expected to be written as 'bigframes_unnamed_column'.
+    scalars_df_index = scalars_df_index.rename(columns={"int64_too": None})
+    scalars_df_index.to_gbq(destination_table, if_exists="replace")
+
+    bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas()
+
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
+    )
+    pd.testing.assert_series_equal(
+        scalars_pandas_df_index["int64_too"],
+        bf_result["bigframes_unnamed_column"],
+        check_names=False,
+    )
+
+
 def test_to_gbq_w_invalid_destination_table(scalars_df_index):
     with pytest.raises(ValueError):
         scalars_df_index.to_gbq("table_id")