From cca4ba921b4e293663876b04e207cc4662c0fc30 Mon Sep 17 00:00:00 2001 From: Romain Lebbadi-Breteau Date: Thu, 8 Aug 2024 22:13:22 -0400 Subject: [PATCH] ENH: Add an option to prevent stripping extra whitespaces in pd.read_html Co-authored-by: Derekt2 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/html.py | 28 +++++++++++++++++++++++++++- pandas/tests/io/test_html.py | 20 ++++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 005818b0779e6..f7222ea7bd072 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -46,6 +46,7 @@ Other enhancements - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) +- :func:`read_html` now accepts a ``strip_whitespace`` argument to decide if extra whitespaces should be trimmed in HTML tables (:issue:`24766`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 183af3a03221b..0bc5f5e396043 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -172,6 +172,11 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 + strip_whitespace : bool + Whether table row values should have all extra whitespaces stripped to + a single space. + .. 
versionadded:: 3.0.0 + Attributes ---------- io : str or file-like @@ -196,6 +201,11 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 + strip_whitespace : bool + Whether table row values should have all extra whitespaces stripped to + a single space. + .. versionadded:: 3.0.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -222,6 +232,7 @@ def __init__( displayed_only: bool, extract_links: Literal[None, "header", "footer", "body", "all"], storage_options: StorageOptions = None, + strip_whitespace: bool = True, ) -> None: self.io = io self.match = match @@ -230,6 +241,7 @@ def __init__( self.displayed_only = displayed_only self.extract_links = extract_links self.storage_options = storage_options + self.strip_whitespace = strip_whitespace def parse_tables(self): """ @@ -523,10 +535,15 @@ def _expand_colspan_rowspan( index += 1 # Append the text from this <td>, colspan times - text = _remove_whitespace(self._text_getter(td)) + if self.strip_whitespace: + text = _remove_whitespace(self._text_getter(td)) + else: + text = self._text_getter(td) + if self.extract_links in ("all", section): href = self._href_getter(td) text = (text, href) + rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) @@ -962,6 +979,7 @@ def _parse( displayed_only, extract_links, storage_options, + strip_whitespace, **kwargs, ): flavor = _validate_flavor(flavor) @@ -978,6 +996,7 @@ def _parse( displayed_only, extract_links, storage_options, + strip_whitespace, ) try: @@ -1045,6 +1064,7 @@ def read_html( extract_links: Literal[None, "header", "footer", "body", "all"] = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions = None, + strip_whitespace: bool = True, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1165,6 +1185,11 @@ def read_html( ..
versionadded:: 2.1.0 + strip_whitespace : bool + Whether table row values should have all extra whitespaces stripped to + a single space. + .. versionadded:: 3.0.0 + Returns ------- dfs @@ -1245,4 +1270,5 @@ def read_html( extract_links=extract_links, dtype_backend=dtype_backend, storage_options=storage_options, + strip_whitespace=strip_whitespace, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index bef28c4f027da..4bbd0a7ea3801 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1657,3 +1657,23 @@ def test_style_tag(self, flavor_read_html): result = flavor_read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) + + def test_strip_whitespace(self, flavor_read_html): + # GH 24766 + data = """ + <table> + <tr> +
<td>Field 1 +Field 2</td> + <td>Value 1 +Value 2</td>
+ </tr> + </table> + """ + result_strip = flavor_read_html(StringIO(data))[0] + expected_strip = DataFrame([["Field 1 Field 2", "Value 1 Value 2"]]) + tm.assert_frame_equal(result_strip, expected_strip) + + result_nostrip = flavor_read_html(StringIO(data), strip_whitespace=False)[0] + expected_nostrip = DataFrame([["Field 1\nField 2", "Value 1\nValue 2"]]) + tm.assert_frame_equal(result_nostrip, expected_nostrip)