From 70e4165f2f4e6660a046693b5360107e55560719 Mon Sep 17 00:00:00 2001 From: Romain Lebbadi-Breteau Date: Thu, 8 Aug 2024 22:13:22 -0400 Subject: [PATCH] ENH: Add an option to prevent stripping extra whitespaces in pd.read_html Co-authored-by: Derekt2 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/html.py | 28 +++++++++++++++++++++++++++- pandas/tests/io/test_html.py | 20 ++++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ee9d18d0c7ce2a..0b2ed47a2406dc 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -43,6 +43,7 @@ Other enhancements - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) +- :func:`read_html` now accepts a ``strip_whitespace`` argument to decide if extra whitespaces should be trimmed in HTML tables (:issue:`24766`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) diff --git a/pandas/io/html.py b/pandas/io/html.py index c9897f628fdc95..5a4ffd9d479e2a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -172,6 +172,11 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 + strip_whitespace : bool + Whether table row values should have all extra whitespaces stripped to + a single space. + .. 
versionadded:: 3.0.0 + Attributes ---------- io : str or file-like @@ -196,6 +201,11 @@ class _HtmlFrameParser: .. versionadded:: 1.5.0 + strip_whitespace : bool + Whether table row values should have all extra whitespaces stripped to + a single space. + .. versionadded:: 3.0.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -222,6 +232,7 @@ def __init__( displayed_only: bool, extract_links: Literal[None, "header", "footer", "body", "all"], storage_options: StorageOptions = None, + strip_whitespace: bool = True, ) -> None: self.io = io self.match = match @@ -230,6 +241,7 @@ def __init__( self.displayed_only = displayed_only self.extract_links = extract_links self.storage_options = storage_options + self.strip_whitespace = strip_whitespace def parse_tables(self): """ @@ -506,10 +518,15 @@ def _expand_colspan_rowspan( index += 1 # Append the text from this , colspan times - text = _remove_whitespace(self._text_getter(td)) + if self.strip_whitespace: + text = _remove_whitespace(self._text_getter(td)) + else: + text = self._text_getter(td) + if self.extract_links in ("all", section): href = self._href_getter(td) text = (text, href) + rowspan = int(self._attr_getter(td, "rowspan") or 1) colspan = int(self._attr_getter(td, "colspan") or 1) @@ -944,6 +961,7 @@ def _parse( displayed_only, extract_links, storage_options, + strip_whitespace, **kwargs, ): flavor = _validate_flavor(flavor) @@ -960,6 +978,7 @@ def _parse( displayed_only, extract_links, storage_options, + strip_whitespace, ) try: @@ -1027,6 +1046,7 @@ def read_html( extract_links: Literal[None, "header", "footer", "body", "all"] = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions = None, + strip_whitespace: bool = True, ) -> list[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -1147,6 +1167,11 @@ def read_html( .. 
versionadded:: 2.1.0 + strip_whitespace : bool, default True + Whether to collapse extra whitespace in table cell values to + a single space. + .. versionadded:: 3.0.0 + Returns ------- dfs @@ -1227,4 +1252,5 @@ def read_html( extract_links=extract_links, dtype_backend=dtype_backend, storage_options=storage_options, + strip_whitespace=strip_whitespace, ) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 164646aedf4647..da416097a4b88e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1646,3 +1646,23 @@ def test_style_tag(self, flavor_read_html): result = flavor_read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) + + def test_strip_whitespace(self, flavor_read_html): + # GH 24766 + data = """ + + + + + +
Field 1 +Field 2Value 1 +Value 2
+ """ + result_strip = flavor_read_html(StringIO(data))[0] + expected_strip = DataFrame([["Field 1 Field 2", "Value 1 Value 2"]]) + tm.assert_frame_equal(result_strip, expected_strip) + + result_nostrip = flavor_read_html(StringIO(data), strip_whitespace=False)[0] + expected_nostrip = DataFrame([["Field 1\nField 2", "Value 1\nValue 2"]]) + tm.assert_frame_equal(result_nostrip, expected_nostrip)