Skip to content

Commit

Permalink
ENH: Add an option to prevent stripping extra whitespaces in pd.read_…
Browse files Browse the repository at this point in the history
…html

Co-authored-by: Derekt2 <[email protected]>
  • Loading branch information
RomainL972 and Derekt2 committed Aug 12, 2024
1 parent a3f2d48 commit 70e4165
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Other enhancements
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
- :func:`read_html` now accepts a ``strip_whitespace`` argument to control whether extra whitespace in HTML table cells is collapsed to a single space (:issue:`24766`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
Expand Down
28 changes: 27 additions & 1 deletion pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@ class _HtmlFrameParser:
.. versionadded:: 1.5.0
strip_whitespace : bool
Whether table row values should have all extra whitespaces stripped to
a single space.
.. versionadded:: 3.0.0
Attributes
----------
io : str or file-like
Expand All @@ -196,6 +201,11 @@ class _HtmlFrameParser:
.. versionadded:: 1.5.0
strip_whitespace : bool
Whether table row values should have all extra whitespaces stripped to
a single space.
.. versionadded:: 3.0.0
Notes
-----
To subclass this class effectively you must override the following methods:
Expand All @@ -222,6 +232,7 @@ def __init__(
displayed_only: bool,
extract_links: Literal[None, "header", "footer", "body", "all"],
storage_options: StorageOptions = None,
strip_whitespace: bool = True,
) -> None:
self.io = io
self.match = match
Expand All @@ -230,6 +241,7 @@ def __init__(
self.displayed_only = displayed_only
self.extract_links = extract_links
self.storage_options = storage_options
self.strip_whitespace = strip_whitespace

def parse_tables(self):
"""
Expand Down Expand Up @@ -506,10 +518,15 @@ def _expand_colspan_rowspan(
index += 1

# Append the text from this <td>, colspan times
text = _remove_whitespace(self._text_getter(td))
if self.strip_whitespace:
text = _remove_whitespace(self._text_getter(td))
else:
text = self._text_getter(td)

if self.extract_links in ("all", section):
href = self._href_getter(td)
text = (text, href)

rowspan = int(self._attr_getter(td, "rowspan") or 1)
colspan = int(self._attr_getter(td, "colspan") or 1)

Expand Down Expand Up @@ -944,6 +961,7 @@ def _parse(
displayed_only,
extract_links,
storage_options,
strip_whitespace,
**kwargs,
):
flavor = _validate_flavor(flavor)
Expand All @@ -960,6 +978,7 @@ def _parse(
displayed_only,
extract_links,
storage_options,
strip_whitespace,
)

try:
Expand Down Expand Up @@ -1027,6 +1046,7 @@ def read_html(
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
storage_options: StorageOptions = None,
strip_whitespace: bool = True,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
Expand Down Expand Up @@ -1147,6 +1167,11 @@ def read_html(
.. versionadded:: 2.1.0
strip_whitespace : bool
Whether table row values should have all extra whitespaces stripped to
a single space.
.. versionadded:: 3.0.0
Returns
-------
dfs
Expand Down Expand Up @@ -1227,4 +1252,5 @@ def read_html(
extract_links=extract_links,
dtype_backend=dtype_backend,
storage_options=storage_options,
strip_whitespace=strip_whitespace,
)
20 changes: 20 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -1646,3 +1646,23 @@ def test_style_tag(self, flavor_read_html):
result = flavor_read_html(StringIO(data))[0]
expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
tm.assert_frame_equal(result, expected)

def test_strip_whitespace(self, flavor_read_html):
    """Check that ``strip_whitespace`` toggles whitespace collapsing in cells.

    Each cell of the sample table contains an embedded newline. The default
    call is expected to yield cells with that newline collapsed to a single
    space; passing ``strip_whitespace=False`` is expected to keep the newline.
    """
    # GH 24766
    html = """
<table>
<tr>
<td>Field 1
Field 2</td>
<td>Value 1
Value 2</td>
</tr>
</table>
"""
    # (extra keyword arguments, expected cell values), checked in order:
    # first the default behaviour, then the explicit opt-out.
    cases = [
        ({}, [["Field 1 Field 2", "Value 1 Value 2"]]),
        (
            {"strip_whitespace": False},
            [["Field 1\nField 2", "Value 1\nValue 2"]],
        ),
    ]
    for extra_kwargs, expected_rows in cases:
        # A fresh StringIO per call, since each parse consumes the buffer.
        parsed = flavor_read_html(StringIO(html), **extra_kwargs)[0]
        wanted = DataFrame(expected_rows)
        tm.assert_frame_equal(parsed, wanted)

0 comments on commit 70e4165

Please sign in to comment.