Skip to content

Commit

Permalink
REGR: TextIOWrapper raising an error in read_csv (#48651)
Browse files Browse the repository at this point in the history
* REGR: TextIOWrapper raising an error in read_csv

* pyupgrade

* do not try to seek on unseekable buffers

* unseekable buffer might also have read ahead

* safer alternative: do not mess with internal/private(?) buffer of TextIOWrapper (effectively applies the shortcut only to files pandas opens)
  • Loading branch information
twoertwein authored Sep 20, 2022
1 parent 744b846 commit 73d15a7
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 13 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
-
- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`)
-

.. ---------------------------------------------------------------------------
Expand Down
12 changes: 0 additions & 12 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from collections import defaultdict
import inspect
from io import TextIOWrapper
from typing import (
TYPE_CHECKING,
Hashable,
Expand Down Expand Up @@ -67,17 +66,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
# Have to pass int, would break tests using TextReader directly otherwise :(
kwds["on_bad_lines"] = self.on_bad_lines.value

# c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors
# policy is the same as the one given to read_csv
if (
isinstance(src, TextIOWrapper)
and src.encoding == "utf-8"
and (src.errors or "strict") == kwds["encoding_errors"]
):
# error: Incompatible types in assignment (expression has type "BinaryIO",
# variable has type "ReadCsvBuffer[str]")
src = src.buffer # type: ignore[assignment]

for key in (
"storage_options",
"encoding",
Expand Down
11 changes: 11 additions & 0 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
from pandas.io.common import (
IOHandles,
get_handle,
stringify_path,
validate_header_arg,
)
from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
Expand Down Expand Up @@ -1727,6 +1728,16 @@ def _make_engine(
if engine == "pyarrow":
is_text = False
mode = "rb"
elif (
engine == "c"
and self.options.get("encoding", "utf-8") == "utf-8"
and isinstance(stringify_path(f), str)
):
# the c engine can decode utf-8 bytes itself; adding a TextIOWrapper makes
# the c engine far slower, especially for memory_map=True
is_text = False
if "b" not in mode:
mode += "b"
self.handles = get_handle(
f,
mode,
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,3 +928,17 @@ def test_read_table_posargs_deprecation(all_parsers):
"except for the argument 'filepath_or_buffer' will be keyword-only"
)
parser.read_table_check_warnings(FutureWarning, msg, data, " ")


def test_read_seek(all_parsers):
    """Check that read_csv starts at the current position of an open handle.

    A file is written with one prefix line followed by CSV content. The
    prefix line is consumed with ``readline()`` before the handle is passed
    to ``read_csv``; the result must equal parsing the CSV content alone.
    """
    # GH48646
    parser = all_parsers
    prefix = "### DATA\n"
    content = "nkey,value\ntables,rectangular\n"
    with tm.ensure_clean() as path:
        # Write with the same explicit encoding used to read the file back,
        # so the test does not depend on the platform's default encoding.
        Path(path).write_text(prefix + content, encoding="utf-8")
        with open(path, encoding="utf-8") as file:
            # Advance past the prefix line: the handle has now been read from.
            file.readline()
            actual = parser.read_csv(file)
        expected = parser.read_csv(StringIO(content))
    tm.assert_frame_equal(actual, expected)

0 comments on commit 73d15a7

Please sign in to comment.