From 73d15a7632e1b555defcc7942e5f629161626a4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 20 Sep 2022 14:45:59 -0400 Subject: [PATCH] REGR: TextIOWrapper raising an error in read_csv (#48651) * REGR: TextIOWrapper raising an error in read_csv * pyupgrade * do not try to seek on unseekable buffers * unseekable buffer might also have read ahead * safer alternative: do not mess with internal/private(?) buffer of TextIOWrapper (effectively applies the shortcut only to files pandas opens) --- doc/source/whatsnew/v1.5.1.rst | 2 +- pandas/io/parsers/c_parser_wrapper.py | 12 ------------ pandas/io/parsers/readers.py | 11 +++++++++++ pandas/tests/io/parser/common/test_common_basic.py | 14 ++++++++++++++ 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index f8069b5476d9e..9d40d9118db32 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 99051ec661413..6e4ea85548230 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -2,7 +2,6 @@ from collections import defaultdict import inspect -from io import TextIOWrapper from typing import ( TYPE_CHECKING, Hashable, @@ -67,17 +66,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # Have to pass int, would break tests using TextReader directly otherwise :( kwds["on_bad_lines"] = self.on_bad_lines.value - # c-engine can cope with utf-8 bytes. 
Remove TextIOWrapper when its errors - # policy is the same as the one given to read_csv - if ( - isinstance(src, TextIOWrapper) - and src.encoding == "utf-8" - and (src.errors or "strict") == kwds["encoding_errors"] - ): - # error: Incompatible types in assignment (expression has type "BinaryIO", - # variable has type "ReadCsvBuffer[str]") - src = src.buffer # type: ignore[assignment] - for key in ( "storage_options", "encoding", diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 20122d69748aa..eaec4c6bd5991 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -60,6 +60,7 @@ from pandas.io.common import ( IOHandles, get_handle, + stringify_path, validate_header_arg, ) from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper @@ -1727,6 +1728,16 @@ def _make_engine( if engine == "pyarrow": is_text = False mode = "rb" + elif ( + engine == "c" + and self.options.get("encoding", "utf-8") == "utf-8" + and isinstance(stringify_path(f), str) + ): + # c engine can decode utf-8 bytes, adding TextIOWrapper makes + # the c-engine especially for memory_map=True far slower + is_text = False + if "b" not in mode: + mode += "b" self.handles = get_handle( f, mode, diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index a7cdc3c1a84d2..359b059252556 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -928,3 +928,17 @@ def test_read_table_posargs_deprecation(all_parsers): "except for the argument 'filepath_or_buffer' will be keyword-only" ) parser.read_table_check_warnings(FutureWarning, msg, data, " ") + + +def test_read_seek(all_parsers): + # GH48646 + parser = all_parsers + prefix = "### DATA\n" + content = "nkey,value\ntables,rectangular\n" + with tm.ensure_clean() as path: + Path(path).write_text(prefix + content) + with open(path, encoding="utf-8") as file: + file.readline() 
+ actual = parser.read_csv(file) + expected = parser.read_csv(StringIO(content)) + tm.assert_frame_equal(actual, expected)