REGR: TextIOWrapper raising an error in read_csv (#48651)

* REGR: TextIOWrapper raising an error in read_csv * pyupgrade * do not try to seek on unseekable buffers * unseekable buffer might also have read ahead * safer alternative: do not mess with internal/private(?) buffer of TextIOWrapper (effectively applies the shortcut only to files pandas opens)
pandas-dev · Sep 20, 2022 · 73d15a7 · 73d15a7
1 parent 744b846
commit 73d15a7
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 13 deletions.
diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
@@ -14,7 +14,7 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
--
+- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using an UTF-8 file handle that was already read from (:issue:`48646`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -2,7 +2,6 @@
 
 from collections import defaultdict
 import inspect
-from io import TextIOWrapper
 from typing import (
     TYPE_CHECKING,
     Hashable,
@@ -67,17 +66,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
         # Have to pass int, would break tests using TextReader directly otherwise :(
         kwds["on_bad_lines"] = self.on_bad_lines.value
 
-        # c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors
-        # policy is the same as the one given to read_csv
-        if (
-            isinstance(src, TextIOWrapper)
-            and src.encoding == "utf-8"
-            and (src.errors or "strict") == kwds["encoding_errors"]
-        ):
-            # error: Incompatible types in assignment (expression has type "BinaryIO",
-            # variable has type "ReadCsvBuffer[str]")
-            src = src.buffer  # type: ignore[assignment]
-
         for key in (
             "storage_options",
             "encoding",

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -60,6 +60,7 @@
 from pandas.io.common import (
     IOHandles,
     get_handle,
+    stringify_path,
     validate_header_arg,
 )
 from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
@@ -1727,6 +1728,16 @@ def _make_engine(
             if engine == "pyarrow":
                 is_text = False
                 mode = "rb"
+            elif (
+                engine == "c"
+                and self.options.get("encoding", "utf-8") == "utf-8"
+                and isinstance(stringify_path(f), str)
+            ):
+                # c engine can decode utf-8 bytes, adding TextIOWrapper makes
+                # the c-engine especially for memory_map=True far slower
+                is_text = False
+                if "b" not in mode:
+                    mode += "b"
             self.handles = get_handle(
                 f,
                 mode,

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -928,3 +928,17 @@ def test_read_table_posargs_deprecation(all_parsers):
         "except for the argument 'filepath_or_buffer' will be keyword-only"
     )
     parser.read_table_check_warnings(FutureWarning, msg, data, " ")
+
+
+def test_read_seek(all_parsers):
+    # GH48646
+    parser = all_parsers
+    prefix = "### DATA\n"
+    content = "nkey,value\ntables,rectangular\n"
+    with tm.ensure_clean() as path:
+        Path(path).write_text(prefix + content)
+        with open(path, encoding="utf-8") as file:
+            file.readline()
+            actual = parser.read_csv(file)
+        expected = parser.read_csv(StringIO(content))
+    tm.assert_frame_equal(actual, expected)