From 5fa94765d68aebf32e738d64a4031dd182c46bf4 Mon Sep 17 00:00:00 2001
From: Philipp Hoffmann
Date: Thu, 28 Mar 2024 19:10:56 +0100
Subject: [PATCH] BUG: #57954 encoding ignored for filelike (#57968)

* add exception when encodings exist and do not match

* add exception when encodings exist and do not match

* add test for mismatching encodings warning

* add test for mismatching encodings warning

* add encoding for python 3.10+

* move to _check_file; invert var and condition
---
 pandas/io/parsers/readers.py                 | 11 +++++++++++
 pandas/tests/io/parser/test_c_parser_only.py |  2 +-
 pandas/tests/io/parser/test_textreader.py    |  7 +++++++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index b234a6b78e051..7ecd8cd6d5012 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -1310,6 +1310,16 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
             raise ValueError(
                 "The 'python' engine cannot iterate through this file buffer."
             )
+        if hasattr(f, "encoding"):
+            file_encoding = f.encoding
+            orig_reader_enc = self.orig_options.get("encoding", None)
+            any_none = file_encoding is None or orig_reader_enc is None
+            if file_encoding != orig_reader_enc and not any_none:
+                file_path = getattr(f, "name", None)
+                raise ValueError(
+                    f"The specified reader encoding {orig_reader_enc} is different "
+                    f"from the encoding {file_encoding} of file {file_path}."
+                )
 
     def _clean_options(
         self, options: dict[str, Any], engine: CSVEngine
@@ -1485,6 +1495,7 @@ def _make_engine(
             "pyarrow": ArrowParserWrapper,
             "python-fwf": FixedWidthFieldParser,
         }
+
         if engine not in mapping:
             raise ValueError(
                 f"Unknown engine: {engine} (valid options are {mapping.keys()})"
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 090235c862a2a..98a460f221592 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -511,7 +511,7 @@ def __next__(self):
 def test_buffer_rd_bytes_bad_unicode(c_parser_only):
     # see gh-22748
     t = BytesIO(b"\xb0")
-    t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
+    t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape")
     msg = "'utf-8' codec can't encode character"
     with pytest.raises(UnicodeError, match=msg):
         c_parser_only.read_csv(t, encoding="UTF-8")
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index 6aeed2377a3aa..eeb783f1957b7 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -48,6 +48,13 @@ def test_StringIO(self, csv_path):
         reader = TextReader(src, header=None)
         reader.read()
 
+    def test_encoding_mismatch_warning(self, csv_path):
+        # GH-57954
+        with open(csv_path, encoding="UTF-8") as f:
+            msg = "latin1 is different from the encoding"
+            with pytest.raises(ValueError, match=msg):
+                read_csv(f, encoding="latin1")
+
     def test_string_factorize(self):
         # should this be optional?
         data = "a\nb\na\nb\na"