Skip to content

Commit

Permalink
BUG: pandas-dev#57954 encoding ignored for filelike (pandas-dev#57968)
Browse files Browse the repository at this point in the history
* add exception when encodings exist and do not match

* add exception when encodings exist and do not match

* add test for mismatching encodings warning

* add test for mismatching encodings warning

* add encoding for python 3.10+

* move to _check_file; invert var and condition
  • Loading branch information
dontgoto authored and pmhatre1 committed May 7, 2024
1 parent a494109 commit 5fa9476
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 1 deletion.
11 changes: 11 additions & 0 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,6 +1310,16 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
raise ValueError(
"The 'python' engine cannot iterate through this file buffer."
)
if hasattr(f, "encoding"):
file_encoding = f.encoding
orig_reader_enc = self.orig_options.get("encoding", None)
any_none = file_encoding is None or orig_reader_enc is None
if file_encoding != orig_reader_enc and not any_none:
file_path = getattr(f, "name", None)
raise ValueError(
f"The specified reader encoding {orig_reader_enc} is different "
f"from the encoding {file_encoding} of file {file_path}."
)

def _clean_options(
self, options: dict[str, Any], engine: CSVEngine
Expand Down Expand Up @@ -1485,6 +1495,7 @@ def _make_engine(
"pyarrow": ArrowParserWrapper,
"python-fwf": FixedWidthFieldParser,
}

if engine not in mapping:
raise ValueError(
f"Unknown engine: {engine} (valid options are {mapping.keys()})"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ def __next__(self):
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
# see gh-22748
t = BytesIO(b"\xb0")
t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape")
msg = "'utf-8' codec can't encode character"
with pytest.raises(UnicodeError, match=msg):
c_parser_only.read_csv(t, encoding="UTF-8")
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ def test_StringIO(self, csv_path):
reader = TextReader(src, header=None)
reader.read()

def test_encoding_mismatch_warning(self, csv_path):
    # GH-57954
    # The handle is opened as UTF-8 while read_csv is asked for latin1.
    # Despite "warning" in the test name, the expectation checked here is
    # a raised ValueError whose message matches the pattern below.
    expected_msg = "latin1 is different from the encoding"
    with open(csv_path, encoding="UTF-8") as handle, pytest.raises(
        ValueError, match=expected_msg
    ):
        read_csv(handle, encoding="latin1")

def test_string_factorize(self):
# should this be optional?
data = "a\nb\na\nb\na"
Expand Down

0 comments on commit 5fa9476

Please sign in to comment.