From a7d96faf2d302bc854931455f13ebf22ed1e8287 Mon Sep 17 00:00:00 2001
From: Terji Petersen
Date: Thu, 25 Jun 2020 18:38:08 +0100
Subject: [PATCH] TYP: make the type annotations of read_csv & read_table discoverable (#34976)

---
Review notes (text between the "---" line and the diffstat is commentary and
is not part of the commit):

* read_table was previously implemented as `return read_csv(**locals())`.
  read_csv compares sep/delimiter against a hard-coded ",", so with
  read_table's untouched default sep="\t":
    - read_table(..., delim_whitespace=True) raised ValueError
      ("Specified a delimiter with both sep and delim_whitespace=True"),
    - read_table(..., dialect=...) computed sep_override=False, i.e. the
      default tab was treated as an explicitly requested separator.
  Both public functions now forward to _read_csv_or_table() together with
  their own default separator, which is what the removed
  _make_parser_function(name, default_sep) factory used to provide.
* Public signatures, the Appender-generated docstrings and the function name
  reported in TypeError messages are unchanged.
* The blob ids on the "index" lines of pandas/io/parsers.py and
  pandas/tests/io/parser/test_common.py predate this revision of the patch.

 doc/source/whatsnew/v1.1.0.rst        |   1 +
 pandas/io/parsers.py                  | 374 ++++++++++++++------------
 pandas/tests/io/parser/test_common.py |  42 +++
 3 files changed, 247 insertions(+), 170 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 10dac7e2863f9..6808737d4fa5e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -1025,6 +1025,7 @@ I/O
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
 - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`)
 - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`)
+- ``TypeError`` exceptions raised by :meth:`read_csv` and :meth:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`)
 - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
 - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 62347f7110d76..c427d3a198b10 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -530,176 +530,210 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
 _deprecated_args: Set[str] = set()
 
 
-def _make_parser_function(name, default_sep=","):
-    def parser_f(
-        filepath_or_buffer: FilePathOrBuffer,
-        sep=default_sep,
-        delimiter=None,
-        # Column and Index Locations and Names
-        header="infer",
-        names=None,
-        index_col=None,
-        usecols=None,
-        squeeze=False,
-        prefix=None,
-        mangle_dupe_cols=True,
-        # General Parsing Configuration
-        dtype=None,
-        engine=None,
-        converters=None,
-        true_values=None,
-        false_values=None,
-        skipinitialspace=False,
-        skiprows=None,
-        skipfooter=0,
-        nrows=None,
-        # NA and Missing Data Handling
-        na_values=None,
-        keep_default_na=True,
-        na_filter=True,
-        verbose=False,
-        skip_blank_lines=True,
-        # Datetime Handling
-        parse_dates=False,
-        infer_datetime_format=False,
-        keep_date_col=False,
-        date_parser=None,
-        dayfirst=False,
-        cache_dates=True,
-        # Iteration
-        iterator=False,
-        chunksize=None,
-        # Quoting, Compression, and File Format
-        compression="infer",
-        thousands=None,
-        decimal: str = ".",
-        lineterminator=None,
-        quotechar='"',
-        quoting=csv.QUOTE_MINIMAL,
-        doublequote=True,
-        escapechar=None,
-        comment=None,
-        encoding=None,
-        dialect=None,
-        # Error Handling
-        error_bad_lines=True,
-        warn_bad_lines=True,
-        # Internal
-        delim_whitespace=False,
-        low_memory=_c_parser_defaults["low_memory"],
-        memory_map=False,
-        float_precision=None,
-    ):
-
-        # gh-23761
-        #
-        # When a dialect is passed, it overrides any of the overlapping
-        # parameters passed in directly. We don't want to warn if the
-        # default parameters were passed in (since it probably means
-        # that the user didn't pass them in explicitly in the first place).
-        #
-        # "delimiter" is the annoying corner case because we alias it to
-        # "sep" before doing comparison to the dialect values later on.
-        # Thus, we need a flag to indicate that we need to "override"
-        # the comparison to dialect values by checking if default values
-        # for BOTH "delimiter" and "sep" were provided.
-        if dialect is not None:
-            sep_override = delimiter is None and sep == default_sep
-            kwds = dict(sep_override=sep_override)
-        else:
-            kwds = dict()
-
-        # Alias sep -> delimiter.
-        if delimiter is None:
-            delimiter = sep
-
-        if delim_whitespace and delimiter != default_sep:
-            raise ValueError(
-                "Specified a delimiter with both sep and "
-                "delim_whitespace=True; you can only specify one."
-            )
-
-        if engine is not None:
-            engine_specified = True
-        else:
-            engine = "c"
-            engine_specified = False
-
-        kwds.update(
-            delimiter=delimiter,
-            engine=engine,
-            dialect=dialect,
-            compression=compression,
-            engine_specified=engine_specified,
-            doublequote=doublequote,
-            escapechar=escapechar,
-            quotechar=quotechar,
-            quoting=quoting,
-            skipinitialspace=skipinitialspace,
-            lineterminator=lineterminator,
-            header=header,
-            index_col=index_col,
-            names=names,
-            prefix=prefix,
-            skiprows=skiprows,
-            skipfooter=skipfooter,
-            na_values=na_values,
-            true_values=true_values,
-            false_values=false_values,
-            keep_default_na=keep_default_na,
-            thousands=thousands,
-            comment=comment,
-            decimal=decimal,
-            parse_dates=parse_dates,
-            keep_date_col=keep_date_col,
-            dayfirst=dayfirst,
-            date_parser=date_parser,
-            cache_dates=cache_dates,
-            nrows=nrows,
-            iterator=iterator,
-            chunksize=chunksize,
-            converters=converters,
-            dtype=dtype,
-            usecols=usecols,
-            verbose=verbose,
-            encoding=encoding,
-            squeeze=squeeze,
-            memory_map=memory_map,
-            float_precision=float_precision,
-            na_filter=na_filter,
-            delim_whitespace=delim_whitespace,
-            warn_bad_lines=warn_bad_lines,
-            error_bad_lines=error_bad_lines,
-            low_memory=low_memory,
-            mangle_dupe_cols=mangle_dupe_cols,
-            infer_datetime_format=infer_datetime_format,
-            skip_blank_lines=skip_blank_lines,
-        )
-
-        return _read(filepath_or_buffer, kwds)
-
-    parser_f.__name__ = name
-
-    return parser_f
-
-
-read_csv = _make_parser_function("read_csv", default_sep=",")
-read_csv = Appender(
-    _doc_read_csv_and_table.format(
-        func_name="read_csv",
-        summary="Read a comma-separated values (csv) file into DataFrame.",
-        _default_sep="','",
-    )
-)(read_csv)
-
-read_table = _make_parser_function("read_table", default_sep="\t")
-read_table = Appender(
-    _doc_read_csv_and_table.format(
-        func_name="read_table",
-        summary="Read general delimited file into DataFrame.",
-        _default_sep=r"'\\t' (tab-stop)",
-    )
-)(read_table)
+def _read_csv_or_table(
+    default_sep,
+    filepath_or_buffer,
+    sep,
+    delimiter,
+    dialect,
+    delim_whitespace,
+    engine,
+    **kwds,
+):
+    """
+    Resolve separator and engine defaults, then hand off to ``_read``.
+
+    Shared body of ``read_csv`` and ``read_table``. ``default_sep`` is the
+    signature default of ``sep`` in the calling function; the dialect and
+    ``delim_whitespace`` checks compare against it to tell an explicit
+    separator from an untouched default.
+
+    Raises
+    ------
+    ValueError
+        If ``delim_whitespace`` is set together with a non-default separator.
+    """
+    # gh-23761
+    #
+    # When a dialect is passed, it overrides any of the overlapping
+    # parameters passed in directly. We don't want to warn if the
+    # default parameters were passed in (since it probably means
+    # that the user didn't pass them in explicitly in the first place).
+    #
+    # "delimiter" is the annoying corner case because we alias it to
+    # "sep" before doing comparison to the dialect values later on.
+    # Thus, we need a flag to indicate that we need to "override"
+    # the comparison to dialect values by checking if default values
+    # for BOTH "delimiter" and "sep" were provided.
+    if dialect is not None:
+        kwds["sep_override"] = delimiter is None and sep == default_sep
+
+    # Alias sep -> delimiter.
+    if delimiter is None:
+        delimiter = sep
+
+    if delim_whitespace and delimiter != default_sep:
+        raise ValueError(
+            "Specified a delimiter with both sep and "
+            "delim_whitespace=True; you can only specify one."
+        )
+
+    if engine is not None:
+        engine_specified = True
+    else:
+        engine = "c"
+        engine_specified = False
+
+    kwds.update(
+        delimiter=delimiter,
+        dialect=dialect,
+        delim_whitespace=delim_whitespace,
+        engine=engine,
+        engine_specified=engine_specified,
+    )
+
+    return _read(filepath_or_buffer, kwds)
+
+
+@Appender(
+    _doc_read_csv_and_table.format(
+        func_name="read_csv",
+        summary="Read a comma-separated values (csv) file into DataFrame.",
+        _default_sep="','",
+    )
+)
+def read_csv(
+    filepath_or_buffer: FilePathOrBuffer,
+    sep=",",
+    delimiter=None,
+    # Column and Index Locations and Names
+    header="infer",
+    names=None,
+    index_col=None,
+    usecols=None,
+    squeeze=False,
+    prefix=None,
+    mangle_dupe_cols=True,
+    # General Parsing Configuration
+    dtype=None,
+    engine=None,
+    converters=None,
+    true_values=None,
+    false_values=None,
+    skipinitialspace=False,
+    skiprows=None,
+    skipfooter=0,
+    nrows=None,
+    # NA and Missing Data Handling
+    na_values=None,
+    keep_default_na=True,
+    na_filter=True,
+    verbose=False,
+    skip_blank_lines=True,
+    # Datetime Handling
+    parse_dates=False,
+    infer_datetime_format=False,
+    keep_date_col=False,
+    date_parser=None,
+    dayfirst=False,
+    cache_dates=True,
+    # Iteration
+    iterator=False,
+    chunksize=None,
+    # Quoting, Compression, and File Format
+    compression="infer",
+    thousands=None,
+    decimal: str = ".",
+    lineterminator=None,
+    quotechar='"',
+    quoting=csv.QUOTE_MINIMAL,
+    doublequote=True,
+    escapechar=None,
+    comment=None,
+    encoding=None,
+    dialect=None,
+    # Error Handling
+    error_bad_lines=True,
+    warn_bad_lines=True,
+    # Internal
+    delim_whitespace=False,
+    low_memory=_c_parser_defaults["low_memory"],
+    memory_map=False,
+    float_precision=None,
+):
+    # locals() is taken before any other name is bound: exactly the parameters.
+    return _read_csv_or_table(",", **locals())
+
+
+@Appender(
+    _doc_read_csv_and_table.format(
+        func_name="read_table",
+        summary="Read general delimited file into DataFrame.",
+        _default_sep=r"'\\t' (tab-stop)",
+    )
+)
+def read_table(
+    filepath_or_buffer: FilePathOrBuffer,
+    sep="\t",
+    delimiter=None,
+    # Column and Index Locations and Names
+    header="infer",
+    names=None,
+    index_col=None,
+    usecols=None,
+    squeeze=False,
+    prefix=None,
+    mangle_dupe_cols=True,
+    # General Parsing Configuration
+    dtype=None,
+    engine=None,
+    converters=None,
+    true_values=None,
+    false_values=None,
+    skipinitialspace=False,
+    skiprows=None,
+    skipfooter=0,
+    nrows=None,
+    # NA and Missing Data Handling
+    na_values=None,
+    keep_default_na=True,
+    na_filter=True,
+    verbose=False,
+    skip_blank_lines=True,
+    # Datetime Handling
+    parse_dates=False,
+    infer_datetime_format=False,
+    keep_date_col=False,
+    date_parser=None,
+    dayfirst=False,
+    cache_dates=True,
+    # Iteration
+    iterator=False,
+    chunksize=None,
+    # Quoting, Compression, and File Format
+    compression="infer",
+    thousands=None,
+    decimal: str = ".",
+    lineterminator=None,
+    quotechar='"',
+    quoting=csv.QUOTE_MINIMAL,
+    doublequote=True,
+    escapechar=None,
+    comment=None,
+    encoding=None,
+    dialect=None,
+    # Error Handling
+    error_bad_lines=True,
+    warn_bad_lines=True,
+    # Internal
+    delim_whitespace=False,
+    low_memory=_c_parser_defaults["low_memory"],
+    memory_map=False,
+    float_precision=None,
+):
+    # Not routed through read_csv: the delim_whitespace and dialect checks must
+    # see "\t", not ",", as the separator nobody asked for.
+    return _read_csv_or_table("\t", **locals())
 
 
 def read_fwf(
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index e6e868689b060..12e73bae40eac 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -5,6 +5,7 @@
 import codecs
 import csv
 from datetime import datetime
+from inspect import signature
 from io import StringIO
 import os
 import platform
@@ -2071,6 +2072,47 @@ def test_read_csv_raises_on_header_prefix(all_parsers):
         parser.read_csv(s, header=0, prefix="_X")
 
 
+def test_unexpected_keyword_parameter_exception(all_parsers):
+    # GH-34976
+    parser = all_parsers
+
+    msg = "{}\\(\\) got an unexpected keyword argument 'foo'"
+    with pytest.raises(TypeError, match=msg.format("read_csv")):
+        parser.read_csv("foo.csv", foo=1)
+    with pytest.raises(TypeError, match=msg.format("read_table")):
+        parser.read_table("foo.tsv", foo=1)
+
+
+def test_read_table_same_signature_as_read_csv(all_parsers):
+    # GH-34976
+    parser = all_parsers
+
+    table_sign = signature(parser.read_table)
+    csv_sign = signature(parser.read_csv)
+
+    assert table_sign.parameters.keys() == csv_sign.parameters.keys()
+    assert table_sign.return_annotation == csv_sign.return_annotation
+
+    for key, csv_param in csv_sign.parameters.items():
+        table_param = table_sign.parameters[key]
+        if key == "sep":
+            assert csv_param.default == ","
+            assert table_param.default == "\t"
+            assert table_param.annotation == csv_param.annotation
+            assert table_param.kind == csv_param.kind
+        else:
+            assert table_param == csv_param
+
+
+def test_read_table_delim_whitespace_default_sep(all_parsers):
+    # read_table's own "\t" default must not count as an explicit separator
+    f = StringIO("a  b  c\n1 -2 -3\n4  5   6")
+    parser = all_parsers
+    result = parser.read_table(f, delim_whitespace=True)
+    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
+    tm.assert_frame_equal(result, expected)
+
+
 def test_read_table_equivalency_to_read_csv(all_parsers):
     # see gh-21948
     # As of 0.25.0, read_table is undeprecated