From 3c5998e723ade572f62dbe66215c6d8e5fefc5e1 Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 10:41:22 -0400 Subject: [PATCH 1/8] ENH: Support passing compression args to gzip and bz2 This commit closes GH#33196 but takes a more generic approach than the suggested solution. Instead of providing a 'fast' kwarg or global compression level setting, this commit extends the ability to pass compression settings as a dict to the gzip and bz2 compression methods. In this way, if the user wants faster compression, they can pass compression={'method': 'gzip', 'compresslevel': 1} rather than just compression='gzip'. Note: For the API to be consistent when passing paths vs. filelikes, GzipFile and gzip.open() must accept the same kwargs. --- doc/source/whatsnew/v1.1.0.rst | 6 ++++++ pandas/core/generic.py | 9 ++++++++- pandas/io/common.py | 18 ++++++++++++------ pandas/tests/io/test_compression.py | 23 +++++++++++++++++++++++ 4 files changed, 49 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fc5893e401836..f7ed07848bb84 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -91,6 +91,12 @@ Other enhancements - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). +- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`, + and :meth:`DataFrame.to_json` now support passing a dict of + compression arguments when using the ``gzip`` and ``bz2`` protocols. + This can be used to set a custom compression level, e.g., + ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})`` + (:issue:`33196`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb6554bf2260c..2adfd2bb9a7b3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3096,7 +3096,8 @@ def to_csv( compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is 'zip' or inferred as 'zip', other entries passed as + and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as + one of the above, other entries passed as additional compression options. .. versionchanged:: 1.0.0 @@ -3105,6 +3106,12 @@ def to_csv( and other entries as additional compression options if compression mode is 'zip'. + .. versionchanged:: 1.1.0 + + Passing compression options as keys in dict is + supported for compression modes 'gzip' and 'bz2' + as well as 'zip'. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC diff --git a/pandas/io/common.py b/pandas/io/common.py index 0fce8f5382686..449321831cbef 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -351,8 +351,9 @@ def get_handle( 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise - no compression). If dict and compression mode is 'zip' or inferred as - 'zip', other entries passed as additional compression options. + no compression). If dict and compression mode is one of + {'zip', 'gzip', 'bz2'}, or inferred as one of the above, + other entries passed as additional compression options. .. versionchanged:: 1.0.0 @@ -360,6 +361,11 @@ def get_handle( and other keys as compression options if compression mode is 'zip'. + .. 
versionchanged:: 1.1.0 + + Passing compression options as keys in dict is now + supported for compression modes 'gzip' and 'bz2' as well as 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True @@ -397,16 +403,16 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode) + f = gzip.open(path_or_buf, mode, **compression_args) else: - f = gzip.GzipFile(fileobj=path_or_buf) + f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File(path_or_buf, mode) + f = bz2.BZ2File(path_or_buf, mode, **compression_args) else: - f = bz2.BZ2File(path_or_buf) + f = bz2.BZ2File(path_or_buf, **compression_args) # ZIP Compression elif compression == "zip": diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 841241d5124e0..708d7bdaf00ad 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -143,3 +143,26 @@ def test_with_missing_lzma_runtime(): """ ) subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_gzip_compression_level_path(obj, method): + """GH#33398 Ideally this test should be repeated for bz2 as well, + but that is not practical because a file size of >100k is needed to see any + size difference between bz2 compression settings.""" + with tm.ensure_clean() as path: + getattr(obj, method)(path, compression="gzip") + compressed_size_default = os.path.getsize(path) + getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1}) + compressed_size_fast = 
os.path.getsize(path) + assert compressed_size_default < compressed_size_fast From 28eecf262586df6519b701a8ccf9db6a811f23fd Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 12:55:03 -0400 Subject: [PATCH 2/8] Add test for bz2; mypy ignore non-typesafe use of kwargs --- pandas/io/common.py | 12 ++++++++---- pandas/tests/io/test_compression.py | 30 ++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 449321831cbef..30a1d5b6ea9c3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -403,16 +403,20 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode, **compression_args) + f = gzip.open( + path_or_buf, mode, **compression_args) # type: ignore else: - f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) + f = gzip.GzipFile( + fileobj=path_or_buf, **compression_args) # type: ignore # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File(path_or_buf, mode, **compression_args) + f = bz2.BZ2File( + path_or_buf, mode, **compression_args) # type: ignore else: - f = bz2.BZ2File(path_or_buf, **compression_args) + f = bz2.BZ2File( + path_or_buf, **compression_args) # type: ignore # ZIP Compression elif compression == "zip": diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 708d7bdaf00ad..609a898d1638c 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -156,13 +156,33 @@ def test_with_missing_lzma_runtime(): ], ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) -def test_gzip_compression_level_path(obj, method): - """GH#33398 Ideally this test should be repeated for bz2 as well, - but that is not practical because a file size of >100k is needed to see any - size difference between bz2 compression settings.""" +def test_gzip_compression_level(obj, method): + #GH33196 with 
tm.ensure_clean() as path: getattr(obj, method)(path, compression="gzip") compressed_size_default = os.path.getsize(path) - getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1}) + getattr(obj, method)( + path, compression={ "method": "gzip", "compresslevel": 1}) compressed_size_fast = os.path.getsize(path) assert compressed_size_default < compressed_size_fast + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_bzip_compression_level(obj, method): + """GH33196 bzip needs file size > 100k to show a size difference between + compression levels, so here we just check if the call works when + compression is passed as a dict. + """ + with tm.ensure_clean() as path: + getattr(obj, method)( + path, compression={ "method": "bz2", "compresslevel": 1}) \ No newline at end of file From fef23b9664cc2797d32e1887f0e46e84d89ac2be Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 12:56:03 -0400 Subject: [PATCH 3/8] black formatting --- pandas/io/common.py | 12 +++++------- pandas/tests/io/test_compression.py | 8 +++----- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 30a1d5b6ea9c3..8a3aafee70475 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -403,20 +403,18 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open( - path_or_buf, mode, **compression_args) # type: ignore + f = gzip.open(path_or_buf, mode, **compression_args) # type: ignore else: f = gzip.GzipFile( - fileobj=path_or_buf, **compression_args) # type: ignore + fileobj=path_or_buf, **compression_args + ) # type: ignore # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File( - path_or_buf, 
mode, **compression_args) # type: ignore + f = bz2.BZ2File(path_or_buf, mode, **compression_args) # type: ignore else: - f = bz2.BZ2File( - path_or_buf, **compression_args) # type: ignore + f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore # ZIP Compression elif compression == "zip": diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 609a898d1638c..59c9bd0a36d3d 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -157,12 +157,11 @@ def test_with_missing_lzma_runtime(): ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_gzip_compression_level(obj, method): - #GH33196 + # GH33196 with tm.ensure_clean() as path: getattr(obj, method)(path, compression="gzip") compressed_size_default = os.path.getsize(path) - getattr(obj, method)( - path, compression={ "method": "gzip", "compresslevel": 1}) + getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1}) compressed_size_fast = os.path.getsize(path) assert compressed_size_default < compressed_size_fast @@ -184,5 +183,4 @@ def test_bzip_compression_level(obj, method): compression is passed as a dict. 
""" with tm.ensure_clean() as path: - getattr(obj, method)( - path, compression={ "method": "bz2", "compresslevel": 1}) \ No newline at end of file + getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) From 1ec1f7d2693c218800d4c9347cf64def69aed907 Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 14:56:17 -0400 Subject: [PATCH 4/8] Move mypy ignore flag to same line --- pandas/io/common.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 8a3aafee70475..224cd6e4b50bc 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -405,9 +405,7 @@ def get_handle( if is_path: f = gzip.open(path_or_buf, mode, **compression_args) # type: ignore else: - f = gzip.GzipFile( - fileobj=path_or_buf, **compression_args - ) # type: ignore + f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) # type: ignore # BZ Compression elif compression == "bz2": From 601ab724f38753522490b7377add2aecd2b88080 Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 15:42:42 -0400 Subject: [PATCH 5/8] Move mypy ignore flags to correct line --- pandas/io/common.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 224cd6e4b50bc..955aaa4e07af8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -403,14 +403,20 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode, **compression_args) # type: ignore + f = gzip.open( + path_or_buf, mode, **compression_args # type: ignore + ) else: - f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) # type: ignore + f = gzip.GzipFile( + fileobj=path_or_buf, **compression_args # type: ignore + ) # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File(path_or_buf, mode, **compression_args) # type: ignore + f = bz2.BZ2File( + path_or_buf, mode, **compression_args # type: ignore + 
) else: f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore From 04f6608f68a40b95de1cd4135c7a7f82e1f25367 Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 17:45:15 -0400 Subject: [PATCH 6/8] Add comment linking type ignores to mypy issue --- pandas/io/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index 955aaa4e07af8..ff527de79c387 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -400,6 +400,9 @@ def get_handle( if compression: + # GH33398 the type ignores here seem related to mypy issue #5382; + # it may be possible to remove them once that is resolved. + # GZ Compression if compression == "gzip": if is_path: From d8bffbf28df1133e312a4e14d97b1f8568029230 Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 18:06:42 -0400 Subject: [PATCH 7/8] Update base IO documentation --- doc/source/user_guide/io.rst | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index f2152c43ceaba..b7a38971115f4 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -285,14 +285,18 @@ chunksize : int, default ``None`` Quoting, compression, and file format +++++++++++++++++++++++++++++++++++++ -compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'`` +compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. - Set to ``None`` for no decompression. + Set to ``None`` for no decompression. 
Can also be a dict with key ``'method'`` + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to + compression settings. As an example, the following could be passed for + faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. .. versionchanged:: 0.24.0 'infer' option added and set to default. + .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` @@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively. +The compression parameter can also be a ``dict`` in order to pass options to the +compression protocol. It must have a ``'method'`` key set to the name +of the compression protocol, which must be one of +{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to +the underlying compression library. + .. ipython:: python df = pd.DataFrame({ @@ -3383,6 +3393,15 @@ The default is to 'infer': rt = pd.read_pickle("s1.pkl.bz2") rt +Passing options to the compression protocol in order to speed up compression: + +.. ipython:: python + + df.to_pickle( + "data.pkl.gz", + compression={"method": "gzip", 'compresslevel': 1} + ) + .. ipython:: python :suppress: From 14ca83d9e9ebaa02b47437073e92559f7cbb5d87 Mon Sep 17 00:00:00 2001 From: Jesse Farnham Date: Wed, 8 Apr 2020 18:39:56 -0400 Subject: [PATCH 8/8] fix linting issue --- doc/source/user_guide/io.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b7a38971115f4..df6b44ac654ce 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3398,8 +3398,8 @@ Passing options to the compression protocol in order to speed up compression: .. 
ipython:: python df.to_pickle( - "data.pkl.gz", - compression={"method": "gzip", 'compresslevel': 1} + "data.pkl.gz", + compression={"method": "gzip", 'compresslevel': 1} ) .. ipython:: python