From 3c5998e723ade572f62dbe66215c6d8e5fefc5e1 Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 10:41:22 -0400
Subject: [PATCH 1/8] ENH: Support passing compression args to gzip and bz2

This commit closes GH#33196 but takes a more generic approach than the suggested
solution. Instead of providing a 'fast' kwarg or global compression level
setting, this commit extends the ability to pass compression settings as a
dict to the gzip and bz2 compression methods. In this way, if the user
wants faster compression, they can pass
compression={'method': 'gzip', 'compresslevel': 1} rather than
just compression='gzip'.

Note: For the API to be consistent when passing paths vs. file-like objects,
gzip.GzipFile and gzip.open() must accept the same kwargs.
---
 doc/source/whatsnew/v1.1.0.rst      |  6 ++++++
 pandas/core/generic.py              |  9 ++++++++-
 pandas/io/common.py                 | 18 ++++++++++++------
 pandas/tests/io/test_compression.py | 23 +++++++++++++++++++++++
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index fc5893e401836..f7ed07848bb84 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -91,6 +91,12 @@ Other enhancements
 - The :meth:`DataFrame.to_feather` method now supports additional keyword
   arguments (e.g. to set the compression) that are added in pyarrow 0.17
   (:issue:`33422`).
+- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`,
+  and :meth:`DataFrame.to_json` now support passing a dict of
+  compression arguments when using the ``gzip`` and ``bz2`` protocols.
+  This can be used to set a custom compression level, e.g.,
+  ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})``
+  (:issue:`33196`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index eb6554bf2260c..2adfd2bb9a7b3 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3096,7 +3096,8 @@ def to_csv(
             compression mode is 'infer' and `path_or_buf` is path-like, then
             detect compression mode from the following extensions: '.gz',
             '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
-            and mode is 'zip' or inferred as 'zip', other entries passed as
+            and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as
+            one of the above, other entries passed as
             additional compression options.
 
             .. versionchanged:: 1.0.0
@@ -3105,6 +3106,12 @@ def to_csv(
                and other entries as additional compression options if
                compression mode is 'zip'.
 
+            .. versionchanged:: 1.1.0
+
+               Passing compression options as keys in dict is
+               supported for compression modes 'gzip' and 'bz2'
+               as well as 'zip'.
+
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
             then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 0fce8f5382686..449321831cbef 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -351,8 +351,9 @@ def get_handle(
         'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
         and `filepath_or_buffer` is path-like, then detect compression from
         the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
-        no compression). If dict and compression mode is 'zip' or inferred as
-        'zip', other entries passed as additional compression options.
+        no compression). If dict and compression mode is one of
+        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
+        other entries passed as additional compression options.
 
         .. versionchanged:: 1.0.0
 
@@ -360,6 +361,11 @@ def get_handle(
            and other keys as compression options if compression
            mode is 'zip'.
 
+        .. versionchanged:: 1.1.0
+
+           Passing compression options as keys in dict is now
+           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
+
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -397,16 +403,16 @@ def get_handle(
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(path_or_buf, mode)
+                f = gzip.open(path_or_buf, mode, **compression_args)
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf)
+                f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(path_or_buf, mode)
+                f = bz2.BZ2File(path_or_buf, mode, **compression_args)
             else:
-                f = bz2.BZ2File(path_or_buf)
+                f = bz2.BZ2File(path_or_buf, **compression_args)
 
         # ZIP Compression
         elif compression == "zip":
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 841241d5124e0..708d7bdaf00ad 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -143,3 +143,26 @@ def test_with_missing_lzma_runtime():
         """
     )
     subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_gzip_compression_level_path(obj, method):
+    """GH#33398 Ideally this test should be repeated for bz2 as well,
+    but that is not practical because a file size of >100k is needed to see any
+    size difference between bz2 compression settings."""
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(path, compression="gzip")
+        compressed_size_default = os.path.getsize(path)
+        getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
+        compressed_size_fast = os.path.getsize(path)
+        assert compressed_size_default < compressed_size_fast

From 28eecf262586df6519b701a8ccf9db6a811f23fd Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 12:55:03 -0400
Subject: [PATCH 2/8] Add test for bz2; mypy ignore non-typesafe use of kwargs

---
 pandas/io/common.py                 | 12 ++++++++----
 pandas/tests/io/test_compression.py | 30 ++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 449321831cbef..30a1d5b6ea9c3 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -403,16 +403,20 @@ def get_handle(
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(path_or_buf, mode, **compression_args)
+                f = gzip.open(
+                    path_or_buf, mode, **compression_args)  # type: ignore
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
+                f = gzip.GzipFile(
+                    fileobj=path_or_buf, **compression_args)  # type: ignore
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(path_or_buf, mode, **compression_args)
+                f = bz2.BZ2File(
+                    path_or_buf, mode, **compression_args)  # type: ignore
             else:
-                f = bz2.BZ2File(path_or_buf, **compression_args)
+                f = bz2.BZ2File(
+                    path_or_buf, **compression_args)  # type: ignore
 
         # ZIP Compression
         elif compression == "zip":
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 708d7bdaf00ad..609a898d1638c 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -156,13 +156,33 @@ def test_with_missing_lzma_runtime():
     ],
 )
 @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
-def test_gzip_compression_level_path(obj, method):
-    """GH#33398 Ideally this test should be repeated for bz2 as well,
-    but that is not practical because a file size of >100k is needed to see any
-    size difference between bz2 compression settings."""
+def test_gzip_compression_level(obj, method):
+    #GH33196
     with tm.ensure_clean() as path:
         getattr(obj, method)(path, compression="gzip")
         compressed_size_default = os.path.getsize(path)
-        getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
+        getattr(obj, method)(
+            path, compression={ "method": "gzip", "compresslevel": 1})
         compressed_size_fast = os.path.getsize(path)
         assert compressed_size_default < compressed_size_fast
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_bzip_compression_level(obj, method):
+    """GH33196 bzip needs file size > 100k to show a size difference between
+    compression levels, so here we just check if the call works when
+    compression is passed as a dict.
+    """
+    with tm.ensure_clean() as path:
+        getattr(obj, method)(
+            path, compression={ "method": "bz2", "compresslevel": 1})
\ No newline at end of file

From fef23b9664cc2797d32e1887f0e46e84d89ac2be Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 12:56:03 -0400
Subject: [PATCH 3/8] black formatting

---
 pandas/io/common.py                 | 12 +++++-------
 pandas/tests/io/test_compression.py |  8 +++-----
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 30a1d5b6ea9c3..8a3aafee70475 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -403,20 +403,18 @@ def get_handle(
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(
-                    path_or_buf, mode, **compression_args)  # type: ignore
+                f = gzip.open(path_or_buf, mode, **compression_args)  # type: ignore
             else:
                 f = gzip.GzipFile(
-                    fileobj=path_or_buf, **compression_args)  # type: ignore
+                    fileobj=path_or_buf, **compression_args
+                )  # type: ignore
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(
-                    path_or_buf, mode, **compression_args)  # type: ignore
+                f = bz2.BZ2File(path_or_buf, mode, **compression_args)  # type: ignore
             else:
-                f = bz2.BZ2File(
-                    path_or_buf, **compression_args)  # type: ignore
+                f = bz2.BZ2File(path_or_buf, **compression_args)  # type: ignore
 
         # ZIP Compression
         elif compression == "zip":
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 609a898d1638c..59c9bd0a36d3d 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -157,12 +157,11 @@ def test_with_missing_lzma_runtime():
 )
 @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
 def test_gzip_compression_level(obj, method):
-    #GH33196
+    # GH33196
     with tm.ensure_clean() as path:
         getattr(obj, method)(path, compression="gzip")
         compressed_size_default = os.path.getsize(path)
-        getattr(obj, method)(
-            path, compression={ "method": "gzip", "compresslevel": 1})
+        getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
         compressed_size_fast = os.path.getsize(path)
         assert compressed_size_default < compressed_size_fast
 
@@ -184,5 +183,4 @@ def test_bzip_compression_level(obj, method):
     compression is passed as a dict.
     """
     with tm.ensure_clean() as path:
-        getattr(obj, method)(
-            path, compression={ "method": "bz2", "compresslevel": 1})
\ No newline at end of file
+        getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})

From 1ec1f7d2693c218800d4c9347cf64def69aed907 Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 14:56:17 -0400
Subject: [PATCH 4/8] Move mypy ignore flag to same line

---
 pandas/io/common.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 8a3aafee70475..224cd6e4b50bc 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -405,9 +405,7 @@ def get_handle(
             if is_path:
                 f = gzip.open(path_or_buf, mode, **compression_args)  # type: ignore
             else:
-                f = gzip.GzipFile(
-                    fileobj=path_or_buf, **compression_args
-                )  # type: ignore
+                f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)  # type: ignore
 
         # BZ Compression
         elif compression == "bz2":

From 601ab724f38753522490b7377add2aecd2b88080 Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 15:42:42 -0400
Subject: [PATCH 5/8] Move mypy ignore flags to correct line

---
 pandas/io/common.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 224cd6e4b50bc..955aaa4e07af8 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -403,14 +403,20 @@ def get_handle(
         # GZ Compression
         if compression == "gzip":
             if is_path:
-                f = gzip.open(path_or_buf, mode, **compression_args)  # type: ignore
+                f = gzip.open(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
-                f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)  # type: ignore
+                f = gzip.GzipFile(
+                    fileobj=path_or_buf, **compression_args  # type: ignore
+                )
 
         # BZ Compression
         elif compression == "bz2":
             if is_path:
-                f = bz2.BZ2File(path_or_buf, mode, **compression_args)  # type: ignore
+                f = bz2.BZ2File(
+                    path_or_buf, mode, **compression_args  # type: ignore
+                )
             else:
                 f = bz2.BZ2File(path_or_buf, **compression_args)  # type: ignore
 

From 04f6608f68a40b95de1cd4135c7a7f82e1f25367 Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 17:45:15 -0400
Subject: [PATCH 6/8] Add comment linking type ignores to mypy issue

---
 pandas/io/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 955aaa4e07af8..ff527de79c387 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -400,6 +400,9 @@ def get_handle(
 
     if compression:
 
+        # GH33398 the type ignores here seem related to mypy issue #5382;
+        # it may be possible to remove them once that is resolved.
+
         # GZ Compression
         if compression == "gzip":
             if is_path:

From d8bffbf28df1133e312a4e14d97b1f8568029230 Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 18:06:42 -0400
Subject: [PATCH 7/8] Update base IO documentation

---
 doc/source/user_guide/io.rst | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f2152c43ceaba..b7a38971115f4 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -285,14 +285,18 @@ chunksize : int, default ``None``
 Quoting, compression, and file format
 +++++++++++++++++++++++++++++++++++++
 
-compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``
+compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
   For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
   bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
   '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
   the ZIP file must contain only one data file to be read in.
-  Set to ``None`` for no decompression.
+  Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
+  set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
+  compression settings. As an example, the following could be passed for
+  faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
 
   .. versionchanged:: 0.24.0 'infer' option added and set to default.
+  .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
 thousands : str, default ``None``
   Thousands separator.
 decimal : str, default ``'.'``
@@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e
 If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
 ``'.xz'``, respectively.
 
+The compression parameter can also be a ``dict`` in order to pass options to the
+compression protocol. It must have a ``'method'`` key set to the name
+of the compression protocol, which must be one of
+{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
+the underlying compression library.
+
 .. ipython:: python
 
    df = pd.DataFrame({
@@ -3383,6 +3393,15 @@ The default is to 'infer':
    rt = pd.read_pickle("s1.pkl.bz2")
    rt
 
+Passing options to the compression protocol in order to speed up compression:
+
+.. ipython:: python
+
+   df.to_pickle(
+      "data.pkl.gz",
+      compression={"method": "gzip", 'compresslevel': 1}
+   )
+
 .. ipython:: python
    :suppress:
 

From 14ca83d9e9ebaa02b47437073e92559f7cbb5d87 Mon Sep 17 00:00:00 2001
From: Jesse Farnham <jessefarnham1@gmail.com>
Date: Wed, 8 Apr 2020 18:39:56 -0400
Subject: [PATCH 8/8] fix linting issue

---
 doc/source/user_guide/io.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index b7a38971115f4..df6b44ac654ce 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3398,8 +3398,8 @@ Passing options to the compression protocol in order to speed up compression:
 .. ipython:: python
 
    df.to_pickle(
-      "data.pkl.gz",
-      compression={"method": "gzip", 'compresslevel': 1}
+       "data.pkl.gz",
+       compression={"method": "gzip", 'compresslevel': 1}
    )
 
 .. ipython:: python