From a5043e710939e7691bdd57087bf475df3bc0aa48 Mon Sep 17 00:00:00 2001
From: mwish <maplewish117@gmail.com>
Date: Tue, 17 Oct 2023 21:21:01 +0800
Subject: [PATCH] GH-37312: [Python][Docs] Update Python docstrings to reflect
 new parquet encoding option (#38070)

### Rationale for this change

Since Parquet C++ now implements all encodings, we can document them in the Python docs.

### What changes are included in this PR?

Document the supported encodings in the Python docstrings.

### Are these changes tested?

No

### Are there any user-facing changes?

No

* Closes: #37312

Lead-authored-by: mwish <maplewish117@gmail.com>
Co-authored-by: mwish <1506118561@qq.com>
Co-authored-by: Rok Mihevc <rok@mihevc.org>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
---
 python/pyarrow/parquet/core.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index a3e5ef76c99b6..51ad955d19f78 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -767,13 +767,16 @@ def _sanitize_table(table, new_schema, flavor):
     Other features such as compression algorithms or the new serialized
     data page format must be enabled separately (see 'compression' and
     'data_page_version').
-use_dictionary : bool or list
+use_dictionary : bool or list, default True
     Specify if we should use dictionary encoding in general or only for
     some columns.
-compression : str or dict
+    When encoding the column, if the dictionary size is too large, the
+    column will fall back to ``PLAIN`` encoding. In particular, the
+    ``BOOLEAN`` type doesn't support dictionary encoding.
+compression : str or dict, default 'snappy'
     Specify the compression codec, either on a general basis or per-column.
     Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
-write_statistics : bool or list
+write_statistics : bool or list, default True
     Specify if we should write statistics in general (default is True) or only
     for some columns.
 use_deprecated_int96_timestamps : bool, default None
@@ -821,7 +824,10 @@ def _sanitize_table(table, new_schema, flavor):
     and should be combined with a compression codec.
 column_encoding : string or dict, default None
     Specify the encoding scheme on a per column basis.
-    Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT'}.
+    Can only be used when ``use_dictionary`` is set to False, and
+    cannot be used in combination with ``use_byte_stream_split``.
+    Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT',
+    'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}.
     Certain encodings are only compatible with certain data types.
     Please refer to the encodings section of `Reading and writing Parquet
     files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_.