From a5043e710939e7691bdd57087bf475df3bc0aa48 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 17 Oct 2023 21:21:01 +0800 Subject: [PATCH] GH-37312: [Python][Docs] Update Python docstrings to reflect new parquet encoding option (#38070) ### Rationale for this change Since parquet C++ has completed all encodings, we can publish them in the Python docs. ### What changes are included in this PR? Add the encodings to the documentation. ### Are these changes tested? No ### Are there any user-facing changes? No * Closes: #37312 Lead-authored-by: mwish Co-authored-by: mwish <1506118561@qq.com> Co-authored-by: Rok Mihevc Signed-off-by: Joris Van den Bossche --- python/pyarrow/parquet/core.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index a3e5ef76c99b6..51ad955d19f78 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -767,13 +767,16 @@ def _sanitize_table(table, new_schema, flavor): Other features such as compression algorithms or the new serialized data page format must be enabled separately (see 'compression' and 'data_page_version'). -use_dictionary : bool or list +use_dictionary : bool or list, default True Specify if we should use dictionary encoding in general or only for some columns. -compression : str or dict + When encoding the column, if the dictionary size is too large, the + column will fall back to ``PLAIN`` encoding. Specifically, ``BOOLEAN`` type + doesn't support dictionary encoding. +compression : str or dict, default 'snappy' Specify the compression codec, either on a general basis or per-column. Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. -write_statistics : bool or list +write_statistics : bool or list, default True Specify if we should write statistics in general (default is True) or only for some columns. 
use_deprecated_int96_timestamps : bool, default None @@ -821,7 +824,10 @@ def _sanitize_table(table, new_schema, flavor): and should be combined with a compression codec. column_encoding : string or dict, default None Specify the encoding scheme on a per column basis. - Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT'}. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. Certain encodings are only compatible with certain data types. Please refer to the encodings section of `Reading and writing Parquet files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_.