From da51207e3666e95d95100d47616ad17bcd7ae845 Mon Sep 17 00:00:00 2001 From: Jakub Filipek Date: Sun, 15 Dec 2024 00:53:07 -0800 Subject: [PATCH] feat: Add `bin.reinterpret` (#20263) Co-authored-by: ritchie --- .../binary/cast_binary_to_numerical.rs | 80 +++++++++++++ .../src/chunked_array/binary/mod.rs | 1 + .../src/chunked_array/binary/namespace.rs | 32 ++++++ crates/polars-plan/src/dsl/binary.rs | 9 ++ .../src/dsl/function_expr/binary.rs | 19 +++ crates/polars-python/src/expr/binary.rs | 24 ++++ .../source/reference/expressions/binary.rst | 1 + .../docs/source/reference/series/binary.rst | 1 + py-polars/polars/_typing.py | 1 + py-polars/polars/expr/binary.py | 51 ++++++++- py-polars/polars/series/binary.py | 41 ++++++- .../unit/operations/namespaces/test_binary.py | 108 ++++++++++++++++++ 12 files changed, 366 insertions(+), 2 deletions(-) create mode 100644 crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs diff --git a/crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs b/crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs new file mode 100644 index 000000000000..d3f76f6b8263 --- /dev/null +++ b/crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs @@ -0,0 +1,80 @@ +use arrow::array::{Array, BinaryViewArray, PrimitiveArray}; +use arrow::datatypes::ArrowDataType; +use arrow::types::NativeType; +use polars_error::PolarsResult; + +/// Trait for casting bytes to a primitive type +pub trait Cast { + fn cast_le(val: &[u8]) -> Option + where + Self: Sized; + fn cast_be(val: &[u8]) -> Option + where + Self: Sized; +} +macro_rules! impl_cast { + ($primitive_type:ident) => { + impl Cast for $primitive_type { + fn cast_le(val: &[u8]) -> Option { + Some($primitive_type::from_le_bytes(val.try_into().ok()?)) + } + + fn cast_be(val: &[u8]) -> Option { + Some($primitive_type::from_be_bytes(val.try_into().ok()?)) + } + } + }; +} + +impl_cast!(i8); +impl_cast!(i16); +impl_cast!(i32); +impl_cast!(i64); +impl_cast!(i128); +impl_cast!(u8); +impl_cast!(u16); +impl_cast!(u32); +impl_cast!(u64); +impl_cast!(u128); +impl_cast!(f32); +impl_cast!(f64); + +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. +pub(super) fn cast_binview_to_primitive( + from: &BinaryViewArray, + to: &ArrowDataType, + is_little_endian: bool, +) -> PrimitiveArray +where + T: Cast + NativeType, +{ + let iter = from.iter().map(|x| { + x.and_then::(|x| { + if is_little_endian { + T::cast_le(x) + } else { + T::cast_be(x) + } + }) + }); + + PrimitiveArray::::from_trusted_len_iter(iter).to(to.clone()) +} + +/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null. +pub(super) fn cast_binview_to_primitive_dyn( + from: &dyn Array, + to: &ArrowDataType, + is_little_endian: bool, +) -> PolarsResult> +where + T: Cast + NativeType, +{ + let from = from.as_any().downcast_ref().unwrap(); + + Ok(Box::new(cast_binview_to_primitive::( + from, + to, + is_little_endian, + ))) +} diff --git a/crates/polars-ops/src/chunked_array/binary/mod.rs b/crates/polars-ops/src/chunked_array/binary/mod.rs index fce3c77a5df7..df8847a1bc0c 100644 --- a/crates/polars-ops/src/chunked_array/binary/mod.rs +++ b/crates/polars-ops/src/chunked_array/binary/mod.rs @@ -1,3 +1,4 @@ +mod cast_binary_to_numerical; mod namespace; pub use namespace::*; diff --git a/crates/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs index b20f1d9e3e5a..3cd299892fa5 100644 --- a/crates/polars-ops/src/chunked_array/binary/namespace.rs +++ b/crates/polars-ops/src/chunked_array/binary/namespace.rs @@ -1,6 +1,7 @@ #[cfg(feature = "binary_encoding")] use std::borrow::Cow; +use arrow::with_match_primitive_type; #[cfg(feature = "binary_encoding")] use base64::engine::general_purpose; #[cfg(feature = "binary_encoding")] @@ -9,6 +10,7 @@ use memchr::memmem::find; use polars_compute::size::binary_size_bytes; use polars_core::prelude::arity::{broadcast_binary_elementwise_values, unary_elementwise_values}; +use super::cast_binary_to_numerical::cast_binview_to_primitive_dyn; use super::*; pub trait BinaryNameSpaceImpl: AsBinary { @@ -127,6 +129,36 @@ pub trait BinaryNameSpaceImpl: AsBinary { .unwrap() } } + + #[cfg(feature = "binary_encoding")] + #[allow(clippy::wrong_self_convention)] + fn from_buffer(&self, dtype: &DataType, is_little_endian: bool) -> PolarsResult { + let ca = self.as_binary(); + let arrow_type = dtype.to_arrow(CompatLevel::newest()); + + match arrow_type.to_physical_type() { + arrow::datatypes::PhysicalType::Primitive(ty) => { + with_match_primitive_type!(ty, |$T| { + unsafe { + Ok(Series::from_chunks_and_dtype_unchecked( + ca.name().clone(), + ca.chunks().iter().map(|chunk| { + cast_binview_to_primitive_dyn::<$T>( + &**chunk, + &arrow_type, + is_little_endian, + ) + }).collect::>>()?, + dtype + )) + } + }) + }, + _ => Err( + polars_err!(InvalidOperation:"unsupported data type in from_buffer. Only numerical types are allowed."), + ), + } + } } impl BinaryNameSpaceImpl for BinaryChunked {} diff --git a/crates/polars-plan/src/dsl/binary.rs b/crates/polars-plan/src/dsl/binary.rs index 9091b1777b65..659d498b4388 100644 --- a/crates/polars-plan/src/dsl/binary.rs +++ b/crates/polars-plan/src/dsl/binary.rs @@ -64,4 +64,13 @@ impl BinaryNameSpace { self.0 .map_private(FunctionExpr::BinaryExpr(BinaryFunction::Base64Encode)) } + + #[cfg(feature = "binary_encoding")] + pub fn from_buffer(self, to_type: DataType, is_little_endian: bool) -> Expr { + self.0 + .map_private(FunctionExpr::BinaryExpr(BinaryFunction::FromBuffer( + to_type, + is_little_endian, + ))) + } } diff --git a/crates/polars-plan/src/dsl/function_expr/binary.rs b/crates/polars-plan/src/dsl/function_expr/binary.rs index 88f3ad71b545..00855bd4e97b 100644 --- a/crates/polars-plan/src/dsl/function_expr/binary.rs +++ b/crates/polars-plan/src/dsl/function_expr/binary.rs @@ -19,6 +19,8 @@ pub enum BinaryFunction { #[cfg(feature = "binary_encoding")] Base64Encode, Size, + #[cfg(feature = "binary_encoding")] + FromBuffer(DataType, bool), } impl BinaryFunction { @@ -32,6 +34,8 @@ impl BinaryFunction { #[cfg(feature = "binary_encoding")] HexEncode | Base64Encode => mapper.with_dtype(DataType::String), Size => mapper.with_dtype(DataType::UInt32), + #[cfg(feature = "binary_encoding")] + FromBuffer(dtype, _) => mapper.with_dtype(dtype.clone()), } } } @@ -52,6 +56,8 @@ impl Display for BinaryFunction { #[cfg(feature = "binary_encoding")] Base64Encode => "base64_encode", Size => "size_bytes", + #[cfg(feature = "binary_encoding")] + FromBuffer(_, _) => "from_buffer", }; write!(f, "bin.{s}") } @@ -79,6 +85,8 @@ impl From for SpecialEq> { #[cfg(feature = "binary_encoding")] Base64Encode => map!(base64_encode), Size => map!(size_bytes), + #[cfg(feature = "binary_encoding")] + FromBuffer(dtype, is_little_endian) => map!(from_buffer, &dtype, is_little_endian), } } } @@ -141,6 +149,17 @@ pub(super) fn base64_encode(s: &Column) -> PolarsResult { Ok(ca.base64_encode().into()) } +#[cfg(feature = "binary_encoding")] +pub(super) fn from_buffer( + s: &Column, + dtype: &DataType, + is_little_endian: bool, +) -> PolarsResult { + let ca = s.binary()?; + ca.from_buffer(dtype, is_little_endian) + .map(|val| val.into()) +} + impl From for FunctionExpr { fn from(b: BinaryFunction) -> Self { FunctionExpr::BinaryExpr(b) diff --git a/crates/polars-python/src/expr/binary.rs b/crates/polars-python/src/expr/binary.rs index 7833c450af2a..681d05697133 100644 --- a/crates/polars-python/src/expr/binary.rs +++ b/crates/polars-python/src/expr/binary.rs @@ -1,5 +1,7 @@ +use polars::prelude::DataType; use pyo3::prelude::*; +use crate::prelude::Wrap; use crate::PyExpr; #[pymethods] @@ -40,6 +42,28 @@ impl PyExpr { self.inner.clone().binary().base64_encode().into() } + #[cfg(feature = "binary_encoding")] + #[allow(clippy::wrong_self_convention)] + fn from_buffer(&self, dtype: Wrap, kind: &str) -> PyResult { + use pyo3::exceptions::PyValueError; + + let is_little_endian = match kind.to_lowercase().as_str() { + "little" => true, + "big" => false, + _ => { + return Err(PyValueError::new_err(format!( + "Invalid endianness: {kind}. Valid values are \"little\" or \"big\"." + ))) + }, + }; + Ok(self + .inner + .clone() + .binary() + .from_buffer(dtype.0, is_little_endian) + .into()) + } + fn bin_size_bytes(&self) -> Self { self.inner.clone().binary().size_bytes().into() } diff --git a/py-polars/docs/source/reference/expressions/binary.rst b/py-polars/docs/source/reference/expressions/binary.rst index 53f380408267..ed13c8dd3596 100644 --- a/py-polars/docs/source/reference/expressions/binary.rst +++ b/py-polars/docs/source/reference/expressions/binary.rst @@ -13,5 +13,6 @@ The following methods are available under the `expr.bin` attribute. Expr.bin.decode Expr.bin.encode Expr.bin.ends_with + Expr.bin.reinterpret Expr.bin.size Expr.bin.starts_with diff --git a/py-polars/docs/source/reference/series/binary.rst b/py-polars/docs/source/reference/series/binary.rst index c1dab31448d2..25e622d420b2 100644 --- a/py-polars/docs/source/reference/series/binary.rst +++ b/py-polars/docs/source/reference/series/binary.rst @@ -13,5 +13,6 @@ The following methods are available under the `Series.bin` attribute. Series.bin.decode Series.bin.encode Series.bin.ends_with + Series.bin.reinterpret Series.bin.size Series.bin.starts_with diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index 92b4109f620b..ea6284cda9c9 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -125,6 +125,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"] Roll: TypeAlias = Literal["raise", "forward", "backward"] SerializationFormat: TypeAlias = Literal["binary", "json"] +Endianness: TypeAlias = Literal["little", "big"] SizeUnit: TypeAlias = Literal[ "b", "kb", diff --git a/py-polars/polars/expr/binary.py b/py-polars/polars/expr/binary.py index 62cf43180fd3..b2ef66f5b4dd 100644 --- a/py-polars/polars/expr/binary.py +++ b/py-polars/polars/expr/binary.py @@ -5,10 +5,17 @@ from polars._utils.parse import parse_into_expression from polars._utils.various import scale_bytes from polars._utils.wrap import wrap_expr +from polars.datatypes import parse_into_dtype if TYPE_CHECKING: from polars import Expr - from polars._typing import IntoExpr, SizeUnit, TransferEncoding + from polars._typing import ( + Endianness, + IntoExpr, + PolarsDataType, + SizeUnit, + TransferEncoding, + ) class ExprBinaryNameSpace: @@ -289,3 +296,45 @@ def size(self, unit: SizeUnit = "b") -> Expr: sz = wrap_expr(self._pyexpr.bin_size_bytes()) sz = scale_bytes(sz, unit) return sz + + def reinterpret( + self, *, dtype: PolarsDataType, endianness: Endianness = "little" + ) -> Expr: + r""" + Interpret a buffer as a numerical polars type. + + Parameters + ---------- + dtype : PolarsDataType + Which type to interpret binary column into. + endianness : {"big", "little"}, optional + Which endianness to use when interpreting bytes, by default "little". + + Returns + ------- + Expr + Expression of data type `dtype`. + Note that if binary array is too short value will be null. + If binary array is too long, remainder will be ignored. + + Examples + -------- + >>> df = pl.DataFrame({"data": [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"]}) + >>> df.with_columns( # doctest: +IGNORE_RESULT + ... casted=pl.col("data").bin.reinterpret( + ... dtype=pl.Int32, endianness="little" + ... ), + ... ) + shape: (2, 3) + ┌─────────────────────┬────────┐ + │ data ┆ caster │ + │ --- ┆ --- │ + │ binary ┆ i32 │ + ╞═════════════════════╪════════╡ + │ b"\x05\x00\x00\x00" ┆ 5 │ + │ b"\x10\x00\x01\x00" ┆ 65552 │ + └─────────────────────┴────────┘ + """ + dtype = parse_into_dtype(dtype) + + return wrap_expr(self._pyexpr.from_buffer(dtype, endianness)) diff --git a/py-polars/polars/series/binary.py b/py-polars/polars/series/binary.py index a1cdf5d8fb80..882fed59d998 100644 --- a/py-polars/polars/series/binary.py +++ b/py-polars/polars/series/binary.py @@ -6,7 +6,13 @@ if TYPE_CHECKING: from polars import Series - from polars._typing import IntoExpr, SizeUnit, TransferEncoding + from polars._typing import ( + Endianness, + IntoExpr, + PolarsDataType, + SizeUnit, + TransferEncoding, + ) from polars.polars import PySeries @@ -209,3 +215,36 @@ def size(self, unit: SizeUnit = "b") -> Series: 1.0 ] """ + + def reinterpret( + self, *, dtype: PolarsDataType, endianness: Endianness = "little" + ) -> Series: + r""" + Interpret a buffer as a numerical polars type. + + Parameters + ---------- + dtype : PolarsDataType + Which type to interpret binary column into. + endianness : {"big", "little"}, optional + Which endianness to use when interpreting bytes, by default "little". + + Returns + ------- + Series + Series of data type `dtype`. + Note that if binary array is too short value will be null. + If binary array is too long, remainder will be ignored. + + Examples + -------- + >>> s = pl.Series("data", [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"]) + >>> s.bin.reinterpret(dtype=pl.Int32, endianness="little") + shape: (2,) + Series: 'data' [i32] + [ + 5 + 65552 + ] + + """ diff --git a/py-polars/tests/unit/operations/namespaces/test_binary.py b/py-polars/tests/unit/operations/namespaces/test_binary.py index e15ca2010817..ab86b9b51c15 100644 --- a/py-polars/tests/unit/operations/namespaces/test_binary.py +++ b/py-polars/tests/unit/operations/namespaces/test_binary.py @@ -1,5 +1,7 @@ from __future__ import annotations +import random +import struct from typing import TYPE_CHECKING import pytest @@ -164,3 +166,109 @@ def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None: df["data"].bin.size(unit).item(), # series ): assert sz == expected + + +@pytest.mark.parametrize( + ("dtype", "type_size", "struct_type"), + [ + (pl.Int8, 1, "b"), + (pl.UInt8, 1, "B"), + (pl.Int16, 2, "h"), + (pl.UInt16, 2, "H"), + (pl.Int32, 4, "i"), + (pl.UInt32, 4, "I"), + (pl.Int64, 8, "q"), + (pl.UInt64, 8, "Q"), + (pl.Float32, 4, "f"), + (pl.Float64, 8, "d"), + ], +) +def test_reinterpret( + dtype: pl.DataType, + type_size: int, + struct_type: str, +) -> None: + # Make test reproducible + random.seed(42) + + byte_arr = [random.randbytes(type_size) for _ in range(3)] + df = pl.DataFrame({"x": byte_arr}) + + for endianness in ["little", "big"]: + # So that mypy doesn't complain + struct_endianness = "<" if endianness == "little" else ">" + expected = [ + struct.unpack_from(f"{struct_endianness}{struct_type}", elem_bytes)[0] + for elem_bytes in byte_arr + ] + expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype}) + + result = df.select( + pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness) # type: ignore[arg-type] + ) + + assert_frame_equal(result, expected_df) + + +@pytest.mark.parametrize( + ("dtype", "type_size"), + [ + (pl.Int128, 16), + ], +) +def test_reinterpret_int( + dtype: pl.DataType, + type_size: int, +) -> None: + # Function used for testing integers that `struct` or `numpy` + # doesn't support parsing from bytes. + # Rather than creating bytes directly, create integer and view it as bytes + is_signed = dtype.is_signed_integer() + + if is_signed: + min_val = -(2 ** (type_size - 1)) + max_val = 2 ** (type_size - 1) - 1 + else: + min_val = 0 + max_val = 2**type_size - 1 + + # Make test reproducible + random.seed(42) + + expected = [random.randint(min_val, max_val) for _ in range(3)] + expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype}) + + for endianness in ["little", "big"]: + byte_arr = [ + val.to_bytes(type_size, byteorder=endianness, signed=is_signed) # type: ignore[arg-type] + for val in expected + ] + df = pl.DataFrame({"x": byte_arr}) + + result = df.select( + pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness) # type: ignore[arg-type] + ) + + assert_frame_equal(result, expected_df) + + +def test_reinterpret_invalid() -> None: + # Fails because buffer has more than 4 bytes + df = pl.DataFrame({"x": [b"d3d3a"]}) + print(struct.unpack_from("