Skip to content

Commit

Permalink
feat: Add bin.reinterpret (pola-rs#20263)
Browse files Browse the repository at this point in the history
Co-authored-by: ritchie <[email protected]>
  • Loading branch information
balbok0 and ritchie46 authored Dec 15, 2024
1 parent 6d0f5df commit da51207
Show file tree
Hide file tree
Showing 12 changed files with 366 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
use arrow::array::{Array, BinaryViewArray, PrimitiveArray};
use arrow::datatypes::ArrowDataType;
use arrow::types::NativeType;
use polars_error::PolarsResult;

/// Trait for casting bytes to a primitive type
pub trait Cast {
fn cast_le(val: &[u8]) -> Option<Self>
where
Self: Sized;
fn cast_be(val: &[u8]) -> Option<Self>
where
Self: Sized;
}
macro_rules! impl_cast {
($primitive_type:ident) => {
impl Cast for $primitive_type {
fn cast_le(val: &[u8]) -> Option<Self> {
Some($primitive_type::from_le_bytes(val.try_into().ok()?))
}

fn cast_be(val: &[u8]) -> Option<Self> {
Some($primitive_type::from_be_bytes(val.try_into().ok()?))
}
}
};
}

impl_cast!(i8);
impl_cast!(i16);
impl_cast!(i32);
impl_cast!(i64);
impl_cast!(i128);
impl_cast!(u8);
impl_cast!(u16);
impl_cast!(u32);
impl_cast!(u64);
impl_cast!(u128);
impl_cast!(f32);
impl_cast!(f64);

/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
pub(super) fn cast_binview_to_primitive<T>(
from: &BinaryViewArray,
to: &ArrowDataType,
is_little_endian: bool,
) -> PrimitiveArray<T>
where
T: Cast + NativeType,
{
let iter = from.iter().map(|x| {
x.and_then::<T, _>(|x| {
if is_little_endian {
T::cast_le(x)
} else {
T::cast_be(x)
}
})
});

PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
}

/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
pub(super) fn cast_binview_to_primitive_dyn<T>(
from: &dyn Array,
to: &ArrowDataType,
is_little_endian: bool,
) -> PolarsResult<Box<dyn Array>>
where
T: Cast + NativeType,
{
let from = from.as_any().downcast_ref().unwrap();

Ok(Box::new(cast_binview_to_primitive::<T>(
from,
to,
is_little_endian,
)))
}
1 change: 1 addition & 0 deletions crates/polars-ops/src/chunked_array/binary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod cast_binary_to_numerical;
mod namespace;

pub use namespace::*;
Expand Down
32 changes: 32 additions & 0 deletions crates/polars-ops/src/chunked_array/binary/namespace.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#[cfg(feature = "binary_encoding")]
use std::borrow::Cow;

use arrow::with_match_primitive_type;
#[cfg(feature = "binary_encoding")]
use base64::engine::general_purpose;
#[cfg(feature = "binary_encoding")]
Expand All @@ -9,6 +10,7 @@ use memchr::memmem::find;
use polars_compute::size::binary_size_bytes;
use polars_core::prelude::arity::{broadcast_binary_elementwise_values, unary_elementwise_values};

use super::cast_binary_to_numerical::cast_binview_to_primitive_dyn;
use super::*;

pub trait BinaryNameSpaceImpl: AsBinary {
Expand Down Expand Up @@ -127,6 +129,36 @@ pub trait BinaryNameSpaceImpl: AsBinary {
.unwrap()
}
}

#[cfg(feature = "binary_encoding")]
#[allow(clippy::wrong_self_convention)]
fn from_buffer(&self, dtype: &DataType, is_little_endian: bool) -> PolarsResult<Series> {
let ca = self.as_binary();
let arrow_type = dtype.to_arrow(CompatLevel::newest());

match arrow_type.to_physical_type() {
arrow::datatypes::PhysicalType::Primitive(ty) => {
with_match_primitive_type!(ty, |$T| {
unsafe {
Ok(Series::from_chunks_and_dtype_unchecked(
ca.name().clone(),
ca.chunks().iter().map(|chunk| {
cast_binview_to_primitive_dyn::<$T>(
&**chunk,
&arrow_type,
is_little_endian,
)
}).collect::<PolarsResult<Vec<_>>>()?,
dtype
))
}
})
},
_ => Err(
polars_err!(InvalidOperation:"unsupported data type in from_buffer. Only numerical types are allowed."),
),
}
}
}

impl BinaryNameSpaceImpl for BinaryChunked {}
9 changes: 9 additions & 0 deletions crates/polars-plan/src/dsl/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,13 @@ impl BinaryNameSpace {
self.0
.map_private(FunctionExpr::BinaryExpr(BinaryFunction::Base64Encode))
}

#[cfg(feature = "binary_encoding")]
pub fn from_buffer(self, to_type: DataType, is_little_endian: bool) -> Expr {
self.0
.map_private(FunctionExpr::BinaryExpr(BinaryFunction::FromBuffer(
to_type,
is_little_endian,
)))
}
}
19 changes: 19 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ pub enum BinaryFunction {
#[cfg(feature = "binary_encoding")]
Base64Encode,
Size,
#[cfg(feature = "binary_encoding")]
FromBuffer(DataType, bool),
}

impl BinaryFunction {
Expand All @@ -32,6 +34,8 @@ impl BinaryFunction {
#[cfg(feature = "binary_encoding")]
HexEncode | Base64Encode => mapper.with_dtype(DataType::String),
Size => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "binary_encoding")]
FromBuffer(dtype, _) => mapper.with_dtype(dtype.clone()),
}
}
}
Expand All @@ -52,6 +56,8 @@ impl Display for BinaryFunction {
#[cfg(feature = "binary_encoding")]
Base64Encode => "base64_encode",
Size => "size_bytes",
#[cfg(feature = "binary_encoding")]
FromBuffer(_, _) => "from_buffer",
};
write!(f, "bin.{s}")
}
Expand Down Expand Up @@ -79,6 +85,8 @@ impl From<BinaryFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
#[cfg(feature = "binary_encoding")]
Base64Encode => map!(base64_encode),
Size => map!(size_bytes),
#[cfg(feature = "binary_encoding")]
FromBuffer(dtype, is_little_endian) => map!(from_buffer, &dtype, is_little_endian),
}
}
}
Expand Down Expand Up @@ -141,6 +149,17 @@ pub(super) fn base64_encode(s: &Column) -> PolarsResult<Column> {
Ok(ca.base64_encode().into())
}

#[cfg(feature = "binary_encoding")]
pub(super) fn from_buffer(
s: &Column,
dtype: &DataType,
is_little_endian: bool,
) -> PolarsResult<Column> {
let ca = s.binary()?;
ca.from_buffer(dtype, is_little_endian)
.map(|val| val.into())
}

impl From<BinaryFunction> for FunctionExpr {
fn from(b: BinaryFunction) -> Self {
FunctionExpr::BinaryExpr(b)
Expand Down
24 changes: 24 additions & 0 deletions crates/polars-python/src/expr/binary.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use polars::prelude::DataType;
use pyo3::prelude::*;

use crate::prelude::Wrap;
use crate::PyExpr;

#[pymethods]
Expand Down Expand Up @@ -40,6 +42,28 @@ impl PyExpr {
self.inner.clone().binary().base64_encode().into()
}

#[cfg(feature = "binary_encoding")]
#[allow(clippy::wrong_self_convention)]
fn from_buffer(&self, dtype: Wrap<DataType>, kind: &str) -> PyResult<Self> {
use pyo3::exceptions::PyValueError;

let is_little_endian = match kind.to_lowercase().as_str() {
"little" => true,
"big" => false,
_ => {
return Err(PyValueError::new_err(format!(
"Invalid endianness: {kind}. Valid values are \"little\" or \"big\"."
)))
},
};
Ok(self
.inner
.clone()
.binary()
.from_buffer(dtype.0, is_little_endian)
.into())
}

fn bin_size_bytes(&self) -> Self {
self.inner.clone().binary().size_bytes().into()
}
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ The following methods are available under the `expr.bin` attribute.
Expr.bin.decode
Expr.bin.encode
Expr.bin.ends_with
Expr.bin.reinterpret
Expr.bin.size
Expr.bin.starts_with
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ The following methods are available under the `Series.bin` attribute.
Series.bin.decode
Series.bin.encode
Series.bin.ends_with
Series.bin.reinterpret
Series.bin.size
Series.bin.starts_with
1 change: 1 addition & 0 deletions py-polars/polars/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"]
Roll: TypeAlias = Literal["raise", "forward", "backward"]
SerializationFormat: TypeAlias = Literal["binary", "json"]
Endianness: TypeAlias = Literal["little", "big"]
SizeUnit: TypeAlias = Literal[
"b",
"kb",
Expand Down
51 changes: 50 additions & 1 deletion py-polars/polars/expr/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,17 @@
from polars._utils.parse import parse_into_expression
from polars._utils.various import scale_bytes
from polars._utils.wrap import wrap_expr
from polars.datatypes import parse_into_dtype

if TYPE_CHECKING:
from polars import Expr
from polars._typing import IntoExpr, SizeUnit, TransferEncoding
from polars._typing import (
Endianness,
IntoExpr,
PolarsDataType,
SizeUnit,
TransferEncoding,
)


class ExprBinaryNameSpace:
Expand Down Expand Up @@ -289,3 +296,45 @@ def size(self, unit: SizeUnit = "b") -> Expr:
sz = wrap_expr(self._pyexpr.bin_size_bytes())
sz = scale_bytes(sz, unit)
return sz

def reinterpret(
self, *, dtype: PolarsDataType, endianness: Endianness = "little"
) -> Expr:
r"""
Interpret a buffer as a numerical polars type.
Parameters
----------
dtype : PolarsDataType
Which type to interpret binary column into.
endianness : {"big", "little"}, optional
Which endianness to use when interpreting bytes, by default "little".
Returns
-------
Expr
Expression of data type `dtype`.
Note that if binary array is too short value will be null.
If binary array is too long, remainder will be ignored.
Examples
--------
>>> df = pl.DataFrame({"data": [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"]})
>>> df.with_columns( # doctest: +IGNORE_RESULT
... casted=pl.col("data").bin.reinterpret(
... dtype=pl.Int32, endianness="little"
... ),
... )
shape: (2, 3)
┌─────────────────────┬────────┐
│ data ┆ caster │
│ --- ┆ --- │
│ binary ┆ i32 │
╞═════════════════════╪════════╡
│ b"\x05\x00\x00\x00" ┆ 5 │
│ b"\x10\x00\x01\x00" ┆ 65552 │
└─────────────────────┴────────┘
"""
dtype = parse_into_dtype(dtype)

return wrap_expr(self._pyexpr.from_buffer(dtype, endianness))
41 changes: 40 additions & 1 deletion py-polars/polars/series/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,13 @@

if TYPE_CHECKING:
from polars import Series
from polars._typing import IntoExpr, SizeUnit, TransferEncoding
from polars._typing import (
Endianness,
IntoExpr,
PolarsDataType,
SizeUnit,
TransferEncoding,
)
from polars.polars import PySeries


Expand Down Expand Up @@ -209,3 +215,36 @@ def size(self, unit: SizeUnit = "b") -> Series:
1.0
]
"""

def reinterpret(
self, *, dtype: PolarsDataType, endianness: Endianness = "little"
) -> Series:
r"""
Interpret a buffer as a numerical polars type.
Parameters
----------
dtype : PolarsDataType
Which type to interpret binary column into.
endianness : {"big", "little"}, optional
Which endianness to use when interpreting bytes, by default "little".
Returns
-------
Series
Series of data type `dtype`.
Note that if binary array is too short value will be null.
If binary array is too long, remainder will be ignored.
Examples
--------
>>> s = pl.Series("data", [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"])
>>> s.bin.reinterpret(dtype=pl.Int32, endianness="little")
shape: (2,)
Series: 'data' [i32]
[
5
65552
]
"""
Loading

0 comments on commit da51207

Please sign in to comment.