From da51207e3666e95d95100d47616ad17bcd7ae845 Mon Sep 17 00:00:00 2001
From: Jakub Filipek <jfilipek@proton.me>
Date: Sun, 15 Dec 2024 00:53:07 -0800
Subject: [PATCH] feat: Add `bin.reinterpret` (#20263)

Co-authored-by: ritchie <ritchie46@gmail.com>
---
 .../binary/cast_binary_to_numerical.rs        |  80 +++++++++++++
 .../src/chunked_array/binary/mod.rs           |   1 +
 .../src/chunked_array/binary/namespace.rs     |  32 ++++++
 crates/polars-plan/src/dsl/binary.rs          |   9 ++
 .../src/dsl/function_expr/binary.rs           |  19 +++
 crates/polars-python/src/expr/binary.rs       |  24 ++++
 .../source/reference/expressions/binary.rst   |   1 +
 .../docs/source/reference/series/binary.rst   |   1 +
 py-polars/polars/_typing.py                   |   1 +
 py-polars/polars/expr/binary.py               |  51 ++++++++-
 py-polars/polars/series/binary.py             |  41 ++++++-
 .../unit/operations/namespaces/test_binary.py | 108 ++++++++++++++++++
 12 files changed, 366 insertions(+), 2 deletions(-)
 create mode 100644 crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs
diff --git a/crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs b/crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs
new file mode 100644
index 000000000000..d3f76f6b8263
--- /dev/null
+++ b/crates/polars-ops/src/chunked_array/binary/cast_binary_to_numerical.rs
@@ -0,0 +1,80 @@
+use arrow::array::{Array, BinaryViewArray, PrimitiveArray};
+use arrow::datatypes::ArrowDataType;
+use arrow::types::NativeType;
+use polars_error::PolarsResult;
+
+/// Trait for casting bytes to a primitive type
+pub trait Cast {
+    fn cast_le(val: &[u8]) -> Option<Self>
+    where
+        Self: Sized;
+    fn cast_be(val: &[u8]) -> Option<Self>
+    where
+        Self: Sized;
+}
+macro_rules! impl_cast {
+    ($primitive_type:ident) => {
+        impl Cast for $primitive_type {
+            fn cast_le(val: &[u8]) -> Option<Self> {
+                Some($primitive_type::from_le_bytes(val.try_into().ok()?))
+            }
+
+            fn cast_be(val: &[u8]) -> Option<Self> {
+                Some($primitive_type::from_be_bytes(val.try_into().ok()?))
+            }
+        }
+    };
+}
+
+impl_cast!(i8);
+impl_cast!(i16);
+impl_cast!(i32);
+impl_cast!(i64);
+impl_cast!(i128);
+impl_cast!(u8);
+impl_cast!(u16);
+impl_cast!(u32);
+impl_cast!(u64);
+impl_cast!(u128);
+impl_cast!(f32);
+impl_cast!(f64);
+
+/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
+pub(super) fn cast_binview_to_primitive<T>(
+    from: &BinaryViewArray,
+    to: &ArrowDataType,
+    is_little_endian: bool,
+) -> PrimitiveArray<T>
+where
+    T: Cast + NativeType,
+{
+    let iter = from.iter().map(|x| {
+        x.and_then::<T, _>(|x| {
+            if is_little_endian {
+                T::cast_le(x)
+            } else {
+                T::cast_be(x)
+            }
+        })
+    });
+
+    PrimitiveArray::<T>::from_trusted_len_iter(iter).to(to.clone())
+}
+
+/// Casts a [`BinaryArray`] to a [`PrimitiveArray`], making any uncastable value a Null.
+pub(super) fn cast_binview_to_primitive_dyn<T>(
+    from: &dyn Array,
+    to: &ArrowDataType,
+    is_little_endian: bool,
+) -> PolarsResult<Box<dyn Array>>
+where
+    T: Cast + NativeType,
+{
+    let from = from.as_any().downcast_ref().unwrap();
+
+    Ok(Box::new(cast_binview_to_primitive::<T>(
+        from,
+        to,
+        is_little_endian,
+    )))
+}
diff --git a/crates/polars-ops/src/chunked_array/binary/mod.rs b/crates/polars-ops/src/chunked_array/binary/mod.rs
index fce3c77a5df7..df8847a1bc0c 100644
--- a/crates/polars-ops/src/chunked_array/binary/mod.rs
+++ b/crates/polars-ops/src/chunked_array/binary/mod.rs
@@ -1,3 +1,4 @@
+mod cast_binary_to_numerical;
 mod namespace;
 
 pub use namespace::*;
diff --git a/crates/polars-ops/src/chunked_array/binary/namespace.rs b/crates/polars-ops/src/chunked_array/binary/namespace.rs
index b20f1d9e3e5a..3cd299892fa5 100644
--- a/crates/polars-ops/src/chunked_array/binary/namespace.rs
+++ b/crates/polars-ops/src/chunked_array/binary/namespace.rs
@@ -1,6 +1,7 @@
 #[cfg(feature = "binary_encoding")]
 use std::borrow::Cow;
 
+use arrow::with_match_primitive_type;
 #[cfg(feature = "binary_encoding")]
 use base64::engine::general_purpose;
 #[cfg(feature = "binary_encoding")]
@@ -9,6 +10,7 @@ use memchr::memmem::find;
 use polars_compute::size::binary_size_bytes;
 use polars_core::prelude::arity::{broadcast_binary_elementwise_values, unary_elementwise_values};
 
+use super::cast_binary_to_numerical::cast_binview_to_primitive_dyn;
 use super::*;
 
 pub trait BinaryNameSpaceImpl: AsBinary {
@@ -127,6 +129,36 @@ pub trait BinaryNameSpaceImpl: AsBinary {
                 .unwrap()
         }
     }
+
+    #[cfg(feature = "binary_encoding")]
+    #[allow(clippy::wrong_self_convention)]
+    fn from_buffer(&self, dtype: &DataType, is_little_endian: bool) -> PolarsResult<Series> {
+        let ca = self.as_binary();
+        let arrow_type = dtype.to_arrow(CompatLevel::newest());
+
+        match arrow_type.to_physical_type() {
+            arrow::datatypes::PhysicalType::Primitive(ty) => {
+                with_match_primitive_type!(ty, |$T| {
+                    unsafe {
+                        Ok(Series::from_chunks_and_dtype_unchecked(
+                            ca.name().clone(),
+                            ca.chunks().iter().map(|chunk| {
+                                cast_binview_to_primitive_dyn::<$T>(
+                                    &**chunk,
+                                    &arrow_type,
+                                    is_little_endian,
+                                )
+                            }).collect::<PolarsResult<Vec<_>>>()?,
+                            dtype
+                        ))
+                    }
+                })
+            },
+            _ => Err(
+                polars_err!(InvalidOperation:"unsupported data type in from_buffer. Only numerical types are allowed."),
+            ),
+        }
+    }
 }
 
 impl BinaryNameSpaceImpl for BinaryChunked {}
diff --git a/crates/polars-plan/src/dsl/binary.rs b/crates/polars-plan/src/dsl/binary.rs
index 9091b1777b65..659d498b4388 100644
--- a/crates/polars-plan/src/dsl/binary.rs
+++ b/crates/polars-plan/src/dsl/binary.rs
@@ -64,4 +64,13 @@ impl BinaryNameSpace {
         self.0
             .map_private(FunctionExpr::BinaryExpr(BinaryFunction::Base64Encode))
     }
+
+    #[cfg(feature = "binary_encoding")]
+    pub fn from_buffer(self, to_type: DataType, is_little_endian: bool) -> Expr {
+        self.0
+            .map_private(FunctionExpr::BinaryExpr(BinaryFunction::FromBuffer(
+                to_type,
+                is_little_endian,
+            )))
+    }
 }
diff --git a/crates/polars-plan/src/dsl/function_expr/binary.rs b/crates/polars-plan/src/dsl/function_expr/binary.rs
index 88f3ad71b545..00855bd4e97b 100644
--- a/crates/polars-plan/src/dsl/function_expr/binary.rs
+++ b/crates/polars-plan/src/dsl/function_expr/binary.rs
@@ -19,6 +19,8 @@ pub enum BinaryFunction {
     #[cfg(feature = "binary_encoding")]
     Base64Encode,
     Size,
+    #[cfg(feature = "binary_encoding")]
+    FromBuffer(DataType, bool),
 }
 
 impl BinaryFunction {
@@ -32,6 +34,8 @@ impl BinaryFunction {
             #[cfg(feature = "binary_encoding")]
             HexEncode | Base64Encode => mapper.with_dtype(DataType::String),
             Size => mapper.with_dtype(DataType::UInt32),
+            #[cfg(feature = "binary_encoding")]
+            FromBuffer(dtype, _) => mapper.with_dtype(dtype.clone()),
         }
     }
 }
@@ -52,6 +56,8 @@ impl Display for BinaryFunction {
             #[cfg(feature = "binary_encoding")]
             Base64Encode => "base64_encode",
             Size => "size_bytes",
+            #[cfg(feature = "binary_encoding")]
+            FromBuffer(_, _) => "from_buffer",
         };
         write!(f, "bin.{s}")
     }
@@ -79,6 +85,8 @@ impl From<BinaryFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
             #[cfg(feature = "binary_encoding")]
             Base64Encode => map!(base64_encode),
             Size => map!(size_bytes),
+            #[cfg(feature = "binary_encoding")]
+            FromBuffer(dtype, is_little_endian) => map!(from_buffer, &dtype, is_little_endian),
         }
     }
 }
@@ -141,6 +149,17 @@ pub(super) fn base64_encode(s: &Column) -> PolarsResult<Column> {
     Ok(ca.base64_encode().into())
 }
 
+#[cfg(feature = "binary_encoding")]
+pub(super) fn from_buffer(
+    s: &Column,
+    dtype: &DataType,
+    is_little_endian: bool,
+) -> PolarsResult<Column> {
+    let ca = s.binary()?;
+    ca.from_buffer(dtype, is_little_endian)
+        .map(|val| val.into())
+}
+
 impl From<BinaryFunction> for FunctionExpr {
     fn from(b: BinaryFunction) -> Self {
         FunctionExpr::BinaryExpr(b)
diff --git a/crates/polars-python/src/expr/binary.rs b/crates/polars-python/src/expr/binary.rs
index 7833c450af2a..681d05697133 100644
--- a/crates/polars-python/src/expr/binary.rs
+++ b/crates/polars-python/src/expr/binary.rs
@@ -1,5 +1,7 @@
+use polars::prelude::DataType;
 use pyo3::prelude::*;
 
+use crate::prelude::Wrap;
 use crate::PyExpr;
 
 #[pymethods]
@@ -40,6 +42,28 @@ impl PyExpr {
         self.inner.clone().binary().base64_encode().into()
     }
 
+    #[cfg(feature = "binary_encoding")]
+    #[allow(clippy::wrong_self_convention)]
+    fn from_buffer(&self, dtype: Wrap<DataType>, kind: &str) -> PyResult<Self> {
+        use pyo3::exceptions::PyValueError;
+
+        let is_little_endian = match kind.to_lowercase().as_str() {
+            "little" => true,
+            "big" => false,
+            _ => {
+                return Err(PyValueError::new_err(format!(
+                    "Invalid endianness: {kind}. Valid values are \"little\" or \"big\"."
+                )))
+            },
+        };
+        Ok(self
+            .inner
+            .clone()
+            .binary()
+            .from_buffer(dtype.0, is_little_endian)
+            .into())
+    }
+
     fn bin_size_bytes(&self) -> Self {
         self.inner.clone().binary().size_bytes().into()
     }
diff --git a/py-polars/docs/source/reference/expressions/binary.rst b/py-polars/docs/source/reference/expressions/binary.rst
index 53f380408267..ed13c8dd3596 100644
--- a/py-polars/docs/source/reference/expressions/binary.rst
+++ b/py-polars/docs/source/reference/expressions/binary.rst
@@ -13,5 +13,6 @@ The following methods are available under the `expr.bin` attribute.
     Expr.bin.decode
     Expr.bin.encode
     Expr.bin.ends_with
+    Expr.bin.reinterpret
     Expr.bin.size
     Expr.bin.starts_with
diff --git a/py-polars/docs/source/reference/series/binary.rst b/py-polars/docs/source/reference/series/binary.rst
index c1dab31448d2..25e622d420b2 100644
--- a/py-polars/docs/source/reference/series/binary.rst
+++ b/py-polars/docs/source/reference/series/binary.rst
@@ -13,5 +13,6 @@ The following methods are available under the `Series.bin` attribute.
     Series.bin.decode
     Series.bin.encode
     Series.bin.ends_with
+    Series.bin.reinterpret
     Series.bin.size
     Series.bin.starts_with
diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py
index 92b4109f620b..ea6284cda9c9 100644
--- a/py-polars/polars/_typing.py
+++ b/py-polars/polars/_typing.py
@@ -125,6 +125,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
 RankMethod: TypeAlias = Literal["average", "min", "max", "dense", "ordinal", "random"]
 Roll: TypeAlias = Literal["raise", "forward", "backward"]
 SerializationFormat: TypeAlias = Literal["binary", "json"]
+Endianness: TypeAlias = Literal["little", "big"]
 SizeUnit: TypeAlias = Literal[
     "b",
     "kb",
diff --git a/py-polars/polars/expr/binary.py b/py-polars/polars/expr/binary.py
index 62cf43180fd3..b2ef66f5b4dd 100644
--- a/py-polars/polars/expr/binary.py
+++ b/py-polars/polars/expr/binary.py
@@ -5,10 +5,17 @@
 from polars._utils.parse import parse_into_expression
 from polars._utils.various import scale_bytes
 from polars._utils.wrap import wrap_expr
+from polars.datatypes import parse_into_dtype
 
 if TYPE_CHECKING:
     from polars import Expr
-    from polars._typing import IntoExpr, SizeUnit, TransferEncoding
+    from polars._typing import (
+        Endianness,
+        IntoExpr,
+        PolarsDataType,
+        SizeUnit,
+        TransferEncoding,
+    )
 
 
 class ExprBinaryNameSpace:
@@ -289,3 +296,45 @@ def size(self, unit: SizeUnit = "b") -> Expr:
         sz = wrap_expr(self._pyexpr.bin_size_bytes())
         sz = scale_bytes(sz, unit)
         return sz
+
+    def reinterpret(
+        self, *, dtype: PolarsDataType, endianness: Endianness = "little"
+    ) -> Expr:
+        r"""
+        Interpret a buffer as a numerical polars type.
+
+        Parameters
+        ----------
+        dtype : PolarsDataType
+            Which type to interpret binary column into.
+        endianness : {"big", "little"}, optional
+            Which endianness to use when interpreting bytes, by default "little".
+
+        Returns
+        -------
+        Expr
+            Expression of data type `dtype`.
+            Note that if binary array is too short value will be null.
+            If binary array is too long, remainder will be ignored.
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"data": [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"]})
+        >>> df.with_columns(  # doctest: +IGNORE_RESULT
+        ...     casted=pl.col("data").bin.reinterpret(
+        ...         dtype=pl.Int32, endianness="little"
+        ...     ),
+        ... )
+        shape: (2, 3)
+        ┌─────────────────────┬────────┐
+        │ data                ┆ caster │
+        │ ---                 ┆ ---    │
+        │ binary              ┆ i32    │
+        ╞═════════════════════╪════════╡
+        │ b"\x05\x00\x00\x00" ┆ 5      │
+        │ b"\x10\x00\x01\x00" ┆ 65552  │
+        └─────────────────────┴────────┘
+        """
+        dtype = parse_into_dtype(dtype)
+
+        return wrap_expr(self._pyexpr.from_buffer(dtype, endianness))
diff --git a/py-polars/polars/series/binary.py b/py-polars/polars/series/binary.py
index a1cdf5d8fb80..882fed59d998 100644
--- a/py-polars/polars/series/binary.py
+++ b/py-polars/polars/series/binary.py
@@ -6,7 +6,13 @@
 
 if TYPE_CHECKING:
     from polars import Series
-    from polars._typing import IntoExpr, SizeUnit, TransferEncoding
+    from polars._typing import (
+        Endianness,
+        IntoExpr,
+        PolarsDataType,
+        SizeUnit,
+        TransferEncoding,
+    )
     from polars.polars import PySeries
 
 
@@ -209,3 +215,36 @@ def size(self, unit: SizeUnit = "b") -> Series:
             1.0
         ]
         """
+
+    def reinterpret(
+        self, *, dtype: PolarsDataType, endianness: Endianness = "little"
+    ) -> Series:
+        r"""
+        Interpret a buffer as a numerical polars type.
+
+        Parameters
+        ----------
+        dtype : PolarsDataType
+            Which type to interpret binary column into.
+        endianness : {"big", "little"}, optional
+            Which endianness to use when interpreting bytes, by default "little".
+
+        Returns
+        -------
+        Series
+            Series of data type `dtype`.
+            Note that if binary array is too short value will be null.
+            If binary array is too long, remainder will be ignored.
+
+        Examples
+        --------
+        >>> s = pl.Series("data", [b"\x05\x00\x00\x00", b"\x10\x00\x01\x00"])
+        >>> s.bin.reinterpret(dtype=pl.Int32, endianness="little")
+        shape: (2,)
+        Series: 'data' [i32]
+        [
+            5
+            65552
+        ]
+
+        """
diff --git a/py-polars/tests/unit/operations/namespaces/test_binary.py b/py-polars/tests/unit/operations/namespaces/test_binary.py
index e15ca2010817..ab86b9b51c15 100644
--- a/py-polars/tests/unit/operations/namespaces/test_binary.py
+++ b/py-polars/tests/unit/operations/namespaces/test_binary.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import random
+import struct
 from typing import TYPE_CHECKING
 
 import pytest
@@ -164,3 +166,109 @@ def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None:
         df["data"].bin.size(unit).item(),  # series
     ):
         assert sz == expected
+
+
+@pytest.mark.parametrize(
+    ("dtype", "type_size", "struct_type"),
+    [
+        (pl.Int8, 1, "b"),
+        (pl.UInt8, 1, "B"),
+        (pl.Int16, 2, "h"),
+        (pl.UInt16, 2, "H"),
+        (pl.Int32, 4, "i"),
+        (pl.UInt32, 4, "I"),
+        (pl.Int64, 8, "q"),
+        (pl.UInt64, 8, "Q"),
+        (pl.Float32, 4, "f"),
+        (pl.Float64, 8, "d"),
+    ],
+)
+def test_reinterpret(
+    dtype: pl.DataType,
+    type_size: int,
+    struct_type: str,
+) -> None:
+    # Make test reproducible
+    random.seed(42)
+
+    byte_arr = [random.randbytes(type_size) for _ in range(3)]
+    df = pl.DataFrame({"x": byte_arr})
+
+    for endianness in ["little", "big"]:
+        # So that mypy doesn't complain
+        struct_endianness = "<" if endianness == "little" else ">"
+        expected = [
+            struct.unpack_from(f"{struct_endianness}{struct_type}", elem_bytes)[0]
+            for elem_bytes in byte_arr
+        ]
+        expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})
+
+        result = df.select(
+            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
+        )
+
+        assert_frame_equal(result, expected_df)
+
+
+@pytest.mark.parametrize(
+    ("dtype", "type_size"),
+    [
+        (pl.Int128, 16),
+    ],
+)
+def test_reinterpret_int(
+    dtype: pl.DataType,
+    type_size: int,
+) -> None:
+    # Function used for testing integers that `struct` or `numpy`
+    # doesn't support parsing from bytes.
+    # Rather than creating bytes directly, create integer and view it as bytes
+    is_signed = dtype.is_signed_integer()
+
+    if is_signed:
+        min_val = -(2 ** (type_size - 1))
+        max_val = 2 ** (type_size - 1) - 1
+    else:
+        min_val = 0
+        max_val = 2**type_size - 1
+
+    # Make test reproducible
+    random.seed(42)
+
+    expected = [random.randint(min_val, max_val) for _ in range(3)]
+    expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})
+
+    for endianness in ["little", "big"]:
+        byte_arr = [
+            val.to_bytes(type_size, byteorder=endianness, signed=is_signed)  # type: ignore[arg-type]
+            for val in expected
+        ]
+        df = pl.DataFrame({"x": byte_arr})
+
+        result = df.select(
+            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
+        )
+
+        assert_frame_equal(result, expected_df)
+
+
+def test_reinterpret_invalid() -> None:
+    # Fails because buffer has more than 4 bytes
+    df = pl.DataFrame({"x": [b"d3d3a"]})
+    print(struct.unpack_from("<i", b"d3d3a"))
+    assert_frame_equal(
+        df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)),
+        pl.DataFrame({"x": [None]}, schema={"x": pl.Int32}),
+    )
+
+    # Fails because buffer has less than 4 bytes
+    df = pl.DataFrame({"x": [b"d3"]})
+    print(df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)))
+    assert_frame_equal(
+        df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)),
+        pl.DataFrame({"x": [None]}, schema={"x": pl.Int32}),
+    )
+
+    # Fails because dtype is invalid
+    with pytest.raises(pl.exceptions.InvalidOperationError):
+        df.select(pl.col("x").bin.reinterpret(dtype=pl.String))