From 6355a0f9f890f5cac2a57dbf6567ef179036dafb Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Mon, 5 Feb 2024 13:02:29 +0000 Subject: [PATCH 1/5] add `PyBackedStr` and `PyBackedBytes` --- src/lib.rs | 1 + src/py_backed.rs | 117 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 src/py_backed.rs diff --git a/src/lib.rs b/src/lib.rs index 3f91eb56913..6323a1f1ac9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -426,6 +426,7 @@ pub mod marshal; pub mod sync; pub mod panic; pub mod prelude; +pub mod py_backed; pub mod pycell; pub mod pyclass; pub mod pyclass_init; diff --git a/src/py_backed.rs b/src/py_backed.rs new file mode 100644 index 00000000000..e2c60633c72 --- /dev/null +++ b/src/py_backed.rs @@ -0,0 +1,117 @@ +use std::ops::Deref; + +use crate::{ + types::{ + any::PyAnyMethods, bytearray::PyByteArrayMethods, bytes::PyBytesMethods, + string::PyStringMethods, PyByteArray, PyBytes, PyString, + }, + Bound, DowncastError, FromPyObject, Py, PyAny, PyResult, +}; + +/// A wrapper around `str` where the storage is owned by a Python `bytes` or `str` object. +/// +/// This type gives access to the underlying data via a `Deref` implementation. +pub struct PyBackedStr { + #[allow(dead_code)] + storage: PyBackedStrStorage, + data: *const u8, + length: usize, +} + +#[allow(dead_code)] +enum PyBackedStrStorage { + String(Py), + Bytes(Py), +} + +impl Deref for PyBackedStr { + type Target = str; + fn deref(&self) -> &str { + unsafe { + // Safety: `data` is a pointer to the start of a valid UTF-8 string of length `length`. + std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.data, self.length)) + } + } +} + +impl FromPyObject<'_> for PyBackedStr { + fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { + let py_string = obj.downcast::()?; + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + { + let s = py_string.to_str()?; + let data = s.as_ptr(); + let length = s.len(); + Ok(Self { + storage: PyBackedStrStorage::String(py_string.to_owned().unbind()), + data, + length, + }) + } + #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] + { + let bytes = string.encode_utf8()?; + let b = bytes.as_bytes(); + let data = b.as_ptr(); + let length = b.len(); + Ok(Self { + storage: PyBackedStrStorage::Bytes(bytes.unbind()), + data, + length, + }) + } + } +} + +/// A wrapper around `[u8]` where the storage is either owned by a Python `bytes` object, or a Rust `Vec`. +/// +/// This type gives access to the underlying data via a `Deref` implementation. +pub struct PyBackedBytes { + #[allow(dead_code)] // only held so that the storage is not dropped + storage: PyBackedBytesStorage, + data: *const u8, + length: usize, +} + +enum PyBackedBytesStorage { + Python(Py), + Rust(Vec), +} + +impl Deref for PyBackedBytes { + type Target = [u8]; + fn deref(&self) -> &[u8] { + unsafe { + // Safety: `data` is a pointer to the start of a buffer of length `length`. + std::slice::from_raw_parts(self.data, self.length) + } + } +} + +impl FromPyObject<'_> for PyBackedBytes { + fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { + if let Ok(bytes) = obj.downcast::() { + let b = bytes.as_bytes(); + let data = b.as_ptr(); + let len = b.len(); + return Ok(Self { + storage: PyBackedBytesStorage::Python(bytes.to_owned().unbind()), + data, + length: len, + }); + } + + if let Ok(bytearray) = obj.downcast::() { + let s = bytearray.to_vec(); + let data = s.as_ptr(); + let len = s.len(); + return Ok(Self { + storage: PyBackedBytesStorage::Rust(s), + data, + length: len, + }); + } + + return Err(DowncastError::new(obj, "`bytes` or `bytearray`").into()); + } +} From 5d9160e2597233d720e26c4af905e0ceb3fe2bf4 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Wed, 14 Feb 2024 23:13:16 +0000 Subject: [PATCH 2/5] review: adamreichold feedback --- src/lib.rs | 2 +- src/py_backed.rs | 117 --------------------------- src/pybacked.rs | 200 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 118 deletions(-) delete mode 100644 src/py_backed.rs create mode 100644 src/pybacked.rs diff --git a/src/lib.rs b/src/lib.rs index 6323a1f1ac9..2a970ce1d5f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -426,7 +426,7 @@ pub mod marshal; pub mod sync; pub mod panic; pub mod prelude; -pub mod py_backed; +pub mod pybacked; pub mod pycell; pub mod pyclass; pub mod pyclass_init; diff --git a/src/py_backed.rs b/src/py_backed.rs deleted file mode 100644 index e2c60633c72..00000000000 --- a/src/py_backed.rs +++ /dev/null @@ -1,117 +0,0 @@ -use std::ops::Deref; - -use crate::{ - types::{ - any::PyAnyMethods, bytearray::PyByteArrayMethods, bytes::PyBytesMethods, - string::PyStringMethods, PyByteArray, PyBytes, PyString, - }, - Bound, DowncastError, FromPyObject, Py, PyAny, PyResult, -}; - -/// A wrapper around `str` where the storage is owned by a Python `bytes` or `str` object. -/// -/// This type gives access to the underlying data via a `Deref` implementation. -pub struct PyBackedStr { - #[allow(dead_code)] - storage: PyBackedStrStorage, - data: *const u8, - length: usize, -} - -#[allow(dead_code)] -enum PyBackedStrStorage { - String(Py), - Bytes(Py), -} - -impl Deref for PyBackedStr { - type Target = str; - fn deref(&self) -> &str { - unsafe { - // Safety: `data` is a pointer to the start of a valid UTF-8 string of length `length`. - std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.data, self.length)) - } - } -} - -impl FromPyObject<'_> for PyBackedStr { - fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { - let py_string = obj.downcast::()?; - #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] - { - let s = py_string.to_str()?; - let data = s.as_ptr(); - let length = s.len(); - Ok(Self { - storage: PyBackedStrStorage::String(py_string.to_owned().unbind()), - data, - length, - }) - } - #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] - { - let bytes = string.encode_utf8()?; - let b = bytes.as_bytes(); - let data = b.as_ptr(); - let length = b.len(); - Ok(Self { - storage: PyBackedStrStorage::Bytes(bytes.unbind()), - data, - length, - }) - } - } -} - -/// A wrapper around `[u8]` where the storage is either owned by a Python `bytes` object, or a Rust `Vec`. -/// -/// This type gives access to the underlying data via a `Deref` implementation. -pub struct PyBackedBytes { - #[allow(dead_code)] // only held so that the storage is not dropped - storage: PyBackedBytesStorage, - data: *const u8, - length: usize, -} - -enum PyBackedBytesStorage { - Python(Py), - Rust(Vec), -} - -impl Deref for PyBackedBytes { - type Target = [u8]; - fn deref(&self) -> &[u8] { - unsafe { - // Safety: `data` is a pointer to the start of a buffer of length `length`. - std::slice::from_raw_parts(self.data, self.length) - } - } -} - -impl FromPyObject<'_> for PyBackedBytes { - fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { - if let Ok(bytes) = obj.downcast::() { - let b = bytes.as_bytes(); - let data = b.as_ptr(); - let len = b.len(); - return Ok(Self { - storage: PyBackedBytesStorage::Python(bytes.to_owned().unbind()), - data, - length: len, - }); - } - - if let Ok(bytearray) = obj.downcast::() { - let s = bytearray.to_vec(); - let data = s.as_ptr(); - let len = s.len(); - return Ok(Self { - storage: PyBackedBytesStorage::Rust(s), - data, - length: len, - }); - } - - return Err(DowncastError::new(obj, "`bytes` or `bytearray`").into()); - } -} diff --git a/src/pybacked.rs b/src/pybacked.rs new file mode 100644 index 00000000000..f5b51234831 --- /dev/null +++ b/src/pybacked.rs @@ -0,0 +1,200 @@ +use std::{ops::Deref, ptr::NonNull}; + +use crate::{ + types::{ + any::PyAnyMethods, bytearray::PyByteArrayMethods, bytes::PyBytesMethods, + string::PyStringMethods, PyByteArray, PyBytes, PyString, + }, + Bound, DowncastError, FromPyObject, Py, PyAny, PyErr, PyResult, +}; + +/// A wrapper around `str` where the storage is owned by a Python `bytes` or `str` object. +/// +/// This type gives access to the underlying data via a `Deref` implementation. +pub struct PyBackedStr { + #[allow(dead_code)] // only held so that the storage is not dropped + storage: Py, + data: NonNull, + length: usize, +} + +impl Deref for PyBackedStr { + type Target = str; + fn deref(&self) -> &str { + unsafe { + // Safety: `data` is a pointer to the start of a valid UTF-8 string of length `length`. + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + self.data.as_ptr(), + self.length, + )) + } + } +} + +impl TryFrom> for PyBackedStr { + type Error = PyErr; + fn try_from(py_string: Bound<'_, PyString>) -> Result { + #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] + { + let s = py_string.to_str()?; + let data = unsafe { NonNull::new_unchecked(dbg!(s.as_ptr() as _)) }; + let length = s.len(); + Ok(Self { + storage: py_string.as_any().to_owned().unbind(), + data, + length, + }) + } + #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] + { + let bytes = string.encode_utf8()?; + let b = bytes.as_bytes(); + let data = unsafe { NonNull::new_unchecked(b.as_ptr()) }; + let length = b.len(); + Ok(Self { + storage: bytes.into_any().unbind(), + data, + length, + }) + } + } +} + +impl FromPyObject<'_> for PyBackedStr { + fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { + let py_string = obj.downcast::()?.to_owned(); + Self::try_from(py_string) + } +} + +/// A wrapper around `[u8]` where the storage is either owned by a Python `bytes` object, or a Rust `Box<[u8]>`. +/// +/// This type gives access to the underlying data via a `Deref` implementation. +pub struct PyBackedBytes { + #[allow(dead_code)] // only held so that the storage is not dropped + storage: PyBackedBytesStorage, + data: NonNull, + length: usize, +} + +enum PyBackedBytesStorage { + Python(Py), + Rust(Box<[u8]>), +} + +impl Deref for PyBackedBytes { + type Target = [u8]; + fn deref(&self) -> &[u8] { + unsafe { + // Safety: `data` is a pointer to the start of a buffer of length `length`. + std::slice::from_raw_parts(self.data.as_ptr(), self.length) + } + } +} + +impl From> for PyBackedBytes { + fn from(py_bytes: Bound<'_, PyBytes>) -> Self { + let b = py_bytes.as_bytes(); + let data = unsafe { NonNull::new_unchecked(b.as_ptr() as _) }; + let length = b.len(); + Self { + storage: PyBackedBytesStorage::Python(py_bytes.to_owned().unbind()), + data, + length, + } + } +} + +impl From> for PyBackedBytes { + fn from(py_bytearray: Bound<'_, PyByteArray>) -> Self { + let s = py_bytearray.to_vec().into_boxed_slice(); + let data = unsafe { NonNull::new_unchecked(s.as_ptr() as _) }; + let length = s.len(); + Self { + storage: PyBackedBytesStorage::Rust(s), + data, + length, + } + } +} + +impl FromPyObject<'_> for PyBackedBytes { + fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult { + if let Ok(bytes) = obj.downcast::() { + Ok(Self::from(bytes.to_owned())) + } else if let Ok(bytearray) = obj.downcast::() { + Ok(Self::from(bytearray.to_owned())) + } else { + Err(DowncastError::new(obj, "`bytes` or `bytearray`").into()) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::Python; + + #[test] + fn py_backed_str_empty() { + Python::with_gil(|py| { + let s = PyString::new_bound(py, ""); + let py_backed_str = s.extract::().unwrap(); + assert_eq!(&*py_backed_str, ""); + }); + } + + #[test] + fn py_backed_str() { + Python::with_gil(|py| { + let s = PyString::new_bound(py, "hello"); + let py_backed_str = s.extract::().unwrap(); + assert_eq!(&*py_backed_str, "hello"); + }); + } + + #[test] + fn py_backed_str_try_from() { + Python::with_gil(|py| { + let s = PyString::new_bound(py, "hello"); + let py_backed_str = PyBackedStr::try_from(s).unwrap(); + assert_eq!(&*py_backed_str, "hello"); + }); + } + + #[test] + fn py_backed_bytes_empty() { + Python::with_gil(|py| { + let b = PyBytes::new_bound(py, &[]); + let py_backed_bytes = b.extract::().unwrap(); + assert_eq!(&*py_backed_bytes, &[]); + }); + } + + #[test] + fn py_backed_bytes() { + Python::with_gil(|py| { + let b = PyBytes::new_bound(py, b"abcde"); + let py_backed_bytes = b.extract::().unwrap(); + assert_eq!(&*py_backed_bytes, b"abcde"); + }); + } + + #[test] + fn py_backed_bytes_from_bytes() { + Python::with_gil(|py| { + let b = PyBytes::new_bound(py, b"abcde"); + let py_backed_bytes = PyBackedBytes::from(b); + assert_eq!(&*py_backed_bytes, b"abcde"); + }); + } + + #[test] + fn py_backed_bytes_from_bytearray() { + Python::with_gil(|py| { + let b = PyByteArray::new_bound(py, b"abcde"); + let py_backed_bytes = PyBackedBytes::from(b); + assert_eq!(&*py_backed_bytes, b"abcde"); + }); + } +} From 775a0f48d0724d0d47ceb473a3d98ff6e56dbbc2 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Wed, 14 Feb 2024 23:20:07 +0000 Subject: [PATCH 3/5] use `NonNull<[u8]>` --- src/pybacked.rs | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/src/pybacked.rs b/src/pybacked.rs index f5b51234831..d1e60db6149 100644 --- a/src/pybacked.rs +++ b/src/pybacked.rs @@ -14,20 +14,14 @@ use crate::{ pub struct PyBackedStr { #[allow(dead_code)] // only held so that the storage is not dropped storage: Py, - data: NonNull, - length: usize, + data: NonNull<[u8]>, } impl Deref for PyBackedStr { type Target = str; fn deref(&self) -> &str { - unsafe { - // Safety: `data` is a pointer to the start of a valid UTF-8 string of length `length`. - std::str::from_utf8_unchecked(std::slice::from_raw_parts( - self.data.as_ptr(), - self.length, - )) - } + // Safety: `data` is known to be immutable utf8 string and owned by self + unsafe { std::str::from_utf8_unchecked(self.data.as_ref()) } } } @@ -37,24 +31,21 @@ impl TryFrom> for PyBackedStr { #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] { let s = py_string.to_str()?; - let data = unsafe { NonNull::new_unchecked(dbg!(s.as_ptr() as _)) }; - let length = s.len(); + let data = NonNull::from(s.as_bytes()); Ok(Self { storage: py_string.as_any().to_owned().unbind(), data, - length, }) } #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] { let bytes = string.encode_utf8()?; let b = bytes.as_bytes(); - let data = unsafe { NonNull::new_unchecked(b.as_ptr()) }; + let data = NonNull::from(b); let length = b.len(); Ok(Self { storage: bytes.into_any().unbind(), data, - length, }) } } @@ -73,8 +64,7 @@ impl FromPyObject<'_> for PyBackedStr { pub struct PyBackedBytes { #[allow(dead_code)] // only held so that the storage is not dropped storage: PyBackedBytesStorage, - data: NonNull, - length: usize, + data: NonNull<[u8]>, } enum PyBackedBytesStorage { @@ -85,22 +75,18 @@ enum PyBackedBytesStorage { impl Deref for PyBackedBytes { type Target = [u8]; fn deref(&self) -> &[u8] { - unsafe { - // Safety: `data` is a pointer to the start of a buffer of length `length`. - std::slice::from_raw_parts(self.data.as_ptr(), self.length) - } + // Safety: `data` is known to be immutable and owned by self + unsafe { self.data.as_ref() } } } impl From> for PyBackedBytes { fn from(py_bytes: Bound<'_, PyBytes>) -> Self { let b = py_bytes.as_bytes(); - let data = unsafe { NonNull::new_unchecked(b.as_ptr() as _) }; - let length = b.len(); + let data = NonNull::from(b); Self { storage: PyBackedBytesStorage::Python(py_bytes.to_owned().unbind()), data, - length, } } } @@ -108,12 +94,10 @@ impl From> for PyBackedBytes { impl From> for PyBackedBytes { fn from(py_bytearray: Bound<'_, PyByteArray>) -> Self { let s = py_bytearray.to_vec().into_boxed_slice(); - let data = unsafe { NonNull::new_unchecked(s.as_ptr() as _) }; - let length = s.len(); + let data = NonNull::from(s.as_ref()); Self { storage: PyBackedBytesStorage::Rust(s), data, - length, } } } From 419ccb371c468825912181da0de7fb5bfb4aac9b Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Wed, 14 Feb 2024 23:24:44 +0000 Subject: [PATCH 4/5] clippy and newsfragment --- newsfragments/3802.added.md | 1 + src/pybacked.rs | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 newsfragments/3802.added.md diff --git a/newsfragments/3802.added.md b/newsfragments/3802.added.md new file mode 100644 index 00000000000..86b98e9df97 --- /dev/null +++ b/newsfragments/3802.added.md @@ -0,0 +1 @@ +Add `PyBackedStr` and `PyBackedBytes`, as alternatives to `&str` and `&bytes` where a Python object owns the data. diff --git a/src/pybacked.rs b/src/pybacked.rs index d1e60db6149..b88781f35e6 100644 --- a/src/pybacked.rs +++ b/src/pybacked.rs @@ -1,3 +1,5 @@ +//! Contains types for working with Python objects that own the underlying data. + use std::{ops::Deref, ptr::NonNull}; use crate::{ @@ -39,7 +41,7 @@ impl TryFrom> for PyBackedStr { } #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))] { - let bytes = string.encode_utf8()?; + let bytes = py_string.encode_utf8()?; let b = bytes.as_bytes(); let data = NonNull::from(b); let length = b.len(); @@ -67,6 +69,7 @@ pub struct PyBackedBytes { data: NonNull<[u8]>, } +#[allow(dead_code)] enum PyBackedBytesStorage { Python(Py), Rust(Box<[u8]>), From 6aec91cba6b634aa496d203ed0ed729dd3884181 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Thu, 15 Feb 2024 08:21:20 +0100 Subject: [PATCH 5/5] drop binding unused after refactoring --- src/pybacked.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pybacked.rs b/src/pybacked.rs index b88781f35e6..bd29c830e65 100644 --- a/src/pybacked.rs +++ b/src/pybacked.rs @@ -44,7 +44,6 @@ impl TryFrom> for PyBackedStr { let bytes = py_string.encode_utf8()?; let b = bytes.as_bytes(); let data = NonNull::from(b); - let length = b.len(); Ok(Self { storage: bytes.into_any().unbind(), data,