Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyString::data(): return the internal representation of the Python unicode object #247

Merged
merged 6 commits into from
Feb 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 112 additions & 1 deletion python3-sys/src/unicodeobject.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use libc::{c_char, c_int, c_void, wchar_t};

use crate::object::*;
use crate::pyport::Py_ssize_t;
#[cfg(not(Py_LIMITED_API))]
use crate::pyport::Py_hash_t;

#[cfg(not(Py_LIMITED_API))]
#[deprecated(since = "0.2.1", note = "Deprecated since Python 3.3 / PEP 393")]
Expand Down Expand Up @@ -123,7 +125,7 @@ extern "C" {
pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
#[cfg(not(Py_3_9))]
pub fn PyUnicode_ClearFreeList() -> c_int;
#[cfg(not(Py_LIMITED_API))]
#[cfg(any(not(Py_LIMITED_API), Py_3_10))]
pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
Expand Down Expand Up @@ -429,4 +431,113 @@ extern "C" {
pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;

#[cfg(not(Py_LIMITED_API))]
fn _PyUnicode_Ready(o: *mut PyObject) -> c_int;
}

/// Common header of every Python `str` object (mirrors CPython's
/// `PyASCIIObject`).
///
/// The accessor functions in this module cast a `*mut PyObject` to this type
/// to read `length` and the packed `state` bits.
#[repr(C)]
#[cfg(not(Py_LIMITED_API))]
pub struct PyASCIIObject {
    /// Generic object header.
    pub ob_base: PyObject,
    /// String length as reported by `PyUnicode_GET_LENGTH`.
    pub length: Py_ssize_t,
    /// Hash slot of the string object.
    pub hash: Py_hash_t,
    /// Packed bitfield. The helpers in this module read the kind from
    /// bits 2..=4, the "compact" flag from bit 5, the "ascii" flag from
    /// bit 6 and the "ready" flag from bit 7.
    pub state: u32,
    /// NOTE(review): presumably the legacy `wchar_t` representation pointer;
    /// confirm against the CPython headers for the targeted versions.
    pub wstr: *mut c_void,
}

/// Header of a compact, non-ASCII Python `str` (mirrors CPython's
/// `PyCompactUnicodeObject`).
///
/// For such strings the character data starts directly after this struct.
#[repr(C)]
#[cfg(not(Py_LIMITED_API))]
pub struct PyCompactUnicodeObject {
    _base: PyASCIIObject,
    /// NOTE(review): presumably the byte length of `utf8` — confirm.
    utf8_length: Py_ssize_t,
    /// NOTE(review): presumably the UTF-8 representation pointer — confirm.
    utf8: *mut u8,
    /// NOTE(review): presumably the length of the `wstr` buffer — confirm.
    wstr_length: Py_ssize_t,
}

/// Layout of a non-compact Python `str` (mirrors CPython's
/// `PyUnicodeObject`).
///
/// CPython declares this struct as a `PyCompactUnicodeObject` followed by the
/// `data` pointer. The base therefore has to be the *compact* header, not the
/// bare `PyASCIIObject`: otherwise `data` would alias the `utf8_length` slot
/// and reading it would yield a bogus pointer.
#[repr(C)]
#[cfg(not(Py_LIMITED_API))]
pub struct PyUnicodeObject {
    _base: PyCompactUnicodeObject,
    /// Pointer to the separately allocated character buffer.
    data: *mut c_void,
}

/// Reports whether the "ascii" flag (bit 6 of the `state` bitfield) is set.
///
/// # Safety
///
/// `o` must point to a live object whose memory starts with a
/// `PyASCIIObject` header.
#[cfg(not(Py_LIMITED_API))]
#[inline]
unsafe fn PyUnicode_IS_ASCII(o: *mut PyObject) -> bool {
    const ASCII_MASK: u32 = 1 << 6;
    let header = o as *mut PyASCIIObject;
    ((*header).state & ASCII_MASK) != 0
}

/// Reports whether the "compact" flag (bit 5 of the `state` bitfield) is set.
///
/// # Safety
///
/// `o` must point to a live object whose memory starts with a
/// `PyASCIIObject` header.
#[cfg(not(Py_LIMITED_API))]
#[inline]
unsafe fn PyUnicode_IS_COMPACT(o: *mut PyObject) -> bool {
    const COMPACT_MASK: u32 = 1 << 5;
    let bits = (*(o as *mut PyASCIIObject)).state;
    (bits & COMPACT_MASK) != 0
}

// Storage "kind" tags, to be compared against the value returned by
// `PyUnicode_KIND`.

// NOTE(review): presumably the legacy wchar_t representation of strings that
// are not "ready" yet — confirm against the CPython headers.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_WCHAR_KIND: u32 = 0;
// One byte per code unit.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_1BYTE_KIND: u32 = 1;
// Two bytes per code unit.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_2BYTE_KIND: u32 = 2;
// Four bytes per code unit.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_4BYTE_KIND: u32 = 4;

/// Extracts the storage kind of a string from its `state` bitfield.
///
/// The kind occupies three bits starting at bit 2; the result is meant to be
/// compared against the `PyUnicode_*_KIND` constants.
///
/// # Safety
///
/// `o` must point to a live Python `str` object. In debug builds this
/// function asserts that the object is a unicode object and is "ready".
#[cfg(not(Py_LIMITED_API))]
#[inline]
pub unsafe fn PyUnicode_KIND(o: *mut PyObject) -> u32 {
    const KIND_SHIFT: u32 = 2;
    const KIND_MASK: u32 = 7;

    debug_assert!(PyUnicode_Check(o) > 0);
    debug_assert!(PyUnicode_IS_READY(o));

    let header = o as *mut PyASCIIObject;
    ((*header).state >> KIND_SHIFT) & KIND_MASK
}

/// Returns a pointer to the character buffer of a string.
///
/// * Non-compact strings: the pointer stored in the `data` field of
///   `PyUnicodeObject`.
/// * Compact ASCII strings: the address directly after the `PyASCIIObject`
///   header.
/// * Other compact strings: the address directly after the
///   `PyCompactUnicodeObject` header.
///
/// # Safety
///
/// `o` must point to a live Python `str` object. In debug builds this
/// function asserts that the object is a unicode object and is "ready".
#[cfg(not(Py_LIMITED_API))]
pub unsafe fn PyUnicode_DATA(o: *mut PyObject) -> *mut c_void {
    debug_assert!(PyUnicode_Check(o) > 0);
    debug_assert!(PyUnicode_IS_READY(o));

    if !PyUnicode_IS_COMPACT(o) {
        // Equivalent of `_PyUnicode_NONCOMPACT_DATA`: the buffer lives in a
        // separate allocation referenced by the object.
        let buffer = (*(o as *mut PyUnicodeObject)).data;
        debug_assert!(!buffer.is_null());
        return buffer;
    }

    // Equivalent of `_PyUnicode_COMPACT_DATA`: the buffer follows the header
    // inline, and the header size depends on the ASCII flag.
    if PyUnicode_IS_ASCII(o) {
        let ascii_header = o as *mut PyASCIIObject;
        ascii_header.offset(1) as *mut c_void
    } else {
        let compact_header = o as *mut PyCompactUnicodeObject;
        compact_header.offset(1) as *mut c_void
    }
}

/// Reads the `length` field from the string's `PyASCIIObject` header.
///
/// # Safety
///
/// `o` must point to a live Python `str` object. In debug builds this
/// function asserts that the object is a unicode object and is "ready".
#[cfg(not(Py_LIMITED_API))]
#[inline]
pub unsafe fn PyUnicode_GET_LENGTH(o: *mut PyObject) -> Py_ssize_t {
    debug_assert!(PyUnicode_Check(o) > 0);
    debug_assert!(PyUnicode_IS_READY(o));

    let header = o as *mut PyASCIIObject;
    (*header).length
}

/// Reports whether the "ready" flag (bit 7 of the `state` bitfield) is set.
///
/// # Safety
///
/// `o` must point to a live object whose memory starts with a
/// `PyASCIIObject` header.
#[cfg(not(Py_LIMITED_API))]
#[inline]
unsafe fn PyUnicode_IS_READY(o: *mut PyObject) -> bool {
    const READY_MASK: u32 = 1 << 7;
    let header = o as *mut PyASCIIObject;
    ((*header).state & READY_MASK) != 0
}

/// Ensures the string is in its "ready" state.
///
/// Returns `0` immediately when the ready flag is already set; otherwise
/// forwards to `_PyUnicode_Ready` and returns its result.
///
/// # Safety
///
/// `o` must point to a live Python `str` object. In debug builds this
/// function asserts that the object is a unicode object.
#[cfg(not(Py_LIMITED_API))]
#[inline]
pub unsafe fn PyUnicode_READY(o: *mut PyObject) -> c_int {
    debug_assert!(PyUnicode_Check(o) > 0);

    if !PyUnicode_IS_READY(o) {
        return _PyUnicode_Ready(o);
    }
    0
}
96 changes: 84 additions & 12 deletions src/objects/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> {
)),
},
PyStringData::Latin1(data) => {
if data.iter().all(|&b| b.is_ascii()) {
if data.is_ascii() {
Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
} else {
Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
Expand Down Expand Up @@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> {
match self {
PyStringData::Utf8(data) => String::from_utf8_lossy(data),
PyStringData::Latin1(data) => {
if data.iter().all(|&b| b.is_ascii()) {
if data.is_ascii() {
Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
} else {
Cow::Owned(data.iter().map(|&b| b as char).collect())
Expand Down Expand Up @@ -283,17 +283,24 @@ impl PyString {
}

#[cfg(feature = "python3-sys")]
fn data_impl(&self, py: Python) -> PyStringData {
// TODO: return the original representation instead
// of forcing the UTF-8 representation to be created.
let mut size: ffi::Py_ssize_t = 0;
fn data_impl(&self, _py: Python) -> PyStringData {
let ptr = self.as_ptr();
unsafe {
let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) as *const u8;
if data.is_null() {
PyErr::fetch(py).print(py);
panic!("PyUnicode_AsUTF8AndSize failed");
let ready = ffi::PyUnicode_READY(ptr);
if ready < 0 {
// should fail only on OOM
ffi::PyErr_Print();
panic!("PyUnicode_READY failed");
}
let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
let data = ffi::PyUnicode_DATA(ptr);
let kind = ffi::PyUnicode_KIND(ptr);
match kind {
ffi::PyUnicode_1BYTE_KIND => PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size)),
ffi::PyUnicode_2BYTE_KIND => PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size)),
ffi::PyUnicode_4BYTE_KIND => PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size)),
_ => panic!("Unknown PyUnicode_KIND")
}
PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
}
}

Expand All @@ -306,7 +313,26 @@ impl PyString {
/// (containing unpaired surrogates, or a Python 2.7 byte string that is
/// not valid UTF-8).
pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
self.data(py).to_string(py)
#[cfg(feature = "python3-sys")]
unsafe {
// On Python 3, we can use the UTF-8 representation stored
// inside the Python string.
// This should produce identical results to
// `self.data(py).to_string(py)` but avoids
// re-encoding the string on every to_string call.
let mut size: ffi::Py_ssize_t = 0;
let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
if data.is_null() {
return Err(PyErr::fetch(py));
} else {
let slice = std::slice::from_raw_parts(data as *const u8, size as usize);
return Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice)));
}
}
#[cfg(feature = "python27-sys")]
{
return self.data(py).to_string(py);
}
}

/// Convert the `PyString` into a Rust string.
Expand Down Expand Up @@ -535,6 +561,7 @@ impl RefFromPyObject for [u8] {
mod test {
use crate::conversion::{RefFromPyObject, ToPyObject};
use crate::python::{Python, PythonObject};
use super::{PyString, PyStringData};

#[test]
fn test_non_bmp() {
Expand Down Expand Up @@ -583,4 +610,49 @@ mod test {
let v = py_bytes.extract::<Vec<u8>>(py).unwrap();
assert_eq!(b"Hello", &v[..]);
}

#[allow(unused_variables)] // when compiling for py2.7
#[test]
fn test_extract_umlaut() {
let gil = Python::acquire_gil();
let py = gil.python();
let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap();
let data = py_string.cast_as::<PyString>(py).unwrap().data(py);
#[cfg(feature = "python3-sys")]
{
if let PyStringData::Latin1(s) = data {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth adding tests for the other PyStringData variants? I assume non-Latin-1 text (e.g. something in Greek) would be Utf16 and non-BMP text (e.g. something with emojis) would be Utf32.

assert_eq!([b'x', b'=', 0xe4], *s);
} else {
panic!("Expected PyStringData::Latin1");
}
}
assert_eq!("x=ä", py_string.extract::<String>(py).unwrap());
}

/// A string holding an unpaired surrogate must be exposed as UTF-16 code
/// units by `data()` on Python 3, and must fail strict `String` extraction.
#[allow(unused_variables)] // when compiling for py2.7
#[test]
fn test_extract_lone_surrogate() {
    let gil = Python::acquire_gil();
    let py = gil.python();
    let obj = py.eval("u'x=\\ud800'", None, None).unwrap();
    let data = obj.cast_as::<PyString>(py).unwrap().data(py);
    #[cfg(feature = "python3-sys")]
    {
        match data {
            PyStringData::Utf16(units) => {
                assert_eq!(['x' as u16, '=' as u16, 0xd800], *units);
            }
            _ => panic!("Expected PyStringData::Utf16"),
        }
    }
    let extracted = obj.extract::<String>(py);
    assert!(extracted.is_err());
}

/// Lossy conversion must replace an unpaired surrogate with U+FFFD instead
/// of failing.
#[test]
fn test_extract_lone_surrogate_lossy() {
    let gil = Python::acquire_gil();
    let py = gil.python();
    let obj = py.eval("u'x=\\ud800'", None, None).unwrap();
    let string = obj.cast_as::<PyString>(py).unwrap();
    let lossy = string.to_string_lossy(py);
    assert_eq!("x=\u{fffd}", lossy);
}
}