Merge pull request #3677 from davidhewitt/string2
implement `PyStringMethods`
This commit is contained in:
commit
43827e39ee
|
@ -0,0 +1 @@
|
||||||
|
Add `PyString::to_cow`. Add `Py<PyString>::to_str`, `Py<PyString>::to_cow`, and `Py<PyString>::to_string_lossy`, as ways to access Python string data safely beyond the GIL lifetime.
|
|
@ -33,3 +33,4 @@ pub use crate::wrap_pyfunction;
|
||||||
// pub(crate) use crate::types::dict::PyDictMethods;
|
// pub(crate) use crate::types::dict::PyDictMethods;
|
||||||
// pub(crate) use crate::types::float::PyFloatMethods;
|
// pub(crate) use crate::types::float::PyFloatMethods;
|
||||||
// pub(crate) use crate::types::sequence::PySequenceMethods;
|
// pub(crate) use crate::types::sequence::PySequenceMethods;
|
||||||
|
// pub(crate) use crate::types::string::PyStringMethods;
|
||||||
|
|
|
@ -301,7 +301,7 @@ mod pysuper;
|
||||||
pub(crate) mod sequence;
|
pub(crate) mod sequence;
|
||||||
pub(crate) mod set;
|
pub(crate) mod set;
|
||||||
mod slice;
|
mod slice;
|
||||||
mod string;
|
pub(crate) mod string;
|
||||||
mod traceback;
|
mod traceback;
|
||||||
mod tuple;
|
mod tuple;
|
||||||
mod typeobject;
|
mod typeobject;
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
#[cfg(not(Py_LIMITED_API))]
|
#[cfg(not(Py_LIMITED_API))]
|
||||||
use crate::exceptions::PyUnicodeDecodeError;
|
use crate::exceptions::PyUnicodeDecodeError;
|
||||||
|
use crate::ffi_ptr_ext::FfiPtrExt;
|
||||||
|
use crate::instance::Py2Borrowed;
|
||||||
|
use crate::types::any::PyAnyMethods;
|
||||||
|
use crate::types::bytes::PyBytesMethods;
|
||||||
use crate::types::PyBytes;
|
use crate::types::PyBytes;
|
||||||
use crate::{ffi, PyAny, PyResult, Python};
|
use crate::{ffi, IntoPy, Py, Py2, PyAny, PyResult, Python};
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::os::raw::c_char;
|
use std::os::raw::c_char;
|
||||||
use std::str;
|
use std::str;
|
||||||
|
@ -165,7 +169,7 @@ impl PyString {
|
||||||
pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> {
|
pub fn from_object<'p>(src: &'p PyAny, encoding: &str, errors: &str) -> PyResult<&'p PyString> {
|
||||||
unsafe {
|
unsafe {
|
||||||
src.py()
|
src.py()
|
||||||
.from_owned_ptr_or_err::<PyString>(ffi::PyUnicode_FromEncodedObject(
|
.from_owned_ptr_or_err(ffi::PyUnicode_FromEncodedObject(
|
||||||
src.as_ptr(),
|
src.as_ptr(),
|
||||||
encoding.as_ptr() as *const c_char,
|
encoding.as_ptr() as *const c_char,
|
||||||
errors.as_ptr() as *const c_char,
|
errors.as_ptr() as *const c_char,
|
||||||
|
@ -173,32 +177,32 @@ impl PyString {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gets the Python string as a byte slice.
|
/// Gets the Python string as a Rust UTF-8 string slice.
|
||||||
///
|
///
|
||||||
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
||||||
/// (containing unpaired surrogates).
|
/// (containing unpaired surrogates).
|
||||||
#[inline]
|
|
||||||
pub fn to_str(&self) -> PyResult<&str> {
|
pub fn to_str(&self) -> PyResult<&str> {
|
||||||
let utf8_slice = {
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
cfg_if::cfg_if! {
|
{
|
||||||
if #[cfg(any(Py_3_10, not(Py_LIMITED_API)))] {
|
Py2Borrowed::from_gil_ref(self).to_str()
|
||||||
// PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
|
}
|
||||||
let mut size: ffi::Py_ssize_t = 0;
|
|
||||||
let data: *const u8 = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
|
#[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
|
||||||
if data.is_null() {
|
{
|
||||||
return Err(crate::PyErr::fetch(self.py()));
|
let bytes = unsafe {
|
||||||
} else {
|
self.py()
|
||||||
unsafe { std::slice::from_raw_parts(data, size as usize) }
|
.from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))
|
||||||
}
|
}?;
|
||||||
} else {
|
Ok(unsafe { std::str::from_utf8_unchecked(bytes.as_bytes()) })
|
||||||
let bytes = unsafe {
|
}
|
||||||
self.py().from_owned_ptr_or_err::<PyBytes>(ffi::PyUnicode_AsUTF8String(self.as_ptr()))?
|
}
|
||||||
};
|
|
||||||
bytes.as_bytes()
|
/// Converts the `PyString` into a Rust string, avoiding copying when possible.
|
||||||
}
|
///
|
||||||
}
|
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
||||||
};
|
/// (containing unpaired surrogates).
|
||||||
Ok(unsafe { std::str::from_utf8_unchecked(utf8_slice) })
|
pub fn to_cow(&self) -> PyResult<Cow<'_, str>> {
|
||||||
|
Py2Borrowed::from_gil_ref(self).to_cow()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts the `PyString` into a Rust string.
|
/// Converts the `PyString` into a Rust string.
|
||||||
|
@ -206,20 +210,7 @@ impl PyString {
|
||||||
/// Unpaired surrogates invalid UTF-8 sequences are
|
/// Unpaired surrogates invalid UTF-8 sequences are
|
||||||
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
|
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
|
||||||
pub fn to_string_lossy(&self) -> Cow<'_, str> {
|
pub fn to_string_lossy(&self) -> Cow<'_, str> {
|
||||||
match self.to_str() {
|
Py2Borrowed::from_gil_ref(self).to_string_lossy()
|
||||||
Ok(s) => Cow::Borrowed(s),
|
|
||||||
Err(_) => {
|
|
||||||
let bytes = unsafe {
|
|
||||||
self.py()
|
|
||||||
.from_owned_ptr::<PyBytes>(ffi::PyUnicode_AsEncodedString(
|
|
||||||
self.as_ptr(),
|
|
||||||
b"utf-8\0" as *const _ as _,
|
|
||||||
b"surrogatepass\0" as *const _ as _,
|
|
||||||
))
|
|
||||||
};
|
|
||||||
String::from_utf8_lossy(bytes.as_bytes())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Obtains the raw data backing the Python string.
|
/// Obtains the raw data backing the Python string.
|
||||||
|
@ -238,6 +229,137 @@ impl PyString {
|
||||||
/// expected on the targets where you plan to distribute your software.
|
/// expected on the targets where you plan to distribute your software.
|
||||||
#[cfg(not(Py_LIMITED_API))]
|
#[cfg(not(Py_LIMITED_API))]
|
||||||
pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
|
pub unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
|
||||||
|
Py2Borrowed::from_gil_ref(self).data()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Implementation of functionality for [`PyString`].
|
||||||
|
///
|
||||||
|
/// These methods are defined for the `Py2<'py, PyString>` smart pointer, so to use method call
|
||||||
|
/// syntax these methods are separated into a trait, because stable Rust does not yet support
|
||||||
|
/// `arbitrary_self_types`.
|
||||||
|
#[doc(alias = "PyString")]
|
||||||
|
pub(crate) trait PyStringMethods<'py> {
|
||||||
|
/// Gets the Python string as a Rust UTF-8 string slice.
|
||||||
|
///
|
||||||
|
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
||||||
|
/// (containing unpaired surrogates).
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
fn to_str(&self) -> PyResult<&str>;
|
||||||
|
|
||||||
|
/// Converts the `PyString` into a Rust string, avoiding copying when possible.
|
||||||
|
///
|
||||||
|
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
||||||
|
/// (containing unpaired surrogates).
|
||||||
|
fn to_cow(&self) -> PyResult<Cow<'_, str>>;
|
||||||
|
|
||||||
|
/// Converts the `PyString` into a Rust string.
|
||||||
|
///
|
||||||
|
/// Unpaired surrogates invalid UTF-8 sequences are
|
||||||
|
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
|
||||||
|
fn to_string_lossy(&self) -> Cow<'_, str>;
|
||||||
|
|
||||||
|
/// Obtains the raw data backing the Python string.
|
||||||
|
///
|
||||||
|
/// If the Python string object was created through legacy APIs, its internal storage format
|
||||||
|
/// will be canonicalized before data is returned.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// This function implementation relies on manually decoding a C bitfield. In practice, this
|
||||||
|
/// works well on common little-endian architectures such as x86_64, where the bitfield has a
|
||||||
|
/// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
|
||||||
|
/// x86_64 platforms.
|
||||||
|
///
|
||||||
|
/// By using this API, you accept responsibility for testing that PyStringData behaves as
|
||||||
|
/// expected on the targets where you plan to distribute your software.
|
||||||
|
#[cfg(not(Py_LIMITED_API))]
|
||||||
|
unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'py> PyStringMethods<'py> for Py2<'py, PyString> {
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
fn to_str(&self) -> PyResult<&str> {
|
||||||
|
Py2Borrowed::from(self).to_str()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn to_cow(&self) -> PyResult<Cow<'_, str>> {
|
||||||
|
Py2Borrowed::from(self).to_cow()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn to_string_lossy(&self) -> Cow<'_, str> {
|
||||||
|
Py2Borrowed::from(self).to_string_lossy()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(Py_LIMITED_API))]
|
||||||
|
unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
|
||||||
|
Py2Borrowed::from(self).data()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Py2Borrowed<'a, '_, PyString> {
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
#[allow(clippy::wrong_self_convention)]
|
||||||
|
fn to_str(self) -> PyResult<&'a str> {
|
||||||
|
// PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
|
||||||
|
let mut size: ffi::Py_ssize_t = 0;
|
||||||
|
let data: *const u8 =
|
||||||
|
unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
|
||||||
|
if data.is_null() {
|
||||||
|
Err(crate::PyErr::fetch(self.py()))
|
||||||
|
} else {
|
||||||
|
Ok(unsafe {
|
||||||
|
std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::wrong_self_convention)]
|
||||||
|
fn to_cow(self) -> PyResult<Cow<'a, str>> {
|
||||||
|
// TODO: this method can probably be deprecated once Python 3.9 support is dropped,
|
||||||
|
// because all versions then support the more efficient `to_str`.
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
{
|
||||||
|
self.to_str().map(Cow::Borrowed)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
|
||||||
|
{
|
||||||
|
let bytes = unsafe {
|
||||||
|
ffi::PyUnicode_AsUTF8String(self.as_ptr())
|
||||||
|
.assume_owned_or_err(self.py())?
|
||||||
|
.downcast_into_unchecked::<PyBytes>()
|
||||||
|
};
|
||||||
|
Ok(Cow::Owned(
|
||||||
|
unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::wrong_self_convention)]
|
||||||
|
fn to_string_lossy(self) -> Cow<'a, str> {
|
||||||
|
let ptr = self.as_ptr();
|
||||||
|
let py = self.py();
|
||||||
|
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
if let Ok(s) = self.to_str() {
|
||||||
|
return Cow::Borrowed(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = unsafe {
|
||||||
|
ffi::PyUnicode_AsEncodedString(
|
||||||
|
ptr,
|
||||||
|
b"utf-8\0".as_ptr().cast(),
|
||||||
|
b"surrogatepass\0".as_ptr().cast(),
|
||||||
|
)
|
||||||
|
.assume_owned(py)
|
||||||
|
.downcast_into_unchecked::<PyBytes>()
|
||||||
|
};
|
||||||
|
Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(Py_LIMITED_API))]
|
||||||
|
unsafe fn data(self) -> PyResult<PyStringData<'a>> {
|
||||||
let ptr = self.as_ptr();
|
let ptr = self.as_ptr();
|
||||||
|
|
||||||
#[cfg(not(Py_3_12))]
|
#[cfg(not(Py_3_12))]
|
||||||
|
@ -275,6 +397,60 @@ impl PyString {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Py<PyString> {
|
||||||
|
/// Gets the Python string as a Rust UTF-8 string slice.
|
||||||
|
///
|
||||||
|
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
||||||
|
/// (containing unpaired surrogates).
|
||||||
|
///
|
||||||
|
/// Because `str` objects are immutable, the returned slice is independent of
|
||||||
|
/// the GIL lifetime.
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
|
||||||
|
self.attach_borrow(py).to_str()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts the `PyString` into a Rust string, avoiding copying when possible.
|
||||||
|
///
|
||||||
|
/// Returns a `UnicodeEncodeError` if the input is not valid unicode
|
||||||
|
/// (containing unpaired surrogates).
|
||||||
|
///
|
||||||
|
/// Because `str` objects are immutable, the returned slice is independent of
|
||||||
|
/// the GIL lifetime.
|
||||||
|
pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
|
||||||
|
self.attach_borrow(py).to_cow()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts the `PyString` into a Rust string.
|
||||||
|
///
|
||||||
|
/// Unpaired surrogates invalid UTF-8 sequences are
|
||||||
|
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
|
||||||
|
///
|
||||||
|
/// Because `str` objects are immutable, the returned slice is independent of
|
||||||
|
/// the GIL lifetime.
|
||||||
|
pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
|
||||||
|
self.attach_borrow(py).to_string_lossy()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IntoPy<Py<PyString>> for Py2<'_, PyString> {
|
||||||
|
fn into_py(self, _py: Python<'_>) -> Py<PyString> {
|
||||||
|
self.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IntoPy<Py<PyString>> for &Py2<'_, PyString> {
|
||||||
|
fn into_py(self, _py: Python<'_>) -> Py<PyString> {
|
||||||
|
self.clone().into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl IntoPy<Py<PyString>> for &'_ Py<PyString> {
|
||||||
|
fn into_py(self, py: Python<'_>) -> Py<PyString> {
|
||||||
|
self.clone_ref(py)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -284,7 +460,7 @@ mod tests {
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_to_str_ascii() {
|
fn test_to_str_utf8() {
|
||||||
Python::with_gil(|py| {
|
Python::with_gil(|py| {
|
||||||
let s = "ascii 🐈";
|
let s = "ascii 🐈";
|
||||||
let obj: PyObject = PyString::new(py, s).into();
|
let obj: PyObject = PyString::new(py, s).into();
|
||||||
|
@ -479,4 +655,42 @@ mod tests {
|
||||||
assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
|
assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_py_to_str_utf8() {
|
||||||
|
Python::with_gil(|py| {
|
||||||
|
let s = "ascii 🐈";
|
||||||
|
let py_string: Py<PyString> = PyString::new(py, s).into_py(py);
|
||||||
|
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
assert_eq!(s, py_string.to_str(py).unwrap());
|
||||||
|
|
||||||
|
assert_eq!(s, py_string.to_cow(py).unwrap());
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_py_to_str_surrogate() {
|
||||||
|
Python::with_gil(|py| {
|
||||||
|
let py_string: Py<PyString> =
|
||||||
|
py.eval(r"'\ud800'", None, None).unwrap().extract().unwrap();
|
||||||
|
|
||||||
|
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
|
||||||
|
assert!(py_string.to_str(py).is_err());
|
||||||
|
|
||||||
|
assert!(py_string.to_cow(py).is_err());
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_py_to_string_lossy() {
|
||||||
|
Python::with_gil(|py| {
|
||||||
|
let py_string: Py<PyString> = py
|
||||||
|
.eval(r"'🐈 Hello \ud800World'", None, None)
|
||||||
|
.unwrap()
|
||||||
|
.extract()
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(py_string.to_string_lossy(py), "🐈 Hello <20><><EFBFBD>World");
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue