Simplify PyString, PyBytes, PyUnicode
PyStringData comments mention wanting to receive interpreter-specific unicode types. I tried implementing this, but it's more complex and slower to call libpython to fill a buffer of u32 or such and convert that in Rust using widestring. This implementation receives UTF-8 from PyUnicode_AsUTF8AndSize() in python3 and PyUnicode_AsUTF8String() in python2. PyStringData is removed as unnecessary. The data() method on PyString, PyBytes, and PyUnicode is replaced with as_bytes(). The python2 API changes improve the performance of extracting unicode strings.
This commit is contained in:
parent
3a95d163ca
commit
851d2207c0
|
@ -12,10 +12,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|||
* `PyComplex` by kngwyu in [#226](https://github.com/PyO3/pyo3/pull/226)
|
||||
* `PyDict::from_sequence()`, equivalent to `dict([(key, val), ...])`
|
||||
* Bindings for the `datetime` standard library types: `PyDate`, `PyTime`, `PyDateTime`, `PyTzInfo`, `PyDelta` with associated `ffi` types, by pganssle [#200](https://github.com/PyO3/pyo3/pull/200).
|
||||
* `PyString`, `PyUnicode`, and `PyBytes` now have an `as_bytes()` method that returns `&[u8]`.
|
||||
|
||||
### Removed
|
||||
* Removed most entries from the prelude. The new prelude is small and clear.
|
||||
* Slowly removing specialization uses
|
||||
* `PyString`, `PyUnicode`, and `PyBytes` no longer have a `data()` method
|
||||
(replaced by `as_bytes()`) and `PyStringData` has been removed.
|
||||
|
||||
### Changed
|
||||
* Removes the types from the root module and the prelude. They now live in `pyo3::types` instead.
|
||||
|
|
|
@ -27,7 +27,6 @@ pub use self::slice::{PySlice, PySliceIndices};
|
|||
pub use self::string::{PyBytes, PyString, PyString as PyUnicode};
|
||||
#[cfg(not(Py_3))]
|
||||
pub use self::string2::{PyBytes, PyString, PyUnicode};
|
||||
pub use self::stringdata::PyStringData;
|
||||
pub use self::tuple::PyTuple;
|
||||
pub use self::typeobject::PyType;
|
||||
use ffi;
|
||||
|
@ -195,7 +194,6 @@ mod module;
|
|||
mod sequence;
|
||||
mod set;
|
||||
mod slice;
|
||||
mod stringdata;
|
||||
mod stringutils;
|
||||
mod tuple;
|
||||
mod typeobject;
|
||||
|
|
|
@ -5,13 +5,13 @@ use std::borrow::Cow;
|
|||
use std::os::raw::c_char;
|
||||
use std::{mem, str};
|
||||
|
||||
use super::PyStringData;
|
||||
use err::{PyErr, PyResult};
|
||||
use ffi;
|
||||
use instance::{Py, PyObjectWithToken};
|
||||
use object::PyObject;
|
||||
use python::{Python, ToPyPointer};
|
||||
use types::PyObjectRef;
|
||||
use types::exceptions;
|
||||
|
||||
/// Represents a Python `string`.
|
||||
#[repr(transparent)]
|
||||
|
@ -50,18 +50,14 @@ impl PyString {
|
|||
}
|
||||
}
|
||||
|
||||
/// Gets the python string data in its underlying representation.
|
||||
pub fn data(&self) -> PyStringData {
|
||||
// TODO: return the original representation instead
|
||||
// of forcing the UTF-8 representation to be created.
|
||||
/// Get the Python string as a byte slice.
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
unsafe {
|
||||
let mut size: ffi::Py_ssize_t = mem::uninitialized();
|
||||
let data = ffi::PyUnicode_AsUTF8AndSize(self.0.as_ptr(), &mut size) as *const u8;
|
||||
if data.is_null() {
|
||||
PyErr::fetch(self.py()).print(self.py());
|
||||
panic!("PyUnicode_AsUTF8AndSize failed");
|
||||
}
|
||||
PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
|
||||
debug_assert!(!data.is_null());
|
||||
std::slice::from_raw_parts(data, size as usize)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -70,7 +66,12 @@ impl PyString {
|
|||
/// Returns a `UnicodeDecodeError` if the input is not valid unicode
|
||||
/// (containing unpaired surrogates).
|
||||
pub fn to_string(&self) -> PyResult<Cow<str>> {
|
||||
self.data().to_string(self.py())
|
||||
match std::str::from_utf8(self.as_bytes()) {
|
||||
Ok(s) => Ok(Cow::Borrowed(s)),
|
||||
Err(e) => Err(PyErr::from_instance(
|
||||
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert the `PyString` into a Rust string.
|
||||
|
@ -78,7 +79,7 @@ impl PyString {
|
|||
/// Unpaired surrogates and invalid UTF-8 sequences are
|
||||
/// replaced with U+FFFD REPLACEMENT CHARACTER.
|
||||
pub fn to_string_lossy(&self) -> Cow<str> {
|
||||
self.data().to_string_lossy()
|
||||
String::from_utf8_lossy(self.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -103,11 +104,13 @@ impl PyBytes {
|
|||
))
|
||||
}
|
||||
|
||||
/// Gets the Python string data as byte slice.
|
||||
pub fn data(&self) -> &[u8] {
|
||||
/// Get the Python string as a byte slice.
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
unsafe {
|
||||
let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8;
|
||||
let length = ffi::PyBytes_Size(self.as_ptr()) as usize;
|
||||
debug_assert!(!buffer.is_null());
|
||||
std::slice::from_raw_parts(buffer, length)
|
||||
}
|
||||
}
|
||||
|
@ -115,9 +118,12 @@ impl PyBytes {
|
|||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use conversion::{FromPyObject, ToPyObject};
|
||||
use std::borrow::Cow;
|
||||
use conversion::{FromPyObject, ToPyObject, PyTryFrom};
|
||||
use instance::AsPyRef;
|
||||
use python::Python;
|
||||
use object::PyObject;
|
||||
use super::PyString;
|
||||
|
||||
#[test]
|
||||
fn test_non_bmp() {
|
||||
|
@ -138,4 +144,36 @@ mod test {
|
|||
let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap();
|
||||
assert_eq!(s, s2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_as_bytes() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let s = "ascii 🐈";
|
||||
let obj: PyObject = PyString::new(py, s).into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert_eq!(s.as_bytes(), py_string.as_bytes());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_string_ascii() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let s = "ascii";
|
||||
let obj: PyObject = PyString::new(py, s).into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert!(py_string.to_string().is_ok());
|
||||
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_string_unicode() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let s = "哈哈🐈";
|
||||
let obj: PyObject = PyString::new(py, s).into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert!(py_string.to_string().is_ok());
|
||||
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,13 +7,14 @@ use std::borrow::Cow;
|
|||
use std::os::raw::c_char;
|
||||
use std::str;
|
||||
|
||||
use super::{PyObjectRef, PyStringData};
|
||||
use err::PyResult;
|
||||
use super::PyObjectRef;
|
||||
use err::{PyErr, PyResult};
|
||||
use ffi;
|
||||
use instance::{Py, PyObjectWithToken};
|
||||
use object::PyObject;
|
||||
use objectprotocol::ObjectProtocol;
|
||||
use python::{Python, ToPyPointer};
|
||||
use types::exceptions;
|
||||
|
||||
/// Represents a Python `string`.
|
||||
#[repr(transparent)]
|
||||
|
@ -62,16 +63,13 @@ impl PyString {
|
|||
}
|
||||
}
|
||||
|
||||
/// Gets the python string data in its underlying representation.
|
||||
///
|
||||
/// For Python 2 byte strings, this function always returns `PyStringData::Utf8`,
|
||||
/// even if the bytes are not valid UTF-8.
|
||||
/// For unicode strings, returns the underlying representation used by Python.
|
||||
pub fn data(&self) -> PyStringData {
|
||||
/// Get the Python string as a byte slice.
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
if let Ok(bytes) = self.cast_as::<PyBytes>() {
|
||||
PyStringData::Utf8(bytes.data())
|
||||
bytes.as_bytes()
|
||||
} else if let Ok(unicode) = self.cast_as::<PyUnicode>() {
|
||||
unicode.data()
|
||||
unicode.as_bytes()
|
||||
} else {
|
||||
panic!("PyString is neither `str` nor `unicode`")
|
||||
}
|
||||
|
@ -86,7 +84,12 @@ impl PyString {
|
|||
/// (containing unpaired surrogates, or a Python 2.7 byte string that is
|
||||
/// not valid UTF-8).
|
||||
pub fn to_string(&self) -> PyResult<Cow<str>> {
|
||||
self.data().to_string(self.py())
|
||||
match std::str::from_utf8(self.as_bytes()) {
|
||||
Ok(s) => Ok(Cow::Borrowed(s)),
|
||||
Err(e) => Err(PyErr::from_instance(
|
||||
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert the `PyString` into a Rust string.
|
||||
|
@ -97,7 +100,7 @@ impl PyString {
|
|||
/// Unpaired surrogates and (on Python 2.7) invalid UTF-8 sequences are
|
||||
/// replaced with U+FFFD REPLACEMENT CHARACTER.
|
||||
pub fn to_string_lossy(&self) -> Cow<str> {
|
||||
self.data().to_string_lossy()
|
||||
String::from_utf8_lossy(self.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -112,11 +115,13 @@ impl PyBytes {
|
|||
unsafe { Py::from_owned_ptr_or_panic(ffi::PyBytes_FromStringAndSize(ptr, len)) }
|
||||
}
|
||||
|
||||
/// Gets the Python string data as byte slice.
|
||||
pub fn data(&self) -> &[u8] {
|
||||
/// Get the Python string as a byte slice.
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
unsafe {
|
||||
let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8;
|
||||
let length = ffi::PyBytes_Size(self.as_ptr()) as usize;
|
||||
debug_assert!(!buffer.is_null());
|
||||
std::slice::from_raw_parts(buffer, length)
|
||||
}
|
||||
}
|
||||
|
@ -145,12 +150,17 @@ impl PyUnicode {
|
|||
}
|
||||
}
|
||||
|
||||
/// Gets the python string data in its underlying representation.
|
||||
pub fn data(&self) -> PyStringData {
|
||||
/// Get the Python string as a byte slice.
|
||||
#[inline]
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
unsafe {
|
||||
let buffer = ffi::PyUnicode_AS_UNICODE(self.as_ptr());
|
||||
let length = ffi::PyUnicode_GET_SIZE(self.as_ptr()) as usize;
|
||||
std::slice::from_raw_parts(buffer, length).into()
|
||||
let data: Py<PyBytes> = Py::from_owned_ptr(
|
||||
ffi::PyUnicode_AsUTF8String(self.0.as_ptr()),
|
||||
);
|
||||
let buffer = ffi::PyBytes_AsString(data.as_ptr()) as *const u8;
|
||||
let length = ffi::PyBytes_Size(data.as_ptr()) as usize;
|
||||
debug_assert!(!buffer.is_null());
|
||||
std::slice::from_raw_parts(buffer, length)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -159,14 +169,19 @@ impl PyUnicode {
|
|||
/// Returns a `UnicodeDecodeError` if the input is not valid unicode
|
||||
/// (containing unpaired surrogates).
|
||||
pub fn to_string(&self) -> PyResult<Cow<str>> {
|
||||
self.data().to_string(self.py())
|
||||
match std::str::from_utf8(self.as_bytes()) {
|
||||
Ok(s) => Ok(Cow::Borrowed(s)),
|
||||
Err(e) => Err(PyErr::from_instance(
|
||||
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert the `PyString` into a Rust string.
|
||||
///
|
||||
/// Unpaired surrogates are replaced with U+FFFD REPLACEMENT CHARACTER.
|
||||
pub fn to_string_lossy(&self) -> Cow<str> {
|
||||
self.data().to_string_lossy()
|
||||
String::from_utf8_lossy(self.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -188,9 +203,12 @@ impl std::convert::From<Py<PyUnicode>> for Py<PyString> {
|
|||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use conversion::{FromPyObject, ToPyObject};
|
||||
use std::borrow::Cow;
|
||||
use conversion::{FromPyObject, ToPyObject, PyTryFrom};
|
||||
use instance::AsPyRef;
|
||||
use python::Python;
|
||||
use object::PyObject;
|
||||
use super::PyString;
|
||||
|
||||
#[test]
|
||||
fn test_non_bmp() {
|
||||
|
@ -211,4 +229,36 @@ mod test {
|
|||
let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap();
|
||||
assert_eq!(s, s2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_as_bytes() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let s = "ascii 🐈";
|
||||
let obj: PyObject = PyString::new(py, s).into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert_eq!(s.as_bytes(), py_string.as_bytes());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_string_ascii() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let s = "ascii";
|
||||
let obj: PyObject = PyString::new(py, s).into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert!(py_string.to_string().is_ok());
|
||||
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_string_unicode() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let s = "哈哈🐈";
|
||||
let obj: PyObject = PyString::new(py, s).into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert!(py_string.to_string().is_ok());
|
||||
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,125 +0,0 @@
|
|||
// Copyright (c) 2017-present PyO3 Project and Contributors
|
||||
//
|
||||
// based on Daniel Grunwald's https://github.com/dgrunwald/rust-cpython
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::{char, str};
|
||||
|
||||
use err::{PyErr, PyResult};
|
||||
use python::Python;
|
||||
use types::exceptions;
|
||||
|
||||
/// Enum of possible Python string representations.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum PyStringData<'a> {
|
||||
Latin1(&'a [u8]),
|
||||
Utf8(&'a [u8]),
|
||||
Utf16(&'a [u16]),
|
||||
Utf32(&'a [u32]),
|
||||
}
|
||||
|
||||
impl<'a> From<&'a str> for PyStringData<'a> {
|
||||
#[inline]
|
||||
fn from(val: &'a str) -> PyStringData<'a> {
|
||||
PyStringData::Utf8(val.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u16]> for PyStringData<'a> {
|
||||
#[inline]
|
||||
fn from(val: &'a [u16]) -> PyStringData<'a> {
|
||||
PyStringData::Utf16(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [u32]> for PyStringData<'a> {
|
||||
#[inline]
|
||||
fn from(val: &'a [u32]) -> PyStringData<'a> {
|
||||
PyStringData::Utf32(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PyStringData<'a> {
|
||||
/// Convert the Python string data to a Rust string.
|
||||
///
|
||||
/// For UTF-8 and ASCII-only latin-1, returns a borrow into the original string data.
|
||||
/// For Latin-1, UTF-16 and UTF-32, returns an owned string.
|
||||
///
|
||||
/// Fails with UnicodeDecodeError if the string data isn't valid in its encoding.
|
||||
pub fn to_string(self, py: Python) -> PyResult<Cow<'a, str>> {
|
||||
match self {
|
||||
PyStringData::Utf8(data) => match str::from_utf8(data) {
|
||||
Ok(s) => Ok(Cow::Borrowed(s)),
|
||||
Err(e) => Err(PyErr::from_instance(
|
||||
exceptions::UnicodeDecodeError::new_utf8(py, data, e)?,
|
||||
)),
|
||||
},
|
||||
PyStringData::Latin1(data) => {
|
||||
if data.iter().all(|&b| b.is_ascii()) {
|
||||
Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
|
||||
} else {
|
||||
Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
|
||||
}
|
||||
}
|
||||
PyStringData::Utf16(data) => {
|
||||
fn utf16_bytes(input: &[u16]) -> &[u8] {
|
||||
unsafe { &*(input as *const [u16] as *const [u8]) }
|
||||
}
|
||||
match String::from_utf16(data) {
|
||||
Ok(s) => Ok(Cow::Owned(s)),
|
||||
Err(_) => Err(PyErr::from_instance(
|
||||
exceptions::UnicodeDecodeError::new_err(
|
||||
py,
|
||||
cstr!("utf-16"),
|
||||
utf16_bytes(data),
|
||||
0..2 * data.len(),
|
||||
cstr!("invalid utf-16"),
|
||||
)?,
|
||||
)),
|
||||
}
|
||||
}
|
||||
PyStringData::Utf32(data) => {
|
||||
fn utf32_bytes(input: &[u32]) -> &[u8] {
|
||||
unsafe { &*(input as *const [u32] as *const [u8]) }
|
||||
}
|
||||
match data.iter().map(|&u| char::from_u32(u)).collect() {
|
||||
Some(s) => Ok(Cow::Owned(s)),
|
||||
None => Err(PyErr::from_instance(
|
||||
exceptions::UnicodeDecodeError::new_err(
|
||||
py,
|
||||
cstr!("utf-32"),
|
||||
utf32_bytes(data),
|
||||
0..4 * data.len(),
|
||||
cstr!("invalid utf-32"),
|
||||
)?,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert the Python string data to a Rust string.
|
||||
///
|
||||
/// Returns a borrow into the original string data if possible.
|
||||
///
|
||||
/// Data that isn't valid in its encoding will be replaced
|
||||
/// with U+FFFD REPLACEMENT CHARACTER.
|
||||
pub fn to_string_lossy(self) -> Cow<'a, str> {
|
||||
match self {
|
||||
PyStringData::Utf8(data) => String::from_utf8_lossy(data),
|
||||
PyStringData::Latin1(data) => {
|
||||
if data.iter().all(|&b| b.is_ascii()) {
|
||||
Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
|
||||
} else {
|
||||
Cow::Owned(data.iter().map(|&b| b as char).collect())
|
||||
}
|
||||
}
|
||||
PyStringData::Utf16(data) => Cow::Owned(String::from_utf16_lossy(data)),
|
||||
PyStringData::Utf32(data) => Cow::Owned(
|
||||
data.iter()
|
||||
.map(|&u| char::from_u32(u).unwrap_or('\u{FFFD}'))
|
||||
.collect(),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue