Simplify PyString, PyBytes, PyUnicode

PyStringData comments mention wanting to receive interpreter-specific unicode types. I tried implementing this, but it's more complex and slower to call libpython to fill a buffer of u32 or such and convert that in Rust using widestring. This implementation receives UTF-8 from PyUnicode_AsUTF8AndSize() in python3 and PyUnicode_AsUTF8String() in python2. PyStringData is removed as unnecessary. The data() method on PyString, PyBytes, and PyUnicode is replaced with as_bytes(). The python2 API changes improve the performance of extracting unicode strings.
2018-09-28 14:27:49 +00:00 · 2018-09-28 14:27:49 +00:00 · 851d2207c0
parent 3a95d163ca
commit 851d2207c0
5 changed files with 128 additions and 164 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -12,10 +12,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * `PyComplex` by kngwyu in [#226](https://github.com/PyO3/pyo3/pull/226)
 * `PyDict::from_sequence()`, equivalent to `dict([(key, val), ...])`
 * Bindings for the `datetime` standard library types: `PyDate`, `PyTime`, `PyDateTime`, `PyTzInfo`, `PyDelta` with associated `ffi` types, by pganssle [#200](https://github.com/PyO3/pyo3/pull/200).
+ * `PyString`, `PyUnicode`, and `PyBytes` now have an `as_bytes()` method that returns `&[u8]`.

 ### Removed
 * Removed most entries from the prelude. The new prelude is small and clear.
 * Slowly removing specialization uses
+ * `PyString`, `PyUnicode`, and `PyBytes` no longer have a `data()` method
+ (replaced by `as_bytes()`) and `PyStringData` has been removed.

 ### Changed
 * Removes the types from the root module and the prelude. They now live in `pyo3::types` instead.
--- a/src/types/mod.rs
+++ b/src/types/mod.rs
@ -27,7 +27,6 @@ pub use self::slice::{PySlice, PySliceIndices};
 pub use self::string::{PyBytes, PyString, PyString as PyUnicode};
 #[cfg(not(Py_3))]
 pub use self::string2::{PyBytes, PyString, PyUnicode};
-pub use self::stringdata::PyStringData;
 pub use self::tuple::PyTuple;
 pub use self::typeobject::PyType;
 use ffi;
@ -195,7 +194,6 @@ mod module;
 mod sequence;
 mod set;
 mod slice;
-mod stringdata;
 mod stringutils;
 mod tuple;
 mod typeobject;
--- a/src/types/string.rs
+++ b/src/types/string.rs
@ -5,13 +5,13 @@ use std::borrow::Cow;
 use std::os::raw::c_char;
 use std::{mem, str};

-use super::PyStringData;
 use err::{PyErr, PyResult};
 use ffi;
 use instance::{Py, PyObjectWithToken};
 use object::PyObject;
 use python::{Python, ToPyPointer};
 use types::PyObjectRef;
+use types::exceptions;

 /// Represents a Python `string`.
 #[repr(transparent)]
@ -50,18 +50,14 @@ impl PyString {
        }
    }

-    /// Gets the python string data in its underlying representation.
-    pub fn data(&self) -> PyStringData {
-        // TODO: return the original representation instead
-        // of forcing the UTF-8 representation to be created.
+    /// Get the Python string as a byte slice.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
        unsafe {
            let mut size: ffi::Py_ssize_t = mem::uninitialized();
            let data = ffi::PyUnicode_AsUTF8AndSize(self.0.as_ptr(), &mut size) as *const u8;
-            if data.is_null() {
-                PyErr::fetch(self.py()).print(self.py());
-                panic!("PyUnicode_AsUTF8AndSize failed");
-            }
-            PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
+            debug_assert!(!data.is_null());
+            std::slice::from_raw_parts(data, size as usize)
        }
    }

@ -70,7 +66,12 @@ impl PyString {
    /// Returns a `UnicodeDecodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    pub fn to_string(&self) -> PyResult<Cow<str>> {
-        self.data().to_string(self.py())
+        match std::str::from_utf8(self.as_bytes()) {
+            Ok(s) => Ok(Cow::Borrowed(s)),
+            Err(e) => Err(PyErr::from_instance(
+                exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
+            ))
+        }
    }

    /// Convert the `PyString` into a Rust string.
@ -78,7 +79,7 @@ impl PyString {
    /// Unpaired surrogates invalid UTF-8 sequences are
    /// replaced with U+FFFD REPLACEMENT CHARACTER.
    pub fn to_string_lossy(&self) -> Cow<str> {
-        self.data().to_string_lossy()
+        String::from_utf8_lossy(self.as_bytes())
    }
 }

@ -103,11 +104,13 @@ impl PyBytes {
        ))
    }

-    /// Gets the Python string data as byte slice.
-    pub fn data(&self) -> &[u8] {
+    /// Get the Python string as a byte slice.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
        unsafe {
            let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8;
            let length = ffi::PyBytes_Size(self.as_ptr()) as usize;
+            debug_assert!(!buffer.is_null());
            std::slice::from_raw_parts(buffer, length)
        }
    }
@ -115,9 +118,12 @@ impl PyBytes {

 #[cfg(test)]
 mod test {
-    use conversion::{FromPyObject, ToPyObject};
+    use std::borrow::Cow;
+    use conversion::{FromPyObject, ToPyObject, PyTryFrom};
    use instance::AsPyRef;
    use python::Python;
+    use object::PyObject;
+    use super::PyString;

    #[test]
    fn test_non_bmp() {
@ -138,4 +144,36 @@ mod test {
        let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap();
        assert_eq!(s, s2);
    }
+
+    #[test]
+    fn test_as_bytes() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let s = "ascii 🐈";
+        let obj: PyObject = PyString::new(py, s).into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert_eq!(s.as_bytes(), py_string.as_bytes());
+    }
+
+    #[test]
+    fn test_to_string_ascii() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let s = "ascii";
+        let obj: PyObject = PyString::new(py, s).into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert!(py_string.to_string().is_ok());
+        assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
+    }
+
+    #[test]
+    fn test_to_string_unicode() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let s = "哈哈🐈";
+        let obj: PyObject = PyString::new(py, s).into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert!(py_string.to_string().is_ok());
+        assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
+    }
 }
--- a/src/types/string2.rs
+++ b/src/types/string2.rs
@ -7,13 +7,14 @@ use std::borrow::Cow;
 use std::os::raw::c_char;
 use std::str;

-use super::{PyObjectRef, PyStringData};
-use err::PyResult;
+use super::PyObjectRef;
+use err::{PyErr, PyResult};
 use ffi;
 use instance::{Py, PyObjectWithToken};
 use object::PyObject;
 use objectprotocol::ObjectProtocol;
 use python::{Python, ToPyPointer};
+use types::exceptions;

 /// Represents a Python `string`.
 #[repr(transparent)]
@ -62,16 +63,13 @@ impl PyString {
        }
    }

-    /// Gets the python string data in its underlying representation.
-    ///
-    /// For Python 2 byte strings, this function always returns `PyStringData::Utf8`,
-    /// even if the bytes are not valid UTF-8.
-    /// For unicode strings, returns the underlying representation used by Python.
-    pub fn data(&self) -> PyStringData {
+    /// Get the Python string as a byte slice.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
        if let Ok(bytes) = self.cast_as::<PyBytes>() {
-            PyStringData::Utf8(bytes.data())
+            bytes.as_bytes()
        } else if let Ok(unicode) = self.cast_as::<PyUnicode>() {
-            unicode.data()
+            unicode.as_bytes()
        } else {
            panic!("PyString is neither `str` nor `unicode`")
        }
@ -86,7 +84,12 @@ impl PyString {
    /// (containing unpaired surrogates, or a Python 2.7 byte string that is
    /// not valid UTF-8).
    pub fn to_string(&self) -> PyResult<Cow<str>> {
-        self.data().to_string(self.py())
+        match std::str::from_utf8(self.as_bytes()) {
+            Ok(s) => Ok(Cow::Borrowed(s)),
+            Err(e) => Err(PyErr::from_instance(
+                exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
+            ))
+        }
    }

    /// Convert the `PyString` into a Rust string.
@ -97,7 +100,7 @@ impl PyString {
    /// Unpaired surrogates and (on Python 2.7) invalid UTF-8 sequences are
    /// replaced with U+FFFD REPLACEMENT CHARACTER.
    pub fn to_string_lossy(&self) -> Cow<str> {
-        self.data().to_string_lossy()
+        String::from_utf8_lossy(self.as_bytes())
    }
 }

@ -112,11 +115,13 @@ impl PyBytes {
        unsafe { Py::from_owned_ptr_or_panic(ffi::PyBytes_FromStringAndSize(ptr, len)) }
    }

-    /// Gets the Python string data as byte slice.
-    pub fn data(&self) -> &[u8] {
+    /// Get the Python string as a byte slice.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
        unsafe {
            let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8;
            let length = ffi::PyBytes_Size(self.as_ptr()) as usize;
+            debug_assert!(!buffer.is_null());
            std::slice::from_raw_parts(buffer, length)
        }
    }
@ -145,12 +150,17 @@ impl PyUnicode {
        }
    }

-    /// Gets the python string data in its underlying representation.
-    pub fn data(&self) -> PyStringData {
+    /// Get the Python string as a byte slice.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
        unsafe {
-            let buffer = ffi::PyUnicode_AS_UNICODE(self.as_ptr());
-            let length = ffi::PyUnicode_GET_SIZE(self.as_ptr()) as usize;
-            std::slice::from_raw_parts(buffer, length).into()
+            let data: Py<PyBytes> = Py::from_owned_ptr(
+                ffi::PyUnicode_AsUTF8String(self.0.as_ptr()),
+            );
+            let buffer = ffi::PyBytes_AsString(data.as_ptr()) as *const u8;
+            let length = ffi::PyBytes_Size(data.as_ptr()) as usize;
+            debug_assert!(!buffer.is_null());
+            std::slice::from_raw_parts(buffer, length)
        }
    }

@ -159,14 +169,19 @@ impl PyUnicode {
    /// Returns a `UnicodeDecodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    pub fn to_string(&self) -> PyResult<Cow<str>> {
-        self.data().to_string(self.py())
+        match std::str::from_utf8(self.as_bytes()) {
+            Ok(s) => Ok(Cow::Borrowed(s)),
+            Err(e) => Err(PyErr::from_instance(
+                exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
+            ))
+        }
    }

    /// Convert the `PyString` into a Rust string.
    ///
    /// Unpaired surrogates are replaced with U+FFFD REPLACEMENT CHARACTER.
    pub fn to_string_lossy(&self) -> Cow<str> {
-        self.data().to_string_lossy()
+        String::from_utf8_lossy(self.as_bytes())
    }
 }

@ -188,9 +203,12 @@ impl std::convert::From<Py<PyUnicode>> for Py<PyString> {

 #[cfg(test)]
 mod test {
-    use conversion::{FromPyObject, ToPyObject};
+    use std::borrow::Cow;
+    use conversion::{FromPyObject, ToPyObject, PyTryFrom};
    use instance::AsPyRef;
    use python::Python;
+    use object::PyObject;
+    use super::PyString;

    #[test]
    fn test_non_bmp() {
@ -211,4 +229,36 @@ mod test {
        let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap();
        assert_eq!(s, s2);
    }
+
+    #[test]
+    fn test_as_bytes() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let s = "ascii 🐈";
+        let obj: PyObject = PyString::new(py, s).into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert_eq!(s.as_bytes(), py_string.as_bytes());
+    }
+
+    #[test]
+    fn test_to_string_ascii() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let s = "ascii";
+        let obj: PyObject = PyString::new(py, s).into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert!(py_string.to_string().is_ok());
+        assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
+    }
+
+    #[test]
+    fn test_to_string_unicode() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let s = "哈哈🐈";
+        let obj: PyObject = PyString::new(py, s).into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert!(py_string.to_string().is_ok());
+        assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
+    }
 }
--- a/src/types/stringdata.rs
+++ b/src/types/stringdata.rs
@ -1,125 +0,0 @@
-// Copyright (c) 2017-present PyO3 Project and Contributors
-//
-// based on Daniel Grunwald's https://github.com/dgrunwald/rust-cpython
-
-use std::borrow::Cow;
-use std::{char, str};
-
-use err::{PyErr, PyResult};
-use python::Python;
-use types::exceptions;
-
-/// Enum of possible Python string representations.
-#[derive(Clone, Copy, Debug)]
-pub enum PyStringData<'a> {
-    Latin1(&'a [u8]),
-    Utf8(&'a [u8]),
-    Utf16(&'a [u16]),
-    Utf32(&'a [u32]),
-}
-
-impl<'a> From<&'a str> for PyStringData<'a> {
-    #[inline]
-    fn from(val: &'a str) -> PyStringData<'a> {
-        PyStringData::Utf8(val.as_bytes())
-    }
-}
-
-impl<'a> From<&'a [u16]> for PyStringData<'a> {
-    #[inline]
-    fn from(val: &'a [u16]) -> PyStringData<'a> {
-        PyStringData::Utf16(val)
-    }
-}
-
-impl<'a> From<&'a [u32]> for PyStringData<'a> {
-    #[inline]
-    fn from(val: &'a [u32]) -> PyStringData<'a> {
-        PyStringData::Utf32(val)
-    }
-}
-
-impl<'a> PyStringData<'a> {
-    /// Convert the Python string data to a Rust string.
-    ///
-    /// For UTF-8 and ASCII-only latin-1, returns a borrow into the original string data.
-    /// For Latin-1, UTF-16 and UTF-32, returns an owned string.
-    ///
-    /// Fails with UnicodeDecodeError if the string data isn't valid in its encoding.
-    pub fn to_string(self, py: Python) -> PyResult<Cow<'a, str>> {
-        match self {
-            PyStringData::Utf8(data) => match str::from_utf8(data) {
-                Ok(s) => Ok(Cow::Borrowed(s)),
-                Err(e) => Err(PyErr::from_instance(
-                    exceptions::UnicodeDecodeError::new_utf8(py, data, e)?,
-                )),
-            },
-            PyStringData::Latin1(data) => {
-                if data.iter().all(|&b| b.is_ascii()) {
-                    Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
-                } else {
-                    Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
-                }
-            }
-            PyStringData::Utf16(data) => {
-                fn utf16_bytes(input: &[u16]) -> &[u8] {
-                    unsafe { &*(input as *const [u16] as *const [u8]) }
-                }
-                match String::from_utf16(data) {
-                    Ok(s) => Ok(Cow::Owned(s)),
-                    Err(_) => Err(PyErr::from_instance(
-                        exceptions::UnicodeDecodeError::new_err(
-                            py,
-                            cstr!("utf-16"),
-                            utf16_bytes(data),
-                            0..2 * data.len(),
-                            cstr!("invalid utf-16"),
-                        )?,
-                    )),
-                }
-            }
-            PyStringData::Utf32(data) => {
-                fn utf32_bytes(input: &[u32]) -> &[u8] {
-                    unsafe { &*(input as *const [u32] as *const [u8]) }
-                }
-                match data.iter().map(|&u| char::from_u32(u)).collect() {
-                    Some(s) => Ok(Cow::Owned(s)),
-                    None => Err(PyErr::from_instance(
-                        exceptions::UnicodeDecodeError::new_err(
-                            py,
-                            cstr!("utf-32"),
-                            utf32_bytes(data),
-                            0..4 * data.len(),
-                            cstr!("invalid utf-32"),
-                        )?,
-                    )),
-                }
-            }
-        }
-    }
-
-    /// Convert the Python string data to a Rust string.
-    ///
-    /// Returns a borrow into the original string data if possible.
-    ///
-    /// Data that isn't valid in its encoding will be replaced
-    /// with U+FFFD REPLACEMENT CHARACTER.
-    pub fn to_string_lossy(self) -> Cow<'a, str> {
-        match self {
-            PyStringData::Utf8(data) => String::from_utf8_lossy(data),
-            PyStringData::Latin1(data) => {
-                if data.iter().all(|&b| b.is_ascii()) {
-                    Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
-                } else {
-                    Cow::Owned(data.iter().map(|&b| b as char).collect())
-                }
-            }
-            PyStringData::Utf16(data) => Cow::Owned(String::from_utf16_lossy(data)),
-            PyStringData::Utf32(data) => Cow::Owned(
-                data.iter()
-                    .map(|&u| char::from_u32(u).unwrap_or('\u{FFFD}'))
-                    .collect(),
-            ),
-        }
-    }
-}