diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b84b26d..e580c45a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * `PyComplex` by kngwyu in [#226](https://github.com/PyO3/pyo3/pull/226) * `PyDict::from_sequence()`, equivalent to `dict([(key, val), ...])` * Bindings for the `datetime` standard library types: `PyDate`, `PyTime`, `PyDateTime`, `PyTzInfo`, `PyDelta` with associated `ffi` types, by pganssle [#200](https://github.com/PyO3/pyo3/pull/200). + * `PyString`, `PyUnicode`, and `PyBytes` now have an `as_bytes()` method that returns `&[u8]`. ### Removed * Removed most entries from the prelude. The new prelude is small and clear. * Slowly removing specialization uses + * `PyString`, `PyUnicode`, and `PyBytes` no longer have a `data()` method + (replaced by `as_bytes()`) and `PyStringData` has been removed. ### Changed * Removes the types from the root module and the prelude. They now live in `pyo3::types` instead. diff --git a/src/types/mod.rs b/src/types/mod.rs index 391d527d..c37cafce 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -27,7 +27,6 @@ pub use self::slice::{PySlice, PySliceIndices}; pub use self::string::{PyBytes, PyString, PyString as PyUnicode}; #[cfg(not(Py_3))] pub use self::string2::{PyBytes, PyString, PyUnicode}; -pub use self::stringdata::PyStringData; pub use self::tuple::PyTuple; pub use self::typeobject::PyType; use ffi; @@ -195,7 +194,6 @@ mod module; mod sequence; mod set; mod slice; -mod stringdata; mod stringutils; mod tuple; mod typeobject; diff --git a/src/types/string.rs b/src/types/string.rs index ed898c69..b5d4d7d7 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -5,13 +5,13 @@ use std::borrow::Cow; use std::os::raw::c_char; use std::{mem, str}; -use super::PyStringData; use err::{PyErr, PyResult}; use ffi; use instance::{Py, PyObjectWithToken}; use object::PyObject; use python::{Python, ToPyPointer}; use types::PyObjectRef; +use types::exceptions; /// Represents a Python `string`. #[repr(transparent)] @@ -50,18 +50,14 @@ impl PyString { } } - /// Gets the python string data in its underlying representation. - pub fn data(&self) -> PyStringData { - // TODO: return the original representation instead - // of forcing the UTF-8 representation to be created. + /// Get the Python string as a byte slice. + #[inline] + pub fn as_bytes(&self) -> &[u8] { unsafe { let mut size: ffi::Py_ssize_t = mem::uninitialized(); let data = ffi::PyUnicode_AsUTF8AndSize(self.0.as_ptr(), &mut size) as *const u8; - if data.is_null() { - PyErr::fetch(self.py()).print(self.py()); - panic!("PyUnicode_AsUTF8AndSize failed"); - } - PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize)) + debug_assert!(!data.is_null()); + std::slice::from_raw_parts(data, size as usize) } } @@ -70,7 +66,12 @@ impl PyString { /// Returns a `UnicodeDecodeError` if the input is not valid unicode /// (containing unpaired surrogates). pub fn to_string(&self) -> PyResult> { - self.data().to_string(self.py()) + match std::str::from_utf8(self.as_bytes()) { + Ok(s) => Ok(Cow::Borrowed(s)), + Err(e) => Err(PyErr::from_instance( + exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?, + )) + } } /// Convert the `PyString` into a Rust string. @@ -78,7 +79,7 @@ impl PyString { /// Unpaired surrogates invalid UTF-8 sequences are /// replaced with U+FFFD REPLACEMENT CHARACTER. pub fn to_string_lossy(&self) -> Cow { - self.data().to_string_lossy() + String::from_utf8_lossy(self.as_bytes()) } } @@ -103,11 +104,13 @@ impl PyBytes { )) } - /// Gets the Python string data as byte slice. - pub fn data(&self) -> &[u8] { + /// Get the Python string as a byte slice. + #[inline] + pub fn as_bytes(&self) -> &[u8] { unsafe { let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8; let length = ffi::PyBytes_Size(self.as_ptr()) as usize; + debug_assert!(!buffer.is_null()); std::slice::from_raw_parts(buffer, length) } } @@ -115,9 +118,12 @@ impl PyBytes { #[cfg(test)] mod test { - use conversion::{FromPyObject, ToPyObject}; + use std::borrow::Cow; + use conversion::{FromPyObject, ToPyObject, PyTryFrom}; use instance::AsPyRef; use python::Python; + use object::PyObject; + use super::PyString; #[test] fn test_non_bmp() { @@ -138,4 +144,36 @@ mod test { let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap(); assert_eq!(s, s2); } + + #[test] + fn test_as_bytes() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let s = "ascii 🐈"; + let obj: PyObject = PyString::new(py, s).into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert_eq!(s.as_bytes(), py_string.as_bytes()); + } + + #[test] + fn test_to_string_ascii() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let s = "ascii"; + let obj: PyObject = PyString::new(py, s).into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert!(py_string.to_string().is_ok()); + assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap()); + } + + #[test] + fn test_to_string_unicode() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let s = "哈哈🐈"; + let obj: PyObject = PyString::new(py, s).into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert!(py_string.to_string().is_ok()); + assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap()); + } } diff --git a/src/types/string2.rs b/src/types/string2.rs index 9d757984..2a95f3f2 100644 --- a/src/types/string2.rs +++ b/src/types/string2.rs @@ -7,13 +7,14 @@ use std::borrow::Cow; use std::os::raw::c_char; use std::str; -use super::{PyObjectRef, PyStringData}; -use err::PyResult; +use super::PyObjectRef; +use err::{PyErr, PyResult}; use ffi; use instance::{Py, PyObjectWithToken}; use object::PyObject; use objectprotocol::ObjectProtocol; use python::{Python, ToPyPointer}; +use types::exceptions; /// Represents a Python `string`. #[repr(transparent)] @@ -62,16 +63,13 @@ impl PyString { } } - /// Gets the python string data in its underlying representation. - /// - /// For Python 2 byte strings, this function always returns `PyStringData::Utf8`, - /// even if the bytes are not valid UTF-8. - /// For unicode strings, returns the underlying representation used by Python. - pub fn data(&self) -> PyStringData { + /// Get the Python string as a byte slice. + #[inline] + pub fn as_bytes(&self) -> &[u8] { if let Ok(bytes) = self.cast_as::() { - PyStringData::Utf8(bytes.data()) + bytes.as_bytes() } else if let Ok(unicode) = self.cast_as::() { - unicode.data() + unicode.as_bytes() } else { panic!("PyString is neither `str` nor `unicode`") } @@ -86,7 +84,12 @@ impl PyString { /// (containing unpaired surrogates, or a Python 2.7 byte string that is /// not valid UTF-8). pub fn to_string(&self) -> PyResult> { - self.data().to_string(self.py()) + match std::str::from_utf8(self.as_bytes()) { + Ok(s) => Ok(Cow::Borrowed(s)), + Err(e) => Err(PyErr::from_instance( + exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?, + )) + } } /// Convert the `PyString` into a Rust string. @@ -97,7 +100,7 @@ impl PyString { /// Unpaired surrogates and (on Python 2.7) invalid UTF-8 sequences are /// replaced with U+FFFD REPLACEMENT CHARACTER. pub fn to_string_lossy(&self) -> Cow { - self.data().to_string_lossy() + String::from_utf8_lossy(self.as_bytes()) } } @@ -112,11 +115,13 @@ impl PyBytes { unsafe { Py::from_owned_ptr_or_panic(ffi::PyBytes_FromStringAndSize(ptr, len)) } } - /// Gets the Python string data as byte slice. - pub fn data(&self) -> &[u8] { + /// Get the Python string as a byte slice. + #[inline] + pub fn as_bytes(&self) -> &[u8] { unsafe { let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8; let length = ffi::PyBytes_Size(self.as_ptr()) as usize; + debug_assert!(!buffer.is_null()); std::slice::from_raw_parts(buffer, length) } } @@ -145,12 +150,17 @@ impl PyUnicode { } } - /// Gets the python string data in its underlying representation. - pub fn data(&self) -> PyStringData { + /// Get the Python string as a byte slice. + #[inline] + pub fn as_bytes(&self) -> &[u8] { unsafe { - let buffer = ffi::PyUnicode_AS_UNICODE(self.as_ptr()); - let length = ffi::PyUnicode_GET_SIZE(self.as_ptr()) as usize; - std::slice::from_raw_parts(buffer, length).into() + let data: Py = Py::from_owned_ptr( + ffi::PyUnicode_AsUTF8String(self.0.as_ptr()), + ); + let buffer = ffi::PyBytes_AsString(data.as_ptr()) as *const u8; + let length = ffi::PyBytes_Size(data.as_ptr()) as usize; + debug_assert!(!buffer.is_null()); + std::slice::from_raw_parts(buffer, length) } } @@ -159,14 +169,19 @@ impl PyUnicode { /// Returns a `UnicodeDecodeError` if the input is not valid unicode /// (containing unpaired surrogates). pub fn to_string(&self) -> PyResult> { - self.data().to_string(self.py()) + match std::str::from_utf8(self.as_bytes()) { + Ok(s) => Ok(Cow::Borrowed(s)), + Err(e) => Err(PyErr::from_instance( + exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?, + )) + } } /// Convert the `PyString` into a Rust string. /// /// Unpaired surrogates are replaced with U+FFFD REPLACEMENT CHARACTER. pub fn to_string_lossy(&self) -> Cow { - self.data().to_string_lossy() + String::from_utf8_lossy(self.as_bytes()) } } @@ -188,9 +203,12 @@ impl std::convert::From> for Py { #[cfg(test)] mod test { - use conversion::{FromPyObject, ToPyObject}; + use std::borrow::Cow; + use conversion::{FromPyObject, ToPyObject, PyTryFrom}; use instance::AsPyRef; use python::Python; + use object::PyObject; + use super::PyString; #[test] fn test_non_bmp() { @@ -211,4 +229,36 @@ mod test { let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap(); assert_eq!(s, s2); } + + #[test] + fn test_as_bytes() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let s = "ascii 🐈"; + let obj: PyObject = PyString::new(py, s).into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert_eq!(s.as_bytes(), py_string.as_bytes()); + } + + #[test] + fn test_to_string_ascii() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let s = "ascii"; + let obj: PyObject = PyString::new(py, s).into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert!(py_string.to_string().is_ok()); + assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap()); + } + + #[test] + fn test_to_string_unicode() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let s = "哈哈🐈"; + let obj: PyObject = PyString::new(py, s).into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert!(py_string.to_string().is_ok()); + assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap()); + } } diff --git a/src/types/stringdata.rs b/src/types/stringdata.rs deleted file mode 100644 index fa8015c1..00000000 --- a/src/types/stringdata.rs +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2017-present PyO3 Project and Contributors -// -// based on Daniel Grunwald's https://github.com/dgrunwald/rust-cpython - -use std::borrow::Cow; -use std::{char, str}; - -use err::{PyErr, PyResult}; -use python::Python; -use types::exceptions; - -/// Enum of possible Python string representations. -#[derive(Clone, Copy, Debug)] -pub enum PyStringData<'a> { - Latin1(&'a [u8]), - Utf8(&'a [u8]), - Utf16(&'a [u16]), - Utf32(&'a [u32]), -} - -impl<'a> From<&'a str> for PyStringData<'a> { - #[inline] - fn from(val: &'a str) -> PyStringData<'a> { - PyStringData::Utf8(val.as_bytes()) - } -} - -impl<'a> From<&'a [u16]> for PyStringData<'a> { - #[inline] - fn from(val: &'a [u16]) -> PyStringData<'a> { - PyStringData::Utf16(val) - } -} - -impl<'a> From<&'a [u32]> for PyStringData<'a> { - #[inline] - fn from(val: &'a [u32]) -> PyStringData<'a> { - PyStringData::Utf32(val) - } -} - -impl<'a> PyStringData<'a> { - /// Convert the Python string data to a Rust string. - /// - /// For UTF-8 and ASCII-only latin-1, returns a borrow into the original string data. - /// For Latin-1, UTF-16 and UTF-32, returns an owned string. - /// - /// Fails with UnicodeDecodeError if the string data isn't valid in its encoding. - pub fn to_string(self, py: Python) -> PyResult> { - match self { - PyStringData::Utf8(data) => match str::from_utf8(data) { - Ok(s) => Ok(Cow::Borrowed(s)), - Err(e) => Err(PyErr::from_instance( - exceptions::UnicodeDecodeError::new_utf8(py, data, e)?, - )), - }, - PyStringData::Latin1(data) => { - if data.iter().all(|&b| b.is_ascii()) { - Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })) - } else { - Ok(Cow::Owned(data.iter().map(|&b| b as char).collect())) - } - } - PyStringData::Utf16(data) => { - fn utf16_bytes(input: &[u16]) -> &[u8] { - unsafe { &*(input as *const [u16] as *const [u8]) } - } - match String::from_utf16(data) { - Ok(s) => Ok(Cow::Owned(s)), - Err(_) => Err(PyErr::from_instance( - exceptions::UnicodeDecodeError::new_err( - py, - cstr!("utf-16"), - utf16_bytes(data), - 0..2 * data.len(), - cstr!("invalid utf-16"), - )?, - )), - } - } - PyStringData::Utf32(data) => { - fn utf32_bytes(input: &[u32]) -> &[u8] { - unsafe { &*(input as *const [u32] as *const [u8]) } - } - match data.iter().map(|&u| char::from_u32(u)).collect() { - Some(s) => Ok(Cow::Owned(s)), - None => Err(PyErr::from_instance( - exceptions::UnicodeDecodeError::new_err( - py, - cstr!("utf-32"), - utf32_bytes(data), - 0..4 * data.len(), - cstr!("invalid utf-32"), - )?, - )), - } - } - } - } - - /// Convert the Python string data to a Rust string. - /// - /// Returns a borrow into the original string data if possible. - /// - /// Data that isn't valid in its encoding will be replaced - /// with U+FFFD REPLACEMENT CHARACTER. - pub fn to_string_lossy(self) -> Cow<'a, str> { - match self { - PyStringData::Utf8(data) => String::from_utf8_lossy(data), - PyStringData::Latin1(data) => { - if data.iter().all(|&b| b.is_ascii()) { - Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }) - } else { - Cow::Owned(data.iter().map(|&b| b as char).collect()) - } - } - PyStringData::Utf16(data) => Cow::Owned(String::from_utf16_lossy(data)), - PyStringData::Utf32(data) => Cow::Owned( - data.iter() - .map(|&u| char::from_u32(u).unwrap_or('\u{FFFD}')) - .collect(), - ), - } - } -}