Merge pull request #238 from ijl/pystring

Simplify PyString, PyBytes, PyUnicode
This commit is contained in:
konstin 2018-09-29 00:01:15 +02:00 committed by GitHub
commit da906fb715
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 132 additions and 164 deletions

View file

@ -12,10 +12,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* `PyComplex` by kngwyu in [#226](https://github.com/PyO3/pyo3/pull/226)
* `PyDict::from_sequence()`, equivalent to `dict([(key, val), ...])`
* Bindings for the `datetime` standard library types: `PyDate`, `PyTime`, `PyDateTime`, `PyTzInfo`, `PyDelta` with associated `ffi` types, by pganssle [#200](https://github.com/PyO3/pyo3/pull/200).
* `PyString`, `PyUnicode`, and `PyBytes` now have an `as_bytes()` method that returns `&[u8]`.
### Removed
* Removed most entries from the prelude. The new prelude is small and clear.
* Slowly removing specialization uses
* `PyString`, `PyUnicode`, and `PyBytes` no longer have a `data()` method
(replaced by `as_bytes()`) and `PyStringData` has been removed.
### Changed
* Removes the types from the root module and the prelude. They now live in `pyo3::types` instead.

View file

@ -27,7 +27,6 @@ pub use self::slice::{PySlice, PySliceIndices};
pub use self::string::{PyBytes, PyString, PyString as PyUnicode};
#[cfg(not(Py_3))]
pub use self::string2::{PyBytes, PyString, PyUnicode};
pub use self::stringdata::PyStringData;
pub use self::tuple::PyTuple;
pub use self::typeobject::PyType;
use ffi;
@ -195,7 +194,6 @@ mod module;
mod sequence;
mod set;
mod slice;
mod stringdata;
mod stringutils;
mod tuple;
mod typeobject;

View file

@ -5,13 +5,13 @@ use std::borrow::Cow;
use std::os::raw::c_char;
use std::{mem, str};
use super::PyStringData;
use err::{PyErr, PyResult};
use ffi;
use instance::{Py, PyObjectWithToken};
use object::PyObject;
use python::{Python, ToPyPointer};
use types::PyObjectRef;
use types::exceptions;
/// Represents a Python `string`.
#[repr(transparent)]
@ -50,18 +50,16 @@ impl PyString {
}
}
/// Gets the python string data in its underlying representation.
pub fn data(&self) -> PyStringData {
// TODO: return the original representation instead
// of forcing the UTF-8 representation to be created.
/// Get the Python string as a byte slice.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
unsafe {
let mut size: ffi::Py_ssize_t = mem::uninitialized();
let data = ffi::PyUnicode_AsUTF8AndSize(self.0.as_ptr(), &mut size) as *const u8;
if data.is_null() {
PyErr::fetch(self.py()).print(self.py());
panic!("PyUnicode_AsUTF8AndSize failed");
}
PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
// PyUnicode_AsUTF8AndSize would return null if the pointer did not reference a valid
// unicode object, but because we have a valid PyString, assume success
debug_assert!(!data.is_null());
std::slice::from_raw_parts(data, size as usize)
}
}
@ -70,7 +68,12 @@ impl PyString {
/// Returns a `UnicodeDecodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
pub fn to_string(&self) -> PyResult<Cow<str>> {
self.data().to_string(self.py())
match std::str::from_utf8(self.as_bytes()) {
Ok(s) => Ok(Cow::Borrowed(s)),
Err(e) => Err(PyErr::from_instance(
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
))
}
}
/// Convert the `PyString` into a Rust string.
@ -78,7 +81,7 @@ impl PyString {
/// Unpaired surrogates and invalid UTF-8 sequences are
/// replaced with U+FFFD REPLACEMENT CHARACTER.
pub fn to_string_lossy(&self) -> Cow<str> {
self.data().to_string_lossy()
String::from_utf8_lossy(self.as_bytes())
}
}
@ -103,11 +106,13 @@ impl PyBytes {
))
}
/// Gets the Python string data as byte slice.
pub fn data(&self) -> &[u8] {
/// Borrows the contents of this Python bytes object as a Rust byte slice.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
    unsafe {
        let raw = self.as_ptr();
        // Start pointer and length are both queried from the same object.
        let start = ffi::PyBytes_AsString(raw) as *const u8;
        let len = ffi::PyBytes_Size(raw) as usize;
        debug_assert!(!start.is_null());
        std::slice::from_raw_parts(start, len)
    }
}
@ -115,9 +120,12 @@ impl PyBytes {
#[cfg(test)]
mod test {
use conversion::{FromPyObject, ToPyObject};
use std::borrow::Cow;
use conversion::{FromPyObject, ToPyObject, PyTryFrom};
use instance::AsPyRef;
use python::Python;
use object::PyObject;
use super::PyString;
#[test]
fn test_non_bmp() {
@ -138,4 +146,36 @@ mod test {
let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap();
assert_eq!(s, s2);
}
/// `as_bytes()` on a `PyString` must yield the same bytes as the Rust `&str` it was built from.
#[test]
fn test_as_bytes() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    let expected = "ascii 🐈";
    let object: PyObject = PyString::new(py, expected).into();
    let downcast = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    assert_eq!(expected.as_bytes(), downcast.as_bytes());
}
/// An ASCII-only `PyString` converts successfully and compares equal to the source text.
#[test]
fn test_to_string_ascii() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    let expected = "ascii";
    let object: PyObject = PyString::new(py, expected).into();
    let downcast = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    assert!(downcast.to_string().is_ok());
    assert_eq!(Cow::Borrowed(expected), downcast.to_string().unwrap());
}
/// A `PyString` holding CJK characters and an emoji converts successfully and compares equal
/// to the source text.
#[test]
fn test_to_string_unicode() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    let expected = "哈哈🐈";
    let object: PyObject = PyString::new(py, expected).into();
    let downcast = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    assert!(downcast.to_string().is_ok());
    assert_eq!(Cow::Borrowed(expected), downcast.to_string().unwrap());
}
}

View file

@ -7,13 +7,14 @@ use std::borrow::Cow;
use std::os::raw::c_char;
use std::str;
use super::{PyObjectRef, PyStringData};
use err::PyResult;
use super::PyObjectRef;
use err::{PyErr, PyResult};
use ffi;
use instance::{Py, PyObjectWithToken};
use object::PyObject;
use objectprotocol::ObjectProtocol;
use python::{Python, ToPyPointer};
use types::exceptions;
/// Represents a Python `string`.
#[repr(transparent)]
@ -62,16 +63,13 @@ impl PyString {
}
}
/// Gets the python string data in its underlying representation.
///
/// For Python 2 byte strings, this function always returns `PyStringData::Utf8`,
/// even if the bytes are not valid UTF-8.
/// For unicode strings, returns the underlying representation used by Python.
pub fn data(&self) -> PyStringData {
/// Get the Python string as a byte slice.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
if let Ok(bytes) = self.cast_as::<PyBytes>() {
PyStringData::Utf8(bytes.data())
bytes.as_bytes()
} else if let Ok(unicode) = self.cast_as::<PyUnicode>() {
unicode.data()
unicode.as_bytes()
} else {
panic!("PyString is neither `str` nor `unicode`")
}
@ -86,7 +84,12 @@ impl PyString {
/// (containing unpaired surrogates, or a Python 2.7 byte string that is
/// not valid UTF-8).
pub fn to_string(&self) -> PyResult<Cow<str>> {
self.data().to_string(self.py())
match std::str::from_utf8(self.as_bytes()) {
Ok(s) => Ok(Cow::Borrowed(s)),
Err(e) => Err(PyErr::from_instance(
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
))
}
}
/// Convert the `PyString` into a Rust string.
@ -97,7 +100,7 @@ impl PyString {
/// Unpaired surrogates and (on Python 2.7) invalid UTF-8 sequences are
/// replaced with U+FFFD REPLACEMENT CHARACTER.
pub fn to_string_lossy(&self) -> Cow<str> {
self.data().to_string_lossy()
String::from_utf8_lossy(self.as_bytes())
}
}
@ -112,11 +115,13 @@ impl PyBytes {
unsafe { Py::from_owned_ptr_or_panic(ffi::PyBytes_FromStringAndSize(ptr, len)) }
}
/// Gets the Python string data as byte slice.
pub fn data(&self) -> &[u8] {
/// Borrows the contents of this Python bytes object as a Rust byte slice.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
    unsafe {
        let raw = self.as_ptr();
        // Start pointer and length are both queried from the same object.
        let start = ffi::PyBytes_AsString(raw) as *const u8;
        let len = ffi::PyBytes_Size(raw) as usize;
        debug_assert!(!start.is_null());
        std::slice::from_raw_parts(start, len)
    }
}
@ -145,12 +150,19 @@ impl PyUnicode {
}
}
/// Gets the python string data in its underlying representation.
pub fn data(&self) -> PyStringData {
/// Get the Python string as a byte slice.
#[inline]
pub fn as_bytes(&self) -> &[u8] {
unsafe {
let buffer = ffi::PyUnicode_AS_UNICODE(self.as_ptr());
let length = ffi::PyUnicode_GET_SIZE(self.as_ptr()) as usize;
std::slice::from_raw_parts(buffer, length).into()
// PyUnicode_AsUTF8String would return null if the pointer did not reference a valid
// unicode object, but because we have a valid PyUnicode, assume success
let data: Py<PyBytes> = Py::from_owned_ptr(
ffi::PyUnicode_AsUTF8String(self.0.as_ptr()),
);
let buffer = ffi::PyBytes_AsString(data.as_ptr()) as *const u8;
let length = ffi::PyBytes_Size(data.as_ptr()) as usize;
debug_assert!(!buffer.is_null());
std::slice::from_raw_parts(buffer, length)
}
}
@ -159,14 +171,19 @@ impl PyUnicode {
/// Returns a `UnicodeDecodeError` if the input is not valid unicode
/// (containing unpaired surrogates).
pub fn to_string(&self) -> PyResult<Cow<str>> {
self.data().to_string(self.py())
match std::str::from_utf8(self.as_bytes()) {
Ok(s) => Ok(Cow::Borrowed(s)),
Err(e) => Err(PyErr::from_instance(
exceptions::UnicodeDecodeError::new_utf8(self.py(), self.as_bytes(), e)?,
))
}
}
/// Convert the `PyString` into a Rust string.
///
/// Unpaired surrogates are replaced with U+FFFD REPLACEMENT CHARACTER.
pub fn to_string_lossy(&self) -> Cow<str> {
self.data().to_string_lossy()
String::from_utf8_lossy(self.as_bytes())
}
}
@ -188,9 +205,12 @@ impl std::convert::From<Py<PyUnicode>> for Py<PyString> {
#[cfg(test)]
mod test {
use conversion::{FromPyObject, ToPyObject};
use std::borrow::Cow;
use conversion::{FromPyObject, ToPyObject, PyTryFrom};
use instance::AsPyRef;
use python::Python;
use object::PyObject;
use super::PyString;
#[test]
fn test_non_bmp() {
@ -211,4 +231,36 @@ mod test {
let s2: &str = FromPyObject::extract(py_string.as_ref(py)).unwrap();
assert_eq!(s, s2);
}
/// `as_bytes()` on a `PyString` must yield the same bytes as the Rust `&str` it was built from.
#[test]
fn test_as_bytes() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    let expected = "ascii 🐈";
    let object: PyObject = PyString::new(py, expected).into();
    let downcast = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    assert_eq!(expected.as_bytes(), downcast.as_bytes());
}
/// An ASCII-only `PyString` converts successfully and compares equal to the source text.
#[test]
fn test_to_string_ascii() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    let expected = "ascii";
    let object: PyObject = PyString::new(py, expected).into();
    let downcast = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    assert!(downcast.to_string().is_ok());
    assert_eq!(Cow::Borrowed(expected), downcast.to_string().unwrap());
}
/// A `PyString` holding CJK characters and an emoji converts successfully and compares equal
/// to the source text.
#[test]
fn test_to_string_unicode() {
    let guard = Python::acquire_gil();
    let py = guard.python();
    let expected = "哈哈🐈";
    let object: PyObject = PyString::new(py, expected).into();
    let downcast = <PyString as PyTryFrom>::try_from(object.as_ref(py)).unwrap();
    assert!(downcast.to_string().is_ok());
    assert_eq!(Cow::Borrowed(expected), downcast.to_string().unwrap());
}
}

View file

@ -1,125 +0,0 @@
// Copyright (c) 2017-present PyO3 Project and Contributors
//
// based on Daniel Grunwald's https://github.com/dgrunwald/rust-cpython
use std::borrow::Cow;
use std::{char, str};
use err::{PyErr, PyResult};
use python::Python;
use types::exceptions;
/// Enum of possible Python string representations.
///
/// Each variant borrows a slice of raw code units; nothing is copied or
/// validated when a value of this type is constructed.
#[derive(Clone, Copy, Debug)]
pub enum PyStringData<'a> {
    /// One byte per character; conversion maps every byte to the `char` with the same value.
    Latin1(&'a [u8]),
    /// Bytes to be interpreted as UTF-8; validity is only checked on conversion.
    Utf8(&'a [u8]),
    /// 16-bit code units, decoded as UTF-16 on conversion.
    Utf16(&'a [u16]),
    /// 32-bit code units, each converted with `char::from_u32` on conversion.
    Utf32(&'a [u32]),
}
/// Wraps the bytes of a Rust string slice as `PyStringData::Utf8` without copying.
impl<'a> From<&'a str> for PyStringData<'a> {
    #[inline]
    fn from(text: &'a str) -> Self {
        let bytes = text.as_bytes();
        PyStringData::Utf8(bytes)
    }
}
/// Wraps a slice of 16-bit code units as `PyStringData::Utf16` without copying.
impl<'a> From<&'a [u16]> for PyStringData<'a> {
    #[inline]
    fn from(units: &'a [u16]) -> Self {
        PyStringData::Utf16(units)
    }
}
/// Wraps a slice of 32-bit code units as `PyStringData::Utf32` without copying.
impl<'a> From<&'a [u32]> for PyStringData<'a> {
    #[inline]
    fn from(units: &'a [u32]) -> Self {
        PyStringData::Utf32(units)
    }
}
impl<'a> PyStringData<'a> {
    /// Convert the Python string data to a Rust string.
    ///
    /// For UTF-8 and ASCII-only Latin-1, returns a borrow into the original string data.
    /// For non-ASCII Latin-1, UTF-16 and UTF-32, returns an owned string.
    ///
    /// # Errors
    ///
    /// Fails with `UnicodeDecodeError` if the string data isn't valid in its encoding.
    /// Latin-1 data always converts successfully. Constructing the exception object
    /// itself is fallible; such a failure is propagated instead.
    pub fn to_string(self, py: Python) -> PyResult<Cow<'a, str>> {
        match self {
            PyStringData::Utf8(data) => match str::from_utf8(data) {
                Ok(s) => Ok(Cow::Borrowed(s)),
                Err(e) => Err(PyErr::from_instance(
                    exceptions::UnicodeDecodeError::new_utf8(py, data, e)?,
                )),
            },
            PyStringData::Latin1(data) => Ok(latin1_to_str(data)),
            PyStringData::Utf16(data) => {
                /// Views the UTF-16 code units as their underlying bytes.
                fn utf16_bytes(input: &[u16]) -> &[u8] {
                    // A `*const [u16] as *const [u8]` cast would keep the element count
                    // and expose only half of the bytes, so build the slice explicitly.
                    //
                    // SAFETY: `input` is a valid slice covering `2 * input.len()` bytes,
                    // `u8` has alignment 1, and the result shares `input`'s lifetime.
                    unsafe {
                        ::std::slice::from_raw_parts(input.as_ptr() as *const u8, 2 * input.len())
                    }
                }
                match String::from_utf16(data) {
                    Ok(s) => Ok(Cow::Owned(s)),
                    Err(_) => Err(PyErr::from_instance(
                        exceptions::UnicodeDecodeError::new_err(
                            py,
                            cstr!("utf-16"),
                            utf16_bytes(data),
                            0..2 * data.len(),
                            cstr!("invalid utf-16"),
                        )?,
                    )),
                }
            }
            PyStringData::Utf32(data) => {
                /// Views the UTF-32 code units as their underlying bytes.
                fn utf32_bytes(input: &[u32]) -> &[u8] {
                    // See `Utf16` above: a plain slice-pointer cast would keep the element
                    // count and expose only a quarter of the bytes.
                    //
                    // SAFETY: `input` is a valid slice covering `4 * input.len()` bytes,
                    // `u8` has alignment 1, and the result shares `input`'s lifetime.
                    unsafe {
                        ::std::slice::from_raw_parts(input.as_ptr() as *const u8, 4 * input.len())
                    }
                }
                match data
                    .iter()
                    .map(|&u| char::from_u32(u))
                    .collect::<Option<String>>()
                {
                    Some(s) => Ok(Cow::Owned(s)),
                    None => Err(PyErr::from_instance(
                        exceptions::UnicodeDecodeError::new_err(
                            py,
                            cstr!("utf-32"),
                            utf32_bytes(data),
                            0..4 * data.len(),
                            cstr!("invalid utf-32"),
                        )?,
                    )),
                }
            }
        }
    }

    /// Convert the Python string data to a Rust string.
    ///
    /// Returns a borrow into the original string data if possible.
    ///
    /// Data that isn't valid in its encoding will be replaced
    /// with U+FFFD REPLACEMENT CHARACTER.
    pub fn to_string_lossy(self) -> Cow<'a, str> {
        match self {
            PyStringData::Utf8(data) => String::from_utf8_lossy(data),
            PyStringData::Latin1(data) => latin1_to_str(data),
            PyStringData::Utf16(data) => Cow::Owned(String::from_utf16_lossy(data)),
            PyStringData::Utf32(data) => Cow::Owned(
                data.iter()
                    .map(|&u| char::from_u32(u).unwrap_or('\u{FFFD}'))
                    .collect(),
            ),
        }
    }
}

/// Decodes Latin-1 bytes, borrowing when the data is pure ASCII and allocating otherwise.
fn latin1_to_str(data: &[u8]) -> Cow<str> {
    if data.is_ascii() {
        // SAFETY: every ASCII byte sequence is valid UTF-8.
        Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
    } else {
        // Each Latin-1 byte maps to the Unicode scalar value with the same number.
        Cow::Owned(data.iter().map(|&b| b as char).collect())
    }
}