From 03ced8917b4ed5b8f1cc878321fdd7e9b0e9a2ed Mon Sep 17 00:00:00 2001 From: Nikolay Kim Date: Sun, 11 Jun 2017 17:20:18 -0700 Subject: [PATCH] PyString for py2 --- src/ffi2/object.rs | 7 +-- src/objects/string2.rs | 117 ++++++++++++++++++++++++++++++++++------- 2 files changed, 100 insertions(+), 24 deletions(-) diff --git a/src/ffi2/object.rs b/src/ffi2/object.rs index 23c6623a..1cb38444 100644 --- a/src/ffi2/object.rs +++ b/src/ffi2/object.rs @@ -33,12 +33,7 @@ pub const PyObject_HEAD_INIT: PyObject = PyObject { #[repr(C)] #[derive(Copy, Clone)] pub struct PyVarObject { - #[cfg(py_sys_config="Py_TRACE_REFS")] - pub _ob_next: *mut PyObject, - #[cfg(py_sys_config="Py_TRACE_REFS")] - pub _ob_prev: *mut PyObject, - pub ob_refcnt: Py_ssize_t, - pub ob_type: *mut PyTypeObject, + pub ob_base: PyObject, pub ob_size: Py_ssize_t, } diff --git a/src/objects/string2.rs b/src/objects/string2.rs index 364bdf60..065f3c98 100644 --- a/src/objects/string2.rs +++ b/src/objects/string2.rs @@ -5,6 +5,7 @@ use std; use std::str; use std::borrow::Cow; +use std::ascii::AsciiExt; use std::os::raw::c_char; use ffi; @@ -13,11 +14,18 @@ use pointers::PyPtr; use python::{Python, ToPyPointer}; use super::{PyObject, PyStringData}; -/// Represents a Python string. Corresponds to `unicode` in Python 2 +/// Represents a Python string. pub struct PyString(PyPtr); pyobject_convert!(PyString); -pyobject_nativetype!(PyString, PyUnicode_Check, PyUnicode_Type); +pyobject_nativetype!(PyString, PyString_Check, PyBaseString_Type); + + +/// Represents a Python unicode string. +pub struct PyUnicode(PyPtr); + +pyobject_convert!(PyUnicode); +pyobject_nativetype!(PyUnicode, PyUnicode_Check, PyUnicode_Type); /// Represents a Python byte string. Corresponds to `str` in Python 2 pub struct PyBytes(PyPtr); @@ -25,6 +33,81 @@ pub struct PyBytes(PyPtr); pyobject_convert!(PyBytes); pyobject_nativetype!(PyBytes, PyString_Check, PyBaseString_Type); +impl PyString { + /// Creates a new Python string object. + /// + /// This function will create a byte string if the + /// input string is ASCII-only; and a unicode string otherwise. + /// Use `PyUnicode::new()` to always create a unicode string. + /// + /// Panics if out of memory. + pub fn new(py: Python, s: &str) -> PyString { + if s.is_ascii() { + PyBytes::new(py, s.as_bytes()).into_basestring() + } else { + PyUnicode::new(py, s).into_basestring() + } + } + + pub fn from_object(py: Python, src: &PyObject, + encoding: &str, errors: &str) -> PyResult { + unsafe { + Ok(PyString(PyPtr::from_owned_ptr_or_err( + py, ffi::PyUnicode_FromEncodedObject( + src.as_ptr(), encoding.as_ptr() as *const i8, errors.as_ptr() as *const i8))? + )) + } + } + + /// Gets the python string data in its underlying representation. + /// + /// For Python 2 byte strings, this function always returns `PyStringData::Utf8`, + /// even if the bytes are not valid UTF-8. + /// For unicode strings, returns the underlying representation used by Python. + pub fn data(&self, py: Python) -> PyStringData { + let ob: &PyObject = self.as_ref(); + if let Ok(bytes) = ob.cast_as::(py) { + PyStringData::Utf8(bytes.data(py)) + } else if let Ok(unicode) = ob.cast_as::(py) { + unicode.data(py) + } else { + panic!("PyString is neither `str` nor `unicode`") + } + } + + /// Convert the `PyString` into a Rust string. + /// + /// On Python 2.7, if the `PyString` refers to a byte string, + /// it will be decoded using UTF-8. + /// + /// Returns a `UnicodeDecodeError` if the input is not valid unicode + /// (containing unpaired surrogates, or a Python 2.7 byte string that is + /// not valid UTF-8). + pub fn to_string(&self, py: Python) -> PyResult> { + self.data(py).to_string(py) + } + + /// Convert the `PyString` into a Rust string. + /// + /// On Python 2.7, if the `PyString` refers to a byte string, + /// it will be decoded using UTF-8. + /// + /// Unpaired surrogates and (on Python 2.7) invalid UTF-8 sequences are + /// replaced with U+FFFD REPLACEMENT CHARACTER. + pub fn to_string_lossy(&self, py: Python) -> Cow { + self.data(py).to_string_lossy() + } + + #[inline] + pub fn is_base_string(obj: &PyObject) -> bool { + unsafe { + ffi::PyType_FastSubclass( + ffi::Py_TYPE(obj.as_ptr()), + ffi::Py_TPFLAGS_STRING_SUBCLASS | ffi::Py_TPFLAGS_UNICODE_SUBCLASS) != 0 + } + } +} + impl PyBytes { /// Creates a new Python byte string object. @@ -40,6 +123,12 @@ impl PyBytes { } } + /// Converts from `PyBytes` to `PyString`. + #[inline] + pub fn into_basestring(self) -> PyString { + ::unchecked_downcast_into(self) + } + /// Gets the Python string data as byte slice. pub fn data(&self, _py: Python) -> &[u8] { unsafe { @@ -49,34 +138,26 @@ impl PyBytes { } } - #[inline] - pub fn is_base_string(obj: &PyObject) -> bool { - unsafe { - ffi::PyType_FastSubclass( - ffi::Py_TYPE(obj.as_ptr()), - ffi::Py_TPFLAGS_STRING_SUBCLASS | ffi::Py_TPFLAGS_UNICODE_SUBCLASS) != 0 - } - } } -impl PyString { +impl PyUnicode { /// Creates a new Python unicode string object. /// /// Panics if out of memory. - pub fn new(_py: Python, s: &str) -> PyString { + pub fn new(_py: Python, s: &str) -> PyUnicode { let ptr = s.as_ptr() as *const c_char; let len = s.len() as ffi::Py_ssize_t; unsafe { - PyString(PyPtr::from_owned_ptr_or_panic( + PyUnicode(PyPtr::from_owned_ptr_or_panic( ffi::PyUnicode_FromStringAndSize(ptr, len))) } } pub fn from_object(py: Python, src: &PyObject, encoding: &str, errors: &str) - -> PyResult + -> PyResult { unsafe { - Ok(PyString( + Ok(PyUnicode( PyPtr::from_owned_ptr_or_err( py, ffi::PyUnicode_FromEncodedObject( src.as_ptr(), @@ -85,10 +166,10 @@ impl PyString { } } - /// Converts from `PyString` to `PyBytes`. + /// Converts from `PyUnicode` to `PyString`. #[inline] - pub fn into_bytes(self) -> PyBytes { - ::unchecked_downcast_into(self) + pub fn into_basestring(self) -> PyString { + ::unchecked_downcast_into(self) } /// Gets the python string data in its underlying representation.