In Python 3, use PyUnicode_AsUTF8AndSize for extracting UTF-8 from unicode objects.

2015-09-20 16:21:00 +02:00 · 2015-09-20 16:21:00 +02:00 · b21a23b304
parent 6c0e7e07c7
commit b21a23b304
3 changed files with 139 additions and 5 deletions
--- a/python3-sys/src/lib.rs
+++ b/python3-sys/src/lib.rs
@ -88,9 +88,9 @@ mod typeslots;

 mod pydebug;

-mod bytearrayobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
-mod bytesobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
-mod unicodeobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
+mod bytearrayobject;
+mod bytesobject;
+mod unicodeobject;
 mod longobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
 // mod longintrepr; TODO excluded by PEP-384
 mod boolobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
--- a/python3-sys/src/unicodeobject.rs
+++ b/python3-sys/src/unicodeobject.rs
@ -2,6 +2,9 @@ use libc::{c_void, c_char, c_int, wchar_t};
 use object::*;
 use pyport::Py_ssize_t;

+#[cfg(not(Py_LIMITED_API))]
+pub type Py_UNICODE = wchar_t;
+
 pub type Py_UCS4 = u32;
 pub type Py_UCS2 = u16;
 pub type Py_UCS1 = u8;
@ -24,21 +27,51 @@ pub unsafe fn PyUnicode_CheckExact(op : *mut PyObject) -> c_int {
 pub const Py_UNICODE_REPLACEMENT_CHARACTER : Py_UCS4 = 0xFFFD;

 extern "C" {
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_New(size: Py_ssize_t, maxchar: Py_UCS4) -> *mut PyObject;
+    
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_CopyCharacters(to: *mut PyObject, to_start: Py_ssize_t,
+                                    from: *mut PyObject,
+                                    from_start: Py_ssize_t,
+                                    how_many: Py_ssize_t) -> Py_ssize_t;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_Fill(unicode: *mut PyObject, start: Py_ssize_t,
+                          length: Py_ssize_t, fill_char: Py_UCS4)
+     -> Py_ssize_t;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_FromUnicode(u: *const Py_UNICODE, size: Py_ssize_t)
+     -> *mut PyObject;
+
    pub fn PyUnicode_FromStringAndSize(u: *const c_char,
                                       size: Py_ssize_t) -> *mut PyObject;
    pub fn PyUnicode_FromString(u: *const c_char) -> *mut PyObject;
+
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_FromKindAndData(kind: c_int,
+                                     buffer: *const c_void,
+                                     size: Py_ssize_t) -> *mut PyObject;
+
    pub fn PyUnicode_Substring(str: *mut PyObject, start: Py_ssize_t,
                               end: Py_ssize_t) -> *mut PyObject;
    pub fn PyUnicode_AsUCS4(unicode: *mut PyObject, buffer: *mut Py_UCS4,
                            buflen: Py_ssize_t, copy_null: c_int)
     -> *mut Py_UCS4;
    pub fn PyUnicode_AsUCS4Copy(unicode: *mut PyObject) -> *mut Py_UCS4;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_AsUnicode(unicode: *mut PyObject) -> *mut Py_UNICODE;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_AsUnicodeAndSize(unicode: *mut PyObject,
+                                      size: *mut Py_ssize_t)
+     -> *mut Py_UNICODE;
    pub fn PyUnicode_GetLength(unicode: *mut PyObject) -> Py_ssize_t;
    pub fn PyUnicode_GetSize(unicode: *mut PyObject) -> Py_ssize_t;
    pub fn PyUnicode_ReadChar(unicode: *mut PyObject, index: Py_ssize_t)
     -> Py_UCS4;
    pub fn PyUnicode_WriteChar(unicode: *mut PyObject, index: Py_ssize_t,
                               character: Py_UCS4) -> c_int;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_GetMax() -> Py_UNICODE;
    pub fn PyUnicode_Resize(unicode: *mut *mut PyObject, length: Py_ssize_t)
     -> c_int;
    pub fn PyUnicode_FromEncodedObject(obj: *mut PyObject,
@ -62,6 +95,12 @@ extern "C" {
                                      size: *mut Py_ssize_t) -> *mut wchar_t;
    pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
    pub fn PyUnicode_ClearFreeList() -> c_int;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject,
+                                   size: *mut Py_ssize_t)
+     -> *mut c_char;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *mut c_char;
    pub fn PyUnicode_GetDefaultEncoding() -> *const c_char;
    pub fn PyUnicode_Decode(s: *const c_char, size: Py_ssize_t,
                            encoding: *const c_char,
@ -74,6 +113,10 @@ extern "C" {
                                      encoding: *const c_char,
                                      errors: *const c_char)
     -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_Encode(s: *const Py_UNICODE, size: Py_ssize_t,
+                            encoding: *const c_char,
+                            errors: *const c_char) -> *mut PyObject;
    pub fn PyUnicode_AsEncodedObject(unicode: *mut PyObject,
                                     encoding: *const c_char,
                                     errors: *const c_char)
@ -96,6 +139,12 @@ extern "C" {
                                        errors: *const c_char,
                                        consumed: *mut Py_ssize_t)
     -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeUTF7(data: *const Py_UNICODE, length: Py_ssize_t,
+                                base64SetO: c_int,
+                                base64WhiteSpace: c_int,
+                                errors: *const c_char)
+     -> *mut PyObject;
    pub fn PyUnicode_DecodeUTF8(string: *const c_char,
                                length: Py_ssize_t,
                                errors: *const c_char)
@ -106,6 +155,10 @@ extern "C" {
                                        consumed: *mut Py_ssize_t)
     -> *mut PyObject;
    pub fn PyUnicode_AsUTF8String(unicode: *mut PyObject) -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeUTF8(data: *const Py_UNICODE, length: Py_ssize_t,
+                                errors: *const c_char)
+     -> *mut PyObject;
    pub fn PyUnicode_DecodeUTF32(string: *const c_char,
                                 length: Py_ssize_t,
                                 errors: *const c_char,
@ -118,6 +171,10 @@ extern "C" {
                                         consumed: *mut Py_ssize_t)
     -> *mut PyObject;
    pub fn PyUnicode_AsUTF32String(unicode: *mut PyObject) -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeUTF32(data: *const Py_UNICODE, length: Py_ssize_t,
+                                 errors: *const c_char,
+                                 byteorder: c_int) -> *mut PyObject;
    pub fn PyUnicode_DecodeUTF16(string: *const c_char,
                                 length: Py_ssize_t,
                                 errors: *const c_char,
@ -130,34 +187,74 @@ extern "C" {
                                         consumed: *mut Py_ssize_t)
     -> *mut PyObject;
    pub fn PyUnicode_AsUTF16String(unicode: *mut PyObject) -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeUTF16(data: *const Py_UNICODE, length: Py_ssize_t,
+                                 errors: *const c_char,
+                                 byteorder: c_int) -> *mut PyObject;
    pub fn PyUnicode_DecodeUnicodeEscape(string: *const c_char,
                                         length: Py_ssize_t,
                                         errors: *const c_char)
     -> *mut PyObject;
    pub fn PyUnicode_AsUnicodeEscapeString(unicode: *mut PyObject)
     -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeUnicodeEscape(data: *const Py_UNICODE,
+                                         length: Py_ssize_t) -> *mut PyObject;
    pub fn PyUnicode_DecodeRawUnicodeEscape(string: *const c_char,
                                            length: Py_ssize_t,
                                            errors: *const c_char)
     -> *mut PyObject;
    pub fn PyUnicode_AsRawUnicodeEscapeString(unicode: *mut PyObject)
     -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeRawUnicodeEscape(data: *const Py_UNICODE,
+                                            length: Py_ssize_t)
+     -> *mut PyObject;
    pub fn PyUnicode_DecodeLatin1(string: *const c_char,
                                  length: Py_ssize_t,
                                  errors: *const c_char)
     -> *mut PyObject;
    pub fn PyUnicode_AsLatin1String(unicode: *mut PyObject) -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeLatin1(data: *const Py_UNICODE, length: Py_ssize_t,
+                                  errors: *const c_char)
+     -> *mut PyObject;
    pub fn PyUnicode_DecodeASCII(string: *const c_char,
                                 length: Py_ssize_t,
                                 errors: *const c_char)
     -> *mut PyObject;
    pub fn PyUnicode_AsASCIIString(unicode: *mut PyObject) -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeASCII(data: *const Py_UNICODE, length: Py_ssize_t,
+                                 errors: *const c_char)
+     -> *mut PyObject;
    pub fn PyUnicode_DecodeCharmap(string: *const c_char,
                                   length: Py_ssize_t, mapping: *mut PyObject,
                                   errors: *const c_char)
     -> *mut PyObject;
    pub fn PyUnicode_AsCharmapString(unicode: *mut PyObject,
                                     mapping: *mut PyObject) -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeCharmap(data: *const Py_UNICODE,
+                                   length: Py_ssize_t, mapping: *mut PyObject,
+                                   errors: *const c_char)
+     -> *mut PyObject;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_TranslateCharmap(data: *const Py_UNICODE,
+                                      length: Py_ssize_t,
+                                      table: *mut PyObject,
+                                      errors: *const c_char)
+     -> *mut PyObject;
+     
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_EncodeDecimal(s: *mut Py_UNICODE, length: Py_ssize_t,
+                                   output: *mut c_char,
+                                   errors: *const c_char)
+     -> c_int;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_TransformDecimalToASCII(s: *mut Py_UNICODE,
+                                             length: Py_ssize_t)
+     -> *mut PyObject;
    pub fn PyUnicode_DecodeLocaleAndSize(str: *const c_char,
                                         len: Py_ssize_t,
                                         errors: *const c_char)
@ -225,5 +322,7 @@ extern "C" {
    pub fn PyUnicode_Contains(container: *mut PyObject,
                              element: *mut PyObject) -> c_int;
    pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;
 }

--- a/src/objects/string.rs
+++ b/src/objects/string.rs
@ -92,8 +92,8 @@ impl <'p> PyUnicode<'p> {
    /// Convert the `PyUnicode` into a rust string.
    ///
    /// Returns a `UnicodeDecodeError` if the input contains invalid code points.
+    #[cfg(feature="python27-sys")]
    pub fn to_string(&self) -> PyResult<'p, Cow<str>> {
-        // TODO: use PyUnicode_AsUTF8AndSize if available
        let py = self.python();
        let bytes: PyBytes = unsafe {
            try!(err::result_cast_from_owned_ptr(py, ffi::PyUnicode_AsUTF8String(self.as_ptr())))
@ -107,8 +107,8 @@ impl <'p> PyUnicode<'p> {
    /// Convert the `PyUnicode` into a rust string.
    ///
    /// Any invalid code points are replaced with U+FFFD REPLACEMENT CHARACTER.
+    #[cfg(feature="python27-sys")]
    pub fn to_string_lossy(&self) -> Cow<str> {
-        // TODO: use PyUnicode_AsUTF8AndSize if available
        // TODO: test how this function handles lone surrogates or otherwise invalid code points
        let py = self.python();
        let bytes: PyBytes = unsafe {
@ -117,6 +117,41 @@ impl <'p> PyUnicode<'p> {
        };
        Cow::Owned(String::from_utf8_lossy(bytes.as_slice()).into_owned())
    }
+
+    #[cfg(feature="python3-sys")]
+    fn to_utf8_bytes(&self) -> PyResult<'p, &[u8]> { // private helper: UTF-8 bytes of this object, borrowed for as long as `&self`
+        unsafe {
+            let mut length = 0; // out-parameter filled in by the FFI call below
+            let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut length);
+            if data.is_null() {
+                Err(PyErr::fetch(self.python())) // null result means failure: hand back the pending Python error
+            } else {
+                Ok(std::slice::from_raw_parts(data as *const u8, length as usize)) // NOTE(review): presumably the buffer is owned by the unicode object and lives as long as it does; confirm against the CPython docs
+            }
+        }
+    }
+
+    /// Convert the `PyUnicode` into a Rust string that borrows the bytes returned by `to_utf8_bytes`.
+    ///
+    /// Propagates the Python error if `to_utf8_bytes` fails; returns a `UnicodeDecodeError` if those bytes are not valid UTF-8.
+    #[cfg(feature="python3-sys")]
+    pub fn to_string(&self) -> PyResult<'p, Cow<str>> {
+        let py = self.python();
+        let bytes = try!(self.to_utf8_bytes()); // early-returns the `PyErr` on failure
+        match str::from_utf8(bytes) {
+            Ok(s) => Ok(Cow::Borrowed(s)), // no copy: the `str` borrows `bytes`
+            Err(e) => Err(PyErr::from_instance(try!(exc::UnicodeDecodeError::new_utf8(py, bytes, e)))) // build the Python error from the decode failure `e`
+        }
+    }
+
+    /// Convert the `PyUnicode` into a Rust string via `String::from_utf8_lossy`.
+    ///
+    /// Invalid UTF-8 sequences in the extracted bytes are replaced with U+FFFD REPLACEMENT CHARACTER. Panics if `to_utf8_bytes` returns an error.
+    #[cfg(feature="python3-sys")]
+    pub fn to_string_lossy(&self) -> Cow<str> {
+        let bytes = self.to_utf8_bytes().expect("Error in PyUnicode_AsUTF8AndSize"); // NOTE(review): if the FFI call rejects lone surrogates, this panics instead of substituting U+FFFD; confirm against CPython
+        String::from_utf8_lossy(bytes)
+    }
 }

 // On PyString (i.e. PyBytes in 2.7, PyUnicode otherwise), put static methods