In Python 3, use PyUnicode_AsUTF8AndSize for extracting UTF-8 from unicode objects.

This commit is contained in:
Daniel Grunwald 2015-09-20 16:21:00 +02:00
parent 6c0e7e07c7
commit b21a23b304
3 changed files with 139 additions and 5 deletions

View file

@ -88,9 +88,9 @@ mod typeslots;
mod pydebug;
mod bytearrayobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
mod bytesobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
mod unicodeobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
mod bytearrayobject;
mod bytesobject;
mod unicodeobject;
mod longobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
// mod longintrepr; TODO excluded by PEP-384
mod boolobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5

View file

@ -2,6 +2,9 @@ use libc::{c_void, c_char, c_int, wchar_t};
use object::*;
use pyport::Py_ssize_t;
/// Alias of the platform `wchar_t`; only declared when `Py_LIMITED_API` is not set.
#[cfg(not(Py_LIMITED_API))]
pub type Py_UNICODE = wchar_t;
/// Unsigned 32-bit alias used wherever the bindings take or return a `Py_UCS4`.
pub type Py_UCS4 = u32;
/// Unsigned 16-bit alias for `Py_UCS2`.
pub type Py_UCS2 = u16;
/// Unsigned 8-bit alias for `Py_UCS1`.
pub type Py_UCS1 = u8;
@ -24,21 +27,51 @@ pub unsafe fn PyUnicode_CheckExact(op : *mut PyObject) -> c_int {
pub const Py_UNICODE_REPLACEMENT_CHARACTER : Py_UCS4 = 0xFFFD;
extern "C" {
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_New(size: Py_ssize_t, maxchar: Py_UCS4) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_CopyCharacters(to: *mut PyObject, to_start: Py_ssize_t,
from: *mut PyObject,
from_start: Py_ssize_t,
how_many: Py_ssize_t) -> Py_ssize_t;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_Fill(unicode: *mut PyObject, start: Py_ssize_t,
length: Py_ssize_t, fill_char: Py_UCS4)
-> Py_ssize_t;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_FromUnicode(u: *const Py_UNICODE, size: Py_ssize_t)
-> *mut PyObject;
pub fn PyUnicode_FromStringAndSize(u: *const c_char,
size: Py_ssize_t) -> *mut PyObject;
pub fn PyUnicode_FromString(u: *const c_char) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_FromKindAndData(kind: c_int,
buffer: *const c_void,
size: Py_ssize_t) -> *mut PyObject;
pub fn PyUnicode_Substring(str: *mut PyObject, start: Py_ssize_t,
end: Py_ssize_t) -> *mut PyObject;
pub fn PyUnicode_AsUCS4(unicode: *mut PyObject, buffer: *mut Py_UCS4,
buflen: Py_ssize_t, copy_null: c_int)
-> *mut Py_UCS4;
pub fn PyUnicode_AsUCS4Copy(unicode: *mut PyObject) -> *mut Py_UCS4;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUnicode(unicode: *mut PyObject) -> *mut Py_UNICODE;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUnicodeAndSize(unicode: *mut PyObject,
size: *mut Py_ssize_t)
-> *mut Py_UNICODE;
pub fn PyUnicode_GetLength(unicode: *mut PyObject) -> Py_ssize_t;
pub fn PyUnicode_GetSize(unicode: *mut PyObject) -> Py_ssize_t;
pub fn PyUnicode_ReadChar(unicode: *mut PyObject, index: Py_ssize_t)
-> Py_UCS4;
pub fn PyUnicode_WriteChar(unicode: *mut PyObject, index: Py_ssize_t,
character: Py_UCS4) -> c_int;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_GetMax() -> Py_UNICODE;
pub fn PyUnicode_Resize(unicode: *mut *mut PyObject, length: Py_ssize_t)
-> c_int;
pub fn PyUnicode_FromEncodedObject(obj: *mut PyObject,
@ -62,6 +95,12 @@ extern "C" {
size: *mut Py_ssize_t) -> *mut wchar_t;
pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
pub fn PyUnicode_ClearFreeList() -> c_int;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject,
size: *mut Py_ssize_t)
-> *mut c_char;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *mut c_char;
pub fn PyUnicode_GetDefaultEncoding() -> *const c_char;
pub fn PyUnicode_Decode(s: *const c_char, size: Py_ssize_t,
encoding: *const c_char,
@ -74,6 +113,10 @@ extern "C" {
encoding: *const c_char,
errors: *const c_char)
-> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_Encode(s: *const Py_UNICODE, size: Py_ssize_t,
encoding: *const c_char,
errors: *const c_char) -> *mut PyObject;
pub fn PyUnicode_AsEncodedObject(unicode: *mut PyObject,
encoding: *const c_char,
errors: *const c_char)
@ -96,6 +139,12 @@ extern "C" {
errors: *const c_char,
consumed: *mut Py_ssize_t)
-> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeUTF7(data: *const Py_UNICODE, length: Py_ssize_t,
base64SetO: c_int,
base64WhiteSpace: c_int,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_DecodeUTF8(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char)
@ -106,6 +155,10 @@ extern "C" {
consumed: *mut Py_ssize_t)
-> *mut PyObject;
pub fn PyUnicode_AsUTF8String(unicode: *mut PyObject) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeUTF8(data: *const Py_UNICODE, length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_DecodeUTF32(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char,
@ -118,6 +171,10 @@ extern "C" {
consumed: *mut Py_ssize_t)
-> *mut PyObject;
pub fn PyUnicode_AsUTF32String(unicode: *mut PyObject) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeUTF32(data: *const Py_UNICODE, length: Py_ssize_t,
errors: *const c_char,
byteorder: c_int) -> *mut PyObject;
pub fn PyUnicode_DecodeUTF16(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char,
@ -130,34 +187,74 @@ extern "C" {
consumed: *mut Py_ssize_t)
-> *mut PyObject;
pub fn PyUnicode_AsUTF16String(unicode: *mut PyObject) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeUTF16(data: *const Py_UNICODE, length: Py_ssize_t,
errors: *const c_char,
byteorder: c_int) -> *mut PyObject;
pub fn PyUnicode_DecodeUnicodeEscape(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_AsUnicodeEscapeString(unicode: *mut PyObject)
-> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeUnicodeEscape(data: *const Py_UNICODE,
length: Py_ssize_t) -> *mut PyObject;
pub fn PyUnicode_DecodeRawUnicodeEscape(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_AsRawUnicodeEscapeString(unicode: *mut PyObject)
-> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeRawUnicodeEscape(data: *const Py_UNICODE,
length: Py_ssize_t)
-> *mut PyObject;
pub fn PyUnicode_DecodeLatin1(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_AsLatin1String(unicode: *mut PyObject) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeLatin1(data: *const Py_UNICODE, length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_DecodeASCII(string: *const c_char,
length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_AsASCIIString(unicode: *mut PyObject) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeASCII(data: *const Py_UNICODE, length: Py_ssize_t,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_DecodeCharmap(string: *const c_char,
length: Py_ssize_t, mapping: *mut PyObject,
errors: *const c_char)
-> *mut PyObject;
pub fn PyUnicode_AsCharmapString(unicode: *mut PyObject,
mapping: *mut PyObject) -> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeCharmap(data: *const Py_UNICODE,
length: Py_ssize_t, mapping: *mut PyObject,
errors: *const c_char)
-> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_TranslateCharmap(data: *const Py_UNICODE,
length: Py_ssize_t,
table: *mut PyObject,
errors: *const c_char)
-> *mut PyObject;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_EncodeDecimal(s: *mut Py_UNICODE, length: Py_ssize_t,
output: *mut c_char,
errors: *const c_char)
-> c_int;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_TransformDecimalToASCII(s: *mut Py_UNICODE,
length: Py_ssize_t)
-> *mut PyObject;
pub fn PyUnicode_DecodeLocaleAndSize(str: *const c_char,
len: Py_ssize_t,
errors: *const c_char)
@ -225,5 +322,7 @@ extern "C" {
pub fn PyUnicode_Contains(container: *mut PyObject,
element: *mut PyObject) -> c_int;
pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;
}

View file

@ -92,8 +92,8 @@ impl <'p> PyUnicode<'p> {
/// Convert the `PyUnicode` into a rust string.
///
/// Returns a `UnicodeDecodeError` if the input contains invalid code points.
#[cfg(feature="python27-sys")]
pub fn to_string(&self) -> PyResult<'p, Cow<str>> {
// TODO: use PyUnicode_AsUTF8AndSize if available
let py = self.python();
let bytes: PyBytes = unsafe {
try!(err::result_cast_from_owned_ptr(py, ffi::PyUnicode_AsUTF8String(self.as_ptr())))
@ -107,8 +107,8 @@ impl <'p> PyUnicode<'p> {
/// Convert the `PyUnicode` into a rust string.
///
/// Any invalid code points are replaced with U+FFFD REPLACEMENT CHARACTER.
#[cfg(feature="python27-sys")]
pub fn to_string_lossy(&self) -> Cow<str> {
// TODO: use PyUnicode_AsUTF8AndSize if available
// TODO: test how this function handles lone surrogates or otherwise invalid code points
let py = self.python();
let bytes: PyBytes = unsafe {
@ -117,6 +117,41 @@ impl <'p> PyUnicode<'p> {
};
Cow::Owned(String::from_utf8_lossy(bytes.as_slice()).into_owned())
}
/// Borrow the UTF-8 bytes of this unicode object via `PyUnicode_AsUTF8AndSize`.
///
/// On success the returned slice covers exactly the number of bytes reported
/// by the C call. When the call yields a null pointer, the pending Python
/// error is fetched and returned as the `Err` value.
#[cfg(feature="python3-sys")]
fn to_utf8_bytes(&self) -> PyResult<'p, &[u8]> {
    let mut byte_len = 0;
    // SAFETY: `self.as_ptr()` is the object pointer wrapped by this `PyUnicode`,
    // and `byte_len` is a valid out-parameter for the duration of the call.
    let utf8_ptr = unsafe {
        ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut byte_len)
    };
    if !utf8_ptr.is_null() {
        // SAFETY: a non-null result points at `byte_len` readable bytes.
        // NOTE(review): this relies on the buffer staying alive as long as the
        // unicode object does — confirm against the CPython documentation.
        let utf8 = unsafe {
            std::slice::from_raw_parts(utf8_ptr as *const u8, byte_len as usize)
        };
        return Ok(utf8);
    }
    Err(PyErr::fetch(self.python()))
}
/// Convert the `PyUnicode` into a rust string without copying.
///
/// The UTF-8 bytes are obtained through `to_utf8_bytes`; any error from that
/// step is propagated unchanged. If the bytes do not validate as UTF-8, a
/// `UnicodeDecodeError` describing the failure is returned instead.
#[cfg(feature="python3-sys")]
pub fn to_string(&self) -> PyResult<'p, Cow<str>> {
    let py = self.python();
    let utf8 = try!(self.to_utf8_bytes());
    // Happy path: hand back a borrowed view of the validated bytes.
    let decode_err = match str::from_utf8(utf8) {
        Ok(text) => return Ok(Cow::Borrowed(text)),
        Err(e) => e,
    };
    // Building the exception instance can itself fail; `try!` forwards that error.
    let exception = try!(exc::UnicodeDecodeError::new_utf8(py, utf8, decode_err));
    Err(PyErr::from_instance(exception))
}
/// Convert the `PyUnicode` into a rust string.
///
/// Any invalid code points are replaced with U+FFFD REPLACEMENT CHARACTER.
///
/// The common case borrows the UTF-8 bytes returned by `to_utf8_bytes`.
/// That call uses the strict UTF-8 codec and fails for strings containing
/// lone surrogates, so on failure the string is re-encoded with the
/// `surrogatepass` error handler and the result is decoded lossily.
/// A lossy conversion must not panic on such input.
///
/// # Panics
///
/// Panics only if the fallback encoding itself fails.
#[cfg(feature="python3-sys")]
pub fn to_string_lossy(&self) -> Cow<str> {
    match self.to_utf8_bytes() {
        Ok(bytes) => String::from_utf8_lossy(bytes),
        // NOTE(review): the error is dropped on the assumption that
        // `PyErr::fetch` has already cleared the interpreter's error indicator.
        Err(_) => {
            let py = self.python();
            let bytes: PyBytes = unsafe {
                // SAFETY: both C strings are NUL-terminated literals and
                // `self.as_ptr()` is the object pointer wrapped by this `PyUnicode`.
                err::result_cast_from_owned_ptr(py,
                    ffi::PyUnicode_AsEncodedString(
                        self.as_ptr(),
                        b"utf-8\0".as_ptr() as *const _,
                        b"surrogatepass\0".as_ptr() as *const _))
            }.expect("Error in PyUnicode_AsEncodedString");
            // Encoded surrogates are not valid UTF-8 for Rust, so they become
            // U+FFFD here; the bytes object is temporary, hence the owned copy.
            Cow::Owned(String::from_utf8_lossy(bytes.as_slice()).into_owned())
        }
    }
}
}
// On PyString (i.e. PyBytes in 2.7, PyUnicode otherwise), put static methods