In Python 3, use `PyUnicode_AsUTF8AndSize` to extract UTF-8 from unicode objects.
This commit is contained in:
parent
6c0e7e07c7
commit
b21a23b304
|
@ -88,9 +88,9 @@ mod typeslots;
|
|||
|
||||
mod pydebug;
|
||||
|
||||
mod bytearrayobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
|
||||
mod bytesobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
|
||||
mod unicodeobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
|
||||
mod bytearrayobject;
|
||||
mod bytesobject;
|
||||
mod unicodeobject;
|
||||
mod longobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
|
||||
// mod longintrepr; TODO excluded by PEP-384
|
||||
mod boolobject; // TODO supports PEP-384 only; needs adjustment for Python 3.3 and 3.5
|
||||
|
|
|
@ -2,6 +2,9 @@ use libc::{c_void, c_char, c_int, wchar_t};
|
|||
use object::*;
|
||||
use pyport::Py_ssize_t;
|
||||
|
||||
// Character-type aliases used by the unicode object declarations in this file.
//
// `Py_UNICODE` is an alias for the C `wchar_t` type and is only compiled when
// the `Py_LIMITED_API` cfg flag is not set.
#[cfg(not(Py_LIMITED_API))]
|
||||
pub type Py_UNICODE = wchar_t;
|
||||
|
||||
// Fixed-width unsigned aliases: 32-bit, 16-bit and 8-bit respectively.
pub type Py_UCS4 = u32;
|
||||
pub type Py_UCS2 = u16;
|
||||
pub type Py_UCS1 = u8;
|
||||
|
@ -24,21 +27,51 @@ pub unsafe fn PyUnicode_CheckExact(op : *mut PyObject) -> c_int {
|
|||
pub const Py_UNICODE_REPLACEMENT_CHARACTER : Py_UCS4 = 0xFFFD;
|
||||
|
||||
extern "C" {
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_New(size: Py_ssize_t, maxchar: Py_UCS4) -> *mut PyObject;
|
||||
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_CopyCharacters(to: *mut PyObject, to_start: Py_ssize_t,
|
||||
from: *mut PyObject,
|
||||
from_start: Py_ssize_t,
|
||||
how_many: Py_ssize_t) -> Py_ssize_t;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_Fill(unicode: *mut PyObject, start: Py_ssize_t,
|
||||
length: Py_ssize_t, fill_char: Py_UCS4)
|
||||
-> Py_ssize_t;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_FromUnicode(u: *const Py_UNICODE, size: Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
|
||||
pub fn PyUnicode_FromStringAndSize(u: *const c_char,
|
||||
size: Py_ssize_t) -> *mut PyObject;
|
||||
pub fn PyUnicode_FromString(u: *const c_char) -> *mut PyObject;
|
||||
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_FromKindAndData(kind: c_int,
|
||||
buffer: *const c_void,
|
||||
size: Py_ssize_t) -> *mut PyObject;
|
||||
|
||||
pub fn PyUnicode_Substring(str: *mut PyObject, start: Py_ssize_t,
|
||||
end: Py_ssize_t) -> *mut PyObject;
|
||||
pub fn PyUnicode_AsUCS4(unicode: *mut PyObject, buffer: *mut Py_UCS4,
|
||||
buflen: Py_ssize_t, copy_null: c_int)
|
||||
-> *mut Py_UCS4;
|
||||
pub fn PyUnicode_AsUCS4Copy(unicode: *mut PyObject) -> *mut Py_UCS4;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_AsUnicode(unicode: *mut PyObject) -> *mut Py_UNICODE;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_AsUnicodeAndSize(unicode: *mut PyObject,
|
||||
size: *mut Py_ssize_t)
|
||||
-> *mut Py_UNICODE;
|
||||
pub fn PyUnicode_GetLength(unicode: *mut PyObject) -> Py_ssize_t;
|
||||
pub fn PyUnicode_GetSize(unicode: *mut PyObject) -> Py_ssize_t;
|
||||
pub fn PyUnicode_ReadChar(unicode: *mut PyObject, index: Py_ssize_t)
|
||||
-> Py_UCS4;
|
||||
pub fn PyUnicode_WriteChar(unicode: *mut PyObject, index: Py_ssize_t,
|
||||
character: Py_UCS4) -> c_int;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_GetMax() -> Py_UNICODE;
|
||||
pub fn PyUnicode_Resize(unicode: *mut *mut PyObject, length: Py_ssize_t)
|
||||
-> c_int;
|
||||
pub fn PyUnicode_FromEncodedObject(obj: *mut PyObject,
|
||||
|
@ -62,6 +95,12 @@ extern "C" {
|
|||
size: *mut Py_ssize_t) -> *mut wchar_t;
|
||||
pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
|
||||
pub fn PyUnicode_ClearFreeList() -> c_int;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject,
|
||||
size: *mut Py_ssize_t)
|
||||
-> *mut c_char;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *mut c_char;
|
||||
pub fn PyUnicode_GetDefaultEncoding() -> *const c_char;
|
||||
pub fn PyUnicode_Decode(s: *const c_char, size: Py_ssize_t,
|
||||
encoding: *const c_char,
|
||||
|
@ -74,6 +113,10 @@ extern "C" {
|
|||
encoding: *const c_char,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_Encode(s: *const Py_UNICODE, size: Py_ssize_t,
|
||||
encoding: *const c_char,
|
||||
errors: *const c_char) -> *mut PyObject;
|
||||
pub fn PyUnicode_AsEncodedObject(unicode: *mut PyObject,
|
||||
encoding: *const c_char,
|
||||
errors: *const c_char)
|
||||
|
@ -96,6 +139,12 @@ extern "C" {
|
|||
errors: *const c_char,
|
||||
consumed: *mut Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeUTF7(data: *const Py_UNICODE, length: Py_ssize_t,
|
||||
base64SetO: c_int,
|
||||
base64WhiteSpace: c_int,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeUTF8(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
|
@ -106,6 +155,10 @@ extern "C" {
|
|||
consumed: *mut Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsUTF8String(unicode: *mut PyObject) -> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeUTF8(data: *const Py_UNICODE, length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeUTF32(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char,
|
||||
|
@ -118,6 +171,10 @@ extern "C" {
|
|||
consumed: *mut Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsUTF32String(unicode: *mut PyObject) -> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeUTF32(data: *const Py_UNICODE, length: Py_ssize_t,
|
||||
errors: *const c_char,
|
||||
byteorder: c_int) -> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeUTF16(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char,
|
||||
|
@ -130,34 +187,74 @@ extern "C" {
|
|||
consumed: *mut Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsUTF16String(unicode: *mut PyObject) -> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeUTF16(data: *const Py_UNICODE, length: Py_ssize_t,
|
||||
errors: *const c_char,
|
||||
byteorder: c_int) -> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeUnicodeEscape(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsUnicodeEscapeString(unicode: *mut PyObject)
|
||||
-> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeUnicodeEscape(data: *const Py_UNICODE,
|
||||
length: Py_ssize_t) -> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeRawUnicodeEscape(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsRawUnicodeEscapeString(unicode: *mut PyObject)
|
||||
-> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeRawUnicodeEscape(data: *const Py_UNICODE,
|
||||
length: Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeLatin1(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsLatin1String(unicode: *mut PyObject) -> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeLatin1(data: *const Py_UNICODE, length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeASCII(string: *const c_char,
|
||||
length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsASCIIString(unicode: *mut PyObject) -> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeASCII(data: *const Py_UNICODE, length: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeCharmap(string: *const c_char,
|
||||
length: Py_ssize_t, mapping: *mut PyObject,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_AsCharmapString(unicode: *mut PyObject,
|
||||
mapping: *mut PyObject) -> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeCharmap(data: *const Py_UNICODE,
|
||||
length: Py_ssize_t, mapping: *mut PyObject,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_TranslateCharmap(data: *const Py_UNICODE,
|
||||
length: Py_ssize_t,
|
||||
table: *mut PyObject,
|
||||
errors: *const c_char)
|
||||
-> *mut PyObject;
|
||||
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_EncodeDecimal(s: *mut Py_UNICODE, length: Py_ssize_t,
|
||||
output: *mut c_char,
|
||||
errors: *const c_char)
|
||||
-> c_int;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_TransformDecimalToASCII(s: *mut Py_UNICODE,
|
||||
length: Py_ssize_t)
|
||||
-> *mut PyObject;
|
||||
pub fn PyUnicode_DecodeLocaleAndSize(str: *const c_char,
|
||||
len: Py_ssize_t,
|
||||
errors: *const c_char)
|
||||
|
@ -225,5 +322,7 @@ extern "C" {
|
|||
pub fn PyUnicode_Contains(container: *mut PyObject,
|
||||
element: *mut PyObject) -> c_int;
|
||||
pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
|
||||
#[cfg(not(Py_LIMITED_API))]
|
||||
pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;
|
||||
}
|
||||
|
||||
|
|
|
@ -92,8 +92,8 @@ impl <'p> PyUnicode<'p> {
|
|||
/// Convert the `PyUnicode` into a rust string.
|
||||
///
|
||||
/// Returns a `UnicodeDecodeError` if the input contains invalid code points.
|
||||
#[cfg(feature="python27-sys")]
|
||||
pub fn to_string(&self) -> PyResult<'p, Cow<str>> {
|
||||
// TODO: use PyUnicode_AsUTF8AndSize if available
|
||||
let py = self.python();
|
||||
let bytes: PyBytes = unsafe {
|
||||
try!(err::result_cast_from_owned_ptr(py, ffi::PyUnicode_AsUTF8String(self.as_ptr())))
|
||||
|
@ -107,8 +107,8 @@ impl <'p> PyUnicode<'p> {
|
|||
/// Convert the `PyUnicode` into a rust string.
|
||||
///
|
||||
/// Any invalid code points are replaced with U+FFFD REPLACEMENT CHARACTER.
|
||||
#[cfg(feature="python27-sys")]
|
||||
pub fn to_string_lossy(&self) -> Cow<str> {
|
||||
// TODO: use PyUnicode_AsUTF8AndSize if available
|
||||
// TODO: test how this function handles lone surrogates or otherwise invalid code points
|
||||
let py = self.python();
|
||||
let bytes: PyBytes = unsafe {
|
||||
|
@ -117,6 +117,41 @@ impl <'p> PyUnicode<'p> {
|
|||
};
|
||||
Cow::Owned(String::from_utf8_lossy(bytes.as_slice()).into_owned())
|
||||
}
|
||||
|
||||
/// Fetch the UTF-8 bytes of this string through `PyUnicode_AsUTF8AndSize`.
///
/// The returned slice is built from the pointer and length reported by the
/// C API call. When that call yields a null pointer, the pending Python
/// error is fetched and returned as `Err`.
#[cfg(feature="python3-sys")]
fn to_utf8_bytes(&self) -> PyResult<'p, &[u8]> {
    // Out-parameter filled in by the C API with the byte length.
    let mut size = 0;
    unsafe {
        let ptr = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
        // A null pointer is the failure signal; surface the Python error.
        if ptr.is_null() {
            return Err(PyErr::fetch(self.python()));
        }
        Ok(std::slice::from_raw_parts(ptr as *const u8, size as usize))
    }
}
|
||||
|
||||
/// Convert the `PyUnicode` into a rust string.
|
||||
///
|
||||
/// Returns a `UnicodeDecodeError` if the input contains invalid code points.
|
||||
#[cfg(feature="python3-sys")]
|
||||
pub fn to_string(&self) -> PyResult<'p, Cow<str>> {
|
||||
let py = self.python();
|
||||
let bytes = try!(self.to_utf8_bytes());
|
||||
match str::from_utf8(bytes) {
|
||||
Ok(s) => Ok(Cow::Borrowed(s)),
|
||||
Err(e) => Err(PyErr::from_instance(try!(exc::UnicodeDecodeError::new_utf8(py, bytes, e))))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert the `PyUnicode` into a rust string without reporting errors.
///
/// Byte sequences that are not valid UTF-8 are replaced with
/// U+FFFD REPLACEMENT CHARACTER (via `String::from_utf8_lossy`).
///
/// # Panics
///
/// Panics with the message "Error in PyUnicode_AsUTF8AndSize" if the
/// UTF-8 extraction step (`to_utf8_bytes`) returns an error.
// NOTE(review): the panic path means a failing extraction is not replaced
// with U+FFFD; confirm whether such inputs should be handled lossily instead.
#[cfg(feature="python3-sys")]
pub fn to_string_lossy(&self) -> Cow<str> {
    String::from_utf8_lossy(
        self.to_utf8_bytes().expect("Error in PyUnicode_AsUTF8AndSize"),
    )
}
|
||||
}
|
||||
|
||||
// On PyString (i.e. PyBytes in 2.7, PyUnicode otherwise), put static methods
|
||||
|
|
Loading…
Reference in a new issue