Fix handling of invalid utf-8 sequences in PyString::to_string_lossy

This commit is contained in:
Alexander Niederbühl 2019-10-21 15:48:01 +02:00
parent 34843d754a
commit defa43015a
4 changed files with 54 additions and 4 deletions

View File

@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* FFI compatibility for PEP 590 Vectorcall.
### Fixed
* Fix handling of invalid utf-8 sequences in `PyString::as_bytes` [#639](https://github.com/PyO3/pyo3/pull/639)
and `PyString::to_string_lossy` [#642](https://github.com/PyO3/pyo3/pull/642).
## [0.8.1]
### Added

View File

@ -22,6 +22,11 @@ impl BytesExtractor {
let rust_string: String = string.extract().unwrap();
Ok(rust_string.len())
}
/// Converts `string` with `PyString::to_string_lossy` and returns the
/// length in bytes of the resulting Rust `String`.
pub fn from_str_lossy(&mut self, string: &PyString) -> PyResult<usize> {
    // Take the lossy view first, then copy it into an owned `String`
    // before measuring it.
    let lossy = string.to_string_lossy();
    let owned: String = lossy.to_string();
    Ok(owned.len())
}
}
#[pymodule]

View File

@ -30,6 +30,7 @@ def test_pybuffer_doesnot_leak_memory():
message_b = b'\\(-"-;) Praying that memory leak would not happen..'
message_s = '\\(-"-;) Praying that memory leak would not happen..'
message_surrogate = '\\(-"-;) Praying that memory leak would not happen.. \ud800'
def from_bytes():
extractor.from_bytes(message_b)
@ -37,9 +38,14 @@ def test_pybuffer_doesnot_leak_memory():
def from_str():
extractor.from_str(message_s)
def from_str_lossy():
extractor.from_str_lossy(message_surrogate)
# Running the memory_diff to warm-up the garbage collector
memory_diff(from_bytes)
memory_diff(from_str)
memory_diff(from_str_lossy)
assert memory_diff(from_bytes) == 0
assert memory_diff(from_str) == 0
assert memory_diff(from_str_lossy) == 0

View File

@ -3,6 +3,7 @@
use crate::conversion::FromPyObject;
use crate::conversion::{PyTryFrom, ToPyObject};
use crate::err::{PyErr, PyResult};
use crate::gil;
use crate::instance::PyNativeType;
use crate::object::PyObject;
use crate::types::PyAny;
@ -11,8 +12,10 @@ use crate::IntoPy;
use crate::Python;
use crate::{ffi, FromPy};
use std::borrow::Cow;
use std::ffi::CStr;
use std::ops::Index;
use std::os::raw::c_char;
use std::ptr::NonNull;
use std::slice::SliceIndex;
use std::str;
@ -87,10 +90,29 @@ impl PyString {
/// Unpaired surrogates and invalid UTF-8 sequences are
/// replaced with U+FFFD REPLACEMENT CHARACTER.
pub fn to_string_lossy(&self) -> Cow<str> {
    // Fast path: if the regular conversion succeeds, return its result
    // unchanged. Only strings that `to_string` rejects take the slow path.
    match self.to_string() {
        Ok(s) => s,
        Err(_) => {
            // Slow path: ask Python to encode the string as UTF-8 with the
            // `surrogatepass` error handler, so surrogate code points are
            // written out as raw bytes instead of failing the encode.
            // `String::from_utf8_lossy` then substitutes U+FFFD for every
            // byte sequence that is not valid UTF-8.
            let encoding = CStr::from_bytes_with_nul(b"utf-8\0").unwrap();
            let errors = CStr::from_bytes_with_nul(b"surrogatepass\0").unwrap();
            // SAFETY (assumed, confirm against the rest of the file):
            // `self.0` is expected to point at a live Python `str` object and
            // the caller is expected to hold the GIL. Both C strings above
            // are NUL-terminated and outlive the FFI call.
            unsafe {
                let py_bytes = ffi::PyUnicode_AsEncodedString(
                    self.0.as_ptr(),
                    encoding.as_ptr(),
                    errors.as_ptr(),
                );
                // Since we have a valid PyString and let surrogates pass
                // through the encoder, assume success.
                debug_assert!(!py_bytes.is_null());
                // Ensure DECREF will be called on the temporary bytes object.
                gil::register_pointer(NonNull::new(py_bytes).unwrap());
                let buffer = ffi::PyBytes_AsString(py_bytes) as *const u8;
                debug_assert!(!buffer.is_null());
                let length = ffi::PyBytes_Size(py_bytes) as usize;
                let bytes = std::slice::from_raw_parts(buffer, length);
                String::from_utf8_lossy(bytes)
            }
        }
    }
}
}
@ -308,6 +330,18 @@ mod test {
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
}
#[test]
fn test_to_string_lossy() {
    let gil = Python::acquire_gil();
    let py = gil.python();
    // Build a Python `str` that contains the lone surrogate U+D800.
    let obj: PyObject = py
        .eval(r#"'🐈 Hello \ud800World'"#, None, None)
        .unwrap()
        .into();
    let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
    // With `surrogatepass`, U+D800 is encoded as the three bytes ED A0 80.
    // None of them forms valid UTF-8, so each becomes one U+FFFD.
    // The replacement characters are written as escapes so the expected
    // value cannot be corrupted by a re-encoding of this file.
    assert_eq!(
        py_string.to_string_lossy(),
        "🐈 Hello \u{FFFD}\u{FFFD}\u{FFFD}World"
    );
}
#[test]
fn test_bytes_index() {
let gil = Python::acquire_gil();