Fix handling of invalid utf-8 sequences in PyString::to_string_lossy
This commit is contained in:
parent
34843d754a
commit
defa43015a
|
@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|||
|
||||
* FFI compatibility for PEP 590 Vectorcall.
|
||||
|
||||
### Fixed
|
||||
|
||||
* Fix handling of invalid utf-8 sequences in `PyString::as_bytes` [#639](https://github.com/PyO3/pyo3/pull/639)
|
||||
and `PyString::to_string_lossy` [#642](https://github.com/PyO3/pyo3/pull/642).
|
||||
|
||||
## [0.8.1]
|
||||
|
||||
### Added
|
||||
|
|
|
@ -22,6 +22,11 @@ impl BytesExtractor {
|
|||
let rust_string: String = string.extract().unwrap();
|
||||
Ok(rust_string.len())
|
||||
}
|
||||
|
||||
pub fn from_str_lossy(&mut self, string: &PyString) -> PyResult<usize> {
|
||||
let rust_string_lossy: String = string.to_string_lossy().to_string();
|
||||
Ok(rust_string_lossy.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[pymodule]
|
||||
|
|
|
@ -30,6 +30,7 @@ def test_pybuffer_doesnot_leak_memory():
|
|||
|
||||
message_b = b'\\(-"-;) Praying that memory leak would not happen..'
|
||||
message_s = '\\(-"-;) Praying that memory leak would not happen..'
|
||||
message_surrogate = '\\(-"-;) Praying that memory leak would not happen.. \ud800'
|
||||
|
||||
def from_bytes():
|
||||
extractor.from_bytes(message_b)
|
||||
|
@ -37,9 +38,14 @@ def test_pybuffer_doesnot_leak_memory():
|
|||
def from_str():
|
||||
extractor.from_str(message_s)
|
||||
|
||||
def from_str_lossy():
|
||||
extractor.from_str_lossy(message_surrogate)
|
||||
|
||||
# Running the memory_diff to warm-up the garbage collector
|
||||
memory_diff(from_bytes)
|
||||
memory_diff(from_str)
|
||||
memory_diff(from_str_lossy)
|
||||
|
||||
assert memory_diff(from_bytes) == 0
|
||||
assert memory_diff(from_str) == 0
|
||||
assert memory_diff(from_str_lossy) == 0
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
use crate::conversion::FromPyObject;
|
||||
use crate::conversion::{PyTryFrom, ToPyObject};
|
||||
use crate::err::{PyErr, PyResult};
|
||||
use crate::gil;
|
||||
use crate::instance::PyNativeType;
|
||||
use crate::object::PyObject;
|
||||
use crate::types::PyAny;
|
||||
|
@ -11,8 +12,10 @@ use crate::IntoPy;
|
|||
use crate::Python;
|
||||
use crate::{ffi, FromPy};
|
||||
use std::borrow::Cow;
|
||||
use std::ffi::CStr;
|
||||
use std::ops::Index;
|
||||
use std::os::raw::c_char;
|
||||
use std::ptr::NonNull;
|
||||
use std::slice::SliceIndex;
|
||||
use std::str;
|
||||
|
||||
|
@ -87,10 +90,29 @@ impl PyString {
|
|||
/// Unpaired surrogates invalid UTF-8 sequences are
|
||||
/// replaced with U+FFFD REPLACEMENT CHARACTER.
|
||||
pub fn to_string_lossy(&self) -> Cow<str> {
|
||||
// TODO: Handle error of `as_bytes`
|
||||
// see https://github.com/PyO3/pyo3/pull/634
|
||||
let bytes = self.as_bytes().unwrap();
|
||||
String::from_utf8_lossy(bytes)
|
||||
match self.to_string() {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
unsafe {
|
||||
let py_bytes = ffi::PyUnicode_AsEncodedString(
|
||||
self.0.as_ptr(),
|
||||
CStr::from_bytes_with_nul(b"utf-8\0").unwrap().as_ptr(),
|
||||
CStr::from_bytes_with_nul(b"surrogatepass\0")
|
||||
.unwrap()
|
||||
.as_ptr(),
|
||||
);
|
||||
// Since we have a valid PyString and replace any surrogates, assume success.
|
||||
debug_assert!(!py_bytes.is_null());
|
||||
// ensure DECREF will be called
|
||||
gil::register_pointer(NonNull::new(py_bytes).unwrap());
|
||||
let buffer = ffi::PyBytes_AsString(py_bytes) as *const u8;
|
||||
debug_assert!(!buffer.is_null());
|
||||
let length = ffi::PyBytes_Size(py_bytes) as usize;
|
||||
let bytes = std::slice::from_raw_parts(buffer, length);
|
||||
String::from_utf8_lossy(bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -308,6 +330,18 @@ mod test {
|
|||
assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_string_lossy() {
|
||||
let gil = Python::acquire_gil();
|
||||
let py = gil.python();
|
||||
let obj: PyObject = py
|
||||
.eval(r#"'🐈 Hello \ud800World'"#, None, None)
|
||||
.unwrap()
|
||||
.into();
|
||||
let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
|
||||
assert_eq!(py_string.to_string_lossy(), "🐈 Hello <20><><EFBFBD>World");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bytes_index() {
|
||||
let gil = Python::acquire_gil();
|
||||
|
|
Loading…
Reference in New Issue