Fix handling of invalid utf-8 sequences in PyString::to_string_lossy

2019-10-21 15:48:01 +02:00 · 2019-10-21 15:48:01 +02:00 · defa43015a
parent 34843d754a
commit defa43015a
4 changed files with 54 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 * FFI compatibility for PEP 590 Vectorcall.

+### Fixed
+
+* Fix handling of invalid utf-8 sequences in `PyString::as_bytes` [#639](https://github.com/PyO3/pyo3/pull/639)
+and `PyString::to_string_lossy` [#642](https://github.com/PyO3/pyo3/pull/642).
+
 ## [0.8.1]

 ### Added
--- a/examples/rustapi_module/src/buf_and_str.rs
+++ b/examples/rustapi_module/src/buf_and_str.rs
@ -22,6 +22,11 @@ impl BytesExtractor {
        let rust_string: String = string.extract().unwrap();
        Ok(rust_string.len())
    }
+
+    pub fn from_str_lossy(&mut self, string: &PyString) -> PyResult<usize> {
+        let rust_string_lossy: String = string.to_string_lossy().to_string();
+        Ok(rust_string_lossy.len())
+    }
 }

 #[pymodule]
--- a/examples/rustapi_module/tests/test_buf_and_str.py
+++ b/examples/rustapi_module/tests/test_buf_and_str.py
@ -30,6 +30,7 @@ def test_pybuffer_doesnot_leak_memory():

    message_b = b'\\(-"-;) Praying that memory leak would not happen..'
    message_s = '\\(-"-;) Praying that memory leak would not happen..'
+    message_surrogate = '\\(-"-;) Praying that memory leak would not happen.. \ud800'

    def from_bytes():
        extractor.from_bytes(message_b)
@ -37,9 +38,14 @@ def test_pybuffer_doesnot_leak_memory():
    def from_str():
        extractor.from_str(message_s)

+    def from_str_lossy():
+        extractor.from_str_lossy(message_surrogate)
+
    # Running the memory_diff to warm-up the garbage collector
    memory_diff(from_bytes)
    memory_diff(from_str)
+    memory_diff(from_str_lossy)

    assert memory_diff(from_bytes) == 0
    assert memory_diff(from_str) == 0
+    assert memory_diff(from_str_lossy) == 0
--- a/src/types/string.rs
+++ b/src/types/string.rs
@ -3,6 +3,7 @@
 use crate::conversion::FromPyObject;
 use crate::conversion::{PyTryFrom, ToPyObject};
 use crate::err::{PyErr, PyResult};
+use crate::gil;
 use crate::instance::PyNativeType;
 use crate::object::PyObject;
 use crate::types::PyAny;
@ -11,8 +12,10 @@ use crate::IntoPy;
 use crate::Python;
 use crate::{ffi, FromPy};
 use std::borrow::Cow;
+use std::ffi::CStr;
 use std::ops::Index;
 use std::os::raw::c_char;
+use std::ptr::NonNull;
 use std::slice::SliceIndex;
 use std::str;

@ -87,10 +90,29 @@ impl PyString {
    /// Unpaired surrogates invalid UTF-8 sequences are
    /// replaced with U+FFFD REPLACEMENT CHARACTER.
    pub fn to_string_lossy(&self) -> Cow<str> {
-        // TODO: Handle error of `as_bytes`
-        // see https://github.com/PyO3/pyo3/pull/634
-        let bytes = self.as_bytes().unwrap();
-        String::from_utf8_lossy(bytes)
+        match self.to_string() {
+            Ok(s) => s,
+            Err(_) => {
+                unsafe {
+                    let py_bytes = ffi::PyUnicode_AsEncodedString(
+                        self.0.as_ptr(),
+                        CStr::from_bytes_with_nul(b"utf-8\0").unwrap().as_ptr(),
+                        CStr::from_bytes_with_nul(b"surrogatepass\0")
+                            .unwrap()
+                            .as_ptr(),
+                    );
+                    // Since we have a valid PyString and replace any surrogates, assume success.
+                    debug_assert!(!py_bytes.is_null());
+                    // ensure DECREF will be called
+                    gil::register_pointer(NonNull::new(py_bytes).unwrap());
+                    let buffer = ffi::PyBytes_AsString(py_bytes) as *const u8;
+                    debug_assert!(!buffer.is_null());
+                    let length = ffi::PyBytes_Size(py_bytes) as usize;
+                    let bytes = std::slice::from_raw_parts(buffer, length);
+                    String::from_utf8_lossy(bytes)
+                }
+            }
+        }
    }
 }

@ -308,6 +330,18 @@ mod test {
        assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap());
    }

+    #[test]
+    fn test_to_string_lossy() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let obj: PyObject = py
+            .eval(r#"'🐈 Hello \ud800World'"#, None, None)
+            .unwrap()
+            .into();
+        let py_string = <PyString as PyTryFrom>::try_from(obj.as_ref(py)).unwrap();
+        assert_eq!(py_string.to_string_lossy(), "🐈 Hello <20><><EFBFBD>World");
+    }
+
    #[test]
    fn test_bytes_index() {
        let gil = Python::acquire_gil();