From defa43015a3b2b8aebfcd542be6fbfe84b9c6f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Niederb=C3=BChl?= Date: Mon, 21 Oct 2019 15:48:01 +0200 Subject: [PATCH] Fix handling of invalid utf-8 sequences in PyString::to_string_lossy --- CHANGELOG.md | 5 +++ examples/rustapi_module/src/buf_and_str.rs | 5 +++ .../rustapi_module/tests/test_buf_and_str.py | 6 +++ src/types/string.rs | 42 +++++++++++++++++-- 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4861ad83..e1b4e338 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * FFI compatibility for PEP 590 Vectorcall. +### Fixed + +* Fix handling of invalid utf-8 sequences in `PyString::as_bytes` [#639](https://github.com/PyO3/pyo3/pull/639) +and `PyString::to_string_lossy` [#642](https://github.com/PyO3/pyo3/pull/642). + ## [0.8.1] ### Added diff --git a/examples/rustapi_module/src/buf_and_str.rs b/examples/rustapi_module/src/buf_and_str.rs index 82ce977f..c21c6db3 100644 --- a/examples/rustapi_module/src/buf_and_str.rs +++ b/examples/rustapi_module/src/buf_and_str.rs @@ -22,6 +22,11 @@ impl BytesExtractor { let rust_string: String = string.extract().unwrap(); Ok(rust_string.len()) } + + pub fn from_str_lossy(&mut self, string: &PyString) -> PyResult { + let rust_string_lossy: String = string.to_string_lossy().to_string(); + Ok(rust_string_lossy.len()) + } } #[pymodule] diff --git a/examples/rustapi_module/tests/test_buf_and_str.py b/examples/rustapi_module/tests/test_buf_and_str.py index 429e2ce6..1ebd85d6 100644 --- a/examples/rustapi_module/tests/test_buf_and_str.py +++ b/examples/rustapi_module/tests/test_buf_and_str.py @@ -30,6 +30,7 @@ def test_pybuffer_doesnot_leak_memory(): message_b = b'\\(-"-;) Praying that memory leak would not happen..' message_s = '\\(-"-;) Praying that memory leak would not happen..' + message_surrogate = '\\(-"-;) Praying that memory leak would not happen.. \ud800' def from_bytes(): extractor.from_bytes(message_b) @@ -37,9 +38,14 @@ def test_pybuffer_doesnot_leak_memory(): def from_str(): extractor.from_str(message_s) + def from_str_lossy(): + extractor.from_str_lossy(message_surrogate) + # Running the memory_diff to warm-up the garbage collector memory_diff(from_bytes) memory_diff(from_str) + memory_diff(from_str_lossy) assert memory_diff(from_bytes) == 0 assert memory_diff(from_str) == 0 + assert memory_diff(from_str_lossy) == 0 diff --git a/src/types/string.rs b/src/types/string.rs index 460ccf3e..12d8ee34 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -3,6 +3,7 @@ use crate::conversion::FromPyObject; use crate::conversion::{PyTryFrom, ToPyObject}; use crate::err::{PyErr, PyResult}; +use crate::gil; use crate::instance::PyNativeType; use crate::object::PyObject; use crate::types::PyAny; @@ -11,8 +12,10 @@ use crate::IntoPy; use crate::Python; use crate::{ffi, FromPy}; use std::borrow::Cow; +use std::ffi::CStr; use std::ops::Index; use std::os::raw::c_char; +use std::ptr::NonNull; use std::slice::SliceIndex; use std::str; @@ -87,10 +90,29 @@ impl PyString { /// Unpaired surrogates invalid UTF-8 sequences are /// replaced with U+FFFD REPLACEMENT CHARACTER. pub fn to_string_lossy(&self) -> Cow { - // TODO: Handle error of `as_bytes` - // see https://github.com/PyO3/pyo3/pull/634 - let bytes = self.as_bytes().unwrap(); - String::from_utf8_lossy(bytes) + match self.to_string() { + Ok(s) => s, + Err(_) => { + unsafe { + let py_bytes = ffi::PyUnicode_AsEncodedString( + self.0.as_ptr(), + CStr::from_bytes_with_nul(b"utf-8\0").unwrap().as_ptr(), + CStr::from_bytes_with_nul(b"surrogatepass\0") + .unwrap() + .as_ptr(), + ); + // Since we have a valid PyString and replace any surrogates, assume success. + debug_assert!(!py_bytes.is_null()); + // ensure DECREF will be called + gil::register_pointer(NonNull::new(py_bytes).unwrap()); + let buffer = ffi::PyBytes_AsString(py_bytes) as *const u8; + debug_assert!(!buffer.is_null()); + let length = ffi::PyBytes_Size(py_bytes) as usize; + let bytes = std::slice::from_raw_parts(buffer, length); + String::from_utf8_lossy(bytes) + } + } + } } } @@ -308,6 +330,18 @@ mod test { assert_eq!(Cow::Borrowed(s), py_string.to_string().unwrap()); } + #[test] + fn test_to_string_lossy() { + let gil = Python::acquire_gil(); + let py = gil.python(); + let obj: PyObject = py + .eval(r#"'🐈 Hello \ud800World'"#, None, None) + .unwrap() + .into(); + let py_string = ::try_from(obj.as_ref(py)).unwrap(); + assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World"); + } + #[test] fn test_bytes_index() { let gil = Python::acquire_gil();