From 662eecfb447830f169eeb2b8b23793f105e194c4 Mon Sep 17 00:00:00 2001
From: David Hewitt
Date: Mon, 5 Feb 2024 12:22:10 +0000
Subject: [PATCH] add `PyStringMethods::encode_utf8`

---
 newsfragments/3801.added.md |  1 +
 src/types/string.rs         | 39 ++++++++++++++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 newsfragments/3801.added.md

diff --git a/newsfragments/3801.added.md b/newsfragments/3801.added.md
new file mode 100644
index 00000000..78f45032
--- /dev/null
+++ b/newsfragments/3801.added.md
@@ -0,0 +1 @@
+Add `PyStringMethods::encode_utf8`.
diff --git a/src/types/string.rs b/src/types/string.rs
index b1457950..2b263571 100644
--- a/src/types/string.rs
+++ b/src/types/string.rs
@@ -305,6 +305,9 @@ pub trait PyStringMethods<'py> {
     /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
     fn to_string_lossy(&self) -> Cow<'_, str>;
 
+    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
+    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;
+
     /// Obtains the raw data backing the Python string.
     ///
     /// If the Python string object was created through legacy APIs, its internal storage format
@@ -337,6 +340,14 @@ impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
         self.as_borrowed().to_string_lossy()
     }
 
+    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
+        unsafe {
+            ffi::PyUnicode_AsUTF8String(self.as_ptr())
+                .assume_owned_or_err(self.py())
+                .downcast_into_unchecked::<PyBytes>()
+        }
+    }
+
     #[cfg(not(Py_LIMITED_API))]
     unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
         self.as_borrowed().data()
@@ -371,11 +382,7 @@ impl<'a> Borrowed<'a, '_, PyString> {
 
         #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
         {
-            let bytes = unsafe {
-                ffi::PyUnicode_AsUTF8String(self.as_ptr())
-                    .assume_owned_or_err(self.py())?
-                    .downcast_into_unchecked::<PyBytes>()
-            };
+            let bytes = self.encode_utf8()?;
             Ok(Cow::Owned(
                 unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
             ))
@@ -535,6 +542,28 @@ mod tests {
         })
     }
 
+    #[test]
+    fn test_encode_utf8_unicode() {
+        Python::with_gil(|py| {
+            let s = "哈哈🐈";
+            let obj = PyString::new_bound(py, s);
+            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
+        })
+    }
+
+    #[test]
+    fn test_encode_utf8_surrogate() {
+        Python::with_gil(|py| {
+            let obj: PyObject = py.eval(r"'\ud800'", None, None).unwrap().into();
+            assert!(obj
+                .bind(py)
+                .downcast::<PyString>()
+                .unwrap()
+                .encode_utf8()
+                .is_err());
+        })
+    }
+
     #[test]
     fn test_to_string_lossy() {
         Python::with_gil(|py| {