From 755dd221e767400a62dd3dec4af71e614cebb313 Mon Sep 17 00:00:00 2001 From: David Hewitt <1939362+davidhewitt@users.noreply.github.com> Date: Thu, 26 Aug 2021 19:29:20 +0100 Subject: [PATCH] pystring: disable `PyString::data` on big-endian targets --- CHANGELOG.md | 1 + src/ffi/cpython/unicodeobject.rs | 74 ++++++++++++++++++----------- src/types/string.rs | 81 +++++++++++++++++++------------- 3 files changed, 96 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a90b465e..52376da0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - `PySequence::get_slice` now returns `PyResult<&PySequence>` instead of `PyResult<&PyAny>`. [#1829](https://github.com/PyO3/pyo3/pull/1829) - Deprecate `PyTuple::split_from`. [#1804](https://github.com/PyO3/pyo3/pull/1804) - Deprecate `PyTuple::slice`, new method `PyTuple::get_slice` added with `usize` indices. [#1828](https://github.com/PyO3/pyo3/pull/1828) +- Mark `PyString::data` as `unsafe` and disable it and some supporting PyUnicode FFI APIs (which depend on a C bitfield) on big-endian targets. [#1834](https://github.com/PyO3/pyo3/pull/1834) ## [0.14.3] - 2021-08-22 diff --git a/src/ffi/cpython/unicodeobject.rs b/src/ffi/cpython/unicodeobject.rs index 7c612435..1fbd1aac 100644 --- a/src/ffi/cpython/unicodeobject.rs +++ b/src/ffi/cpython/unicodeobject.rs @@ -50,29 +50,34 @@ pub struct PyASCIIObject { pub wstr: *mut wchar_t, } +/// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe. +/// +/// In addition, they are disabled on big-endian architectures to restrict this to most "common" +/// platforms, which are at least tested on CI and appear to be sound. +#[cfg(not(target_endian = "big"))] impl PyASCIIObject { #[inline] - pub fn interned(&self) -> c_uint { + pub unsafe fn interned(&self) -> c_uint { self.state & 3 } #[inline] - pub fn kind(&self) -> c_uint { + pub unsafe fn kind(&self) -> c_uint { (self.state >> 2) & 7 } #[inline] - pub fn compact(&self) -> c_uint { + pub unsafe fn compact(&self) -> c_uint { (self.state >> 5) & 1 } #[inline] - pub fn ascii(&self) -> c_uint { + pub unsafe fn ascii(&self) -> c_uint { (self.state >> 6) & 1 } #[inline] - pub fn ready(&self) -> c_uint { + pub unsafe fn ready(&self) -> c_uint { (self.state >> 7) & 1 } } @@ -114,6 +119,7 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1; pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2; #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint { debug_assert!(PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -122,11 +128,13 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint { (*(op as *mut PyASCIIObject)).compact() } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint { if (*(op as *mut PyASCIIObject)).ascii() != 0 && PyUnicode_IS_COMPACT(op) != 0 { 1 @@ -144,21 +152,25 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2; pub const PyUnicode_4BYTE_KIND: c_uint = 4; #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 { PyUnicode_DATA(op) as *mut Py_UCS1 } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 { PyUnicode_DATA(op) as *mut Py_UCS2 } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 { PyUnicode_DATA(op) as *mut Py_UCS4 } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint { debug_assert!(PyUnicode_Check(op) != 0); debug_assert!(PyUnicode_IS_READY(op) != 0); @@ -167,6 +179,7 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void { if PyUnicode_IS_ASCII(op) != 0 { (op as *mut PyASCIIObject).offset(1) as *mut c_void @@ -176,6 +189,7 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void { debug_assert!(!(*(op as *mut PyUnicodeObject)).data.any.is_null()); @@ -183,6 +197,7 @@ pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void { debug_assert!(PyUnicode_Check(op) != 0); @@ -206,6 +221,7 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t { } #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint { (*(op as *mut PyASCIIObject)).ready() } @@ -213,6 +229,7 @@ pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint { #[cfg(not(Py_3_12))] #[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))] #[inline] +#[cfg(not(target_endian = "big"))] pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int { debug_assert!(PyUnicode_Check(op) != 0); @@ -481,6 +498,7 @@ extern "C" { // skipped _PyUnicode_ScanIdentifier #[cfg(test)] +#[cfg(not(target_endian = "big"))] mod tests { use super::*; use crate::types::PyString; @@ -498,30 +516,32 @@ mod tests { wstr: std::ptr::null_mut() as *mut wchar_t, }; - assert_eq!(o.interned(), 0); - assert_eq!(o.kind(), 0); - assert_eq!(o.compact(), 0); - assert_eq!(o.ascii(), 0); - assert_eq!(o.ready(), 0); + unsafe { + assert_eq!(o.interned(), 0); + assert_eq!(o.kind(), 0); + assert_eq!(o.compact(), 0); + assert_eq!(o.ascii(), 0); + assert_eq!(o.ready(), 0); - for i in 0..4 { - o.state = i; - assert_eq!(o.interned(), i); + for i in 0..4 { + o.state = i; + assert_eq!(o.interned(), i); + } + + for i in 0..8 { + o.state = i << 2; + assert_eq!(o.kind(), i); + } + + o.state = 1 << 5; + assert_eq!(o.compact(), 1); + + o.state = 1 << 6; + assert_eq!(o.ascii(), 1); + + o.state = 1 << 7; + assert_eq!(o.ready(), 1); } - - for i in 0..8 { - o.state = i << 2; - assert_eq!(o.kind(), i); - } - - o.state = 1 << 5; - assert_eq!(o.compact(), 1); - - o.state = 1 << 6; - assert_eq!(o.ascii(), 1); - - o.state = 1 << 7; - assert_eq!(o.ready(), 1); } #[test] diff --git a/src/types/string.rs b/src/types/string.rs index cfccd178..27b63880 100644 --- a/src/types/string.rs +++ b/src/types/string.rs @@ -17,8 +17,8 @@ use std::str; /// /// Python internally stores strings in various representations. This enumeration /// represents those variations. -#[cfg(not(Py_LIMITED_API))] -#[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))] +#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] +#[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))] #[derive(Clone, Copy, Debug, PartialEq)] pub enum PyStringData<'a> { /// UCS1 representation. @@ -31,7 +31,7 @@ pub enum PyStringData<'a> { Ucs4(&'a [u32]), } -#[cfg(not(Py_LIMITED_API))] +#[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] impl<'a> PyStringData<'a> { /// Obtain the raw bytes backing this instance as a [u8] slice. pub fn as_bytes(&self) -> &[u8] { @@ -211,16 +211,28 @@ impl PyString { /// Obtains the raw data backing the Python string. /// - /// If the Python string object was created through legacy APIs, its internal - /// storage format will be canonicalized before data is returned. - #[cfg(not(Py_LIMITED_API))] - #[cfg_attr(docsrs, doc(cfg(not(Py_LIMITED_API))))] - pub fn data(&self) -> PyResult> { + /// If the Python string object was created through legacy APIs, its internal storage format + /// will be canonicalized before data is returned. + /// + /// # Safety + /// + /// This function implementation relies on manually decoding a C bitfield. In practice, this + /// works well on common little-endian architectures such as x86_64, where the bitfield has a + /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on + /// x86_64 platforms. + /// + /// By using this API, you accept responsibility for testing that PyStringData behaves as + /// expected on the targets where you plan to distribute your software. + /// + /// For example, it is known not to work on big-endian platforms. + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] + #[cfg_attr(docsrs, doc(cfg(not(any(Py_LIMITED_API, target_endian = "big")))))] + pub unsafe fn data(&self) -> PyResult> { let ptr = self.as_ptr(); if cfg!(not(Py_3_12)) { #[allow(deprecated)] - let ready = unsafe { ffi::PyUnicode_READY(ptr) }; + let ready = ffi::PyUnicode_READY(ptr); if ready != 0 { // Exception was created on failure. return Err(crate::PyErr::api_call_failed(self.py())); @@ -230,20 +242,23 @@ impl PyString { // The string should be in its canonical form after calling `PyUnicode_READY()`. // And non-canonical form not possible after Python 3.12. So it should be safe // to call these APIs. - let length = unsafe { ffi::PyUnicode_GET_LENGTH(ptr) } as usize; - let raw_data = unsafe { ffi::PyUnicode_DATA(ptr) }; - let kind = unsafe { ffi::PyUnicode_KIND(ptr) }; + let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize; + let raw_data = ffi::PyUnicode_DATA(ptr); + let kind = ffi::PyUnicode_KIND(ptr); match kind { - ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(unsafe { - std::slice::from_raw_parts(raw_data as *const u8, length) - })), - ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(unsafe { - std::slice::from_raw_parts(raw_data as *const u16, length) - })), - ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(unsafe { - std::slice::from_raw_parts(raw_data as *const u32, length) - })), + ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts( + raw_data as *const u8, + length, + ))), + ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts( + raw_data as *const u16, + length, + ))), + ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts( + raw_data as *const u32, + length, + ))), _ => unreachable!(), } } @@ -465,11 +480,11 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs1() { Python::with_gil(|py| { let s = PyString::new(py, "hello, world"); - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs1(b"hello, world")); assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world")); @@ -478,7 +493,7 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs1_invalid() { Python::with_gil(|py| { // 0xfe is not allowed in UTF-8. @@ -492,7 +507,7 @@ mod tests { }; assert!(!ptr.is_null()); let s: &PyString = unsafe { py.from_owned_ptr(ptr) }; - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs1(b"f\xfe")); let err = data.to_string(py).unwrap_err(); assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py)); @@ -504,12 +519,12 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs2() { Python::with_gil(|py| { let s = py.eval("'foo\\ud800'", None, None).unwrap(); let py_string = s.cast_as::().unwrap(); - let data = py_string.data().unwrap(); + let data = unsafe { py_string.data().unwrap() }; assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800])); assert_eq!( @@ -520,7 +535,7 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs2_invalid() { Python::with_gil(|py| { // U+FF22 (valid) & U+d800 (never valid) @@ -534,7 +549,7 @@ mod tests { }; assert!(!ptr.is_null()); let s: &PyString = unsafe { py.from_owned_ptr(ptr) }; - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800])); let err = data.to_string(py).unwrap_err(); assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py)); @@ -546,12 +561,12 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs4() { Python::with_gil(|py| { let s = "哈哈🐈"; let py_string = PyString::new(py, s); - let data = py_string.data().unwrap(); + let data = unsafe { py_string.data().unwrap() }; assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008])); assert_eq!(data.to_string_lossy(), Cow::Owned::(s.to_string())); @@ -559,7 +574,7 @@ mod tests { } #[test] - #[cfg(not(Py_LIMITED_API))] + #[cfg(not(any(Py_LIMITED_API, target_endian = "big")))] fn test_string_data_ucs4_invalid() { Python::with_gil(|py| { // U+20000 (valid) & U+d800 (never valid) @@ -573,7 +588,7 @@ mod tests { }; assert!(!ptr.is_null()); let s: &PyString = unsafe { py.from_owned_ptr(ptr) }; - let data = s.data().unwrap(); + let data = unsafe { s.data().unwrap() }; assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800])); let err = data.to_string(py).unwrap_err(); assert_eq!(err.ptype(py), PyUnicodeDecodeError::type_object(py));