Merge pull request #1777 from indygreg/unicode-apis

ffi: define some cpython/unicodeobject bindings
2021-08-14 23:07:49 +01:00 · 2021-08-14 23:07:49 +01:00 · eafc75ab06
parent 584de688c3 807c8ee33c
commit eafc75ab06
4 changed files with 609 additions and 145 deletions
--- a/src/ffi/cpython/mod.rs
+++ b/src/ffi/cpython/mod.rs
@ -20,6 +20,7 @@ pub(crate) mod pydebug;
 #[cfg(all(Py_3_8, not(PyPy)))]
 pub(crate) mod pylifecycle;
 pub(crate) mod pystate;
+pub(crate) mod unicodeobject;

 pub use self::abstract_::*;
 #[cfg(not(PyPy))]
@ -40,3 +41,4 @@ pub use self::pydebug::*;
 #[cfg(all(Py_3_8, not(PyPy)))]
 pub use self::pylifecycle::*;
 pub use self::pystate::*;
+pub use self::unicodeobject::*;
--- a/src/ffi/cpython/unicodeobject.rs
+++ b/src/ffi/cpython/unicodeobject.rs
@ -0,0 +1,605 @@
+use crate::ffi::{
+    PyObject, PyUnicode_Check, Py_UCS1, Py_UCS2, Py_UCS4, Py_UNICODE, Py_hash_t, Py_ssize_t,
+};
+use libc::wchar_t;
+use std::os::raw::{c_char, c_int, c_uint, c_void};
+
+// skipped Py_UNICODE_ISSPACE()
+// skipped Py_UNICODE_ISLOWER()
+// skipped Py_UNICODE_ISUPPER()
+// skipped Py_UNICODE_ISTITLE()
+// skipped Py_UNICODE_ISLINEBREAK
+// skipped Py_UNICODE_TOLOWER
+// skipped Py_UNICODE_TOUPPER
+// skipped Py_UNICODE_TOTITLE
+// skipped Py_UNICODE_ISDECIMAL
+// skipped Py_UNICODE_ISDIGIT
+// skipped Py_UNICODE_ISNUMERIC
+// skipped Py_UNICODE_ISPRINTABLE
+// skipped Py_UNICODE_TODECIMAL
+// skipped Py_UNICODE_TODIGIT
+// skipped Py_UNICODE_TONUMERIC
+// skipped Py_UNICODE_ISALPHA
+// skipped Py_UNICODE_ISALNUM
+// skipped Py_UNICODE_COPY
+// skipped Py_UNICODE_FILL
+// skipped Py_UNICODE_IS_SURROGATE
+// skipped Py_UNICODE_IS_HIGH_SURROGATE
+// skipped Py_UNICODE_IS_LOW_SURROGATE
+// skipped Py_UNICODE_JOIN_SURROGATES
+// skipped Py_UNICODE_HIGH_SURROGATE
+// skipped Py_UNICODE_LOW_SURROGATE
+
+/// Rust mirror of CPython's `PyASCIIObject`, the first layer of the unicode
+/// object layout. Declared `#[repr(C)]` so the fields are laid out in
+/// declaration order like the C struct.
+///
+/// NOTE(review): the presence of `wstr` presumably ties this layout to CPython
+/// versions that still carry that member — confirm against supported versions.
+#[repr(C)]
+pub struct PyASCIIObject {
+    /// Common object header shared by all Python objects.
+    pub ob_base: PyObject,
+    /// Length as reported by `PyUnicode_GET_LENGTH`.
+    pub length: Py_ssize_t,
+    /// NOTE(review): presumably the cached string hash — confirm.
+    pub hash: Py_hash_t,
+    /// A bit field with various properties.
+    ///
+    /// Rust doesn't expose bitfields. So we have accessor functions for
+    /// retrieving values.
+    ///
+    /// unsigned int interned:2; // SSTATE_* constants.
+    /// unsigned int kind:3;     // PyUnicode_*_KIND constants.
+    /// unsigned int compact:1;
+    /// unsigned int ascii:1;
+    /// unsigned int ready:1;
+    /// unsigned int :24;
+    ///
+    /// NOTE(review): the accessors read these fields starting at the least
+    /// significant bit; C bit-field allocation order is ABI-defined, so
+    /// confirm this holds on big-endian targets.
+    pub state: u32,
+    /// NOTE(review): presumably the legacy `wchar_t` representation — confirm.
+    pub wstr: *mut wchar_t,
+}
+
+impl PyASCIIObject {
+    /// Shifts `state` right by `shift` bits and keeps the bits selected by `mask`.
+    #[inline]
+    fn state_field(&self, shift: u32, mask: u32) -> c_uint {
+        (self.state >> shift) & mask
+    }
+
+    /// The 2-bit `interned` field (bits 0-1 of `state`).
+    #[inline]
+    pub fn interned(&self) -> c_uint {
+        self.state_field(0, 3)
+    }
+
+    /// The 3-bit `kind` field (bits 2-4 of `state`).
+    #[inline]
+    pub fn kind(&self) -> c_uint {
+        self.state_field(2, 7)
+    }
+
+    /// The 1-bit `compact` flag (bit 5 of `state`).
+    #[inline]
+    pub fn compact(&self) -> c_uint {
+        self.state_field(5, 1)
+    }
+
+    /// The 1-bit `ascii` flag (bit 6 of `state`).
+    #[inline]
+    pub fn ascii(&self) -> c_uint {
+        self.state_field(6, 1)
+    }
+
+    /// The 1-bit `ready` flag (bit 7 of `state`).
+    #[inline]
+    pub fn ready(&self) -> c_uint {
+        self.state_field(7, 1)
+    }
+}
+
+/// Rust mirror of CPython's `PyCompactUnicodeObject`: a `PyASCIIObject`
+/// followed by UTF-8 and `wstr` bookkeeping fields.
+#[repr(C)]
+pub struct PyCompactUnicodeObject {
+    /// The embedded `PyASCIIObject` that starts the layout.
+    pub _base: PyASCIIObject,
+    /// NOTE(review): presumably the byte length of `utf8` — confirm.
+    pub utf8_length: Py_ssize_t,
+    /// NOTE(review): presumably a cached UTF-8 representation — confirm.
+    pub utf8: *mut c_char,
+    /// NOTE(review): presumably the length of the base object's `wstr` — confirm.
+    pub wstr_length: Py_ssize_t,
+}
+
+/// Data pointer stored in a `PyUnicodeObject`, viewable at each character
+/// width.
+///
+/// All members alias the same pointer, so reading any of them requires
+/// `unsafe`. The members are public so that code outside this module can
+/// inspect `PyUnicodeObject::data`; with private members the public union was
+/// opaque to every consumer of these bindings.
+#[repr(C)]
+pub union PyUnicodeObjectData {
+    /// Untyped view of the buffer pointer.
+    pub any: *mut c_void,
+    /// The buffer viewed as `Py_UCS1` units.
+    pub latin1: *mut Py_UCS1,
+    /// The buffer viewed as `Py_UCS2` units.
+    pub ucs2: *mut Py_UCS2,
+    /// The buffer viewed as `Py_UCS4` units.
+    pub ucs4: *mut Py_UCS4,
+}
+
+/// Rust mirror of CPython's `PyUnicodeObject`: a `PyCompactUnicodeObject`
+/// followed by a data pointer.
+#[repr(C)]
+pub struct PyUnicodeObject {
+    /// The embedded `PyCompactUnicodeObject` that starts the layout.
+    pub _base: PyCompactUnicodeObject,
+    /// Data pointer; `_PyUnicode_NONCOMPACT_DATA` reads it through `data.any`.
+    pub data: PyUnicodeObjectData,
+}
+
+extern "C" {
+    /// Binding for CPython's private `_PyUnicode_CheckConsistency`.
+    ///
+    /// Only declared when not building for PyPy.
+    #[cfg(not(PyPy))]
+    pub fn _PyUnicode_CheckConsistency(op: *mut PyObject, check_content: c_int) -> c_int;
+}
+
+// skipped PyUnicode_GET_SIZE
+// skipped PyUnicode_GET_DATA_SIZE
+// skipped PyUnicode_AS_UNICODE
+// skipped PyUnicode_AS_DATA
+
+// Values of the `interned` bit field of `PyASCIIObject::state`
+// (see `PyASCIIObject::interned`).
+pub const SSTATE_NOT_INTERNED: c_uint = 0;
+pub const SSTATE_INTERNED_MORTAL: c_uint = 1;
+pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2;
+
+/// Returns the `ascii` flag from the state bit field of `op`.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyASCIIObject`, so it must point to a valid
+/// unicode object. Debug builds additionally assert that `PyUnicode_Check`
+/// and `PyUnicode_IS_READY` are non-zero for `op`.
+#[inline]
+pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
+    debug_assert!(PyUnicode_Check(op) != 0);
+    debug_assert!(PyUnicode_IS_READY(op) != 0);
+
+    let ascii_obj = op as *mut PyASCIIObject;
+    (*ascii_obj).ascii()
+}
+
+/// Returns the `compact` flag from the state bit field of `op`.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyASCIIObject` without any check, so it must
+/// point to a valid unicode object.
+#[inline]
+pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint {
+    let ascii_obj = op as *mut PyASCIIObject;
+    (*ascii_obj).compact()
+}
+
+/// Returns `1` when both the `ascii` and `compact` flags of `op` are set,
+/// and `0` otherwise.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyASCIIObject` without any check, so it must
+/// point to a valid unicode object.
+#[inline]
+pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint {
+    let is_ascii = (*(op as *mut PyASCIIObject)).ascii() != 0;
+    // `&&` keeps the original short-circuit: the compact flag is only read
+    // when the ascii flag is set.
+    c_uint::from(is_ascii && PyUnicode_IS_COMPACT(op) != 0)
+}
+
+// Values of the `kind` bit field of `PyASCIIObject::state`
+// (see `PyASCIIObject::kind` and `PyUnicode_KIND`).
+
+/// Only defined when `Py_3_12` is not set; marked deprecated under `Py_3_10`.
+#[cfg(not(Py_3_12))]
+#[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
+pub const PyUnicode_WCHAR_KIND: c_uint = 0;
+
+pub const PyUnicode_1BYTE_KIND: c_uint = 1;
+pub const PyUnicode_2BYTE_KIND: c_uint = 2;
+pub const PyUnicode_4BYTE_KIND: c_uint = 4;
+
+/// Returns `PyUnicode_DATA(op)` reinterpreted as a pointer to `Py_UCS1`.
+///
+/// # Safety
+///
+/// Same requirements as `PyUnicode_DATA`. No check is made that the string
+/// actually stores 1-byte units.
+#[inline]
+pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 {
+    PyUnicode_DATA(op).cast::<Py_UCS1>()
+}
+
+/// Returns `PyUnicode_DATA(op)` reinterpreted as a pointer to `Py_UCS2`.
+///
+/// # Safety
+///
+/// Same requirements as `PyUnicode_DATA`. No check is made that the string
+/// actually stores 2-byte units.
+#[inline]
+pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 {
+    PyUnicode_DATA(op).cast::<Py_UCS2>()
+}
+
+/// Returns `PyUnicode_DATA(op)` reinterpreted as a pointer to `Py_UCS4`.
+///
+/// # Safety
+///
+/// Same requirements as `PyUnicode_DATA`. No check is made that the string
+/// actually stores 4-byte units.
+#[inline]
+pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 {
+    PyUnicode_DATA(op).cast::<Py_UCS4>()
+}
+
+/// Returns the `kind` field from the state bit field of `op`; compare the
+/// result against the `PyUnicode_*_KIND` constants.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyASCIIObject`, so it must point to a valid
+/// unicode object. Debug builds additionally assert that `PyUnicode_Check`
+/// and `PyUnicode_IS_READY` are non-zero for `op`.
+#[inline]
+pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
+    debug_assert!(PyUnicode_Check(op) != 0);
+    debug_assert!(PyUnicode_IS_READY(op) != 0);
+
+    let ascii_obj = op as *mut PyASCIIObject;
+    (*ascii_obj).kind()
+}
+
+/// Returns the address immediately following the object header of `op`.
+///
+/// The header is taken to be a `PyASCIIObject` when `PyUnicode_IS_ASCII`
+/// reports non-zero, and a `PyCompactUnicodeObject` otherwise.
+///
+/// # Safety
+///
+/// Same requirements as `PyUnicode_IS_ASCII`; the pointer arithmetic further
+/// requires that `op` really is laid out with the selected header type.
+#[inline]
+pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
+    if PyUnicode_IS_ASCII(op) == 0 {
+        (op as *mut PyCompactUnicodeObject).add(1).cast()
+    } else {
+        (op as *mut PyASCIIObject).add(1).cast()
+    }
+}
+
+/// Returns the `data.any` pointer stored in `op`, viewed as a
+/// `PyUnicodeObject`.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyUnicodeObject` without any type check, so it
+/// must point to a valid object with that layout. Debug builds assert that
+/// the stored pointer is non-null.
+#[inline]
+pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
+    let unicode = op as *mut PyUnicodeObject;
+    debug_assert!(!(*unicode).data.any.is_null());
+
+    (*unicode).data.any
+}
+
+/// Returns the untyped character data pointer of `op`.
+///
+/// Dispatches on the `compact` flag: compact objects go through
+/// `_PyUnicode_COMPACT_DATA`, all others through `_PyUnicode_NONCOMPACT_DATA`.
+///
+/// # Safety
+///
+/// `op` must point to a valid unicode object; debug builds assert that
+/// `PyUnicode_Check` is non-zero. The requirements of the selected helper
+/// apply as well.
+#[inline]
+pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
+    debug_assert!(PyUnicode_Check(op) != 0);
+
+    match PyUnicode_IS_COMPACT(op) {
+        0 => _PyUnicode_NONCOMPACT_DATA(op),
+        _ => _PyUnicode_COMPACT_DATA(op),
+    }
+}
+
+// skipped PyUnicode_WRITE
+// skipped PyUnicode_READ
+// skipped PyUnicode_READ_CHAR
+
+/// Returns the `length` field of `op`, viewed as a `PyASCIIObject`.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyASCIIObject`, so it must point to a valid
+/// unicode object. Debug builds additionally assert that `PyUnicode_Check`
+/// and `PyUnicode_IS_READY` are non-zero for `op`.
+#[inline]
+pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
+    debug_assert!(PyUnicode_Check(op) != 0);
+    debug_assert!(PyUnicode_IS_READY(op) != 0);
+
+    let ascii_obj = op as *mut PyASCIIObject;
+    (*ascii_obj).length
+}
+
+/// Returns the `ready` flag from the state bit field of `op`.
+///
+/// # Safety
+///
+/// `op` is dereferenced as a `PyASCIIObject` without any check, so it must
+/// point to a valid unicode object.
+#[inline]
+pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
+    let ascii_obj = op as *mut PyASCIIObject;
+    (*ascii_obj).ready()
+}
+
+/// Returns `0` if `op` already has its `ready` flag set; otherwise forwards
+/// to `_PyUnicode_Ready` and returns its result.
+///
+/// Only defined when `Py_3_12` is not set; marked deprecated under `Py_3_10`.
+///
+/// # Safety
+///
+/// `op` must point to a valid unicode object; debug builds assert that
+/// `PyUnicode_Check` is non-zero.
+#[cfg(not(Py_3_12))]
+#[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
+#[inline]
+pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int {
+    debug_assert!(PyUnicode_Check(op) != 0);
+
+    if PyUnicode_IS_READY(op) == 0 {
+        _PyUnicode_Ready(op)
+    } else {
+        0
+    }
+}
+
+// skipped PyUnicode_MAX_CHAR_VALUE
+// skipped _PyUnicode_get_wstr_length
+// skipped PyUnicode_WSTR_LENGTH
+
+// Raw bindings for object construction, character copy/fill, and the
+// `Py_UNICODE`-based conversions. Where a `link_name` is given, PyPy builds
+// link against the `PyPy`-prefixed symbol instead.
+extern "C" {
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_New")]
+    pub fn PyUnicode_New(size: Py_ssize_t, maxchar: Py_UCS4) -> *mut PyObject;
+    /// Called by `PyUnicode_READY` when the object's `ready` flag is not set.
+    #[cfg_attr(PyPy, link_name = "_PyPyUnicode_Ready")]
+    pub fn _PyUnicode_Ready(unicode: *mut PyObject) -> c_int;
+
+    // skipped _PyUnicode_Copy
+
+    /// Not declared for PyPy builds.
+    #[cfg(not(PyPy))]
+    #[cfg_attr(docsrs, doc(cfg(not(PyPy))))]
+    pub fn PyUnicode_CopyCharacters(
+        to: *mut PyObject,
+        to_start: Py_ssize_t,
+        from: *mut PyObject,
+        from_start: Py_ssize_t,
+        how_many: Py_ssize_t,
+    ) -> Py_ssize_t;
+
+    // skipped _PyUnicode_FastCopyCharacters
+
+    /// Not declared for PyPy builds.
+    #[cfg(not(PyPy))]
+    #[cfg_attr(docsrs, doc(cfg(not(PyPy))))]
+    pub fn PyUnicode_Fill(
+        unicode: *mut PyObject,
+        start: Py_ssize_t,
+        length: Py_ssize_t,
+        fill_char: Py_UCS4,
+    ) -> Py_ssize_t;
+
+    // skipped _PyUnicode_FastFill
+
+    /// Deprecated; only declared when `Py_3_12` is not set.
+    #[cfg(not(Py_3_12))]
+    #[deprecated]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_FromUnicode")]
+    pub fn PyUnicode_FromUnicode(u: *const Py_UNICODE, size: Py_ssize_t) -> *mut PyObject;
+
+    /// NOTE(review): `kind` is presumably one of the `PyUnicode_*_KIND`
+    /// constants, which are declared as `c_uint` while this parameter is
+    /// `c_int` — confirm and cast at call sites.
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_FromKindAndData")]
+    pub fn PyUnicode_FromKindAndData(
+        kind: c_int,
+        buffer: *const c_void,
+        size: Py_ssize_t,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_FromASCII
+    // skipped _PyUnicode_FindMaxChar
+
+    /// Deprecated; only declared when `Py_3_12` is not set.
+    #[cfg(not(Py_3_12))]
+    #[deprecated]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUnicode")]
+    pub fn PyUnicode_AsUnicode(unicode: *mut PyObject) -> *mut Py_UNICODE;
+
+    // skipped _PyUnicode_AsUnicode
+
+    /// Deprecated; only declared when `Py_3_12` is not set.
+    #[cfg(not(Py_3_12))]
+    #[deprecated]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUnicodeAndSize")]
+    pub fn PyUnicode_AsUnicodeAndSize(
+        unicode: *mut PyObject,
+        size: *mut Py_ssize_t,
+    ) -> *mut Py_UNICODE;
+
+    // skipped PyUnicode_GetMax
+}
+
+// skipped _PyUnicodeWriter
+// skipped _PyUnicodeWriter_Init
+// skipped _PyUnicodeWriter_Prepare
+// skipped _PyUnicodeWriter_PrepareInternal
+// skipped _PyUnicodeWriter_PrepareKind
+// skipped _PyUnicodeWriter_PrepareKindInternal
+// skipped _PyUnicodeWriter_WriteChar
+// skipped _PyUnicodeWriter_WriteStr
+// skipped _PyUnicodeWriter_WriteSubstring
+// skipped _PyUnicodeWriter_WriteASCIIString
+// skipped _PyUnicodeWriter_WriteLatin1String
+// skipped _PyUnicodeWriter_Finish
+// skipped _PyUnicodeWriter_Dealloc
+// skipped _PyUnicode_FormatAdvancedWriter
+
+// Raw bindings for the UTF-8 accessors and the encoders that take a
+// `Py_UNICODE` buffer. Where a `link_name` is given, PyPy builds link against
+// the `PyPy`-prefixed symbol instead.
+extern "C" {
+    // The two `PyUnicode_AsUTF8AndSize` declarations are mutually exclusive:
+    // under `Py_3_7` the result is `*const c_char`, otherwise `*mut c_char`.
+    #[cfg(Py_3_7)]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8AndSize")]
+    pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
+
+    #[cfg(not(Py_3_7))]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8AndSize")]
+    pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *mut c_char;
+
+    // skipped _PyUnicode_AsStringAndSize
+
+    // Same split for `PyUnicode_AsUTF8`: `*const c_char` under `Py_3_7`,
+    // `*mut c_char` otherwise.
+    #[cfg(Py_3_7)]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8")]
+    pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
+
+    #[cfg(not(Py_3_7))]
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8")]
+    pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *mut c_char;
+
+    // skipped _PyUnicode_AsString
+
+    // The remaining functions all take a `Py_UNICODE` buffer plus its length.
+    // NOTE(review): unlike `PyUnicode_FromUnicode` they carry no
+    // `#[deprecated]` or version `cfg` here — confirm whether they should.
+
+    pub fn PyUnicode_Encode(
+        s: *const Py_UNICODE,
+        size: Py_ssize_t,
+        encoding: *const c_char,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    pub fn PyUnicode_EncodeUTF7(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        base64SetO: c_int,
+        base64WhiteSpace: c_int,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_EncodeUTF7
+    // skipped _PyUnicode_AsUTF8String
+
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeUTF8")]
+    pub fn PyUnicode_EncodeUTF8(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    pub fn PyUnicode_EncodeUTF32(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        errors: *const c_char,
+        byteorder: c_int,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_EncodeUTF32
+
+    pub fn PyUnicode_EncodeUTF16(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        errors: *const c_char,
+        byteorder: c_int,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_EncodeUTF16
+    // skipped _PyUnicode_DecodeUnicodeEscape
+
+    pub fn PyUnicode_EncodeUnicodeEscape(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+    ) -> *mut PyObject;
+
+    pub fn PyUnicode_EncodeRawUnicodeEscape(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_AsLatin1String
+
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeLatin1")]
+    pub fn PyUnicode_EncodeLatin1(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_AsASCIIString
+
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeASCII")]
+    pub fn PyUnicode_EncodeASCII(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    pub fn PyUnicode_EncodeCharmap(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        mapping: *mut PyObject,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_EncodeCharmap
+
+    pub fn PyUnicode_TranslateCharmap(
+        data: *const Py_UNICODE,
+        length: Py_ssize_t,
+        table: *mut PyObject,
+        errors: *const c_char,
+    ) -> *mut PyObject;
+
+    // skipped PyUnicode_EncodeMBCS
+
+    // Note that the next two take `*mut Py_UNICODE` rather than `*const`.
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeDecimal")]
+    pub fn PyUnicode_EncodeDecimal(
+        s: *mut Py_UNICODE,
+        length: Py_ssize_t,
+        output: *mut c_char,
+        errors: *const c_char,
+    ) -> c_int;
+
+    #[cfg_attr(PyPy, link_name = "PyPyUnicode_TransformDecimalToASCII")]
+    pub fn PyUnicode_TransformDecimalToASCII(
+        s: *mut Py_UNICODE,
+        length: Py_ssize_t,
+    ) -> *mut PyObject;
+
+    // skipped _PyUnicode_TransformDecimalAndSpaceToASCII
+}
+
+// skipped _PyUnicode_JoinArray
+// skipped _PyUnicode_EqualToASCIIId
+// skipped _PyUnicode_EqualToASCIIString
+// skipped _PyUnicode_XStrip
+// skipped _PyUnicode_InsertThousandsGrouping
+
+// skipped _Py_ascii_whitespace
+
+// skipped _PyUnicode_IsLowercase
+// skipped _PyUnicode_IsUppercase
+// skipped _PyUnicode_IsTitlecase
+// skipped _PyUnicode_IsXidStart
+// skipped _PyUnicode_IsXidContinue
+// skipped _PyUnicode_IsWhitespace
+// skipped _PyUnicode_IsLinebreak
+// skipped _PyUnicode_ToLowercase
+// skipped _PyUnicode_ToUppercase
+// skipped _PyUnicode_ToTitlecase
+// skipped _PyUnicode_ToLowerFull
+// skipped _PyUnicode_ToTitleFull
+// skipped _PyUnicode_ToUpperFull
+// skipped _PyUnicode_ToFoldedFull
+// skipped _PyUnicode_IsCaseIgnorable
+// skipped _PyUnicode_IsCased
+// skipped _PyUnicode_ToDecimalDigit
+// skipped _PyUnicode_ToDigit
+// skipped _PyUnicode_ToNumeric
+// skipped _PyUnicode_IsDecimalDigit
+// skipped _PyUnicode_IsDigit
+// skipped _PyUnicode_IsNumeric
+// skipped _PyUnicode_IsPrintable
+// skipped _PyUnicode_IsAlpha
+// skipped Py_UNICODE_strlen
+// skipped Py_UNICODE_strcpy
+// skipped Py_UNICODE_strcat
+// skipped Py_UNICODE_strncpy
+// skipped Py_UNICODE_strcmp
+// skipped Py_UNICODE_strncmp
+// skipped Py_UNICODE_strchr
+// skipped Py_UNICODE_strrchr
+// skipped _PyUnicode_FormatLong
+// skipped PyUnicode_AsUnicodeCopy
+// skipped _PyUnicode_FromId
+// skipped _PyUnicode_EQ
+// skipped _PyUnicode_ScanIdentifier
+
+/// Tests for the bit-field accessors and the macro-equivalent functions.
+///
+/// The `ascii` and `ucs4` tests inspect live interpreter objects and so rely
+/// on implementation details of how `PyString` is stored.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::PyString;
+    use crate::{AsPyPointer, Python};
+
+    /// Each accessor must decode exactly its own bits of `state`.
+    #[test]
+    fn ascii_object_bitfield() {
+        // The header is never dereferenced as a Python object here; only the
+        // `state` accessors are exercised, so an all-zero header suffices.
+        let ob_base: PyObject = unsafe { std::mem::zeroed() };
+
+        let mut o = PyASCIIObject {
+            ob_base,
+            length: 0,
+            hash: 0,
+            state: 0,
+            wstr: std::ptr::null_mut(),
+        };
+
+        assert_eq!(o.interned(), 0);
+        assert_eq!(o.kind(), 0);
+        assert_eq!(o.compact(), 0);
+        assert_eq!(o.ascii(), 0);
+        assert_eq!(o.ready(), 0);
+
+        for i in 0..4 {
+            o.state = i;
+            assert_eq!(o.interned(), i);
+        }
+
+        for i in 0..8 {
+            o.state = i << 2;
+            assert_eq!(o.kind(), i);
+        }
+
+        o.state = 1 << 5;
+        assert_eq!(o.compact(), 1);
+
+        o.state = 1 << 6;
+        assert_eq!(o.ascii(), 1);
+
+        o.state = 1 << 7;
+        assert_eq!(o.ready(), 1);
+    }
+
+    /// A pure-ASCII string is expected to be compact, ASCII and 1-byte kind.
+    #[test]
+    #[cfg_attr(Py_3_10, allow(deprecated))]
+    fn ascii() {
+        Python::with_gil(|py| {
+            // This test relies on implementation details of PyString.
+            let s = PyString::new(py, "hello, world");
+            let ptr = s.as_ptr();
+
+            unsafe {
+                let ascii_ptr = ptr as *mut PyASCIIObject;
+                let ascii = ascii_ptr.as_ref().unwrap();
+
+                assert_eq!(ascii.interned(), 0);
+                assert_eq!(ascii.kind(), PyUnicode_1BYTE_KIND);
+                assert_eq!(ascii.compact(), 1);
+                assert_eq!(ascii.ascii(), 1);
+                assert_eq!(ascii.ready(), 1);
+
+                assert_eq!(PyUnicode_IS_ASCII(ptr), 1);
+                assert_eq!(PyUnicode_IS_COMPACT(ptr), 1);
+                assert_eq!(PyUnicode_IS_COMPACT_ASCII(ptr), 1);
+
+                assert!(!PyUnicode_1BYTE_DATA(ptr).is_null());
+                // 2 and 4 byte macros return nonsense for this string instance.
+                assert_eq!(PyUnicode_KIND(ptr), PyUnicode_1BYTE_KIND);
+
+                assert!(!_PyUnicode_COMPACT_DATA(ptr).is_null());
+                // _PyUnicode_NONCOMPACT_DATA isn't valid for compact strings.
+                assert!(!PyUnicode_DATA(ptr).is_null());
+
+                assert_eq!(PyUnicode_GET_LENGTH(ptr), s.len().unwrap() as _);
+                assert_eq!(PyUnicode_IS_READY(ptr), 1);
+
+                // This has potential to mutate object. But it should be a no-op since
+                // we're already ready.
+                // `PyUnicode_READY` is only defined when `Py_3_12` is not set,
+                // so the assertion carries the same gate to keep this test
+                // compiling everywhere.
+                #[cfg(not(Py_3_12))]
+                {
+                    assert_eq!(PyUnicode_READY(ptr), 0);
+                }
+            }
+        })
+    }
+
+    /// A string with a non-BMP character is expected to be compact, non-ASCII
+    /// and 4-byte kind.
+    #[test]
+    #[cfg_attr(Py_3_10, allow(deprecated))]
+    fn ucs4() {
+        Python::with_gil(|py| {
+            let s = "哈哈🐈";
+            let py_string = PyString::new(py, s);
+            let ptr = py_string.as_ptr();
+
+            unsafe {
+                let ascii_ptr = ptr as *mut PyASCIIObject;
+                let ascii = ascii_ptr.as_ref().unwrap();
+
+                assert_eq!(ascii.interned(), 0);
+                assert_eq!(ascii.kind(), PyUnicode_4BYTE_KIND);
+                assert_eq!(ascii.compact(), 1);
+                assert_eq!(ascii.ascii(), 0);
+                assert_eq!(ascii.ready(), 1);
+
+                assert_eq!(PyUnicode_IS_ASCII(ptr), 0);
+                assert_eq!(PyUnicode_IS_COMPACT(ptr), 1);
+                assert_eq!(PyUnicode_IS_COMPACT_ASCII(ptr), 0);
+
+                assert!(!PyUnicode_4BYTE_DATA(ptr).is_null());
+                assert_eq!(PyUnicode_KIND(ptr), PyUnicode_4BYTE_KIND);
+
+                assert!(!_PyUnicode_COMPACT_DATA(ptr).is_null());
+                // _PyUnicode_NONCOMPACT_DATA isn't valid for compact strings.
+                assert!(!PyUnicode_DATA(ptr).is_null());
+
+                assert_eq!(PyUnicode_GET_LENGTH(ptr), py_string.len().unwrap() as _);
+                assert_eq!(PyUnicode_IS_READY(ptr), 1);
+
+                // This has potential to mutate object. But it should be a no-op since
+                // we're already ready.
+                // `PyUnicode_READY` is only defined when `Py_3_12` is not set,
+                // so the assertion carries the same gate to keep this test
+                // compiling everywhere.
+                #[cfg(not(Py_3_12))]
+                {
+                    assert_eq!(PyUnicode_READY(ptr), 0);
+                }
+            }
+        })
+    }
+}
--- a/src/ffi/unicodeobject.rs
+++ b/src/ffi/unicodeobject.rs
@ -40,40 +40,11 @@ pub unsafe fn PyUnicode_CheckExact(op: *mut PyObject) -> c_int {
 pub const Py_UNICODE_REPLACEMENT_CHARACTER: Py_UCS4 = 0xFFFD;

 extern "C" {
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_New(size: Py_ssize_t, maxchar: Py_UCS4) -> *mut PyObject;
-
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_CopyCharacters(
-        to: *mut PyObject,
-        to_start: Py_ssize_t,
-        from: *mut PyObject,
-        from_start: Py_ssize_t,
-        how_many: Py_ssize_t,
-    ) -> Py_ssize_t;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_Fill(
-        unicode: *mut PyObject,
-        start: Py_ssize_t,
-        length: Py_ssize_t,
-        fill_char: Py_UCS4,
-    ) -> Py_ssize_t;
-    #[cfg(all(not(Py_LIMITED_API), not(Py_3_12)))]
-    #[deprecated]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_FromUnicode")]
-    pub fn PyUnicode_FromUnicode(u: *const Py_UNICODE, size: Py_ssize_t) -> *mut PyObject;

    #[cfg_attr(PyPy, link_name = "PyPyUnicode_FromStringAndSize")]
    pub fn PyUnicode_FromStringAndSize(u: *const c_char, size: Py_ssize_t) -> *mut PyObject;
    pub fn PyUnicode_FromString(u: *const c_char) -> *mut PyObject;

-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_FromKindAndData(
-        kind: c_int,
-        buffer: *const c_void,
-        size: Py_ssize_t,
-    ) -> *mut PyObject;
-
    pub fn PyUnicode_Substring(
        str: *mut PyObject,
        start: Py_ssize_t,
@ -86,17 +57,6 @@ extern "C" {
        copy_null: c_int,
    ) -> *mut Py_UCS4;
    pub fn PyUnicode_AsUCS4Copy(unicode: *mut PyObject) -> *mut Py_UCS4;
-    #[cfg(all(not(Py_LIMITED_API), not(Py_3_12)))]
-    #[deprecated]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUnicode")]
-    pub fn PyUnicode_AsUnicode(unicode: *mut PyObject) -> *mut Py_UNICODE;
-    #[cfg(all(not(Py_LIMITED_API), not(Py_3_12)))]
-    #[deprecated]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUnicodeAndSize")]
-    pub fn PyUnicode_AsUnicodeAndSize(
-        unicode: *mut PyObject,
-        size: *mut Py_ssize_t,
-    ) -> *mut Py_UNICODE;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_GetLength")]
    pub fn PyUnicode_GetLength(unicode: *mut PyObject) -> Py_ssize_t;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_GetSize")]
@ -143,20 +103,6 @@ extern "C" {
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_FromOrdinal")]
    pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
    pub fn PyUnicode_ClearFreeList() -> c_int;
-    #[cfg(any(not(Py_LIMITED_API), Py_3_10))]
-    #[cfg(Py_3_7)]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8AndSize")]
-    pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
-    #[cfg(not(Py_3_7))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8AndSize")]
-    pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *mut c_char;
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg(Py_3_7)]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8")]
-    pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
-    #[cfg(not(Py_3_7))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8")]
-    pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *mut c_char;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_GetDefaultEncoding")]
    pub fn PyUnicode_GetDefaultEncoding() -> *const c_char;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_Decode")]
@ -176,13 +122,6 @@ extern "C" {
        encoding: *const c_char,
        errors: *const c_char,
    ) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_Encode(
-        s: *const Py_UNICODE,
-        size: Py_ssize_t,
-        encoding: *const c_char,
-        errors: *const c_char,
-    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsEncodedObject")]
    pub fn PyUnicode_AsEncodedObject(
        unicode: *mut PyObject,
@ -212,14 +151,6 @@ extern "C" {
        errors: *const c_char,
        consumed: *mut Py_ssize_t,
    ) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_EncodeUTF7(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        base64SetO: c_int,
-        base64WhiteSpace: c_int,
-        errors: *const c_char,
-    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_DecodeUTF8")]
    pub fn PyUnicode_DecodeUTF8(
        string: *const c_char,
@ -234,13 +165,6 @@ extern "C" {
    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF8String")]
    pub fn PyUnicode_AsUTF8String(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeUTF8")]
-    pub fn PyUnicode_EncodeUTF8(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        errors: *const c_char,
-    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_DecodeUTF32")]
    pub fn PyUnicode_DecodeUTF32(
        string: *const c_char,
@ -257,13 +181,6 @@ extern "C" {
    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF32String")]
    pub fn PyUnicode_AsUTF32String(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_EncodeUTF32(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        errors: *const c_char,
-        byteorder: c_int,
-    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_DecodeUTF16")]
    pub fn PyUnicode_DecodeUTF16(
        string: *const c_char,
@ -280,13 +197,6 @@ extern "C" {
    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUTF16String")]
    pub fn PyUnicode_AsUTF16String(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_EncodeUTF16(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        errors: *const c_char,
-        byteorder: c_int,
-    ) -> *mut PyObject;
    pub fn PyUnicode_DecodeUnicodeEscape(
        string: *const c_char,
        length: Py_ssize_t,
@ -294,22 +204,12 @@ extern "C" {
    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsUnicodeEscapeString")]
    pub fn PyUnicode_AsUnicodeEscapeString(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_EncodeUnicodeEscape(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-    ) -> *mut PyObject;
    pub fn PyUnicode_DecodeRawUnicodeEscape(
        string: *const c_char,
        length: Py_ssize_t,
        errors: *const c_char,
    ) -> *mut PyObject;
    pub fn PyUnicode_AsRawUnicodeEscapeString(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_EncodeRawUnicodeEscape(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_DecodeLatin1")]
    pub fn PyUnicode_DecodeLatin1(
        string: *const c_char,
@ -318,13 +218,6 @@ extern "C" {
    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsLatin1String")]
    pub fn PyUnicode_AsLatin1String(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeLatin1")]
-    pub fn PyUnicode_EncodeLatin1(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        errors: *const c_char,
-    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_DecodeASCII")]
    pub fn PyUnicode_DecodeASCII(
        string: *const c_char,
@ -333,13 +226,6 @@ extern "C" {
    ) -> *mut PyObject;
    #[cfg_attr(PyPy, link_name = "PyPyUnicode_AsASCIIString")]
    pub fn PyUnicode_AsASCIIString(unicode: *mut PyObject) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeASCII")]
-    pub fn PyUnicode_EncodeASCII(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        errors: *const c_char,
-    ) -> *mut PyObject;
    pub fn PyUnicode_DecodeCharmap(
        string: *const c_char,
        length: Py_ssize_t,
@ -350,35 +236,6 @@ extern "C" {
        unicode: *mut PyObject,
        mapping: *mut PyObject,
    ) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_EncodeCharmap(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        mapping: *mut PyObject,
-        errors: *const c_char,
-    ) -> *mut PyObject;
-    #[cfg(not(Py_LIMITED_API))]
-    pub fn PyUnicode_TranslateCharmap(
-        data: *const Py_UNICODE,
-        length: Py_ssize_t,
-        table: *mut PyObject,
-        errors: *const c_char,
-    ) -> *mut PyObject;
-
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_EncodeDecimal")]
-    pub fn PyUnicode_EncodeDecimal(
-        s: *mut Py_UNICODE,
-        length: Py_ssize_t,
-        output: *mut c_char,
-        errors: *const c_char,
-    ) -> c_int;
-    #[cfg(not(Py_LIMITED_API))]
-    #[cfg_attr(PyPy, link_name = "PyPyUnicode_TransformDecimalToASCII")]
-    pub fn PyUnicode_TransformDecimalToASCII(
-        s: *mut Py_UNICODE,
-        length: Py_ssize_t,
-    ) -> *mut PyObject;
    pub fn PyUnicode_DecodeLocaleAndSize(
        str: *const c_char,
        len: Py_ssize_t,
--- a/src/types/string.rs
+++ b/src/types/string.rs
@ -49,8 +49,8 @@ impl PyString {
    pub fn to_str(&self) -> PyResult<&str> {
        let utf8_slice = {
            cfg_if::cfg_if! {
-                if #[cfg(any(not(Py_LIMITED_API), Py_3_10))] {
-                    // PyUnicode_AsUTF8AndSize only available on limited API from Python 3.10 and up.
+                if #[cfg(not(Py_LIMITED_API))] {
+                    // PyUnicode_AsUTF8AndSize is only used when not building against the limited API.
                    let mut size: ffi::Py_ssize_t = 0;
                    let data = unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) };
                    if data.is_null() {