From 2afadb7168b8753bd4a1cddaf8bcd63ea3053927 Mon Sep 17 00:00:00 2001
From: Gregory Szorc
Date: Tue, 10 Aug 2021 16:08:11 -0700
Subject: [PATCH] string: implement API to access raw string data

With the recent implementation of non-limited unicode APIs, we're
able to query Python's low-level state to access the raw bytes that
Python is using to store string objects.

This commit implements a safe Rust API for obtaining a view into
Python's internals and representing the raw bytes Python is using
to store strings.

Not only do we allow accessing what Python has stored internally,
but we also support coercing this data to a `Cow<str>`.

Closes #1776.
---
 CHANGELOG.md        |   4 +
 src/types/string.rs | 197 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 200 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e01866e6584..a77dafdfafb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [Unreleased]
 
+### Added
+
+- Implement `PyString.data()` to access the raw bytes storing a Python string. [#1776](https://github.com/PyO3/pyo3/issues/1776)
+
 ### Changed
 
 - Change `PyErr::fetch()` to return `Option<PyErr>`. [#1717](https://github.com/PyO3/pyo3/pull/1717)
diff --git a/src/types/string.rs b/src/types/string.rs
index f382e33264c..16020f89ae9 100644
--- a/src/types/string.rs
+++ b/src/types/string.rs
@@ -1,14 +1,124 @@
 // Copyright (c) 2017-present PyO3 Project and Contributors
 
+use crate::exceptions::PyUnicodeDecodeError;
 use crate::types::PyBytes;
 use crate::{
     ffi, AsPyPointer, FromPyObject, IntoPy, PyAny, PyNativeType, PyObject, PyResult, PyTryFrom,
     Python, ToPyObject,
 };
 use std::borrow::Cow;
+use std::ffi::CStr;
 use std::os::raw::c_char;
 use std::str;
 
+/// Represents raw data backing a Python `str`.
+///
+/// Python internally stores strings in various representations. This enumeration
+/// represents those variations.
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum PyStringData<'a> {
+    /// UCS1 representation: one byte per character.
+    Ucs1(&'a [u8]),
+
+    /// UCS2 representation: one `u16` per character.
+    Ucs2(&'a [u16]),
+
+    /// UCS4 representation: one `u32` per character.
+    Ucs4(&'a [u32]),
+}
+
+impl<'a> PyStringData<'a> {
+    /// Obtain the raw bytes backing this instance as a `[u8]` slice (native byte order).
+    pub fn as_bytes(&self) -> &[u8] {
+        match self {
+            Self::Ucs1(s) => s,
+            Self::Ucs2(s) => unsafe {
+                std::slice::from_raw_parts(
+                    s.as_ptr() as *const u8,
+                    s.len() * self.character_width_bytes(),
+                )
+            },
+            Self::Ucs4(s) => unsafe {
+                std::slice::from_raw_parts(
+                    s.as_ptr() as *const u8,
+                    s.len() * self.character_width_bytes(),
+                )
+            },
+        }
+    }
+
+    /// Size in bytes of each *character* in the underlying buffer (1, 2 or 4).
+    #[inline]
+    pub fn character_width_bytes(&self) -> usize {
+        match self {
+            Self::Ucs1(_) => 1,
+            Self::Ucs2(_) => 2,
+            Self::Ucs4(_) => 4,
+        }
+    }
+
+    /// Convert the raw data to a Rust string.
+    ///
+    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
+    /// returns an owned string.
+    ///
+    /// Returns [PyUnicodeDecodeError] if the data isn't valid in its purported storage format.
+    #[allow(clippy::wrong_self_convention)]
+    pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
+        match self {
+            Self::Ucs1(data) => match str::from_utf8(data) {
+                Ok(s) => Ok(Cow::Borrowed(s)),
+                Err(e) => Err(crate::PyErr::from_instance(PyUnicodeDecodeError::new_utf8(
+                    py, data, e,
+                )?)),
+            },
+            Self::Ucs2(data) => match String::from_utf16(data) {
+                Ok(s) => Ok(Cow::Owned(s)),
+                Err(e) => {
+                    let mut message = e.to_string().as_bytes().to_vec();
+                    message.push(0);
+
+                    Err(crate::PyErr::from_instance(PyUnicodeDecodeError::new(
+                        py,
+                        CStr::from_bytes_with_nul(b"utf-16\0").unwrap(),
+                        self.as_bytes(),
+                        0..self.as_bytes().len(),
+                        CStr::from_bytes_with_nul(&message).unwrap(),
+                    )?))
+                }
+            },
+            Self::Ucs4(data) => match data.iter().map(|&c| char::from_u32(c)).collect() {
+                Some(s) => Ok(Cow::Owned(s)),
+                None => Err(crate::PyErr::from_instance(PyUnicodeDecodeError::new(
+                    py,
+                    CStr::from_bytes_with_nul(b"utf-32\0").unwrap(),
+                    self.as_bytes(),
+                    0..self.as_bytes().len(),
+                    CStr::from_bytes_with_nul(b"error converting utf-32\0").unwrap(),
+                )?)),
+            },
+        }
+    }
+
+    /// Convert the raw data to a Rust string, possibly with data loss.
+    ///
+    /// Invalid *characters* will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
+    ///
+    /// Returns a borrow into original data, when possible, or owned data otherwise.
+    #[allow(clippy::wrong_self_convention)]
+    pub fn to_string_lossy(&self) -> Cow<str> {
+        match self {
+            Self::Ucs1(data) => String::from_utf8_lossy(data),
+            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
+            Self::Ucs4(data) => Cow::Owned(
+                data.iter()
+                    .map(|&c| char::from_u32(c).unwrap_or('\u{FFFD}'))
+                    .collect(),
+            ),
+        }
+    }
+}
+
 /// Represents a Python `string` (a Unicode string object).
 ///
 /// This type is immutable.
@@ -89,6 +199,48 @@ impl PyString {
             }
         }
     }
+
+    /// Obtains the raw data backing the Python string.
+    ///
+    /// If the Python string object was created through legacy APIs, its internal
+    /// storage format will be canonicalized before data is returned.
+    #[cfg(not(Py_LIMITED_API))]
+    pub fn data(&self) -> PyResult<PyStringData<'_>> {
+        let ptr = self.as_ptr();
+
+        if cfg!(not(Py_3_12)) {
+            #[allow(deprecated)]
+            let ready = unsafe { ffi::PyUnicode_READY(ptr) };
+            if ready != 0 {
+                // A non-zero result means a Python exception was set; surface it.
+                return Err(crate::PyErr::fetch(self.py())
+                    .expect("exception should be set if PyUnicode_READY failed"));
+            }
+        }
+
+        // The string is in its canonical form after a successful `PyUnicode_READY()`,
+        // and a non-canonical form is not possible from Python 3.12 on, so it should
+        // be safe to call these accessors.
+        let length = unsafe { ffi::PyUnicode_GET_LENGTH(ptr) } as usize;
+        let raw_data = unsafe { ffi::PyUnicode_DATA(ptr) };
+        let kind = unsafe { ffi::PyUnicode_KIND(ptr) };
+
+        match kind {
+            ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(unsafe {
+                std::slice::from_raw_parts(raw_data as *const u8, length)
+            })),
+            ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(unsafe {
+                std::slice::from_raw_parts(raw_data as *const u16, length)
+            })),
+            ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(unsafe {
+                std::slice::from_raw_parts(raw_data as *const u32, length)
+            })),
+            kind => Err(crate::exceptions::PyValueError::new_err(format!(
+                "unknown string kind: {}",
+                kind
+            ))),
+        }
+    }
 }
 
 /// Converts a Rust `str` to a Python object.
@@ -192,9 +344,10 @@ impl FromPyObject<'_> for char {
 
 #[cfg(test)]
 mod tests {
-    use super::PyString;
+    use super::{PyString, PyStringData};
     use crate::Python;
     use crate::{FromPyObject, PyObject, PyTryFrom, ToPyObject};
+    use std::borrow::Cow;
 
     #[test]
     fn test_non_bmp() {
@@ -297,4 +450,46 @@ mod tests {
             assert_eq!(format!("{}", s), "Hello\n");
         })
     }
+
+    #[test]
+    #[cfg(not(Py_LIMITED_API))]
+    fn test_string_data_ucs1() {
+        Python::with_gil(|py| {
+            let s = PyString::new(py, "hello, world");
+            let data = s.data().unwrap();
+
+            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
+            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
+            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
+        })
+    }
+
+    #[test]
+    #[cfg(not(Py_LIMITED_API))]
+    fn test_string_data_ucs2() {
+        Python::with_gil(|py| {
+            let s = py.eval("'foo\\ud800'", None, None).unwrap();
+            let py_string = s.cast_as::<PyString>().unwrap();
+            let data = py_string.data().unwrap();
+
+            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
+            assert_eq!(
+                data.to_string_lossy(),
+                Cow::Owned::<str>("foo�".to_string())
+            );
+        })
+    }
+
+    #[test]
+    #[cfg(not(Py_LIMITED_API))]
+    fn test_string_data_ucs4() {
+        Python::with_gil(|py| {
+            let s = "哈哈🐈";
+            let py_string = PyString::new(py, s);
+            let data = py_string.data().unwrap();
+
+            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
+            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
+        })
+    }
 }