pyo3/types/
string.rs

1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::any::PyAnyMethods;
7use crate::types::bytes::PyBytesMethods;
8use crate::types::PyBytes;
9#[allow(deprecated)]
10use crate::IntoPy;
11use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
12use std::borrow::Cow;
13use std::str;
14
15/// Deprecated alias for [`PyString`].
16#[deprecated(since = "0.23.0", note = "use `PyString` instead")]
17pub type PyUnicode = PyString;
18
/// Borrowed view of the raw storage behind a Python `str`.
///
/// Python keeps string contents in one of several internal layouts; each variant
/// of this enumeration wraps a slice over one of those layouts.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 storage: a slice of `u8` values.
    Ucs1(&'a [u8]),

    /// UCS2 storage: a slice of `u16` values.
    Ucs2(&'a [u16]),

    /// UCS4 storage: a slice of `u32` values.
    Ucs4(&'a [u32]),
}
35
36#[cfg(not(Py_LIMITED_API))]
37impl<'a> PyStringData<'a> {
38    /// Obtain the raw bytes backing this instance as a [u8] slice.
39    pub fn as_bytes(&self) -> &[u8] {
40        match self {
41            Self::Ucs1(s) => s,
42            Self::Ucs2(s) => unsafe {
43                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
44            },
45            Self::Ucs4(s) => unsafe {
46                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
47            },
48        }
49    }
50
51    /// Size in bytes of each value/item in the underlying slice.
52    #[inline]
53    pub fn value_width_bytes(&self) -> usize {
54        match self {
55            Self::Ucs1(_) => 1,
56            Self::Ucs2(_) => 2,
57            Self::Ucs4(_) => 4,
58        }
59    }
60
61    /// Convert the raw data to a Rust string.
62    ///
63    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
64    /// returns an owned string.
65    ///
66    /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
67    /// storage format. This should only occur for strings that were created via Python
68    /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
69    /// never occur for strings that were created from Python code.
70    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
71        use std::ffi::CStr;
72        match self {
73            Self::Ucs1(data) => match str::from_utf8(data) {
74                Ok(s) => Ok(Cow::Borrowed(s)),
75                Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()),
76            },
77            Self::Ucs2(data) => match String::from_utf16(data) {
78                Ok(s) => Ok(Cow::Owned(s)),
79                Err(e) => {
80                    let mut message = e.to_string().as_bytes().to_vec();
81                    message.push(0);
82
83                    Err(PyUnicodeDecodeError::new(
84                        py,
85                        ffi::c_str!("utf-16"),
86                        self.as_bytes(),
87                        0..self.as_bytes().len(),
88                        CStr::from_bytes_with_nul(&message).unwrap(),
89                    )?
90                    .into())
91                }
92            },
93            Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
94                Some(s) => Ok(Cow::Owned(s)),
95                None => Err(PyUnicodeDecodeError::new(
96                    py,
97                    ffi::c_str!("utf-32"),
98                    self.as_bytes(),
99                    0..self.as_bytes().len(),
100                    ffi::c_str!("error converting utf-32"),
101                )?
102                .into()),
103            },
104        }
105    }
106
107    /// Convert the raw data to a Rust string, possibly with data loss.
108    ///
109    /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
110    ///
111    /// Returns a borrow into original data, when possible, or owned data otherwise.
112    ///
113    /// The return value of this function should only disagree with [Self::to_string]
114    /// when that method would error.
115    pub fn to_string_lossy(self) -> Cow<'a, str> {
116        match self {
117            Self::Ucs1(data) => String::from_utf8_lossy(data),
118            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
119            Self::Ucs4(data) => Cow::Owned(
120                data.iter()
121                    .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
122                    .collect(),
123            ),
124        }
125    }
126}
127
128/// Represents a Python `string` (a Unicode string object).
129///
130/// Values of this type are accessed via PyO3's smart pointers, e.g. as
131/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
132///
133/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
134/// [`Bound<'py, PyString>`][Bound].
135///
136/// # Equality
137///
138/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
139/// data in the Python string to a Rust UTF-8 string slice.
140///
141/// This is not always the most appropriate way to compare Python strings, as Python string subclasses
142/// may have different equality semantics. In situations where subclasses overriding equality might be
143/// relevant, use [`PyAnyMethods::eq`], at cost of the additional overhead of a Python method call.
144///
145/// ```rust
146/// # use pyo3::prelude::*;
147/// use pyo3::types::PyString;
148///
149/// # Python::with_gil(|py| {
150/// let py_string = PyString::new(py, "foo");
151/// // via PartialEq<str>
152/// assert_eq!(py_string, "foo");
153///
154/// // via Python equality
155/// assert!(py_string.as_any().eq("foo").unwrap());
156/// # });
157/// ```
158#[repr(transparent)]
159pub struct PyString(PyAny);
160
161pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
162
163impl PyString {
164    /// Creates a new Python string object.
165    ///
166    /// Panics if out of memory.
167    pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
168        let ptr = s.as_ptr().cast();
169        let len = s.len() as ffi::Py_ssize_t;
170        unsafe {
171            ffi::PyUnicode_FromStringAndSize(ptr, len)
172                .assume_owned(py)
173                .downcast_into_unchecked()
174        }
175    }
176
177    /// Deprecated name for [`PyString::new`].
178    #[deprecated(since = "0.23.0", note = "renamed to `PyString::new`")]
179    #[inline]
180    pub fn new_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
181        Self::new(py, s)
182    }
183
184    /// Intern the given string
185    ///
186    /// This will return a reference to the same Python string object if called repeatedly with the same string.
187    ///
188    /// Note that while this is more memory efficient than [`PyString::new_bound`], it unconditionally allocates a
189    /// temporary Python string object and is thereby slower than [`PyString::new_bound`].
190    ///
191    /// Panics if out of memory.
192    pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
193        let ptr = s.as_ptr().cast();
194        let len = s.len() as ffi::Py_ssize_t;
195        unsafe {
196            let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
197            if !ob.is_null() {
198                ffi::PyUnicode_InternInPlace(&mut ob);
199            }
200            ob.assume_owned(py).downcast_into_unchecked()
201        }
202    }
203
204    /// Deprecated name for [`PyString::intern`].
205    #[deprecated(since = "0.23.0", note = "renamed to `PyString::intern`")]
206    #[inline]
207    pub fn intern_bound<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
208        Self::intern(py, s)
209    }
210
211    /// Attempts to create a Python string from a Python [bytes-like object].
212    ///
213    /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
214    pub fn from_object<'py>(
215        src: &Bound<'py, PyAny>,
216        encoding: &str,
217        errors: &str,
218    ) -> PyResult<Bound<'py, PyString>> {
219        unsafe {
220            ffi::PyUnicode_FromEncodedObject(
221                src.as_ptr(),
222                encoding.as_ptr().cast(),
223                errors.as_ptr().cast(),
224            )
225            .assume_owned_or_err(src.py())
226            .downcast_into_unchecked()
227        }
228    }
229
230    /// Deprecated name for [`PyString::from_object`].
231    #[deprecated(since = "0.23.0", note = "renamed to `PyString::from_object`")]
232    #[inline]
233    pub fn from_object_bound<'py>(
234        src: &Bound<'py, PyAny>,
235        encoding: &str,
236        errors: &str,
237    ) -> PyResult<Bound<'py, PyString>> {
238        Self::from_object(src, encoding, errors)
239    }
240}
241
242/// Implementation of functionality for [`PyString`].
243///
244/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
245/// syntax these methods are separated into a trait, because stable Rust does not yet support
246/// `arbitrary_self_types`.
247#[doc(alias = "PyString")]
248pub trait PyStringMethods<'py>: crate::sealed::Sealed {
249    /// Gets the Python string as a Rust UTF-8 string slice.
250    ///
251    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
252    /// (containing unpaired surrogates).
253    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
254    fn to_str(&self) -> PyResult<&str>;
255
256    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
257    ///
258    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
259    /// (containing unpaired surrogates).
260    fn to_cow(&self) -> PyResult<Cow<'_, str>>;
261
262    /// Converts the `PyString` into a Rust string.
263    ///
264    /// Unpaired surrogates invalid UTF-8 sequences are
265    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
266    fn to_string_lossy(&self) -> Cow<'_, str>;
267
268    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
269    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;
270
271    /// Obtains the raw data backing the Python string.
272    ///
273    /// If the Python string object was created through legacy APIs, its internal storage format
274    /// will be canonicalized before data is returned.
275    ///
276    /// # Safety
277    ///
278    /// This function implementation relies on manually decoding a C bitfield. In practice, this
279    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
280    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
281    /// x86_64 platforms.
282    ///
283    /// By using this API, you accept responsibility for testing that PyStringData behaves as
284    /// expected on the targets where you plan to distribute your software.
285    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
286    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
287}
288
289impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
290    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
291    fn to_str(&self) -> PyResult<&str> {
292        self.as_borrowed().to_str()
293    }
294
295    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
296        self.as_borrowed().to_cow()
297    }
298
299    fn to_string_lossy(&self) -> Cow<'_, str> {
300        self.as_borrowed().to_string_lossy()
301    }
302
303    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
304        unsafe {
305            ffi::PyUnicode_AsUTF8String(self.as_ptr())
306                .assume_owned_or_err(self.py())
307                .downcast_into_unchecked::<PyBytes>()
308        }
309    }
310
311    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
312    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
313        self.as_borrowed().data()
314    }
315}
316
317impl<'a> Borrowed<'a, '_, PyString> {
318    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
319    #[allow(clippy::wrong_self_convention)]
320    pub(crate) fn to_str(self) -> PyResult<&'a str> {
321        // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
322        let mut size: ffi::Py_ssize_t = 0;
323        let data: *const u8 =
324            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
325        if data.is_null() {
326            Err(crate::PyErr::fetch(self.py()))
327        } else {
328            Ok(unsafe {
329                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
330            })
331        }
332    }
333
334    #[allow(clippy::wrong_self_convention)]
335    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
336        // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
337        // because all versions then support the more efficient `to_str`.
338        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
339        {
340            self.to_str().map(Cow::Borrowed)
341        }
342
343        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
344        {
345            let bytes = self.encode_utf8()?;
346            Ok(Cow::Owned(
347                unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
348            ))
349        }
350    }
351
352    #[allow(clippy::wrong_self_convention)]
353    fn to_string_lossy(self) -> Cow<'a, str> {
354        let ptr = self.as_ptr();
355        let py = self.py();
356
357        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
358        if let Ok(s) = self.to_str() {
359            return Cow::Borrowed(s);
360        }
361
362        let bytes = unsafe {
363            ffi::PyUnicode_AsEncodedString(
364                ptr,
365                ffi::c_str!("utf-8").as_ptr(),
366                ffi::c_str!("surrogatepass").as_ptr(),
367            )
368            .assume_owned(py)
369            .downcast_into_unchecked::<PyBytes>()
370        };
371        Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
372    }
373
374    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
375    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
376        let ptr = self.as_ptr();
377
378        #[cfg(not(Py_3_12))]
379        #[allow(deprecated)]
380        {
381            let ready = ffi::PyUnicode_READY(ptr);
382            if ready != 0 {
383                // Exception was created on failure.
384                return Err(crate::PyErr::fetch(self.py()));
385            }
386        }
387
388        // The string should be in its canonical form after calling `PyUnicode_READY()`.
389        // And non-canonical form not possible after Python 3.12. So it should be safe
390        // to call these APIs.
391        let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
392        let raw_data = ffi::PyUnicode_DATA(ptr);
393        let kind = ffi::PyUnicode_KIND(ptr);
394
395        match kind {
396            ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
397                raw_data as *const u8,
398                length,
399            ))),
400            ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
401                raw_data as *const u16,
402                length,
403            ))),
404            ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
405                raw_data as *const u32,
406                length,
407            ))),
408            _ => unreachable!(),
409        }
410    }
411}
412
413impl Py<PyString> {
414    /// Gets the Python string as a Rust UTF-8 string slice.
415    ///
416    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
417    /// (containing unpaired surrogates).
418    ///
419    /// Because `str` objects are immutable, the returned slice is independent of
420    /// the GIL lifetime.
421    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
422    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
423        self.bind_borrowed(py).to_str()
424    }
425
426    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
427    ///
428    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
429    /// (containing unpaired surrogates).
430    ///
431    /// Because `str` objects are immutable, the returned slice is independent of
432    /// the GIL lifetime.
433    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
434        self.bind_borrowed(py).to_cow()
435    }
436
437    /// Converts the `PyString` into a Rust string.
438    ///
439    /// Unpaired surrogates invalid UTF-8 sequences are
440    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
441    ///
442    /// Because `str` objects are immutable, the returned slice is independent of
443    /// the GIL lifetime.
444    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
445        self.bind_borrowed(py).to_string_lossy()
446    }
447}
448
449#[allow(deprecated)]
450impl IntoPy<Py<PyString>> for Bound<'_, PyString> {
451    fn into_py(self, _py: Python<'_>) -> Py<PyString> {
452        self.unbind()
453    }
454}
455
456#[allow(deprecated)]
457impl IntoPy<Py<PyString>> for &Bound<'_, PyString> {
458    fn into_py(self, _py: Python<'_>) -> Py<PyString> {
459        self.clone().unbind()
460    }
461}
462
463#[allow(deprecated)]
464impl IntoPy<Py<PyString>> for &'_ Py<PyString> {
465    fn into_py(self, py: Python<'_>) -> Py<PyString> {
466        self.clone_ref(py)
467    }
468}
469
470/// Compares whether the data in the Python string is equal to the given UTF8.
471///
472/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
473impl PartialEq<str> for Bound<'_, PyString> {
474    #[inline]
475    fn eq(&self, other: &str) -> bool {
476        self.as_borrowed() == *other
477    }
478}
479
480/// Compares whether the data in the Python string is equal to the given UTF8.
481///
482/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
483impl PartialEq<&'_ str> for Bound<'_, PyString> {
484    #[inline]
485    fn eq(&self, other: &&str) -> bool {
486        self.as_borrowed() == **other
487    }
488}
489
490/// Compares whether the data in the Python string is equal to the given UTF8.
491///
492/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
493impl PartialEq<Bound<'_, PyString>> for str {
494    #[inline]
495    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
496        *self == other.as_borrowed()
497    }
498}
499
500/// Compares whether the data in the Python string is equal to the given UTF8.
501///
502/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
503impl PartialEq<&'_ Bound<'_, PyString>> for str {
504    #[inline]
505    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
506        *self == other.as_borrowed()
507    }
508}
509
510/// Compares whether the data in the Python string is equal to the given UTF8.
511///
512/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
513impl PartialEq<Bound<'_, PyString>> for &'_ str {
514    #[inline]
515    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
516        **self == other.as_borrowed()
517    }
518}
519
520/// Compares whether the data in the Python string is equal to the given UTF8.
521///
522/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
523impl PartialEq<str> for &'_ Bound<'_, PyString> {
524    #[inline]
525    fn eq(&self, other: &str) -> bool {
526        self.as_borrowed() == other
527    }
528}
529
530/// Compares whether the data in the Python string is equal to the given UTF8.
531///
532/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
533impl PartialEq<str> for Borrowed<'_, '_, PyString> {
534    #[inline]
535    fn eq(&self, other: &str) -> bool {
536        #[cfg(not(Py_3_13))]
537        {
538            self.to_cow().map_or(false, |s| s == other)
539        }
540
541        #[cfg(Py_3_13)]
542        unsafe {
543            ffi::PyUnicode_EqualToUTF8AndSize(
544                self.as_ptr(),
545                other.as_ptr().cast(),
546                other.len() as _,
547            ) == 1
548        }
549    }
550}
551
552/// Compares whether the data in the Python string is equal to the given UTF8.
553///
554/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
555impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
556    #[inline]
557    fn eq(&self, other: &&str) -> bool {
558        *self == **other
559    }
560}
561
562/// Compares whether the data in the Python string is equal to the given UTF8.
563///
564/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
565impl PartialEq<Borrowed<'_, '_, PyString>> for str {
566    #[inline]
567    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
568        other == self
569    }
570}
571
572/// Compares whether the data in the Python string is equal to the given UTF8.
573///
574/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
575impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
576    #[inline]
577    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
578        other == self
579    }
580}
581
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{IntoPyObject, PyObject};

    // NOTE: the `test_string_data_*` tests call `PyStringMethods::data`, which is only
    // compiled when none of `Py_LIMITED_API`, `GraalPy`, `PyPy` is set. Their `cfg` gates
    // therefore list all three so the test build compiles on every configuration.

    #[test]
    fn test_to_cow_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_to_cow_surrogate() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    #[test]
    fn test_to_cow_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    #[test]
    fn test_encode_utf8_unicode() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    #[test]
    fn test_encode_utf8_surrogate() {
        Python::with_gil(|py| {
            let obj: PyObject = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .into();
            assert!(obj
                .bind(py)
                .downcast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    #[test]
    fn test_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .downcast_into::<PyString>()
                .unwrap();

            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    #[test]
    fn test_debug_string() {
        Python::with_gil(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{:?}", s), "'Hello\\n'");
        })
    }

    #[test]
    fn test_display_string() {
        Python::with_gil(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{}", s), "Hello\n");
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1() {
        Python::with_gil(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::with_gil(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs2() {
        Python::with_gil(|py| {
            let s = py.eval(ffi::c_str!("'foo\\ud800'"), None, None).unwrap();
            let py_string = s.downcast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo�".to_string())
            );
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, GraalPy, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::with_gil(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("B�".into()));
        });
    }

    #[test]
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    fn test_string_data_ucs4() {
        Python::with_gil(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, GraalPy, PyPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::with_gil(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).downcast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(&py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
        });
    }

    #[test]
    fn test_intern_string() {
        Python::with_gil(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            // Interning the same text twice must yield the very same object.
            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    #[test]
    fn test_py_to_str_utf8() {
        Python::with_gil(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    #[test]
    fn test_py_to_str_surrogate() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    #[test]
    fn test_py_to_string_lossy() {
        Python::with_gil(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
        })
    }

    #[test]
    fn test_comparisons() {
        Python::with_gil(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            // `Bound<PyString>` against `&str`, in both operand orders.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Bound<PyString>` against `str`, in both operand orders.
            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            // `Borrowed<PyString>` against `&str` and `str`.
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}
⚠️ Internal Docs ⚠️ Not Public API 👉 Official Docs Here