mirror of
https://github.com/pybind/pybind11.git
synced 2024-11-25 06:35:12 +00:00
Changing pybind11::str to exclusively hold PyUnicodeObject (#2409)
* Changing pybind11::str to exclusively hold PyUnicodeObject
This commit is contained in:
parent
587d5f840a
commit
0432ae7c52
@ -6,10 +6,15 @@ Changelog
|
||||
Starting with version 1.8.0, pybind11 releases use a `semantic versioning
|
||||
<http://semver.org>`_ policy.
|
||||
|
||||
v2.6.3 (TBA, not yet released)
|
||||
v2.7.0 (TBA, not yet released)
|
||||
------------------------------
|
||||
|
||||
* Details to follow here
|
||||
* ``py::str`` changed to exclusively hold `PyUnicodeObject`. Previously
|
||||
``py::str`` could also hold `bytes`, which is probably surprising, was
|
||||
never documented, and can mask bugs (e.g. accidental use of ``py::str``
|
||||
instead of ``py::bytes``).
|
||||
`#2409 <https://github.com/pybind/pybind11/pull/2409>`_
|
||||
|
||||
|
||||
v2.6.2 (Jan 26, 2021)
|
||||
---------------------
|
||||
|
@ -10,6 +10,31 @@ modernization and other useful information.
|
||||
|
||||
.. _upgrade-guide-2.6:
|
||||
|
||||
v2.7
|
||||
====
|
||||
|
||||
*Before* v2.7, ``py::str`` can hold ``PyUnicodeObject`` or ``PyBytesObject``,
|
||||
and ``py::isinstance<str>()`` is ``true`` for both ``py::str`` and
|
||||
``py::bytes``. Starting with v2.7, ``py::str`` exclusively holds
|
||||
``PyUnicodeObject`` (`#2409 <https://github.com/pybind/pybind11/pull/2409>`_),
|
||||
and ``py::isinstance<str>()`` is ``true`` only for ``py::str``. To help in
|
||||
the transition of user code, the ``PYBIND11_STR_LEGACY_PERMISSIVE`` macro
|
||||
is provided as an escape hatch to go back to the legacy behavior. This macro
|
||||
will be removed in future releases. Two types of required fixes are expected
|
||||
to be common:
|
||||
|
||||
* Accidental use of ``py::str`` instead of ``py::bytes``, masked by the legacy
|
||||
behavior. These are probably very easy to fix, by changing from
|
||||
``py::str`` to ``py::bytes``.
|
||||
|
||||
* Reliance on py::isinstance<str>(obj) being ``true`` for
|
||||
``py::bytes``. This is likely to be easy to fix in most cases by adding
|
||||
``|| py::isinstance<bytes>(obj)``, but a fix may be more involved, e.g. if
|
||||
``py::isinstance<T>`` appears in a template. Such situations will require
|
||||
careful review and custom fixes.
|
||||
|
||||
|
||||
|
||||
v2.6
|
||||
====
|
||||
|
||||
|
@ -1656,6 +1656,17 @@ struct pyobject_caster {
|
||||
|
||||
template <typename T = type, enable_if_t<std::is_base_of<object, T>::value, int> = 0>
|
||||
bool load(handle src, bool /* convert */) {
|
||||
#if PY_MAJOR_VERSION < 3 && !defined(PYBIND11_STR_LEGACY_PERMISSIVE)
|
||||
// For Python 2, without this implicit conversion, Python code would
|
||||
// need to be cluttered with six.ensure_text() or similar, only to be
|
||||
// un-cluttered later after Python 2 support is dropped.
|
||||
if (std::is_same<T, str>::value && isinstance<bytes>(src)) {
|
||||
PyObject *str_from_bytes = PyUnicode_FromEncodedObject(src.ptr(), "utf-8", nullptr);
|
||||
if (!str_from_bytes) throw error_already_set();
|
||||
value = reinterpret_steal<type>(str_from_bytes);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
if (!isinstance<type>(src))
|
||||
return false;
|
||||
value = reinterpret_borrow<type>(src);
|
||||
|
@ -163,6 +163,19 @@
|
||||
#include <typeindex>
|
||||
#include <type_traits>
|
||||
|
||||
// #define PYBIND11_STR_LEGACY_PERMISSIVE
|
||||
// If DEFINED, pybind11::str can hold PyUnicodeObject or PyBytesObject
|
||||
// (probably surprising and never documented, but this was the
|
||||
// legacy behavior until and including v2.6.x). As a side-effect,
|
||||
// pybind11::isinstance<str>() is true for both pybind11::str and
|
||||
// pybind11::bytes.
|
||||
// If UNDEFINED, pybind11::str can only hold PyUnicodeObject, and
|
||||
// pybind11::isinstance<str>() is true only for pybind11::str.
|
||||
// However, for Python 2 only (!), the pybind11::str caster
|
||||
// implicitly decodes bytes to PyUnicodeObject. This is to ease
|
||||
// the transition from the legacy behavior to the non-permissive
|
||||
// behavior.
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
|
||||
#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
|
||||
#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check
|
||||
|
@ -756,7 +756,12 @@ inline bool PyIterable_Check(PyObject *obj) {
|
||||
inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
|
||||
inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; }
|
||||
|
||||
#ifdef PYBIND11_STR_LEGACY_PERMISSIVE
|
||||
inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); }
|
||||
#define PYBIND11_STR_CHECK_FUN detail::PyUnicode_Check_Permissive
|
||||
#else
|
||||
#define PYBIND11_STR_CHECK_FUN PyUnicode_Check
|
||||
#endif
|
||||
|
||||
inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; }
|
||||
|
||||
@ -936,7 +941,7 @@ class bytes;
|
||||
|
||||
class str : public object {
|
||||
public:
|
||||
PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
|
||||
PYBIND11_OBJECT_CVT(str, object, PYBIND11_STR_CHECK_FUN, raw_str)
|
||||
|
||||
str(const char *c, size_t n)
|
||||
: object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {
|
||||
|
@ -144,7 +144,7 @@ template <typename Type, typename Value> struct list_caster {
|
||||
using value_conv = make_caster<Value>;
|
||||
|
||||
bool load(handle src, bool convert) {
|
||||
if (!isinstance<sequence>(src) || isinstance<str>(src))
|
||||
if (!isinstance<sequence>(src) || isinstance<bytes>(src) || isinstance<str>(src))
|
||||
return false;
|
||||
auto s = reinterpret_borrow<sequence>(src);
|
||||
value.clear();
|
||||
|
@ -413,4 +413,15 @@ TEST_SUBMODULE(pytypes, m) {
|
||||
|
||||
// test_builtin_functions
|
||||
m.def("get_len", [](py::handle h) { return py::len(h); });
|
||||
|
||||
#ifdef PYBIND11_STR_LEGACY_PERMISSIVE
|
||||
m.attr("PYBIND11_STR_LEGACY_PERMISSIVE") = true;
|
||||
#endif
|
||||
|
||||
m.def("isinstance_pybind11_bytes", [](py::object o) { return py::isinstance<py::bytes>(o); });
|
||||
m.def("isinstance_pybind11_str", [](py::object o) { return py::isinstance<py::str>(o); });
|
||||
|
||||
m.def("pass_to_pybind11_bytes", [](py::bytes b) { return py::len(b); });
|
||||
m.def("pass_to_pybind11_str", [](py::str s) { return py::len(s); });
|
||||
m.def("pass_to_std_string", [](std::string s) { return s.size(); });
|
||||
}
|
||||
|
@ -120,14 +120,17 @@ def test_str(doc):
|
||||
assert s1 == s2
|
||||
|
||||
malformed_utf8 = b"\x80"
|
||||
assert m.str_from_object(malformed_utf8) is malformed_utf8 # To be fixed; see #2380
|
||||
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
|
||||
assert m.str_from_object(malformed_utf8) is malformed_utf8
|
||||
elif env.PY2:
|
||||
with pytest.raises(UnicodeDecodeError):
|
||||
m.str_from_object(malformed_utf8)
|
||||
else:
|
||||
assert m.str_from_object(malformed_utf8) == "b'\\x80'"
|
||||
if env.PY2:
|
||||
# with pytest.raises(UnicodeDecodeError):
|
||||
# m.str_from_object(malformed_utf8)
|
||||
with pytest.raises(UnicodeDecodeError):
|
||||
m.str_from_handle(malformed_utf8)
|
||||
else:
|
||||
# assert m.str_from_object(malformed_utf8) == "b'\\x80'"
|
||||
assert m.str_from_handle(malformed_utf8) == "b'\\x80'"
|
||||
|
||||
|
||||
@ -303,13 +306,26 @@ def test_pybind11_str_raw_str():
|
||||
valid_orig = u"DZ"
|
||||
valid_utf8 = valid_orig.encode("utf-8")
|
||||
valid_cvt = cvt(valid_utf8)
|
||||
assert type(valid_cvt) == bytes # Probably surprising.
|
||||
assert valid_cvt == b"\xc7\xb1"
|
||||
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
|
||||
assert valid_cvt is valid_utf8
|
||||
else:
|
||||
assert type(valid_cvt) is unicode if env.PY2 else str # noqa: F821
|
||||
if env.PY2:
|
||||
assert valid_cvt == valid_orig
|
||||
else:
|
||||
assert valid_cvt == "b'\\xc7\\xb1'"
|
||||
|
||||
malformed_utf8 = b"\x80"
|
||||
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
|
||||
assert cvt(malformed_utf8) is malformed_utf8
|
||||
else:
|
||||
if env.PY2:
|
||||
with pytest.raises(UnicodeDecodeError):
|
||||
cvt(malformed_utf8)
|
||||
else:
|
||||
malformed_cvt = cvt(malformed_utf8)
|
||||
assert type(malformed_cvt) == bytes # Probably surprising.
|
||||
assert malformed_cvt == b"\x80"
|
||||
assert type(malformed_cvt) is str
|
||||
assert malformed_cvt == "b'\\x80'"
|
||||
|
||||
|
||||
def test_implicit_casting():
|
||||
@ -488,3 +504,40 @@ def test_builtin_functions():
|
||||
"object of type 'generator' has no len()",
|
||||
"'generator' has no length",
|
||||
] # PyPy
|
||||
|
||||
|
||||
def test_isinstance_string_types():
|
||||
assert m.isinstance_pybind11_bytes(b"")
|
||||
assert not m.isinstance_pybind11_bytes(u"")
|
||||
|
||||
assert m.isinstance_pybind11_str(u"")
|
||||
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
|
||||
assert m.isinstance_pybind11_str(b"")
|
||||
else:
|
||||
assert not m.isinstance_pybind11_str(b"")
|
||||
|
||||
|
||||
def test_pass_bytes_or_unicode_to_string_types():
|
||||
assert m.pass_to_pybind11_bytes(b"Bytes") == 5
|
||||
with pytest.raises(TypeError):
|
||||
m.pass_to_pybind11_bytes(u"Str")
|
||||
|
||||
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE") or env.PY2:
|
||||
assert m.pass_to_pybind11_str(b"Bytes") == 5
|
||||
else:
|
||||
with pytest.raises(TypeError):
|
||||
m.pass_to_pybind11_str(b"Bytes")
|
||||
assert m.pass_to_pybind11_str(u"Str") == 3
|
||||
|
||||
assert m.pass_to_std_string(b"Bytes") == 5
|
||||
assert m.pass_to_std_string(u"Str") == 3
|
||||
|
||||
malformed_utf8 = b"\x80"
|
||||
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
|
||||
assert m.pass_to_pybind11_str(malformed_utf8) == 1
|
||||
elif env.PY2:
|
||||
with pytest.raises(UnicodeDecodeError):
|
||||
m.pass_to_pybind11_str(malformed_utf8)
|
||||
else:
|
||||
with pytest.raises(TypeError):
|
||||
m.pass_to_pybind11_str(malformed_utf8)
|
||||
|
Loading…
Reference in New Issue
Block a user