Changing pybind11::str to exclusively hold PyUnicodeObject (#2409)

* Changing pybind11::str to exclusively hold PyUnicodeObject
This commit is contained in:
Ralf W. Grosse-Kunstleve 2021-01-29 12:41:42 -05:00 committed by GitHub
parent 587d5f840a
commit 0432ae7c52
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 136 additions and 13 deletions

View File

@ -6,10 +6,15 @@ Changelog
Starting with version 1.8.0, pybind11 releases use a `semantic versioning
<http://semver.org>`_ policy.
v2.6.3 (TBA, not yet released)
v2.7.0 (TBA, not yet released)
------------------------------
* Details to follow here
* ``py::str`` changed to exclusively hold `PyUnicodeObject`. Previously
``py::str`` could also hold `bytes`, which is probably surprising, was
never documented, and can mask bugs (e.g. accidental use of ``py::str``
instead of ``py::bytes``).
`#2409 <https://github.com/pybind/pybind11/pull/2409>`_
v2.6.2 (Jan 26, 2021)
---------------------

View File

@ -10,6 +10,31 @@ modernization and other useful information.
.. _upgrade-guide-2.6:
v2.7
====
*Before* v2.7, ``py::str`` can hold ``PyUnicodeObject`` or ``PyBytesObject``,
and ``py::isinstance<str>()`` is ``true`` for both ``py::str`` and
``py::bytes``. Starting with v2.7, ``py::str`` exclusively holds
``PyUnicodeObject`` (`#2409 <https://github.com/pybind/pybind11/pull/2409>`_),
and ``py::isinstance<str>()`` is ``true`` only for ``py::str``. To help in
the transition of user code, the ``PYBIND11_STR_LEGACY_PERMISSIVE`` macro
is provided as an escape hatch to go back to the legacy behavior. This macro
will be removed in future releases. Two types of required fixes are expected
to be common:
* Accidental use of ``py::str`` instead of ``py::bytes``, masked by the legacy
behavior. These are probably very easy to fix, by changing from
``py::str`` to ``py::bytes``.
* Reliance on py::isinstance<str>(obj) being ``true`` for
``py::bytes``. This is likely to be easy to fix in most cases by adding
``|| py::isinstance<bytes>(obj)``, but a fix may be more involved, e.g. if
``py::isinstance<T>`` appears in a template. Such situations will require
careful review and custom fixes.
v2.6
====

View File

@ -1656,6 +1656,17 @@ struct pyobject_caster {
template <typename T = type, enable_if_t<std::is_base_of<object, T>::value, int> = 0>
bool load(handle src, bool /* convert */) {
#if PY_MAJOR_VERSION < 3 && !defined(PYBIND11_STR_LEGACY_PERMISSIVE)
// For Python 2, without this implicit conversion, Python code would
// need to be cluttered with six.ensure_text() or similar, only to be
// un-cluttered later after Python 2 support is dropped.
if (std::is_same<T, str>::value && isinstance<bytes>(src)) {
PyObject *str_from_bytes = PyUnicode_FromEncodedObject(src.ptr(), "utf-8", nullptr);
if (!str_from_bytes) throw error_already_set();
value = reinterpret_steal<type>(str_from_bytes);
return true;
}
#endif
if (!isinstance<type>(src))
return false;
value = reinterpret_borrow<type>(src);

View File

@ -163,6 +163,19 @@
#include <typeindex>
#include <type_traits>
// #define PYBIND11_STR_LEGACY_PERMISSIVE
// If DEFINED, pybind11::str can hold PyUnicodeObject or PyBytesObject
// (probably surprising and never documented, but this was the
// legacy behavior until and including v2.6.x). As a side-effect,
// pybind11::isinstance<str>() is true for both pybind11::str and
// pybind11::bytes.
// If UNDEFINED, pybind11::str can only hold PyUnicodeObject, and
// pybind11::isinstance<str>() is true only for pybind11::str.
// However, for Python 2 only (!), the pybind11::str caster
// implicitly decodes bytes to PyUnicodeObject. This is to ease
// the transition from the legacy behavior to the non-permissive
// behavior.
#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check

View File

@ -756,7 +756,12 @@ inline bool PyIterable_Check(PyObject *obj) {
inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; }
#ifdef PYBIND11_STR_LEGACY_PERMISSIVE
inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); }
#define PYBIND11_STR_CHECK_FUN detail::PyUnicode_Check_Permissive
#else
#define PYBIND11_STR_CHECK_FUN PyUnicode_Check
#endif
inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; }
@ -936,7 +941,7 @@ class bytes;
class str : public object {
public:
PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
PYBIND11_OBJECT_CVT(str, object, PYBIND11_STR_CHECK_FUN, raw_str)
str(const char *c, size_t n)
: object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {

View File

@ -144,7 +144,7 @@ template <typename Type, typename Value> struct list_caster {
using value_conv = make_caster<Value>;
bool load(handle src, bool convert) {
if (!isinstance<sequence>(src) || isinstance<str>(src))
if (!isinstance<sequence>(src) || isinstance<bytes>(src) || isinstance<str>(src))
return false;
auto s = reinterpret_borrow<sequence>(src);
value.clear();

View File

@ -413,4 +413,15 @@ TEST_SUBMODULE(pytypes, m) {
// test_builtin_functions
m.def("get_len", [](py::handle h) { return py::len(h); });
#ifdef PYBIND11_STR_LEGACY_PERMISSIVE
m.attr("PYBIND11_STR_LEGACY_PERMISSIVE") = true;
#endif
m.def("isinstance_pybind11_bytes", [](py::object o) { return py::isinstance<py::bytes>(o); });
m.def("isinstance_pybind11_str", [](py::object o) { return py::isinstance<py::str>(o); });
m.def("pass_to_pybind11_bytes", [](py::bytes b) { return py::len(b); });
m.def("pass_to_pybind11_str", [](py::str s) { return py::len(s); });
m.def("pass_to_std_string", [](std::string s) { return s.size(); });
}

View File

@ -120,14 +120,17 @@ def test_str(doc):
assert s1 == s2
malformed_utf8 = b"\x80"
assert m.str_from_object(malformed_utf8) is malformed_utf8 # To be fixed; see #2380
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
assert m.str_from_object(malformed_utf8) is malformed_utf8
elif env.PY2:
with pytest.raises(UnicodeDecodeError):
m.str_from_object(malformed_utf8)
else:
assert m.str_from_object(malformed_utf8) == "b'\\x80'"
if env.PY2:
# with pytest.raises(UnicodeDecodeError):
# m.str_from_object(malformed_utf8)
with pytest.raises(UnicodeDecodeError):
m.str_from_handle(malformed_utf8)
else:
# assert m.str_from_object(malformed_utf8) == "b'\\x80'"
assert m.str_from_handle(malformed_utf8) == "b'\\x80'"
@ -303,13 +306,26 @@ def test_pybind11_str_raw_str():
valid_orig = u"DZ"
valid_utf8 = valid_orig.encode("utf-8")
valid_cvt = cvt(valid_utf8)
assert type(valid_cvt) == bytes # Probably surprising.
assert valid_cvt == b"\xc7\xb1"
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
assert valid_cvt is valid_utf8
else:
assert type(valid_cvt) is unicode if env.PY2 else str # noqa: F821
if env.PY2:
assert valid_cvt == valid_orig
else:
assert valid_cvt == "b'\\xc7\\xb1'"
malformed_utf8 = b"\x80"
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
assert cvt(malformed_utf8) is malformed_utf8
else:
if env.PY2:
with pytest.raises(UnicodeDecodeError):
cvt(malformed_utf8)
else:
malformed_cvt = cvt(malformed_utf8)
assert type(malformed_cvt) == bytes # Probably surprising.
assert malformed_cvt == b"\x80"
assert type(malformed_cvt) is str
assert malformed_cvt == "b'\\x80'"
def test_implicit_casting():
@ -488,3 +504,40 @@ def test_builtin_functions():
"object of type 'generator' has no len()",
"'generator' has no length",
] # PyPy
def test_isinstance_string_types():
assert m.isinstance_pybind11_bytes(b"")
assert not m.isinstance_pybind11_bytes(u"")
assert m.isinstance_pybind11_str(u"")
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
assert m.isinstance_pybind11_str(b"")
else:
assert not m.isinstance_pybind11_str(b"")
def test_pass_bytes_or_unicode_to_string_types():
assert m.pass_to_pybind11_bytes(b"Bytes") == 5
with pytest.raises(TypeError):
m.pass_to_pybind11_bytes(u"Str")
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE") or env.PY2:
assert m.pass_to_pybind11_str(b"Bytes") == 5
else:
with pytest.raises(TypeError):
m.pass_to_pybind11_str(b"Bytes")
assert m.pass_to_pybind11_str(u"Str") == 3
assert m.pass_to_std_string(b"Bytes") == 5
assert m.pass_to_std_string(u"Str") == 3
malformed_utf8 = b"\x80"
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
assert m.pass_to_pybind11_str(malformed_utf8) == 1
elif env.PY2:
with pytest.raises(UnicodeDecodeError):
m.pass_to_pybind11_str(malformed_utf8)
else:
with pytest.raises(TypeError):
m.pass_to_pybind11_str(malformed_utf8)