Unicode fixes and docs (#624)

* Propagate unicode conversion failure

If returning a std::string with invalid utf-8 data, we currently fail
with an uninformative TypeError instead of propagating the
UnicodeDecodeError that Python sets on failure.

* Add support for u16/u32strings and literals

This adds support for char{16,32}_t character literals and the
associated std::u{16,32}string types.  It also folds the
character/string conversion into a single type_caster template, since
the type casters for string and wstring were mostly the same anyway.

* Added too-long and too-big character conversion errors

With this commit, when casting to a single character, as opposed to a
C-style string, we make sure the input wasn't a multi-character string
or a single character with codepoint too large for the character type.

This also changes the character cast op to CharT instead of CharT& (we
need to be able to return a temporary decoded char value, but also
because there's little gained by bothering with an lvalue return here).

Finally it changes the char caster to 'has-a-string-caster' instead of
'is-a-string-caster' because, with the cast_op change above, there's
nothing at all gained from inheritance.  This also lets us move the
`success` flag out of the string caster (where it existed only for the
char caster) and into the char caster itself.  (I also renamed it to 'none' and
inverted its value to better reflect its purpose.)  The None -> nullptr
loading also now takes place only under a `convert = true` load pass.
Although it's unlikely that a function taking a char also has overloads
that can take a None, it seems marginally more correct to treat it as a
conversion.

This commit simplifies the size assumptions about character sizes with
static_asserts to back them up.
This commit is contained in:
Jason Rhinelander 2017-02-14 05:08:19 -05:00 committed by Wenzel Jakob
parent ada763b9ee
commit 11a337f16f
5 changed files with 266 additions and 90 deletions

View File

@ -94,14 +94,26 @@ as arguments and return values, refer to the section on binding :ref:`classes`.
+------------------------------------+---------------------------+-------------------------------+
| ``char`` | Character literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``char16_t`` | UTF-16 character literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``char32_t`` | UTF-32 character literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``wchar_t`` | Wide character literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``const char *`` | UTF-8 string literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``const char16_t *`` | UTF-16 string literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``const char32_t *`` | UTF-32 string literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``const wchar_t *`` | Wide string literal | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``std::string`` | STL dynamic UTF-8 string | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``std::u16string`` | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``std::u32string`` | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``std::wstring`` | STL dynamic wide string | :file:`pybind11/pybind11.h` |
+------------------------------------+---------------------------+-------------------------------+
| ``std::pair<T1, T2>`` | Pair of two custom types | :file:`pybind11/pybind11.h` |

View File

@ -471,8 +471,15 @@ public:
template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>
// Trait listing the character types handled by the string/character casters: char, char16_t,
// char32_t and wchar_t, i.e. the element types of the standard string aliases noted beside each
// entry.  (`any_of` is defined outside this view -- presumably a logical OR of the listed traits.)
template <typename CharT> using is_std_char_type = any_of<
std::is_same<CharT, char>, /* std::string */
std::is_same<CharT, char16_t>, /* std::u16string */
std::is_same<CharT, char32_t>, /* std::u32string */
std::is_same<CharT, wchar_t> /* std::wstring */
>;
template <typename T>
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value>> {
struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
using _py_type_1 = conditional_t<std::is_signed<T>::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>;
using py_type = conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
@ -617,122 +624,144 @@ public:
PYBIND11_TYPE_CASTER(bool, _("bool"));
};
template <> class type_caster<std::string> {
public:
bool load(handle src, bool) {
object temp;
handle load_src = src;
if (!src) {
return false;
} else if (PyUnicode_Check(load_src.ptr())) {
temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(load_src.ptr()));
if (!temp) { PyErr_Clear(); return false; } // UnicodeEncodeError
load_src = temp;
}
char *buffer;
ssize_t length;
int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(load_src.ptr(), &buffer, &length);
if (err == -1) { PyErr_Clear(); return false; } // TypeError
value = std::string(buffer, (size_t) length);
success = true;
return true;
}
static handle cast(const std::string &src, return_value_policy /* policy */, handle /* parent */) {
return PyUnicode_FromStringAndSize(src.c_str(), (ssize_t) src.length());
}
PYBIND11_TYPE_CASTER(std::string, _(PYBIND11_STRING_NAME));
protected:
bool success = false;
};
template <> class type_caster<std::wstring> {
public:
// Helper class for UTF-{8,16,32} C++ stl strings:
template <typename CharT, class Traits, class Allocator>
struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_std_char_type<CharT>::value>> {
// Simplify life by being able to assume standard char sizes (the standard only guarantees
// minimums), but Python requires exact sizes
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
"Unsupported wchar_t size != 2/4");
static constexpr size_t UTF_N = 8 * sizeof(CharT);
static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32";
using StringType = std::basic_string<CharT, Traits, Allocator>;
// Loads a Python string into `value`.  A null handle is rejected; a non-unicode object is either
// rejected or first converted with PyUnicode_FromObject, depending on the preprocessor branch.
// The unicode object is then encoded with `encoding`, the leading BOM unit is skipped for
// UTF-16/32, and the remaining code units are copied into a StringType.
// Returns false on failure, clearing the Python error raised by the failing conversion call.
// NOTE(review): the wchar_t / PyUnicode_AsWideCharString / `success` lines in the middle look like
// removed lines of the former std::wstring caster shown inline by the diff view -- confirm.
bool load(handle src, bool) {
// NOTE(review): CPython's macro is spelled PY_MAJOR_VERSION (as used further down); an undefined
// PY_VERSION_MAJOR evaluates to 0 inside #if, so this test would always be true and the `>= 3`
// test below always false -- looks like a typo, confirm.
#if PY_VERSION_MAJOR < 3
object temp;
#endif
handle load_src = src;
if (!src) {
return false;
} else if (!PyUnicode_Check(load_src.ptr())) {
#if PY_VERSION_MAJOR >= 3
return false;
// The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false
#else
temp = reinterpret_steal<object>(PyUnicode_FromObject(load_src.ptr()));
if (!temp) { PyErr_Clear(); return false; }
load_src = temp;
}
wchar_t *buffer = nullptr;
ssize_t length = -1;
#if PY_MAJOR_VERSION >= 3
buffer = PyUnicode_AsWideCharString(load_src.ptr(), &length);
#else
temp = reinterpret_steal<object>(PyUnicode_AsEncodedString(
load_src.ptr(), sizeof(wchar_t) == sizeof(short)
? "utf16" : "utf32", nullptr));
if (temp) {
int err = PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), (char **) &buffer, &length);
if (err == -1) { buffer = nullptr; } // TypeError
length = length / (ssize_t) sizeof(wchar_t) - 1; ++buffer; // Skip BOM
}
#endif
if (!buffer) { PyErr_Clear(); return false; }
value = std::wstring(buffer, (size_t) length);
success = true;
}
// Encode to the UTF flavour matching sizeof(CharT); a failed encode is reported as "no match".
object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
load_src.ptr(), encoding, nullptr));
if (!utfNbytes) { PyErr_Clear(); return false; }
// Reinterpret the encoded bytes as CharT code units; length is counted in units, not bytes.
const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
value = StringType(buffer, length);
return true;
}
static handle cast(const std::wstring &src, return_value_policy /* policy */, handle /* parent */) {
return PyUnicode_FromWideChar(src.c_str(), (ssize_t) src.length());
static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
const char *buffer = reinterpret_cast<const char *>(src.c_str());
ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
handle s = PyUnicode_Decode(buffer, nbytes, encoding, nullptr);
if (!s) throw error_already_set();
return s;
}
PYBIND11_TYPE_CASTER(std::wstring, _(PYBIND11_STRING_NAME));
protected:
bool success = false;
PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
};
template <> class type_caster<char> : public type_caster<std::string> {
// Type caster for C-style strings. We basically use a std::string type caster, but also add the
// ability to use None as a nullptr char* (which the string caster doesn't allow).
template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
using StringType = std::basic_string<CharT>;
using StringCaster = type_caster<StringType>;
StringCaster str_caster;
bool none = false;
public:
bool load(handle src, bool convert) {
if (src.is_none()) return true;
return type_caster<std::string>::load(src, convert);
if (!src) return false;
if (src.is_none()) {
// Defer accepting None to other overloads (if we aren't in convert mode):
if (!convert) return false;
none = true;
return true;
}
return str_caster.load(src, convert);
}
static handle cast(const char *src, return_value_policy /* policy */, handle /* parent */) {
if (src == nullptr) return none().inc_ref();
return PyUnicode_FromString(src);
static handle cast(const CharT *src, return_value_policy policy, handle parent) {
if (src == nullptr) return pybind11::none().inc_ref();
return StringCaster::cast(StringType(src), policy, parent);
}
static handle cast(char src, return_value_policy /* policy */, handle /* parent */) {
char str[2] = { src, '\0' };
return PyUnicode_DecodeLatin1(str, 1, nullptr);
static handle cast(CharT src, return_value_policy policy, handle parent) {
if (std::is_same<char, CharT>::value) {
handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
if (!s) throw error_already_set();
return s;
}
return StringCaster::cast(StringType(1, src), policy, parent);
}
operator char*() { return success ? const_cast<char *>(value.c_str()) : nullptr; }
operator char&() { return value[0]; }
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
};
template <> class type_caster<wchar_t> : public type_caster<std::wstring> {
public:
bool load(handle src, bool convert) {
if (src.is_none()) return true;
return type_caster<std::wstring>::load(src, convert);
}
static handle cast(const wchar_t *src, return_value_policy /* policy */, handle /* parent */) {
if (src == nullptr) return none().inc_ref();
return PyUnicode_FromWideChar(src, (ssize_t) wcslen(src));
}
static handle cast(wchar_t src, return_value_policy /* policy */, handle /* parent */) {
wchar_t wstr[2] = { src, L'\0' };
return PyUnicode_FromWideChar(wstr, 1);
}
operator wchar_t*() { return success ? const_cast<wchar_t *>(value.c_str()) : nullptr; }
operator wchar_t&() { return value[0]; }
operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
// Conversion used when a single character (rather than a C-style string) is requested.
// Returns the one character held by the loaded string, and throws value_error when the input was
// None, was empty, encodes a single code point too large for CharT, or holds more than one
// character.
operator CharT() {
if (none)
throw value_error("Cannot convert None to a character");
auto &value = static_cast<StringType &>(str_caster);
size_t str_len = value.size();
if (str_len == 0)
throw value_error("Cannot convert empty string to a character");
// If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
// is too high, and one for multiple unicode characters (caught later), so we need to figure
// out how long the first encoded character is in bytes to distinguish between these two
// errors. We also want to allow unicode characters U+0080 through U+00FF, as those
// can fit into a single char value.
if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
unsigned char v0 = static_cast<unsigned char>(value[0]);
size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
(v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
(v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
4; // 0b11110xxx - start of 4-byte sequence
// The whole string is exactly one encoded code point when its first sequence spans all bytes.
if (char0_bytes == str_len) {
// If we have a 128-255 value, we can decode it into a single char:
if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0b110000xx 0b10xxxxxx
// Combine the 2 payload bits of the lead byte with the 6 payload bits of the continuation byte.
return static_cast<CharT>(((v0 & 3) << 6) + (static_cast<unsigned char>(value[1]) & 0x3F));
}
// Otherwise we have a single character, but it's > U+00FF
throw value_error("Character code point not in range(0x100)");
}
}
// UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
// surrogate pair with total length 2 instantly indicates a range error (but not a "your
// string was too long" error).
else if (StringCaster::UTF_N == 16 && str_len == 2) {
char16_t v0 = static_cast<char16_t>(value[0]);
if (v0 >= 0xD800 && v0 < 0xE000)
throw value_error("Character code point not in range(0x10000)");
}
// Anything else longer than one code unit is a genuine multi-character string.
if (str_len != 1)
throw value_error("Expected a character, but multi-character string found");
return value[0];
}
static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
template <typename _T> using cast_op_type = typename std::remove_reference<pybind11::detail::cast_op_type<_T>>::type;
};
template <typename T1, typename T2> class type_caster<std::pair<T1, T2>> {

View File

@ -111,6 +111,7 @@
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
#define PYBIND11_BYTES_SIZE PyBytes_Size
#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) PyLong_AsUnsignedLongLong(o)
@ -129,6 +130,7 @@
#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize
#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize
#define PYBIND11_BYTES_AS_STRING PyString_AsString
#define PYBIND11_BYTES_SIZE PyString_Size
#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o))
#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o))
#define PYBIND11_LONG_AS_UNSIGNED_LONGLONG(o) (PyInt_Check(o) ? (unsigned long long) PyLong_AsUnsignedLong(o) : PyLong_AsUnsignedLongLong(o))

View File

@ -17,6 +17,11 @@
# include <fcntl.h>
#endif
#if defined(_MSC_VER)
# pragma warning(push)
# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
#endif
class ExamplePythonTypes {
public:
static ExamplePythonTypes *new_instance() {
@ -426,4 +431,41 @@ test_initializer python_types([](py::module &m) {
"l"_a=l
);
});
// Some test characters in utf16 and utf32 encodings. The last one (the 𝐀) contains a null byte
char32_t a32 = 0x61 /*a*/, z32 = 0x7a /*z*/, ib32 = 0x203d /*‽*/, cake32 = 0x1f382 /*🎂*/, mathbfA32 = 0x1d400 /*𝐀*/;
char16_t b16 = 0x62 /*b*/, z16 = 0x7a, ib16 = 0x203d, cake16_1 = 0xd83c, cake16_2 = 0xdf82, mathbfA16_1 = 0xd835, mathbfA16_2 = 0xdc00;
// Wide test string; the 𝐀 is stored as a surrogate pair or a single unit depending on sizeof(wchar_t)
std::wstring wstr;
wstr.push_back(0x61); // a
wstr.push_back(0x2e18); // ⸘
if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
wstr.push_back(0x7a); // z
// Well-formed strings of each supported string type
m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
m.def("good_wchar_string", [=]() { return wstr; }); // a⸘𝐀z
// Malformed strings: a stray 0xd0 lead byte in UTF-8, and a lone 0xd800 surrogate elsewhere
m.def("bad_utf8_string", []() { return std::string("abc\xd0" "def"); });
m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); });
// Under Python 2.7, invalid unicode UTF-32 characters don't appear to trigger UnicodeDecodeError
if (PY_MAJOR_VERSION >= 3)
m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2)
m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); });
// Single characters returned from C++ for each character type
m.def("u8_Z", []() -> char { return 'Z'; });
m.def("u8_eacute", []() -> char { return '\xe9'; });
m.def("u16_ibang", [=]() -> char16_t { return ib16; });
m.def("u32_mathbfA", [=]() -> char32_t { return mathbfA32; });
m.def("wchar_heart", []() -> wchar_t { return 0x2665; });
// Exposed so the Python tests can pick the expectation matching this platform's wchar_t width
m.attr("wchar_size") = py::cast(sizeof(wchar_t));
// Single-character arguments: each returns the numeric value of the character it received
m.def("ord_char", [](char c) -> int { return static_cast<unsigned char>(c); });
m.def("ord_char16", [](char16_t c) -> uint16_t { return c; });
m.def("ord_char32", [](char32_t c) -> uint32_t { return c; });
m.def("ord_wchar", [](wchar_t c) -> int { return c; });
});
#if defined(_MSC_VER)
# pragma warning(pop)
#endif

View File

@ -1,3 +1,4 @@
# Python < 3 needs this: coding=utf-8
import pytest
from pybind11_tests import ExamplePythonTypes, ConstructorStats, has_optional, has_exp_optional
@ -410,3 +411,93 @@ def test_implicit_casting():
'int_i1': 42, 'int_i2': 42, 'int_e': 43, 'int_p': 44
}
assert z['l'] == [3, 6, 9, 12, 15]
def test_unicode_conversion():
    """Tests unicode conversion and error reporting.

    Covers strings and single characters returned from C++: well-formed values must
    arrive as the matching Python unicode string, and malformed data must raise
    UnicodeDecodeError.
    """
    import pybind11_tests
    from pybind11_tests import (good_utf8_string, bad_utf8_string,
                                good_utf16_string, bad_utf16_string,
                                good_utf32_string,  # bad_utf32_string,
                                good_wchar_string,  # bad_wchar_string,
                                u8_Z, u8_eacute, u16_ibang, u32_mathbfA, wchar_heart)

    assert good_utf8_string() == u"Say utf8‽ 🎂 𝐀"
    assert good_utf16_string() == u"b‽🎂𝐀z"
    assert good_utf32_string() == u"a𝐀🎂‽z"
    assert good_wchar_string() == u"a⸘𝐀z"

    with pytest.raises(UnicodeDecodeError):
        bad_utf8_string()

    with pytest.raises(UnicodeDecodeError):
        bad_utf16_string()

    # These are provided only if they actually fail (they don't when 32-bit and under Python 2.7)
    if hasattr(pybind11_tests, "bad_utf32_string"):
        with pytest.raises(UnicodeDecodeError):
            pybind11_tests.bad_utf32_string()
    if hasattr(pybind11_tests, "bad_wchar_string"):
        with pytest.raises(UnicodeDecodeError):
            pybind11_tests.bad_wchar_string()

    assert u8_Z() == 'Z'
    assert u8_eacute() == u'é'
    assert u16_ibang() == u'‽'  # U+203D, the value the binding returns
    assert u32_mathbfA() == u'𝐀'
    assert wchar_heart() == u'♥'  # U+2665, the value the binding returns
def test_single_char_arguments():
    """Tests failures for passing invalid inputs to char-accepting functions.

    Each ``ord_*`` binding returns the numeric value of the character it receives.
    A code point too large for the character type, or a multi-character string,
    must raise ValueError with the matching message.
    """
    from pybind11_tests import ord_char, ord_char16, ord_char32, ord_wchar, wchar_size

    def toobig_message(r):
        # r is the exclusive upper bound of the representable code point range
        return "Character code point not in range({0:#x})".format(r)
    toolong_message = "Expected a character, but multi-character string found"

    assert ord_char(u'a') == 0x61  # simple ASCII
    assert ord_char(u'é') == 0xE9  # requires 2 bytes in utf-8, but can be stuffed in a char
    with pytest.raises(ValueError) as excinfo:
        assert ord_char(u'Ā') == 0x100  # requires 2 bytes, doesn't fit in a char
    assert str(excinfo.value) == toobig_message(0x100)
    with pytest.raises(ValueError) as excinfo:
        assert ord_char(u'ab')
    assert str(excinfo.value) == toolong_message

    assert ord_char16(u'a') == 0x61
    assert ord_char16(u'é') == 0xE9
    assert ord_char16(u'Ā') == 0x100
    assert ord_char16(u'‽') == 0x203d
    assert ord_char16(u'♥') == 0x2665
    with pytest.raises(ValueError) as excinfo:
        assert ord_char16(u'🎂') == 0x1F382  # requires surrogate pair
    assert str(excinfo.value) == toobig_message(0x10000)
    with pytest.raises(ValueError) as excinfo:
        assert ord_char16(u'aa')
    assert str(excinfo.value) == toolong_message

    assert ord_char32(u'a') == 0x61
    assert ord_char32(u'é') == 0xE9
    assert ord_char32(u'Ā') == 0x100
    assert ord_char32(u'‽') == 0x203d
    assert ord_char32(u'♥') == 0x2665
    assert ord_char32(u'🎂') == 0x1F382
    with pytest.raises(ValueError) as excinfo:
        assert ord_char32(u'aa')
    assert str(excinfo.value) == toolong_message

    assert ord_wchar(u'a') == 0x61
    assert ord_wchar(u'é') == 0xE9
    assert ord_wchar(u'Ā') == 0x100
    assert ord_wchar(u'‽') == 0x203d
    assert ord_wchar(u'♥') == 0x2665
    # wchar_t is 16 bits on some platforms, so the cake needs a surrogate pair there
    if wchar_size == 2:
        with pytest.raises(ValueError) as excinfo:
            assert ord_wchar(u'🎂') == 0x1F382  # requires surrogate pair
        assert str(excinfo.value) == toobig_message(0x10000)
    else:
        assert ord_wchar(u'🎂') == 0x1F382
    with pytest.raises(ValueError) as excinfo:
        assert ord_wchar(u'aa')
    assert str(excinfo.value) == toolong_message