Add C++20 char8_t/u8string support (#2026)

* Fix test build in C++20

* Add C++20 char8_t/u8string support
This commit is contained in:
Vemund Handeland 2019-12-19 12:16:24 +01:00 committed by Wenzel Jakob
parent 37d04abdee
commit 6e39b765b2
3 changed files with 72 additions and 5 deletions

View File

@ -32,6 +32,10 @@
#include <string_view> #include <string_view>
#endif #endif
#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
# define PYBIND11_HAS_U8STRING
#endif
NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
NAMESPACE_BEGIN(detail) NAMESPACE_BEGIN(detail)
@ -988,6 +992,9 @@ public:
template <typename CharT> using is_std_char_type = any_of< template <typename CharT> using is_std_char_type = any_of<
std::is_same<CharT, char>, /* std::string */ std::is_same<CharT, char>, /* std::string */
#if defined(PYBIND11_HAS_U8STRING)
std::is_same<CharT, char8_t>, /* std::u8string */
#endif
std::is_same<CharT, char16_t>, /* std::u16string */ std::is_same<CharT, char16_t>, /* std::u16string */
std::is_same<CharT, char32_t>, /* std::u32string */ std::is_same<CharT, char32_t>, /* std::u32string */
std::is_same<CharT, wchar_t> /* std::wstring */ std::is_same<CharT, wchar_t> /* std::wstring */
@ -1191,6 +1198,9 @@ template <typename StringType, bool IsView = false> struct string_caster {
// Simplify life by being able to assume standard char sizes (the standard only guarantees // Simplify life by being able to assume standard char sizes (the standard only guarantees
// minimums, but Python requires exact sizes) // minimums, but Python requires exact sizes)
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1"); static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
#if defined(PYBIND11_HAS_U8STRING)
static_assert(!std::is_same<CharT, char8_t>::value || sizeof(CharT) == 1, "Unsupported char8_t size != 1");
#endif
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2"); static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4"); static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else) // wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
@ -1209,7 +1219,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
return load_bytes(load_src); return load_bytes(load_src);
#else #else
if (sizeof(CharT) == 1) { if (std::is_same<CharT, char>::value) {
return load_bytes(load_src); return load_bytes(load_src);
} }
@ -1269,7 +1279,7 @@ private:
// without any encoding/decoding attempt). For other C++ char sizes this is a no-op. // without any encoding/decoding attempt). For other C++ char sizes this is a no-op.
// which supports loading a unicode from a str, doesn't take this path. // which supports loading a unicode from a str, doesn't take this path.
template <typename C = CharT> template <typename C = CharT>
bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) { bool load_bytes(enable_if_t<std::is_same<C, char>::value, handle> src) {
if (PYBIND11_BYTES_CHECK(src.ptr())) { if (PYBIND11_BYTES_CHECK(src.ptr())) {
// We were passed a Python 3 raw bytes; accept it into a std::string or char* // We were passed a Python 3 raw bytes; accept it into a std::string or char*
// without any encoding attempt. // without any encoding attempt.
@ -1284,7 +1294,7 @@ private:
} }
template <typename C = CharT> template <typename C = CharT>
bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) { return false; } bool load_bytes(enable_if_t<!std::is_same<C, char>::value, handle>) { return false; }
}; };
template <typename CharT, class Traits, class Allocator> template <typename CharT, class Traits, class Allocator>

View File

@ -30,7 +30,7 @@ TEST_SUBMODULE(builtin_casters, m) {
else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32 else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
wstr.push_back(0x7a); // z wstr.push_back(0x7a); // z
m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀 m.def("good_utf8_string", []() { return std::string((const char*)u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
m.def("good_wchar_string", [=]() { return wstr; }); // a⸘𝐀z m.def("good_wchar_string", [=]() { return wstr; }); // a⸘𝐀z
@ -60,6 +60,18 @@ TEST_SUBMODULE(builtin_casters, m) {
m.def("strlen", [](char *s) { return strlen(s); }); m.def("strlen", [](char *s) { return strlen(s); });
m.def("string_length", [](std::string s) { return s.length(); }); m.def("string_length", [](std::string s) { return s.length(); });
#ifdef PYBIND11_HAS_U8STRING
m.attr("has_u8string") = true;
m.def("good_utf8_u8string", []() { return std::u8string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
m.def("bad_utf8_u8string", []() { return std::u8string((const char8_t*)"abc\xd0" "def"); });
m.def("u8_char8_Z", []() -> char8_t { return u8'Z'; });
// test_single_char_arguments
m.def("ord_char8", [](char8_t c) -> int { return static_cast<unsigned char>(c); });
m.def("ord_char8_lv", [](char8_t &c) -> int { return static_cast<unsigned char>(c); });
#endif
// test_string_view // test_string_view
#ifdef PYBIND11_HAS_STRING_VIEW #ifdef PYBIND11_HAS_STRING_VIEW
m.attr("has_string_view") = true; m.attr("has_string_view") = true;
@ -69,9 +81,15 @@ TEST_SUBMODULE(builtin_casters, m) {
m.def("string_view_chars", [](std::string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; }); m.def("string_view_chars", [](std::string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
m.def("string_view16_chars", [](std::u16string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; }); m.def("string_view16_chars", [](std::u16string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
m.def("string_view32_chars", [](std::u32string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; }); m.def("string_view32_chars", [](std::u32string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
m.def("string_view_return", []() { return std::string_view(u8"utf8 secret \U0001f382"); }); m.def("string_view_return", []() { return std::string_view((const char*)u8"utf8 secret \U0001f382"); });
m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); }); m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); }); m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });
# ifdef PYBIND11_HAS_U8STRING
m.def("string_view8_print", [](std::u8string_view s) { py::print(s, s.size()); });
m.def("string_view8_chars", [](std::u8string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
m.def("string_view8_return", []() { return std::u8string_view(u8"utf8 secret \U0001f382"); });
# endif
#endif #endif
// test_integer_casting // test_integer_casting

View File

@ -15,6 +15,8 @@ def test_unicode_conversion():
assert m.good_utf16_string() == u"b‽🎂𝐀z" assert m.good_utf16_string() == u"b‽🎂𝐀z"
assert m.good_utf32_string() == u"a𝐀🎂‽z" assert m.good_utf32_string() == u"a𝐀🎂‽z"
assert m.good_wchar_string() == u"a⸘𝐀z" assert m.good_wchar_string() == u"a⸘𝐀z"
if hasattr(m, "has_u8string"):
assert m.good_utf8_u8string() == u"Say utf8‽ 🎂 𝐀"
with pytest.raises(UnicodeDecodeError): with pytest.raises(UnicodeDecodeError):
m.bad_utf8_string() m.bad_utf8_string()
@ -29,12 +31,17 @@ def test_unicode_conversion():
if hasattr(m, "bad_wchar_string"): if hasattr(m, "bad_wchar_string"):
with pytest.raises(UnicodeDecodeError): with pytest.raises(UnicodeDecodeError):
m.bad_wchar_string() m.bad_wchar_string()
if hasattr(m, "has_u8string"):
with pytest.raises(UnicodeDecodeError):
m.bad_utf8_u8string()
assert m.u8_Z() == 'Z' assert m.u8_Z() == 'Z'
assert m.u8_eacute() == u'é' assert m.u8_eacute() == u'é'
assert m.u16_ibang() == u'‽' assert m.u16_ibang() == u'‽'
assert m.u32_mathbfA() == u'𝐀' assert m.u32_mathbfA() == u'𝐀'
assert m.wchar_heart() == u'♥' assert m.wchar_heart() == u'♥'
if hasattr(m, "has_u8string"):
assert m.u8_char8_Z() == 'Z'
def test_single_char_arguments(): def test_single_char_arguments():
@ -92,6 +99,17 @@ def test_single_char_arguments():
assert m.ord_wchar(u'aa') assert m.ord_wchar(u'aa')
assert str(excinfo.value) == toolong_message assert str(excinfo.value) == toolong_message
if hasattr(m, "has_u8string"):
assert m.ord_char8(u'a') == 0x61 # simple ASCII
assert m.ord_char8_lv(u'b') == 0x62
assert m.ord_char8(u'é') == 0xE9 # requires 2 bytes in utf-8, but can be stuffed in a char
with pytest.raises(ValueError) as excinfo:
assert m.ord_char8(u'Ā') == 0x100 # requires 2 bytes, doesn't fit in a char
assert str(excinfo.value) == toobig_message(0x100)
with pytest.raises(ValueError) as excinfo:
assert m.ord_char8(u'ab')
assert str(excinfo.value) == toolong_message
def test_bytes_to_string(): def test_bytes_to_string():
"""Tests the ability to pass bytes to C++ string-accepting functions. Note that this is """Tests the ability to pass bytes to C++ string-accepting functions. Note that this is
@ -116,10 +134,15 @@ def test_string_view(capture):
assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82] assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
assert m.string_view16_chars("Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82] assert m.string_view16_chars("Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82]
assert m.string_view32_chars("Hi 🎂") == [72, 105, 32, 127874] assert m.string_view32_chars("Hi 🎂") == [72, 105, 32, 127874]
if hasattr(m, "has_u8string"):
assert m.string_view8_chars("Hi") == [72, 105]
assert m.string_view8_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
assert m.string_view_return() == "utf8 secret 🎂" assert m.string_view_return() == "utf8 secret 🎂"
assert m.string_view16_return() == "utf16 secret 🎂" assert m.string_view16_return() == "utf16 secret 🎂"
assert m.string_view32_return() == "utf32 secret 🎂" assert m.string_view32_return() == "utf32 secret 🎂"
if hasattr(m, "has_u8string"):
assert m.string_view8_return() == "utf8 secret 🎂"
with capture: with capture:
m.string_view_print("Hi") m.string_view_print("Hi")
@ -132,6 +155,14 @@ def test_string_view(capture):
utf16 🎂 8 utf16 🎂 8
utf32 🎂 7 utf32 🎂 7
""" """
if hasattr(m, "has_u8string"):
with capture:
m.string_view8_print("Hi")
m.string_view8_print("utf8 🎂")
assert capture == """
Hi 2
utf8 🎂 9
"""
with capture: with capture:
m.string_view_print("Hi, ascii") m.string_view_print("Hi, ascii")
@ -144,6 +175,14 @@ def test_string_view(capture):
Hi, utf16 🎂 12 Hi, utf16 🎂 12
Hi, utf32 🎂 11 Hi, utf32 🎂 11
""" """
if hasattr(m, "has_u8string"):
with capture:
m.string_view8_print("Hi, ascii")
m.string_view8_print("Hi, utf8 🎂")
assert capture == """
Hi, ascii 9
Hi, utf8 🎂 13
"""
def test_integer_casting(): def test_integer_casting():