Call PyUnicode_DecodeUTF* directly

Some versions of Python 2.7 reportedly (#713) have issues when
PyUnicode_Decode is passed the encoding string, so just skip it
entirely by calling the PyUnicode_DecodeUTF* functions directly.  This
is also slightly more efficient: it avoids having to check the encoding
string and, on Python 2, avoids going through the unicode class's
decode.  (Python 3 fast-tracks this for all utf-{8,16,32} encodings;
Python 2 only fast-tracked the exact string "utf-8", which we weren't
passing anyway -- we had "utf8".)

This doesn't work for PyPy, however: its `PyUnicode_DecodeUTF{8,16,32}`
appear rather broken: the UTF8 one segfaults, while the 16/32 require
recasting into a non-const `char *` (and might segfault; I didn't get
far enough to find out).  Just avoid the whole thing by keeping the
encoding-passed-as-string version for PyPy, which seems to work
reliably.
This commit is contained in:
Jason Rhinelander 2017-03-09 11:35:28 -05:00
parent e5456c2226
commit ee9296395d

View File

@ -641,7 +641,6 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
static_assert(!std::is_same<CharT, wchar_t>::value || sizeof(CharT) == 2 || sizeof(CharT) == 4,
"Unsupported wchar_t size != 2/4");
static constexpr size_t UTF_N = 8 * sizeof(CharT);
static constexpr const char *encoding = UTF_N == 8 ? "utf8" : UTF_N == 16 ? "utf16" : "utf32";
using StringType = std::basic_string<CharT, Traits, Allocator>;
@ -666,7 +665,7 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
}
object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
load_src.ptr(), encoding, nullptr));
load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr));
if (!utfNbytes) { PyErr_Clear(); return false; }
const CharT *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
@ -679,12 +678,28 @@ struct type_caster<std::basic_string<CharT, Traits, Allocator>, enable_if_t<is_s
// Converts a C++ string into a Python object by decoding the string's raw bytes as UTF-N
// (N = 8 * sizeof(CharT)).  The return value policy and parent handle are accepted but unused.
// Throws error_already_set if the decode yields a null handle.
static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) {
// View the character data as plain bytes regardless of the width of CharT.
const char *buffer = reinterpret_cast<const char *>(src.c_str());
// The decoder is given a length in bytes, not in CharT units.
ssize_t nbytes = ssize_t(src.size() * sizeof(CharT));
handle s = PyUnicode_Decode(buffer, nbytes, encoding, nullptr);
handle s = decode_utfN(buffer, nbytes);
// A null handle means decoding failed (presumably with the Python error indicator set).
if (!s) throw error_already_set();
return s;
}
PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME));
private:
// Decodes `nbytes` bytes starting at `buffer` as UTF-8, UTF-16 or UTF-32 (selected by UTF_N)
// and returns the result of the Python decode call as a handle; the caller is responsible
// for checking the returned handle.
static handle decode_utfN(const char *buffer, ssize_t nbytes) {
#if defined(PYPY_VERSION)
    // PyPy has several problems with the PyUnicode_DecodeUTF* functions: the UTF-8 one
    // sometimes segfaults for unknown reasons, and the UTF-16/32 ones require a non-const
    // char * argument, which is a nuisance.  Sidestep all of that by naming the encoding as
    // a string instead, which works properly.
    const char *codec = UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32";
    return PyUnicode_Decode(buffer, nbytes, codec, nullptr);
#else
    // Call the width-specific decoder directly so that no encoding string is involved.
    if (UTF_N == 8)
        return PyUnicode_DecodeUTF8(buffer, nbytes, nullptr);
    if (UTF_N == 16)
        return PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr);
    return PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr);
#endif
}
};
// Type caster for C-style strings. We basically use a std::string type caster, but also add the