diff --git a/src/indexer.cc b/src/indexer.cc index 6446c2a2..1c1e4e96 100644 --- a/src/indexer.cc +++ b/src/indexer.cc @@ -423,22 +423,23 @@ public: unsigned start_column = sm.getLineNumber(bInfo.first, bInfo.second); std::string ret; int pad = -1; - for (const char *p = raw.data(), *e = raw.end(); p < e;) { + for (const uint8_t *p = raw.bytes_begin(), *e = raw.bytes_end(); p < e;) { // The first line starts with a comment marker, but the rest needs // un-indenting. unsigned skip = start_column - 1; for (; skip > 0 && p < e && (*p == ' ' || *p == '\t'); p++) skip--; - const char *q = p; + bool high = false; + const uint8_t *q = p; while (q < e && *q != '\n') - q++; + high |= *q++ >= 0x80; if (q < e) q++; // A minimalist approach to skip Doxygen comment markers. // See https://www.stack.nl/~dimitri/doxygen/manual/docblocks.html if (pad < 0) { // First line, detect the length of comment marker and put into |pad| - const char *begin = p; + const uint8_t *begin = p; while (p < e && (*p == '/' || *p == '*' || *p == '-' || *p == '=')) p++; if (p < e && (*p == '<' || *p == '!')) @@ -456,7 +457,24 @@ public: (*p == ' ' || *p == '/' || *p == '*' || *p == '<' || *p == '!')) prefix--, p++; } - ret.insert(ret.end(), p, q); + if (high) { + while (p < q) { + int i = 0, c = *p < 0x80 ? 0 + : *p < 0xc0 || *p >= 0xf8 + ? -1 + : *p >= 0xf0 ? 3 : *p >= 0xe0 ? 2 : 1; + const uint8_t *r = p + 1; + for (; i < c && r < q && *r >= 0x80; i++, r++) + ; + if (i == c) + ret.insert(ret.end(), (const char *)p, (const char *)r); + else + ret += '?'; + p = r; + } + } else { + ret.insert(ret.end(), (const char *)p, (const char *)q); + } p = q; } while (ret.size() && isspace(ret.back()))