Use a UTF-8 character iterator in GetOffsetForPosition, which works well unless UTF-16 surrogate pairs are used (#57)

This commit is contained in:
Fangrui Song 2018-01-13 10:43:37 -08:00
parent dab379ad46
commit 942a0354d3
2 changed files with 18 additions and 17 deletions

View File

@ -4,20 +4,22 @@
#include <algorithm>
// Converts a (line, character) position into a byte offset into |content|.
//
// VSCode (UTF-16) disagrees with Emacs lsp-mode (UTF-8) on how to represent
// text documents. The specification counts |character| in UTF-16 code units;
// we approximate that by stepping over whole UTF-8 encoded characters instead
// (weird). This is good enough and fails only for characters that need a
// UTF-16 surrogate pair, because each such character counts as one step here.
//
// |position| is taken by value and its fields are used as countdown counters.
// The result is always in [0, content.size()]: a position past the end of the
// buffer yields content.size(), and an empty buffer yields 0. The character
// walk does not stop at '\n', so a |character| larger than the line length
// runs on into the following line(s).
int GetOffsetForPosition(lsPosition position, const std::string& content) {
  size_t i = 0;

  // Advance to the first byte after the |position.line|-th newline.
  for (; position.line > 0 && i < content.size(); i++) {
    if (content[i] == '\n')
      position.line--;
  }

  // Advance |position.character| characters. Bytes are inspected as
  // unsigned char so that values >= 0x80 are not seen as negative.
  for (; position.character > 0 && i < content.size(); position.character--) {
    if (static_cast<unsigned char>(content[i++]) >= 0x80) {
      // Non-ASCII lead byte: also skip its continuation bytes (0b10xxxxxx)
      // so the offset never ends up in the middle of a multi-byte sequence.
      while (i < content.size() &&
             static_cast<unsigned char>(content[i]) >= 0x80 &&
             static_cast<unsigned char>(content[i]) < 0xC0)
        i++;
    }
  }

  return static_cast<int>(i);
}
lsPosition CharPos(const std::string& search,

View File

@ -344,10 +344,9 @@ void WorkingFiles::OnChange(const lsTextDocumentDidChangeParams& change) {
} else {
int start_offset =
GetOffsetForPosition(diff.range->start, file->buffer_content);
int end_offset =
diff.rangeLength
? start_offset + *diff.rangeLength
: GetOffsetForPosition(diff.range->end, file->buffer_content);
// Ignore TextDocumentContentChangeEvent.rangeLength which causes trouble
// when UTF-16 surrogate pairs are used.
int end_offset = GetOffsetForPosition(diff.range->end, file->buffer_content);
file->buffer_content.replace(file->buffer_content.begin() + start_offset,
file->buffer_content.begin() + end_offset,
diff.text);