Use a UTF-8 character iterator in GetOffsetForPosition; this is correct unless UTF-16 surrogate pairs are used #57

2026-01-08 23:33:34 +00:00 · 2018-01-13 10:43:37 -08:00 · 2018-01-13 10:43:37 -08:00 · 942a0354d3
commit 942a0354d3
parent dab379ad46
2 changed files with 18 additions and 17 deletions
--- a/src/lex_utils.cc
+++ b/src/lex_utils.cc
@ -4,20 +4,22 @@

 #include <algorithm>

+// VSCode (UTF-16) disagrees with Emacs lsp-mode (UTF-8) on how to represent
+// text documents, while the specification counts columns in UTF-16 code units.
+// We advance one UTF-8 code point per column, which approximates UTF-16 and
+// gives a wrong offset only for characters encoded as UTF-16 surrogate pairs.
 int GetOffsetForPosition(lsPosition position, const std::string& content) {
-  if (content.empty())
-    return 0;
-
-  int offset = 0;
-
-  int remaining_lines = position.line;
-  while (remaining_lines > 0 && offset < static_cast<int>(content.size())) {
-    if (content[offset] == '\n')
-      --remaining_lines;
-    ++offset;
-  }
-
-  return std::min<int>(offset + position.character, content.size());
+  size_t i = 0;
+  for (; position.line > 0 && i < content.size(); i++)
+    if (content[i] == '\n')
+      position.line--;
+  for (; position.character > 0 && i < content.size(); position.character--)
+    if (uint8_t(content[i++]) >= 128) {
+      // Skip UTF-8 continuation bytes (0b10xxxxxx).
+      while (i < content.size() && uint8_t(content[i]) >= 128 && uint8_t(content[i]) < 192)
+        i++;
+    }
+  return int(i);
 }

 lsPosition CharPos(const std::string& search,
--- a/src/working_files.cc
+++ b/src/working_files.cc
@ -344,10 +344,9 @@ void WorkingFiles::OnChange(const lsTextDocumentDidChangeParams& change) {
    } else {
      int start_offset =
          GetOffsetForPosition(diff.range->start, file->buffer_content);
-      int end_offset =
-          diff.rangeLength
-              ? start_offset + *diff.rangeLength
-              : GetOffsetForPosition(diff.range->end, file->buffer_content);
+      // Ignore TextDocumentContentChangeEvent.rangeLength, which causes trouble
+      // when UTF-16 surrogate pairs are used; use the range end position instead.
+      int end_offset = GetOffsetForPosition(diff.range->end, file->buffer_content);
      file->buffer_content.replace(file->buffer_content.begin() + start_offset,
                                   file->buffer_content.begin() + end_offset,
                                   diff.text);