mirror of
https://github.com/MaskRay/ccls.git
synced 2025-01-19 03:55:49 +00:00
Add fuzzy_match.h and let textDocument/definition search for the identifier at point (e.g. in comments)
This commit is contained in:
parent
afd38cbce9
commit
8a939389d8
120
src/fuzzy_match.cc
Normal file
120
src/fuzzy_match.cc
Normal file
@ -0,0 +1,120 @@
|
||||
#include "fuzzy_match.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Cost charged for every character of str skipped before the first aligned
// character.
constexpr int kLeadingGapScore = -4;
// Cost charged for every other skipped character of str.
constexpr int kGapScore = -5;
// Factor applied to the position score when the aligned pattern character
// starts a word of the pattern. Must be greater than 1.
constexpr int kPatternStartMultiplier = 2;

// Position score of a character that starts a word in str.
constexpr int kWordStartScore = 50;
// Position score of a non-word character in str.
constexpr int kNonWordScore = 40;
// Extra score when the pattern and str characters agree in case as well.
constexpr int kCaseMatchScore = 2;

// Score of extending a match to the adjacent character. Less than
// kWordStartScore.
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// Position score of a camelCase/digit sub-word start. Slightly less than
// kConsecutiveScore.
constexpr int kCamelScore = kConsecutiveScore - 1;
|
||||
|
||||
// Coarse character categories used to detect word and sub-word boundaries.
enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| as a lowercase letter, an uppercase letter, a digit, or
// anything else (NonWord).
//
// Callers pass plain `char` values, which are negative for non-ASCII bytes on
// platforms where `char` is signed. The <ctype.h> classifiers are only defined
// for EOF and values representable as `unsigned char`, so the argument is
// normalised before it is classified.
static CharClass GetCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}
|
||||
|
||||
// Returns the position score of a character of class |curr| that follows a
// character of class |prev|:
// - kWordStartScore for a word character right after a non-word character,
// - kCamelScore for a lower->Upper transition or the first digit of a run,
// - kNonWordScore for any non-word character,
// - 0 otherwise.
static int GetScoreFor(CharClass prev, CharClass curr) {
  const bool curr_is_word = curr != CharClass::NonWord;
  if (curr_is_word && prev == CharClass::NonWord)
    return kWordStartScore;

  const bool lower_to_upper =
      prev == CharClass::Lower && curr == CharClass::Upper;
  const bool enters_digits =
      curr == CharClass::Digit && prev != CharClass::Digit;
  if (lower_to_upper || enters_digits)
    return kCamelScore;

  return curr_is_word ? 0 : kNonWordScore;
}
|
||||
|
||||
/*
|
||||
fuzzyEvaluate implements a global sequence alignment algorithm to find the
|
||||
maximum accumulated score by aligning `pattern` to `str`. It applies when
|
||||
`pattern` is a subsequence of `str`.
|
||||
|
||||
Scoring criteria
|
||||
- Prefer matches at the start of a word, or the start of subwords in
|
||||
CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
|
||||
- Non-word characters matter. See kNonWordScore
|
||||
- The first characters of words of `pattern` receive bonus because they usually
|
||||
have more significance than the rest. See kPatternStartMultiplier
|
||||
- Superfluous characters in `str` will reduce the score (gap penalty). See
|
||||
kGapScore
|
||||
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
|
||||
|
||||
The recurrence of the dynamic programming:
|
||||
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
|
||||
dp[0][j] = leading_gap_penalty(0, j) + score[j]
|
||||
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
|
||||
gap_penalty(k+1, j) + score[j] : k < j))
|
||||
The first dimension can be suppressed since we do not need a matching scheme,
|
||||
which reduces the space complexity from O(N*M) to O(M)
|
||||
*/
|
||||
int FuzzyEvaluate(std::string_view pattern,
|
||||
std::string_view str,
|
||||
std::vector<int>& score,
|
||||
std::vector<int>& dp) {
|
||||
bool pfirst = true, // aligning the first character of pattern
|
||||
pstart = true; // whether we are aligning the start of a word in pattern
|
||||
int uleft = 0, // value of the upper left cell
|
||||
ulefts = 0, // maximum value of uleft and cells on the left
|
||||
left, lefts; // similar to uleft/ulefts, but for the next row
|
||||
|
||||
// Calculate position score for each character in str.
|
||||
CharClass prev = CharClass::NonWord;
|
||||
for (int i = 0; i < int(str.size()); i++) {
|
||||
CharClass cur = GetCharClass(str[i]);
|
||||
score[i] = GetScoreFor(prev, cur);
|
||||
prev = cur;
|
||||
}
|
||||
std::fill_n(dp.begin(), str.size(), kMinScore);
|
||||
|
||||
// Align each character of pattern.
|
||||
for (unsigned char pc : pattern) {
|
||||
if (isspace(pc)) {
|
||||
pstart = true;
|
||||
continue;
|
||||
}
|
||||
lefts = kMinScore;
|
||||
// Enumerate the character in str to be aligned with pc.
|
||||
for (int i = 0; i < int(str.size()); i++) {
|
||||
left = dp[i];
|
||||
lefts = std::max(lefts + kGapScore, left);
|
||||
// Use lower() if case-insensitive
|
||||
if (tolower(pc) == tolower(str[i])) {
|
||||
int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
|
||||
dp[i] = (pfirst ? kLeadingGapScore * i + t
|
||||
: std::max(uleft + kConsecutiveScore, ulefts + t)) +
|
||||
(pc == str[i] ? kCaseMatchScore : 0);
|
||||
} else
|
||||
dp[i] = kMinScore;
|
||||
uleft = left;
|
||||
ulefts = lefts;
|
||||
}
|
||||
pfirst = pstart = false;
|
||||
}
|
||||
|
||||
// Enumerate the end position of the match in str. Each removed trailing
|
||||
// character has a penulty of kGapScore.
|
||||
lefts = kMinScore;
|
||||
for (int i = 0; i < int(str.size()); i++)
|
||||
lefts = std::max(lefts + kGapScore, dp[i]);
|
||||
return lefts;
|
||||
}
|
17
src/fuzzy_match.h
Normal file
17
src/fuzzy_match.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include <string_view.h>
|
||||
|
||||
#include <limits.h>
|
||||
#include <vector>
|
||||
|
||||
// Score used for "no alignment possible". Negative but far from INT_MIN so
|
||||
// that adding gap penalties and bonuses to it is hard to overflow.
|
||||
constexpr int kMinScore = INT_MIN / 2;
|
||||
|
||||
// Evaluate the score matching |pattern| against |str|, the larger the better.
// Characters are compared case-insensitively. Whitespace in |pattern| is not
// matched; it marks the next character as the start of a pattern word.
|
||||
// |score| and |dp| must be at least as long as |str|. Both are scratch
// buffers: their first str.size() elements are overwritten by the call.
|
||||
int FuzzyEvaluate(std::string_view pattern,
|
||||
std::string_view str,
|
||||
std::vector<int>& score,
|
||||
std::vector<int>& dp);
|
@ -1,7 +1,12 @@
|
||||
#include "fuzzy_match.h"
|
||||
#include "lex_utils.h"
|
||||
#include "message_handler.h"
|
||||
#include "query_utils.h"
|
||||
#include "queue_manager.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
|
||||
namespace {
|
||||
void PushBack(std::vector<lsLocation>* result, optional<lsLocation> location) {
|
||||
if (location)
|
||||
@ -61,12 +66,14 @@ struct TextDocumentDefinitionHandler
|
||||
Out_TextDocumentDefinition out;
|
||||
out.id = request->id;
|
||||
|
||||
bool has_symbol = false;
|
||||
int target_line = request->params.position.line;
|
||||
int target_column = request->params.position.character;
|
||||
|
||||
for (SymbolRef sym :
|
||||
FindSymbolsAtLocation(working_file, file, request->params.position)) {
|
||||
// Found symbol. Return definition.
|
||||
has_symbol = true;
|
||||
|
||||
// Special cases which are handled:
|
||||
// - symbol has declaration but no definition (ie, pure virtual)
|
||||
@ -121,6 +128,39 @@ struct TextDocumentDefinitionHandler
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Find the best match of the identifier at point.
|
||||
if (!has_symbol && db->symbols.size()) {
|
||||
const std::string& buffer = working_file->buffer_content;
|
||||
int start = GetOffsetForPosition(request->params.position, buffer);
|
||||
int end = start;
|
||||
while (start > 0 && isalnum(buffer[start - 1]))
|
||||
start--;
|
||||
while (isalnum(buffer[end]))
|
||||
end++;
|
||||
auto query = std::string_view(buffer).substr(start, end - start);
|
||||
|
||||
int best_score = kMinScore;
|
||||
int best_i = 0;
|
||||
std::vector<int> score, dp;
|
||||
for (int i = 0; i < (int)db->symbols.size(); ++i) {
|
||||
std::string_view short_name = db->GetSymbolShortName(i);
|
||||
if (short_name.size() > score.size()) {
|
||||
score.resize(short_name.size());
|
||||
dp.resize(short_name.size());
|
||||
}
|
||||
int t = FuzzyEvaluate(query, short_name, score, dp);
|
||||
if (t > best_score) {
|
||||
best_score = t;
|
||||
best_i = i;
|
||||
}
|
||||
}
|
||||
Maybe<Use> use = GetDefinitionSpellingOfSymbol(db, db->symbols[best_i]);
|
||||
if (use) {
|
||||
optional<lsLocation> ls_loc = GetLsLocation(db, working_files, *use);
|
||||
if (ls_loc)
|
||||
out.result.push_back(*ls_loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
QueueManager::WriteStdout(IpcId::TextDocumentDefinition, out);
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "fuzzy_match.h"
|
||||
#include "lex_utils.h"
|
||||
#include "message_handler.h"
|
||||
#include "query_utils.h"
|
||||
@ -60,123 +61,6 @@ MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
|
||||
|
||||
///// Fuzzy matching
|
||||
|
||||
// Score used for "no alignment possible". Negative but far from INT_MIN so
// that intermediate results are hard to overflow.
constexpr int kMinScore = INT_MIN / 2;
// Cost charged for every character of str skipped before the first aligned
// character.
constexpr int kLeadingGapScore = -4;
// Cost charged for every other skipped character of str.
constexpr int kGapScore = -5;
// Factor applied to the position score when the aligned pattern character
// starts a word of the pattern. Must be greater than 1.
constexpr int kPatternStartMultiplier = 2;

// Position score of a character that starts a word in str.
constexpr int kWordStartScore = 50;
// Position score of a non-word character in str.
constexpr int kNonWordScore = 40;
// Extra score when the pattern and str characters agree in case as well.
constexpr int kCaseMatchScore = 2;

// Score of extending a match to the adjacent character. Less than
// kWordStartScore.
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// Position score of a camelCase/digit sub-word start. Slightly less than
// kConsecutiveScore.
constexpr int kCamelScore = kConsecutiveScore - 1;
|
||||
|
||||
// Coarse character categories used to detect word and sub-word boundaries.
enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| as a lowercase letter, an uppercase letter, a digit, or
// anything else (NonWord).
//
// Callers pass plain `char` values, which are negative for non-ASCII bytes on
// platforms where `char` is signed. The <ctype.h> classifiers are only defined
// for EOF and values representable as `unsigned char`, so the argument is
// normalised before it is classified.
static CharClass GetCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}
|
||||
|
||||
// Returns the position score of a character of class |curr| that follows a
// character of class |prev|:
// - kWordStartScore for a word character right after a non-word character,
// - kCamelScore for a lower->Upper transition or the first digit of a run,
// - kNonWordScore for any non-word character,
// - 0 otherwise.
static int GetScoreFor(CharClass prev, CharClass curr) {
  const bool curr_is_word = curr != CharClass::NonWord;
  if (curr_is_word && prev == CharClass::NonWord)
    return kWordStartScore;

  const bool lower_to_upper =
      prev == CharClass::Lower && curr == CharClass::Upper;
  const bool enters_digits =
      curr == CharClass::Digit && prev != CharClass::Digit;
  if (lower_to_upper || enters_digits)
    return kCamelScore;

  return curr_is_word ? 0 : kNonWordScore;
}
|
||||
|
||||
/*
|
||||
fuzzyEvaluate implements a global sequence alignment algorithm to find the
|
||||
maximum accumulated score by aligning `pattern` to `str`. It applies when
|
||||
`pattern` is a subsequence of `str`.
|
||||
|
||||
Scoring criteria
|
||||
- Prefer matches at the start of a word, or the start of subwords in
|
||||
CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
|
||||
- Non-word characters matter. See kNonWordScore
|
||||
- The first characters of words of `pattern` receive bonus because they usually
|
||||
have more significance than the rest. See kPatternStartMultiplier
|
||||
- Superfluous characters in `str` will reduce the score (gap penalty). See
|
||||
kGapScore
|
||||
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
|
||||
|
||||
The recurrence of the dynamic programming:
|
||||
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
|
||||
dp[0][j] = leading_gap_penalty(0, j) + score[j]
|
||||
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
|
||||
gap_penalty(k+1, j) + score[j] : k < j))
|
||||
The first dimension can be suppressed since we do not need a matching scheme,
|
||||
which reduces the space complexity from O(N*M) to O(M)
|
||||
*/
|
||||
int FuzzyEvaluate(std::string_view pattern,
|
||||
std::string_view str,
|
||||
std::vector<int>& score,
|
||||
std::vector<int>& dp) {
|
||||
bool pfirst = true, // aligning the first character of pattern
|
||||
pstart = true; // whether we are aligning the start of a word in pattern
|
||||
int uleft = 0, // value of the upper left cell
|
||||
ulefts = 0, // maximum value of uleft and cells on the left
|
||||
left, lefts; // similar to uleft/ulefts, but for the next row
|
||||
|
||||
// Calculate position score for each character in str.
|
||||
CharClass prev = CharClass::NonWord;
|
||||
for (int i = 0; i < int(str.size()); i++) {
|
||||
CharClass cur = GetCharClass(str[i]);
|
||||
score[i] = GetScoreFor(prev, cur);
|
||||
prev = cur;
|
||||
}
|
||||
std::fill_n(dp.begin(), str.size(), kMinScore);
|
||||
|
||||
// Align each character of pattern.
|
||||
for (unsigned char pc : pattern) {
|
||||
if (isspace(pc)) {
|
||||
pstart = true;
|
||||
continue;
|
||||
}
|
||||
lefts = kMinScore;
|
||||
// Enumerate the character in str to be aligned with pc.
|
||||
for (int i = 0; i < int(str.size()); i++) {
|
||||
left = dp[i];
|
||||
lefts = std::max(lefts + kGapScore, left);
|
||||
// Use lower() if case-insensitive
|
||||
if (tolower(pc) == tolower(str[i])) {
|
||||
int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
|
||||
dp[i] = (pfirst ? kLeadingGapScore * i + t
|
||||
: std::max(uleft + kConsecutiveScore, ulefts + t)) +
|
||||
(pc == str[i] ? kCaseMatchScore : 0);
|
||||
} else
|
||||
dp[i] = kMinScore;
|
||||
uleft = left;
|
||||
ulefts = lefts;
|
||||
}
|
||||
pfirst = pstart = false;
|
||||
}
|
||||
|
||||
// Enumerate the end position of the match in str. Each removed trailing
|
||||
// character has a penulty of kGapScore.
|
||||
lefts = kMinScore;
|
||||
for (int i = 0; i < int(str.size()); i++)
|
||||
lefts = std::max(lefts + kGapScore, dp[i]);
|
||||
return lefts;
|
||||
}
|
||||
|
||||
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
||||
void Run(Ipc_WorkspaceSymbol* request) override {
|
||||
|
Loading…
Reference in New Issue
Block a user