mirror of
https://github.com/MaskRay/ccls.git
synced 2025-02-16 21:58:08 +00:00
Add fuzzy_match.h and let textDocument/definition search for the identifier at point (e.g. in comments)
This commit is contained in:
parent
afd38cbce9
commit
8a939389d8
120
src/fuzzy_match.cc
Normal file
120
src/fuzzy_match.cc
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
#include "fuzzy_match.h"
|
||||||
|
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
// Gap penalties (negative values): the cost of leaving a character of str
// unmatched.
// Charged once per skipped character in front of the first aligned character.
constexpr int kLeadingGapScore = -4;
// Charged once per skipped character that is not a leading one.
constexpr int kGapScore = -5;

// Factor applied to the position score when the pattern character being
// aligned is the first character of a word in pattern. Must be greater than 1.
constexpr int kPatternStartMultiplier = 2;

// Position score of a character that starts a word in str.
constexpr int kWordStartScore = 50;
// Position score of a non-word character in str.
constexpr int kNonWordScore = 40;
// Extra score when the aligned characters also agree in case.
constexpr int kCaseMatchScore = 2;

// Score of extending a run of consecutive matches. Less than kWordStartScore.
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// Position score of a sub-word start (camelCase hump or first digit).
// Slightly less than kConsecutiveScore.
constexpr int kCamelScore = kConsecutiveScore - 1;
|
||||||
|
|
||||||
|
// Coarse classification of a single byte, used to detect word and sub-word
// boundaries in the string being matched.
enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| as a lowercase letter, uppercase letter, digit or anything
// else (NonWord), according to the current C locale.
//
// |c| is narrowed to unsigned char before it is handed to the <ctype.h>
// predicates: callers pass plain `char` values, which are negative for bytes
// >= 0x80 when char is signed, and calling islower()/isupper()/isdigit() with
// a negative value other than EOF is undefined behaviour.
static CharClass GetCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}
|
||||||
|
|
||||||
|
// Returns the position score of a character of class |curr| that directly
// follows a character of class |prev|:
//   - kWordStartScore when a word character follows a non-word character,
//   - kCamelScore for a lower->Upper transition or the first digit of a run,
//   - kNonWordScore for any non-word character,
//   - 0 otherwise (the character continues the current sub-word).
static int GetScoreFor(CharClass prev, CharClass curr) {
  const bool after_non_word = prev == CharClass::NonWord;
  switch (curr) {
    case CharClass::NonWord:
      return kNonWordScore;
    case CharClass::Digit:
      if (after_non_word)
        return kWordStartScore;
      return prev == CharClass::Digit ? 0 : kCamelScore;
    case CharClass::Upper:
      if (after_non_word)
        return kWordStartScore;
      return prev == CharClass::Lower ? kCamelScore : 0;
    case CharClass::Lower:
      return after_non_word ? kWordStartScore : 0;
  }
  return 0;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
fuzzyEvaluate implements a global sequence alignment algorithm to find the
|
||||||
|
maximum accumulated score by aligning `pattern` to `str`. It applies when
|
||||||
|
`pattern` is a subsequence of `str`.
|
||||||
|
|
||||||
|
Scoring criteria
|
||||||
|
- Prefer matches at the start of a word, or the start of subwords in
|
||||||
|
CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
|
||||||
|
- Non-word characters matter. See kNonWordScore
|
||||||
|
- The first characters of words of `pattern` receive bonus because they usually
|
||||||
|
have more significance than the rest. See kPatternStartMultiplier
|
||||||
|
- Superfluous characters in `str` will reduce the score (gap penalty). See
|
||||||
|
kGapScore
|
||||||
|
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
|
||||||
|
|
||||||
|
The recurrence of the dynamic programming:
|
||||||
|
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
|
||||||
|
dp[0][j] = leading_gap_penalty(0, j) + score[j]
|
||||||
|
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
|
||||||
|
gap_penalty(k+1, j) + score[j] : k < j))
|
||||||
|
The first dimension can be suppressed since we do not need a matching scheme,
|
||||||
|
which reduces the space complexity from O(N*M) to O(M)
|
||||||
|
*/
|
||||||
|
int FuzzyEvaluate(std::string_view pattern,
|
||||||
|
std::string_view str,
|
||||||
|
std::vector<int>& score,
|
||||||
|
std::vector<int>& dp) {
|
||||||
|
bool pfirst = true, // aligning the first character of pattern
|
||||||
|
pstart = true; // whether we are aligning the start of a word in pattern
|
||||||
|
int uleft = 0, // value of the upper left cell
|
||||||
|
ulefts = 0, // maximum value of uleft and cells on the left
|
||||||
|
left, lefts; // similar to uleft/ulefts, but for the next row
|
||||||
|
|
||||||
|
// Calculate position score for each character in str.
|
||||||
|
CharClass prev = CharClass::NonWord;
|
||||||
|
for (int i = 0; i < int(str.size()); i++) {
|
||||||
|
CharClass cur = GetCharClass(str[i]);
|
||||||
|
score[i] = GetScoreFor(prev, cur);
|
||||||
|
prev = cur;
|
||||||
|
}
|
||||||
|
std::fill_n(dp.begin(), str.size(), kMinScore);
|
||||||
|
|
||||||
|
// Align each character of pattern.
|
||||||
|
for (unsigned char pc : pattern) {
|
||||||
|
if (isspace(pc)) {
|
||||||
|
pstart = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
lefts = kMinScore;
|
||||||
|
// Enumerate the character in str to be aligned with pc.
|
||||||
|
for (int i = 0; i < int(str.size()); i++) {
|
||||||
|
left = dp[i];
|
||||||
|
lefts = std::max(lefts + kGapScore, left);
|
||||||
|
// Use lower() if case-insensitive
|
||||||
|
if (tolower(pc) == tolower(str[i])) {
|
||||||
|
int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
|
||||||
|
dp[i] = (pfirst ? kLeadingGapScore * i + t
|
||||||
|
: std::max(uleft + kConsecutiveScore, ulefts + t)) +
|
||||||
|
(pc == str[i] ? kCaseMatchScore : 0);
|
||||||
|
} else
|
||||||
|
dp[i] = kMinScore;
|
||||||
|
uleft = left;
|
||||||
|
ulefts = lefts;
|
||||||
|
}
|
||||||
|
pfirst = pstart = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enumerate the end position of the match in str. Each removed trailing
|
||||||
|
// character has a penulty of kGapScore.
|
||||||
|
lefts = kMinScore;
|
||||||
|
for (int i = 0; i < int(str.size()); i++)
|
||||||
|
lefts = std::max(lefts + kGapScore, dp[i]);
|
||||||
|
return lefts;
|
||||||
|
}
|
17
src/fuzzy_match.h
Normal file
17
src/fuzzy_match.h
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string_view.h>
|
||||||
|
|
||||||
|
#include <limits.h>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Negative but far from INT_MIN so that intermediate results are hard to
|
||||||
|
// overflow
|
||||||
|
constexpr int kMinScore = INT_MIN / 2;
|
||||||
|
|
||||||
|
// Evaluate the score matching |pattern| against |str|, the larger the better.
|
||||||
|
// |score| and |dp| must be at least as long as |str|.
|
||||||
|
int FuzzyEvaluate(std::string_view pattern,
|
||||||
|
std::string_view str,
|
||||||
|
std::vector<int>& score,
|
||||||
|
std::vector<int>& dp);
|
@ -1,7 +1,12 @@
|
|||||||
|
#include "fuzzy_match.h"
|
||||||
|
#include "lex_utils.h"
|
||||||
#include "message_handler.h"
|
#include "message_handler.h"
|
||||||
#include "query_utils.h"
|
#include "query_utils.h"
|
||||||
#include "queue_manager.h"
|
#include "queue_manager.h"
|
||||||
|
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
void PushBack(std::vector<lsLocation>* result, optional<lsLocation> location) {
|
void PushBack(std::vector<lsLocation>* result, optional<lsLocation> location) {
|
||||||
if (location)
|
if (location)
|
||||||
@ -61,12 +66,14 @@ struct TextDocumentDefinitionHandler
|
|||||||
Out_TextDocumentDefinition out;
|
Out_TextDocumentDefinition out;
|
||||||
out.id = request->id;
|
out.id = request->id;
|
||||||
|
|
||||||
|
bool has_symbol = false;
|
||||||
int target_line = request->params.position.line;
|
int target_line = request->params.position.line;
|
||||||
int target_column = request->params.position.character;
|
int target_column = request->params.position.character;
|
||||||
|
|
||||||
for (SymbolRef sym :
|
for (SymbolRef sym :
|
||||||
FindSymbolsAtLocation(working_file, file, request->params.position)) {
|
FindSymbolsAtLocation(working_file, file, request->params.position)) {
|
||||||
// Found symbol. Return definition.
|
// Found symbol. Return definition.
|
||||||
|
has_symbol = true;
|
||||||
|
|
||||||
// Special cases which are handled:
|
// Special cases which are handled:
|
||||||
// - symbol has declaration but no definition (ie, pure virtual)
|
// - symbol has declaration but no definition (ie, pure virtual)
|
||||||
@ -121,6 +128,39 @@ struct TextDocumentDefinitionHandler
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Find the best match of the identifier at point.
|
||||||
|
if (!has_symbol && db->symbols.size()) {
|
||||||
|
const std::string& buffer = working_file->buffer_content;
|
||||||
|
int start = GetOffsetForPosition(request->params.position, buffer);
|
||||||
|
int end = start;
|
||||||
|
while (start > 0 && isalnum(buffer[start - 1]))
|
||||||
|
start--;
|
||||||
|
while (isalnum(buffer[end]))
|
||||||
|
end++;
|
||||||
|
auto query = std::string_view(buffer).substr(start, end - start);
|
||||||
|
|
||||||
|
int best_score = kMinScore;
|
||||||
|
int best_i = 0;
|
||||||
|
std::vector<int> score, dp;
|
||||||
|
for (int i = 0; i < (int)db->symbols.size(); ++i) {
|
||||||
|
std::string_view short_name = db->GetSymbolShortName(i);
|
||||||
|
if (short_name.size() > score.size()) {
|
||||||
|
score.resize(short_name.size());
|
||||||
|
dp.resize(short_name.size());
|
||||||
|
}
|
||||||
|
int t = FuzzyEvaluate(query, short_name, score, dp);
|
||||||
|
if (t > best_score) {
|
||||||
|
best_score = t;
|
||||||
|
best_i = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Maybe<Use> use = GetDefinitionSpellingOfSymbol(db, db->symbols[best_i]);
|
||||||
|
if (use) {
|
||||||
|
optional<lsLocation> ls_loc = GetLsLocation(db, working_files, *use);
|
||||||
|
if (ls_loc)
|
||||||
|
out.result.push_back(*ls_loc);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
QueueManager::WriteStdout(IpcId::TextDocumentDefinition, out);
|
QueueManager::WriteStdout(IpcId::TextDocumentDefinition, out);
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
#include "fuzzy_match.h"
|
||||||
#include "lex_utils.h"
|
#include "lex_utils.h"
|
||||||
#include "message_handler.h"
|
#include "message_handler.h"
|
||||||
#include "query_utils.h"
|
#include "query_utils.h"
|
||||||
@ -60,123 +61,6 @@ MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
|
|||||||
|
|
||||||
///// Fuzzy matching
|
///// Fuzzy matching
|
||||||
|
|
||||||
// Negative but far from INT_MIN so that intermediate results are hard to
|
|
||||||
// overflow
|
|
||||||
constexpr int kMinScore = INT_MIN / 2;
|
|
||||||
// Penalty of dropping a leading character in str
|
|
||||||
constexpr int kLeadingGapScore = -4;
|
|
||||||
// Penalty of dropping a non-leading character in str
|
|
||||||
constexpr int kGapScore = -5;
|
|
||||||
// Bonus of aligning with an initial character of a word in pattern. Must be
|
|
||||||
// greater than 1
|
|
||||||
constexpr int kPatternStartMultiplier = 2;
|
|
||||||
|
|
||||||
constexpr int kWordStartScore = 50;
|
|
||||||
constexpr int kNonWordScore = 40;
|
|
||||||
constexpr int kCaseMatchScore = 2;
|
|
||||||
|
|
||||||
// Less than kWordStartScore
|
|
||||||
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
|
|
||||||
// Slightly less than kConsecutiveScore
|
|
||||||
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;
|
|
||||||
|
|
||||||
enum class CharClass { Lower, Upper, Digit, NonWord };
|
|
||||||
|
|
||||||
static CharClass GetCharClass(int c) {
|
|
||||||
if (islower(c))
|
|
||||||
return CharClass::Lower;
|
|
||||||
if (isupper(c))
|
|
||||||
return CharClass::Upper;
|
|
||||||
if (isdigit(c))
|
|
||||||
return CharClass::Digit;
|
|
||||||
return CharClass::NonWord;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int GetScoreFor(CharClass prev, CharClass curr) {
|
|
||||||
if (prev == CharClass::NonWord && curr != CharClass::NonWord)
|
|
||||||
return kWordStartScore;
|
|
||||||
if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
|
|
||||||
(prev != CharClass::Digit && curr == CharClass::Digit))
|
|
||||||
return kCamelScore;
|
|
||||||
if (curr == CharClass::NonWord)
|
|
||||||
return kNonWordScore;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
fuzzyEvaluate implements a global sequence alignment algorithm to find the
|
|
||||||
maximum accumulated score by aligning `pattern` to `str`. It applies when
|
|
||||||
`pattern` is a subsequence of `str`.
|
|
||||||
|
|
||||||
Scoring criteria
|
|
||||||
- Prefer matches at the start of a word, or the start of subwords in
|
|
||||||
CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
|
|
||||||
- Non-word characters matter. See kNonWordScore
|
|
||||||
- The first characters of words of `pattern` receive bonus because they usually
|
|
||||||
have more significance than the rest. See kPatternStartMultiplier
|
|
||||||
- Superfluous characters in `str` will reduce the score (gap penalty). See
|
|
||||||
kGapScore
|
|
||||||
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
|
|
||||||
|
|
||||||
The recurrence of the dynamic programming:
|
|
||||||
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
|
|
||||||
dp[0][j] = leading_gap_penalty(0, j) + score[j]
|
|
||||||
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
|
|
||||||
gap_penalty(k+1, j) + score[j] : k < j))
|
|
||||||
The first dimension can be suppressed since we do not need a matching scheme,
|
|
||||||
which reduces the space complexity from O(N*M) to O(M)
|
|
||||||
*/
|
|
||||||
int FuzzyEvaluate(std::string_view pattern,
|
|
||||||
std::string_view str,
|
|
||||||
std::vector<int>& score,
|
|
||||||
std::vector<int>& dp) {
|
|
||||||
bool pfirst = true, // aligning the first character of pattern
|
|
||||||
pstart = true; // whether we are aligning the start of a word in pattern
|
|
||||||
int uleft = 0, // value of the upper left cell
|
|
||||||
ulefts = 0, // maximum value of uleft and cells on the left
|
|
||||||
left, lefts; // similar to uleft/ulefts, but for the next row
|
|
||||||
|
|
||||||
// Calculate position score for each character in str.
|
|
||||||
CharClass prev = CharClass::NonWord;
|
|
||||||
for (int i = 0; i < int(str.size()); i++) {
|
|
||||||
CharClass cur = GetCharClass(str[i]);
|
|
||||||
score[i] = GetScoreFor(prev, cur);
|
|
||||||
prev = cur;
|
|
||||||
}
|
|
||||||
std::fill_n(dp.begin(), str.size(), kMinScore);
|
|
||||||
|
|
||||||
// Align each character of pattern.
|
|
||||||
for (unsigned char pc : pattern) {
|
|
||||||
if (isspace(pc)) {
|
|
||||||
pstart = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
lefts = kMinScore;
|
|
||||||
// Enumerate the character in str to be aligned with pc.
|
|
||||||
for (int i = 0; i < int(str.size()); i++) {
|
|
||||||
left = dp[i];
|
|
||||||
lefts = std::max(lefts + kGapScore, left);
|
|
||||||
// Use lower() if case-insensitive
|
|
||||||
if (tolower(pc) == tolower(str[i])) {
|
|
||||||
int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
|
|
||||||
dp[i] = (pfirst ? kLeadingGapScore * i + t
|
|
||||||
: std::max(uleft + kConsecutiveScore, ulefts + t)) +
|
|
||||||
(pc == str[i] ? kCaseMatchScore : 0);
|
|
||||||
} else
|
|
||||||
dp[i] = kMinScore;
|
|
||||||
uleft = left;
|
|
||||||
ulefts = lefts;
|
|
||||||
}
|
|
||||||
pfirst = pstart = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Enumerate the end position of the match in str. Each removed trailing
|
|
||||||
// character has a penulty of kGapScore.
|
|
||||||
lefts = kMinScore;
|
|
||||||
for (int i = 0; i < int(str.size()); i++)
|
|
||||||
lefts = std::max(lefts + kGapScore, dp[i]);
|
|
||||||
return lefts;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
||||||
void Run(Ipc_WorkspaceSymbol* request) override {
|
void Run(Ipc_WorkspaceSymbol* request) override {
|
||||||
|
Loading…
Reference in New Issue
Block a user