mirror of
https://github.com/MaskRay/ccls.git
synced 2024-11-26 09:31:59 +00:00
[workspace/symbol] Sort candidates with a fuzzy matching algorithm (#182)
This commit is contained in:
parent
dce1365eb6
commit
24f428c670
@ -24,7 +24,7 @@ struct Config {
|
||||
bool logSkippedPathsForIndex = false;
|
||||
|
||||
// Maximum workspace search results.
|
||||
int maxWorkspaceSearchResults = 1000;
|
||||
int maxWorkspaceSearchResults = 200;
|
||||
|
||||
// Force a certain number of indexer threads. If less than 1 a default value
|
||||
// should be used.
|
||||
|
@ -1,3 +1,8 @@
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
#include "lex_utils.h"
|
||||
#include "message_handler.h"
|
||||
#include "query_utils.h"
|
||||
@ -7,29 +12,30 @@
|
||||
namespace {
|
||||
|
||||
// Lookup |symbol| in |db| and insert the value into |result|.
|
||||
void InsertSymbolIntoResult(QueryDatabase* db,
|
||||
bool InsertSymbolIntoResult(QueryDatabase* db,
|
||||
WorkingFiles* working_files,
|
||||
SymbolIdx symbol,
|
||||
std::vector<lsSymbolInformation>* result) {
|
||||
optional<lsSymbolInformation> info =
|
||||
GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
|
||||
if (!info)
|
||||
return;
|
||||
return false;
|
||||
|
||||
optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
|
||||
if (!location) {
|
||||
auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
|
||||
if (decls.empty())
|
||||
return;
|
||||
return false;
|
||||
location = decls[0];
|
||||
}
|
||||
|
||||
optional<lsLocation> ls_location =
|
||||
GetLsLocation(db, working_files, *location);
|
||||
if (!ls_location)
|
||||
return;
|
||||
return false;
|
||||
info->location = *ls_location;
|
||||
result->push_back(*info);
|
||||
return true;
|
||||
}
|
||||
|
||||
struct lsWorkspaceSymbolParams {
|
||||
@ -51,12 +57,119 @@ struct Out_WorkspaceSymbol : public lsOutMessage<Out_WorkspaceSymbol> {
|
||||
};
|
||||
MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
|
||||
|
||||
|
||||
///// Fuzzy matching
|
||||
|
||||
// Negative but far from INT_MIN so that intermediate results are hard to
|
||||
// overflow
|
||||
constexpr int kMinScore = INT_MIN / 2;
|
||||
// Penalty of dropping a leading character in str
|
||||
constexpr int kLeadingGapScore = -4;
|
||||
// Penalty of dropping a non-leading character in str
|
||||
constexpr int kGapScore = -5;
|
||||
// Bonus of aligning with an initial character of a word in pattern. Must be
|
||||
// greater than 1
|
||||
constexpr int kPatternStartMultiplier = 2;
|
||||
|
||||
constexpr int kWordStartScore = 100;
|
||||
constexpr int kNonWordScore = 90;
|
||||
|
||||
// Less than kWordStartScore
|
||||
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
|
||||
// Slightly less than kConsecutiveScore
|
||||
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;
|
||||
|
||||
enum class CharClass { Lower, Upper, Digit, NonWord };
|
||||
|
||||
static enum CharClass getCharClass(int c) {
|
||||
if (islower(c)) return CharClass::Lower;
|
||||
if (isupper(c)) return CharClass::Upper;
|
||||
if (isdigit(c)) return CharClass::Digit;
|
||||
return CharClass::NonWord;
|
||||
}
|
||||
|
||||
static int getScoreFor(CharClass prev, CharClass curr) {
|
||||
if (prev == CharClass::NonWord && curr != CharClass::NonWord)
|
||||
return kWordStartScore;
|
||||
if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
|
||||
(prev != CharClass::Digit && curr == CharClass::Digit))
|
||||
return kCamelScore;
|
||||
if (curr == CharClass::NonWord)
|
||||
return kNonWordScore;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
fuzzyEvaluate implements a global sequence alignment algorithm to find the maximum accumulated score by aligning `pattern` to `str`. It applies when `pattern` is a subsequence of `str`.
|
||||
|
||||
Scoring criteria
|
||||
- Prefer matches at the start of a word, or the start of subwords in CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
|
||||
- Non-word characters matter. See kNonWordScore
|
||||
- The first characters of words of `pattern` receive bonus because they usually have more significance than the rest. See kPatternStartMultiplier
|
||||
- Superfluous characters in `str` will reduce the score (gap penalty). See kGapScore
|
||||
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
|
||||
|
||||
The recurrence of the dynamic programming:
|
||||
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
|
||||
dp[0][j] = leading_gap_penalty(0, j) + score[j]
|
||||
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] + gap_penalty(k+1, j) + score[j] : k < j))
|
||||
The first dimension can be suppressed since we do not need a matching scheme, which reduces the space complexity from O(N*M) to O(M)
|
||||
*/
|
||||
int fuzzyEvaluate(const std::string& pattern,
|
||||
const std::string& str,
|
||||
std::vector<int>& score,
|
||||
std::vector<int>& dp) {
|
||||
bool pfirst = true, // aligning the first character of pattern
|
||||
pstart = true; // whether we are aligning the start of a word in pattern
|
||||
int uleft = 0, // value of the upper left cell
|
||||
ulefts = 0, // maximum value of uleft and cells on the left
|
||||
left, lefts; // similar to uleft/ulefts, but for the next row
|
||||
|
||||
// Calculate position score for each character in str.
|
||||
CharClass prev = CharClass::NonWord;
|
||||
for (int i = 0; i < int(str.size()); i++) {
|
||||
CharClass cur = getCharClass(str[i]);
|
||||
score[i] = getScoreFor(prev, cur);
|
||||
prev = cur;
|
||||
}
|
||||
std::fill_n(dp.begin(), str.size(), kMinScore);
|
||||
|
||||
// Align each character of pattern.
|
||||
for (unsigned char pc: pattern) {
|
||||
if (isspace(pc)) {
|
||||
pstart = true;
|
||||
continue;
|
||||
}
|
||||
lefts = kMinScore;
|
||||
// Enumerate the character in str to be aligned with pc.
|
||||
for (int i = 0; i < int(str.size()); i++) {
|
||||
left = dp[i];
|
||||
lefts = std::max(lefts + kGapScore, left);
|
||||
if (tolower(pc) == tolower(str[i])) {
|
||||
int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
|
||||
dp[i] = pfirst ? kLeadingGapScore * i + t
|
||||
: std::max(uleft + kConsecutiveScore, ulefts + t);
|
||||
} else
|
||||
dp[i] = kMinScore;
|
||||
uleft = left;
|
||||
ulefts = lefts;
|
||||
}
|
||||
pfirst = pstart = false;
|
||||
}
|
||||
|
||||
// Enumerate the end position of the match in str.
|
||||
lefts = kMinScore;
|
||||
for (int i = 0; i < int(str.size()); i++)
|
||||
// For function types, db->detailed_names may have trailing characters for
|
||||
// parameters. We do not want to penalize them.
|
||||
// If we use `short_name` instead of `detailed_name` for fuzzy matching, the
|
||||
// penulty kGapScore can be used.
|
||||
lefts = std::max(lefts /*+ kGapScore */, dp[i]);
|
||||
return lefts;
|
||||
}
|
||||
|
||||
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
||||
void Run(Ipc_WorkspaceSymbol* request) override {
|
||||
// TODO: implement fuzzy search, see
|
||||
// https://github.com/junegunn/fzf/blob/master/src/matcher.go for
|
||||
// inspiration
|
||||
|
||||
Out_WorkspaceSymbol out;
|
||||
out.id = request->id;
|
||||
|
||||
@ -66,7 +179,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
||||
std::string query = request->params.query;
|
||||
|
||||
std::unordered_set<std::string> inserted_results;
|
||||
// db->detailed_names indices of each lsSymbolInformation in out.result
|
||||
std::vector<int> result_indices;
|
||||
inserted_results.reserve(config->maxWorkspaceSearchResults);
|
||||
result_indices.reserve(config->maxWorkspaceSearchResults);
|
||||
|
||||
for (int i = 0; i < db->detailed_names.size(); ++i) {
|
||||
if (db->detailed_names[i].find(query) != std::string::npos) {
|
||||
@ -74,26 +190,56 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
||||
if (!inserted_results.insert(db->detailed_names[i]).second)
|
||||
continue;
|
||||
|
||||
InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result);
|
||||
if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
|
||||
result_indices.push_back(i);
|
||||
if (out.result.size() >= config->maxWorkspaceSearchResults)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (out.result.size() < config->maxWorkspaceSearchResults) {
|
||||
std::string query_without_space;
|
||||
query_without_space.reserve(query.size());
|
||||
for (char c: query)
|
||||
if (!isspace(c))
|
||||
query_without_space += c;
|
||||
|
||||
for (int i = 0; i < db->detailed_names.size(); ++i) {
|
||||
if (SubstringMatch(query, db->detailed_names[i])) {
|
||||
if (SubstringMatch(query_without_space, db->detailed_names[i])) {
|
||||
// Do not show the same entry twice.
|
||||
if (!inserted_results.insert(db->detailed_names[i]).second)
|
||||
continue;
|
||||
|
||||
InsertSymbolIntoResult(db, working_files, db->symbols[i],
|
||||
&out.result);
|
||||
if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
|
||||
result_indices.push_back(i);
|
||||
if (out.result.size() >= config->maxWorkspaceSearchResults)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (out.result.size() < config->maxWorkspaceSearchResults) {
|
||||
int longest = 0;
|
||||
for (int i: result_indices)
|
||||
longest = std::max(longest, int(db->detailed_names[i].size()));
|
||||
|
||||
std::vector<int> score(longest), // score for each position
|
||||
dp(longest); // dp[i]: maximum value by aligning pattern[0..pi] to str[0..si]
|
||||
std::vector<std::pair<int, int>> permutation(result_indices.size());
|
||||
for (int i = 0; i < int(result_indices.size()); i++) {
|
||||
permutation[i] = {
|
||||
fuzzyEvaluate(query, db->detailed_names[result_indices[i]], score,
|
||||
dp),
|
||||
i};
|
||||
}
|
||||
std::sort(permutation.begin(), permutation.end(),
|
||||
std::greater<std::pair<int, int>>());
|
||||
for (int i = 0; i < int(result_indices.size()); i++)
|
||||
if (i != permutation[i].second)
|
||||
std::swap(out.result[i], out.result[permutation[i].second]);
|
||||
}
|
||||
|
||||
LOG_S(INFO) << "[querydb] Found " << out.result.size()
|
||||
<< " results for query " << query;
|
||||
|
Loading…
Reference in New Issue
Block a user