mirror of
https://github.com/MaskRay/ccls.git
synced 2024-11-22 23:55:08 +00:00
[workspace/symbol] Sort candidates with a fuzzy matching algorithm (#182)
This commit is contained in:
parent
dce1365eb6
commit
24f428c670
@ -24,7 +24,7 @@ struct Config {
|
|||||||
bool logSkippedPathsForIndex = false;
|
bool logSkippedPathsForIndex = false;
|
||||||
|
|
||||||
// Maximum workspace search results.
|
// Maximum workspace search results.
|
||||||
int maxWorkspaceSearchResults = 1000;
|
int maxWorkspaceSearchResults = 200;
|
||||||
|
|
||||||
// Force a certain number of indexer threads. If less than 1 a default value
|
// Force a certain number of indexer threads. If less than 1 a default value
|
||||||
// should be used.
|
// should be used.
|
||||||
|
@ -1,3 +1,8 @@
|
|||||||
|
#include <ctype.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
#include "lex_utils.h"
|
#include "lex_utils.h"
|
||||||
#include "message_handler.h"
|
#include "message_handler.h"
|
||||||
#include "query_utils.h"
|
#include "query_utils.h"
|
||||||
@ -7,29 +12,30 @@
|
|||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
// Lookup |symbol| in |db| and insert the value into |result|.
|
// Lookup |symbol| in |db| and insert the value into |result|.
|
||||||
void InsertSymbolIntoResult(QueryDatabase* db,
|
bool InsertSymbolIntoResult(QueryDatabase* db,
|
||||||
WorkingFiles* working_files,
|
WorkingFiles* working_files,
|
||||||
SymbolIdx symbol,
|
SymbolIdx symbol,
|
||||||
std::vector<lsSymbolInformation>* result) {
|
std::vector<lsSymbolInformation>* result) {
|
||||||
optional<lsSymbolInformation> info =
|
optional<lsSymbolInformation> info =
|
||||||
GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
|
GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
|
||||||
if (!info)
|
if (!info)
|
||||||
return;
|
return false;
|
||||||
|
|
||||||
optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
|
optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
|
||||||
if (!location) {
|
if (!location) {
|
||||||
auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
|
auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
|
||||||
if (decls.empty())
|
if (decls.empty())
|
||||||
return;
|
return false;
|
||||||
location = decls[0];
|
location = decls[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
optional<lsLocation> ls_location =
|
optional<lsLocation> ls_location =
|
||||||
GetLsLocation(db, working_files, *location);
|
GetLsLocation(db, working_files, *location);
|
||||||
if (!ls_location)
|
if (!ls_location)
|
||||||
return;
|
return false;
|
||||||
info->location = *ls_location;
|
info->location = *ls_location;
|
||||||
result->push_back(*info);
|
result->push_back(*info);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct lsWorkspaceSymbolParams {
|
struct lsWorkspaceSymbolParams {
|
||||||
@ -51,12 +57,119 @@ struct Out_WorkspaceSymbol : public lsOutMessage<Out_WorkspaceSymbol> {
|
|||||||
};
|
};
|
||||||
MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
|
MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
|
||||||
|
|
||||||
|
|
||||||
|
///// Fuzzy matching

// Negative but far from INT_MIN so that intermediate results are hard to
// overflow.
constexpr int kMinScore = INT_MIN / 2;
// Penalty of dropping a leading character in str.
constexpr int kLeadingGapScore = -4;
// Penalty of dropping a non-leading character in str.
constexpr int kGapScore = -5;
// Bonus of aligning with an initial character of a word in pattern. Must be
// greater than 1.
constexpr int kPatternStartMultiplier = 2;

// Position score of a word character that directly follows a non-word one.
constexpr int kWordStartScore = 100;
// Position score of a non-word character.
constexpr int kNonWordScore = 90;

// Less than kWordStartScore.
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// Slightly less than kConsecutiveScore.
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;

enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| as lower-case letter, upper-case letter, digit or anything
// else (NonWord).
//
// |c| is reduced to `unsigned char` before it reaches the <ctype.h>
// predicates, so a plain (possibly negative) `char` holding a non-ASCII byte
// may be passed safely.
static CharClass getCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}

// Returns the position score of a character of class |curr| that is preceded
// by a character of class |prev|:
//   - kWordStartScore: word character right after a non-word character
//   - kCamelScore:     lower -> upper transition, or non-digit -> digit
//   - kNonWordScore:   any non-word character
//   - 0:               everything else (the inside of a word)
static int getScoreFor(CharClass prev, CharClass curr) {
  if (prev == CharClass::NonWord && curr != CharClass::NonWord)
    return kWordStartScore;
  if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
      (prev != CharClass::Digit && curr == CharClass::Digit))
    return kCamelScore;
  if (curr == CharClass::NonWord)
    return kNonWordScore;
  return 0;
}

/*
fuzzyEvaluate implements a global sequence alignment algorithm to find the
maximum accumulated score by aligning `pattern` to `str`. It applies when
`pattern` is a subsequence of `str`; otherwise the result stays close to
kMinScore (a large negative value).

Scoring criteria
- Prefer matches at the start of a word, or the start of subwords in
  CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
- Non-word characters matter. See kNonWordScore
- The first characters of words of `pattern` receive bonus because they
  usually have more significance than the rest. See kPatternStartMultiplier
- Superfluous characters in `str` will reduce the score (gap penalty). See
  kGapScore
- Prefer early occurrence of the first character. See
  kLeadingGapScore/kGapScore

The recurrence of the dynamic programming:
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
dp[0][j] = leading_gap_penalty(0, j) + score[j]
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE,
               max(dp[i-1][k] + gap_penalty(k+1, j) + score[j] : k < j))
The first dimension can be suppressed since we do not need a matching scheme,
which reduces the space complexity from O(N*M) to O(M)

Parameters
- pattern: the query. Whitespace characters are not aligned; they only mark
  the next character as the start of a word of the pattern.
- str: the candidate string. Comparison is case-insensitive (tolower).
- score, dp: scratch buffers that are overwritten. They are grown to
  str.size() when shorter, so they can be reused across calls.

Returns the best score over all end positions of the match in `str`.
*/
int fuzzyEvaluate(const std::string& pattern,
                  const std::string& str,
                  std::vector<int>& score,
                  std::vector<int>& dp) {
  const int n = int(str.size());

  // Never write past the end of a scratch buffer that is too small.
  if (score.size() < str.size())
    score.resize(str.size());
  if (dp.size() < str.size())
    dp.resize(str.size());

  bool pfirst = true,  // aligning the first character of pattern
       pstart = true;  // whether we are aligning the start of a word in
                       // pattern

  // Calculate position score for each character in str.
  CharClass prev = CharClass::NonWord;
  for (int i = 0; i < n; i++) {
    const CharClass cur = getCharClass(static_cast<unsigned char>(str[i]));
    score[i] = getScoreFor(prev, cur);
    prev = cur;
  }
  std::fill_n(dp.begin(), str.size(), kMinScore);

  // Align each character of pattern.
  for (unsigned char pc : pattern) {
    if (isspace(pc)) {
      pstart = true;
      continue;
    }
    const int lowered_pc = tolower(pc);

    // Column 0 has no upper-left neighbour, so every row must start from
    // kMinScore. Carrying the values over from the end of the previous row
    // would let this character match str[0] as if it followed an earlier
    // alignment, which inflates the score.
    int uleft = kMinScore,   // value of the upper left cell
        ulefts = kMinScore,  // maximum value of uleft and cells on the left
        lefts = kMinScore;   // similar to ulefts, but for the next column

    // Enumerate the character in str to be aligned with pc.
    for (int i = 0; i < n; i++) {
      const int left = dp[i];  // previous row's value, read before overwrite
      lefts = std::max(lefts + kGapScore, left);
      if (lowered_pc == tolower(static_cast<unsigned char>(str[i]))) {
        const int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
        dp[i] = pfirst ? kLeadingGapScore * i + t
                       : std::max(uleft + kConsecutiveScore, ulefts + t);
      } else {
        dp[i] = kMinScore;
      }
      uleft = left;
      ulefts = lefts;
    }
    pfirst = pstart = false;
  }

  // Enumerate the end position of the match in str.
  // For function types, db->detailed_names may have trailing characters for
  // parameters. We do not want to penalize them.
  // If we use `short_name` instead of `detailed_name` for fuzzy matching, the
  // penalty kGapScore can be used.
  int best = kMinScore;
  for (int i = 0; i < n; i++)
    best = std::max(best /*+ kGapScore */, dp[i]);
  return best;
}
|
||||||
|
|
||||||
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
||||||
void Run(Ipc_WorkspaceSymbol* request) override {
|
void Run(Ipc_WorkspaceSymbol* request) override {
|
||||||
// TODO: implement fuzzy search, see
|
|
||||||
// https://github.com/junegunn/fzf/blob/master/src/matcher.go for
|
|
||||||
// inspiration
|
|
||||||
|
|
||||||
Out_WorkspaceSymbol out;
|
Out_WorkspaceSymbol out;
|
||||||
out.id = request->id;
|
out.id = request->id;
|
||||||
|
|
||||||
@ -66,7 +179,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
|||||||
std::string query = request->params.query;
|
std::string query = request->params.query;
|
||||||
|
|
||||||
std::unordered_set<std::string> inserted_results;
|
std::unordered_set<std::string> inserted_results;
|
||||||
|
// db->detailed_names indices of each lsSymbolInformation in out.result
|
||||||
|
std::vector<int> result_indices;
|
||||||
inserted_results.reserve(config->maxWorkspaceSearchResults);
|
inserted_results.reserve(config->maxWorkspaceSearchResults);
|
||||||
|
result_indices.reserve(config->maxWorkspaceSearchResults);
|
||||||
|
|
||||||
for (int i = 0; i < db->detailed_names.size(); ++i) {
|
for (int i = 0; i < db->detailed_names.size(); ++i) {
|
||||||
if (db->detailed_names[i].find(query) != std::string::npos) {
|
if (db->detailed_names[i].find(query) != std::string::npos) {
|
||||||
@ -74,26 +190,56 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
|||||||
if (!inserted_results.insert(db->detailed_names[i]).second)
|
if (!inserted_results.insert(db->detailed_names[i]).second)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result);
|
if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
|
||||||
|
result_indices.push_back(i);
|
||||||
if (out.result.size() >= config->maxWorkspaceSearchResults)
|
if (out.result.size() >= config->maxWorkspaceSearchResults)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (out.result.size() < config->maxWorkspaceSearchResults) {
|
if (out.result.size() < config->maxWorkspaceSearchResults) {
|
||||||
|
std::string query_without_space;
|
||||||
|
query_without_space.reserve(query.size());
|
||||||
|
for (char c: query)
|
||||||
|
if (!isspace(c))
|
||||||
|
query_without_space += c;
|
||||||
|
|
||||||
for (int i = 0; i < db->detailed_names.size(); ++i) {
|
for (int i = 0; i < db->detailed_names.size(); ++i) {
|
||||||
if (SubstringMatch(query, db->detailed_names[i])) {
|
if (SubstringMatch(query_without_space, db->detailed_names[i])) {
|
||||||
// Do not show the same entry twice.
|
// Do not show the same entry twice.
|
||||||
if (!inserted_results.insert(db->detailed_names[i]).second)
|
if (!inserted_results.insert(db->detailed_names[i]).second)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
InsertSymbolIntoResult(db, working_files, db->symbols[i],
|
if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
|
||||||
&out.result);
|
result_indices.push_back(i);
|
||||||
if (out.result.size() >= config->maxWorkspaceSearchResults)
|
if (out.result.size() >= config->maxWorkspaceSearchResults)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (out.result.size() < config->maxWorkspaceSearchResults) {
|
||||||
|
int longest = 0;
|
||||||
|
for (int i: result_indices)
|
||||||
|
longest = std::max(longest, int(db->detailed_names[i].size()));
|
||||||
|
|
||||||
|
std::vector<int> score(longest), // score for each position
|
||||||
|
dp(longest); // dp[i]: maximum value by aligning pattern[0..pi] to str[0..si]
|
||||||
|
std::vector<std::pair<int, int>> permutation(result_indices.size());
|
||||||
|
for (int i = 0; i < int(result_indices.size()); i++) {
|
||||||
|
permutation[i] = {
|
||||||
|
fuzzyEvaluate(query, db->detailed_names[result_indices[i]], score,
|
||||||
|
dp),
|
||||||
|
i};
|
||||||
|
}
|
||||||
|
std::sort(permutation.begin(), permutation.end(),
|
||||||
|
std::greater<std::pair<int, int>>());
|
||||||
|
for (int i = 0; i < int(result_indices.size()); i++)
|
||||||
|
if (i != permutation[i].second)
|
||||||
|
std::swap(out.result[i], out.result[permutation[i].second]);
|
||||||
|
}
|
||||||
|
|
||||||
LOG_S(INFO) << "[querydb] Found " << out.result.size()
|
LOG_S(INFO) << "[querydb] Found " << out.result.size()
|
||||||
<< " results for query " << query;
|
<< " results for query " << query;
|
||||||
|
Loading…
Reference in New Issue
Block a user