[workspace/symbol] Sort candidates with a fuzzy matching algorithm (#182)

This commit is contained in:
Fangrui Song 2017-12-23 19:23:29 -08:00 committed by GitHub
parent dce1365eb6
commit 24f428c670
2 changed files with 172 additions and 26 deletions

View File

@ -24,7 +24,7 @@ struct Config {
bool logSkippedPathsForIndex = false; bool logSkippedPathsForIndex = false;
// Maximum workspace search results. // Maximum workspace search results.
int maxWorkspaceSearchResults = 1000; int maxWorkspaceSearchResults = 200;
// Force a certain number of indexer threads. If less than 1 a default value // Force a certain number of indexer threads. If less than 1 a default value
// should be used. // should be used.

View File

@ -1,3 +1,8 @@
#include <ctype.h>
#include <limits.h>
#include <algorithm>
#include <functional>
#include "lex_utils.h" #include "lex_utils.h"
#include "message_handler.h" #include "message_handler.h"
#include "query_utils.h" #include "query_utils.h"
@ -7,29 +12,30 @@
namespace { namespace {
// Lookup |symbol| in |db| and insert the value into |result|. // Lookup |symbol| in |db| and insert the value into |result|.
void InsertSymbolIntoResult(QueryDatabase* db, bool InsertSymbolIntoResult(QueryDatabase* db,
WorkingFiles* working_files, WorkingFiles* working_files,
SymbolIdx symbol, SymbolIdx symbol,
std::vector<lsSymbolInformation>* result) { std::vector<lsSymbolInformation>* result) {
optional<lsSymbolInformation> info = optional<lsSymbolInformation> info =
GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/); GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
if (!info) if (!info)
return; return false;
optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol); optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
if (!location) { if (!location) {
auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol); auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
if (decls.empty()) if (decls.empty())
return; return false;
location = decls[0]; location = decls[0];
} }
optional<lsLocation> ls_location = optional<lsLocation> ls_location =
GetLsLocation(db, working_files, *location); GetLsLocation(db, working_files, *location);
if (!ls_location) if (!ls_location)
return; return false;
info->location = *ls_location; info->location = *ls_location;
result->push_back(*info); result->push_back(*info);
return true;
} }
struct lsWorkspaceSymbolParams { struct lsWorkspaceSymbolParams {
@ -51,12 +57,119 @@ struct Out_WorkspaceSymbol : public lsOutMessage<Out_WorkspaceSymbol> {
}; };
MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result); MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
///// Fuzzy matching
// Sentinel for "no valid alignment". It is strongly negative, yet only half of
// INT_MIN so that adding a few bonuses or penalties to it does not overflow.
constexpr int kMinScore = INT_MIN / 2;

// Gap penalties (per skipped character of str).
// - before the first aligned character:
constexpr int kLeadingGapScore = -4;
// - between/after aligned characters:
constexpr int kGapScore = -5;

// Position bonuses.
// - the character starts a word:
constexpr int kWordStartScore = 100;
// - the character is itself a non-word character:
constexpr int kNonWordScore = 90;
// - the character directly follows the previously aligned one. Kept below
//   kWordStartScore.
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// - the character starts a camelCase hump or a digit run. Kept just below
//   kConsecutiveScore.
constexpr int kCamelScore = kConsecutiveScore - 1;

// Factor applied to the position bonus when the pattern character being
// aligned is the first character of a word in pattern. Must be greater than 1.
constexpr int kPatternStartMultiplier = 2;
// Coarse character categories used to detect word and sub-word boundaries.
enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| as lowercase letter, uppercase letter, digit or anything else
// (NonWord), according to the <ctype.h> predicates.
//
// |c| is normalised through unsigned char first: callers pass plain `char`
// values, which are negative for non-ASCII bytes when `char` is signed, and
// the <ctype.h> predicates have undefined behaviour for such arguments.
static CharClass getCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}
// Returns the position bonus for a character of class |curr| that is preceded
// by a character of class |prev|:
// - kNonWordScore   if the character itself is a non-word character,
// - kWordStartScore if it is a word character right after a non-word one,
// - kCamelScore     if it is an uppercase letter after a lowercase one, or a
//                   digit after a non-digit,
// - 0               otherwise.
static int getScoreFor(CharClass prev, CharClass curr) {
  if (curr == CharClass::NonWord)
    return kNonWordScore;

  // From here on |curr| is a word character.
  if (prev == CharClass::NonWord)
    return kWordStartScore;

  const bool camel_hump =
      prev == CharClass::Lower && curr == CharClass::Upper;
  const bool digit_run_start =
      curr == CharClass::Digit && prev != CharClass::Digit;
  return (camel_hump || digit_run_start) ? kCamelScore : 0;
}
/*
fuzzyEvaluate implements a global sequence alignment algorithm to find the
maximum accumulated score by aligning `pattern` to `str`. It is meant to be
used when `pattern` (ignoring whitespace and letter case) is a subsequence of
`str`; otherwise the result stays close to kMinScore.

Scoring criteria
- Prefer matches at the start of a word, or the start of subwords in
  CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
- Non-word characters matter. See kNonWordScore
- The first characters of words of `pattern` receive a bonus because they
  usually have more significance than the rest. See kPatternStartMultiplier
- Superfluous characters in `str` reduce the score (gap penalty). See
  kGapScore
- Prefer early occurrence of the first character. See
  kLeadingGapScore/kGapScore

The recurrence of the dynamic programming:
  dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j],
            with pattern[i] aligned to str[j]
  dp[0][j] = leading_gap_penalty(0, j) + score[j]
  dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE,
                 max(dp[i-1][k] + gap_penalty(k+1, j) + score[j] : k < j))
The first dimension is suppressed since we do not need a matching scheme,
which reduces the space complexity from O(N*M) to O(M).

Parameters
- pattern: the query. Whitespace characters are not aligned; they only mark
  the next character as the start of a pattern word.
- str: the candidate string. Comparison is case-insensitive.
- score, dp: scratch buffers indexed by position in `str`. They are
  overwritten, and grown if they are shorter than `str`. Passing them in lets
  a caller reuse the allocations across many candidates.

Returns the best score over all end positions in `str`. Trailing characters of
`str` after the last aligned character are not penalized. Returns kMinScore if
`str` is empty or `pattern` has no non-whitespace character.
*/
int fuzzyEvaluate(const std::string& pattern,
                  const std::string& str,
                  std::vector<int>& score,
                  std::vector<int>& dp) {
  const int n = int(str.size());

  // Both buffers are written at indices [0, n); make sure that is in bounds.
  if (score.size() < str.size())
    score.resize(str.size());
  if (dp.size() < str.size())
    dp.resize(str.size());

  bool pfirst = true,  // aligning the first character of pattern
      pstart = true;   // whether we are aligning the start of a word in pattern

  // Calculate position score for each character in str. The byte is converted
  // to unsigned char because <ctype.h> functions are undefined for negative
  // arguments (non-ASCII bytes when `char` is signed).
  CharClass prev = CharClass::NonWord;
  for (int i = 0; i < n; i++) {
    CharClass cur = getCharClass(static_cast<unsigned char>(str[i]));
    score[i] = getScoreFor(prev, cur);
    prev = cur;
  }
  std::fill_n(dp.begin(), str.size(), kMinScore);

  // Align each character of pattern. Each iteration turns the previous row of
  // the DP table (stored in dp) into the next one, in place.
  for (unsigned char pc : pattern) {
    if (isspace(pc)) {
      pstart = true;
      continue;
    }
    const int pc_lower = tolower(pc);

    // There is no cell to the upper left of column 0, so every row starts from
    // "no valid alignment". Without this reset, values left over from the end
    // of the previous row would leak into dp[0] and let a pattern character be
    // skipped.
    int uleft = kMinScore,   // value of the upper left cell
        ulefts = kMinScore;  // maximum value of uleft and cells on its left,
                             // with gap penalties applied
    int lefts = kMinScore;   // same as ulefts, but one column further right

    // Enumerate the character in str to be aligned with pc.
    for (int i = 0; i < n; i++) {
      const int left = dp[i];  // previous row's value, about to be overwritten
      lefts = std::max(lefts + kGapScore, left);
      if (pc_lower == tolower(static_cast<unsigned char>(str[i]))) {
        int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
        dp[i] = pfirst ? kLeadingGapScore * i + t
                       : std::max(uleft + kConsecutiveScore, ulefts + t);
      } else {
        dp[i] = kMinScore;
      }
      uleft = left;
      ulefts = lefts;
    }
    pfirst = pstart = false;
  }

  // Enumerate the end position of the match in str.
  // For function types, db->detailed_names may have trailing characters for
  // parameters. We do not want to penalize them.
  // If we use `short_name` instead of `detailed_name` for fuzzy matching, the
  // penalty kGapScore can be applied here (best + kGapScore) as well.
  int best = kMinScore;
  for (int i = 0; i < n; i++)
    best = std::max(best, dp[i]);
  return best;
}
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> { struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
void Run(Ipc_WorkspaceSymbol* request) override { void Run(Ipc_WorkspaceSymbol* request) override {
// TODO: implement fuzzy search, see
// https://github.com/junegunn/fzf/blob/master/src/matcher.go for
// inspiration
Out_WorkspaceSymbol out; Out_WorkspaceSymbol out;
out.id = request->id; out.id = request->id;
@ -66,7 +179,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
std::string query = request->params.query; std::string query = request->params.query;
std::unordered_set<std::string> inserted_results; std::unordered_set<std::string> inserted_results;
// db->detailed_names indices of each lsSymbolInformation in out.result
std::vector<int> result_indices;
inserted_results.reserve(config->maxWorkspaceSearchResults); inserted_results.reserve(config->maxWorkspaceSearchResults);
result_indices.reserve(config->maxWorkspaceSearchResults);
for (int i = 0; i < db->detailed_names.size(); ++i) { for (int i = 0; i < db->detailed_names.size(); ++i) {
if (db->detailed_names[i].find(query) != std::string::npos) { if (db->detailed_names[i].find(query) != std::string::npos) {
@ -74,26 +190,56 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
if (!inserted_results.insert(db->detailed_names[i]).second) if (!inserted_results.insert(db->detailed_names[i]).second)
continue; continue;
InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result); if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
result_indices.push_back(i);
if (out.result.size() >= config->maxWorkspaceSearchResults) if (out.result.size() >= config->maxWorkspaceSearchResults)
break; break;
} }
} }
}
if (out.result.size() < config->maxWorkspaceSearchResults) { if (out.result.size() < config->maxWorkspaceSearchResults) {
std::string query_without_space;
query_without_space.reserve(query.size());
for (char c: query)
if (!isspace(c))
query_without_space += c;
for (int i = 0; i < db->detailed_names.size(); ++i) { for (int i = 0; i < db->detailed_names.size(); ++i) {
if (SubstringMatch(query, db->detailed_names[i])) { if (SubstringMatch(query_without_space, db->detailed_names[i])) {
// Do not show the same entry twice. // Do not show the same entry twice.
if (!inserted_results.insert(db->detailed_names[i]).second) if (!inserted_results.insert(db->detailed_names[i]).second)
continue; continue;
InsertSymbolIntoResult(db, working_files, db->symbols[i], if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
&out.result); result_indices.push_back(i);
if (out.result.size() >= config->maxWorkspaceSearchResults) if (out.result.size() >= config->maxWorkspaceSearchResults)
break; break;
} }
} }
} }
}
if (out.result.size() < config->maxWorkspaceSearchResults) {
int longest = 0;
for (int i: result_indices)
longest = std::max(longest, int(db->detailed_names[i].size()));
std::vector<int> score(longest), // score for each position
dp(longest); // dp[i]: maximum value by aligning pattern[0..pi] to str[0..si]
std::vector<std::pair<int, int>> permutation(result_indices.size());
for (int i = 0; i < int(result_indices.size()); i++) {
permutation[i] = {
fuzzyEvaluate(query, db->detailed_names[result_indices[i]], score,
dp),
i};
}
std::sort(permutation.begin(), permutation.end(),
std::greater<std::pair<int, int>>());
for (int i = 0; i < int(result_indices.size()); i++)
if (i != permutation[i].second)
std::swap(out.result[i], out.result[permutation[i].second]);
}
LOG_S(INFO) << "[querydb] Found " << out.result.size() LOG_S(INFO) << "[querydb] Found " << out.result.size()
<< " results for query " << query; << " results for query " << query;