diff --git a/src/config.h b/src/config.h index 316cbfc3..b0776c3d 100644 --- a/src/config.h +++ b/src/config.h @@ -24,7 +24,7 @@ struct Config { bool logSkippedPathsForIndex = false; // Maximum workspace search results. - int maxWorkspaceSearchResults = 1000; + int maxWorkspaceSearchResults = 200; // Force a certain number of indexer threads. If less than 1 a default value // should be used. @@ -109,4 +109,4 @@ MAKE_REFLECT_STRUCT(Config, enableSnippetInsertion); // Expected client version. We show an error if this doesn't match. -constexpr const int kExpectedClientVersion = 3; \ No newline at end of file +constexpr const int kExpectedClientVersion = 3; diff --git a/src/messages/workspace_symbol.cc b/src/messages/workspace_symbol.cc index 9fb51f0f..c3197e11 100644 --- a/src/messages/workspace_symbol.cc +++ b/src/messages/workspace_symbol.cc @@ -1,3 +1,8 @@ +#include +#include +#include +#include + #include "lex_utils.h" #include "message_handler.h" #include "query_utils.h" @@ -7,29 +12,30 @@ namespace { // Lookup |symbol| in |db| and insert the value into |result|. -void InsertSymbolIntoResult(QueryDatabase* db, +bool InsertSymbolIntoResult(QueryDatabase* db, WorkingFiles* working_files, SymbolIdx symbol, std::vector* result) { optional info = GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/); if (!info) - return; + return false; optional location = GetDefinitionExtentOfSymbol(db, symbol); if (!location) { auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol); if (decls.empty()) - return; + return false; location = decls[0]; } optional ls_location = GetLsLocation(db, working_files, *location); if (!ls_location) - return; + return false; info->location = *ls_location; result->push_back(*info); + return true; } struct lsWorkspaceSymbolParams { @@ -51,12 +57,119 @@ struct Out_WorkspaceSymbol : public lsOutMessage { }; MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result); + +///// Fuzzy matching + +// Negative but far from INT_MIN so that intermediate results are hard to +// overflow +constexpr int kMinScore = INT_MIN / 2; +// Penalty of dropping a leading character in str +constexpr int kLeadingGapScore = -4; +// Penalty of dropping a non-leading character in str +constexpr int kGapScore = -5; +// Bonus of aligning with an initial character of a word in pattern. Must be +// greater than 1 +constexpr int kPatternStartMultiplier = 2; + +constexpr int kWordStartScore = 100; +constexpr int kNonWordScore = 90; + +// Less than kWordStartScore +constexpr int kConsecutiveScore = kWordStartScore + kGapScore; +// Slightly less than kConsecutiveScore +constexpr int kCamelScore = kWordStartScore + kGapScore - 1; + +enum class CharClass { Lower, Upper, Digit, NonWord }; + +static enum CharClass getCharClass(int c) { + if (islower(c)) return CharClass::Lower; + if (isupper(c)) return CharClass::Upper; + if (isdigit(c)) return CharClass::Digit; + return CharClass::NonWord; +} + +static int getScoreFor(CharClass prev, CharClass curr) { + if (prev == CharClass::NonWord && curr != CharClass::NonWord) + return kWordStartScore; + if ((prev == CharClass::Lower && curr == CharClass::Upper) || + (prev != CharClass::Digit && curr == CharClass::Digit)) + return kCamelScore; + if (curr == CharClass::NonWord) + return kNonWordScore; + return 0; +} + +/* +fuzzyEvaluate implements a global sequence alignment algorithm to find the maximum accumulated score by aligning `pattern` to `str`. It applies when `pattern` is a subsequence of `str`. + +Scoring criteria +- Prefer matches at the start of a word, or the start of subwords in CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore +- Non-word characters matter. See kNonWordScore +- The first characters of words of `pattern` receive bonus because they usually have more significance than the rest. See kPatternStartMultiplier +- Superfluous characters in `str` will reduce the score (gap penalty). See kGapScore +- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore + +The recurrence of the dynamic programming: +dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j] +dp[0][j] = leading_gap_penalty(0, j) + score[j] +dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] + gap_penalty(k+1, j) + score[j] : k < j)) +The first dimension can be suppressed since we do not need a matching scheme, which reduces the space complexity from O(N*M) to O(M) +*/ +int fuzzyEvaluate(const std::string& pattern, + const std::string& str, + std::vector& score, + std::vector& dp) { + bool pfirst = true, // aligning the first character of pattern + pstart = true; // whether we are aligning the start of a word in pattern + int uleft = 0, // value of the upper left cell + ulefts = 0, // maximum value of uleft and cells on the left + left, lefts; // similar to uleft/ulefts, but for the next row + + // Calculate position score for each character in str. + CharClass prev = CharClass::NonWord; + for (int i = 0; i < int(str.size()); i++) { + CharClass cur = getCharClass(str[i]); + score[i] = getScoreFor(prev, cur); + prev = cur; + } + std::fill_n(dp.begin(), str.size(), kMinScore); + + // Align each character of pattern. + for (unsigned char pc: pattern) { + if (isspace(pc)) { + pstart = true; + continue; + } + lefts = kMinScore; + // Enumerate the character in str to be aligned with pc. + for (int i = 0; i < int(str.size()); i++) { + left = dp[i]; + lefts = std::max(lefts + kGapScore, left); + if (tolower(pc) == tolower(str[i])) { + int t = score[i] * (pstart ? kPatternStartMultiplier : 1); + dp[i] = pfirst ? kLeadingGapScore * i + t + : std::max(uleft + kConsecutiveScore, ulefts + t); + } else + dp[i] = kMinScore; + uleft = left; + ulefts = lefts; + } + pfirst = pstart = false; + } + + // Enumerate the end position of the match in str. + lefts = kMinScore; + for (int i = 0; i < int(str.size()); i++) + // For function types, db->detailed_names may have trailing characters for + // parameters. We do not want to penalize them. + // If we use `short_name` instead of `detailed_name` for fuzzy matching, the + // penulty kGapScore can be used. + lefts = std::max(lefts /*+ kGapScore */, dp[i]); + return lefts; +} + struct WorkspaceSymbolHandler : BaseMessageHandler { void Run(Ipc_WorkspaceSymbol* request) override { - // TODO: implement fuzzy search, see - // https://github.com/junegunn/fzf/blob/master/src/matcher.go for - // inspiration - Out_WorkspaceSymbol out; out.id = request->id; @@ -66,7 +179,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler { std::string query = request->params.query; std::unordered_set inserted_results; + // db->detailed_names indices of each lsSymbolInformation in out.result + std::vector result_indices; inserted_results.reserve(config->maxWorkspaceSearchResults); + result_indices.reserve(config->maxWorkspaceSearchResults); for (int i = 0; i < db->detailed_names.size(); ++i) { if (db->detailed_names[i].find(query) != std::string::npos) { @@ -74,31 +190,61 @@ struct WorkspaceSymbolHandler : BaseMessageHandler { if (!inserted_results.insert(db->detailed_names[i]).second) continue; - InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result); - if (out.result.size() >= config->maxWorkspaceSearchResults) - break; - } - } - - if (out.result.size() < config->maxWorkspaceSearchResults) { - for (int i = 0; i < db->detailed_names.size(); ++i) { - if (SubstringMatch(query, db->detailed_names[i])) { - // Do not show the same entry twice. - if (!inserted_results.insert(db->detailed_names[i]).second) - continue; - - InsertSymbolIntoResult(db, working_files, db->symbols[i], - &out.result); + if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) { + result_indices.push_back(i); if (out.result.size() >= config->maxWorkspaceSearchResults) break; } } } + if (out.result.size() < config->maxWorkspaceSearchResults) { + std::string query_without_space; + query_without_space.reserve(query.size()); + for (char c: query) + if (!isspace(c)) + query_without_space += c; + + for (int i = 0; i < db->detailed_names.size(); ++i) { + if (SubstringMatch(query_without_space, db->detailed_names[i])) { + // Do not show the same entry twice. + if (!inserted_results.insert(db->detailed_names[i]).second) + continue; + + if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) { + result_indices.push_back(i); + if (out.result.size() >= config->maxWorkspaceSearchResults) + break; + } + } + } + } + + if (out.result.size() < config->maxWorkspaceSearchResults) { + int longest = 0; + for (int i: result_indices) + longest = std::max(longest, int(db->detailed_names[i].size())); + + std::vector score(longest), // score for each position + dp(longest); // dp[i]: maximum value by aligning pattern[0..pi] to str[0..si] + std::vector> permutation(result_indices.size()); + for (int i = 0; i < int(result_indices.size()); i++) { + permutation[i] = { + fuzzyEvaluate(query, db->detailed_names[result_indices[i]], score, + dp), + i}; + } + std::sort(permutation.begin(), permutation.end(), + std::greater>()); + for (int i = 0; i < int(result_indices.size()); i++) + if (i != permutation[i].second) + std::swap(out.result[i], out.result[permutation[i].second]); + } + LOG_S(INFO) << "[querydb] Found " << out.result.size() << " results for query " << query; QueueManager::WriteStdout(IpcId::WorkspaceSymbol, out); } }; REGISTER_MESSAGE_HANDLER(WorkspaceSymbolHandler); -} // namespace \ No newline at end of file +} // namespace