mirror of
				https://github.com/MaskRay/ccls.git
				synced 2025-11-03 22:04:24 +00:00 
			
		
		
		
	[workspace/symbol] Sort candidates with a fuzzy matching algorithm (#182)
This commit is contained in:
		
							parent
							
								
									dce1365eb6
								
							
						
					
					
						commit
						24f428c670
					
				@ -24,7 +24,7 @@ struct Config {
 | 
			
		||||
  bool logSkippedPathsForIndex = false;
 | 
			
		||||
 | 
			
		||||
  // Maximum workspace search results.
 | 
			
		||||
  int maxWorkspaceSearchResults = 1000;
 | 
			
		||||
  int maxWorkspaceSearchResults = 200;
 | 
			
		||||
 | 
			
		||||
  // Force a certain number of indexer threads. If less than 1 a default value
 | 
			
		||||
  // should be used.
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,8 @@
 | 
			
		||||
#include <ctype.h>
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
#include <algorithm>
 | 
			
		||||
#include <functional>
 | 
			
		||||
 | 
			
		||||
#include "lex_utils.h"
 | 
			
		||||
#include "message_handler.h"
 | 
			
		||||
#include "query_utils.h"
 | 
			
		||||
@ -7,29 +12,30 @@
 | 
			
		||||
namespace {
 | 
			
		||||
 | 
			
		||||
// Lookup |symbol| in |db| and insert the value into |result|.
 | 
			
		||||
void InsertSymbolIntoResult(QueryDatabase* db,
 | 
			
		||||
bool InsertSymbolIntoResult(QueryDatabase* db,
 | 
			
		||||
                            WorkingFiles* working_files,
 | 
			
		||||
                            SymbolIdx symbol,
 | 
			
		||||
                            std::vector<lsSymbolInformation>* result) {
 | 
			
		||||
  optional<lsSymbolInformation> info =
 | 
			
		||||
      GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
 | 
			
		||||
  if (!info)
 | 
			
		||||
    return;
 | 
			
		||||
    return false;
 | 
			
		||||
 | 
			
		||||
  optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
 | 
			
		||||
  if (!location) {
 | 
			
		||||
    auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
 | 
			
		||||
    if (decls.empty())
 | 
			
		||||
      return;
 | 
			
		||||
      return false;
 | 
			
		||||
    location = decls[0];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  optional<lsLocation> ls_location =
 | 
			
		||||
      GetLsLocation(db, working_files, *location);
 | 
			
		||||
  if (!ls_location)
 | 
			
		||||
    return;
 | 
			
		||||
    return false;
 | 
			
		||||
  info->location = *ls_location;
 | 
			
		||||
  result->push_back(*info);
 | 
			
		||||
  return true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct lsWorkspaceSymbolParams {
 | 
			
		||||
@ -51,12 +57,119 @@ struct Out_WorkspaceSymbol : public lsOutMessage<Out_WorkspaceSymbol> {
 | 
			
		||||
};
 | 
			
		||||
MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///// Fuzzy matching
 | 
			
		||||
 | 
			
		||||
// Negative but far from INT_MIN so that intermediate results are hard to
 | 
			
		||||
// overflow
 | 
			
		||||
constexpr int kMinScore = INT_MIN / 2;
 | 
			
		||||
// Penalty of dropping a leading character in str
 | 
			
		||||
constexpr int kLeadingGapScore = -4;
 | 
			
		||||
// Penalty of dropping a non-leading character in str
 | 
			
		||||
constexpr int kGapScore = -5;
 | 
			
		||||
// Bonus of aligning with an initial character of a word in pattern. Must be
 | 
			
		||||
// greater than 1
 | 
			
		||||
constexpr int kPatternStartMultiplier = 2;
 | 
			
		||||
 | 
			
		||||
constexpr int kWordStartScore = 100;
 | 
			
		||||
constexpr int kNonWordScore = 90;
 | 
			
		||||
 | 
			
		||||
// Less than kWordStartScore
 | 
			
		||||
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
 | 
			
		||||
// Slightly less than kConsecutiveScore
 | 
			
		||||
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;
 | 
			
		||||
 | 
			
		||||
enum class CharClass { Lower, Upper, Digit, NonWord };
 | 
			
		||||
 | 
			
		||||
static enum CharClass getCharClass(int c) {
 | 
			
		||||
  if (islower(c)) return CharClass::Lower;
 | 
			
		||||
  if (isupper(c)) return CharClass::Upper;
 | 
			
		||||
  if (isdigit(c)) return CharClass::Digit;
 | 
			
		||||
  return CharClass::NonWord;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static int getScoreFor(CharClass prev, CharClass curr) {
 | 
			
		||||
  if (prev == CharClass::NonWord && curr != CharClass::NonWord)
 | 
			
		||||
    return kWordStartScore;
 | 
			
		||||
  if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
 | 
			
		||||
      (prev != CharClass::Digit && curr == CharClass::Digit))
 | 
			
		||||
    return kCamelScore;
 | 
			
		||||
  if (curr == CharClass::NonWord)
 | 
			
		||||
    return kNonWordScore;
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
fuzzyEvaluate implements a global sequence alignment algorithm to find the maximum accumulated score by aligning `pattern` to `str`. It applies when `pattern` is a subsequence of `str`.
 | 
			
		||||
 | 
			
		||||
Scoring criteria
 | 
			
		||||
- Prefer matches at the start of a word, or the start of subwords in CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
 | 
			
		||||
- Non-word characters matter. See kNonWordScore
 | 
			
		||||
- The first characters of words of `pattern` receive bonus because they usually have more significance than the rest. See kPatternStartMultiplier
 | 
			
		||||
- Superfluous characters in `str` will reduce the score (gap penalty). See kGapScore
 | 
			
		||||
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
 | 
			
		||||
 | 
			
		||||
The recurrence of the dynamic programming:
 | 
			
		||||
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
 | 
			
		||||
dp[0][j] = leading_gap_penalty(0, j) + score[j]
 | 
			
		||||
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] + gap_penalty(k+1, j) + score[j] : k < j))
 | 
			
		||||
The first dimension can be suppressed since we do not need a matching scheme, which reduces the space complexity from O(N*M) to O(M)
 | 
			
		||||
*/
 | 
			
		||||
int fuzzyEvaluate(const std::string& pattern,
 | 
			
		||||
                  const std::string& str,
 | 
			
		||||
                  std::vector<int>& score,
 | 
			
		||||
                  std::vector<int>& dp) {
 | 
			
		||||
  bool pfirst = true,  // aligning the first character of pattern
 | 
			
		||||
      pstart = true;   // whether we are aligning the start of a word in pattern
 | 
			
		||||
  int uleft = 0,       // value of the upper left cell
 | 
			
		||||
      ulefts = 0,      // maximum value of uleft and cells on the left
 | 
			
		||||
      left, lefts;     // similar to uleft/ulefts, but for the next row
 | 
			
		||||
 | 
			
		||||
  // Calculate position score for each character in str.
 | 
			
		||||
  CharClass prev = CharClass::NonWord;
 | 
			
		||||
  for (int i = 0; i < int(str.size()); i++) {
 | 
			
		||||
    CharClass cur = getCharClass(str[i]);
 | 
			
		||||
    score[i] = getScoreFor(prev, cur);
 | 
			
		||||
    prev = cur;
 | 
			
		||||
  }
 | 
			
		||||
  std::fill_n(dp.begin(), str.size(), kMinScore);
 | 
			
		||||
 | 
			
		||||
  // Align each character of pattern.
 | 
			
		||||
  for (unsigned char pc: pattern) {
 | 
			
		||||
    if (isspace(pc)) {
 | 
			
		||||
      pstart = true;
 | 
			
		||||
      continue;
 | 
			
		||||
    }
 | 
			
		||||
    lefts = kMinScore;
 | 
			
		||||
    // Enumerate the character in str to be aligned with pc.
 | 
			
		||||
    for (int i = 0; i < int(str.size()); i++) {
 | 
			
		||||
      left = dp[i];
 | 
			
		||||
      lefts = std::max(lefts + kGapScore, left);
 | 
			
		||||
      if (tolower(pc) == tolower(str[i])) {
 | 
			
		||||
        int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
 | 
			
		||||
        dp[i] = pfirst ? kLeadingGapScore * i + t
 | 
			
		||||
                       : std::max(uleft + kConsecutiveScore, ulefts + t);
 | 
			
		||||
      } else
 | 
			
		||||
        dp[i] = kMinScore;
 | 
			
		||||
      uleft = left;
 | 
			
		||||
      ulefts = lefts;
 | 
			
		||||
    }
 | 
			
		||||
    pfirst = pstart = false;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Enumerate the end position of the match in str.
 | 
			
		||||
  lefts = kMinScore;
 | 
			
		||||
  for (int i = 0; i < int(str.size()); i++)
 | 
			
		||||
    // For function types, db->detailed_names may have trailing characters for
 | 
			
		||||
    // parameters. We do not want to penalize them.
 | 
			
		||||
    // If we use `short_name` instead of `detailed_name` for fuzzy matching, the
 | 
			
		||||
    // penulty kGapScore can be used.
 | 
			
		||||
    lefts = std::max(lefts /*+ kGapScore */, dp[i]);
 | 
			
		||||
  return lefts;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
			
		||||
  void Run(Ipc_WorkspaceSymbol* request) override {
 | 
			
		||||
    // TODO: implement fuzzy search, see
 | 
			
		||||
    // https://github.com/junegunn/fzf/blob/master/src/matcher.go for
 | 
			
		||||
    // inspiration
 | 
			
		||||
 | 
			
		||||
    Out_WorkspaceSymbol out;
 | 
			
		||||
    out.id = request->id;
 | 
			
		||||
 | 
			
		||||
@ -66,7 +179,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
			
		||||
    std::string query = request->params.query;
 | 
			
		||||
 | 
			
		||||
    std::unordered_set<std::string> inserted_results;
 | 
			
		||||
    // db->detailed_names indices of each lsSymbolInformation in out.result
 | 
			
		||||
    std::vector<int> result_indices;
 | 
			
		||||
    inserted_results.reserve(config->maxWorkspaceSearchResults);
 | 
			
		||||
    result_indices.reserve(config->maxWorkspaceSearchResults);
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
			
		||||
      if (db->detailed_names[i].find(query) != std::string::npos) {
 | 
			
		||||
@ -74,27 +190,57 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
			
		||||
        if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
			
		||||
          continue;
 | 
			
		||||
 | 
			
		||||
        InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result);
 | 
			
		||||
        if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
			
		||||
          break;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (out.result.size() < config->maxWorkspaceSearchResults) {
 | 
			
		||||
      for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
			
		||||
        if (SubstringMatch(query, db->detailed_names[i])) {
 | 
			
		||||
          // Do not show the same entry twice.
 | 
			
		||||
          if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
			
		||||
            continue;
 | 
			
		||||
 | 
			
		||||
          InsertSymbolIntoResult(db, working_files, db->symbols[i],
 | 
			
		||||
                                 &out.result);
 | 
			
		||||
        if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
 | 
			
		||||
          result_indices.push_back(i);
 | 
			
		||||
          if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
			
		||||
            break;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (out.result.size() < config->maxWorkspaceSearchResults) {
 | 
			
		||||
      std::string query_without_space;
 | 
			
		||||
      query_without_space.reserve(query.size());
 | 
			
		||||
      for (char c: query)
 | 
			
		||||
        if (!isspace(c))
 | 
			
		||||
          query_without_space += c;
 | 
			
		||||
 | 
			
		||||
      for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
			
		||||
        if (SubstringMatch(query_without_space, db->detailed_names[i])) {
 | 
			
		||||
          // Do not show the same entry twice.
 | 
			
		||||
          if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
			
		||||
            continue;
 | 
			
		||||
 | 
			
		||||
          if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
 | 
			
		||||
            result_indices.push_back(i);
 | 
			
		||||
            if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
			
		||||
              break;
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (out.result.size() < config->maxWorkspaceSearchResults) {
 | 
			
		||||
      int longest = 0;
 | 
			
		||||
      for (int i: result_indices)
 | 
			
		||||
        longest = std::max(longest, int(db->detailed_names[i].size()));
 | 
			
		||||
 | 
			
		||||
      std::vector<int> score(longest), // score for each position
 | 
			
		||||
          dp(longest); // dp[i]: maximum value by aligning pattern[0..pi] to str[0..si]
 | 
			
		||||
      std::vector<std::pair<int, int>> permutation(result_indices.size());
 | 
			
		||||
      for (int i = 0; i < int(result_indices.size()); i++) {
 | 
			
		||||
        permutation[i] = {
 | 
			
		||||
            fuzzyEvaluate(query, db->detailed_names[result_indices[i]], score,
 | 
			
		||||
                          dp),
 | 
			
		||||
            i};
 | 
			
		||||
      }
 | 
			
		||||
      std::sort(permutation.begin(), permutation.end(),
 | 
			
		||||
                std::greater<std::pair<int, int>>());
 | 
			
		||||
      for (int i = 0; i < int(result_indices.size()); i++)
 | 
			
		||||
        if (i != permutation[i].second)
 | 
			
		||||
          std::swap(out.result[i], out.result[permutation[i].second]);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LOG_S(INFO) << "[querydb] Found " << out.result.size()
 | 
			
		||||
                << " results for query " << query;
 | 
			
		||||
    QueueManager::WriteStdout(IpcId::WorkspaceSymbol, out);
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user