mirror of
				https://github.com/MaskRay/ccls.git
				synced 2025-11-04 06:15:20 +00:00 
			
		
		
		
	[workspace/symbol] Sort candidates with a fuzzy matching algorithm (#182)
This commit is contained in:
		
							parent
							
								
									dce1365eb6
								
							
						
					
					
						commit
						24f428c670
					
				@ -24,7 +24,7 @@ struct Config {
 | 
				
			|||||||
  bool logSkippedPathsForIndex = false;
 | 
					  bool logSkippedPathsForIndex = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Maximum workspace search results.
 | 
					  // Maximum workspace search results.
 | 
				
			||||||
  int maxWorkspaceSearchResults = 1000;
 | 
					  int maxWorkspaceSearchResults = 200;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Force a certain number of indexer threads. If less than 1 a default value
 | 
					  // Force a certain number of indexer threads. If less than 1 a default value
 | 
				
			||||||
  // should be used.
 | 
					  // should be used.
 | 
				
			||||||
 | 
				
			|||||||
@ -1,3 +1,8 @@
 | 
				
			|||||||
 | 
					#include <ctype.h>
 | 
				
			||||||
 | 
					#include <limits.h>
 | 
				
			||||||
 | 
					#include <algorithm>
 | 
				
			||||||
 | 
					#include <functional>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "lex_utils.h"
 | 
					#include "lex_utils.h"
 | 
				
			||||||
#include "message_handler.h"
 | 
					#include "message_handler.h"
 | 
				
			||||||
#include "query_utils.h"
 | 
					#include "query_utils.h"
 | 
				
			||||||
@ -7,29 +12,30 @@
 | 
				
			|||||||
namespace {
 | 
					namespace {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Lookup |symbol| in |db| and insert the value into |result|.
 | 
					// Lookup |symbol| in |db| and insert the value into |result|.
 | 
				
			||||||
void InsertSymbolIntoResult(QueryDatabase* db,
 | 
					bool InsertSymbolIntoResult(QueryDatabase* db,
 | 
				
			||||||
                            WorkingFiles* working_files,
 | 
					                            WorkingFiles* working_files,
 | 
				
			||||||
                            SymbolIdx symbol,
 | 
					                            SymbolIdx symbol,
 | 
				
			||||||
                            std::vector<lsSymbolInformation>* result) {
 | 
					                            std::vector<lsSymbolInformation>* result) {
 | 
				
			||||||
  optional<lsSymbolInformation> info =
 | 
					  optional<lsSymbolInformation> info =
 | 
				
			||||||
      GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
 | 
					      GetSymbolInfo(db, working_files, symbol, false /*use_short_name*/);
 | 
				
			||||||
  if (!info)
 | 
					  if (!info)
 | 
				
			||||||
    return;
 | 
					    return false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
 | 
					  optional<QueryLocation> location = GetDefinitionExtentOfSymbol(db, symbol);
 | 
				
			||||||
  if (!location) {
 | 
					  if (!location) {
 | 
				
			||||||
    auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
 | 
					    auto decls = GetDeclarationsOfSymbolForGotoDefinition(db, symbol);
 | 
				
			||||||
    if (decls.empty())
 | 
					    if (decls.empty())
 | 
				
			||||||
      return;
 | 
					      return false;
 | 
				
			||||||
    location = decls[0];
 | 
					    location = decls[0];
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  optional<lsLocation> ls_location =
 | 
					  optional<lsLocation> ls_location =
 | 
				
			||||||
      GetLsLocation(db, working_files, *location);
 | 
					      GetLsLocation(db, working_files, *location);
 | 
				
			||||||
  if (!ls_location)
 | 
					  if (!ls_location)
 | 
				
			||||||
    return;
 | 
					    return false;
 | 
				
			||||||
  info->location = *ls_location;
 | 
					  info->location = *ls_location;
 | 
				
			||||||
  result->push_back(*info);
 | 
					  result->push_back(*info);
 | 
				
			||||||
 | 
					  return true;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct lsWorkspaceSymbolParams {
 | 
					struct lsWorkspaceSymbolParams {
 | 
				
			||||||
@ -51,12 +57,119 @@ struct Out_WorkspaceSymbol : public lsOutMessage<Out_WorkspaceSymbol> {
 | 
				
			|||||||
};
 | 
					};
 | 
				
			||||||
MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
 | 
					MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					///// Fuzzy matching
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Negative but far from INT_MIN so that intermediate results are hard to
 | 
				
			||||||
 | 
					// overflow
 | 
				
			||||||
 | 
					constexpr int kMinScore = INT_MIN / 2;
 | 
				
			||||||
 | 
					// Penalty of dropping a leading character in str
 | 
				
			||||||
 | 
					constexpr int kLeadingGapScore = -4;
 | 
				
			||||||
 | 
					// Penalty of dropping a non-leading character in str
 | 
				
			||||||
 | 
					constexpr int kGapScore = -5;
 | 
				
			||||||
 | 
					// Bonus of aligning with an initial character of a word in pattern. Must be
 | 
				
			||||||
 | 
					// greater than 1
 | 
				
			||||||
 | 
					constexpr int kPatternStartMultiplier = 2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					constexpr int kWordStartScore = 100;
 | 
				
			||||||
 | 
					constexpr int kNonWordScore = 90;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Less than kWordStartScore
 | 
				
			||||||
 | 
					constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
 | 
				
			||||||
 | 
					// Slightly less than kConsecutiveScore
 | 
				
			||||||
 | 
					constexpr int kCamelScore = kWordStartScore + kGapScore - 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					enum class CharClass { Lower, Upper, Digit, NonWord };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static enum CharClass getCharClass(int c) {
 | 
				
			||||||
 | 
					  if (islower(c)) return CharClass::Lower;
 | 
				
			||||||
 | 
					  if (isupper(c)) return CharClass::Upper;
 | 
				
			||||||
 | 
					  if (isdigit(c)) return CharClass::Digit;
 | 
				
			||||||
 | 
					  return CharClass::NonWord;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int getScoreFor(CharClass prev, CharClass curr) {
 | 
				
			||||||
 | 
					  if (prev == CharClass::NonWord && curr != CharClass::NonWord)
 | 
				
			||||||
 | 
					    return kWordStartScore;
 | 
				
			||||||
 | 
					  if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
 | 
				
			||||||
 | 
					      (prev != CharClass::Digit && curr == CharClass::Digit))
 | 
				
			||||||
 | 
					    return kCamelScore;
 | 
				
			||||||
 | 
					  if (curr == CharClass::NonWord)
 | 
				
			||||||
 | 
					    return kNonWordScore;
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					fuzzyEvaluate implements a global sequence alignment algorithm to find the maximum accumulated score by aligning `pattern` to `str`. It applies when `pattern` is a subsequence of `str`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Scoring criteria
 | 
				
			||||||
 | 
					- Prefer matches at the start of a word, or the start of subwords in CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
 | 
				
			||||||
 | 
					- Non-word characters matter. See kNonWordScore
 | 
				
			||||||
 | 
					- The first characters of words of `pattern` receive bonus because they usually have more significance than the rest. See kPatternStartMultiplier
 | 
				
			||||||
 | 
					- Superfluous characters in `str` will reduce the score (gap penalty). See kGapScore
 | 
				
			||||||
 | 
					- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The recurrence of the dynamic programming:
 | 
				
			||||||
 | 
					dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
 | 
				
			||||||
 | 
					dp[0][j] = leading_gap_penalty(0, j) + score[j]
 | 
				
			||||||
 | 
					dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] + gap_penalty(k+1, j) + score[j] : k < j))
 | 
				
			||||||
 | 
					The first dimension can be suppressed since we do not need a matching scheme, which reduces the space complexity from O(N*M) to O(M)
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
 | 
					int fuzzyEvaluate(const std::string& pattern,
 | 
				
			||||||
 | 
					                  const std::string& str,
 | 
				
			||||||
 | 
					                  std::vector<int>& score,
 | 
				
			||||||
 | 
					                  std::vector<int>& dp) {
 | 
				
			||||||
 | 
					  bool pfirst = true,  // aligning the first character of pattern
 | 
				
			||||||
 | 
					      pstart = true;   // whether we are aligning the start of a word in pattern
 | 
				
			||||||
 | 
					  int uleft = 0,       // value of the upper left cell
 | 
				
			||||||
 | 
					      ulefts = 0,      // maximum value of uleft and cells on the left
 | 
				
			||||||
 | 
					      left, lefts;     // similar to uleft/ulefts, but for the next row
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Calculate position score for each character in str.
 | 
				
			||||||
 | 
					  CharClass prev = CharClass::NonWord;
 | 
				
			||||||
 | 
					  for (int i = 0; i < int(str.size()); i++) {
 | 
				
			||||||
 | 
					    CharClass cur = getCharClass(str[i]);
 | 
				
			||||||
 | 
					    score[i] = getScoreFor(prev, cur);
 | 
				
			||||||
 | 
					    prev = cur;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  std::fill_n(dp.begin(), str.size(), kMinScore);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Align each character of pattern.
 | 
				
			||||||
 | 
					  for (unsigned char pc: pattern) {
 | 
				
			||||||
 | 
					    if (isspace(pc)) {
 | 
				
			||||||
 | 
					      pstart = true;
 | 
				
			||||||
 | 
					      continue;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    lefts = kMinScore;
 | 
				
			||||||
 | 
					    // Enumerate the character in str to be aligned with pc.
 | 
				
			||||||
 | 
					    for (int i = 0; i < int(str.size()); i++) {
 | 
				
			||||||
 | 
					      left = dp[i];
 | 
				
			||||||
 | 
					      lefts = std::max(lefts + kGapScore, left);
 | 
				
			||||||
 | 
					      if (tolower(pc) == tolower(str[i])) {
 | 
				
			||||||
 | 
					        int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
 | 
				
			||||||
 | 
					        dp[i] = pfirst ? kLeadingGapScore * i + t
 | 
				
			||||||
 | 
					                       : std::max(uleft + kConsecutiveScore, ulefts + t);
 | 
				
			||||||
 | 
					      } else
 | 
				
			||||||
 | 
					        dp[i] = kMinScore;
 | 
				
			||||||
 | 
					      uleft = left;
 | 
				
			||||||
 | 
					      ulefts = lefts;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    pfirst = pstart = false;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Enumerate the end position of the match in str.
 | 
				
			||||||
 | 
					  lefts = kMinScore;
 | 
				
			||||||
 | 
					  for (int i = 0; i < int(str.size()); i++)
 | 
				
			||||||
 | 
					    // For function types, db->detailed_names may have trailing characters for
 | 
				
			||||||
 | 
					    // parameters. We do not want to penalize them.
 | 
				
			||||||
 | 
					    // If we use `short_name` instead of `detailed_name` for fuzzy matching, the
 | 
				
			||||||
 | 
					    // penulty kGapScore can be used.
 | 
				
			||||||
 | 
					    lefts = std::max(lefts /*+ kGapScore */, dp[i]);
 | 
				
			||||||
 | 
					  return lefts;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
					struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
				
			||||||
  void Run(Ipc_WorkspaceSymbol* request) override {
 | 
					  void Run(Ipc_WorkspaceSymbol* request) override {
 | 
				
			||||||
    // TODO: implement fuzzy search, see
 | 
					 | 
				
			||||||
    // https://github.com/junegunn/fzf/blob/master/src/matcher.go for
 | 
					 | 
				
			||||||
    // inspiration
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Out_WorkspaceSymbol out;
 | 
					    Out_WorkspaceSymbol out;
 | 
				
			||||||
    out.id = request->id;
 | 
					    out.id = request->id;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -66,7 +179,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
				
			|||||||
    std::string query = request->params.query;
 | 
					    std::string query = request->params.query;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::unordered_set<std::string> inserted_results;
 | 
					    std::unordered_set<std::string> inserted_results;
 | 
				
			||||||
 | 
					    // db->detailed_names indices of each lsSymbolInformation in out.result
 | 
				
			||||||
 | 
					    std::vector<int> result_indices;
 | 
				
			||||||
    inserted_results.reserve(config->maxWorkspaceSearchResults);
 | 
					    inserted_results.reserve(config->maxWorkspaceSearchResults);
 | 
				
			||||||
 | 
					    result_indices.reserve(config->maxWorkspaceSearchResults);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
					    for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
				
			||||||
      if (db->detailed_names[i].find(query) != std::string::npos) {
 | 
					      if (db->detailed_names[i].find(query) != std::string::npos) {
 | 
				
			||||||
@ -74,26 +190,56 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
 | 
				
			|||||||
        if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
					        if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
				
			||||||
          continue;
 | 
					          continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result);
 | 
					        if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
 | 
				
			||||||
 | 
					          result_indices.push_back(i);
 | 
				
			||||||
          if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
					          if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
				
			||||||
            break;
 | 
					            break;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (out.result.size() < config->maxWorkspaceSearchResults) {
 | 
					    if (out.result.size() < config->maxWorkspaceSearchResults) {
 | 
				
			||||||
 | 
					      std::string query_without_space;
 | 
				
			||||||
 | 
					      query_without_space.reserve(query.size());
 | 
				
			||||||
 | 
					      for (char c: query)
 | 
				
			||||||
 | 
					        if (!isspace(c))
 | 
				
			||||||
 | 
					          query_without_space += c;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
					      for (int i = 0; i < db->detailed_names.size(); ++i) {
 | 
				
			||||||
        if (SubstringMatch(query, db->detailed_names[i])) {
 | 
					        if (SubstringMatch(query_without_space, db->detailed_names[i])) {
 | 
				
			||||||
          // Do not show the same entry twice.
 | 
					          // Do not show the same entry twice.
 | 
				
			||||||
          if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
					          if (!inserted_results.insert(db->detailed_names[i]).second)
 | 
				
			||||||
            continue;
 | 
					            continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
          InsertSymbolIntoResult(db, working_files, db->symbols[i],
 | 
					          if (InsertSymbolIntoResult(db, working_files, db->symbols[i], &out.result)) {
 | 
				
			||||||
                                 &out.result);
 | 
					            result_indices.push_back(i);
 | 
				
			||||||
            if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
					            if (out.result.size() >= config->maxWorkspaceSearchResults)
 | 
				
			||||||
              break;
 | 
					              break;
 | 
				
			||||||
          }
 | 
					          }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (out.result.size() < config->maxWorkspaceSearchResults) {
 | 
				
			||||||
 | 
					      int longest = 0;
 | 
				
			||||||
 | 
					      for (int i: result_indices)
 | 
				
			||||||
 | 
					        longest = std::max(longest, int(db->detailed_names[i].size()));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      std::vector<int> score(longest), // score for each position
 | 
				
			||||||
 | 
					          dp(longest); // dp[i]: maximum value by aligning pattern[0..pi] to str[0..si]
 | 
				
			||||||
 | 
					      std::vector<std::pair<int, int>> permutation(result_indices.size());
 | 
				
			||||||
 | 
					      for (int i = 0; i < int(result_indices.size()); i++) {
 | 
				
			||||||
 | 
					        permutation[i] = {
 | 
				
			||||||
 | 
					            fuzzyEvaluate(query, db->detailed_names[result_indices[i]], score,
 | 
				
			||||||
 | 
					                          dp),
 | 
				
			||||||
 | 
					            i};
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      std::sort(permutation.begin(), permutation.end(),
 | 
				
			||||||
 | 
					                std::greater<std::pair<int, int>>());
 | 
				
			||||||
 | 
					      for (int i = 0; i < int(result_indices.size()); i++)
 | 
				
			||||||
 | 
					        if (i != permutation[i].second)
 | 
				
			||||||
 | 
					          std::swap(out.result[i], out.result[permutation[i].second]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    LOG_S(INFO) << "[querydb] Found " << out.result.size()
 | 
					    LOG_S(INFO) << "[querydb] Found " << out.result.size()
 | 
				
			||||||
                << " results for query " << query;
 | 
					                << " results for query " << query;
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user