mirror of
https://github.com/MaskRay/ccls.git
synced 2024-11-29 11:01:57 +00:00
Variant of clangd fuzzy matcher
This commit is contained in:
parent
bcdb8690f0
commit
b2b5e57761
@ -1,120 +1,116 @@
|
|||||||
#include "fuzzy_match.h"
|
#include "fuzzy_match.h"
|
||||||
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <limits.h>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
// Penalty of dropping a leading character in str
|
enum FuzzyMatcher::CharClass : int { Other, Lower, Upper };
|
||||||
constexpr int kLeadingGapScore = -4;
|
enum FuzzyMatcher::CharRole : int { None, Tail, Head };
|
||||||
// Penalty of dropping a non-leading character in str
|
|
||||||
constexpr int kGapScore = -5;
|
|
||||||
// Bonus of aligning with an initial character of a word in pattern. Must be
|
|
||||||
// greater than 1
|
|
||||||
constexpr int kPatternStartMultiplier = 2;
|
|
||||||
|
|
||||||
constexpr int kWordStartScore = 50;
|
namespace {
|
||||||
constexpr int kNonWordScore = 40;
|
FuzzyMatcher::CharClass GetCharClass(int c) {
|
||||||
constexpr int kCaseMatchScore = 2;
|
|
||||||
|
|
||||||
// Less than kWordStartScore
|
|
||||||
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
|
|
||||||
// Slightly less than kConsecutiveScore
|
|
||||||
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;
|
|
||||||
|
|
||||||
enum class CharClass { Lower, Upper, Digit, NonWord };
|
|
||||||
|
|
||||||
static CharClass GetCharClass(int c) {
|
|
||||||
if (islower(c))
|
if (islower(c))
|
||||||
return CharClass::Lower;
|
return Lower;
|
||||||
if (isupper(c))
|
if (isupper(c))
|
||||||
return CharClass::Upper;
|
return Upper;
|
||||||
if (isdigit(c))
|
return Other;
|
||||||
return CharClass::Digit;
|
|
||||||
return CharClass::NonWord;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int GetScoreFor(CharClass prev, CharClass curr) {
|
void CalculateRoles(std::string_view s,
|
||||||
if (prev == CharClass::NonWord && curr != CharClass::NonWord)
|
FuzzyMatcher::CharRole roles[],
|
||||||
return kWordStartScore;
|
int* class_set) {
|
||||||
if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
|
if (s.empty()) {
|
||||||
(prev != CharClass::Digit && curr == CharClass::Digit))
|
*class_set = 0;
|
||||||
return kCamelScore;
|
return;
|
||||||
if (curr == CharClass::NonWord)
|
}
|
||||||
return kNonWordScore;
|
FuzzyMatcher::CharClass pre = Other, cur = GetCharClass(s[0]), suc;
|
||||||
return 0;
|
*class_set = 1 << cur;
|
||||||
|
auto fn = [&]() {
|
||||||
|
if (cur == Other)
|
||||||
|
return None;
|
||||||
|
// U(U)L is Head while U(U)U is Tail
|
||||||
|
return pre == Other || (cur == Upper && (pre == Lower || suc != Upper))
|
||||||
|
? Head
|
||||||
|
: Tail;
|
||||||
|
};
|
||||||
|
for (size_t i = 0; i < s.size() - 1; i++) {
|
||||||
|
suc = GetCharClass(s[i + 1]);
|
||||||
|
*class_set |= 1 << suc;
|
||||||
|
roles[i] = fn();
|
||||||
|
pre = cur;
|
||||||
|
cur = suc;
|
||||||
|
}
|
||||||
|
roles[s.size() - 1] = fn();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
int FuzzyMatcher::MissScore(int j, bool last) {
|
||||||
fuzzyEvaluate implements a global sequence alignment algorithm to find the
|
int s = last ? -20 : 0;
|
||||||
maximum accumulated score by aligning `pattern` to `str`. It applies when
|
if (text_role[j] == Head)
|
||||||
`pattern` is a subsequence of `str`.
|
s -= 10;
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
Scoring criteria
|
int FuzzyMatcher::MatchScore(int i, int j, bool last) {
|
||||||
- Prefer matches at the start of a word, or the start of subwords in
|
int s = 40;
|
||||||
CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
|
if ((pat[i] == text[j] && ((pat_set & 1 << Upper) || i == j)))
|
||||||
- Non-word characters matter. See kNonWordScore
|
s += 20;
|
||||||
- The first characters of words of `pattern` receive bonus because they usually
|
if (pat_role[i] == Head && text_role[j] == Head)
|
||||||
have more significance than the rest. See kPatternStartMultiplier
|
s += 50;
|
||||||
- Superfluous characters in `str` will reduce the score (gap penalty). See
|
if (text_role[j] == Tail && i && !last)
|
||||||
kGapScore
|
s -= 50;
|
||||||
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore
|
if (pat_role[i] == Head && text_role[j] == Tail)
|
||||||
|
s -= 30;
|
||||||
|
if (i == 0 && text_role[j] == Tail)
|
||||||
|
s -= 70;
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
The recurrence of the dynamic programming:
|
FuzzyMatcher::FuzzyMatcher(std::string_view pattern) {
|
||||||
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
|
CalculateRoles(pattern, pat_role, &pat_set);
|
||||||
dp[0][j] = leading_gap_penalty(0, j) + score[j]
|
size_t n = 0;
|
||||||
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
|
for (size_t i = 0; i < pattern.size(); i++)
|
||||||
gap_penalty(k+1, j) + score[j] : k < j))
|
if (pattern[i] != ' ') {
|
||||||
The first dimension can be suppressed since we do not need a matching scheme,
|
pat += pattern[i];
|
||||||
which reduces the space complexity from O(N*M) to O(M)
|
low_pat[n] = (char)::tolower(pattern[i]);
|
||||||
*/
|
pat_role[n] = pat_role[i];
|
||||||
int FuzzyEvaluate(std::string_view pattern,
|
n++;
|
||||||
std::string_view str,
|
|
||||||
std::vector<int>& score,
|
|
||||||
std::vector<int>& dp) {
|
|
||||||
bool pfirst = true, // aligning the first character of pattern
|
|
||||||
pstart = true; // whether we are aligning the start of a word in pattern
|
|
||||||
int uleft = 0, // value of the upper left cell
|
|
||||||
ulefts = 0, // maximum value of uleft and cells on the left
|
|
||||||
left, lefts; // similar to uleft/ulefts, but for the next row
|
|
||||||
|
|
||||||
// Calculate position score for each character in str.
|
|
||||||
CharClass prev = CharClass::NonWord;
|
|
||||||
for (int i = 0; i < int(str.size()); i++) {
|
|
||||||
CharClass cur = GetCharClass(str[i]);
|
|
||||||
score[i] = GetScoreFor(prev, cur);
|
|
||||||
prev = cur;
|
|
||||||
}
|
}
|
||||||
std::fill_n(dp.begin(), str.size(), kMinScore);
|
}
|
||||||
|
|
||||||
// Align each character of pattern.
|
int FuzzyMatcher::Match(std::string_view text) {
|
||||||
for (unsigned char pc : pattern) {
|
int n = int(text.size());
|
||||||
if (isspace(pc)) {
|
if (n > kMaxText)
|
||||||
pstart = true;
|
return kMinScore + 1;
|
||||||
continue;
|
this->text = text;
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
low_text[i] = (char)::tolower(text[i]);
|
||||||
|
CalculateRoles(text, text_role, &text_set);
|
||||||
|
dp[0][0][0] = 0;
|
||||||
|
dp[0][0][1] = kMinScore;
|
||||||
|
for (int j = 0; j < n; j++) {
|
||||||
|
dp[0][j + 1][0] = dp[0][j][0] + MissScore(j, false);
|
||||||
|
dp[0][j + 1][1] = kMinScore;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < int(pat.size()); i++) {
|
||||||
|
int(*pre)[2] = dp[i & 1];
|
||||||
|
int(*cur)[2] = dp[i + 1 & 1];
|
||||||
|
cur[0][0] = cur[0][1] = kMinScore;
|
||||||
|
for (int j = 0; j < n; j++) {
|
||||||
|
cur[j + 1][0] = std::max(cur[j][0] + MissScore(j, false),
|
||||||
|
cur[j][1] + MissScore(j, true));
|
||||||
|
if (low_pat[i] != low_text[j])
|
||||||
|
cur[j + 1][1] = kMinScore;
|
||||||
|
else {
|
||||||
|
cur[j + 1][1] = std::max(pre[j][0] + MatchScore(i, j, false),
|
||||||
|
pre[j][1] + MatchScore(i, j, true));
|
||||||
}
|
}
|
||||||
lefts = kMinScore;
|
|
||||||
// Enumerate the character in str to be aligned with pc.
|
|
||||||
for (int i = 0; i < int(str.size()); i++) {
|
|
||||||
left = dp[i];
|
|
||||||
lefts = std::max(lefts + kGapScore, left);
|
|
||||||
// Use lower() if case-insensitive
|
|
||||||
if (tolower(pc) == tolower(str[i])) {
|
|
||||||
int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
|
|
||||||
dp[i] = (pfirst ? kLeadingGapScore * i + t
|
|
||||||
: std::max(uleft + kConsecutiveScore, ulefts + t)) +
|
|
||||||
(pc == str[i] ? kCaseMatchScore : 0);
|
|
||||||
} else
|
|
||||||
dp[i] = kMinScore;
|
|
||||||
uleft = left;
|
|
||||||
ulefts = lefts;
|
|
||||||
}
|
}
|
||||||
pfirst = pstart = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enumerate the end position of the match in str. Each removed trailing
|
// Enumerate the end position of the match in str. Each removed trailing
|
||||||
// character has a penalty of kGapScore.
|
// character has a penalty.
|
||||||
lefts = kMinScore;
|
int ret = kMinScore;
|
||||||
for (int i = 0; i < int(str.size()); i++)
|
for (int j = 1; j <= n; j++)
|
||||||
lefts = std::max(lefts + kGapScore, dp[i]);
|
ret = std::max(ret, dp[pat.size() & 1][j][1] - 3 * (n - j));
|
||||||
return lefts;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -3,15 +3,29 @@
|
|||||||
#include <string_view.h>
|
#include <string_view.h>
|
||||||
|
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
// Negative but far from INT_MIN so that intermediate results are hard to
|
class FuzzyMatcher {
|
||||||
// overflow
|
public:
|
||||||
constexpr int kMinScore = INT_MIN / 2;
|
constexpr static int kMaxPat = 100;
|
||||||
|
constexpr static int kMaxText = 200;
|
||||||
|
// Negative but far from INT_MIN so that intermediate results are hard to
|
||||||
|
// overflow.
|
||||||
|
constexpr static int kMinScore = INT_MIN / 2;
|
||||||
|
|
||||||
// Evaluate the score matching |pattern| against |str|, the larger the better.
|
FuzzyMatcher(std::string_view pattern);
|
||||||
// |score| and |dp| must be at least as long as |str|.
|
int Match(std::string_view text);
|
||||||
int FuzzyEvaluate(std::string_view pattern,
|
|
||||||
std::string_view str,
|
enum CharClass : int;
|
||||||
std::vector<int>& score,
|
enum CharRole : int;
|
||||||
std::vector<int>& dp);
|
|
||||||
|
private:
|
||||||
|
std::string pat;
|
||||||
|
std::string_view text;
|
||||||
|
int pat_set, text_set;
|
||||||
|
char low_pat[kMaxPat], low_text[kMaxText];
|
||||||
|
CharRole pat_role[kMaxPat], text_role[kMaxText];
|
||||||
|
int dp[2][kMaxText + 1][2];
|
||||||
|
|
||||||
|
int MatchScore(int i, int j, bool last);
|
||||||
|
int MissScore(int j, bool last);
|
||||||
|
};
|
||||||
|
@ -122,21 +122,16 @@ struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (config->workspaceSymbol.sort) {
|
if (config->workspaceSymbol.sort && query.size() <= FuzzyMatcher::kMaxPat) {
|
||||||
// Sort results with a fuzzy matching algorithm.
|
// Sort results with a fuzzy matching algorithm.
|
||||||
int longest = 0;
|
int longest = 0;
|
||||||
for (int i : result_indices)
|
for (int i : result_indices)
|
||||||
longest = std::max(longest, int(db->GetSymbolDetailedName(i).size()));
|
longest = std::max(longest, int(db->GetSymbolDetailedName(i).size()));
|
||||||
|
FuzzyMatcher fuzzy(query);
|
||||||
std::vector<int> score(longest); // score for each position
|
|
||||||
std::vector<int> dp(
|
|
||||||
longest); // dp[i]: maximum value by aligning pattern to str[0..i]
|
|
||||||
std::vector<std::pair<int, int>> permutation(result_indices.size());
|
std::vector<std::pair<int, int>> permutation(result_indices.size());
|
||||||
for (int i = 0; i < int(result_indices.size()); i++) {
|
for (int i = 0; i < int(result_indices.size()); i++) {
|
||||||
permutation[i] = {
|
permutation[i] = {
|
||||||
FuzzyEvaluate(query, db->GetSymbolDetailedName(result_indices[i]),
|
fuzzy.Match(db->GetSymbolDetailedName(result_indices[i])), i};
|
||||||
score, dp),
|
|
||||||
i};
|
|
||||||
}
|
}
|
||||||
std::sort(permutation.begin(), permutation.end(),
|
std::sort(permutation.begin(), permutation.end(),
|
||||||
std::greater<std::pair<int, int>>());
|
std::greater<std::pair<int, int>>());
|
||||||
|
Loading…
Reference in New Issue
Block a user