2018-02-12 09:01:02 +00:00
|
|
|
#include "fuzzy_match.h"
|
|
|
|
|
2018-08-09 17:08:14 +00:00
|
|
|
#include <algorithm>
|
2018-02-12 09:01:02 +00:00
|
|
|
#include <ctype.h>
|
2018-03-18 19:17:40 +00:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <vector>
|
2018-02-12 09:01:02 +00:00
|
|
|
|
2018-03-16 15:28:37 +00:00
|
|
|
// Coarse class of a single character: a lowercase letter, an uppercase letter,
// or anything else. The enumerator values are also used as bit positions
// (1 << value) in the class-set masks produced by CalculateRoles.
enum CharClass { Other, Lower, Upper };
|
|
|
|
// Role of a character inside a word, as assigned by CalculateRoles: None for
// characters that are neither lower- nor uppercase, Head for a letter that
// starts a word segment, Tail for the remaining letters. Stored as plain int
// in the role arrays.
enum CharRole { None, Tail, Head };
|
2018-02-12 09:01:02 +00:00
|
|
|
|
2018-03-16 07:19:49 +00:00
|
|
|
namespace {
|
2018-03-16 15:28:37 +00:00
|
|
|
CharClass GetCharClass(int c) {
|
2018-02-12 09:01:02 +00:00
|
|
|
if (islower(c))
|
2018-03-16 07:19:49 +00:00
|
|
|
return Lower;
|
2018-02-12 09:01:02 +00:00
|
|
|
if (isupper(c))
|
2018-03-16 07:19:49 +00:00
|
|
|
return Upper;
|
|
|
|
return Other;
|
2018-02-12 09:01:02 +00:00
|
|
|
}
|
|
|
|
|
2018-08-09 17:08:14 +00:00
|
|
|
void CalculateRoles(std::string_view s, int roles[], int *class_set) {
|
2018-03-16 07:19:49 +00:00
|
|
|
if (s.empty()) {
|
|
|
|
*class_set = 0;
|
|
|
|
return;
|
|
|
|
}
|
2018-03-16 15:28:37 +00:00
|
|
|
CharClass pre = Other, cur = GetCharClass(s[0]), suc;
|
2018-03-16 07:19:49 +00:00
|
|
|
*class_set = 1 << cur;
|
|
|
|
auto fn = [&]() {
|
|
|
|
if (cur == Other)
|
|
|
|
return None;
|
|
|
|
// U(U)L is Head while U(U)U is Tail
|
|
|
|
return pre == Other || (cur == Upper && (pre == Lower || suc != Upper))
|
|
|
|
? Head
|
|
|
|
: Tail;
|
|
|
|
};
|
|
|
|
for (size_t i = 0; i < s.size() - 1; i++) {
|
|
|
|
suc = GetCharClass(s[i + 1]);
|
|
|
|
*class_set |= 1 << suc;
|
|
|
|
roles[i] = fn();
|
|
|
|
pre = cur;
|
|
|
|
cur = suc;
|
|
|
|
}
|
|
|
|
roles[s.size() - 1] = fn();
|
|
|
|
}
|
2018-08-09 17:08:14 +00:00
|
|
|
} // namespace
|
2018-02-12 09:01:02 +00:00
|
|
|
|
2018-03-16 07:19:49 +00:00
|
|
|
int FuzzyMatcher::MissScore(int j, bool last) {
|
2018-04-02 03:55:10 +00:00
|
|
|
int s = -3;
|
|
|
|
if (last)
|
|
|
|
s -= 10;
|
2018-03-16 07:19:49 +00:00
|
|
|
if (text_role[j] == Head)
|
|
|
|
s -= 10;
|
|
|
|
return s;
|
|
|
|
}
|
2018-02-12 09:01:02 +00:00
|
|
|
|
2018-03-16 07:19:49 +00:00
|
|
|
int FuzzyMatcher::MatchScore(int i, int j, bool last) {
|
2018-03-18 19:17:40 +00:00
|
|
|
int s = 0;
|
2018-04-02 03:55:10 +00:00
|
|
|
// Case matching.
|
2018-03-16 15:28:37 +00:00
|
|
|
if (pat[i] == text[j]) {
|
|
|
|
s++;
|
2018-04-02 03:55:10 +00:00
|
|
|
// pat contains uppercase letters or prefix matching.
|
2018-03-16 15:28:37 +00:00
|
|
|
if ((pat_set & 1 << Upper) || i == j)
|
2018-03-23 21:57:52 +00:00
|
|
|
s++;
|
|
|
|
}
|
|
|
|
if (pat_role[i] == Head) {
|
|
|
|
if (text_role[j] == Head)
|
|
|
|
s += 30;
|
|
|
|
else if (text_role[j] == Tail)
|
|
|
|
s -= 10;
|
2018-03-16 15:28:37 +00:00
|
|
|
}
|
2018-04-02 03:55:10 +00:00
|
|
|
// Matching a tail while previous char wasn't matched.
|
2018-03-16 07:19:49 +00:00
|
|
|
if (text_role[j] == Tail && i && !last)
|
|
|
|
s -= 30;
|
2018-04-02 03:55:10 +00:00
|
|
|
// First char of pat matches a tail.
|
2018-03-16 07:19:49 +00:00
|
|
|
if (i == 0 && text_role[j] == Tail)
|
2018-03-18 19:17:40 +00:00
|
|
|
s -= 40;
|
2018-03-16 07:19:49 +00:00
|
|
|
return s;
|
|
|
|
}
|
2018-02-12 09:01:02 +00:00
|
|
|
|
2018-04-14 18:57:23 +00:00
|
|
|
// Prepares a matcher for `pattern`.
//
// `sensitivity` selects how letters are compared in Match(): a stored value
// of 0 compares case-insensitively, any other value compares exactly. The
// value 1 is resolved here: it becomes 2 when the pattern contains an
// uppercase letter and 0 otherwise.
//
// Spaces are dropped from the pattern; `pat`, `low_pat` and `pat_role` end up
// holding the remaining characters, their lowercase forms and their roles.
FuzzyMatcher::FuzzyMatcher(std::string_view pattern, int sensitivity) {
  // NOTE(review): nothing here bounds pattern.size() against the capacity of
  // pat_role / low_pat (declared outside this file) -- confirm callers do.
  //
  // Roles are computed on the raw pattern so that a space still acts as a
  // separator: a letter right after a space gets the Head role.
  CalculateRoles(pattern, pat_role, &pat_set);
  if (sensitivity == 1)
    sensitivity = (pat_set & (1 << Upper)) ? 2 : 0;
  case_sensitivity = sensitivity;

  // Compact the non-space characters to the front; n <= i always holds, so
  // pat_role can be rewritten in place.
  size_t n = 0;
  for (size_t i = 0; i < pattern.size(); i++)
    if (pattern[i] != ' ') {
      pat += pattern[i];
      // tolower requires a value representable as unsigned char; a plain
      // (possibly negative) char would be undefined behavior.
      low_pat[n] = static_cast<char>(
          ::tolower(static_cast<unsigned char>(pattern[i])));
      pat_role[n] = pat_role[i];
      n++;
    }
}
|
2018-02-12 09:01:02 +00:00
|
|
|
|
2018-03-16 07:19:49 +00:00
|
|
|
int FuzzyMatcher::Match(std::string_view text) {
|
|
|
|
int n = int(text.size());
|
|
|
|
if (n > kMaxText)
|
|
|
|
return kMinScore + 1;
|
|
|
|
this->text = text;
|
|
|
|
for (int i = 0; i < n; i++)
|
|
|
|
low_text[i] = (char)::tolower(text[i]);
|
|
|
|
CalculateRoles(text, text_role, &text_set);
|
2018-03-18 21:26:15 +00:00
|
|
|
dp[0][0][0] = dp[0][0][1] = 0;
|
2018-03-16 07:19:49 +00:00
|
|
|
for (int j = 0; j < n; j++) {
|
|
|
|
dp[0][j + 1][0] = dp[0][j][0] + MissScore(j, false);
|
2018-03-18 19:17:40 +00:00
|
|
|
dp[0][j + 1][1] = kMinScore * 2;
|
2018-02-12 09:01:02 +00:00
|
|
|
}
|
2018-03-16 07:19:49 +00:00
|
|
|
for (int i = 0; i < int(pat.size()); i++) {
|
|
|
|
int(*pre)[2] = dp[i & 1];
|
2018-03-16 15:28:37 +00:00
|
|
|
int(*cur)[2] = dp[(i + 1) & 1];
|
2018-03-18 19:17:40 +00:00
|
|
|
cur[i][0] = cur[i][1] = kMinScore;
|
|
|
|
for (int j = i; j < n; j++) {
|
2018-03-16 07:19:49 +00:00
|
|
|
cur[j + 1][0] = std::max(cur[j][0] + MissScore(j, false),
|
|
|
|
cur[j][1] + MissScore(j, true));
|
2018-03-18 19:17:40 +00:00
|
|
|
// For the first char of pattern, apply extra restriction to filter bad
|
|
|
|
// candidates (e.g. |int| in |PRINT|)
|
2018-04-14 18:57:23 +00:00
|
|
|
cur[j + 1][1] = (case_sensitivity ? pat[i] == text[j]
|
|
|
|
: low_pat[i] == low_text[j] &&
|
|
|
|
(i || text_role[j] != Tail ||
|
|
|
|
pat[i] == text[j]))
|
|
|
|
? std::max(pre[j][0] + MatchScore(i, j, false),
|
|
|
|
pre[j][1] + MatchScore(i, j, true))
|
2018-05-11 21:23:53 +00:00
|
|
|
: kMinScore * 2;
|
2018-02-12 09:01:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Enumerate the end position of the match in str. Each removed trailing
|
2018-03-16 07:19:49 +00:00
|
|
|
// character has a penulty.
|
|
|
|
int ret = kMinScore;
|
2018-03-18 19:17:40 +00:00
|
|
|
for (int j = pat.size(); j <= n; j++)
|
2018-04-02 03:55:10 +00:00
|
|
|
ret = std::max(ret, dp[pat.size() & 1][j][1] - 2 * (n - j));
|
2018-03-16 07:19:49 +00:00
|
|
|
return ret;
|
2018-02-12 09:01:02 +00:00
|
|
|
}
|
2018-03-18 19:17:40 +00:00
|
|
|
|
2018-07-09 00:13:41 +00:00
|
|
|
#if 0
|
2018-03-18 19:17:40 +00:00
|
|
|
TEST_SUITE("fuzzy_match") {
  // Returns true when matching `pat` (case-insensitively) against `texts`
  // yields scores in non-increasing order, i.e. the texts are listed from best
  // to worst match. On failure every text is printed with its score.
  bool Ranks(std::string_view pat, std::vector<const char*> texts) {
    FuzzyMatcher fuzzy(pat, 0);
    std::vector<int> scores;
    scores.reserve(texts.size());
    for (auto text : texts)
      scores.push_back(fuzzy.Match(text));
    // Look for an adjacent pair where a later text outscores an earlier one.
    // Working on iterators also covers an empty list, where an index bound of
    // `texts.size() - 1` would wrap around.
    bool ret = std::adjacent_find(scores.begin(), scores.end(),
                                  [](int a, int b) { return a < b; }) ==
               scores.end();
    if (!ret) {
      for (size_t i = 0; i < texts.size(); i++)
        printf("%s %d ", texts[i], scores[i]);
      puts("");
    }
    return ret;
  }

  TEST_CASE("test") {
    FuzzyMatcher fuzzy("", 0);
    CHECK(fuzzy.Match("") == 0);
    CHECK(fuzzy.Match("aaa") < 0);

    // case
    CHECK(Ranks("monad", {"monad", "Monad", "mONAD"}));
    // initials
    CHECK(Ranks("ab", {"ab", "aoo_boo", "acb"}));
    CHECK(Ranks("CC", {"CamelCase", "camelCase", "camelcase"}));
    CHECK(Ranks("cC", {"camelCase", "CamelCase", "camelcase"}));
    CHECK(Ranks("c c", {"camelCase", "camel case", "CamelCase", "camelcase",
                        "camel ace"}));
    CHECK(Ranks("Da.Te",
                {"Data.Text", "Data.Text.Lazy", "Data.Aeson.Encoding.text"}));
    CHECK(Ranks("foo bar.h", {"foo/bar.h", "foobar.h"}));
    // prefix
    CHECK(Ranks("is", {"isIEEE", "inSuf"}));
    // shorter
    CHECK(Ranks("ma", {"map", "many", "maximum"}));
    CHECK(Ranks("print", {"printf", "sprintf"}));
    // score(PRINT) = kMinScore
    CHECK(Ranks("ast", {"ast", "AST", "INT_FAST16_MAX"}));
    // score(PRINT) > kMinScore
    CHECK(Ranks("Int", {"int", "INT", "PRINT"}));
  }
}
|
2018-07-09 00:13:41 +00:00
|
|
|
#endif
|