From 639f587e01312ac23500a31b51d29c34010c3518 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 18 Mar 2018 12:17:40 -0700 Subject: [PATCH] Optimize FuzzyMatcher and add tests. --- src/fuzzy_match.cc | 76 +++++++++++++++++++++++++------- src/fuzzy_match.h | 2 +- src/messages/workspace_symbol.cc | 5 ++- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/src/fuzzy_match.cc b/src/fuzzy_match.cc index 269b2a94..79b0727b 100644 --- a/src/fuzzy_match.cc +++ b/src/fuzzy_match.cc @@ -1,7 +1,11 @@ #include "fuzzy_match.h" +#include + #include +#include #include +#include enum CharClass { Other, Lower, Upper }; enum CharRole { None, Tail, Head }; @@ -49,20 +53,20 @@ int FuzzyMatcher::MissScore(int j, bool last) { } int FuzzyMatcher::MatchScore(int i, int j, bool last) { - int s = 40; + int s = 0; if (pat[i] == text[j]) { s++; if ((pat_set & 1 << Upper) || i == j) - s += 20; + s += 10; } if (pat_role[i] == Head && text_role[j] == Head) - s += 50; + s += 30; if (text_role[j] == Tail && i && !last) - s -= 50; - if (pat_role[i] == Head && text_role[j] == Tail) s -= 30; + if (pat_role[i] == Head && text_role[j] == Tail) + s -= 10; if (i == 0 && text_role[j] == Tail) - s -= 70; + s -= 40; return s; } @@ -87,31 +91,73 @@ int FuzzyMatcher::Match(std::string_view text) { low_text[i] = ::tolower(text[i]); CalculateRoles(text, text_role, &text_set); dp[0][0][0] = 0; - dp[0][0][1] = kMinScore; + dp[0][0][1] = kMinScore * 2; for (int j = 0; j < n; j++) { dp[0][j + 1][0] = dp[0][j][0] + MissScore(j, false); - dp[0][j + 1][1] = kMinScore; + dp[0][j + 1][1] = kMinScore * 2; } for (int i = 0; i < int(pat.size()); i++) { int(*pre)[2] = dp[i & 1]; int(*cur)[2] = dp[(i + 1) & 1]; - cur[0][0] = cur[0][1] = kMinScore; - for (int j = 0; j < n; j++) { + cur[i][0] = cur[i][1] = kMinScore; + for (int j = i; j < n; j++) { cur[j + 1][0] = std::max(cur[j][0] + MissScore(j, false), cur[j][1] + MissScore(j, true)); - if (low_pat[i] != low_text[j]) - cur[j + 1][1] = kMinScore; - else { + // For the first char of pattern, apply extra restriction to filter bad + // candidates (e.g. |int| in |PRINT|) + if (low_pat[i] == low_text[j] && + (i || text_role[j] != Tail || pat[i] == text[j])) { cur[j + 1][1] = std::max(pre[j][0] + MatchScore(i, j, false), pre[j][1] + MatchScore(i, j, true)); - } + } else + cur[j + 1][1] = kMinScore * 2; } } // Enumerate the end position of the match in str. Each removed trailing // character has a penulty. int ret = kMinScore; - for (int j = 1; j <= n; j++) + for (int j = pat.size(); j <= n; j++) ret = std::max(ret, dp[pat.size() & 1][j][1] - 3 * (n - j)); return ret; } + +TEST_SUITE("fuzzy_match") { + bool Ranks(std::string_view pat, std::vector texts) { + FuzzyMatcher fuzzy(pat); + std::vector scores; + for (auto text : texts) + scores.push_back(fuzzy.Match(text)); + bool ret = true; + for (size_t i = 0; i < texts.size() - 1; i++) + if (scores[i] < scores[i + 1]) { + ret = false; + break; + } + if (1 || !ret) { + for (size_t i = 0; i < texts.size(); i++) + printf("%s %d ", texts[i], scores[i]); + puts(""); + } + return ret; + } + + TEST_CASE("test") { + // case + Ranks("monad", {"monad", "Monad", "mONAD"}); + // initials + Ranks("ab", {"ab", "aoo_boo", "acb"}); + Ranks("CC", {"CamelCase", "camelCase", "camelcase"}); + Ranks("cC", {"camelCase", "CamelCase", "camelcase"}); + Ranks("Da.Te", {"Data.Text", "Data.Text.Lazy", "Data.Aeson.Encoding.text"}); + // prefix + Ranks("is", {"isIEEE", "inSuf"}); + // shorter + Ranks("ma", {"map", "many", "maximum"}); + Ranks("print", {"printf", "sprintf"}); + // score(PRINT) = kMinScore + Ranks("int", {"int", "INT", "PRINT"}); + // score(PRINT) > kMinScore + Ranks("Int", {"int", "INT", "PRINT"}); + } +} diff --git a/src/fuzzy_match.h b/src/fuzzy_match.h index a3b5191c..79599857 100644 --- a/src/fuzzy_match.h +++ b/src/fuzzy_match.h @@ -11,7 +11,7 @@ public: constexpr static int kMaxText = 200; // Negative but far from INT_MIN so that intermediate results are hard to // overflow. - constexpr static int kMinScore = INT_MIN / 2; + constexpr static int kMinScore = INT_MIN / 4; FuzzyMatcher(std::string_view pattern); int Match(std::string_view text); diff --git a/src/messages/workspace_symbol.cc b/src/messages/workspace_symbol.cc index f6a00f27..e02b2212 100644 --- a/src/messages/workspace_symbol.cc +++ b/src/messages/workspace_symbol.cc @@ -136,7 +136,10 @@ struct WorkspaceSymbolHandler : BaseMessageHandler { std::sort(permutation.begin(), permutation.end(), std::greater>()); out.result.reserve(result_indices.size()); - for (int i = 0; i < int(result_indices.size()); i++) + // Discard awful candidates. + for (int i = 0; i < int(result_indices.size()) && + permutation[i].first > FuzzyMatcher::kMinScore; + i++) out.result.push_back( std::move(unsorted_results[permutation[i].second])); } else {