Add fuzzy_match.h and let textDocument/definition search for the identifier at point (e.g. in comments)

This commit is contained in:
Fangrui Song 2018-02-12 01:01:02 -08:00
parent afd38cbce9
commit 8a939389d8
4 changed files with 178 additions and 117 deletions

120
src/fuzzy_match.cc Normal file
View File

@ -0,0 +1,120 @@
#include "fuzzy_match.h"
#include <ctype.h>
#include <limits.h>
#include <algorithm>
// Scoring weights used by FuzzyEvaluate. Penalties are negative and bonuses
// are positive; they are added to the accumulated alignment score.

// Penalty of dropping a leading character in str. Applied once per character
// of str that precedes the position aligned with the first pattern character.
constexpr int kLeadingGapScore = -4;
// Penalty of dropping a non-leading character in str. Applied once per
// skipped character between two aligned characters and after the last one.
constexpr int kGapScore = -5;
// Bonus of aligning with an initial character of a word in pattern. Must be
// greater than 1. The position score of the aligned character in str is
// multiplied by this factor.
constexpr int kPatternStartMultiplier = 2;
// Position score of a word character in str that follows a non-word character
// (or starts the string).
constexpr int kWordStartScore = 50;
// Position score of a non-word character in str.
constexpr int kNonWordScore = 40;
// Extra bonus when the pattern character and the character in str are equal
// including case.
constexpr int kCaseMatchScore = 2;
// Bonus for aligning two adjacent pattern characters with two adjacent
// characters in str.
// Less than kWordStartScore
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// Position score of a sub-word start: lower -> upper (camelCase) or
// non-digit -> digit (camel123).
// Slightly less than kConsecutiveScore
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;
// Coarse character categories used to detect word and sub-word boundaries.
enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| as a lowercase letter, uppercase letter, digit or anything
// else (NonWord), according to the <ctype.h> predicates.
//
// |c| may be a plain `char` value that was promoted to int. On platforms
// where `char` is signed, non-ASCII bytes arrive as negative values, and
// passing those to islower()/isupper()/isdigit() is undefined behaviour. The
// value is therefore reduced to the unsigned char range before it is tested.
static CharClass GetCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}
// Returns the position score of a character of class |curr| that directly
// follows a character of class |prev|.
//   - non-word character                      -> kNonWordScore
//   - word character after a non-word one     -> kWordStartScore
//   - lower -> upper, or non-digit -> digit   -> kCamelScore
//   - anything else (middle of a word)        -> 0
static int GetScoreFor(CharClass prev, CharClass curr) {
  // A non-word character scores the same regardless of what precedes it.
  if (curr == CharClass::NonWord)
    return kNonWordScore;

  // From here on |curr| is a word character.
  if (prev == CharClass::NonWord)
    return kWordStartScore;

  const bool camel_hump =
      prev == CharClass::Lower && curr == CharClass::Upper;
  const bool digit_run_start =
      curr == CharClass::Digit && prev != CharClass::Digit;
  return (camel_hump || digit_run_start) ? kCamelScore : 0;
}
/*
FuzzyEvaluate implements a global sequence alignment algorithm to find the
maximum accumulated score by aligning `pattern` to `str`. It applies when
`pattern` is a subsequence of `str` (compared case-insensitively, whitespace
in `pattern` ignored). When no such alignment exists, kMinScore is returned.

Scoring criteria
- Prefer matches at the start of a word, or the start of subwords in
  CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
- Non-word characters matter. See kNonWordScore
- The first characters of words of `pattern` receive bonus because they usually
  have more significance than the rest. See kPatternStartMultiplier
- Superfluous characters in `str` will reduce the score (gap penalty). See
  kGapScore
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore

The recurrence of the dynamic programming:
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
dp[0][j] = leading_gap_penalty(0, j) + score[j]
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
gap_penalty(k+1, j) + score[j] : k < j))
The first dimension can be suppressed since we do not need a matching scheme,
which reduces the space complexity from O(N*M) to O(M)

Parameters
- pattern: the query. Whitespace is not matched; it marks the next character
  as the start of a new pattern word.
- str: the candidate string.
- score, dp: caller-owned scratch buffers that are overwritten. They are grown
  if shorter than `str`, so reusing them across calls avoids reallocation.
*/
int FuzzyEvaluate(std::string_view pattern,
                  std::string_view str,
                  std::vector<int>& score,
                  std::vector<int>& dp) {
  const int n = int(str.size());

  // Never index the scratch buffers out of bounds, even if the caller passed
  // vectors that are too short.
  if (score.size() < str.size())
    score.resize(str.size());
  if (dp.size() < str.size())
    dp.resize(str.size());

  bool pfirst = true,  // aligning the first character of pattern
      pstart = true;   // whether we are aligning the start of a word in pattern

  // Calculate position score for each character in str.
  CharClass prev = CharClass::NonWord;
  for (int i = 0; i < n; i++) {
    // Go through unsigned char so non-ASCII bytes never reach <ctype.h>
    // functions as negative values.
    CharClass cur = GetCharClass(static_cast<unsigned char>(str[i]));
    score[i] = GetScoreFor(prev, cur);
    prev = cur;
  }
  std::fill_n(dp.begin(), str.size(), kMinScore);

  // Align each character of pattern. Before a row is processed dp[] holds the
  // previous row; it is overwritten in place.
  for (unsigned char pc : pattern) {
    if (isspace(pc)) {
      pstart = true;
      continue;
    }
    // These must be reset for every row: nothing lies to the upper left of
    // column 0. Carrying the values over from the end of the previous row
    // would let a later pattern character align at str[0] with a bogus score.
    int uleft = kMinScore;   // value of the upper left cell
    int ulefts = kMinScore;  // best of uleft and the cells to its left,
                             // including gap penalties
    int lefts = kMinScore;   // same as ulefts, but including the current column

    // Enumerate the character in str to be aligned with pc.
    for (int i = 0; i < n; i++) {
      const unsigned char sc = static_cast<unsigned char>(str[i]);
      const int left = dp[i];
      lefts = std::max(lefts + kGapScore, left);

      // Case-insensitive comparison.
      if (tolower(pc) != tolower(sc)) {
        dp[i] = kMinScore;
      } else {
        const int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
        const int case_bonus = pc == sc ? kCaseMatchScore : 0;
        if (pfirst) {
          dp[i] = kLeadingGapScore * i + t + case_bonus;
        } else if (ulefts <= kMinScore) {
          // No valid alignment of the preceding pattern characters ends
          // before this column (ulefts >= uleft always holds), so this cell
          // is unreachable. Keep the sentinel instead of building on it.
          dp[i] = kMinScore;
        } else {
          dp[i] = std::max(uleft + kConsecutiveScore, ulefts + t) + case_bonus;
        }
      }
      uleft = left;
      ulefts = lefts;
    }
    pfirst = pstart = false;
  }

  // Enumerate the end position of the match in str. Each removed trailing
  // character has a penalty of kGapScore.
  int best = kMinScore;
  for (int i = 0; i < n; i++)
    best = std::max(best + kGapScore, dp[i]);
  return best;
}

17
src/fuzzy_match.h Normal file
View File

@ -0,0 +1,17 @@
#pragma once
#include <string_view.h>
#include <limits.h>
#include <vector>
// Lowest score, used by FuzzyEvaluate to mark positions that cannot be
// aligned.
// Negative but far from INT_MIN so that intermediate results are hard to
// overflow
constexpr int kMinScore = INT_MIN / 2;
// Evaluate the score matching |pattern| against |str|, the larger the better.
// Matching is case-insensitive, with a small bonus for exact-case matches.
// Whitespace in |pattern| is not matched; it marks the start of a new pattern
// word. The algorithm is intended for the case where |pattern| is a
// subsequence of |str|.
// |score| and |dp| must be at least as long as |str|. They are scratch
// buffers that are overwritten by the call.
int FuzzyEvaluate(std::string_view pattern,
std::string_view str,
std::vector<int>& score,
std::vector<int>& dp);

View File

@ -1,7 +1,12 @@
#include "fuzzy_match.h"
#include "lex_utils.h"
#include "message_handler.h"
#include "query_utils.h"
#include "queue_manager.h"
#include <ctype.h>
#include <limits.h>
namespace {
void PushBack(std::vector<lsLocation>* result, optional<lsLocation> location) {
if (location)
@ -61,12 +66,14 @@ struct TextDocumentDefinitionHandler
Out_TextDocumentDefinition out;
out.id = request->id;
bool has_symbol = false;
int target_line = request->params.position.line;
int target_column = request->params.position.character;
for (SymbolRef sym :
FindSymbolsAtLocation(working_file, file, request->params.position)) {
// Found symbol. Return definition.
has_symbol = true;
// Special cases which are handled:
// - symbol has declaration but no definition (ie, pure virtual)
@ -121,6 +128,39 @@ struct TextDocumentDefinitionHandler
break;
}
}
// Find the best match of the identifier at point.
//
// Fallback used when no symbol was found at the requested position: take the
// identifier-like word under the cursor from the buffer text, rank every
// symbol's short name against it with FuzzyEvaluate, and report the
// definition of the best-ranked symbol.
if (!has_symbol && db->symbols.size()) {
  const std::string& buffer = working_file->buffer_content;
  const int buffer_size = (int)buffer.size();

  // <ctype.h> predicates are undefined for negative arguments, and plain char
  // may be negative for non-ASCII bytes, hence the cast. '_' is part of an
  // identifier as well, so that `foo_bar` is not split at the underscore.
  auto is_identifier_char = [](char c) {
    return isalnum(static_cast<unsigned char>(c)) || c == '_';
  };

  int start = GetOffsetForPosition(request->params.position, buffer);
  // Defensive clamp so the scans below cannot start outside of the buffer.
  if (start < 0)
    start = 0;
  if (start > buffer_size)
    start = buffer_size;
  int end = start;
  while (start > 0 && is_identifier_char(buffer[start - 1]))
    start--;
  while (end < buffer_size && is_identifier_char(buffer[end]))
    end++;
  auto query = std::string_view(buffer).substr(start, end - start);

  // An empty query cannot rank candidates; without this check the loop below
  // would leave best_i at 0 and an unrelated definition would be returned.
  if (!query.empty()) {
    int best_score = kMinScore;
    int best_i = 0;
    // Scratch buffers for FuzzyEvaluate, grown to the longest name seen so
    // far and reused for every candidate.
    std::vector<int> score, dp;
    for (int i = 0; i < (int)db->symbols.size(); ++i) {
      std::string_view short_name = db->GetSymbolShortName(i);
      if (short_name.size() > score.size()) {
        score.resize(short_name.size());
        dp.resize(short_name.size());
      }
      // NOTE(review): FuzzyEvaluate is documented for patterns that are a
      // subsequence of the candidate; candidates are not pre-filtered here.
      // Confirm that ranking non-matching names this way is intended.
      int t = FuzzyEvaluate(query, short_name, score, dp);
      if (t > best_score) {
        best_score = t;
        best_i = i;
      }
    }
    Maybe<Use> use = GetDefinitionSpellingOfSymbol(db, db->symbols[best_i]);
    if (use) {
      optional<lsLocation> ls_loc = GetLsLocation(db, working_files, *use);
      if (ls_loc)
        out.result.push_back(*ls_loc);
    }
  }
}
}
QueueManager::WriteStdout(IpcId::TextDocumentDefinition, out);

View File

@ -1,3 +1,4 @@
#include "fuzzy_match.h"
#include "lex_utils.h"
#include "message_handler.h"
#include "query_utils.h"
@ -60,123 +61,6 @@ MAKE_REFLECT_STRUCT(Out_WorkspaceSymbol, jsonrpc, id, result);
///// Fuzzy matching
// Lowest score, used to mark positions that cannot be aligned.
// Negative but far from INT_MIN so that intermediate results are hard to
// overflow
constexpr int kMinScore = INT_MIN / 2;
// Penalty of dropping a leading character in str
constexpr int kLeadingGapScore = -4;
// Penalty of dropping a non-leading character in str
constexpr int kGapScore = -5;
// Bonus of aligning with an initial character of a word in pattern. Must be
// greater than 1
constexpr int kPatternStartMultiplier = 2;
// Position score of a word character that follows a non-word character
constexpr int kWordStartScore = 50;
// Position score of a non-word character
constexpr int kNonWordScore = 40;
// Extra bonus when the aligned characters are equal including case
constexpr int kCaseMatchScore = 2;
// Bonus for aligning adjacent pattern characters with adjacent characters in
// str. Less than kWordStartScore
constexpr int kConsecutiveScore = kWordStartScore + kGapScore;
// Position score of a sub-word start (camelCase / camel123).
// Slightly less than kConsecutiveScore
constexpr int kCamelScore = kWordStartScore + kGapScore - 1;

// Coarse character categories used to detect word and sub-word boundaries.
enum class CharClass { Lower, Upper, Digit, NonWord };

// Classifies |c| according to the <ctype.h> predicates. |c| may be a promoted
// plain `char`; it is reduced to the unsigned char range first because
// negative arguments (non-ASCII bytes with a signed char) are undefined
// behaviour for islower()/isupper()/isdigit().
static CharClass GetCharClass(int c) {
  const int uc = static_cast<unsigned char>(c);
  if (islower(uc))
    return CharClass::Lower;
  if (isupper(uc))
    return CharClass::Upper;
  if (isdigit(uc))
    return CharClass::Digit;
  return CharClass::NonWord;
}

// Returns the position score of a character of class |curr| that directly
// follows a character of class |prev|: word start, sub-word (camel) start,
// non-word character, or 0 for the middle of a word.
static int GetScoreFor(CharClass prev, CharClass curr) {
  if (prev == CharClass::NonWord && curr != CharClass::NonWord)
    return kWordStartScore;
  if ((prev == CharClass::Lower && curr == CharClass::Upper) ||
      (prev != CharClass::Digit && curr == CharClass::Digit))
    return kCamelScore;
  if (curr == CharClass::NonWord)
    return kNonWordScore;
  return 0;
}

/*
FuzzyEvaluate implements a global sequence alignment algorithm to find the
maximum accumulated score by aligning `pattern` to `str`. It applies when
`pattern` is a subsequence of `str` (compared case-insensitively, whitespace
in `pattern` ignored). When no such alignment exists, kMinScore is returned.

Scoring criteria
- Prefer matches at the start of a word, or the start of subwords in
  CamelCase/camelCase/camel123 words. See kWordStartScore/kCamelScore
- Non-word characters matter. See kNonWordScore
- The first characters of words of `pattern` receive bonus because they usually
  have more significance than the rest. See kPatternStartMultiplier
- Superfluous characters in `str` will reduce the score (gap penalty). See
  kGapScore
- Prefer early occurrence of the first character. See kLeadingGapScore/kGapScore

The recurrence of the dynamic programming:
dp[i][j]: maximum accumulated score by aligning pattern[0..i] to str[0..j]
dp[0][j] = leading_gap_penalty(0, j) + score[j]
dp[i][j] = max(dp[i-1][j-1] + CONSECUTIVE_SCORE, max(dp[i-1][k] +
gap_penalty(k+1, j) + score[j] : k < j))
The first dimension can be suppressed since we do not need a matching scheme,
which reduces the space complexity from O(N*M) to O(M)

Parameters
- pattern: the query. Whitespace is not matched; it marks the next character
  as the start of a new pattern word.
- str: the candidate string.
- score, dp: caller-owned scratch buffers that are overwritten. They are grown
  if shorter than `str`, so reusing them across calls avoids reallocation.
*/
int FuzzyEvaluate(std::string_view pattern,
                  std::string_view str,
                  std::vector<int>& score,
                  std::vector<int>& dp) {
  const int n = int(str.size());

  // Never index the scratch buffers out of bounds, even if the caller passed
  // vectors that are too short.
  if (score.size() < str.size())
    score.resize(str.size());
  if (dp.size() < str.size())
    dp.resize(str.size());

  bool pfirst = true,  // aligning the first character of pattern
      pstart = true;   // whether we are aligning the start of a word in pattern

  // Calculate position score for each character in str.
  CharClass prev = CharClass::NonWord;
  for (int i = 0; i < n; i++) {
    CharClass cur = GetCharClass(static_cast<unsigned char>(str[i]));
    score[i] = GetScoreFor(prev, cur);
    prev = cur;
  }
  std::fill_n(dp.begin(), str.size(), kMinScore);

  // Align each character of pattern. Before a row is processed dp[] holds the
  // previous row; it is overwritten in place.
  for (unsigned char pc : pattern) {
    if (isspace(pc)) {
      pstart = true;
      continue;
    }
    // These must be reset for every row: nothing lies to the upper left of
    // column 0. Carrying the values over from the end of the previous row
    // would let a later pattern character align at str[0] with a bogus score.
    int uleft = kMinScore;   // value of the upper left cell
    int ulefts = kMinScore;  // best of uleft and the cells to its left,
                             // including gap penalties
    int lefts = kMinScore;   // same as ulefts, but including the current column

    // Enumerate the character in str to be aligned with pc.
    for (int i = 0; i < n; i++) {
      const unsigned char sc = static_cast<unsigned char>(str[i]);
      const int left = dp[i];
      lefts = std::max(lefts + kGapScore, left);

      // Case-insensitive comparison.
      if (tolower(pc) != tolower(sc)) {
        dp[i] = kMinScore;
      } else {
        const int t = score[i] * (pstart ? kPatternStartMultiplier : 1);
        const int case_bonus = pc == sc ? kCaseMatchScore : 0;
        if (pfirst) {
          dp[i] = kLeadingGapScore * i + t + case_bonus;
        } else if (ulefts <= kMinScore) {
          // No valid alignment of the preceding pattern characters ends
          // before this column (ulefts >= uleft always holds), so this cell
          // is unreachable. Keep the sentinel instead of building on it.
          dp[i] = kMinScore;
        } else {
          dp[i] = std::max(uleft + kConsecutiveScore, ulefts + t) + case_bonus;
        }
      }
      uleft = left;
      ulefts = lefts;
    }
    pfirst = pstart = false;
  }

  // Enumerate the end position of the match in str. Each removed trailing
  // character has a penalty of kGapScore.
  int best = kMinScore;
  for (int i = 0; i < n; i++)
    best = std::max(best + kGapScore, dp[i]);
  return best;
}
struct WorkspaceSymbolHandler : BaseMessageHandler<Ipc_WorkspaceSymbol> {
void Run(Ipc_WorkspaceSymbol* request) override {