Commit b8d9bc19 authored by Leonard Grey, committed by Commit Bot

Commander: implement naive fuzzy finding

Bug: 1014639
Change-Id: I2168155284dd77296a6e26410e0d242092de8b57
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2410525
Commit-Queue: Leonard Grey <lgrey@chromium.org>
Reviewed-by: Elly Fong-Jones <ellyjones@chromium.org>
Cr-Commit-Position: refs/heads/master@{#807544}
parent 5b24eaba
......@@ -5,6 +5,136 @@
#include "chrome/browser/ui/commander/fuzzy_finder.h"
#include "base/i18n/case_conversion.h"
#include "base/i18n/char_iterator.h"
#include "base/strings/string_util.h"
namespace {
// Score reserved for exact matches.
constexpr double kMaxScore = 1.0;
// Score returned when needle is a prefix of haystack.
constexpr double kPrefixScore = .99;
// Score returned when a heuristic determines that the match should score
// highly, but it is *not* an exact match or prefix.
constexpr double kVeryHighScore = .95;
// One contiguous run of matched positions in the haystack, together with the
// bookkeeping that ScoreForMatches() needs in order to weigh it.
struct MatchRecord {
  // |match_start| and |match_end| are forwarded unchanged to gfx::Range.
  MatchRecord(int match_start,
              int match_end,
              bool began_on_boundary,
              int preceding_gap)
      : range(match_start, match_end),
        gap_before(preceding_gap),
        is_boundary(began_on_boundary) {}

  // Span of the haystack covered by this run.
  gfx::Range range;
  // Size of the gap recorded in front of this run.
  int gap_before;
  // Whether the run was flagged as starting on a boundary.
  bool is_boundary;
};
// Scores matches identified by ConsecutiveMatchWithGaps(). See that comment
// for details.
double ScoreForMatches(const std::vector<MatchRecord>& matches,
size_t needle_size,
size_t haystack_size) {
// |base_score| is the maximum per match, so total should not exceed 1.0.
const double base_score = 1.0 / needle_size;
const double gap_penalty = 1.0 / haystack_size;
static const double kRegularMultiplier = .5;
static const double kWordBoundaryMultiplier = .8;
static const double kInitialMultiplier = 1.0;
double score = 0;
for (size_t i = 0; i < matches.size(); i++) {
MatchRecord match = matches[i];
// The first character of the match is special; it gets a relative bonus
// if it is on a boundary. Otherwise, it is penalized by the distance
// between it and the previous match.
if (match.is_boundary) {
score +=
base_score * (i == 0 ? kInitialMultiplier : kWordBoundaryMultiplier);
} else {
double penalty_multiplier = 1 - (gap_penalty * match.gap_before);
DCHECK_GT(penalty_multiplier, 0);
score += base_score * kRegularMultiplier * penalty_multiplier;
}
// ...then the rest of a contiguous match.
score += (match.range.length() - 1) * base_score * kRegularMultiplier;
}
DCHECK(score <= 1.0);
return score;
}
// Returns a positive score if every code point in |needle| is present in
// |haystack| in the same order. The match *need not* be contiguous. Matches in
// special positions are given extra weight, and noncontiguous matches are
// penalized based on the size of the gaps between.
// This is not guaranteed to return the best possible match; for example, given
// needle = "orange" and haystack = "William of Orange", this function will
// match as "William [o]f O[range]" rather than "William of [Orange]". Its main
// use is to filter nonmatches before a more comprehensive algorithm, and as a
// fallback for when the inputs are too large for a more comprehensive
// algorithm to be performant.
//
// |needle| and |haystack| must already be case folded, and |matched_ranges|
// must be empty on entry (all three are DCHECKed). On a match,
// |matched_ranges| receives the matched runs of |haystack| in order. On a
// non-match it is left empty and 0 is returned.
double ConsecutiveMatchWithGaps(const base::string16& needle,
                                const base::string16& haystack,
                                std::vector<gfx::Range>* matched_ranges) {
  DCHECK(needle == base::i18n::FoldCase(needle));
  DCHECK(haystack == base::i18n::FoldCase(haystack));
  DCHECK(matched_ranges->empty());
  // Special case for prefix.
  if (base::StartsWith(haystack, needle)) {
    matched_ranges->emplace_back(0, needle.size());
    return kPrefixScore;
  }
  base::i18n::UTF16CharIterator n_iter(&needle);
  base::i18n::UTF16CharIterator h_iter(&haystack);

  std::vector<MatchRecord> matches;
  int gap_size_before_match = 0;
  // This is a flag, so it is declared bool (it feeds MatchRecord's bool
  // |is_boundary| parameter).
  bool match_began_on_boundary = true;
  bool in_match = false;
  int match_start = -1;

  // Find matching ranges.
  while (!n_iter.end() && !h_iter.end()) {
    if (n_iter.get() == h_iter.get()) {
      // There's a match.
      if (!in_match) {
        // Match start.
        in_match = true;
        match_start = h_iter.array_pos();
        match_began_on_boundary =
            h_iter.start() ||
            base::IsUnicodeWhitespace(h_iter.PreviousCodePoint());
      }
      h_iter.Advance();
      n_iter.Advance();
    } else {
      if (in_match) {
        // The current run just ended; record it.
        DCHECK_NE(match_start, -1);
        in_match = false;
        matches.emplace_back(match_start, h_iter.array_pos(),
                             match_began_on_boundary, gap_size_before_match);
        // The mismatching position itself is the first element of the gap
        // that precedes the next run.
        gap_size_before_match = 1;
        match_start = -1;
      } else {
        gap_size_before_match++;
      }
      // Only the haystack advances on a mismatch; the needle position must
      // still be matched later.
      h_iter.Advance();
    }
  }
  if (!n_iter.end()) {
    // Didn't match all of |needle|.
    matched_ranges->clear();
    return 0;
  }
  if (in_match) {
    // The loop ended while a run was still open; close it.
    DCHECK_NE(match_start, -1);
    matches.emplace_back(match_start, h_iter.array_pos(),
                         match_began_on_boundary, gap_size_before_match);
  }
  for (const MatchRecord& match : matches) {
    matched_ranges->push_back(match.range);
  }
  double score = ScoreForMatches(matches, needle.size(), haystack.size());
  score *= kPrefixScore;  // Normalize so that a prefix always wins.
  return score;
}
} // namespace
namespace commander {
......@@ -14,18 +144,75 @@ double FuzzyFind(const base::string16& needle,
DCHECK(needle == base::i18n::FoldCase(needle));
matched_ranges->clear();
const base::string16& folded = base::i18n::FoldCase(haystack);
if (folded == needle) {
matched_ranges->emplace_back(0, needle.length());
return 1.0;
}
size_t substring_position = folded.find(needle);
if (substring_position == std::string::npos)
size_t m = needle.size();
size_t n = folded.size();
// Special case 0: M > N. We don't allow skipping anything in |needle|, so
// no match possible.
if (m > n) {
return 0;
matched_ranges->emplace_back(substring_position, needle.length());
if (substring_position == 0)
return .99;
return std::min(1 - static_cast<double>(substring_position) / folded.length(),
0.01);
}
// Special case 1: M == N. It must be either an exact match,
// or a non-match.
if (m == n) {
if (folded == needle) {
matched_ranges->emplace_back(0, needle.length());
return kMaxScore;
} else {
return 0;
}
}
// Special case 2: M == 1. Scan through all matches, and return:
// no match ->
// 0
// prefix match ->
// kPrefixScore
// word boundary match (e.g. needle: j, haystack "Orange [J]uice") ->
// kVeryHighScore
// any other match ->
// Scored based on how far into haystack needle is found, normalized by
// haystack length.
if (m == 1) {
size_t substring_position = folded.find(needle);
while (substring_position != std::string::npos) {
if (substring_position == 0) {
// Prefix match.
matched_ranges->emplace_back(0, 1);
return kPrefixScore;
} else {
wchar_t previous = folded.at(substring_position - 1);
if (base::IsUnicodeWhitespace(previous)) {
// Word boundary. Since we've eliminated prefix by now, this is as
// good as we're going to get, so we can return.
matched_ranges->clear();
matched_ranges->emplace_back(substring_position,
substring_position + 1);
return kVeryHighScore;
// Internal match. If |matched_ranges| is already populated, we've
// seen another internal match previously, so ignore this one.
} else if (matched_ranges->empty()) {
matched_ranges->emplace_back(substring_position,
substring_position + 1);
}
}
substring_position = folded.find(needle, substring_position + 1);
}
if (matched_ranges->empty()) {
return 0;
} else {
// First internal match.
DCHECK_EQ(matched_ranges->size(), 1u);
double position = static_cast<double>(matched_ranges->back().start());
return std::min(1 - position / folded.length(), 0.01);
}
}
// This has two purposes:
// 1. If there's no match here, we should bail instead of wasting time on the
// full O(mn) matching algorithm.
// 2. If m * n is too big, we will use this result instead of doing the
// full O(mn) matching algorithm.
// ***TEMPORARY***: The full algorithm isn't implemented yet, so we will use
// this unconditionally for now.
return ConsecutiveMatchWithGaps(needle, folded, matched_ranges);
}
} // namespace commander
......@@ -12,21 +12,15 @@
namespace commander {
// TODO(lgrey): Make this actually fuzzy find.
// Returns a score from 0 to 1 based on how well |needle| matches |haystack|.
// 0 means no match. |matched_ranges| will be filled with the ranges of
// |haystack| that match |needle| so they can be highlighted in the UI; see
// comment on commander::CommandItem |matched_ranges| for a worked example.
// *** TEMPORARY ***
// Temporarily, a non-zero match means that |needle| is a substring of
// |haystack|, with a penalty applied based on how far into |haystack|
// |needle| begins. Exact matches are 1.0 (vs. a max of .99 for non-exact
// prefix).
// This will be replaced with a more sophisticated implementation in the
// near future.
// *** END TEMPORARY ***
// |needle| is expected to already be case folded (this is DCHECKED) to save
// redundant processing, as the needle will be checked with many haystacks.
// redundant processing, as one needle will be checked against many haystacks.
// TODO(lgrey): This currently uses an algorithm which is not guaranteed to
// return the optimal match. Update this to use a more comprehensive method
// when inputs are small enough.
double FuzzyFind(const base::string16& needle,
const base::string16& haystack,
std::vector<gfx::Range>* matched_ranges);
......
......@@ -26,6 +26,44 @@ TEST(CommanderFuzzyFinder, ExactMatchIsOne) {
EXPECT_EQ(ranges, std::vector<gfx::Range>({{0, 6}}));
}
// Covers the equal-length fast path with a pair that does not match. The
// matching case for that path is exercised by ExactMatchIsOne().
TEST(CommanderFuzzyFinder, NeedleHaystackSameLength) {
  std::vector<gfx::Range> matched;
  const double score = FuzzyFind(base::ASCIIToUTF16("ranges"),
                                 base::ASCIIToUTF16("orange"), &matched);
  EXPECT_EQ(0, score);
  EXPECT_TRUE(matched.empty());
}
// Covers the single-character-needle fast path. This only makes sure the path
// has coverage; it does not verify that the fast path is the one taken.
TEST(CommanderFuzzyFinder, SingleCharNeedle) {
  std::vector<gfx::Range> matched;

  // Needle is the first character of the haystack.
  const double prefix_score = FuzzyFind(
      base::ASCIIToUTF16("o"), base::ASCIIToUTF16("orange"), &matched);
  EXPECT_EQ(matched, std::vector<gfx::Range>({{0, 1}}));

  // Needle only occurs in the interior of a word.
  const double internal_score = FuzzyFind(
      base::ASCIIToUTF16("o"), base::ASCIIToUTF16("phone"), &matched);
  EXPECT_EQ(matched, std::vector<gfx::Range>({{2, 3}}));

  // Needle occurs in the interior of the first word and again at the start of
  // the second word; the word-start occurrence is the one reported.
  const double boundary_score = FuzzyFind(
      base::ASCIIToUTF16("o"), base::ASCIIToUTF16("phone operator"), &matched);
  EXPECT_EQ(matched, std::vector<gfx::Range>({{6, 7}}));

  // Expected ranking, best first:
  //   1. prefix match;
  //   2. non-prefix word boundary match, even when an internal match occurs
  //      earlier in the haystack;
  //   3. internal match.
  EXPECT_GT(prefix_score, boundary_score);
  EXPECT_GT(boundary_score, internal_score);

  // A haystack that does not contain the needle scores 0 and yields no ranges.
  const double miss_score = FuzzyFind(
      base::ASCIIToUTF16("o"), base::ASCIIToUTF16("aquarium"), &matched);
  EXPECT_EQ(0, miss_score);
  EXPECT_TRUE(matched.empty());
}
TEST(CommanderFuzzyFinder, CaseInsensitive) {
std::vector<gfx::Range> ranges;
EXPECT_EQ(1, FuzzyFind(base::ASCIIToUTF16("orange"),
......@@ -38,12 +76,9 @@ TEST(CommanderFuzzyFinder, PrefixRanksHigherThanInternal) {
double prefix_rank = FuzzyFind(base::ASCIIToUTF16("orange"),
base::ASCIIToUTF16("Orange juice"), &ranges);
EXPECT_EQ(ranges, std::vector<gfx::Range>({{0, 6}}));
double non_prefix_rank =
FuzzyFind(base::ASCIIToUTF16("orange"),
base::ASCIIToUTF16("William of Orange"), &ranges);
EXPECT_EQ(ranges, std::vector<gfx::Range>({{11, 6}}));
EXPECT_GT(prefix_rank, 0);
EXPECT_GT(non_prefix_rank, 0);
......@@ -51,4 +86,20 @@ TEST(CommanderFuzzyFinder, PrefixRanksHigherThanInternal) {
EXPECT_LT(non_prefix_rank, 1);
EXPECT_GT(prefix_rank, non_prefix_rank);
}
// A needle that is longer than the haystack scores 0 and reports no ranges.
TEST(CommanderFuzzyFinder, NeedleLongerThanHaystack) {
  std::vector<gfx::Range> matched;
  const double score = FuzzyFind(base::ASCIIToUTF16("orange juice"),
                                 base::ASCIIToUTF16("orange"), &matched);
  EXPECT_EQ(0, score);
  EXPECT_TRUE(matched.empty());
}
// A needle whose characters are scattered across the haystack still matches,
// and each matched character is reported as its own one-element range.
TEST(CommanderFuzzyFinder, Noncontiguous) {
  std::vector<gfx::Range> matched;
  const double score =
      FuzzyFind(base::ASCIIToUTF16("tuot"),
                base::UTF8ToUTF16("Tlön, Uqbar, Orbis Tertius"), &matched);
  EXPECT_GT(score, 0);
  EXPECT_EQ(matched,
            std::vector<gfx::Range>({{0, 1}, {6, 7}, {13, 14}, {19, 20}}));
}
} // namespace commander
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment