Commit adf1c7d4 authored by Thanh Nguyen, committed by Commit Bot

Use PrefixMatcher class to calculate prefix match score

Previously, FuzzyTokenizedStringMatch used its own logic to calculate the
prefix match score. This CL uses the PrefixMatcher class to calculate the
prefix match score, for consistency with TokenizedStringMatch.

Bug: 1086841
Change-Id: I8fcf37b27e2345dca03498788303ac2287324997
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2237492
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Reviewed-by: Jia Meng <jiameng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#776813}
parent 549dacf7
......@@ -13,13 +13,12 @@
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/common/string_matching/prefix_matcher.h"
#include "chrome/common/string_matching/sequence_matcher.h"
namespace {
// Bounds used to initialize and clamp the match scores computed below.
constexpr double kMinScore = 0.0;
constexpr double kMaxScore = 1.0;
// Amount subtracted from kMaxScore by FirstCharacterMatch() for every text
// token visited beyond the length of the query.
constexpr double kFirstCharacterMatchPenalty = 0.2;
// Amount subtracted from the score by PrefixMatch() for every text token
// skipped between two consecutive matches.
constexpr double kPrefixMatchPenalty = 0.1;
// Returns sorted tokens from a TokenizedString.
std::vector<base::string16> ProcessAndSort(const TokenizedString& text) {
......@@ -35,57 +34,6 @@ std::vector<base::string16> ProcessAndSort(const TokenizedString& text) {
// Out-of-line special members; neither has any work of its own to do.
FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() = default;
FuzzyTokenizedStringMatch::FuzzyTokenizedStringMatch() = default;
double FuzzyTokenizedStringMatch::FirstCharacterMatch(
const TokenizedString& query,
const TokenizedString& text) {
const base::string16 query_lower = base::i18n::ToLower(query.text());
size_t query_index = 0;
for (size_t text_index = 0; text_index < text.tokens().size(); text_index++) {
if (query_index < query_lower.size() &&
text.tokens()[text_index][0] == query_lower[query_index]) {
query_index++;
if (query_index == query_lower.size()) {
// Penalizes the score using the number of text's tokens that are
// needed.
return std::max(kMinScore,
kMaxScore - kFirstCharacterMatchPenalty *
(text_index + 1 - query_lower.size()));
}
}
}
return kMinScore;
}
// Scores |query| against |text| by requiring every query token to be a prefix
// of some text token, with the matched text tokens appearing in the same order
// as the query tokens.
//
// Starts from kMaxScore and subtracts kPrefixMatchPenalty for each text token
// skipped between consecutive matches. Returns kMinScore as soon as a query
// token has no eligible match; the final score is clamped at kMinScore.
double FuzzyTokenizedStringMatch::PrefixMatch(const TokenizedString& query,
                                              const TokenizedString& text) {
  // Bind by reference: the token lists are only read, so copying them is
  // unnecessary.
  const auto& query_tokens = query.tokens();
  const auto& text_tokens = text.tokens();

  double match_score = kMaxScore;
  // Index of the first text token that is still eligible for matching. Kept
  // unsigned so no signed/unsigned conversions are needed in the arithmetic.
  size_t next_text_index = 0;

  for (const auto& query_token : query_tokens) {
    bool matched = false;
    for (size_t text_index = next_text_index; text_index < text_tokens.size();
         ++text_index) {
      const auto& text_token = text_tokens[text_index];
      // In-place prefix comparison; avoids building a substr() temporary.
      if (query_token.size() > text_token.size() ||
          text_token.compare(0, query_token.size(), query_token) != 0) {
        continue;
      }

      // Penalizes the score based on the number of skipped tokens.
      match_score -= kPrefixMatchPenalty * (text_index - next_text_index);
      next_text_index = text_index + 1;
      matched = true;
      break;
    }
    if (!matched)
      return kMinScore;
  }

  return std::max(kMinScore, match_score);
}
double FuzzyTokenizedStringMatch::TokenSetRatio(
const TokenizedString& query,
const TokenizedString& text,
......@@ -283,7 +231,9 @@ double FuzzyTokenizedStringMatch::WeightedRatio(
// Computes the prefix match score of |query| against |text| by delegating to
// the ::PrefixMatcher class and mapping its relevance r to 1 - 0.5^r. A
// relevance of 0 therefore yields exactly 0.0, and the score grows towards 1.0
// as the relevance increases (assuming relevance is non-negative - TODO
// confirm against ::PrefixMatcher).
double FuzzyTokenizedStringMatch::PrefixMatcher(const TokenizedString& query,
                                                const TokenizedString& text) {
  // The leading "::" selects the global-scope PrefixMatcher class rather than
  // this member function, which shares its name.
  ::PrefixMatcher match(query, text);
  match.Match();
  return 1.0 - std::pow(0.5, match.relevance());
}
bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,
......
......@@ -25,16 +25,6 @@ class FuzzyTokenizedStringMatch {
FuzzyTokenizedStringMatch();
~FuzzyTokenizedStringMatch();
// Checks whether the lower-cased |query| text can be spelled from the first
// characters of |text|'s tokens, taken in order, e.g. "coc" is a match of
// "Clash of Clan". Text tokens may be skipped at a cost to the score; the
// score is 0 when the query cannot be fully matched. Range of the score is
// [0, 1].
static double FirstCharacterMatch(const TokenizedString& query,
const TokenizedString& text);
// Checks whether every token of |query| is a prefix of a token of |text|,
// where the matched text tokens must appear in the same order as the query
// tokens. Each text token skipped between matches lowers the score; the score
// is 0 when any query token has no match. Range of score is [0, 1].
static double PrefixMatch(const TokenizedString& query,
const TokenizedString& text);
// TokenSetRatio takes two sets of tokens, finds their intersection and
// differences. From the intersection and differences, it rewrites the |query|
// and |text| and find the similarity ratio between them. This function
......@@ -75,7 +65,8 @@ class FuzzyTokenizedStringMatch {
bool use_edit_distance,
double num_matching_blocks_penalty);
// Since prefix match should always be favored over other matches, this
// function is dedicated to calculating a prefix match score in range of
// [0, 1] using the PrefixMatcher class.
// This score has two components: first character match and whole prefix
// match.
static double PrefixMatcher(const TokenizedString& query,
......
......@@ -165,57 +165,40 @@ TEST_F(FuzzyTokenizedStringMatchTest, WeightedRatio) {
}
}
// Verifies FirstCharacterMatch() against a fixed three-token text. Scores are
// computed with floating-point arithmetic, so they are compared with
// EXPECT_DOUBLE_EQ rather than exact equality.
TEST_F(FuzzyTokenizedStringMatchTest, FirstCharacterMatchTest) {
  const base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
  {
    // One query character per text token, in order: no penalty.
    const base::string16 query(base::UTF8ToUTF16("COC"));
    EXPECT_DOUBLE_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
                         TokenizedString(query), TokenizedString(text)),
                     1.0);
  }
  {
    // "of" has to be skipped, which costs a single penalty.
    const base::string16 query(base::UTF8ToUTF16("CC"));
    EXPECT_DOUBLE_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
                         TokenizedString(query), TokenizedString(text)),
                     0.8);
  }
  {
    // The query is matched as raw text, spaces included, so this query is
    // expected not to match at all.
    const base::string16 query(base::UTF8ToUTF16("C o C"));
    EXPECT_DOUBLE_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
                         TokenizedString(query), TokenizedString(text)),
                     0.0);
  }
}
// Verifies the scores produced by FuzzyTokenizedStringMatch::PrefixMatcher().
// Matching cases are checked with a 0.01 tolerance; the non-matching case is
// expected to score exactly 0.
TEST_F(FuzzyTokenizedStringMatchTest, PrefixMatcherTest) {
  {
    // Query is a prefix of the first text token.
    base::string16 query(base::UTF8ToUTF16("clas"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
                                                         TokenizedString(text)),
                0.94, 0.01);
  }
  {
    // Two full query tokens with one text token ("of") in between.
    base::string16 query(base::UTF8ToUTF16("clash clan"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
                                                         TokenizedString(text)),
                0.99, 0.01);
  }
  {
    // One single-character query token per text token.
    base::string16 query(base::UTF8ToUTF16("c o c"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
                                                         TokenizedString(text)),
                0.84, 0.01);
  }
  {
    // Unhyphenated query against hyphenated text.
    base::string16 query(base::UTF8ToUTF16("wifi"));
    base::string16 text(base::UTF8ToUTF16("wi-fi"));
    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
                                                         TokenizedString(text)),
                0.91, 0.01);
  }
  {
    // "clam" is not a prefix of any text token, so no score is awarded.
    base::string16 query(base::UTF8ToUTF16("clam"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
    EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
                                                       TokenizedString(text)),
              0.0);
  }
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment