Commit 2b965865 authored by Jia, committed by Commit Bot

[cros search service] Add a parameter to SequenceMatcher

This CL adds a penalty factor to SequenceMatcher so that we can
penalize matches made up of many short matching blocks.

This CL also updates the search parameters.
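
For illustration only (not part of this CL; BlockRatio and its arguments are
hypothetical), a minimal standalone sketch of the kind of penalty being added:
for the same number of matched characters, a score damped by the number of
matching blocks ranks a contiguous match above a fragmented one.

// Standalone C++ sketch. |num_blocks| counts non-empty matching blocks.
#include <cmath>

double BlockRatio(int matched_chars, int total_length, int num_blocks,
                  double penalty) {
  // Base ratio 2 * matches / (len(query) + len(text)), damped exponentially
  // as the match splinters into more blocks; a single block is not damped.
  return 2.0 * matched_chars / total_length *
         std::exp(-(num_blocks - 1) * penalty);
}

// BlockRatio(4, 12, 1, 0.1) ~= 0.67  (one block of length 4)
// BlockRatio(4, 12, 4, 0.1) ~= 0.49  (four blocks of length 1)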

Bug: 1081584,1090181,1090154,1090148
Change-Id: Idd35ddd7cfbcdb3928e4b966f2ec757d0b75296c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2227721
Commit-Queue: Jia Meng <jiameng@chromium.org>
Reviewed-by: Thanh Nguyen <thanhdng@chromium.org>
Reviewed-by: Kyle Horimoto <khorimoto@chromium.org>
Cr-Commit-Position: refs/heads/master@{#774930}
parent 43c8771f
@@ -74,7 +74,7 @@ bool IsItemRelevant(
   FuzzyTokenizedStringMatch match;
   if (match.IsRelevant(query, *tag, relevance_threshold, use_prefix_only,
                        use_weighted_ratio, use_edit_distance,
-                       partial_match_penalty_rate)) {
+                       partial_match_penalty_rate, 0.1)) {
     *relevance_score = match.relevance();
     for (const auto& hit : match.hits()) {
       local_search_service::Range range;
...
@@ -40,7 +40,7 @@ struct SearchParams {
   double partial_match_penalty_rate = 0.9;
   bool use_prefix_only = false;
   bool use_edit_distance = false;
-  bool split_search_tags = true;
+  bool split_search_tags = false;
 };

 // A numeric range used to represent the start and end position.
...
@@ -133,6 +133,9 @@ TEST_F(IndexTest, SearchTagSplit) {
   std::vector<Data> data = CreateTestData(data_to_register);
   EXPECT_EQ(data.size(), 2u);

+  SearchParams search_params;
+  search_params.split_search_tags = true;
+  index_.SetSearchParams(search_params);
   index_.AddOrUpdate(data);
   EXPECT_EQ(index_.GetSize(), 2u);
...
@@ -107,21 +107,21 @@ class SearchHandlerTest : public testing::Test {
 };

 TEST_F(SearchHandlerTest, AddAndRemove) {
-  // Add printing search tags to registry and search for "Printing".
+  // Add printing search tags to registry and search for "Print".
   search_tag_registry_.AddSearchTags(GetPrintingSearchConcepts());

   std::vector<mojom::SearchResultPtr> search_results;
-  // 2 results should be available for a "Printing" query.
+  // 3 results should be available for a "Print" query.
   mojom::SearchHandlerAsyncWaiter(handler_remote_.get())
-      .Search(base::ASCIIToUTF16("Printing"),
+      .Search(base::ASCIIToUTF16("Print"),
               /*max_num_results=*/3u,
               mojom::ParentResultBehavior::kDoNotIncludeParentResults,
               &search_results);
-  EXPECT_EQ(search_results.size(), 2u);
+  EXPECT_EQ(search_results.size(), 3u);

   // Limit results to 1 max and ensure that only 1 result is returned.
   mojom::SearchHandlerAsyncWaiter(handler_remote_.get())
-      .Search(base::ASCIIToUTF16("Printing"),
+      .Search(base::ASCIIToUTF16("Print"),
               /*max_num_results=*/1u,
               mojom::ParentResultBehavior::kDoNotIncludeParentResults,
               &search_results);
@@ -139,7 +139,7 @@ TEST_F(SearchHandlerTest, AddAndRemove) {
   // returned for "Printing".
   search_tag_registry_.RemoveSearchTags(GetPrintingSearchConcepts());
   mojom::SearchHandlerAsyncWaiter(handler_remote_.get())
-      .Search(base::ASCIIToUTF16("Printing"),
+      .Search(base::ASCIIToUTF16("Print"),
               /*max_num_results=*/3u,
               mojom::ParentResultBehavior::kDoNotIncludeParentResults,
               &search_results);
@@ -187,11 +187,11 @@ TEST_F(SearchHandlerTest, DefaultRank) {
   search_tag_registry_.AddSearchTags(GetPrintingSearchConcepts());

   std::vector<mojom::SearchResultPtr> search_results;
-  // Search for "Printing". Only the IDS_OS_SETTINGS_TAG_PRINTING result
+  // Search for "Print". Only the IDS_OS_SETTINGS_TAG_PRINTING result
   // contains the word "Printing", but the other results have the similar word
   // "Printer". Thus, "Printing" has a higher relevance score.
   mojom::SearchHandlerAsyncWaiter(handler_remote_.get())
-      .Search(base::ASCIIToUTF16("Printing"),
+      .Search(base::ASCIIToUTF16("Print"),
               /*max_num_results=*/3u,
               mojom::ParentResultBehavior::kAllowParentResults,
               &search_results);
...
@@ -91,7 +91,8 @@ double FuzzyTokenizedStringMatch::TokenSetRatio(
     const TokenizedString& text,
     bool partial,
     double partial_match_penalty_rate,
-    bool use_edit_distance) {
+    bool use_edit_distance,
+    double num_matching_blocks_penalty) {
   std::set<base::string16> query_token(query.tokens().begin(),
                                        query.tokens().end());
   std::set<base::string16> text_token(text.tokens().begin(),
@@ -127,21 +128,26 @@ double FuzzyTokenizedStringMatch::TokenSetRatio(
       base::JoinString(text_diff_query, base::UTF8ToUTF16(" "))});

   if (partial) {
-    return std::max(
-        {PartialRatio(intersection_string, query_rewritten,
-                      partial_match_penalty_rate, use_edit_distance),
-         PartialRatio(intersection_string, text_rewritten,
-                      partial_match_penalty_rate, use_edit_distance),
-         PartialRatio(query_rewritten, text_rewritten,
-                      partial_match_penalty_rate, use_edit_distance)});
+    return std::max({PartialRatio(intersection_string, query_rewritten,
+                                  partial_match_penalty_rate, use_edit_distance,
+                                  num_matching_blocks_penalty),
+                     PartialRatio(intersection_string, text_rewritten,
+                                  partial_match_penalty_rate, use_edit_distance,
+                                  num_matching_blocks_penalty),
+                     PartialRatio(query_rewritten, text_rewritten,
+                                  partial_match_penalty_rate, use_edit_distance,
+                                  num_matching_blocks_penalty)});
   }
   return std::max(
-      {SequenceMatcher(intersection_string, query_rewritten, use_edit_distance)
+      {SequenceMatcher(intersection_string, query_rewritten, use_edit_distance,
+                       num_matching_blocks_penalty)
           .Ratio(),
-       SequenceMatcher(intersection_string, text_rewritten, use_edit_distance)
+       SequenceMatcher(intersection_string, text_rewritten, use_edit_distance,
+                       num_matching_blocks_penalty)
           .Ratio(),
-       SequenceMatcher(query_rewritten, text_rewritten, use_edit_distance)
+       SequenceMatcher(query_rewritten, text_rewritten, use_edit_distance,
+                       num_matching_blocks_penalty)
           .Ratio()});
 }
@@ -150,7 +156,8 @@ double FuzzyTokenizedStringMatch::TokenSortRatio(
     const TokenizedString& text,
     bool partial,
     double partial_match_penalty_rate,
-    bool use_edit_distance) {
+    bool use_edit_distance,
+    double num_matching_blocks_penalty) {
   const base::string16 query_sorted =
       base::JoinString(ProcessAndSort(query), base::UTF8ToUTF16(" "));
   const base::string16 text_sorted =
@@ -158,16 +165,19 @@ double FuzzyTokenizedStringMatch::TokenSortRatio(
   if (partial) {
     return PartialRatio(query_sorted, text_sorted, partial_match_penalty_rate,
-                        use_edit_distance);
+                        use_edit_distance, num_matching_blocks_penalty);
   }
-  return SequenceMatcher(query_sorted, text_sorted, use_edit_distance).Ratio();
+  return SequenceMatcher(query_sorted, text_sorted, use_edit_distance,
+                         num_matching_blocks_penalty)
+      .Ratio();
 }

 double FuzzyTokenizedStringMatch::PartialRatio(
     const base::string16& query,
     const base::string16& text,
     double partial_match_penalty_rate,
-    bool use_edit_distance) {
+    bool use_edit_distance,
+    double num_matching_blocks_penalty) {
   if (query.empty() || text.empty()) {
     return kMinScore;
   }
@@ -180,7 +190,9 @@ double FuzzyTokenizedStringMatch::PartialRatio(
   }
   const auto matching_blocks =
-      SequenceMatcher(shorter, longer, use_edit_distance).GetMatchingBlocks();
+      SequenceMatcher(shorter, longer, use_edit_distance,
+                      num_matching_blocks_penalty)
+          .GetMatchingBlocks();
   double partial_ratio = 0;

   for (const auto& block : matching_blocks) {
@@ -203,7 +215,7 @@ double FuzzyTokenizedStringMatch::PartialRatio(
     partial_ratio = std::max(
         partial_ratio,
         SequenceMatcher(shorter, longer.substr(long_start, shorter.size()),
-                        use_edit_distance)
+                        use_edit_distance, num_matching_blocks_penalty)
                 .Ratio() *
             penalty);
@@ -218,7 +230,8 @@ double FuzzyTokenizedStringMatch::WeightedRatio(
     const TokenizedString& query,
     const TokenizedString& text,
     double partial_match_penalty_rate,
-    bool use_edit_distance) {
+    bool use_edit_distance,
+    double num_matching_blocks_penalty) {
   const double unbase_scale = 0.95;
   // Since query.text() and text.text() is not normalized, we use query.tokens()
   // and text.tokens() instead.
@@ -227,7 +240,8 @@ double FuzzyTokenizedStringMatch::WeightedRatio(
   const base::string16 text_normalized(
       base::JoinString(text.tokens(), base::UTF8ToUTF16(" ")));
   double weighted_ratio =
-      SequenceMatcher(query_normalized, text_normalized, use_edit_distance)
+      SequenceMatcher(query_normalized, text_normalized, use_edit_distance,
+                      num_matching_blocks_penalty)
           .Ratio();
   const double length_ratio =
       static_cast<double>(
@@ -245,22 +259,24 @@ double FuzzyTokenizedStringMatch::WeightedRatio(
     weighted_ratio =
         std::max(weighted_ratio,
                  PartialRatio(query_normalized, text_normalized,
-                              partial_match_penalty_rate, use_edit_distance) *
+                              partial_match_penalty_rate, use_edit_distance,
+                              num_matching_blocks_penalty) *
                      partial_scale);
   }
   weighted_ratio =
       std::max(weighted_ratio,
                TokenSortRatio(query, text, use_partial /*partial*/,
-                              partial_match_penalty_rate, use_edit_distance) *
+                              partial_match_penalty_rate, use_edit_distance,
+                              num_matching_blocks_penalty) *
                    unbase_scale * partial_scale);

   // Do not use partial match for token set because the match between the
   // intersection string and query/text rewrites will always return an extremely
   // high value.
-  weighted_ratio =
-      std::max(weighted_ratio,
-               TokenSetRatio(query, text, false /*partial*/,
-                             partial_match_penalty_rate, use_edit_distance) *
-                   unbase_scale * partial_scale);
+  weighted_ratio = std::max(
+      weighted_ratio,
+      TokenSetRatio(query, text, false /*partial*/, partial_match_penalty_rate,
+                    use_edit_distance, num_matching_blocks_penalty) *
+          unbase_scale * partial_scale);
   return weighted_ratio;
 }
@@ -276,7 +292,8 @@ bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,
                                            bool use_prefix_only,
                                            bool use_weighted_ratio,
                                            bool use_edit_distance,
-                                           double partial_match_penalty_rate) {
+                                           double partial_match_penalty_rate,
+                                           double num_matching_blocks_penalty) {
   // If there is an exact match, relevance will be 1.0 and there is only 1 hit
   // that is the entire text/query.
   const auto& query_text = query.text();
@@ -292,7 +309,8 @@ bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,
   // Find |hits_| using SequenceMatcher on original query and text.
   for (const auto& match :
-       SequenceMatcher(query_text, text_text, use_edit_distance)
+       SequenceMatcher(query_text, text_text, use_edit_distance,
+                       num_matching_blocks_penalty)
            .GetMatchingBlocks()) {
     if (match.length > 0) {
       hits_.push_back(gfx::Range(match.pos_second_string,
@@ -317,15 +335,17 @@ bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,
   if (use_weighted_ratio) {
     // If WeightedRatio is used, |relevance_| is the average of WeightedRatio
     // and PrefixMatcher scores.
-    relevance_ = (WeightedRatio(query, text, partial_match_penalty_rate,
-                                use_edit_distance) +
-                  prefix_score) /
-                 2;
+    relevance_ =
+        (WeightedRatio(query, text, partial_match_penalty_rate,
+                       use_edit_distance, num_matching_blocks_penalty) +
+         prefix_score) /
+        2;
   } else {
     // Use simple algorithm to calculate match ratio.
     relevance_ =
         (SequenceMatcher(base::i18n::ToLower(query_text),
-                         base::i18n::ToLower(text_text), use_edit_distance)
+                         base::i18n::ToLower(text_text), use_edit_distance,
+                         num_matching_blocks_penalty)
             .Ratio() +
          prefix_score) /
         2;
...
@@ -44,7 +44,8 @@ class FuzzyTokenizedStringMatch {
                               const TokenizedString& text,
                               bool partial,
                               double partial_match_penalty_rate,
-                              bool use_edit_distance);
+                              bool use_edit_distance,
+                              double num_matching_blocks_penalty);

   // TokenSortRatio takes two set of tokens, sorts them and find the similarity
   // between two sorted strings. This function assumes that TokenizedString is
@@ -53,7 +54,8 @@ class FuzzyTokenizedStringMatch {
                                const TokenizedString& text,
                                bool partial,
                                double partial_match_penalty_rate,
-                               bool use_edit_distance);
+                               bool use_edit_distance,
+                               double num_matching_blocks_penalty);

   // Finds the best ratio of shorter text with a part of longer text.
   // This function assumes that TokenizedString is already normalized (converted
@@ -61,7 +63,8 @@ class FuzzyTokenizedStringMatch {
   static double PartialRatio(const base::string16& query,
                              const base::string16& text,
                              double partial_match_penalty_rate,
-                             bool use_edit_distance);
+                             bool use_edit_distance,
+                             double num_matching_blocks_penalty);

   // Combines scores from different ratio functions. This function assumes that
   // TokenizedString is already normalized (converted to lower cases).
@@ -69,7 +72,8 @@ class FuzzyTokenizedStringMatch {
   static double WeightedRatio(const TokenizedString& query,
                               const TokenizedString& text,
                               double partial_match_penalty_rate,
-                              bool use_edit_distance);
+                              bool use_edit_distance,
+                              double num_matching_blocks_penalty);

   // Since prefix match should always be favored over other matches, this
   // function is dedicated to calculate a prefix match score in range of [0, 1].
   // This score has two components: first character match and whole prefix
@@ -85,7 +89,8 @@ class FuzzyTokenizedStringMatch {
                   bool use_prefix_only,
                   bool use_weighted_ratio,
                   bool use_edit_distance,
-                  double partial_match_penalty_rate);
+                  double partial_match_penalty_rate,
+                  double num_matching_blocks_penalty = 0.0);

   double relevance() const { return relevance_; }
   const Hits& hits() const { return hits_; }
...
@@ -28,9 +28,11 @@ SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
 SequenceMatcher::SequenceMatcher(const base::string16& first_string,
                                  const base::string16& second_string,
-                                 bool use_edit_distance)
+                                 bool use_edit_distance,
+                                 double num_matching_blocks_penalty)
     : first_string_(first_string),
       second_string_(second_string),
+      num_matching_blocks_penalty_(num_matching_blocks_penalty),
       dp_common_string_(second_string.size() + 1, 0) {
   DCHECK(!first_string_.empty() || !second_string_.empty());
@@ -189,10 +191,15 @@ double SequenceMatcher::Ratio() {
     int sum_match = 0;
     const int sum_length = first_string_.size() + second_string_.size();
     DCHECK_NE(sum_length, 0);
+    const int num_blocks = GetMatchingBlocks().size();
     for (const auto& match : GetMatchingBlocks()) {
       sum_match += match.length;
     }
-    block_matching_ratio_ = 2.0 * sum_match / sum_length;
+    // Subtract two because the last one is always an "empty block". Hence
+    // actual number of matching blocks is |num_blocks - 1|.
+    block_matching_ratio_ =
+        2.0 * sum_match / sum_length *
+        exp(-(num_blocks - 2) * num_matching_blocks_penalty_);
   }
   return block_matching_ratio_;
 }
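
As a cross-check (standalone sketch, not Chromium code), the expression above
reproduces the expectations in the new unit tests further down. Note that
GetMatchingBlocks() always appends a trailing empty block, hence the
(num_blocks - 2) exponent.

#include <cmath>
#include <cstdio>

int main() {
  // "word" vs. "hello world": blocks "wor" + "d" + empty sentinel -> 3 blocks,
  // sum_match = 4, sum_length = 4 + 11 = 15.
  std::printf("%.4f\n", 2.0 * 4 / 15 * std::exp(-(3 - 2) * 0.0));  // 0.5333
  std::printf("%.4f\n", 2.0 * 4 / 15 * std::exp(-(3 - 2) * 0.1));  // 0.4826
  // "worl" vs. "hello world": a single matching block is never penalized.
  std::printf("%.4f\n", 2.0 * 4 / 15 * std::exp(-(2 - 2) * 0.1));  // 0.5333
  return 0;
}
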
@@ -30,9 +30,16 @@ class SequenceMatcher {
     // Length of the common substring.
     int length;
   };

+  // |num_matching_blocks_penalty| is used to penalize too many small matching
+  // blocks. For the same number of matching characters, we prefer fewer
+  // matching blocks. Value equal to 0 means no penalty. Values greater than 0
+  // means heavier penalty will be applied to larger number of blocks. This is
+  // only appled if |use_edit_distance| is false.
   SequenceMatcher(const base::string16& first_string,
                   const base::string16& second_string,
-                  bool use_edit_distance);
+                  bool use_edit_distance,
+                  double num_matching_blocks_penalty);

   ~SequenceMatcher() = default;
@@ -59,6 +66,7 @@ class SequenceMatcher {
  private:
   base::string16 first_string_;
   base::string16 second_string_;
+  double num_matching_blocks_penalty_ = 0.0;
   double edit_distance_ratio_ = -1.0;
   double block_matching_ratio_ = -1.0;
   std::vector<Match> matching_blocks_;
...
@@ -22,46 +22,50 @@ class SequenceMatcherTest : public testing::Test {};
 TEST_F(SequenceMatcherTest, TestEditDistance) {
   // Transposition
-  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"),
-                            base::UTF8ToUTF16("abdc"), kDefaultUseEditDistance)
-                .EditDistance(),
-            1);
+  ASSERT_EQ(
+      SequenceMatcher(base::UTF8ToUTF16("abcd"), base::UTF8ToUTF16("abdc"),
+                      kDefaultUseEditDistance, 0.0)
+          .EditDistance(),
+      1);

   // Deletion
-  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcde"),
-                            base::UTF8ToUTF16("abcd"), kDefaultUseEditDistance)
-                .EditDistance(),
-            1);
+  ASSERT_EQ(
+      SequenceMatcher(base::UTF8ToUTF16("abcde"), base::UTF8ToUTF16("abcd"),
+                      kDefaultUseEditDistance, 0.0)
+          .EditDistance(),
+      1);
   ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("12"), base::UTF8ToUTF16(""),
-                            kDefaultUseEditDistance)
+                            kDefaultUseEditDistance, 0.0)
                 .EditDistance(),
             2);

   // Insertion
-  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abc"),
-                            base::UTF8ToUTF16("abxbc"), kDefaultUseEditDistance)
-                .EditDistance(),
-            2);
+  ASSERT_EQ(
+      SequenceMatcher(base::UTF8ToUTF16("abc"), base::UTF8ToUTF16("abxbc"),
+                      kDefaultUseEditDistance, 0.0)
+          .EditDistance(),
+      2);
   ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abxbc"),
-                            kDefaultUseEditDistance)
+                            kDefaultUseEditDistance, 0.0)
                 .EditDistance(),
             5);

   // Substitution
-  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("book"),
-                            base::UTF8ToUTF16("back"), kDefaultUseEditDistance)
-                .EditDistance(),
-            2);
+  ASSERT_EQ(
+      SequenceMatcher(base::UTF8ToUTF16("book"), base::UTF8ToUTF16("back"),
+                      kDefaultUseEditDistance, 0.0)
+          .EditDistance(),
+      2);

   // Combination
-  ASSERT_EQ(
-      SequenceMatcher(base::UTF8ToUTF16("caclulation"),
-                      base::UTF8ToUTF16("calculator"), kDefaultUseEditDistance)
-          .EditDistance(),
-      3);
-  ASSERT_EQ(
-      SequenceMatcher(base::UTF8ToUTF16("sunday"),
-                      base::UTF8ToUTF16("saturday"), kDefaultUseEditDistance)
-          .EditDistance(),
-      3);
+  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("caclulation"),
+                            base::UTF8ToUTF16("calculator"),
+                            kDefaultUseEditDistance, 0.0)
+                .EditDistance(),
+            3);
+  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("sunday"),
+                            base::UTF8ToUTF16("saturday"),
+                            kDefaultUseEditDistance, 0.0)
+                .EditDistance(),
+            3);
 }
@@ -69,7 +73,7 @@ TEST_F(SequenceMatcherTest, TestEditDistance) {
 TEST_F(SequenceMatcherTest, TestFindLongestMatch) {
   SequenceMatcher sequence_match(base::UTF8ToUTF16("miscellanious"),
                                  base::UTF8ToUTF16("miscellaneous"),
-                                 kDefaultUseEditDistance);
+                                 kDefaultUseEditDistance, 0.0);
   ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(0, 13, 0, 13),
                          Match(0, 0, 9)));
   ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(7, 13, 7, 13),
@@ -77,17 +81,17 @@ TEST_F(SequenceMatcherTest, TestFindLongestMatch) {
   ASSERT_TRUE(MatchEqual(
       SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abcd"),
-                      kDefaultUseEditDistance)
+                      kDefaultUseEditDistance, 0.0)
           .FindLongestMatch(0, 0, 0, 4),
       Match(0, 0, 0)));
-  ASSERT_TRUE(MatchEqual(
-      SequenceMatcher(base::UTF8ToUTF16("abababbababa"),
-                      base::UTF8ToUTF16("ababbaba"), kDefaultUseEditDistance)
-          .FindLongestMatch(0, 12, 0, 8),
-      Match(2, 0, 8)));
+  ASSERT_TRUE(MatchEqual(SequenceMatcher(base::UTF8ToUTF16("abababbababa"),
+                                         base::UTF8ToUTF16("ababbaba"),
+                                         kDefaultUseEditDistance, 0.0)
+                             .FindLongestMatch(0, 12, 0, 8),
+                         Match(2, 0, 8)));
   ASSERT_TRUE(MatchEqual(
       SequenceMatcher(base::UTF8ToUTF16("aaaaaa"), base::UTF8ToUTF16("aaaaa"),
-                      kDefaultUseEditDistance)
+                      kDefaultUseEditDistance, 0.0)
           .FindLongestMatch(0, 6, 0, 5),
       Match(0, 0, 5)));
 }
@@ -96,7 +100,7 @@ TEST_F(SequenceMatcherTest, TestGetMatchingBlocks) {
   SequenceMatcher sequence_match(
       base::UTF8ToUTF16("This is a demo sentence!!!"),
       base::UTF8ToUTF16("This demo sentence is good!!!"),
-      kDefaultUseEditDistance);
+      kDefaultUseEditDistance, 0.0);
   const std::vector<Match> true_matches = {Match(0, 0, 4), Match(9, 4, 14),
                                            Match(23, 26, 3), Match(26, 29, 0)};
   const std::vector<Match> matches = sequence_match.GetMatchingBlocks();
@@ -107,30 +111,76 @@ TEST_F(SequenceMatcherTest, TestGetMatchingBlocks) {
 }

 TEST_F(SequenceMatcherTest, TestSequenceMatcherRatio) {
-  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"),
-                            base::UTF8ToUTF16("adbc"), kDefaultUseEditDistance)
-                .Ratio(),
-            0.75);
-  ASSERT_EQ(
-      SequenceMatcher(base::UTF8ToUTF16("white cats"),
-                      base::UTF8ToUTF16("cats white"), kDefaultUseEditDistance)
-          .Ratio(),
-      0.5);
+  ASSERT_EQ(
+      SequenceMatcher(base::UTF8ToUTF16("abcd"), base::UTF8ToUTF16("adbc"),
+                      kDefaultUseEditDistance, 0.0)
+          .Ratio(),
+      0.75);
+  ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("white cats"),
+                            base::UTF8ToUTF16("cats white"),
+                            kDefaultUseEditDistance, 0.0)
+                .Ratio(),
+            0.5);
 }

+TEST_F(SequenceMatcherTest, TestSequenceMatcherRatioWithoutPenalty) {
+  // Two matching blocks, total matching blocks length is 4.
+  EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("word"),
+                              base::UTF8ToUTF16("hello world"),
+                              kDefaultUseEditDistance, 0.0)
+                  .Ratio(),
+              0.533, 0.001);
+  // One matching block, length is 4.
+  EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("worl"),
+                              base::UTF8ToUTF16("hello world"),
+                              kDefaultUseEditDistance, 0.0)
+                  .Ratio(),
+              0.533, 0.001);
+  // No matching block at all.
+  EXPECT_NEAR(
+      SequenceMatcher(base::UTF8ToUTF16("abcd"), base::UTF8ToUTF16("xyz"),
+                      kDefaultUseEditDistance, 0.0)
+          .Ratio(),
+      0.0, 0.001);
+}
+
+TEST_F(SequenceMatcherTest, TestSequenceMatcherRatioWithPenalty) {
+  // Two matching blocks, total matching blocks length is 4.
+  EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("word"),
+                              base::UTF8ToUTF16("hello world"),
+                              kDefaultUseEditDistance, 0.1)
+                  .Ratio(),
+              0.4825, 0.0001);
+  // One matching block, length is 4.
+  EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("worl"),
+                              base::UTF8ToUTF16("hello world"),
+                              kDefaultUseEditDistance, 0.1)
+                  .Ratio(),
+              0.533, 0.001);
+  // No matching block at all.
+  EXPECT_NEAR(
+      SequenceMatcher(base::UTF8ToUTF16("abcd"), base::UTF8ToUTF16("xyz"),
+                      kDefaultUseEditDistance, 0.1)
+          .Ratio(),
+      0.0, 0.001);
+}
+
 TEST_F(SequenceMatcherTest, TestEditDistanceRatio) {
   ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("abcd"),
-                            base::UTF8ToUTF16("adbc"), true)
+                            base::UTF8ToUTF16("adbc"), true, 0.0)
                 .Ratio(),
             0.5);
   EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("white cats"),
-                              base::UTF8ToUTF16("cats white"), true)
+                              base::UTF8ToUTF16("cats white"), true, 0.0)
                   .Ratio(),
               0.2, 0.01);

   // Totally different
   EXPECT_NEAR(SequenceMatcher(base::UTF8ToUTF16("dog"),
-                              base::UTF8ToUTF16("elphant"), true)
+                              base::UTF8ToUTF16("elphant"), true, 0.0)
                   .Ratio(),
               0.0, 0.01);
 }
...