Commit 1681a14b authored by Thanh Nguyen, committed by Commit Bot

[cros-fuzzy-app] Add more support functions for FuzzyTokenizedStringMatch

This CL adds more support functions for FuzzyTokenizedStringMatch and
finalizes the implementation of WeightedRatio.

Bug: 990684
Change-Id: I4d9632d42f391085a654d92ec450a51c31db5b26
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1807638
Reviewed-by: Jia Meng <jiameng@chromium.org>
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#699824}
parent 4bef3139
......@@ -3,6 +3,12 @@
// found in the LICENSE file.
#include "chrome/browser/ui/app_list/search/search_utils/fuzzy_tokenized_string_match.h"
#include <algorithm>
#include <iterator>
#include "base/strings/strcat.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/ui/app_list/search/search_utils/sequence_matcher.h"
......@@ -11,83 +17,156 @@ namespace app_list {
namespace {
const double kRelevanceThreshold = 0.6;
double PartialRatio(const TokenizedString& query,
const TokenizedString& text,
SequenceMatcher& sequence_matcher) {
// TODO(crbug.com/990684): implement the logic of this function.
return 0.0;
// Returns sorted tokens from a TokenizedString.
std::vector<base::string16> ProcessAndSort(const TokenizedString& text) {
std::vector<base::string16> result;
for (const auto& token : text.tokens()) {
result.emplace_back(token);
}
std::sort(result.begin(), result.end());
return result;
}
} // namespace
// Placeholder for the partial token-set ratio. It ignores all of its arguments
// and always returns 0.0 until the logic tracked by the TODO below exists.
double PartialTokenSetRatio(const TokenizedString& query,
const TokenizedString& text,
SequenceMatcher& sequence_matcher) {
// TODO(crbug.com/990684): implement the logic of this function.
return 0.0;
}
// Nothing to release explicitly; members clean up after themselves.
FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() = default;
// No work is done at construction; members keep their in-class initial values.
FuzzyTokenizedStringMatch::FuzzyTokenizedStringMatch() = default;
// Scores |query| against |text| using their sets of distinct tokens. Each side
// is rewritten as the shared tokens followed by the tokens unique to that
// side, and the result is the best of three pairwise comparisons between the
// shared-token string and the two rewritten strings. When |partial| is true
// the comparisons use PartialRatio(); otherwise they use
// SequenceMatcher::Ratio() with edit distance disabled.
double FuzzyTokenizedStringMatch::TokenSetRatio(const TokenizedString& query,
const TokenizedString& text,
bool partial) {
// std::set removes duplicate tokens and keeps them sorted, which the set
// algorithms below require of their input ranges.
std::set<base::string16> query_token(query.tokens().begin(),
query.tokens().end());
std::set<base::string16> text_token(text.tokens().begin(),
text.tokens().end());
std::vector<base::string16> intersection;
std::vector<base::string16> query_diff_text;
std::vector<base::string16> text_diff_query;
// Find the intersection and the differences between the two sets of tokens.
std::set_intersection(query_token.begin(), query_token.end(),
text_token.begin(), text_token.end(),
std::back_inserter(intersection));
std::set_difference(query_token.begin(), query_token.end(),
text_token.begin(), text_token.end(),
std::back_inserter(query_diff_text));
std::set_difference(text_token.begin(), text_token.end(), query_token.begin(),
query_token.end(), std::back_inserter(text_diff_query));
const base::string16 intersection_string =
base::JoinString(intersection, base::UTF8ToUTF16(" "));
// Rewritten form: "<shared tokens> <tokens only on this side>", or just the
// side-specific tokens when nothing is shared.
// NOTE(review): when the intersection is non-empty but a difference is empty,
// the concatenation below presumably leaves a trailing " " on the rewritten
// string, which would lower its ratio against |intersection_string| — confirm
// whether that is intended.
const base::string16 query_rewritten =
intersection.empty()
? base::JoinString(query_diff_text, base::UTF8ToUTF16(" "))
: base::StrCat(
{intersection_string, base::UTF8ToUTF16(" "),
base::JoinString(query_diff_text, base::UTF8ToUTF16(" "))});
const base::string16 text_rewritten =
intersection.empty()
? base::JoinString(text_diff_query, base::UTF8ToUTF16(" "))
: base::StrCat(
{intersection_string, base::UTF8ToUTF16(" "),
base::JoinString(text_diff_query, base::UTF8ToUTF16(" "))});
// Partial mode: take the best PartialRatio() over the three pairings.
if (partial) {
return std::max({PartialRatio(intersection_string, query_rewritten),
PartialRatio(intersection_string, text_rewritten),
PartialRatio(query_rewritten, text_rewritten)});
}
double PartialTokenSortRatio(const TokenizedString& query,
const TokenizedString& text,
SequenceMatcher& sequence_matcher) {
// TODO(crbug.com/990684): implement the logic of this function.
return 0.0;
// Full mode: take the best whole-string ratio over the same three pairings.
return std::max({SequenceMatcher(intersection_string, query_rewritten)
.Ratio(false /*use_edit_distance*/),
SequenceMatcher(intersection_string, text_rewritten)
.Ratio(false /*use_edit_distance*/),
SequenceMatcher(query_rewritten, text_rewritten)
.Ratio(false /*use_edit_distance*/)});
}
double TokenSortRatio(const TokenizedString& query,
const TokenizedString& text,
SequenceMatcher& sequence_matcher) {
// TODO(crbug.com/990684): implement the logic of this function.
return 0.0;
double FuzzyTokenizedStringMatch::TokenSortRatio(const TokenizedString& query,
const TokenizedString& text,
bool partial) {
const base::string16 query_sorted =
base::JoinString(ProcessAndSort(query), base::UTF8ToUTF16(" "));
const base::string16 text_sorted =
base::JoinString(ProcessAndSort(text), base::UTF8ToUTF16(" "));
if (partial) {
return PartialRatio(query_sorted, text_sorted);
}
return SequenceMatcher(query_sorted, text_sorted)
.Ratio(false /*use_edit_distance*/);
}
double TokenSetRatio(const TokenizedString& query,
const TokenizedString& text,
SequenceMatcher& sequence_matcher) {
// TODO(crbug.com/990684): implement the logic of this function.
return 0.0;
// Returns the best similarity ratio between the shorter of |query| and |text|
// and a window of the longer one that is at most as long as the shorter
// string. Candidate windows are aligned on the matching blocks reported by
// SequenceMatcher. Returns 0 when either input is empty, and exactly 1 as soon
// as any window scores above 0.995.
double FuzzyTokenizedStringMatch::PartialRatio(const base::string16& query,
                                               const base::string16& text) {
  if (query.empty() || text.empty()) {
    return 0.0;
  }

  // Bind references instead of copying the inputs. On equal lengths |query| is
  // treated as the shorter string.
  const bool query_is_shorter = query.size() <= text.size();
  const base::string16& shorter = query_is_shorter ? query : text;
  const base::string16& longer = query_is_shorter ? text : query;

  const auto matching_blocks =
      SequenceMatcher(shorter, longer).GetMatchingBlocks();
  double partial_ratio = 0;

  for (const auto& block : matching_blocks) {
    // Shift the window so that the block lines up in both strings, clamping at
    // the start of |longer|.
    const int long_start =
        block.pos_second_string > block.pos_first_string
            ? block.pos_second_string - block.pos_first_string
            : 0;

    // TODO(crbug/990684): currently this part re-calculates the ratio for
    // every pair. Improve this to reduce latency.
    partial_ratio = std::max(
        partial_ratio,
        SequenceMatcher(shorter, longer.substr(long_start, shorter.size()))
            .Ratio(false /*use_edit_distance*/));

    // Close enough to a perfect match: stop early and report a full score.
    if (partial_ratio > 0.995) {
      return 1;
    }
  }
  return partial_ratio;
}
// Combines several ratio functions into one score. It starts from the plain
// SequenceMatcher ratio of the two texts and takes the maximum with the
// token-sort and token-set ratios, each scaled by |unbase_scale|. When the
// longer text is at least 1.5 times the length of the shorter one, the partial
// variants are used and additionally scaled by |partial_scale|.
double WeightedRatio(const TokenizedString& query,
const TokenizedString& text,
SequenceMatcher& sequence_matcher) {
double FuzzyTokenizedStringMatch::WeightedRatio(const TokenizedString& query,
const TokenizedString& text) {
// Scale applied to the token-based ratios.
const double unbase_scale = 0.95;
double weighted_ratio = sequence_matcher.Ratio();
double weighted_ratio = SequenceMatcher(query.text(), text.text())
.Ratio(false /*use_edit_distance*/);
// Length of the longer text divided by the length of the shorter one.
// NOTE(review): if either text is empty the divisor is zero, so this
// presumably evaluates to infinity or NaN — confirm callers never pass an
// empty text, or guard against it.
const double length_ratio =
static_cast<double>(std::max(query.text().size(), text.text().size())) /
std::min(query.text().size(), text.text().size());
// Use partial if two strings are quite different in sizes.
if (length_ratio >= 1.5) {
const bool use_partial = length_ratio >= 1.5;
double partial_scale = 1;
if (use_partial) {
// If one string is much much shorter than the other, set |partial_scale| to
// be 0.6, otherwise set it to be 0.9.
const double partial_scale = length_ratio > 8 ? 0.6 : 0.9;
partial_scale = length_ratio > 8 ? 0.6 : 0.9;
weighted_ratio =
std::max(weighted_ratio,
PartialRatio(query, text, sequence_matcher) * partial_scale);
weighted_ratio = std::max(
weighted_ratio, PartialTokenSortRatio(query, text, sequence_matcher) *
unbase_scale * partial_scale);
weighted_ratio = std::max(
weighted_ratio, PartialTokenSetRatio(query, text, sequence_matcher) *
unbase_scale * partial_scale);
return weighted_ratio;
PartialRatio(query.text(), text.text()) * partial_scale);
}
// If strings are similar length, don't use partial.
weighted_ratio =
std::max(weighted_ratio,
TokenSortRatio(query, text, sequence_matcher) * unbase_scale);
weighted_ratio =
std::max(weighted_ratio,
TokenSetRatio(query, text, sequence_matcher) * unbase_scale);
// Token-based ratios; |partial_scale| stays at 1 when partial is not used.
weighted_ratio = std::max(
weighted_ratio, TokenSortRatio(query, text, /*partial=*/use_partial) *
unbase_scale * partial_scale);
weighted_ratio = std::max(
weighted_ratio, TokenSetRatio(query, text, /*partial=*/use_partial) *
unbase_scale * partial_scale);
return weighted_ratio;
}
} // namespace
// Scores |text| against |query| with WeightedRatio(), stores the score in
// |relevance_|, and returns true when that score is strictly greater than
// kRelevanceThreshold.
bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,
const TokenizedString& text) {
// TODO(crbug.com/990684): add prefix matching logic.
SequenceMatcher sequence_matcher(base::UTF16ToUTF8(query.text()),
base::UTF16ToUTF8(text.text()));
relevance_ = WeightedRatio(query, text, sequence_matcher);
relevance_ = WeightedRatio(query, text);
return relevance_ > kRelevanceThreshold;
}
......
......@@ -6,6 +6,7 @@
#define CHROME_BROWSER_UI_APP_LIST_SEARCH_SEARCH_UTILS_FUZZY_TOKENIZED_STRING_MATCH_H_
#include "ash/public/cpp/app_list/tokenized_string.h"
#include "base/gtest_prod_util.h"
#include "base/macros.h"
#include "ui/gfx/range/range.h"
......@@ -32,6 +33,33 @@ class FuzzyTokenizedStringMatch {
// Returns the stored relevance score (|relevance_|).
double relevance() const { return relevance_; }
private:
FRIEND_TEST_ALL_PREFIXES(FuzzyTokenizedStringMatchTest, PartialRatioTest);
FRIEND_TEST_ALL_PREFIXES(FuzzyTokenizedStringMatchTest, TokenSetRatioTest);
FRIEND_TEST_ALL_PREFIXES(FuzzyTokenizedStringMatchTest, TokenSortRatioTest);
FRIEND_TEST_ALL_PREFIXES(FuzzyTokenizedStringMatchTest, WeightedRatio);
// Finds the best ratio of shorter text with a part of longer text.
// This function assumes that TokenizedString is already normalized (converted
// to lower case). The return score is in range of [0, 1].
double PartialRatio(const base::string16& query, const base::string16& text);
// TokenSetRatio takes two sets of tokens and finds their intersection and
// differences. From the intersection and differences, it rewrites the |query|
// and |text| and finds the similarity ratio between them. This function
// assumes that TokenizedString is already normalized (converted to lower
// case). Duplicate tokens will be removed for ratio computation.
double TokenSetRatio(const TokenizedString& query,
const TokenizedString& text,
bool partial);
// TokenSortRatio takes two sets of tokens, sorts them, and finds the
// similarity between the two sorted strings. This function assumes that
// TokenizedString is already normalized (converted to lower case).
double TokenSortRatio(const TokenizedString& query,
const TokenizedString& text,
bool partial);
// Combines scores from different ratio functions. This function assumes that
// TokenizedString is already normalized (converted to lower case).
// The returned score is in the range [0, 1].
double WeightedRatio(const TokenizedString& query,
const TokenizedString& text);
// Score in range of [0,1] representing how well the query matches the text.
double relevance_ = 0;
Hits hits_;
......
// Copyright (c) 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/ui/app_list/search/search_utils/fuzzy_tokenized_string_match.h"
#include "ash/public/cpp/app_list/tokenized_string.h"
#include "base/macros.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/ui/app_list/search/search_utils/sequence_matcher.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace app_list {
// Test fixture for FuzzyTokenizedStringMatch; adds no state or set-up of its
// own beyond testing::Test.
class FuzzyTokenizedStringMatchTest : public testing::Test {};
// Checks PartialRatio() against known scores. Scores that are expected to come
// out as an exact value are compared with EXPECT_DOUBLE_EQ rather than
// operator==, so the test does not depend on bit-exact floating-point results;
// the remaining scores use an explicit 0.01 tolerance.
TEST_F(FuzzyTokenizedStringMatchTest, PartialRatioTest) {
  FuzzyTokenizedStringMatch match;

  EXPECT_DOUBLE_EQ(match.PartialRatio(base::UTF8ToUTF16("abcde"),
                                      base::UTF8ToUTF16("ababcXXXbcdeY")),
                   0.8);
  EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("big string"),
                                 base::UTF8ToUTF16("strength")),
              0.71, 0.01);
  // An empty argument yields a score of zero.
  EXPECT_DOUBLE_EQ(
      match.PartialRatio(base::UTF8ToUTF16("abc"), base::UTF8ToUTF16("")),
      0.0);
  EXPECT_NEAR(match.PartialRatio(base::UTF8ToUTF16("different in order"),
                                 base::UTF8ToUTF16("order text")),
              0.67, 0.01);
}
// Checks TokenSetRatio() in both partial and full mode for inputs with shared
// tokens, very different lengths, no shared tokens, and duplicated tokens.
// Perfect partial scores are compared with EXPECT_DOUBLE_EQ instead of an
// exact == against an integer literal. Each input is tokenized once and reused
// for both modes.
TEST_F(FuzzyTokenizedStringMatchTest, TokenSetRatioTest) {
  FuzzyTokenizedStringMatch match;
  {
    const base::string16 query_string(base::UTF8ToUTF16("order different in"));
    const base::string16 text_string(base::UTF8ToUTF16("text order"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_DOUBLE_EQ(match.TokenSetRatio(query, text, true), 1.0);
    EXPECT_NEAR(match.TokenSetRatio(query, text, false), 0.67, 0.01);
  }
  {
    const base::string16 query_string(base::UTF8ToUTF16("short text"));
    const base::string16 text_string(
        base::UTF8ToUTF16("this text is really really really long"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_DOUBLE_EQ(match.TokenSetRatio(query, text, true), 1.0);
    EXPECT_NEAR(match.TokenSetRatio(query, text, false), 0.57, 0.01);
  }
  {
    // No token is shared between the two inputs.
    const base::string16 query_string(base::UTF8ToUTF16("common string"));
    const base::string16 text_string(base::UTF8ToUTF16("nothing is shared"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.TokenSetRatio(query, text, true), 0.38, 0.01);
    EXPECT_NEAR(match.TokenSetRatio(query, text, false), 0.33, 0.01);
  }
  {
    // Inputs containing repeated tokens.
    const base::string16 query_string(
        base::UTF8ToUTF16("token shared token same shared same"));
    const base::string16 text_string(
        base::UTF8ToUTF16("token shared token text text long"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_DOUBLE_EQ(match.TokenSetRatio(query, text, true), 1.0);
    EXPECT_NEAR(match.TokenSetRatio(query, text, false), 0.83, 0.01);
  }
}
// Checks TokenSortRatio() in both partial and full mode. The one score that is
// expected to be an exact value (0.5) is compared with EXPECT_DOUBLE_EQ rather
// than operator== on doubles. Each input is tokenized once and reused for both
// modes.
TEST_F(FuzzyTokenizedStringMatchTest, TokenSortRatioTest) {
  FuzzyTokenizedStringMatch match;
  {
    const base::string16 query_string(base::UTF8ToUTF16("order different in"));
    const base::string16 text_string(base::UTF8ToUTF16("text order"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.TokenSortRatio(query, text, true), 0.67, 0.01);
    EXPECT_NEAR(match.TokenSortRatio(query, text, false), 0.36, 0.01);
  }
  {
    const base::string16 query_string(base::UTF8ToUTF16("short text"));
    const base::string16 text_string(
        base::UTF8ToUTF16("this text is really really really long"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_DOUBLE_EQ(match.TokenSortRatio(query, text, true), 0.5);
    EXPECT_NEAR(match.TokenSortRatio(query, text, false), 0.33, 0.01);
  }
  {
    const base::string16 query_string(base::UTF8ToUTF16("common string"));
    const base::string16 text_string(base::UTF8ToUTF16("nothing is shared"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.TokenSortRatio(query, text, true), 0.38, 0.01);
    EXPECT_NEAR(match.TokenSortRatio(query, text, false), 0.33, 0.01);
  }
}
// Checks WeightedRatio() on pairs of similar length and on a pair whose
// lengths differ widely, each against an expected score within 0.01.
TEST_F(FuzzyTokenizedStringMatchTest, WeightedRatio) {
  FuzzyTokenizedStringMatch match;
  {
    const base::string16 query_string(base::UTF8ToUTF16("anonymous"));
    const base::string16 text_string(base::UTF8ToUTF16("famous"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.WeightedRatio(query, text), 0.67, 0.01);
  }
  {
    const base::string16 query_string(base::UTF8ToUTF16("clash of clan"));
    const base::string16 text_string(base::UTF8ToUTF16("clash of titan"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.WeightedRatio(query, text), 0.81, 0.01);
  }
  {
    const base::string16 query_string(base::UTF8ToUTF16("final fantasy"));
    const base::string16 text_string(base::UTF8ToUTF16("finalfantasy"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.WeightedRatio(query, text), 0.96, 0.01);
  }
  {
    const base::string16 query_string(base::UTF8ToUTF16("short text"));
    const base::string16 text_string(
        base::UTF8ToUTF16("this sentence is much much much much much longer "
                          "than the text before"));
    const TokenizedString query(query_string);
    const TokenizedString text(text_string);
    EXPECT_NEAR(match.WeightedRatio(query, text), 0.85, 0.01);
  }
}
} // namespace app_list
......@@ -26,8 +26,8 @@ SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
DCHECK_GE(length, 0);
}
SequenceMatcher::SequenceMatcher(const std::string& first_string,
const std::string& second_string)
SequenceMatcher::SequenceMatcher(const base::string16& first_string,
const base::string16& second_string)
: first_string_(first_string),
second_string_(second_string),
dp_common_string_(second_string.size() + 1, 0) {
......
......@@ -31,8 +31,8 @@ class SequenceMatcher {
// Length of the common substring.
int length;
};
SequenceMatcher(const std::string& first_string,
const std::string& second_string);
SequenceMatcher(const base::string16& first_string,
const base::string16& second_string);
~SequenceMatcher() = default;
......@@ -59,8 +59,8 @@ class SequenceMatcher {
std::vector<Match> GetMatchingBlocks();
private:
std::string first_string_;
std::string second_string_;
base::string16 first_string_;
base::string16 second_string_;
double edit_distance_ratio_ = -1.0;
double block_matching_ratio_ = -1.0;
std::vector<Match> matching_blocks_;
......
......@@ -5,6 +5,7 @@
#include "chrome/browser/ui/app_list/search/search_utils/sequence_matcher.h"
#include "base/macros.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
......@@ -23,45 +24,72 @@ class SequenceMatcherTest : public testing::Test {};
// Checks SequenceMatcher::EditDistance() for each kind of single edit and for
// combinations of edits. Each case appears both with narrow string literals
// and with the same inputs converted through base::UTF8ToUTF16().
TEST_F(SequenceMatcherTest, TestEditDistance) {
// Transposition
// Swapping two adjacent characters is expected to count as a single edit.
ASSERT_EQ(SequenceMatcher("abcd", "abdc").EditDistance(), 1);
ASSERT_EQ(
SequenceMatcher(base::UTF8ToUTF16("abcd"), base::UTF8ToUTF16("abdc"))
.EditDistance(),
1);
// Deletion
ASSERT_EQ(SequenceMatcher("abcde", "abcd").EditDistance(), 1);
ASSERT_EQ(SequenceMatcher("12", "").EditDistance(), 2);
ASSERT_EQ(
SequenceMatcher(base::UTF8ToUTF16("abcde"), base::UTF8ToUTF16("abcd"))
.EditDistance(),
1);
ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("12"), base::UTF8ToUTF16(""))
.EditDistance(),
2);
// Insertion
ASSERT_EQ(SequenceMatcher("abc", "abxbc").EditDistance(), 2);
ASSERT_EQ(SequenceMatcher("", "abxbc").EditDistance(), 5);
ASSERT_EQ(
SequenceMatcher(base::UTF8ToUTF16("abc"), base::UTF8ToUTF16("abxbc"))
.EditDistance(),
2);
ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abxbc"))
.EditDistance(),
5);
// Substitution
ASSERT_EQ(SequenceMatcher("book", "back").EditDistance(), 2);
ASSERT_EQ(
SequenceMatcher(base::UTF8ToUTF16("book"), base::UTF8ToUTF16("back"))
.EditDistance(),
2);
// Combination
ASSERT_EQ(SequenceMatcher("caclulation", "calculator").EditDistance(), 3);
ASSERT_EQ(SequenceMatcher("sunday", "saturday").EditDistance(), 3);
ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("caclulation"),
base::UTF8ToUTF16("calculator"))
.EditDistance(),
3);
ASSERT_EQ(SequenceMatcher(base::UTF8ToUTF16("sunday"),
base::UTF8ToUTF16("saturday"))
.EditDistance(),
3);
}
// Checks SequenceMatcher::FindLongestMatch() over whole strings and
// sub-ranges, including an empty first string and inputs with repeated
// characters. Results are compared against expected Match values with
// MatchEqual().
TEST_F(SequenceMatcherTest, TestFindLongestMatch) {
SequenceMatcher sequence_match("miscellanious", "miscellaneous");
SequenceMatcher sequence_match(base::UTF8ToUTF16("miscellanious"),
base::UTF8ToUTF16("miscellaneous"));
ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(0, 13, 0, 13),
Match(0, 0, 9)));
ASSERT_TRUE(MatchEqual(sequence_match.FindLongestMatch(7, 13, 7, 13),
Match(10, 10, 3)));
ASSERT_TRUE(
MatchEqual(SequenceMatcher("", "abcd").FindLongestMatch(0, 0, 0, 4),
Match(0, 0, 0)));
ASSERT_TRUE(MatchEqual(
SequenceMatcher("abababbababa", "ababbaba").FindLongestMatch(0, 12, 0, 8),
Match(2, 0, 8)));
SequenceMatcher(base::UTF8ToUTF16(""), base::UTF8ToUTF16("abcd"))
.FindLongestMatch(0, 0, 0, 4),
Match(0, 0, 0)));
ASSERT_TRUE(MatchEqual(SequenceMatcher(base::UTF8ToUTF16("abababbababa"),
base::UTF8ToUTF16("ababbaba"))
.FindLongestMatch(0, 12, 0, 8),
Match(2, 0, 8)));
ASSERT_TRUE(MatchEqual(
SequenceMatcher("aaaaaa", "aaaaa").FindLongestMatch(0, 6, 0, 5),
SequenceMatcher(base::UTF8ToUTF16("aaaaaa"), base::UTF8ToUTF16("aaaaa"))
.FindLongestMatch(0, 6, 0, 5),
Match(0, 0, 5)));
}
TEST_F(SequenceMatcherTest, TestGetMatchingBlocks) {
SequenceMatcher sequence_match("This is a demo sentence!!!",
"This demo sentence is good!!!");
SequenceMatcher sequence_match(
base::UTF8ToUTF16("This is a demo sentence!!!"),
base::UTF8ToUTF16("This demo sentence is good!!!"));
const std::vector<Match> true_matches = {Match(0, 0, 4), Match(9, 4, 14),
Match(23, 26, 3), Match(26, 29, 0)};
const std::vector<Match> matches = sequence_match.GetMatchingBlocks();
......
......@@ -5147,6 +5147,7 @@ test("unit_tests") {
"../browser/ui/app_list/search/search_result_ranker/recurrence_ranker_unittest.cc",
"../browser/ui/app_list/search/search_result_ranker/recurrence_ranker_util_unittest.cc",
"../browser/ui/app_list/search/search_result_ranker/search_result_ranker_unittest.cc",
"../browser/ui/app_list/search/search_utils/fuzzy_tokenized_string_match_unittest.cc",
"../browser/ui/app_list/search/search_utils/sequence_matcher_unittest.cc",
"../browser/ui/app_list/search/settings_shortcut/settings_shortcut_provider_unittest.cc",
"../browser/ui/app_list/search/settings_shortcut/settings_shortcut_result_unittest.cc",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment