Use PrefixMatcher class to calculate prefix match score

Previously, FuzzyTokenizedStringMatch used its own way to calculate the prefix match score. This CL uses the PrefixMatcher class to calculate the prefix match score, to be consistent with TokenizedStringMatch.

Bug: 1086841
Change-Id: I8fcf37b27e2345dca03498788303ac2287324997
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2237492
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Reviewed-by: Jia Meng <jiameng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#776813}

Use PrefixMatcher class to calculate prefix match score
Previously, FuzzyTokenizedStringMatch used its own way to calculate the prefix match score. This CL uses the PrefixMatcher class to calculate the prefix match score, to be consistent with TokenizedStringMatch.

Bug: 1086841
Change-Id: I8fcf37b27e2345dca03498788303ac2287324997
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2237492
Commit-Queue: Thanh Nguyen <thanhdng@chromium.org>
Reviewed-by: Jia Meng <jiameng@chromium.org>
Cr-Commit-Position: refs/heads/master@{#776813}
adf1c7d4 · Thanh Nguyen · Commit Bot · 549dacf7 · adf1c7d4 · adf1c7d4
Commit adf1c7d4 authored Jun 10, 2020 by Thanh Nguyen Committed by Commit Bot Jun 10, 2020
3 changed files
--- a/chrome/common/string_matching/fuzzy_tokenized_string_match.cc
+++ b/chrome/common/string_matching/fuzzy_tokenized_string_match.cc
@@ -13,13 +13,12 @@
 #include "base/strings/strcat.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversions.h"
+#include "chrome/common/string_matching/prefix_matcher.h"
 #include "chrome/common/string_matching/sequence_matcher.h"

 namespace {
 constexpr double kMinScore = 0.0;
 constexpr double kMaxScore = 1.0;
-constexpr double kFirstCharacterMatchPenalty = 0.2;
-constexpr double kPrefixMatchPenalty = 0.1;

 // Returns sorted tokens from a TokenizedString.
 std::vector<base::string16> ProcessAndSort(const TokenizedString& text) {
@@ -35,57 +34,6 @@ std::vector<base::string16> ProcessAndSort(const TokenizedString& text) {
 FuzzyTokenizedStringMatch::~FuzzyTokenizedStringMatch() {}
 FuzzyTokenizedStringMatch::FuzzyTokenizedStringMatch() {}

-double FuzzyTokenizedStringMatch::FirstCharacterMatch(
-    const TokenizedString& query,
-    const TokenizedString& text) {
-  const base::string16 query_lower = base::i18n::ToLower(query.text());
-  size_t query_index = 0;
-  for (size_t text_index = 0; text_index < text.tokens().size(); text_index++) {
-    if (query_index < query_lower.size() &&
-        text.tokens()[text_index][0] == query_lower[query_index]) {
-      query_index++;
-      if (query_index == query_lower.size()) {
-        // Penalizes the score using the number of text's tokens that are
-        // needed.
-        return std::max(kMinScore,
-                        kMaxScore - kFirstCharacterMatchPenalty *
-                                        (text_index + 1 - query_lower.size()));
-      }
-    }
-  }
-  return kMinScore;
-}
-
-double FuzzyTokenizedStringMatch::PrefixMatch(const TokenizedString& query,
-                                              const TokenizedString& text) {
-  const std::vector<base::string16> query_tokens(query.tokens());
-  const std::vector<base::string16> text_tokens(text.tokens());
-  double match_score = kMaxScore;
-  int previous_matched_index = -1;
-  // For every query token, check if it is a prefix of a text token. The newly
-  // matching text token must have higher index than the previous matched token.
-  for (const auto& query_token : query_tokens) {
-    bool matched = false;
-    for (size_t text_index = previous_matched_index + 1;
-         text_index < text_tokens.size(); text_index++) {
-      if (query_token.size() <= text_tokens[text_index].size() &&
-          query_token ==
-              text_tokens[text_index].substr(0, query_token.size())) {
-        matched = true;
-        // Penalizes the score based on the number of skipped tokens.
-        match_score -=
-            kPrefixMatchPenalty * (text_index - previous_matched_index - 1);
-        previous_matched_index = text_index;
-        break;
-      }
-    }
-    if (!matched) {
-      return kMinScore;
-    }
-  }
-  return std::max(kMinScore, match_score);
-}
-
 double FuzzyTokenizedStringMatch::TokenSetRatio(
    const TokenizedString& query,
    const TokenizedString& text,
@@ -283,7 +231,9 @@ double FuzzyTokenizedStringMatch::WeightedRatio(

 double FuzzyTokenizedStringMatch::PrefixMatcher(const TokenizedString& query,
                                                const TokenizedString& text) {
-  return std::max(PrefixMatch(query, text), FirstCharacterMatch(query, text));
+  ::PrefixMatcher match(query, text);
+  match.Match();
+  return 1.0 - std::pow(0.5, match.relevance());
 }

 bool FuzzyTokenizedStringMatch::IsRelevant(const TokenizedString& query,

--- a/chrome/common/string_matching/fuzzy_tokenized_string_match.h
+++ b/chrome/common/string_matching/fuzzy_tokenized_string_match.h
@@ -25,16 +25,6 @@ class FuzzyTokenizedStringMatch {
  FuzzyTokenizedStringMatch();
  ~FuzzyTokenizedStringMatch();

-  // Check if the query only contains first characters of the text,
-  // e.g. "coc" is a match of "Clash of Clan". Range of the score is [0, 1].
-  static double FirstCharacterMatch(const TokenizedString& query,
-                                    const TokenizedString& text);
-
-  // Check if tokens of query are prefixes of text's tokens. Range of score is
-  // [0, 1].
-  static double PrefixMatch(const TokenizedString& query,
-                            const TokenizedString& text);
-
  // TokenSetRatio takes two sets of tokens, finds their intersection and
  // differences. From the intersection and differences, it rewrites the |query|
  // and |text| and find the similarity ratio between them. This function
@@ -75,7 +65,8 @@ class FuzzyTokenizedStringMatch {
                              bool use_edit_distance,
                              double num_matching_blocks_penalty);
  // Since prefix match should always be favored over other matches, this
-  // function is dedicated to calculate a prefix match score in range of [0, 1].
+  // function is dedicated to calculate a prefix match score in range of [0, 1]
+  // using PrefixMatcher class.
  // This score has two components: first character match and whole prefix
  // match.
  static double PrefixMatcher(const TokenizedString& query,

--- a/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc
+++ b/chrome/common/string_matching/fuzzy_tokenized_string_match_unittest.cc
@@ -165,57 +165,40 @@ TEST_F(FuzzyTokenizedStringMatchTest, WeightedRatio) {
  }
 }

-TEST_F(FuzzyTokenizedStringMatchTest, FirstCharacterMatchTest) {
-  {
-    base::string16 query(base::UTF8ToUTF16("COC"));
-    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
-                  TokenizedString(query), TokenizedString(text)),
-              1.0);
-  }
-  {
-    base::string16 query(base::UTF8ToUTF16("CC"));
-    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
-                  TokenizedString(query), TokenizedString(text)),
-              0.8);
-  }
-  {
-    base::string16 query(base::UTF8ToUTF16("C o C"));
-    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::FirstCharacterMatch(
-                  TokenizedString(query), TokenizedString(text)),
-              0.0);
-  }
-}
-
-TEST_F(FuzzyTokenizedStringMatchTest, PrefixMatchTest) {
+TEST_F(FuzzyTokenizedStringMatchTest, PrefixMatcherTest) {
  {
    base::string16 query(base::UTF8ToUTF16("clas"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
-                                                     TokenizedString(text)),
-              1.0);
+    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
+                                                         TokenizedString(text)),
+                0.94, 0.01);
  }
  {
    base::string16 query(base::UTF8ToUTF16("clash clan"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
-                                                     TokenizedString(text)),
-              0.9);
+    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
+                                                         TokenizedString(text)),
+                0.99, 0.01);
  }
  {
    base::string16 query(base::UTF8ToUTF16("c o c"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
-                                                     TokenizedString(text)),
-              1.0);
+    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
+                                                         TokenizedString(text)),
+                0.84, 0.01);
+  }
+  {
+    base::string16 query(base::UTF8ToUTF16("wifi"));
+    base::string16 text(base::UTF8ToUTF16("wi-fi"));
+    EXPECT_NEAR(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
+                                                         TokenizedString(text)),
+                0.91, 0.01);
  }
  {
    base::string16 query(base::UTF8ToUTF16("clam"));
    base::string16 text(base::UTF8ToUTF16("Clash of Clan"));
-    EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatch(TokenizedString(query),
-                                                     TokenizedString(text)),
+    EXPECT_EQ(FuzzyTokenizedStringMatch::PrefixMatcher(TokenizedString(query),
+                                                       TokenizedString(text)),
              0.0);
  }
 }