Omnibox: Create Field Trial for HQP to Ignore Mid-Word Matches

Adds code to HistoryQuick provider to make it optionally only consider terms when they match at the beginning of a word boundary in the URL or titles. The current behavior allows terms to match anywhere regardless of word boundaries. Then, creates a field trial to control this behavior. Enables the trial for 25% of users. BUG=161911 Review URL: https://chromiumcodereview.appspot.com/11421139 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@170784 0039d316-1c4b-4281-b951-d872f2087c98

Omnibox: Create Field Trial for HQP to Ignore Mid-Word Matches
Adds code to HistoryQuick provider to make it optionally only consider terms when they match at the beginning of a word boundary in the URL or titles. The current behavior allows terms to match anywhere regardless of word boundaries. Then, creates a field trial to control this behavior. Enables the trial for 25% of users. BUG=161911 Review URL: https://chromiumcodereview.appspot.com/11421139 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@170784 0039d316-1c4b-4281-b951-d872f2087c98
68c76ec1 · mpearson@chromium.org · 57892182 · 68c76ec1 · 68c76ec1 · 68c76ec1
Commit 68c76ec1 authored Dec 03, 2012 by mpearson@chromium.org
5 changed files
--- a/chrome/browser/autocomplete/autocomplete_field_trial.cc
+++ b/chrome/browser/autocomplete/autocomplete_field_trial.cc
@@ -27,6 +27,8 @@ static const char kHUPCreateShorterMatchFieldTrialName[] =
    "OmniboxHUPCreateShorterMatch";
 static const char kHQPReplaceHUPScoringFieldTrialName[] =
    "OmniboxHQPReplaceHUPNumComponentsFix";
+static const char kHQPOnlyCountMatchesAtWordBoundariesFieldTrialName[] =
+    "OmniboxHQPOnlyCountMatchesAtWordBoundaries";

 // Field trial experiment probabilities.

@@ -74,6 +76,13 @@ const base::FieldTrial::Probability
 const base::FieldTrial::Probability
    kHQPReplaceHUPScoringFieldTrialExperimentFraction = 25;

+// For the field trial that ignores all mid-word matches in HistoryQuick
+// provider, put 25% ( = 25/100 ) of the users in the experiment group.
+const base::FieldTrial::Probability
+    kHQPOnlyCountMatchesAtWordBoundariesFieldTrialDivisor = 100;
+const base::FieldTrial::Probability
+    kHQPOnlyCountMatchesAtWordBoundariesFieldTrialExperimentFraction = 25;
+

 // Field trial IDs.
 // Though they are not literally "const", they are set only once, in
@@ -97,6 +106,10 @@ int hup_dont_create_shorter_match_experiment_group = 0;
 // experiment group.
 int hqp_replace_hup_scoring_experiment_group = 0;

+// Field trial ID for the HistoryQuick provider only count matches at
+// word boundaries experiment group.
+int hqp_only_count_matches_at_word_boundaries_experiment_group = 0;
+
 }


@@ -191,6 +204,18 @@ void AutocompleteFieldTrial::Activate() {
  trial->UseOneTimeRandomization();
  hqp_replace_hup_scoring_experiment_group = trial->AppendGroup("HQPReplaceHUP",
      kHQPReplaceHUPScoringFieldTrialExperimentFraction);
+
+  // Create the field trial that makes HistoryQuick provider scoring
+  // ignore all matches that happen in the middle of a word.  Make it
+  // expire on June 23, 2013.
+  trial = base::FieldTrialList::FactoryGetFieldTrial(
+      kHQPOnlyCountMatchesAtWordBoundariesFieldTrialName,
+      kHQPOnlyCountMatchesAtWordBoundariesFieldTrialDivisor,
+      "Standard", 2013, 6, 23, NULL);
+  trial->UseOneTimeRandomization();
+  hqp_only_count_matches_at_word_boundaries_experiment_group =
+      trial->AppendGroup("HQPOnlyCountMatchesAtWordBoundaries",
+          kHQPOnlyCountMatchesAtWordBoundariesFieldTrialExperimentFraction);
 }

 bool AutocompleteFieldTrial::InDisallowInlineHQPFieldTrial() {
@@ -264,3 +289,19 @@ bool AutocompleteFieldTrial::InHQPReplaceHUPScoringFieldTrialExperimentGroup() {
      kHQPReplaceHUPScoringFieldTrialName);
  return group == hqp_replace_hup_scoring_experiment_group;
 }
+
+bool AutocompleteFieldTrial::InHQPOnlyCountMatchesAtWordBoundariesFieldTrial() {
+  return base::FieldTrialList::TrialExists(
+      kHQPOnlyCountMatchesAtWordBoundariesFieldTrialName);
+}
+
+bool AutocompleteFieldTrial::
+    InHQPOnlyCountMatchesAtWordBoundariesFieldTrialExperimentGroup() {
+  if (!InHQPOnlyCountMatchesAtWordBoundariesFieldTrial())
+    return false;
+
+  // Return true if we're in the experiment group.
+  const int group = base::FieldTrialList::FindValue(
+      kHQPOnlyCountMatchesAtWordBoundariesFieldTrialName);
+  return group == hqp_only_count_matches_at_word_boundaries_experiment_group;
+}
--- a/chrome/browser/autocomplete/autocomplete_field_trial.h
+++ b/chrome/browser/autocomplete/autocomplete_field_trial.h
@@ -83,6 +83,18 @@ class AutocompleteFieldTrial {
  // is larger.
  static bool InHQPReplaceHUPScoringFieldTrialExperimentGroup();

+  // ---------------------------------------------------------
+  // For the HistoryQuick provider only count matches at word boundaries
+  // field trial.
+
+  // Returns whether the user is in any group for this field trial.
+  // (Should always be true unless initialization went wrong.)
+  static bool InHQPOnlyCountMatchesAtWordBoundariesFieldTrial();
+
+  // Returns whether we should ignore all mid-word matches in
+  // HistoryQuick provider.
+  static bool InHQPOnlyCountMatchesAtWordBoundariesFieldTrialExperimentGroup();
+
 private:
  DISALLOW_IMPLICIT_CONSTRUCTORS(AutocompleteFieldTrial);
 };

--- a/chrome/browser/history/scored_history_match.cc
+++ b/chrome/browser/history/scored_history_match.cc
@@ -56,6 +56,7 @@ const int kBaseScoreForUntypedResultsInHUPLikeScoring = 900;

 bool ScoredHistoryMatch::initialized_ = false;
 bool ScoredHistoryMatch::use_new_scoring = false;
+bool ScoredHistoryMatch::only_count_matches_at_word_boundaries = false;
 bool ScoredHistoryMatch::also_do_hup_like_scoring = false;

 ScoredHistoryMatch::ScoredHistoryMatch()
@@ -63,6 +64,7 @@ ScoredHistoryMatch::ScoredHistoryMatch()
      can_inline(false) {
  if (!initialized_) {
    InitializeNewScoringField();
+    InitializeOnlyCountMatchesAtWordBoundariesField();
    InitializeAlsoDoHUPLikeScoringField();
    initialized_ = true;
  }
@@ -79,6 +81,7 @@ ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& row,
      can_inline(false) {
  if (!initialized_) {
    InitializeNewScoringField();
+    InitializeOnlyCountMatchesAtWordBoundariesField();
    InitializeAlsoDoHUPLikeScoringField();
    initialized_ = true;
  }
@@ -161,10 +164,13 @@ ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& row,
    // Get partial scores based on term matching. Note that the score for
    // each of the URL and title are adjusted by the fraction of the
    // terms appearing in each.
-    int url_score = ScoreComponentForMatches(url_matches, url.length()) *
+    int url_score =
+        ScoreComponentForMatches(url_matches, word_starts.url_word_starts_,
+                                 url.length()) *
        std::min(url_matches.size(), terms.size()) / terms.size();
    int title_score =
-        ScoreComponentForMatches(title_matches, title.length()) *
+        ScoreComponentForMatches(title_matches, word_starts.title_word_starts_,
+                                 title.length()) *
        std::min(title_matches.size(), terms.size()) / terms.size();
    // Arbitrarily pick the best.
    // TODO(mrossetti): It might make sense that a term which appears in both
@@ -287,8 +293,23 @@ int AccumulateMatchLength(int total, const TermMatch& match) {
 }

 // static
-int ScoredHistoryMatch::ScoreComponentForMatches(const TermMatches& matches,
-                                                 size_t max_length) {
+int ScoredHistoryMatch::ScoreComponentForMatches(
+    const TermMatches& provided_matches,
+    const WordStarts& word_starts,
+    size_t max_length) {
+  if (provided_matches.empty())
+    return 0;
+
+  TermMatches matches_at_word_boundaries;
+  if (only_count_matches_at_word_boundaries) {
+    MakeTermMatchesOnlyAtWordBoundaries(provided_matches, word_starts,
+                                        &matches_at_word_boundaries);
+  }
+  // The actual matches we'll use for scoring.  This is |provided_matches|
+  // with all the matches not at a word boundary removed (if told to do so).
+  const TermMatches& matches = only_count_matches_at_word_boundaries ?
+      matches_at_word_boundaries : provided_matches;
+
  if (matches.empty())
    return 0;

@@ -345,6 +366,31 @@ int ScoredHistoryMatch::ScoreComponentForMatches(const TermMatches& matches,
  return ScoreForValue(raw_score, kTermScoreLevel);
 }

+// static
+void ScoredHistoryMatch::MakeTermMatchesOnlyAtWordBoundaries(
+    const TermMatches& provided_matches,
+    const WordStarts& word_starts,
+    TermMatches* matches_at_word_boundaries) {
+  matches_at_word_boundaries->clear();
+  // Reserve capacity for an upper-bound estimate of the correct size.
+  matches_at_word_boundaries->reserve(provided_matches.size());
+  WordStarts::const_iterator next_word_starts = word_starts.begin();
+  for (TermMatches::const_iterator iter = provided_matches.begin();
+       iter != provided_matches.end(); ++iter) {
+    // Advance next_word_starts until it's >= the position of the term
+    // we're considering.
+    while ((next_word_starts != word_starts.end()) &&
+           (*next_word_starts < iter->offset)) {
+      ++next_word_starts;
+    }
+    if ((next_word_starts != word_starts.end()) &&
+        (*next_word_starts == iter->offset)) {
+      // At word boundary: copy this element into |matches_at_word_boundaries|.
+      matches_at_word_boundaries->push_back(*iter);
+    }
+  }
+}
+
 // static
 int ScoredHistoryMatch::ScoreForValue(int value, const int* value_ranks) {
  int i = 0;
@@ -657,6 +703,14 @@ void ScoredHistoryMatch::InitializeNewScoringField() {
      new_scoring_option, NUM_OPTIONS);
 }

+void ScoredHistoryMatch::InitializeOnlyCountMatchesAtWordBoundariesField() {
+  only_count_matches_at_word_boundaries =
+      AutocompleteFieldTrial::
+          InHQPOnlyCountMatchesAtWordBoundariesFieldTrial() &&
+      AutocompleteFieldTrial::
+          InHQPOnlyCountMatchesAtWordBoundariesFieldTrialExperimentGroup();
+}
+
 void ScoredHistoryMatch::InitializeAlsoDoHUPLikeScoringField() {
  also_do_hup_like_scoring =
      AutocompleteFieldTrial::InHQPReplaceHUPScoringFieldTrial() &&

--- a/chrome/browser/history/scored_history_match.h
+++ b/chrome/browser/history/scored_history_match.h
@@ -38,12 +38,26 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
                     BookmarkService* bookmark_service);
  ~ScoredHistoryMatch();

-  // Calculates a component score based on position, ordering and total
-  // substring match size using metrics recorded in |matches|. |max_length|
-  // is the length of the string against which the terms are being searched.
-  static int ScoreComponentForMatches(const TermMatches& matches,
+  // Calculates a component score based on position, ordering, word
+  // boundaries, and total substring match size using metrics recorded
+  // in |provided_matches| and |word_starts|. |max_length| is the length of
+  // the string against which the terms are being searched.
+  // |provided_matches| should already be sorted and de-duped, and
+  // |word_starts| must be sorted.
+  static int ScoreComponentForMatches(const TermMatches& provided_matches,
+                                      const WordStarts& word_starts,
                                      size_t max_length);

+  // Given a set of term matches |provided_matches| and word boundaries
+  // |word_starts|, fills in |matches_at_word_boundaries| with only the
+  // matches in |provided_matches| that are at word boundaries.
+  // |provided_matches| should already be sorted and de-duped, and
+  // |word_starts| must be sorted.
+  static void MakeTermMatchesOnlyAtWordBoundaries(
+      const TermMatches& provided_matches,
+      const WordStarts& word_starts,
+      TermMatches* matches_at_word_boundaries);
+
  // Converts a raw value for some particular scoring factor into a score
  // component for that factor.  The conversion function is piecewise linear,
  // with input values provided in |value_ranks| and resulting output scores
@@ -74,7 +88,9 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
  // boundaries).  |url_matches| and |title_matches| provide details
  // about where the matches in the URL and title are and what terms
  // (identified by a term number < |num_terms|) match where.
-  // |word_starts| explains where word boundaries are.
+  // |word_starts| explains where word boundaries are.  Its parts (title
+  // and url) must be sorted.  Also, |url_matches| and
+  // |title_matches| should already be sorted and de-duped.
  static float GetTopicalityScore(const int num_terms,
                                  const string16& url,
                                  const TermMatches& url_matches,
@@ -102,6 +118,9 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
  // field trial state.
  static void InitializeNewScoringField();

+  // Sets only_count_matches_at_word_boundaries based on the field trial state.
+  static void InitializeOnlyCountMatchesAtWordBoundariesField();
+
  // Sets also_do_hup_like_scoring based on the field trial state.
  static void InitializeAlsoDoHUPLikeScoringField();

@@ -134,7 +153,7 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
  static const int kMaxRawTermScore = 30;
  static float* raw_term_score_to_topicality_score;

-  // Allows us to determing setting for use_new_scoring_ only once.
+  // Used so we initialize static variables only once (on first use).
  static bool initialized_;

  // Whether to use new-scoring or old-scoring.  Set in the
@@ -145,6 +164,9 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
  // class as well (see boolean below).
  static bool use_new_scoring;

+  // If true, we ignore all matches that are in the middle of a word.
+  static bool only_count_matches_at_word_boundaries;
+
  // If true, assign raw scores to be max(whatever it normally would be,
  // a score that's similar to the score HistoryURL provider would assign).
  // This variable is set in the constructor by examining the field trial

--- a/chrome/browser/history/scored_history_match_unittest.cc
+++ b/chrome/browser/history/scored_history_match_unittest.cc
@@ -77,6 +77,85 @@ float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
      1, url, url_matches, title_matches, word_starts);
 }

+TEST_F(ScoredHistoryMatchTest, MakeTermMatchesOnlyAtWordBoundaries) {
+  TermMatches matches, matches_at_word_boundaries;
+  WordStarts word_starts;
+
+  // no matches but some word starts -> no matches at word boundary
+  matches.clear();
+  word_starts.clear();
+  word_starts.push_back(2);
+  word_starts.push_back(5);
+  word_starts.push_back(10);
+  ScoredHistoryMatch::MakeTermMatchesOnlyAtWordBoundaries(
+      matches, word_starts, &matches_at_word_boundaries);
+  EXPECT_EQ(0u, matches_at_word_boundaries.size());
+
+  // matches but no word starts -> no matches at word boundary
+  matches.clear();
+  matches.push_back(TermMatch(0, 1, 2));  // 2-character match at pos 1
+  matches.push_back(TermMatch(0, 7, 2));  // 2-character match at pos 7
+  word_starts.clear();
+  ScoredHistoryMatch::MakeTermMatchesOnlyAtWordBoundaries(
+      matches, word_starts, &matches_at_word_boundaries);
+  EXPECT_EQ(0u, matches_at_word_boundaries.size());
+
+  // matches and word starts don't overlap -> no matches at word boundary
+  matches.clear();
+  matches.push_back(TermMatch(0, 1, 2));  // 2-character match at pos 1
+  matches.push_back(TermMatch(0, 7, 2));  // 2-character match at pos 7
+  word_starts.clear();
+  word_starts.push_back(2);
+  word_starts.push_back(5);
+  word_starts.push_back(10);
+  ScoredHistoryMatch::MakeTermMatchesOnlyAtWordBoundaries(
+      matches, word_starts, &matches_at_word_boundaries);
+  EXPECT_EQ(0u, matches_at_word_boundaries.size());
+
+  // some matches are at word boundary and some aren't
+  matches.clear();
+  matches.push_back(TermMatch(0, 1, 2));  // 2-character match at pos 1
+  matches.push_back(TermMatch(1, 6, 3));  // 3-character match at pos 6
+  matches.push_back(TermMatch(0, 8, 2));  // 2-character match at pos 8
+  matches.push_back(TermMatch(2, 15, 7));  // 7-character match at pos 15
+  matches.push_back(TermMatch(1, 26, 3));  // 3-character match at pos 26
+  word_starts.clear();
+  word_starts.push_back(0);
+  word_starts.push_back(6);
+  word_starts.push_back(9);
+  word_starts.push_back(15);
+  word_starts.push_back(24);
+  ScoredHistoryMatch::MakeTermMatchesOnlyAtWordBoundaries(
+      matches, word_starts, &matches_at_word_boundaries);
+  EXPECT_EQ(2u, matches_at_word_boundaries.size());
+  EXPECT_EQ(1, matches_at_word_boundaries[0].term_num);
+  EXPECT_EQ(6u, matches_at_word_boundaries[0].offset);
+  EXPECT_EQ(3u, matches_at_word_boundaries[0].length);
+  EXPECT_EQ(2, matches_at_word_boundaries[1].term_num);
+  EXPECT_EQ(15u, matches_at_word_boundaries[1].offset);
+  EXPECT_EQ(7u, matches_at_word_boundaries[1].length);
+
+  // all matches are at word boundary
+  matches.clear();
+  matches.push_back(TermMatch(0, 2, 2));  // 2-character match at pos 2
+  matches.push_back(TermMatch(1, 9, 3));  // 3-character match at pos 9
+  word_starts.clear();
+  word_starts.push_back(0);
+  word_starts.push_back(2);
+  word_starts.push_back(6);
+  word_starts.push_back(9);
+  word_starts.push_back(15);
+  ScoredHistoryMatch::MakeTermMatchesOnlyAtWordBoundaries(
+      matches, word_starts, &matches_at_word_boundaries);
+  EXPECT_EQ(2u, matches_at_word_boundaries.size());
+  EXPECT_EQ(0, matches_at_word_boundaries[0].term_num);
+  EXPECT_EQ(2u, matches_at_word_boundaries[0].offset);
+  EXPECT_EQ(2u, matches_at_word_boundaries[0].length);
+  EXPECT_EQ(1, matches_at_word_boundaries[1].term_num);
+  EXPECT_EQ(9u, matches_at_word_boundaries[1].offset);
+  EXPECT_EQ(3u, matches_at_word_boundaries[1].length);
+}
+
 TEST_F(ScoredHistoryMatchTest, Scoring) {
  URLRow row_a(MakeURLRow("http://abcdef", "fedcba", 3, 30, 1));
  // We use NowFromSystemTime() because MakeURLRow uses the same function