Commit efcd3a7e authored by Tommy C. Li, committed by Commit Bot

Omnibox: Refactor ScoredHistoryMatch URL parsing

Previously, we determined which part of the URL the match was in by
searching the formatted URL string (in string16 form) for delimiters.

After this refactor, we do this by using the "official" GURL component
offsets adjusted for URL formatting.
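Concretely: url::Parsed reports where each component begins in the raw
spec, and base::OffsetAdjuster maps that spec offset into the formatted
string. A self-contained sketch of the idea (illustrative only; the
function name is ours, and the hand-built adjustment stands in for
whatever the formatter actually recorded):

  #include "base/strings/utf_offset_string_conversions.h"
  #include "url/gurl.h"

  size_t QueryPosInFormattedUrl() {
    GURL gurl("http://abc.def.com/a%20b/c?q=1");
    // Where the query begins in the raw spec: 26, the position of '?'.
    size_t query_pos =
        gurl.parsed_for_possibly_invalid_spec().CountCharactersBefore(
            url::Parsed::QUERY, true);
    // Suppose formatting unescaped "%20" (3 chars at offset 20) to " "
    // (1 char); the formatter records that as an adjustment.
    base::OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(base::OffsetAdjuster::Adjustment(20, 3, 1));
    // Map the spec offset into the formatted string: 26 becomes 24.
    base::OffsetAdjuster::AdjustOffset(adjustments, &query_pos);
    return query_pos;
  }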

This is worthwhile both as a refactor and as groundwork for adding
|match_in_subdomain| and |match_in_path| flags.
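With the adjusted component starts available, such flags become simple
range checks. A hypothetical sketch (no such helper is added here):

  // Hypothetical: a path match starts at or after the path and before
  // the query. A real |match_in_subdomain| would also need to locate
  // the registrable domain within the host.
  bool MatchInPath(size_t term_offset, size_t path_pos, size_t query_pos) {
    return term_offset >= path_pos && term_offset < query_pos;
  }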

This CL doesn't actually add the above flags; it is intended to behave
identically to the previous code (which is why no tests changed).
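For reference, with the adjusted starts in hand, the per-term URL
scoring in GetTopicalityScore() reduces to ordered comparisons. A
simplified sketch of that classification (the credit values are the
ones in this file; the function is ours, and the word-boundary
filtering and last-dotted-component check are omitted):

  int CreditForUrlMatch(size_t term_offset, size_t query_pos,
                        size_t path_pos, size_t host_pos,
                        bool at_word_boundary) {
    if (term_offset >= query_pos)
      return 5;                          // query or ref component
    if (term_offset >= path_pos)
      return 8;                          // path component
    if (term_offset >= host_pos)
      return at_word_boundary ? 10 : 2;  // hostname
    return 0;                            // scheme; filtered out in practice
  }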

Bug: 732582, 595524, 448659
Change-Id: I133c2ecb462597941b7284fd88f99e55f341f6b4
Reviewed-on: https://chromium-review.googlesource.com/564300
Commit-Queue: Tommy Li <tommycli@chromium.org>
Reviewed-by: Justin Donnelly <jdonnelly@chromium.org>
Cr-Commit-Position: refs/heads/master@{#485446}
parent bdb5508a
@@ -15,7 +15,6 @@
 #include "base/strings/string_number_conversions.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
-#include "base/strings/utf_offset_string_conversions.h"
 #include "base/strings/utf_string_conversions.h"
 #include "components/bookmarks/browser/bookmark_utils.h"
 #include "components/omnibox/browser/history_url_provider.h"
@@ -102,6 +101,17 @@ void InitDaysAgoToRecencyScoreArray() {
   }
 }
 
+size_t GetAdjustedOffsetForComponent(
+    const GURL& url,
+    const base::OffsetAdjuster::Adjustments& adjustments,
+    const url::Parsed::ComponentType& component) {
+  const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
+  size_t result = parsed.CountCharactersBefore(component, true);
+  base::OffsetAdjuster::AdjustOffset(adjustments, &result);
+  return result;
+}
+
 }  // namespace
 
 // static
@@ -254,8 +264,9 @@ ScoredHistoryMatch::ScoredHistoryMatch(
     }
   }
 
-  const float topicality_score = GetTopicalityScore(
-      terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);
+  const float topicality_score =
+      GetTopicalityScore(terms_vector.size(), gurl, adjustments,
+                         terms_to_word_starts_offsets, word_starts);
   const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);
   const float specificity_score =
       GetDocumentSpecificityScore(num_matching_pages);
@@ -425,7 +436,8 @@ void ScoredHistoryMatch::Init() {
 
 float ScoredHistoryMatch::GetTopicalityScore(
     const int num_terms,
-    const base::string16& url,
+    const GURL& url,
+    const base::OffsetAdjuster::Adjustments& adjustments,
     const WordStarts& terms_to_word_starts_offsets,
     const RowWordStarts& word_starts) {
   // A vector that accumulates per-term scores. The strongest match--a
@@ -439,32 +451,24 @@ float ScoredHistoryMatch::GetTopicalityScore(
       word_starts.url_word_starts_.begin();
   WordStarts::const_iterator end_word_starts =
       word_starts.url_word_starts_.end();
-  const size_t question_mark_pos = url.find('?');
-  const size_t colon_pos = url.find(':');
-  // The + 3 skips the // that probably appears in the protocol
-  // after the colon. If the protocol doesn't have two slashes after
-  // the colon, that's okay--all this ends up doing is starting our
-  // search for the next / a few characters into the hostname. The
-  // only times this can cause problems is if we have a protocol without
-  // a // after the colon and the hostname is only one or two characters.
-  // This isn't worth worrying about.
-  const size_t end_of_hostname_pos = (colon_pos != std::string::npos)
-                                         ? url.find('/', colon_pos + 3)
-                                         : url.find('/');
-  size_t last_part_of_hostname_pos = (end_of_hostname_pos != std::string::npos)
-                                         ? url.rfind('.', end_of_hostname_pos)
-                                         : url.rfind('.');
+  const size_t query_pos =
+      GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::QUERY);
+  const size_t host_pos =
+      GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::HOST);
+  const size_t path_pos =
+      GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::PATH);
+
   // Loop through all URL matches and score them appropriately.
   // First, filter all matches not at a word boundary and in the path (or
   // later).
   url_matches = FilterTermMatchesByWordStarts(
       url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
-      end_of_hostname_pos, std::string::npos);
-  if (colon_pos != std::string::npos) {
+      path_pos, std::string::npos);
+  if (url.has_scheme()) {
     // Also filter matches not at a word boundary and in the scheme.
     url_matches = FilterTermMatchesByWordStarts(
         url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
-        0, colon_pos);
+        0, host_pos);
   }
   for (const auto& url_match : url_matches) {
     // Calculate the offset in the URL string where the meaningful (word) part
@@ -480,21 +484,22 @@ float ScoredHistoryMatch::GetTopicalityScore(
     }
     const bool at_word_boundary = (next_word_starts != end_word_starts) &&
                                   (*next_word_starts == term_word_offset);
-    if ((question_mark_pos != std::string::npos) &&
-        (term_word_offset >= question_mark_pos)) {
-      // The match is in a CGI ?... fragment.
+    if (term_word_offset >= query_pos) {
+      // The match is in the query or ref component.
       DCHECK(at_word_boundary);
       term_scores[url_match.term_num] += 5;
-    } else if ((end_of_hostname_pos != std::string::npos) &&
-               (term_word_offset >= end_of_hostname_pos)) {
-      // The match is in the path.
+    } else if (term_word_offset >= path_pos) {
+      // The match is in the path component.
       DCHECK(at_word_boundary);
       term_scores[url_match.term_num] += 8;
-    } else if ((colon_pos == std::string::npos) ||
-               (term_word_offset >= colon_pos)) {
-      // The match is in the hostname.
-      if ((last_part_of_hostname_pos == std::string::npos) ||
-          (term_word_offset < last_part_of_hostname_pos)) {
+    } else if (term_word_offset >= host_pos) {
+      // Get the position of the last period in the hostname.
+      const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
+      size_t last_part_of_host_pos = url.possibly_invalid_spec().rfind(
+          '.', parsed.CountCharactersBefore(url::Parsed::PATH, true));
+      base::OffsetAdjuster::AdjustOffset(adjustments, &last_part_of_host_pos);
+      if (term_word_offset < last_part_of_host_pos) {
         // Either there are no dots in the hostname or this match isn't
         // the last dotted component.
         term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;
@@ -12,6 +12,7 @@
 
 #include "base/gtest_prod_util.h"
 #include "base/strings/string16.h"
+#include "base/strings/utf_offset_string_conversions.h"
 #include "base/time/time.h"
 #include "components/history/core/browser/history_match.h"
 #include "components/history/core/browser/history_types.h"
@@ -122,9 +123,10 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
   // url_matches and title_matches in the process so they only reflect matches
   // used for scoring. (For instance, some mid-word matches are not given
   // credit in scoring.) Requires that |url_matches| and |title_matches| are
-  // sorted.
+  // sorted. |adjustments| must contain any adjustments used to format |url|.
   float GetTopicalityScore(const int num_terms,
-                           const base::string16& cleaned_up_url,
+                           const GURL& url,
+                           const base::OffsetAdjuster::Adjustments& adjustments,
                            const WordStarts& terms_to_word_starts_offsets,
                            const RowWordStarts& word_starts);
 
@@ -66,7 +66,7 @@ class ScoredHistoryMatchTest : public testing::Test {
   // GetTopicalityScore(). It only works for scoring a single term, not
   // multiple terms.
   float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
-                                                   const base::string16& url,
+                                                   const GURL& url,
                                                    const base::string16& title);
 };
 
@@ -107,7 +107,7 @@ String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
 
 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
     const base::string16& term,
-    const base::string16& url,
+    const GURL& url,
     const base::string16& title) {
   String16Vector term_vector = {term};
   WordStarts term_word_starts = {0};
@@ -119,16 +119,18 @@ float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
     term_word_starts[0] = iter.prev();
   }
   RowWordStarts row_word_starts;
-  String16SetFromString16(url, &row_word_starts.url_word_starts_);
+  base::string16 url_string = base::UTF8ToUTF16(url.spec());
+  String16SetFromString16(url_string, &row_word_starts.url_word_starts_);
   String16SetFromString16(title, &row_word_starts.title_word_starts_);
   ScoredHistoryMatch scored_match(history::URLRow(GURL(url)), VisitInfoVector(),
                                   term, term_vector, term_word_starts,
                                   row_word_starts, false, 1, base::Time::Max());
-  scored_match.url_matches = MatchTermInString(term, url, 0);
+  scored_match.url_matches = MatchTermInString(term, url_string, 0);
   scored_match.title_matches = MatchTermInString(term, title, 0);
   scored_match.topicality_threshold_ = -1;
-  return scored_match.GetTopicalityScore(1, url, term_word_starts,
-                                         row_word_starts);
+  return scored_match.GetTopicalityScore(1, url,
+                                         base::OffsetAdjuster::Adjustments(),
+                                         term_word_starts, row_word_starts);
 }
 
 TEST_F(ScoredHistoryMatchTest, Scoring) {
@@ -350,10 +352,10 @@ TEST_F(ScoredHistoryMatchTest, Inlining) {
 
 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
   const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
-      ASCIIToUTF16("def"), ASCIIToUTF16("http://abc.def.com/"),
+      ASCIIToUTF16("def"), GURL("http://abc.def.com/"),
       ASCIIToUTF16("Non-Matching Title"));
   const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
-      ASCIIToUTF16("def"), ASCIIToUTF16("http://abc.def.com"),
+      ASCIIToUTF16("def"), GURL("http://abc.def.com"),
       ASCIIToUTF16("Non-Matching Title"));
   EXPECT_EQ(hostname_no_slash, hostname);
 }
@@ -615,9 +617,7 @@ TEST_F(ScoredHistoryMatchTest, GetDocumentSpecificityScore) {
 // This function only tests scoring of single terms that match exactly
 // once somewhere in the URL or title.
 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
-  base::string16 url = ASCIIToUTF16(
-      "http://abc.def.com/path1/path2?"
-      "arg1=val1&arg2=val2#hash_component");
+  GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_component");
   base::string16 title = ASCIIToUTF16("here is a title");
   auto Score = [&](const char* term) {
     return GetTopicalityScoreOfTermAgainstURLAndTitle(ASCIIToUTF16(term), url,