Commit efcd3a7e authored by Tommy C. Li's avatar Tommy C. Li Committed by Commit Bot

Omnibox: Refactor ScoredHistoryMatch URL parsing

Previously, we determined which part of the URL the match was in
by searching the formatted URL string (in string16 form) for delimiters.

After this refactor, we do this by using the "official" GURL component
offsets adjusted for URL formatting.

This is good both as a refactor, and also to lay the groundwork for
adding |match_in_subdomain| and |match_in_path| flags.

This CL doesn't actually add the above flags, and is intended to
provide identical functionality as before (which is why no tests
changed).

Bug: 732582, 595524, 448659
Change-Id: I133c2ecb462597941b7284fd88f99e55f341f6b4
Reviewed-on: https://chromium-review.googlesource.com/564300
Commit-Queue: Tommy Li <tommycli@chromium.org>
Reviewed-by: Justin Donnelly <jdonnelly@chromium.org>
Cr-Commit-Position: refs/heads/master@{#485446}
parent bdb5508a
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "base/strings/string_number_conversions.h" #include "base/strings/string_number_conversions.h"
#include "base/strings/string_split.h" #include "base/strings/string_split.h"
#include "base/strings/string_util.h" #include "base/strings/string_util.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "base/strings/utf_string_conversions.h" #include "base/strings/utf_string_conversions.h"
#include "components/bookmarks/browser/bookmark_utils.h" #include "components/bookmarks/browser/bookmark_utils.h"
#include "components/omnibox/browser/history_url_provider.h" #include "components/omnibox/browser/history_url_provider.h"
...@@ -102,6 +101,17 @@ void InitDaysAgoToRecencyScoreArray() { ...@@ -102,6 +101,17 @@ void InitDaysAgoToRecencyScoreArray() {
} }
} }
// Returns the character position at which |component| begins within |url|,
// translated from raw-spec coordinates into the coordinate space of the
// formatted (display) URL string via |adjustments|.
size_t GetAdjustedOffsetForComponent(
    const GURL& url,
    const base::OffsetAdjuster::Adjustments& adjustments,
    const url::Parsed::ComponentType& component) {
  // Offset of |component| in the unformatted spec, counting any preceding
  // delimiter characters.
  size_t offset =
      url.parsed_for_possibly_invalid_spec().CountCharactersBefore(component,
                                                                   true);
  // Map the raw offset into the formatted string's coordinate space.
  base::OffsetAdjuster::AdjustOffset(adjustments, &offset);
  return offset;
}
} // namespace } // namespace
// static // static
...@@ -254,8 +264,9 @@ ScoredHistoryMatch::ScoredHistoryMatch( ...@@ -254,8 +264,9 @@ ScoredHistoryMatch::ScoredHistoryMatch(
} }
} }
const float topicality_score = GetTopicalityScore( const float topicality_score =
terms_vector.size(), url, terms_to_word_starts_offsets, word_starts); GetTopicalityScore(terms_vector.size(), gurl, adjustments,
terms_to_word_starts_offsets, word_starts);
const float frequency_score = GetFrequency(now, is_url_bookmarked, visits); const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);
const float specificity_score = const float specificity_score =
GetDocumentSpecificityScore(num_matching_pages); GetDocumentSpecificityScore(num_matching_pages);
...@@ -425,7 +436,8 @@ void ScoredHistoryMatch::Init() { ...@@ -425,7 +436,8 @@ void ScoredHistoryMatch::Init() {
float ScoredHistoryMatch::GetTopicalityScore( float ScoredHistoryMatch::GetTopicalityScore(
const int num_terms, const int num_terms,
const base::string16& url, const GURL& url,
const base::OffsetAdjuster::Adjustments& adjustments,
const WordStarts& terms_to_word_starts_offsets, const WordStarts& terms_to_word_starts_offsets,
const RowWordStarts& word_starts) { const RowWordStarts& word_starts) {
// A vector that accumulates per-term scores. The strongest match--a // A vector that accumulates per-term scores. The strongest match--a
...@@ -439,32 +451,24 @@ float ScoredHistoryMatch::GetTopicalityScore( ...@@ -439,32 +451,24 @@ float ScoredHistoryMatch::GetTopicalityScore(
word_starts.url_word_starts_.begin(); word_starts.url_word_starts_.begin();
WordStarts::const_iterator end_word_starts = WordStarts::const_iterator end_word_starts =
word_starts.url_word_starts_.end(); word_starts.url_word_starts_.end();
const size_t question_mark_pos = url.find('?');
const size_t colon_pos = url.find(':'); const size_t query_pos =
// The + 3 skips the // that probably appears in the protocol GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::QUERY);
// after the colon. If the protocol doesn't have two slashes after const size_t host_pos =
// the colon, that's okay--all this ends up doing is starting our GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::HOST);
// search for the next / a few characters into the hostname. The const size_t path_pos =
// only times this can cause problems is if we have a protocol without GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::PATH);
// a // after the colon and the hostname is only one or two characters.
// This isn't worth worrying about.
const size_t end_of_hostname_pos = (colon_pos != std::string::npos)
? url.find('/', colon_pos + 3)
: url.find('/');
size_t last_part_of_hostname_pos = (end_of_hostname_pos != std::string::npos)
? url.rfind('.', end_of_hostname_pos)
: url.rfind('.');
// Loop through all URL matches and score them appropriately. // Loop through all URL matches and score them appropriately.
// First, filter all matches not at a word boundary and in the path (or // First, filter all matches not at a word boundary and in the path (or
// later). // later).
url_matches = FilterTermMatchesByWordStarts( url_matches = FilterTermMatchesByWordStarts(
url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
end_of_hostname_pos, std::string::npos); path_pos, std::string::npos);
if (colon_pos != std::string::npos) { if (url.has_scheme()) {
// Also filter matches not at a word boundary and in the scheme. // Also filter matches not at a word boundary and in the scheme.
url_matches = FilterTermMatchesByWordStarts( url_matches = FilterTermMatchesByWordStarts(
url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_, url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
0, colon_pos); 0, host_pos);
} }
for (const auto& url_match : url_matches) { for (const auto& url_match : url_matches) {
// Calculate the offset in the URL string where the meaningful (word) part // Calculate the offset in the URL string where the meaningful (word) part
...@@ -480,21 +484,22 @@ float ScoredHistoryMatch::GetTopicalityScore( ...@@ -480,21 +484,22 @@ float ScoredHistoryMatch::GetTopicalityScore(
} }
const bool at_word_boundary = (next_word_starts != end_word_starts) && const bool at_word_boundary = (next_word_starts != end_word_starts) &&
(*next_word_starts == term_word_offset); (*next_word_starts == term_word_offset);
if ((question_mark_pos != std::string::npos) && if (term_word_offset >= query_pos) {
(term_word_offset >= question_mark_pos)) { // The match is in the query or ref component.
// The match is in a CGI ?... fragment.
DCHECK(at_word_boundary); DCHECK(at_word_boundary);
term_scores[url_match.term_num] += 5; term_scores[url_match.term_num] += 5;
} else if ((end_of_hostname_pos != std::string::npos) && } else if (term_word_offset >= path_pos) {
(term_word_offset >= end_of_hostname_pos)) { // The match is in the path component.
// The match is in the path.
DCHECK(at_word_boundary); DCHECK(at_word_boundary);
term_scores[url_match.term_num] += 8; term_scores[url_match.term_num] += 8;
} else if ((colon_pos == std::string::npos) || } else if (term_word_offset >= host_pos) {
(term_word_offset >= colon_pos)) { // Get the position of the last period in the hostname.
// The match is in the hostname. const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
if ((last_part_of_hostname_pos == std::string::npos) || size_t last_part_of_host_pos = url.possibly_invalid_spec().rfind(
(term_word_offset < last_part_of_hostname_pos)) { '.', parsed.CountCharactersBefore(url::Parsed::PATH, true));
base::OffsetAdjuster::AdjustOffset(adjustments, &last_part_of_host_pos);
if (term_word_offset < last_part_of_host_pos) {
// Either there are no dots in the hostname or this match isn't // Either there are no dots in the hostname or this match isn't
// the last dotted component. // the last dotted component.
term_scores[url_match.term_num] += at_word_boundary ? 10 : 2; term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "base/gtest_prod_util.h" #include "base/gtest_prod_util.h"
#include "base/strings/string16.h" #include "base/strings/string16.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "base/time/time.h" #include "base/time/time.h"
#include "components/history/core/browser/history_match.h" #include "components/history/core/browser/history_match.h"
#include "components/history/core/browser/history_types.h" #include "components/history/core/browser/history_types.h"
...@@ -122,9 +123,10 @@ struct ScoredHistoryMatch : public history::HistoryMatch { ...@@ -122,9 +123,10 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
// url_matches and title_matches in the process so they only reflect matches // url_matches and title_matches in the process so they only reflect matches
// used for scoring. (For instance, some mid-word matches are not given // used for scoring. (For instance, some mid-word matches are not given
// credit in scoring.) Requires that |url_matches| and |title_matches| are // credit in scoring.) Requires that |url_matches| and |title_matches| are
// sorted. // sorted. |adjustments| must contain any adjustments used to format |url|.
float GetTopicalityScore(const int num_terms, float GetTopicalityScore(const int num_terms,
const base::string16& cleaned_up_url, const GURL& url,
const base::OffsetAdjuster::Adjustments& adjustments,
const WordStarts& terms_to_word_starts_offsets, const WordStarts& terms_to_word_starts_offsets,
const RowWordStarts& word_starts); const RowWordStarts& word_starts);
......
...@@ -66,7 +66,7 @@ class ScoredHistoryMatchTest : public testing::Test { ...@@ -66,7 +66,7 @@ class ScoredHistoryMatchTest : public testing::Test {
// GetTopicalityScore(). It only works for scoring a single term, not // GetTopicalityScore(). It only works for scoring a single term, not
// multiple terms. // multiple terms.
float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term, float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
const base::string16& url, const GURL& url,
const base::string16& title); const base::string16& title);
}; };
...@@ -107,7 +107,7 @@ String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1, ...@@ -107,7 +107,7 @@ String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
const base::string16& term, const base::string16& term,
const base::string16& url, const GURL& url,
const base::string16& title) { const base::string16& title) {
String16Vector term_vector = {term}; String16Vector term_vector = {term};
WordStarts term_word_starts = {0}; WordStarts term_word_starts = {0};
...@@ -119,16 +119,18 @@ float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle( ...@@ -119,16 +119,18 @@ float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
term_word_starts[0] = iter.prev(); term_word_starts[0] = iter.prev();
} }
RowWordStarts row_word_starts; RowWordStarts row_word_starts;
String16SetFromString16(url, &row_word_starts.url_word_starts_); base::string16 url_string = base::UTF8ToUTF16(url.spec());
String16SetFromString16(url_string, &row_word_starts.url_word_starts_);
String16SetFromString16(title, &row_word_starts.title_word_starts_); String16SetFromString16(title, &row_word_starts.title_word_starts_);
ScoredHistoryMatch scored_match(history::URLRow(GURL(url)), VisitInfoVector(), ScoredHistoryMatch scored_match(history::URLRow(GURL(url)), VisitInfoVector(),
term, term_vector, term_word_starts, term, term_vector, term_word_starts,
row_word_starts, false, 1, base::Time::Max()); row_word_starts, false, 1, base::Time::Max());
scored_match.url_matches = MatchTermInString(term, url, 0); scored_match.url_matches = MatchTermInString(term, url_string, 0);
scored_match.title_matches = MatchTermInString(term, title, 0); scored_match.title_matches = MatchTermInString(term, title, 0);
scored_match.topicality_threshold_ = -1; scored_match.topicality_threshold_ = -1;
return scored_match.GetTopicalityScore(1, url, term_word_starts, return scored_match.GetTopicalityScore(1, url,
row_word_starts); base::OffsetAdjuster::Adjustments(),
term_word_starts, row_word_starts);
} }
TEST_F(ScoredHistoryMatchTest, Scoring) { TEST_F(ScoredHistoryMatchTest, Scoring) {
...@@ -350,10 +352,10 @@ TEST_F(ScoredHistoryMatchTest, Inlining) { ...@@ -350,10 +352,10 @@ TEST_F(ScoredHistoryMatchTest, Inlining) {
TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) { TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle( const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
ASCIIToUTF16("def"), ASCIIToUTF16("http://abc.def.com/"), ASCIIToUTF16("def"), GURL("http://abc.def.com/"),
ASCIIToUTF16("Non-Matching Title")); ASCIIToUTF16("Non-Matching Title"));
const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle( const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
ASCIIToUTF16("def"), ASCIIToUTF16("http://abc.def.com"), ASCIIToUTF16("def"), GURL("http://abc.def.com"),
ASCIIToUTF16("Non-Matching Title")); ASCIIToUTF16("Non-Matching Title"));
EXPECT_EQ(hostname_no_slash, hostname); EXPECT_EQ(hostname_no_slash, hostname);
} }
...@@ -615,9 +617,7 @@ TEST_F(ScoredHistoryMatchTest, GetDocumentSpecificityScore) { ...@@ -615,9 +617,7 @@ TEST_F(ScoredHistoryMatchTest, GetDocumentSpecificityScore) {
// This function only tests scoring of single terms that match exactly // This function only tests scoring of single terms that match exactly
// once somewhere in the URL or title. // once somewhere in the URL or title.
TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) { TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
base::string16 url = ASCIIToUTF16( GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_component");
"http://abc.def.com/path1/path2?"
"arg1=val1&arg2=val2#hash_component");
base::string16 title = ASCIIToUTF16("here is a title"); base::string16 title = ASCIIToUTF16("here is a title");
auto Score = [&](const char* term) { auto Score = [&](const char* term) {
return GetTopicalityScoreOfTermAgainstURLAndTitle(ASCIIToUTF16(term), url, return GetTopicalityScoreOfTermAgainstURLAndTitle(ASCIIToUTF16(term), url,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment