Commit efcd3a7e authored by Tommy C. Li, committed by Commit Bot

Omnibox: Refactor ScoredHistoryMatch URL parsing

Previously, we determined which part of the URL the match was in by
searching the formatted URL string (in string16 form) for delimiters.

After this refactor, we do this by using the "official" GURL component
offsets adjusted for URL formatting.
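Concretely: url::Parsed reports where each component begins in the raw
spec, and base::OffsetAdjuster maps that spec offset into the formatted
string. A self-contained sketch of the idea (illustrative only; the
function name is ours, and the hand-built adjustment stands in for
whatever the formatter actually recorded):

  #include "base/strings/utf_offset_string_conversions.h"
  #include "url/gurl.h"

  size_t QueryPosInFormattedUrl() {
    GURL gurl("http://abc.def.com/a%20b/c?q=1");
    // Where the query begins in the raw spec: 26, the position of '?'.
    size_t query_pos =
        gurl.parsed_for_possibly_invalid_spec().CountCharactersBefore(
            url::Parsed::QUERY, true);
    // Suppose formatting unescaped "%20" (3 chars at offset 20) to " "
    // (1 char); the formatter records that as an adjustment.
    base::OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(base::OffsetAdjuster::Adjustment(20, 3, 1));
    // Map the spec offset into the formatted string: 26 becomes 24.
    base::OffsetAdjuster::AdjustOffset(adjustments, &query_pos);
    return query_pos;
  }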

This is worthwhile both as a refactor and as groundwork for adding
|match_in_subdomain| and |match_in_path| flags.
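With the adjusted component starts available, such flags become simple
range checks. A hypothetical sketch (no such helper is added here):

  // Hypothetical: a path match starts at or after the path and before
  // the query. A real |match_in_subdomain| would also need to locate
  // the registrable domain within the host.
  bool MatchInPath(size_t term_offset, size_t path_pos, size_t query_pos) {
    return term_offset >= path_pos && term_offset < query_pos;
  }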

This CL doesn't actually add the above flags; it is intended to behave
identically to the previous code (which is why no tests changed).
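For reference, with the adjusted starts in hand, the per-term URL
scoring in GetTopicalityScore() reduces to ordered comparisons. A
simplified sketch of that classification (the credit values are the
ones in this file; the function is ours, and the word-boundary
filtering and last-dotted-component check are omitted):

  int CreditForUrlMatch(size_t term_offset, size_t query_pos,
                        size_t path_pos, size_t host_pos,
                        bool at_word_boundary) {
    if (term_offset >= query_pos)
      return 5;                          // query or ref component
    if (term_offset >= path_pos)
      return 8;                          // path component
    if (term_offset >= host_pos)
      return at_word_boundary ? 10 : 2;  // hostname
    return 0;                            // scheme; filtered out in practice
  }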

Bug: 732582, 595524, 448659
Change-Id: I133c2ecb462597941b7284fd88f99e55f341f6b4
Reviewed-on: https://chromium-review.googlesource.com/564300
Commit-Queue: Tommy Li <tommycli@chromium.org>
Reviewed-by: Justin Donnelly <jdonnelly@chromium.org>
Cr-Commit-Position: refs/heads/master@{#485446}
parent bdb5508a
@@ -15,7 +15,6 @@
 #include "base/strings/string_number_conversions.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
-#include "base/strings/utf_offset_string_conversions.h"
 #include "base/strings/utf_string_conversions.h"
 #include "components/bookmarks/browser/bookmark_utils.h"
 #include "components/omnibox/browser/history_url_provider.h"
@@ -102,6 +101,17 @@ void InitDaysAgoToRecencyScoreArray() {
   }
 }
 
+size_t GetAdjustedOffsetForComponent(
+    const GURL& url,
+    const base::OffsetAdjuster::Adjustments& adjustments,
+    const url::Parsed::ComponentType& component) {
+  const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
+  size_t result = parsed.CountCharactersBefore(component, true);
+  base::OffsetAdjuster::AdjustOffset(adjustments, &result);
+  return result;
+}
+
 }  // namespace
 
 // static
@@ -254,8 +264,9 @@ ScoredHistoryMatch::ScoredHistoryMatch(
     }
   }
 
-  const float topicality_score = GetTopicalityScore(
-      terms_vector.size(), url, terms_to_word_starts_offsets, word_starts);
+  const float topicality_score =
+      GetTopicalityScore(terms_vector.size(), gurl, adjustments,
+                         terms_to_word_starts_offsets, word_starts);
   const float frequency_score = GetFrequency(now, is_url_bookmarked, visits);
   const float specificity_score =
       GetDocumentSpecificityScore(num_matching_pages);
@@ -425,7 +436,8 @@ void ScoredHistoryMatch::Init() {
 
 float ScoredHistoryMatch::GetTopicalityScore(
     const int num_terms,
-    const base::string16& url,
+    const GURL& url,
+    const base::OffsetAdjuster::Adjustments& adjustments,
     const WordStarts& terms_to_word_starts_offsets,
     const RowWordStarts& word_starts) {
   // A vector that accumulates per-term scores. The strongest match--a
@@ -439,32 +451,24 @@ float ScoredHistoryMatch::GetTopicalityScore(
       word_starts.url_word_starts_.begin();
   WordStarts::const_iterator end_word_starts =
       word_starts.url_word_starts_.end();
-  const size_t question_mark_pos = url.find('?');
-  const size_t colon_pos = url.find(':');
-  // The + 3 skips the // that probably appears in the protocol
-  // after the colon. If the protocol doesn't have two slashes after
-  // the colon, that's okay--all this ends up doing is starting our
-  // search for the next / a few characters into the hostname. The
-  // only times this can cause problems is if we have a protocol without
-  // a // after the colon and the hostname is only one or two characters.
-  // This isn't worth worrying about.
-  const size_t end_of_hostname_pos = (colon_pos != std::string::npos)
-                                         ? url.find('/', colon_pos + 3)
-                                         : url.find('/');
-  size_t last_part_of_hostname_pos = (end_of_hostname_pos != std::string::npos)
-                                         ? url.rfind('.', end_of_hostname_pos)
-                                         : url.rfind('.');
+  const size_t query_pos =
+      GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::QUERY);
+  const size_t host_pos =
+      GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::HOST);
+  const size_t path_pos =
+      GetAdjustedOffsetForComponent(url, adjustments, url::Parsed::PATH);
+
   // Loop through all URL matches and score them appropriately.
   // First, filter all matches not at a word boundary and in the path (or
   // later).
   url_matches = FilterTermMatchesByWordStarts(
       url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
-      end_of_hostname_pos, std::string::npos);
-  if (colon_pos != std::string::npos) {
+      path_pos, std::string::npos);
+  if (url.has_scheme()) {
     // Also filter matches not at a word boundary and in the scheme.
     url_matches = FilterTermMatchesByWordStarts(
         url_matches, terms_to_word_starts_offsets, word_starts.url_word_starts_,
-        0, colon_pos);
+        0, host_pos);
   }
   for (const auto& url_match : url_matches) {
     // Calculate the offset in the URL string where the meaningful (word) part
@@ -480,21 +484,22 @@ float ScoredHistoryMatch::GetTopicalityScore(
     }
     const bool at_word_boundary = (next_word_starts != end_word_starts) &&
                                   (*next_word_starts == term_word_offset);
-    if ((question_mark_pos != std::string::npos) &&
-        (term_word_offset >= question_mark_pos)) {
-      // The match is in a CGI ?... fragment.
+    if (term_word_offset >= query_pos) {
+      // The match is in the query or ref component.
       DCHECK(at_word_boundary);
       term_scores[url_match.term_num] += 5;
-    } else if ((end_of_hostname_pos != std::string::npos) &&
-               (term_word_offset >= end_of_hostname_pos)) {
-      // The match is in the path.
+    } else if (term_word_offset >= path_pos) {
+      // The match is in the path component.
       DCHECK(at_word_boundary);
       term_scores[url_match.term_num] += 8;
-    } else if ((colon_pos == std::string::npos) ||
-               (term_word_offset >= colon_pos)) {
-      // The match is in the hostname.
-      if ((last_part_of_hostname_pos == std::string::npos) ||
-          (term_word_offset < last_part_of_hostname_pos)) {
+    } else if (term_word_offset >= host_pos) {
+      // Get the position of the last period in the hostname.
+      const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
+      size_t last_part_of_host_pos = url.possibly_invalid_spec().rfind(
+          '.', parsed.CountCharactersBefore(url::Parsed::PATH, true));
+      base::OffsetAdjuster::AdjustOffset(adjustments, &last_part_of_host_pos);
+      if (term_word_offset < last_part_of_host_pos) {
         // Either there are no dots in the hostname or this match isn't
         // the last dotted component.
         term_scores[url_match.term_num] += at_word_boundary ? 10 : 2;
@@ -12,6 +12,7 @@
 
 #include "base/gtest_prod_util.h"
 #include "base/strings/string16.h"
+#include "base/strings/utf_offset_string_conversions.h"
 #include "base/time/time.h"
 #include "components/history/core/browser/history_match.h"
 #include "components/history/core/browser/history_types.h"
@@ -122,9 +123,10 @@ struct ScoredHistoryMatch : public history::HistoryMatch {
   // url_matches and title_matches in the process so they only reflect matches
   // used for scoring. (For instance, some mid-word matches are not given
   // credit in scoring.) Requires that |url_matches| and |title_matches| are
-  // sorted.
+  // sorted. |adjustments| must contain any adjustments used to format |url|.
   float GetTopicalityScore(const int num_terms,
-                           const base::string16& cleaned_up_url,
+                           const GURL& url,
+                           const base::OffsetAdjuster::Adjustments& adjustments,
                            const WordStarts& terms_to_word_starts_offsets,
                            const RowWordStarts& word_starts);
 
@@ -66,7 +66,7 @@ class ScoredHistoryMatchTest : public testing::Test {
   // GetTopicalityScore(). It only works for scoring a single term, not
   // multiple terms.
   float GetTopicalityScoreOfTermAgainstURLAndTitle(const base::string16& term,
-                                                   const base::string16& url,
+                                                   const GURL& url,
                                                    const base::string16& title);
 };
 
@@ -107,7 +107,7 @@ String16Vector ScoredHistoryMatchTest::Make2Terms(const char* term_1,
 
 float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
     const base::string16& term,
-    const base::string16& url,
+    const GURL& url,
     const base::string16& title) {
   String16Vector term_vector = {term};
   WordStarts term_word_starts = {0};
@@ -119,16 +119,18 @@ float ScoredHistoryMatchTest::GetTopicalityScoreOfTermAgainstURLAndTitle(
     term_word_starts[0] = iter.prev();
   }
   RowWordStarts row_word_starts;
-  String16SetFromString16(url, &row_word_starts.url_word_starts_);
+  base::string16 url_string = base::UTF8ToUTF16(url.spec());
+  String16SetFromString16(url_string, &row_word_starts.url_word_starts_);
   String16SetFromString16(title, &row_word_starts.title_word_starts_);
   ScoredHistoryMatch scored_match(history::URLRow(GURL(url)), VisitInfoVector(),
                                   term, term_vector, term_word_starts,
                                   row_word_starts, false, 1, base::Time::Max());
-  scored_match.url_matches = MatchTermInString(term, url, 0);
+  scored_match.url_matches = MatchTermInString(term, url_string, 0);
   scored_match.title_matches = MatchTermInString(term, title, 0);
   scored_match.topicality_threshold_ = -1;
-  return scored_match.GetTopicalityScore(1, url, term_word_starts,
-                                         row_word_starts);
+  return scored_match.GetTopicalityScore(1, url,
+                                         base::OffsetAdjuster::Adjustments(),
+                                         term_word_starts, row_word_starts);
 }
 
 TEST_F(ScoredHistoryMatchTest, Scoring) {
@@ -350,10 +352,10 @@ TEST_F(ScoredHistoryMatchTest, Inlining) {
 
 TEST_F(ScoredHistoryMatchTest, GetTopicalityScoreTrailingSlash) {
   const float hostname = GetTopicalityScoreOfTermAgainstURLAndTitle(
-      ASCIIToUTF16("def"), ASCIIToUTF16("http://abc.def.com/"),
+      ASCIIToUTF16("def"), GURL("http://abc.def.com/"),
       ASCIIToUTF16("Non-Matching Title"));
   const float hostname_no_slash = GetTopicalityScoreOfTermAgainstURLAndTitle(
-      ASCIIToUTF16("def"), ASCIIToUTF16("http://abc.def.com"),
+      ASCIIToUTF16("def"), GURL("http://abc.def.com"),
       ASCIIToUTF16("Non-Matching Title"));
   EXPECT_EQ(hostname_no_slash, hostname);
 }
@@ -615,9 +617,7 @@ TEST_F(ScoredHistoryMatchTest, GetDocumentSpecificityScore) {
 // This function only tests scoring of single terms that match exactly
 // once somewhere in the URL or title.
 TEST_F(ScoredHistoryMatchTest, GetTopicalityScore) {
-  base::string16 url = ASCIIToUTF16(
-      "http://abc.def.com/path1/path2?"
-      "arg1=val1&arg2=val2#hash_component");
+  GURL url("http://abc.def.com/path1/path2?arg1=val1&arg2=val2#hash_component");
   base::string16 title = ASCIIToUTF16("here is a title");
   auto Score = [&](const char* term) {
     return GetTopicalityScoreOfTermAgainstURLAndTitle(ASCIIToUTF16(term), url,