Commit 958c7e4e authored by Mikel Astiz's avatar Mikel Astiz Committed by Commit Bot

Fix TopSites duplicates due to missing canonicalization

Prior to this patch, TopSites only stripped the "www." prefix when
mapping a URL to a visit segment, which reflects how tiles will be
formed (deduplicated, or lack thereof) on the NTP.

We update this logic to handle common prefixes on mobile, as well as
ignoring the distinction between http and https.

The proposed implementation involves a history DB upgrade, which
processes the persisted segment names with the new version of the
function.

Bug: 784337
Change-Id: Ibe6b609bf22df28866b069efd47acc483389509d
Reviewed-on: https://chromium-review.googlesource.com/766747Reviewed-by: default avatarBrett Wilson <brettw@chromium.org>
Commit-Queue: Mikel Astiz <mastiz@chromium.org>
Cr-Commit-Position: refs/heads/master@{#517068}
parent c13cae8d
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
#include "components/history/core/browser/page_usage_data.h" #include "components/history/core/browser/page_usage_data.h"
#include "components/history/core/test/history_backend_db_base_test.h" #include "components/history/core/test/history_backend_db_base_test.h"
#include "components/history/core/test/test_history_database.h" #include "components/history/core/test/test_history_database.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace history { namespace history {
namespace { namespace {
...@@ -1330,6 +1332,118 @@ TEST_F(HistoryBackendDBTest, CheckLastCompatibleVersion) { ...@@ -1330,6 +1332,118 @@ TEST_F(HistoryBackendDBTest, CheckLastCompatibleVersion) {
} }
} }
// Tests that visit segment names are recomputed and segments merged when
// migrating to version 37.
TEST_F(HistoryBackendDBTest, MigrateVisitSegmentNames) {
ASSERT_NO_FATAL_FAILURE(CreateDBVersion(32));
const SegmentID segment_id1 = 7;
const SegmentID segment_id2 = 8;
const URLID url_id1 = 3;
const URLID url_id2 = 4;
const GURL url1("http://www.foo.com");
const GURL url2("http://m.foo.com");
const std::string legacy_segment_name1("http://foo.com/");
const std::string legacy_segment_name2("http://m.foo.com/");
const base::string16 title1(base::ASCIIToUTF16("Title1"));
const base::string16 title2(base::ASCIIToUTF16("Title2"));
const base::Time segment_time(base::Time::Now());
{
// Open the db for manual manipulation.
sql::Connection db;
ASSERT_TRUE(db.Open(history_dir_.Append(kHistoryFilename)));
// Add first entry to urls.
{
sql::Statement s(
db.GetUniqueStatement("INSERT INTO urls "
"(id, url, title, last_visit_time) VALUES "
"(?, ?, ?, ?)"));
s.BindInt64(0, url_id1);
s.BindString(1, url1.spec());
s.BindString16(2, title1);
s.BindInt64(3, segment_time.ToInternalValue());
ASSERT_TRUE(s.Run());
}
// Add first entry to segments.
{
sql::Statement s(
db.GetUniqueStatement("INSERT INTO segments "
"(id, name, url_id) VALUES "
"(?, ?, ?)"));
s.BindInt64(0, segment_id1);
s.BindString(1, legacy_segment_name1);
s.BindInt64(2, url_id1);
ASSERT_TRUE(s.Run());
}
// And first to segment_usage.
{
sql::Statement s(db.GetUniqueStatement(
"INSERT INTO segment_usage "
"(id, segment_id, time_slot, visit_count) VALUES "
"(?, ?, ?, ?)"));
s.BindInt64(0, 4); // id.
s.BindInt64(1, segment_id1);
s.BindInt64(2, segment_time.ToInternalValue());
s.BindInt(3, 11); // visit count.
ASSERT_TRUE(s.Run());
}
// Add second entry to urls.
{
sql::Statement s(
db.GetUniqueStatement("INSERT INTO urls "
"(id, url, title, last_visit_time) VALUES "
"(?, ?, ?, ?)"));
s.BindInt64(0, url_id2);
s.BindString(1, url2.spec());
s.BindString16(2, title2);
s.BindInt64(3, segment_time.ToInternalValue());
ASSERT_TRUE(s.Run());
}
// Add second entry to segments.
{
sql::Statement s(
db.GetUniqueStatement("INSERT INTO segments "
"(id, name, url_id) VALUES "
"(?, ?, ?)"));
s.BindInt64(0, segment_id2);
s.BindString(1, legacy_segment_name2);
s.BindInt64(2, url_id2);
ASSERT_TRUE(s.Run());
}
// And second to segment_usage.
{
sql::Statement s(db.GetUniqueStatement(
"INSERT INTO segment_usage "
"(id, segment_id, time_slot, visit_count) VALUES "
"(?, ?, ?, ?)"));
s.BindInt64(0, 5); // id.
s.BindInt64(1, segment_id2);
s.BindInt64(2, segment_time.ToInternalValue());
s.BindInt(3, 13); // visit count.
ASSERT_TRUE(s.Run());
}
}
// Re-open the db, triggering migration.
CreateBackendAndDatabase();
std::vector<std::unique_ptr<PageUsageData>> results =
db_->QuerySegmentUsage(segment_time, /*max_result_count=*/10,
base::Callback<bool(const GURL&)>());
ASSERT_EQ(1u, results.size());
EXPECT_THAT(results[0]->GetURL(), testing::AnyOf(url1, url2));
EXPECT_THAT(results[0]->GetTitle(), testing::AnyOf(title1, title2));
EXPECT_EQ(segment_id1, db_->GetSegmentNamed(legacy_segment_name1));
EXPECT_EQ(0u, db_->GetSegmentNamed(legacy_segment_name2));
}
bool FilterURL(const GURL& url) { bool FilterURL(const GURL& url) {
return url.SchemeIsHTTPOrHTTPS(); return url.SchemeIsHTTPOrHTTPS();
} }
......
...@@ -37,7 +37,7 @@ namespace { ...@@ -37,7 +37,7 @@ namespace {
// Current version number. We write databases at the "current" version number, // Current version number. We write databases at the "current" version number,
// but any previous version that can read the "compatible" one can make do with // but any previous version that can read the "compatible" one can make do with
// our database without *too* many bad effects. // our database without *too* many bad effects.
const int kCurrentVersionNumber = 37; const int kCurrentVersionNumber = 38;
const int kCompatibleVersionNumber = 16; const int kCompatibleVersionNumber = 16;
const char kEarlyExpirationThresholdKey[] = "early_expiration_threshold"; const char kEarlyExpirationThresholdKey[] = "early_expiration_threshold";
const int kMaxHostsInMemory = 10000; const int kMaxHostsInMemory = 10000;
...@@ -590,6 +590,13 @@ sql::InitStatus HistoryDatabase::EnsureCurrentVersion() { ...@@ -590,6 +590,13 @@ sql::InitStatus HistoryDatabase::EnsureCurrentVersion() {
meta_table_.SetVersionNumber(cur_version); meta_table_.SetVersionNumber(cur_version);
} }
if (cur_version == 37) {
if (!MigrateVisitSegmentNames())
return LogMigrationFailure(37);
cur_version++;
meta_table_.SetVersionNumber(cur_version);
}
// ========================= ^^ new migration code goes here ^^ // ========================= ^^ new migration code goes here ^^
// ADDING NEW MIGRATION CODE // ADDING NEW MIGRATION CODE
// ========================= // =========================
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "base/logging.h" #include "base/logging.h"
#include "base/macros.h" #include "base/macros.h"
#include "base/memory/ptr_util.h" #include "base/memory/ptr_util.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h" #include "base/strings/string_util.h"
#include "components/history/core/browser/page_usage_data.h" #include "components/history/core/browser/page_usage_data.h"
#include "sql/statement.h" #include "sql/statement.h"
...@@ -104,17 +105,19 @@ std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) { ...@@ -104,17 +105,19 @@ std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
// TODO(brettw) this should probably use the registry controlled // TODO(brettw) this should probably use the registry controlled
// domains service. // domains service.
GURL::Replacements r; GURL::Replacements r;
const char kWWWDot[] = "www.";
const int kWWWDotLen = arraysize(kWWWDot) - 1;
std::string host = url.host(); std::string host = url.host();
// Remove www. to avoid some dups.
if (static_cast<int>(host.size()) > kWWWDotLen && // Strip various common prefixes in order to group the resulting hostnames
base::StartsWith(host, kWWWDot, base::CompareCase::INSENSITIVE_ASCII)) { // together and avoid duplicates.
r.SetHost(host.c_str(), for (base::StringPiece prefix : {"www.", "m.", "mobile.", "touch."}) {
url::Component(kWWWDotLen, if (host.size() > prefix.size() &&
static_cast<int>(host.size()) - kWWWDotLen)); base::StartsWith(host, prefix, base::CompareCase::INSENSITIVE_ASCII)) {
r.SetHost(host.c_str(),
url::Component(prefix.size(), host.size() - prefix.size()));
break;
}
} }
// Remove other stuff we don't want. // Remove other stuff we don't want.
r.ClearUsername(); r.ClearUsername();
r.ClearPassword(); r.ClearPassword();
...@@ -122,6 +125,10 @@ std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) { ...@@ -122,6 +125,10 @@ std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) {
r.ClearRef(); r.ClearRef();
r.ClearPort(); r.ClearPort();
// Canonicalize https to http in order to avoid duplicates.
if (url.SchemeIs(url::kHttpsScheme))
r.SetSchemeStr(url::kHttpScheme);
return url.ReplaceComponents(r).spec(); return url.ReplaceComponents(r).spec();
} }
...@@ -324,4 +331,84 @@ bool VisitSegmentDatabase::MigratePresentationIndex() { ...@@ -324,4 +331,84 @@ bool VisitSegmentDatabase::MigratePresentationIndex() {
transaction.Commit(); transaction.Commit();
} }
bool VisitSegmentDatabase::MigrateVisitSegmentNames() {
sql::Statement select(
GetDB().GetUniqueStatement("SELECT id, name FROM segments"));
if (!select.is_valid())
return false;
bool success = true;
while (select.Step()) {
SegmentID id = select.ColumnInt64(0);
std::string old_name = select.ColumnString(1);
std::string new_name = ComputeSegmentName(GURL(old_name));
if (new_name.empty() || old_name == new_name)
continue;
SegmentID to_segment_id = GetSegmentNamed(new_name);
if (to_segment_id) {
// |new_name| is already in use, so merge.
success = success && MergeSegments(/*from_segment_id=*/id, to_segment_id);
} else {
// Trivial rename of the segment.
success = success && RenameSegment(id, new_name);
}
}
return success;
}
bool VisitSegmentDatabase::RenameSegment(SegmentID segment_id,
const std::string& new_name) {
sql::Statement statement(GetDB().GetCachedStatement(
SQL_FROM_HERE, "UPDATE segments SET name = ? WHERE id = ?"));
statement.BindString(0, new_name);
statement.BindInt64(1, segment_id);
return statement.Run();
}
bool VisitSegmentDatabase::MergeSegments(SegmentID from_segment_id,
SegmentID to_segment_id) {
sql::Transaction transaction(&GetDB());
if (!transaction.Begin())
return false;
// For each time slot where there are visits for the absorbed segment
// (|from_segment_id|), add them to the absorbing/staying segment
// (|to_segment_id|).
sql::Statement select(
GetDB().GetCachedStatement(SQL_FROM_HERE,
"SELECT time_slot, visit_count FROM "
"segment_usage WHERE segment_id = ?"));
select.BindInt64(0, from_segment_id);
while (select.Step()) {
base::Time ts = base::Time::FromInternalValue(select.ColumnInt64(0));
int64_t visit_count = select.ColumnInt64(1);
IncreaseSegmentVisitCount(to_segment_id, ts, visit_count);
}
// Update all references in the visits database.
sql::Statement update(GetDB().GetCachedStatement(
SQL_FROM_HERE, "UPDATE visits SET segment_id = ? WHERE segment_id = ?"));
update.BindInt64(0, to_segment_id);
update.BindInt64(1, from_segment_id);
if (!update.Run())
return false;
// Delete old segment usage data.
sql::Statement deletion1(GetDB().GetCachedStatement(
SQL_FROM_HERE, "DELETE FROM segment_usage WHERE segment_id = ?"));
deletion1.BindInt64(0, from_segment_id);
if (!deletion1.Run())
return false;
// Delete old segment data.
sql::Statement deletion2(GetDB().GetCachedStatement(
SQL_FROM_HERE, "DELETE FROM segments WHERE id = ?"));
deletion2.BindInt64(0, from_segment_id);
if (!deletion2.Run())
return false;
return transaction.Commit();
}
} // namespace history } // namespace history
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#define COMPONENTS_HISTORY_CORE_BROWSER_VISITSEGMENT_DATABASE_H_ #define COMPONENTS_HISTORY_CORE_BROWSER_VISITSEGMENT_DATABASE_H_
#include <memory> #include <memory>
#include <string>
#include "base/callback_forward.h" #include "base/callback_forward.h"
#include "base/macros.h" #include "base/macros.h"
...@@ -87,7 +88,21 @@ class VisitSegmentDatabase { ...@@ -87,7 +88,21 @@ class VisitSegmentDatabase {
// presentation table is removed entirely. // presentation table is removed entirely.
bool MigratePresentationIndex(); bool MigratePresentationIndex();
// Runs ComputeSegmentName() to recompute 'name'. If multiple segments have
// the same name, they are merged by:
// 1. Choosing one arbitrary |segment_id| and updating all references.
// 2. Merging duplicate |segment_usage| entries (add up visit counts).
// 3. Deleting old data for the absorbed segment.
bool MigrateVisitSegmentNames();
private: private:
// Updates the |name| column for a single segment. Returns true on success.
bool RenameSegment(SegmentID segment_id, const std::string& new_name);
// Merges two segments such that data is aggregated, all former references to
// |from_segment_id| are updated to |to_segment_id| and |from_segment_id| is
// deleted. Returns true on success.
bool MergeSegments(SegmentID from_segment_id, SegmentID to_segment_id);
DISALLOW_COPY_AND_ASSIGN(VisitSegmentDatabase); DISALLOW_COPY_AND_ASSIGN(VisitSegmentDatabase);
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment