Commit 6b54c507 authored by Mustafa Emre Acer, committed by Commit Bot

Lookalike URLs: Implement edit distance for engaged sites

This CL adds the ability to compare the navigated domain against engaged sites
using edit distance. It also moves some code around without making functional
changes.

Bug: 942160
Change-Id: I8d0de1f6cebfb10fefef2e379a84e3818d759daa
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1555004
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Reviewed-by: Joe DeBlasio <jdeblasio@chromium.org>
Cr-Commit-Position: refs/heads/master@{#649693}
parent 0307cf30
......@@ -31,10 +31,11 @@ class LookalikeUrlInterstitialPage
kTopSite = 1,
kSiteEngagement = 2,
kEditDistance = 3,
kEditDistanceSiteEngagement = 4,
// Append new items to the end of the list above; do not modify or replace
// existing values. Comment out obsolete items.
kMaxValue = kEditDistance,
kMaxValue = kEditDistanceSiteEngagement,
};
// Used for UKM. There is only a single UserAction per navigation.
......
......@@ -26,6 +26,12 @@ namespace lookalikes {
struct DomainInfo;
// Returns true if the Levenshtein (edit) distance between |str1| and |str2| is
// at most one, i.e. the strings are equal or differ by a single character
// insertion, deletion or substitution. This has O(max(n,m)) complexity as
// opposed to O(n*m) of the usual edit distance computation.
bool IsEditDistanceAtMostOne(const base::string16& str1,
const base::string16& str2);
// Observes navigations and shows an interstitial if the navigated domain name
// is visually similar to a top domain or a domain with a site engagement score.
class LookalikeUrlNavigationThrottle : public content::NavigationThrottle {
......@@ -39,13 +45,13 @@ class LookalikeUrlNavigationThrottle : public content::NavigationThrottle {
kMatchTopSite = 3,
kMatchSiteEngagement = 4,
kMatchEditDistance = 5,
kMatchEditDistanceSiteEngagement = 6,
// Append new items to the end of the list above; do not modify or
// replace existing values. Comment out obsolete items.
kMaxValue = kMatchEditDistance,
kMaxValue = kMatchEditDistanceSiteEngagement,
};
static const char kHistogramName[];
explicit LookalikeUrlNavigationThrottle(content::NavigationHandle* handle);
......@@ -77,6 +83,9 @@ class LookalikeUrlNavigationThrottle : public content::NavigationThrottle {
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites);
bool ShouldDisplayInterstitial(
LookalikeUrlInterstitialPage::MatchType match_type) const;
// Returns true if a domain is visually similar to the hostname of |url|. The
// matching domain can be a top domain or an engaged site. Similarity check
// is made using both visual skeleton and edit distance comparison. If this
......@@ -87,16 +96,6 @@ class LookalikeUrlNavigationThrottle : public content::NavigationThrottle {
std::string* matched_domain,
LookalikeUrlInterstitialPage::MatchType* match_type);
// Returns if the Levenshtein distance between |str1| and |str2| is at most 1.
// This has O(max(n,m)) complexity as opposed to O(n*m) of the usual edit
// distance computation.
static bool IsEditDistanceAtMostOne(const base::string16& str1,
const base::string16& str2);
// Returns the first matching top domain with an edit distance of at most one
// to |domain_and_registry|.
static std::string GetSimilarDomainFromTop500(const DomainInfo& domain_info);
ThrottleCheckResult ShowInterstitial(
const GURL& safe_domain,
const GURL& url,
......
......@@ -472,6 +472,34 @@ IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
CheckNoUkm();
}
// Navigate to a domain within an edit distance of 1 to an engaged domain.
// This should record metrics, but should not show a lookalike warning
// interstitial yet.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
EditDistance_EngagedDomain_Match) {
base::HistogramTester histograms;
// Give the legitimate site a high engagement score so that it is a candidate
// for comparison against the navigated domain.
SetEngagementScore(browser(), GURL("https://test-site.com"), kHighEngagement);
// The skeleton of this domain is one edit away from the skeleton of
// test-site.com.
const GURL kNavigatedUrl = GetURL("best-sité.com");
// Even if the navigated site has a low engagement score, it should be
// considered for lookalike suggestions.
SetEngagementScore(browser(), kNavigatedUrl, kLowEngagement);
// Advance clock to force a fetch of new engaged sites list.
test_clock()->Advance(base::TimeDelta::FromHours(1));
// Edit distance matches only record metrics for now; no interstitial is
// expected.
TestInterstitialNotShown(browser(), kNavigatedUrl);
// Exactly one histogram sample is expected, and it must land in the
// edit-distance-against-engaged-site bucket.
histograms.ExpectTotalCount(LookalikeUrlNavigationThrottle::kHistogramName,
1);
histograms.ExpectBucketCount(
LookalikeUrlNavigationThrottle::kHistogramName,
NavigationSuggestionEvent::kMatchEditDistanceSiteEngagement, 1);
// The UKM check is made for the navigated URL with the engaged-site edit
// distance match type.
CheckUkm({kNavigatedUrl}, "MatchType",
MatchType::kEditDistanceSiteEngagement);
}
// Navigate to a domain within an edit distance of 1 to a top domain.
// This should record metrics, but should not show a lookalike warning
// interstitial yet.
......@@ -512,6 +540,29 @@ IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
CheckNoUkm();
}
// Tests negative examples for the edit distance with engaged sites.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
EditDistance_SiteEngagement_NoMatch) {
// Mark three sites as highly engaged. Each one is a near match for one of the
// navigations below, but every navigation is expected to be ignored for the
// reason given in its comment.
SetEngagementScore(browser(), GURL("https://test-site.com.tr"),
kHighEngagement);
SetEngagementScore(browser(), GURL("https://1234.com"), kHighEngagement);
SetEngagementScore(browser(), GURL("https://gooogle.com"), kHighEngagement);
// Advance clock to force a fetch of new engaged sites list.
test_clock()->Advance(base::TimeDelta::FromHours(1));
// Matches test-site.com.tr but only differs in registry.
TestInterstitialNotShown(browser(), GetURL("test-site.com.tw"));
CheckNoUkm();
// Matches gooogle.com but is a top domain itself.
TestInterstitialNotShown(browser(), GetURL("google.com"));
CheckNoUkm();
// Matches 1234.com but is too short.
TestInterstitialNotShown(browser(), GetURL("123.com"));
CheckNoUkm();
}
// Test that the heuristics are triggered even with net errors.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
NetError_Interstitial) {
......
......@@ -61,9 +61,9 @@ TEST(LookalikeUrlNavigationThrottleTest, IsEditDistanceAtMostOne) {
{L"google.com", L"goooglé.com", false},
};
for (const TestCase& test_case : kTestCases) {
bool result = LookalikeUrlNavigationThrottle::IsEditDistanceAtMostOne(
base::WideToUTF16(test_case.domain),
base::WideToUTF16(test_case.top_domain));
bool result =
IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
base::WideToUTF16(test_case.top_domain));
EXPECT_EQ(test_case.expected, result);
}
}
......
......@@ -21,6 +21,7 @@
#include "components/content_settings/core/browser/host_content_settings_map.h"
#include "components/keyed_service/content/browser_context_dependency_manager.h"
#include "components/keyed_service/content/browser_context_keyed_service_factory.h"
#include "components/url_formatter/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
......@@ -78,9 +79,11 @@ std::string GetETLDPlusOne(const std::string& hostname) {
}
DomainInfo::DomainInfo(const std::string& arg_domain_and_registry,
const std::string& arg_domain_without_registry,
const url_formatter::IDNConversionResult& arg_idn_result,
const url_formatter::Skeletons& arg_skeletons)
: domain_and_registry(arg_domain_and_registry),
domain_without_registry(arg_domain_without_registry),
idn_result(arg_idn_result),
skeletons(arg_skeletons) {}
......@@ -91,10 +94,16 @@ DomainInfo::DomainInfo(const DomainInfo&) = default;
DomainInfo GetDomainInfo(const GURL& url) {
// Perform all computations on eTLD+1.
const std::string domain_and_registry = GetETLDPlusOne(url.host());
const std::string domain_without_registry =
domain_and_registry.empty()
? std::string()
: url_formatter::top_domains::HostnameWithoutRegistry(
domain_and_registry);
// eTLD+1 can be empty for private domains.
if (domain_and_registry.empty()) {
return DomainInfo(domain_and_registry, url_formatter::IDNConversionResult(),
return DomainInfo(domain_and_registry, domain_without_registry,
url_formatter::IDNConversionResult(),
url_formatter::Skeletons());
}
// Compute skeletons using eTLD+1, skipping all spoofing checks. Spoofing
......@@ -105,7 +114,8 @@ DomainInfo GetDomainInfo(const GURL& url) {
url_formatter::UnsafeIDNToUnicodeWithDetails(domain_and_registry);
const url_formatter::Skeletons skeletons =
url_formatter::GetSkeletons(idn_result.result);
return DomainInfo(domain_and_registry, idn_result, skeletons);
return DomainInfo(domain_and_registry, domain_without_registry, idn_result,
skeletons);
}
LookalikeUrlService::LookalikeUrlService(Profile* profile)
......
......@@ -34,13 +34,19 @@ std::string GetETLDPlusOne(const std::string& hostname);
struct DomainInfo {
// eTLD+1, used for skeleton and edit distance comparison. Must be ASCII.
// Can be empty.
const std::string domain_and_registry;
// eTLD+1 without the registry part. For "www.google.com", this will be
// "google". Used for edit distance comparisons. Can be empty.
const std::string domain_without_registry;
// Result of IDN conversion of domain_and_registry field.
const url_formatter::IDNConversionResult idn_result;
// Skeletons of domain_and_registry field.
const url_formatter::Skeletons skeletons;
DomainInfo(const std::string& arg_domain_and_registry,
const std::string& arg_domain_without_registry,
const url_formatter::IDNConversionResult& arg_idn_result,
const url_formatter::Skeletons& arg_skeletons);
~DomainInfo();
......
......@@ -38032,7 +38032,12 @@ Called by update_use_counter_css.py.-->
<int value="3" label="A navigation suggestion is found using top sites list"/>
<int value="4"
label="A navigation suggestion is found using site engagement"/>
<int value="5" label="A navigation suggestion is found using edit distance"/>
<int value="5"
label="A navigation suggestion is found using edit distance against a
top domain"/>
<int value="6"
label="A navigation suggestion is found using edit distance against an
engaged site"/>
</enum>
<enum name="NavigationWasServedFromCache">
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment