Commit fc6f3bb6 authored by Joe DeBlasio's avatar Joe DeBlasio Committed by Commit Bot

[Lookalikes][Safety Tips] Ignore common edit distance false positives.

This CL improves edit distance lookalike matching by not warning on two
classes of matches that are commonly false positives:
 1. Numeric suffixes, such as domain1.tld and domain2.tld.
 2. First-character edits, such as domain.tld and xdomain.tld.

Bug: 1096309
Change-Id: I9cdf46060ef00949435c7429b86a835de796fb49
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2255025
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: default avatarLivvie Lin <livvielin@chromium.org>
Reviewed-by: default avatarMustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#781349}
parent beb35ad6
...@@ -311,7 +311,8 @@ ThrottleCheckResult LookalikeUrlNavigationThrottle::PerformChecks( ...@@ -311,7 +311,8 @@ ThrottleCheckResult LookalikeUrlNavigationThrottle::PerformChecks(
auto* config = GetSafetyTipsRemoteConfigProto(); auto* config = GetSafetyTipsRemoteConfigProto();
const LookalikeTargetAllowlistChecker in_target_allowlist = const LookalikeTargetAllowlistChecker in_target_allowlist =
base::BindRepeating(&IsTargetUrlAllowlistedBySafetyTipsComponent, config); base::BindRepeating(&IsTargetHostAllowlistedBySafetyTipsComponent,
config);
if (GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist, if (GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist,
&matched_domain, &match_type)) { &matched_domain, &match_type)) {
DCHECK(!matched_domain.empty()); DCHECK(!matched_domain.empty());
......
...@@ -48,7 +48,8 @@ bool ShouldTriggerSafetyTipFromLookalike( ...@@ -48,7 +48,8 @@ bool ShouldTriggerSafetyTipFromLookalike(
auto* config = GetSafetyTipsRemoteConfigProto(); auto* config = GetSafetyTipsRemoteConfigProto();
const LookalikeTargetAllowlistChecker in_target_allowlist = const LookalikeTargetAllowlistChecker in_target_allowlist =
base::BindRepeating(&IsTargetUrlAllowlistedBySafetyTipsComponent, config); base::BindRepeating(&IsTargetHostAllowlistedBySafetyTipsComponent,
config);
if (!GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist, if (!GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist,
&matched_domain, &match_type)) { &matched_domain, &match_type)) {
return false; return false;
......
...@@ -121,10 +121,10 @@ bool IsUrlAllowlistedBySafetyTipsComponent( ...@@ -121,10 +121,10 @@ bool IsUrlAllowlistedBySafetyTipsComponent(
return false; return false;
} }
bool IsTargetUrlAllowlistedBySafetyTipsComponent( bool IsTargetHostAllowlistedBySafetyTipsComponent(
const chrome_browser_safety_tips::SafetyTipsConfig* proto, const chrome_browser_safety_tips::SafetyTipsConfig* proto,
const GURL& url) { const std::string& hostname) {
DCHECK(!url.host().empty()); DCHECK(!hostname.empty());
if (proto == nullptr) { if (proto == nullptr) {
return false; return false;
} }
...@@ -135,7 +135,7 @@ bool IsTargetUrlAllowlistedBySafetyTipsComponent( ...@@ -135,7 +135,7 @@ bool IsTargetUrlAllowlistedBySafetyTipsComponent(
DCHECK(!host_pattern.regex().empty()); DCHECK(!host_pattern.regex().empty());
const re2::RE2 regex(host_pattern.regex()); const re2::RE2 regex(host_pattern.regex());
DCHECK(regex.ok()); DCHECK(regex.ok());
if (re2::RE2::FullMatch(url.host(), regex)) { if (re2::RE2::FullMatch(hostname, regex)) {
return true; return true;
} }
} }
......
...@@ -32,11 +32,11 @@ bool IsUrlAllowlistedBySafetyTipsComponent( ...@@ -32,11 +32,11 @@ bool IsUrlAllowlistedBySafetyTipsComponent(
const chrome_browser_safety_tips::SafetyTipsConfig* proto, const chrome_browser_safety_tips::SafetyTipsConfig* proto,
const GURL& url); const GURL& url);
// Checks the hostname of |url| against the component updater target allowlist // Checks |hostname| against the component updater target allowlist and returns
// and returns whether the URL is explicitly allowed. // whether it is explicitly allowed.
bool IsTargetUrlAllowlistedBySafetyTipsComponent( bool IsTargetHostAllowlistedBySafetyTipsComponent(
const chrome_browser_safety_tips::SafetyTipsConfig* proto, const chrome_browser_safety_tips::SafetyTipsConfig* proto,
const GURL& target_url); const std::string& hostname);
// Checks SafeBrowsing-style permutations of |url| against the component updater // Checks SafeBrowsing-style permutations of |url| against the component updater
// blocklist and returns the match type. kNone means the URL is not blocked. // blocklist and returns the match type. kNone means the URL is not blocked.
......
...@@ -22,8 +22,8 @@ TEST(SafetyTipsConfigTest, TestUrlAllowlist) { ...@@ -22,8 +22,8 @@ TEST(SafetyTipsConfigTest, TestUrlAllowlist) {
TEST(SafetyTipsConfigTest, TestTargetUrlAllowlist) { TEST(SafetyTipsConfigTest, TestTargetUrlAllowlist) {
SetSafetyTipAllowlistPatterns({}, {"exa.*\\.com"}); SetSafetyTipAllowlistPatterns({}, {"exa.*\\.com"});
auto* config = GetSafetyTipsRemoteConfigProto(); auto* config = GetSafetyTipsRemoteConfigProto();
EXPECT_TRUE(IsTargetUrlAllowlistedBySafetyTipsComponent( EXPECT_TRUE(
config, GURL("http://example.com"))); IsTargetHostAllowlistedBySafetyTipsComponent(config, "example.com"));
EXPECT_FALSE(IsTargetUrlAllowlistedBySafetyTipsComponent( EXPECT_FALSE(
config, GURL("http://example.org"))); IsTargetHostAllowlistedBySafetyTipsComponent(config, "example.org"));
} }
...@@ -36,6 +36,10 @@ const char kHistogramName[] = "NavigationSuggestion.Event"; ...@@ -36,6 +36,10 @@ const char kHistogramName[] = "NavigationSuggestion.Event";
namespace { namespace {
// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
const char kDigitChars[] = "0123456789";
// Minimum length of target domains, including TLD, that we protect against // Minimum length of target domains, including TLD, that we protect against
// target embedding. For example, "vk.ru" is 5 characters-long and won't be // target embedding. For example, "vk.ru" is 5 characters-long and won't be
// considered as a target. // considered as a target.
...@@ -164,24 +168,13 @@ std::string GetSimilarDomainFromTop500( ...@@ -164,24 +168,13 @@ std::string GetSimilarDomainFromTop500(
.domain; .domain;
DCHECK(!top_domain.empty()); DCHECK(!top_domain.empty());
// If the only difference between the navigated and top if (IsLikelyEditDistanceFalsePositive(navigated_domain,
// domains is the registry part, this is unlikely to be a spoofing GetDomainInfo(top_domain))) {
// attempt. Ignore this match and continue. E.g. If the navigated domain
// is google.com.tw and the top domain is google.com.tr, this won't
// produce a match.
const std::string top_domain_without_registry =
url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
top_domain_without_registry));
if (navigated_domain.domain_without_registry ==
top_domain_without_registry) {
continue; continue;
} }
// Skip past domains that are allowed to be spoofed. // Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) + if (target_allowlisted.Run(top_domain)) {
url::kStandardSchemeSeparator +
top_domain))) {
continue; continue;
} }
...@@ -209,20 +202,12 @@ std::string GetSimilarDomainFromEngagedSites( ...@@ -209,20 +202,12 @@ std::string GetSimilarDomainFromEngagedSites(
continue; continue;
} }
// If the only difference between the navigated and engaged if (IsLikelyEditDistanceFalsePositive(navigated_domain, engaged_site)) {
// domain is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated
// domain is google.com.tw and the top domain is google.com.tr, this
// won't produce a match.
if (navigated_domain.domain_without_registry ==
engaged_site.domain_without_registry) {
continue; continue;
} }
// Skip past domains that are allowed to be spoofed. // Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) + if (target_allowlisted.Run(engaged_site.domain_and_registry)) {
url::kStandardSchemeSeparator +
engaged_site.domain_and_registry))) {
continue; continue;
} }
...@@ -255,18 +240,16 @@ bool ASubdomainIsAllowlisted( ...@@ -255,18 +240,16 @@ bool ASubdomainIsAllowlisted(
const std::string& embedded_target, const std::string& embedded_target,
const base::span<const std::string>& subdomain_labels_so_far, const base::span<const std::string>& subdomain_labels_so_far,
const LookalikeTargetAllowlistChecker& in_target_allowlist) { const LookalikeTargetAllowlistChecker& in_target_allowlist) {
const std::string https_scheme = if (in_target_allowlist.Run(embedded_target)) {
url::kHttpsScheme + std::string(url::kStandardSchemeSeparator);
if (in_target_allowlist.Run(GURL(https_scheme + embedded_target))) {
return true; return true;
} }
std::string potential_hostname = embedded_target; std::string potential_hostname = embedded_target;
// Attach each token from the end to the embedded target to check if that // Attach each token from the end to the embedded target to check if that
// subdomain has been allowlisted. // subdomain has been allowlisted.
for (int i = subdomain_labels_so_far.size() - 1; i >= 0; i--) { for (int i = subdomain_labels_so_far.size() - 1; i >= 0; i--) {
potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname; potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname;
if (in_target_allowlist.Run(GURL(https_scheme + potential_hostname))) { if (in_target_allowlist.Run(potential_hostname)) {
return true; return true;
} }
} }
...@@ -462,14 +445,14 @@ DomainInfo::~DomainInfo() = default; ...@@ -462,14 +445,14 @@ DomainInfo::~DomainInfo() = default;
DomainInfo::DomainInfo(const DomainInfo&) = default; DomainInfo::DomainInfo(const DomainInfo&) = default;
DomainInfo GetDomainInfo(const GURL& url) { DomainInfo GetDomainInfo(const std::string& hostname) {
if (net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host())) { if (net::HostStringIsLocalhost(hostname) ||
net::IsHostnameNonUnique(hostname)) {
return DomainInfo(std::string(), std::string(), std::string(), return DomainInfo(std::string(), std::string(), std::string(),
url_formatter::IDNConversionResult(), url_formatter::IDNConversionResult(),
url_formatter::Skeletons()); url_formatter::Skeletons());
} }
const std::string hostname = url.host(); const std::string domain_and_registry = GetETLDPlusOne(hostname);
const std::string domain_and_registry = GetETLDPlusOne(url.host());
const std::string domain_without_registry = const std::string domain_without_registry =
domain_and_registry.empty() domain_and_registry.empty()
? std::string() ? std::string()
...@@ -494,6 +477,10 @@ DomainInfo GetDomainInfo(const GURL& url) { ...@@ -494,6 +477,10 @@ DomainInfo GetDomainInfo(const GURL& url) {
idn_result, skeletons); idn_result, skeletons);
} }
DomainInfo GetDomainInfo(const GURL& url) {
return GetDomainInfo(url.host());
}
std::string GetETLDPlusOne(const std::string& hostname) { std::string GetETLDPlusOne(const std::string& hostname) {
return net::registry_controlled_domains::GetDomainAndRegistry( return net::registry_controlled_domains::GetDomainAndRegistry(
hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
...@@ -540,6 +527,60 @@ bool IsEditDistanceAtMostOne(const base::string16& str1, ...@@ -540,6 +527,60 @@ bool IsEditDistanceAtMostOne(const base::string16& str1,
return edit_count <= 1; return edit_count <= 1;
} }
bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain) {
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
matched_domain.domain_and_registry));
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
navigated_domain.domain_and_registry));
// If the only difference between the domains is the registry part, this is
// unlikely to be a spoofing attempt and we should ignore this match. E.g.
// exclude matches like google.com.tw and google.com.tr.
if (navigated_domain.domain_without_registry ==
matched_domain.domain_without_registry) {
return true;
}
// If the domains only differ by a numeric suffix on their e2LD (e.g.
// site45.tld and site35.tld), then ignore the match.
auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry,
kDigitChars, base::TRIM_TRAILING);
auto matched_trimmed = base::TrimString(
matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING);
DCHECK_NE(navigated_domain.domain_without_registry,
matched_domain.domain_without_registry);
// We previously verified that the domains without registries weren't equal,
// so if they're equal now, the match must have come from numeric suffixes.
if (nav_trimmed == matched_trimmed) {
return true;
}
// Ignore domains that only differ by an insertion/substitution at the
// start, as these are usually different words, not lookalikes.
const auto nav_dom_len = navigated_domain.domain_and_registry.length();
const auto matched_dom_len = matched_domain.domain_and_registry.length();
const auto& nav_dom = navigated_domain.domain_and_registry;
const auto& matched_dom = matched_domain.domain_and_registry;
if (nav_dom_len == matched_dom_len) {
// e.g. hank vs tank
if (nav_dom.substr(1) == matched_dom.substr(1)) {
return true;
}
} else if (nav_dom_len < matched_dom_len) {
// e.g. oodle vs poodle
if (nav_dom == matched_dom.substr(1)) {
return true;
}
} else { // navigated_dom_len > matched_dom_len
// e.g. poodle vs oodle
if (nav_dom.substr(1) == matched_dom) {
return true;
}
}
return false;
}
bool IsTopDomain(const DomainInfo& domain_info) { bool IsTopDomain(const DomainInfo& domain_info) {
// Top domains are only accessible through their skeletons, so query the top // Top domains are only accessible through their skeletons, so query the top
// domains trie for each skeleton of this domain. // domains trie for each skeleton of this domain.
......
...@@ -20,7 +20,7 @@ extern const char kHistogramName[]; ...@@ -20,7 +20,7 @@ extern const char kHistogramName[];
} }
using LookalikeTargetAllowlistChecker = using LookalikeTargetAllowlistChecker =
base::RepeatingCallback<bool(const GURL&)>; base::RepeatingCallback<bool(const std::string&)>;
// Common words are allow listed in target embedding (e.g. weather.jp.com should // Common words are allow listed in target embedding (e.g. weather.jp.com should
// not be considered a lookalike for weather.com). However, some of the common // not be considered a lookalike for weather.com). However, some of the common
...@@ -129,9 +129,12 @@ struct DomainInfo { ...@@ -129,9 +129,12 @@ struct DomainInfo {
DomainInfo(const DomainInfo& other); DomainInfo(const DomainInfo& other);
}; };
// Returns a DomainInfo instance computed from |url|. Will return empty fields // Returns a DomainInfo instance computed from |hostname|. Will return empty
// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is // fields for non-unique hostnames (e.g. site.test), localhost or sites whose
// empty. // eTLD+1 is empty.
DomainInfo GetDomainInfo(const std::string& hostname);
// Convenience function for returning GetDomainInfo(url.host()).
DomainInfo GetDomainInfo(const GURL& url); DomainInfo GetDomainInfo(const GURL& url);
// Returns true if the Levenshtein distance between |str1| and |str2| is at most // Returns true if the Levenshtein distance between |str1| and |str2| is at most
...@@ -140,6 +143,13 @@ DomainInfo GetDomainInfo(const GURL& url); ...@@ -140,6 +143,13 @@ DomainInfo GetDomainInfo(const GURL& url);
bool IsEditDistanceAtMostOne(const base::string16& str1, bool IsEditDistanceAtMostOne(const base::string16& str1,
const base::string16& str2); const base::string16& str2);
// Returns whether |navigated_domain| and |matched_domain| are likely to be edit
// distance false positives, and thus the user should *not* be warned.
//
// Assumes |navigated_domain| and |matched_domain| are edit distance matches.
bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain);
// Returns true if the domain given by |domain_info| is a top domain. // Returns true if the domain given by |domain_info| is a top domain.
bool IsTopDomain(const DomainInfo& domain_info); bool IsTopDomain(const DomainInfo& domain_info);
......
...@@ -65,12 +65,64 @@ TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) { ...@@ -65,12 +65,64 @@ TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
bool result = bool result =
IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain), IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
base::WideToUTF16(test_case.top_domain)); base::WideToUTF16(test_case.top_domain));
EXPECT_EQ(test_case.expected, result); EXPECT_EQ(test_case.expected, result)
<< "when comparing " << test_case.domain << " with "
<< test_case.top_domain;
} }
} }
bool IsGoogleScholar(const GURL& hostname) { TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) {
return hostname.host() == "scholar.google.com"; const struct TestCase {
const char* domain;
const char* top_domain;
bool is_likely_false_positive;
} kTestCases[] = {
// Most edit distance instances are not likely false positives.
{"abcxd.com", "abcyd.com", false}, // Substitution
{"abcxd.com", "abcxxd.com", false}, // Deletion
{"abcxxd.com", "abcxd.com", false}, // Insertion
// But we permit cases where the only difference is in the tld.
{"abcde.com", "abcde.net", true},
// We also permit matches that are only due to a numeric suffix,
{"abcd1.com", "abcd2.com", true}, // Substitution
{"abcde.com", "abcde1.com", true}, // Numeric deletion
{"abcde1.com", "abcde.com", true}, // Numeric insertion
{"abcd11.com", "abcd21.com", true}, // Not-final-digit substitution
{"a.abcd1.com", "abcd2.com", true}, // Only relevant for eTLD+1.
// ...and that change must be due to the numeric suffix.
{"abcx1.com", "abcy1.com", false}, // Substitution before suffix
{"abcd1.com", "abcde1.com", false}, // Deletion before suffix
{"abcde1.com", "abcd1.com", false}, // Insertion before suffix
{"abcdx.com", "abcdy.com", false}, // Non-numeric substitution at end
// We also permit matches that are only due to a first-character change,
{"xabcd.com", "yabcd.com", true}, // Substitution
{"xabcde.com", "abcde.com", true}, // Insertion
{"abcde.com", "xabcde.com", true}, // Deletion
{"a.abcde.com", "xabcde.com", true}, // For eTLD+1
// ...so long as that change is only on the first character, not later.
{"abcde.com", "axbcde.com", false}, // Deletion
{"axbcde.com", "abcde.com", false}, // Insertion
{"axbcde.com", "aybcde.com", false}, // Substitution
};
for (const TestCase& test_case : kTestCases) {
auto navigated =
GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator + test_case.domain));
auto matched = GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator +
test_case.top_domain));
bool result = IsLikelyEditDistanceFalsePositive(navigated, matched);
EXPECT_EQ(test_case.is_likely_false_positive, result)
<< "when comparing " << test_case.domain << " with "
<< test_case.top_domain;
}
}
bool IsGoogleScholar(const std::string& hostname) {
return hostname == "scholar.google.com";
} }
struct TargetEmbeddingHeuristicTestCase { struct TargetEmbeddingHeuristicTestCase {
......
...@@ -13,7 +13,7 @@ namespace top_domains { ...@@ -13,7 +13,7 @@ namespace top_domains {
namespace { namespace {
// Minimum length of the e2LD (the registered domain name without the registry) // Minimum length of the e2LD (the registered domain name without the registry)
// to be considered for an edit distance comparison, including a trailing dot. // to be considered for an edit distance comparison.
// Thus: 'google.com' has of length 6 ("google") and is long enough, while // Thus: 'google.com' has of length 6 ("google") and is long enough, while
// 'abc.co.uk' has a length of 3 ("abc"), and will not be considered. // 'abc.co.uk' has a length of 3 ("abc"), and will not be considered.
const size_t kMinLengthForEditDistance = 5u; const size_t kMinLengthForEditDistance = 5u;
......
...@@ -75,7 +75,7 @@ LookalikeUrlTabHelper::ShouldAllowRequest( ...@@ -75,7 +75,7 @@ LookalikeUrlTabHelper::ShouldAllowRequest(
LookalikeUrlMatchType match_type; LookalikeUrlMatchType match_type;
// Target allowlist is not currently used in ios. // Target allowlist is not currently used in ios.
const LookalikeTargetAllowlistChecker in_target_allowlist = const LookalikeTargetAllowlistChecker in_target_allowlist =
base::BindRepeating(^(const GURL& url) { base::BindRepeating(^(const std::string& hostname) {
return false; return false;
}); });
if (!GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist, if (!GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment