Commit fc6f3bb6 authored by Joe DeBlasio's avatar Joe DeBlasio Committed by Commit Bot

[Lookalikes][Safety Tips] Ignore common edit distance false positives.

This CL improves edit distance lookalike matching by not warning on two
classes of matches that are commonly false positives:
 1. Numeric suffixes, such as domain1.tld and domain2.tld.
 2. First-character edits, such as domain.tld and xdomain.tld.

Bug: 1096309
Change-Id: I9cdf46060ef00949435c7429b86a835de796fb49
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2255025
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: default avatarLivvie Lin <livvielin@chromium.org>
Reviewed-by: default avatarMustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#781349}
parent beb35ad6
......@@ -311,7 +311,8 @@ ThrottleCheckResult LookalikeUrlNavigationThrottle::PerformChecks(
auto* config = GetSafetyTipsRemoteConfigProto();
const LookalikeTargetAllowlistChecker in_target_allowlist =
base::BindRepeating(&IsTargetUrlAllowlistedBySafetyTipsComponent, config);
base::BindRepeating(&IsTargetHostAllowlistedBySafetyTipsComponent,
config);
if (GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist,
&matched_domain, &match_type)) {
DCHECK(!matched_domain.empty());
......
......@@ -48,7 +48,8 @@ bool ShouldTriggerSafetyTipFromLookalike(
auto* config = GetSafetyTipsRemoteConfigProto();
const LookalikeTargetAllowlistChecker in_target_allowlist =
base::BindRepeating(&IsTargetUrlAllowlistedBySafetyTipsComponent, config);
base::BindRepeating(&IsTargetHostAllowlistedBySafetyTipsComponent,
config);
if (!GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist,
&matched_domain, &match_type)) {
return false;
......
......@@ -121,10 +121,10 @@ bool IsUrlAllowlistedBySafetyTipsComponent(
return false;
}
bool IsTargetUrlAllowlistedBySafetyTipsComponent(
bool IsTargetHostAllowlistedBySafetyTipsComponent(
const chrome_browser_safety_tips::SafetyTipsConfig* proto,
const GURL& url) {
DCHECK(!url.host().empty());
const std::string& hostname) {
DCHECK(!hostname.empty());
if (proto == nullptr) {
return false;
}
......@@ -135,7 +135,7 @@ bool IsTargetUrlAllowlistedBySafetyTipsComponent(
DCHECK(!host_pattern.regex().empty());
const re2::RE2 regex(host_pattern.regex());
DCHECK(regex.ok());
if (re2::RE2::FullMatch(url.host(), regex)) {
if (re2::RE2::FullMatch(hostname, regex)) {
return true;
}
}
......
......@@ -32,11 +32,11 @@ bool IsUrlAllowlistedBySafetyTipsComponent(
const chrome_browser_safety_tips::SafetyTipsConfig* proto,
const GURL& url);
// Checks the hostname of |url| against the component updater target allowlist
// and returns whether the URL is explicitly allowed.
bool IsTargetUrlAllowlistedBySafetyTipsComponent(
// Checks |hostname| against the component updater target allowlist and returns
// whether it is explicitly allowed.
bool IsTargetHostAllowlistedBySafetyTipsComponent(
const chrome_browser_safety_tips::SafetyTipsConfig* proto,
const GURL& target_url);
const std::string& hostname);
// Checks SafeBrowsing-style permutations of |url| against the component updater
// blocklist and returns the match type. kNone means the URL is not blocked.
......
......@@ -22,8 +22,8 @@ TEST(SafetyTipsConfigTest, TestUrlAllowlist) {
TEST(SafetyTipsConfigTest, TestTargetUrlAllowlist) {
SetSafetyTipAllowlistPatterns({}, {"exa.*\\.com"});
auto* config = GetSafetyTipsRemoteConfigProto();
EXPECT_TRUE(IsTargetUrlAllowlistedBySafetyTipsComponent(
config, GURL("http://example.com")));
EXPECT_FALSE(IsTargetUrlAllowlistedBySafetyTipsComponent(
config, GURL("http://example.org")));
EXPECT_TRUE(
IsTargetHostAllowlistedBySafetyTipsComponent(config, "example.com"));
EXPECT_FALSE(
IsTargetHostAllowlistedBySafetyTipsComponent(config, "example.org"));
}
......@@ -36,6 +36,10 @@ const char kHistogramName[] = "NavigationSuggestion.Event";
namespace {
// Digits. Used for trimming domains in Edit Distance heuristic matches. Domains
// that only differ by trailing digits (e.g. a1.tld and a2.tld) are ignored.
const char kDigitChars[] = "0123456789";
// Minimum length of target domains, including TLD, that we protect against
// target embedding. For example, "vk.ru" is 5 characters-long and won't be
// considered as a target.
......@@ -164,24 +168,13 @@ std::string GetSimilarDomainFromTop500(
.domain;
DCHECK(!top_domain.empty());
// If the only difference between the navigated and top
// domains is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated domain
// is google.com.tw and the top domain is google.com.tr, this won't
// produce a match.
const std::string top_domain_without_registry =
url_formatter::top_domains::HostnameWithoutRegistry(top_domain);
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
top_domain_without_registry));
if (navigated_domain.domain_without_registry ==
top_domain_without_registry) {
if (IsLikelyEditDistanceFalsePositive(navigated_domain,
GetDomainInfo(top_domain))) {
continue;
}
// Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator +
top_domain))) {
if (target_allowlisted.Run(top_domain)) {
continue;
}
......@@ -209,20 +202,12 @@ std::string GetSimilarDomainFromEngagedSites(
continue;
}
// If the only difference between the navigated and engaged
// domain is the registry part, this is unlikely to be a spoofing
// attempt. Ignore this match and continue. E.g. If the navigated
// domain is google.com.tw and the top domain is google.com.tr, this
// won't produce a match.
if (navigated_domain.domain_without_registry ==
engaged_site.domain_without_registry) {
if (IsLikelyEditDistanceFalsePositive(navigated_domain, engaged_site)) {
continue;
}
// Skip past domains that are allowed to be spoofed.
if (target_allowlisted.Run(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator +
engaged_site.domain_and_registry))) {
if (target_allowlisted.Run(engaged_site.domain_and_registry)) {
continue;
}
......@@ -255,18 +240,16 @@ bool ASubdomainIsAllowlisted(
const std::string& embedded_target,
const base::span<const std::string>& subdomain_labels_so_far,
const LookalikeTargetAllowlistChecker& in_target_allowlist) {
const std::string https_scheme =
url::kHttpsScheme + std::string(url::kStandardSchemeSeparator);
if (in_target_allowlist.Run(GURL(https_scheme + embedded_target))) {
if (in_target_allowlist.Run(embedded_target)) {
return true;
}
std::string potential_hostname = embedded_target;
// Attach each token from the end to the embedded target to check if that
// subdomain has been allowlisted.
for (int i = subdomain_labels_so_far.size() - 1; i >= 0; i--) {
potential_hostname = subdomain_labels_so_far[i] + "." + potential_hostname;
if (in_target_allowlist.Run(GURL(https_scheme + potential_hostname))) {
if (in_target_allowlist.Run(potential_hostname)) {
return true;
}
}
......@@ -462,14 +445,14 @@ DomainInfo::~DomainInfo() = default;
DomainInfo::DomainInfo(const DomainInfo&) = default;
DomainInfo GetDomainInfo(const GURL& url) {
if (net::IsLocalhost(url) || net::IsHostnameNonUnique(url.host())) {
DomainInfo GetDomainInfo(const std::string& hostname) {
if (net::HostStringIsLocalhost(hostname) ||
net::IsHostnameNonUnique(hostname)) {
return DomainInfo(std::string(), std::string(), std::string(),
url_formatter::IDNConversionResult(),
url_formatter::Skeletons());
}
const std::string hostname = url.host();
const std::string domain_and_registry = GetETLDPlusOne(url.host());
const std::string domain_and_registry = GetETLDPlusOne(hostname);
const std::string domain_without_registry =
domain_and_registry.empty()
? std::string()
......@@ -494,6 +477,10 @@ DomainInfo GetDomainInfo(const GURL& url) {
idn_result, skeletons);
}
DomainInfo GetDomainInfo(const GURL& url) {
return GetDomainInfo(url.host());
}
std::string GetETLDPlusOne(const std::string& hostname) {
return net::registry_controlled_domains::GetDomainAndRegistry(
hostname, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
......@@ -540,6 +527,60 @@ bool IsEditDistanceAtMostOne(const base::string16& str1,
return edit_count <= 1;
}
bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain) {
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
matched_domain.domain_and_registry));
DCHECK(url_formatter::top_domains::IsEditDistanceCandidate(
navigated_domain.domain_and_registry));
// If the only difference between the domains is the registry part, this is
// unlikely to be a spoofing attempt and we should ignore this match. E.g.
// exclude matches like google.com.tw and google.com.tr.
if (navigated_domain.domain_without_registry ==
matched_domain.domain_without_registry) {
return true;
}
// If the domains only differ by a numeric suffix on their e2LD (e.g.
// site45.tld and site35.tld), then ignore the match.
auto nav_trimmed = base::TrimString(navigated_domain.domain_without_registry,
kDigitChars, base::TRIM_TRAILING);
auto matched_trimmed = base::TrimString(
matched_domain.domain_without_registry, kDigitChars, base::TRIM_TRAILING);
DCHECK_NE(navigated_domain.domain_without_registry,
matched_domain.domain_without_registry);
// We previously verified that the domains without registries weren't equal,
// so if they're equal now, the match must have come from numeric suffixes.
if (nav_trimmed == matched_trimmed) {
return true;
}
// Ignore domains that only differ by an insertion/substitution at the
// start, as these are usually different words, not lookalikes.
const auto nav_dom_len = navigated_domain.domain_and_registry.length();
const auto matched_dom_len = matched_domain.domain_and_registry.length();
const auto& nav_dom = navigated_domain.domain_and_registry;
const auto& matched_dom = matched_domain.domain_and_registry;
if (nav_dom_len == matched_dom_len) {
// e.g. hank vs tank
if (nav_dom.substr(1) == matched_dom.substr(1)) {
return true;
}
} else if (nav_dom_len < matched_dom_len) {
// e.g. oodle vs poodle
if (nav_dom == matched_dom.substr(1)) {
return true;
}
} else { // navigated_dom_len > matched_dom_len
// e.g. poodle vs oodle
if (nav_dom.substr(1) == matched_dom) {
return true;
}
}
return false;
}
bool IsTopDomain(const DomainInfo& domain_info) {
// Top domains are only accessible through their skeletons, so query the top
// domains trie for each skeleton of this domain.
......
......@@ -20,7 +20,7 @@ extern const char kHistogramName[];
}
using LookalikeTargetAllowlistChecker =
base::RepeatingCallback<bool(const GURL&)>;
base::RepeatingCallback<bool(const std::string&)>;
// Common words are allow listed in target embedding (e.g. weather.jp.com should
// not be considered a lookalike for weather.com). However, some of the common
......@@ -129,9 +129,12 @@ struct DomainInfo {
DomainInfo(const DomainInfo& other);
};
// Returns a DomainInfo instance computed from |url|. Will return empty fields
// for non-unique hostnames (e.g. site.test), localhost or sites whose eTLD+1 is
// empty.
// Returns a DomainInfo instance computed from |hostname|. Will return empty
// fields for non-unique hostnames (e.g. site.test), localhost or sites whose
// eTLD+1 is empty.
DomainInfo GetDomainInfo(const std::string& hostname);
// Convenience function for returning GetDomainInfo(url.host()).
DomainInfo GetDomainInfo(const GURL& url);
// Returns true if the Levenshtein distance between |str1| and |str2| is at most
......@@ -140,6 +143,13 @@ DomainInfo GetDomainInfo(const GURL& url);
bool IsEditDistanceAtMostOne(const base::string16& str1,
const base::string16& str2);
// Returns whether |navigated_domain| and |matched_domain| are likely to be edit
// distance false positives, and thus the user should *not* be warned.
//
// Assumes |navigated_domain| and |matched_domain| are edit distance matches.
bool IsLikelyEditDistanceFalsePositive(const DomainInfo& navigated_domain,
const DomainInfo& matched_domain);
// Returns true if the domain given by |domain_info| is a top domain.
bool IsTopDomain(const DomainInfo& domain_info);
......
......@@ -65,12 +65,64 @@ TEST(LookalikeUrlUtilTest, IsEditDistanceAtMostOne) {
bool result =
IsEditDistanceAtMostOne(base::WideToUTF16(test_case.domain),
base::WideToUTF16(test_case.top_domain));
EXPECT_EQ(test_case.expected, result);
EXPECT_EQ(test_case.expected, result)
<< "when comparing " << test_case.domain << " with "
<< test_case.top_domain;
}
}
bool IsGoogleScholar(const GURL& hostname) {
return hostname.host() == "scholar.google.com";
TEST(LookalikeUrlUtilTest, EditDistanceExcludesCommonFalsePositives) {
const struct TestCase {
const char* domain;
const char* top_domain;
bool is_likely_false_positive;
} kTestCases[] = {
// Most edit distance instances are not likely false positives.
{"abcxd.com", "abcyd.com", false}, // Substitution
{"abcxd.com", "abcxxd.com", false}, // Deletion
{"abcxxd.com", "abcxd.com", false}, // Insertion
// But we permit cases where the only difference is in the tld.
{"abcde.com", "abcde.net", true},
// We also permit matches that are only due to a numeric suffix,
{"abcd1.com", "abcd2.com", true}, // Substitution
{"abcde.com", "abcde1.com", true}, // Numeric deletion
{"abcde1.com", "abcde.com", true}, // Numeric insertion
{"abcd11.com", "abcd21.com", true}, // Not-final-digit substitution
{"a.abcd1.com", "abcd2.com", true}, // Only relevant for eTLD+1.
// ...and that change must be due to the numeric suffix.
{"abcx1.com", "abcy1.com", false}, // Substitution before suffix
{"abcd1.com", "abcde1.com", false}, // Deletion before suffix
{"abcde1.com", "abcd1.com", false}, // Insertion before suffix
{"abcdx.com", "abcdy.com", false}, // Non-numeric substitution at end
// We also permit matches that are only due to a first-character change,
{"xabcd.com", "yabcd.com", true}, // Substitution
{"xabcde.com", "abcde.com", true}, // Insertion
{"abcde.com", "xabcde.com", true}, // Deletion
{"a.abcde.com", "xabcde.com", true}, // For eTLD+1
// ...so long as that change is only on the first character, not later.
{"abcde.com", "axbcde.com", false}, // Deletion
{"axbcde.com", "abcde.com", false}, // Insertion
{"axbcde.com", "aybcde.com", false}, // Substitution
};
for (const TestCase& test_case : kTestCases) {
auto navigated =
GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator + test_case.domain));
auto matched = GetDomainInfo(GURL(std::string(url::kHttpsScheme) +
url::kStandardSchemeSeparator +
test_case.top_domain));
bool result = IsLikelyEditDistanceFalsePositive(navigated, matched);
EXPECT_EQ(test_case.is_likely_false_positive, result)
<< "when comparing " << test_case.domain << " with "
<< test_case.top_domain;
}
}
bool IsGoogleScholar(const std::string& hostname) {
return hostname == "scholar.google.com";
}
struct TargetEmbeddingHeuristicTestCase {
......
......@@ -13,7 +13,7 @@ namespace top_domains {
namespace {
// Minimum length of the e2LD (the registered domain name without the registry)
// to be considered for an edit distance comparison, including a trailing dot.
// to be considered for an edit distance comparison.
// Thus: 'google.com' has of length 6 ("google") and is long enough, while
// 'abc.co.uk' has a length of 3 ("abc"), and will not be considered.
const size_t kMinLengthForEditDistance = 5u;
......
......@@ -75,7 +75,7 @@ LookalikeUrlTabHelper::ShouldAllowRequest(
LookalikeUrlMatchType match_type;
// Target allowlist is not currently used in ios.
const LookalikeTargetAllowlistChecker in_target_allowlist =
base::BindRepeating(^(const GURL& url) {
base::BindRepeating(^(const std::string& hostname) {
return false;
});
if (!GetMatchingDomain(navigated_domain, engaged_sites, in_target_allowlist,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment