Commit 6ad8400f authored by Joe DeBlasio's avatar Joe DeBlasio Committed by Commit Bot

[Lookalikes] Ensure TE length checks apply to no-separator matches.

This CL ensures that target embedding matches against no-separator top
domains (e.g. "googlecom.example.com") don't trigger if they are too
short. It also ensures that these embedded domains are properly checked
for common words and checked against the target allowlist. In doing
this, it also stops a DCHECK from triggering when a user visited a
domain like "googlecom.example.com" (instead of
"www.googlecom.example.com").

Bug: 1126146, 1127450
Change-Id: I19e0f5735240fad85ea7079c8905e95d2744ac2a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2416266
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Auto-Submit: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: default avatarMustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#808174}
parent 10630b8a
......@@ -497,9 +497,19 @@ IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
LookalikeUrlMatchType::kTargetEmbedding);
}
// Target embedding should not trigger on allowlisted domains.
// Target embedding should not trigger on allowlisted embedder domains.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
TargetEmbedding_Allowlist) {
TargetEmbedding_EmbedderAllowlist) {
const GURL kNavigatedUrl = GetURL("google.com.allowlisted.com");
SetEngagementScore(browser(), kNavigatedUrl, kLowEngagement);
SetSafetyTipAllowlistPatterns({"allowlisted.com/"}, {});
TestInterstitialNotShown(browser(), kNavigatedUrl);
CheckNoUkm();
}
// Target embedding should not trigger on allowlisted target domains.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
TargetEmbedding_TargetAllowlist) {
const GURL kNavigatedUrl = GetURL("foo.scholar.google.com.com");
SetEngagementScore(browser(), kNavigatedUrl, kLowEngagement);
SetSafetyTipAllowlistPatterns({}, {"scholar\\.google\\.com"});
......@@ -507,6 +517,17 @@ IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
CheckNoUkm();
}
// Navigate to a domain target embedding a domain with no separators, but that
// matches the target allowlist. Regression test for crbug.com/1127450.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
TargetEmbedding_TargetAllowlistWithNoSeparators) {
const GURL kNavigatedUrl = GetURL("googlecom.example.com");
SetEngagementScore(browser(), kNavigatedUrl, kLowEngagement);
SetSafetyTipAllowlistPatterns({}, {"google\\.com"});
TestInterstitialNotShown(browser(), kNavigatedUrl);
CheckNoUkm();
}
// Similar to Idn_TopDomain_Match but the domain is not in top 500. Should not
// show an interstitial, but should still record metrics.
IN_PROC_BROWSER_TEST_P(LookalikeUrlNavigationThrottleBrowserTest,
......
......@@ -66,6 +66,10 @@ const char* kCommonWords[] = {"shop", "jobs", "live", "info", "study",
"ideal", "research", "france", "free", "mobile",
"sky", "ask"};
// What separators can be used to separate tokens in target embedding spoofs?
// e.g. www-google.com.example.com uses "-" (www-google) and "." (google.com).
const char kTargetEmbeddingSeparators[] = "-.";
bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
const url_formatter::Skeletons& skeletons2) {
DCHECK(!skeletons1.empty());
......@@ -182,7 +186,8 @@ void RecordEvent(NavigationSuggestionEvent event) {
// StringPieces.
std::vector<base::StringPiece> SplitDomainWithouteTLDIntoTokens(
const std::string& host_without_etld) {
return base::SplitStringPiece(host_without_etld, "-.", base::TRIM_WHITESPACE,
return base::SplitStringPiece(host_without_etld, kTargetEmbeddingSeparators,
base::TRIM_WHITESPACE,
base::SPLIT_WANT_NONEMPTY);
}
......@@ -595,11 +600,29 @@ TargetEmbeddingType GetTargetEmbeddingType(
// This check happens first so that we can exclude invalid eTLD+1s next.
std::string embedded_target = GetMatchingTopDomainWithoutSeparators(
hostname_tokens_without_etld[end - 1]);
if (!embedded_target.empty() &&
!IsAllowedToBeEmbedded(etld_check_dominfo, etld_check_span,
in_target_allowlist)) {
*safe_hostname = embedded_target;
return TargetEmbeddingType::kInterstitial;
if (!embedded_target.empty()) {
// Extract the full possibly-spoofed domain. To get this, we take the
// hostname up until this point, strip off the no-separator bit (e.g.
// googlecom) and then re-add the the separated version (e.g. google.com).
auto spoofed_domain =
etld_check_host.substr(
0, etld_check_host.length() -
hostname_tokens_without_etld[end - 1].length()) +
embedded_target;
const auto no_separator_tokens = base::SplitStringPiece(
spoofed_domain, kTargetEmbeddingSeparators, base::TRIM_WHITESPACE,
base::SPLIT_WANT_NONEMPTY);
auto no_separator_dominfo = GetDomainInfo(embedded_target);
// Only flag on domains that are long enough, don't use common words, and
// aren't target-allowlisted.
if (no_separator_dominfo.domain_without_registry.length() >
kMinE2LDLengthForTargetEmbedding &&
!IsAllowedToBeEmbedded(no_separator_dominfo, no_separator_tokens,
in_target_allowlist)) {
*safe_hostname = embedded_target;
return TargetEmbeddingType::kInterstitial;
}
}
// Exclude otherwise-invalid eTLDs.
......
......@@ -198,7 +198,7 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
{"scholar.foo.google.com.foo.com", "google.com",
TargetEmbeddingType::kInterstitial},
// Targets should be longer than 6 characters.
// e2LDs should be longer than 3 characters.
{"hp.com-foo.com", "", TargetEmbeddingType::kNone},
// Targets with common words as e2LD are not considered embedded targets
......@@ -210,8 +210,15 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
{"foo.office.org-foo.com", "", TargetEmbeddingType::kNone},
// Targets could be embedded without their dots and dashes.
{"googlecom-foo.com", "google.com", TargetEmbeddingType::kInterstitial},
{"foo.googlecom-foo.com", "google.com",
TargetEmbeddingType::kInterstitial},
// But should not be detected if they're using a common word. weather.com
// is on the top domain list, but 'weather' is a common word.
{"weathercom-foo.com", "", TargetEmbeddingType::kNone},
// And should also not be detected if they're too short. vk.com is on the
// top domain list, but is shorter than kMinE2LDLengthForTargetEmbedding.
{"vkcom-foo.com", "", TargetEmbeddingType::kNone},
// Ensure legitimate domains don't trigger.
{"foo.google.com", "", TargetEmbeddingType::kNone},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment