Commit 09587e2f authored by Joe DeBlasio, committed by Chromium LUCI CQ

[Lookalikes] Expand common words used in target embedding.

This CL expands the list of common words ignored by target embedding to
the full list from //components/url_formatter/spoof_checks/common_words.
It also removes words that are on that list from the existing list of
common words (which is maintained as a supplemental list).

This CL also adds a little bit of logic to disable common word detection
on domains that are in the special list of domains that are allowed to
be embedded, but only at the end (these are higher-value domains whose
names use a common word). This is necessary since, e.g., "office" is a
common word on the full common word list, but was not on the old list.

A side effect of this change is that the common word list is included in
Android. This list causes a big bump in binary size. A future edit may
reduce the size of the word list used for Android, but that'll be a
substantial engineering effort, and this list is an important mitigation
in a security feature. The list is efficiently stored as a DAFSA, so
I know of no obvious way to shrink it down.

Binary-Size: Size increase is unavoidable (see above).
Bug: 1154726
Change-Id: I417d92761377f6b6e11772b8f06c9b36b5083676
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2630738
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#844156}
parent b438da15
......@@ -18,6 +18,7 @@ static_library("core") {
"//components/security_state/core:features",
"//components/strings",
"//components/url_formatter",
"//components/url_formatter/spoof_checks/common_words:common",
"//components/url_formatter/spoof_checks/top_domains:common",
"//components/url_formatter/spoof_checks/top_domains:top500_domains",
"//components/url_formatter/spoof_checks/top_domains:top500_domains_header",
......
......@@ -28,6 +28,7 @@
#include "components/lookalikes/core/features.h"
#include "components/security_interstitials/core/pref_names.h"
#include "components/security_state/core/features.h"
#include "components/url_formatter/spoof_checks/common_words/common_words_util.h"
#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
......@@ -57,18 +58,15 @@ const size_t kMinE2LDLengthForTargetEmbedding = 4;
// This list supplements the static list of common words, so that additional
// common words can be added using a flag if needed.
const base::FeatureParam<std::string> kAdditionalCommonWords{
const base::FeatureParam<std::string> kRemoveAdditionalCommonWords{
&lookalikes::features::kDetectTargetEmbeddingLookalikes,
"additional_common_words", ""};
// We might not protect a domain whose e2LD is a common word in target embedding
// based on the TLD that is paired with it.
const char* kCommonWords[] = {
"shop", "jobs", "live", "info", "study", "asahi",
"weather", "health", "forum", "radio", "ideal", "research",
"france", "free", "mobile", "sky", "ask", "booking",
"canada", "dating", "dictionary", "express", "hoteles", "hotels",
"investing", "jharkhand", "nifty"};
// based on the TLD that is paired with it. This list supplements words from
// url_formatter::common_words::IsCommonWord().
const char* kLocalAdditionalCommonWords[] = {"asahi", "hoteles", "jharkhand",
"nifty"};
// These domains are plausible lookalike targets, but they also use common words
// in their names. Selectively prevent flagging embeddings where the embedder
......@@ -272,17 +270,33 @@ bool DoesETLDPlus1MatchTopDomainOrEngagedSite(
// weather.com, ask.com). Target embeddings of these domains are often false
// positives (e.g. "super-best-fancy-hotels.com" isn't spoofing "hotels.com").
bool UsesCommonWord(const DomainInfo& domain) {
std::vector<std::string> additional_common_words =
base::SplitString(kAdditionalCommonWords.Get(), ",",
base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
if (base::Contains(additional_common_words, domain.domain_without_registry)) {
// kDomainsPermittedInEndEmbeddings are based on domains with common words,
// but they should not be excluded here (and instead are checked later).
for (auto* permitted_ending : kDomainsPermittedInEndEmbeddings) {
if (domain.domain_and_registry == permitted_ending) {
return false;
}
}
// Search for words in the big common word list.
if (url_formatter::common_words::IsCommonWord(
domain.domain_without_registry)) {
return true;
}
for (auto* common_word : kCommonWords) {
// Also check the local lists.
for (auto* common_word : kLocalAdditionalCommonWords) {
if (domain.domain_without_registry == common_word) {
return true;
}
}
std::vector<std::string> additional_common_words =
base::SplitString(kRemoveAdditionalCommonWords.Get(), ",",
base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
if (base::Contains(additional_common_words, domain.domain_without_registry)) {
return true;
}
return false;
}
......
......@@ -215,6 +215,9 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
TargetEmbeddingType::kInterstitial},
{"foo.jobs.org-foo.com", "", TargetEmbeddingType::kNone},
{"foo.office.org-foo.com", "", TargetEmbeddingType::kNone},
// Common words (like 'jobs') are included in the big common word list.
// Ensure that the supplemental kLocalAdditionalCommonWords list is also
// checked.
{"foo.hoteles.com-foo.com", "", TargetEmbeddingType::kNone},
// Targets could be embedded without their dots and dashes.
{"googlecom-foo.com", "google.com", TargetEmbeddingType::kInterstitial},
......@@ -299,7 +302,12 @@ TEST(LookalikeUrlUtilTest, TargetEmbeddingTest) {
<< test_case.expected_safe_host << ", but "
<< (safe_hostname.empty() ? "it didn't trigger at all."
: "triggered on " + safe_hostname);
EXPECT_EQ(embedding_type, test_case.expected_type);
EXPECT_EQ(embedding_type, test_case.expected_type)
<< test_case.hostname << " should trigger on "
<< test_case.expected_safe_host << " but it returned "
<< (embedding_type == TargetEmbeddingType::kNone
? "kNone."
: "something unexpected");
} else {
EXPECT_EQ(embedding_type, TargetEmbeddingType::kNone)
<< test_case.hostname << " unexpectedly triggered on "
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment