Commit a02a238f authored by Aidan Beggs, committed by Commit Bot

Added the sensitive keyword detection heuristic.

As of right now, the heuristic does nothing when tripped. Later, we will use
the cases where the heuristic is triggered to iterate on and improve the
algorithm.

Bug: 1012476
Change-Id: I441abe768701a7c6e1b5a0871be63acb9f948006
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1853914
Commit-Queue: Aidan Beggs <beggs@google.com>
Reviewed-by: Joe DeBlasio <jdeblasio@chromium.org>
Cr-Commit-Position: refs/heads/master@{#706065}
parent 67ebab07
......@@ -645,6 +645,8 @@ jumbo_split_static_library("browser") {
"lookalikes/lookalike_url_service.h",
"lookalikes/lookalike_url_tab_storage.cc",
"lookalikes/lookalike_url_tab_storage.h",
"lookalikes/safety_tips/local_heuristics.cc",
"lookalikes/safety_tips/local_heuristics.h",
"lookalikes/safety_tips/reputation_service.cc",
"lookalikes/safety_tips/reputation_service.h",
"lookalikes/safety_tips/reputation_web_contents_observer.cc",
......
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/lookalikes/safety_tips/local_heuristics.h"

#include <algorithm>
#include <cstring>

#include "base/metrics/field_trial_params.h"
#include "base/strings/string_split.h"
#include "chrome/browser/lookalikes/lookalike_url_interstitial_page.h"
#include "chrome/browser/lookalikes/lookalike_url_navigation_throttle.h"
#include "components/security_state/core/features.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_util.h"
namespace {
// Boolean feature param "editdistance" on the kSafetyTipUI feature; defaults
// to false. ShouldTriggerSafetyTipFromLookalike() returns this value when the
// lookalike match type is kEditDistance.
const base::FeatureParam<bool> kEnableLookalikeEditDistance{
&security_state::features::kSafetyTipUI, "editdistance", false};
// Boolean feature param "editdistance_siteengagement" on the kSafetyTipUI
// feature; defaults to false. ShouldTriggerSafetyTipFromLookalike() returns
// this value when the lookalike match type is kEditDistanceSiteEngagement.
const base::FeatureParam<bool> kEnableLookalikeEditDistanceSiteEngagement{
&security_state::features::kSafetyTipUI, "editdistance_siteengagement",
false};
} // namespace
namespace safety_tips {
// Decides whether the lookalike heuristic should raise a safety tip for
// |navigated_domain|.
//
// Returns false when no matching domain is found among the known domains and
// |engaged_sites|, or when the match already qualifies for the lookalike
// interstitial. Otherwise |safe_url| is set to "http://<matched domain>" and
// the result depends on the match type: edit-distance matches are gated on
// their feature params, every other match type returns true.
//
// Note that |safe_url| is written before the feature params are consulted, so
// it is populated even when an edit-distance match ends up returning false.
bool ShouldTriggerSafetyTipFromLookalike(
    const GURL& url,
    const lookalikes::DomainInfo& navigated_domain,
    const std::vector<lookalikes::DomainInfo>& engaged_sites,
    GURL* safe_url) {
  using MatchType = LookalikeUrlInterstitialPage::MatchType;
  using lookalikes::LookalikeUrlNavigationThrottle;

  std::string lookalike_target;
  MatchType kind_of_match;

  // Bail out when nothing matches, or when the interstitial is already going
  // to warn about this navigation (no point in warning twice). The second
  // check only runs once a match has been found.
  if (!LookalikeUrlNavigationThrottle::GetMatchingDomain(
          navigated_domain, engaged_sites, &lookalike_target,
          &kind_of_match) ||
      LookalikeUrlNavigationThrottle::ShouldDisplayInterstitial(
          kind_of_match, navigated_domain)) {
    return false;
  }

  // Build "http://<matched domain>" as the suggested safe destination.
  std::string safe_spec(url::kHttpScheme);
  safe_spec += url::kStandardSchemeSeparator;
  safe_spec += lookalike_target;
  *safe_url = GURL(safe_spec);

  switch (kind_of_match) {
    // Edit distance has higher false positives, so each flavour is gated on
    // its own feature param.
    case MatchType::kEditDistance:
      return kEnableLookalikeEditDistance.Get();
    case MatchType::kEditDistanceSiteEngagement:
      return kEnableLookalikeEditDistanceSiteEngagement.Get();
    default:
      return true;
  }
}
// Returns true when the registrable part of |url|'s host, with the registry
// removed ("eTLD + 1 - 1", e.g. "www.google.com" -> "google"), consists of
// more than one '-'-separated token and at least one token exactly matches an
// entry of |sensitive_keywords|.
//
// |sensitive_keywords| must point to |num_keywords| NUL-terminated strings
// sorted in ascending strcmp() order: the lookup is a binary search, which
// silently produces wrong answers on unsorted input.
bool ShouldTriggerSafetyTipFromKeywordInURL(
    const GURL& url,
    const char* const sensitive_keywords[],
    size_t num_keywords) {
  // We use a custom comparator for (char *) here, to avoid the costly
  // construction of two std::strings every time two values are compared,
  // and because (char *) orders by address, not lexicographically.
  const auto keyword_less = [](const char* str_one, const char* str_two) {
    return strcmp(str_one, str_two) < 0;
  };
  const char* const* const keywords_end = sensitive_keywords + num_keywords;

  // Catch a mis-ordered keyword list in debug builds instead of letting the
  // binary search below quietly miss keywords.
  DCHECK(std::is_sorted(sensitive_keywords, keywords_end, keyword_less));

  // "eTLD + 1 - 1": "www.google.com" -> "google"
  std::string eTLD_plusminus;
  base::TrimString(url_formatter::top_domains::HostnameWithoutRegistry(
                       lookalikes::GetETLDPlusOne(url.host())),
                   ".", &eTLD_plusminus);
  DCHECK(eTLD_plusminus.find('.') == std::string::npos);

  // Tokenize on '-'; whitespace is trimmed and empty pieces are dropped.
  const std::vector<std::string> eTLD_plusminus_parts = base::SplitString(
      eTLD_plusminus, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

  // We only care about finding a keyword here if there's more than one part to
  // the tokenized eTLD+1-1.
  if (eTLD_plusminus_parts.size() <= 1) {
    return false;
  }

  return std::any_of(
      eTLD_plusminus_parts.begin(), eTLD_plusminus_parts.end(),
      [&](const std::string& eTLD_plusminus_part) {
        return std::binary_search(sensitive_keywords, keywords_end,
                                  eTLD_plusminus_part.c_str(), keyword_less);
      });
}
} // namespace safety_tips
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_LOOKALIKES_SAFETY_TIPS_LOCAL_HEURISTICS_H_
#define CHROME_BROWSER_LOOKALIKES_SAFETY_TIPS_LOCAL_HEURISTICS_H_
#include <cstddef>
#include <string>
#include <vector>
#include "chrome/browser/lookalikes/lookalike_url_service.h"
#include "url/gurl.h"
// These functions exist as utility functions, and are currently used in
// "safety_tip_heuristics". These functions SHOULD NOT be called directly. See
// reputation_service.h for module use.
namespace safety_tips {
// Checks to see whether a given URL qualifies as a lookalike domain, and thus
// should trigger a safety tip. This algorithm factors in the sites that the
// user has already engaged with (|engaged_sites|).
//
// Returns false when no matching domain is found for |navigated_domain|, or
// when the match would already display the lookalike interstitial. For the
// edit-distance match types the result is the value of the corresponding
// kSafetyTipUI feature param; all other match types return true.
//
// This heuristic stores a "safe url" that the navigated domain is a lookalike
// to, in the passed |safe_url|, as "http://<matched domain>". |safe_url| must
// not be null (it is dereferenced without a check) and is written as soon as
// a non-interstitial match is found, i.e. even when an edit-distance match
// subsequently makes the function return false.
//
// |url| is not read by the current implementation.
bool ShouldTriggerSafetyTipFromLookalike(
const GURL& url,
const lookalikes::DomainInfo& navigated_domain,
const std::vector<lookalikes::DomainInfo>& engaged_sites,
GURL* safe_url);
// Checks to see whether a given URL contains sensitive keywords in a way
// that it should trigger a safety tip.
//
// Only the eTLD+1 of |url|'s host, with its registry removed, is examined
// (e.g. "www.google.com" -> "google"), so keywords that appear only in
// subdomains are ignored. That label is split on '-'; the function returns
// true if there is more than one token and any token exactly matches an entry
// in |sensitive_keywords|. A label consisting of a single token never
// triggers, even if it is itself a keyword.
//
// |sensitive_keywords| is an array of |num_keywords| NUL-terminated strings.
// It is searched with a binary search using strcmp() ordering, so it must be
// sorted accordingly.
bool ShouldTriggerSafetyTipFromKeywordInURL(
const GURL& url,
const char* const sensitive_keywords[],
size_t num_keywords);
} // namespace safety_tips
#endif // CHROME_BROWSER_LOOKALIKES_SAFETY_TIPS_LOCAL_HEURISTICS_H_
// Copyright 2019 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <vector>
#include "chrome/browser/lookalikes/safety_tips/local_heuristics.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/gurl.h"
// Test-only helper types live in an unnamed namespace so they cannot collide
// (ODR) with same-named types in other files linked into the test binary.
namespace {

// One table entry for the sensitive-keyword heuristic test.
struct KeywordHeuristicTestCase {
  // URL handed to ShouldTriggerSafetyTipFromKeywordInURL().
  const GURL url;
  // Result the heuristic is expected to return for |url|.
  const bool should_trigger;
};

}  // namespace
// Table-driven check of ShouldTriggerSafetyTipFromKeywordInURL() against a
// small, sorted keyword list.
TEST(SafetyTipHeuristicsTest, SensitiveKeywordsTest) {
  // These keywords must always be in sorted order.
  const std::vector<const char*> keywords = {"bad", "evil", "keyword"};

  const std::vector<KeywordHeuristicTestCase> test_cases = {
      // Verify scheme doesn't affect results.
      {GURL("http://www.bad.com"), false},
      {GURL("https://www.bad.com"), false},
      {GURL("http://bad-domain.com"), true},
      {GURL("https://bad-domain.com"), true},

      // We don't really care about sub-domains for this heuristic, verify this
      // works as expected.
      {GURL("http://www.evil-domain.safe-domain.com"), false},
      {GURL("http://www.safe-domain.evil-domain.com"), true},
      {GURL("http://www.bad-other.edu"), true},
      {GURL("http://bad-keyword.com"), true},
      {GURL("http://www.evil-and-bad.com"), true},

      // Make sure heuristic still works, even for really long domains.
      {GURL("http://"
            "www.super-duper-uber-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-domain-with-a-lot-of-parts-to-it.org"),
       false},
      {GURL("http://"
            "www.super-duper-uber-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-THISISEVIL-evil-THISISEVIL-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-long-long-long-long-long-"
            "long-long-long-long-long-long-long-long-domain-with-a-lot-of-"
            "parts-to-it.org"),
       true},

      // Ensure heuristic doesn't trigger on misspelled keywords.
      {GURL("http://www.misspelled-example-keywrd.edu"), false},
      {GURL("http://www.spelled-right-example-keyword.edu"), true},

      // Make sure passing a lot of keywords doesn't result in a false negative.
      {GURL("http://evil-bad-keyword-example.com"), true},
  };

  for (const auto& test_case : test_cases) {
    // EXPECT (not ASSERT) so that one failing URL does not hide failures in
    // the remaining table entries; the cases are independent of each other.
    EXPECT_EQ(test_case.should_trigger,
              safety_tips::ShouldTriggerSafetyTipFromKeywordInURL(
                  test_case.url, keywords.data(), keywords.size()))
        << "Expected that \"" << test_case.url << "\" should"
        << (test_case.should_trigger ? "" : "n't") << " trigger but it did"
        << (test_case.should_trigger ? "n't" : "");
  }
}
......@@ -10,10 +10,10 @@
#include "base/macros.h"
#include "base/memory/scoped_refptr.h"
#include "base/memory/singleton.h"
#include "base/metrics/field_trial_params.h"
#include "chrome/browser/lookalikes/lookalike_url_interstitial_page.h"
#include "chrome/browser/lookalikes/lookalike_url_navigation_throttle.h"
#include "chrome/browser/lookalikes/lookalike_url_service.h"
#include "chrome/browser/lookalikes/safety_tips/local_heuristics.h"
#include "chrome/browser/lookalikes/safety_tips/safety_tip_ui_helper.h"
#include "chrome/browser/lookalikes/safety_tips/safety_tips_config.h"
#include "chrome/browser/profiles/incognito_helpers.h"
......@@ -21,7 +21,7 @@
#include "components/keyed_service/content/browser_context_dependency_manager.h"
#include "components/keyed_service/content/browser_context_keyed_service_factory.h"
#include "components/safe_browsing/db/v4_protocol_manager_util.h"
#include "components/security_state/core/features.h"
#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
#include "url/url_constants.h"
namespace {
......@@ -29,59 +29,10 @@ namespace {
using chrome_browser_safety_tips::FlaggedPage;
using chrome_browser_safety_tips::UrlPattern;
using lookalikes::DomainInfo;
using lookalikes::LookalikeUrlNavigationThrottle;
using lookalikes::LookalikeUrlService;
using LookalikeMatchType = LookalikeUrlInterstitialPage::MatchType;
using safe_browsing::V4ProtocolManagerUtil;
using safety_tips::ReputationService;
// Boolean feature param "editdistance" on the kSafetyTipUI feature; defaults
// to false. Returned by ShouldTriggerSafetyTipFromLookalike() for
// kEditDistance matches.
const base::FeatureParam<bool> kEnableLookalikeEditDistance{
&security_state::features::kSafetyTipUI, "editdistance", false};
// Boolean feature param "editdistance_siteengagement" on the kSafetyTipUI
// feature; defaults to false. Returned by
// ShouldTriggerSafetyTipFromLookalike() for kEditDistanceSiteEngagement
// matches.
const base::FeatureParam<bool> kEnableLookalikeEditDistanceSiteEngagement{
&security_state::features::kSafetyTipUI, "editdistance_siteengagement",
false};
bool ShouldTriggerSafetyTipFromLookalike(
const GURL& url,
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites,
GURL* safe_url) {
std::string matched_domain;
LookalikeMatchType match_type;
if (!LookalikeUrlNavigationThrottle::GetMatchingDomain(
navigated_domain, engaged_sites, &matched_domain, &match_type)) {
return false;
}
// If we're already displaying an interstitial, don't warn again.
if (LookalikeUrlNavigationThrottle::ShouldDisplayInterstitial(
match_type, navigated_domain)) {
return false;
}
*safe_url = GURL(std::string(url::kHttpScheme) +
url::kStandardSchemeSeparator + matched_domain);
// Edit distance has higher false positives, so it gets its own feature param
if (match_type == LookalikeMatchType::kEditDistance) {
return kEnableLookalikeEditDistance.Get();
}
if (match_type == LookalikeMatchType::kEditDistanceSiteEngagement) {
return kEnableLookalikeEditDistanceSiteEngagement.Get();
}
return true;
}
// TODO(crbug/984725): Implement Keyword Check
// Placeholder for the keyword heuristic: none of the arguments are read and
// the function unconditionally returns false, so this check never triggers a
// safety tip.
bool ShouldTriggerSafetyTipFromKeywordInURL(
const GURL& url,
const DomainInfo& navigated_domain,
const std::vector<DomainInfo>& engaged_sites) {
// TODO(crbug/987754): Record metrics here.
return false;
}
// This factory helps construct and find the singleton ReputationService linked
// to a Profile.
class ReputationServiceFactory : public BrowserContextKeyedServiceFactory {
......@@ -310,8 +261,8 @@ void ReputationService::GetReputationStatusWithEngagedSites(
}
// 5. Keyword heuristics.
if (ShouldTriggerSafetyTipFromKeywordInURL(url, navigated_domain,
engaged_sites)) {
if (ShouldTriggerSafetyTipFromKeywordInURL(
url, top500_domains::kTop500Keywords, 500)) {
std::move(callback).Run(security_state::SafetyTipStatus::kBadKeyword,
IsIgnored(url), url, GURL());
return;
......
......@@ -3688,6 +3688,7 @@ test("unit_tests") {
"../browser/importer/profile_writer_unittest.cc",
"../browser/lifetime/application_lifetime_unittest.cc",
"../browser/lookalikes/lookalike_url_navigation_throttle_unittest.cc",
"../browser/lookalikes/safety_tips/local_heuristics_unittest.cc",
"../browser/lookalikes/safety_tips/reputation_service_unittest.cc",
# Media remoting is not supported on Android for now.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment