Commit 1c7d92d0 authored by Aidan Beggs, committed by Commit Bot

Added the top 500 keywords to the binary, for later use in the keywords heuristic.

This CL adds a set of the top 500 keywords to the binary at compile
time, to enable usage of this information in the phishing detection
keywords heuristic. Additionally, this CL cleans up and refactors the
compile-time processing and embedding of information related to the top
500 keywords, into the binary.

Bug: 1012476
Change-Id: I9d75d962d0425b0f037c87a5d6801c7c9299c476
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1849183
Commit-Queue: Aidan Beggs <beggs@google.com>
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#704458}
parent e19fe5c6
......@@ -83,10 +83,14 @@ std::string GetMatchingSiteEngagementDomain(
}
// Returns the first matching top domain with an edit distance of at most one
// to |domain_and_registry|.
// to |domain_and_registry|. This search is done in lexicographic order on the
// top 500 suitable domains, instead of in order by popularity. This means that
// the resulting "similar" domain may not be the most popular domain that
// matches.
std::string GetSimilarDomainFromTop500(const DomainInfo& navigated_domain) {
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const char* const top_domain_skeleton : top500_domains::kTop500) {
for (const char* const top_domain_skeleton :
top500_domains::kTop500EditDistanceSkeletons) {
if (lookalikes::IsEditDistanceAtMostOne(
base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
......
......@@ -69,9 +69,9 @@ compiled_action("generate_top_domains_test_trie") {
rebase_path(outputs, root_build_dir) + [ "--for_testing" ]
}
executable("make_top_domain_list_for_edit_distance") {
executable("make_top_domain_list_variables") {
sources = [
"make_top_domain_list_for_edit_distance.cc",
"make_top_domain_list_variables.cc",
]
deps = [
":common",
......@@ -108,8 +108,8 @@ source_set("unit_tests") {
# TODO(crbug/915921): Combine this and the previous one into a
# compiled_action_foreach target.
compiled_action("generate_top_domains_for_edit_distance") {
tool = ":make_top_domain_list_for_edit_distance"
compiled_action("generate_top_domain_list_variables_file") {
tool = ":make_top_domain_list_variables"
# Inputs in order expected by the command line of the tool.
inputs = [
......@@ -124,7 +124,7 @@ compiled_action("generate_top_domains_for_edit_distance") {
# top500_domains and top500_domains_header are intentionally separated to remove
# serialized build dependency from some targets to
# generate_top_domains_for_edit_distance action target.
# generate_top_domain_list_variables_file action target.
source_set("top500_domains") {
# This empty public is intentional to remove unnecessary build dependency.
public = []
......@@ -134,7 +134,7 @@ source_set("top500_domains") {
]
deps = [
":generate_top_domains_for_edit_distance",
":generate_top_domain_list_variables_file",
":top500_domains_header",
]
}
......
* domains.list
### Top Domains Utilities
* `domains.list`
A top domain list, one per line. Used as an input to
make_top_domain_skeletons. See http://go/chrome-top-domains-update for update
instructions.
* domains.skeletons
* `domains.skeletons`
The checked-in output of make_top_domain_skeletons. Processed during the
build to generate domains-trie-inc.cc, which is used by
......@@ -14,10 +16,19 @@
$ ninja -C $build_outdir make_top_domain_skeletons
$ $build_outdir/make_top_domain_skeletons
* test_domains.list
* `test_domains.list`
A list of domains to use in IDNToUnicode test instead of the actual
top domain list. Manually edited to match what's in IDNToUnicode test.
* test_domains.skeletons
* `test_domains.skeletons`
Generated output of test_domains.list along with domains.skeletons
by make_top_domain_skeletons.
* `make_top_domain_list_variables.cc` / `top500_domains.h`
`make_top_domain_list_variables.cc` is built into a tool that is run at compile
time to generate information about the top 500 domains (currently, skeletons
and keywords are created from these domains). This information is then
embedded directly into the Chrome binary, and can be accessed via the
variables in the `top500_domains` namespace.
......@@ -2,9 +2,22 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This binary generates an array of top domains suitable for edit distance
// matching. The input is the list of (skeleton, domain) pairs. The output is
// written as a C array of the domains.
// This binary generates two C arrays of useful information related to top
// domains, which we embed directly into the final Chrome binary. The input is
// a list of the top domains. The first output is named
// kTop500EditDistanceSkeletons, containing the skeletons of the top 500
// domains suitable for use in the edit distance heuristic. The second output
// is named kTop500Keywords, containing the top 500 keywords suitable for use
// with the keyword matching heuristic (for instance, www.google.com ->
// google). Both outputs are written to the same file, which will be formatted
// as a C++ source file with valid syntax.
// The C-strings in both of the output arrays are guaranteed to be in
// lexicographically sorted order.
// IMPORTANT: This binary asserts that there are at least enough sites in the
// input file to generate 500 skeletons and 500 keywords.
#include <iostream>
#include <string>
......@@ -96,30 +109,68 @@ int main(int argc, char* argv[]) {
return 1;
}
size_t count = 0;
std::string output =
R"(#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
namespace top500_domains {
const char* const kTop500[500] = {
)";
std::set<std::string> skeletons;
std::set<std::string> keywords;
for (std::string line : lines) {
if (skeletons.size() >= kTopN && keywords.size() >= kTopN) {
break;
}
base::TrimWhitespaceASCII(line, base::TRIM_ALL, &line);
if (line.empty() || line[0] == '#') {
continue;
}
if (count >= kTopN)
break;
if (!url_formatter::top_domains::IsEditDistanceCandidate(line)) {
continue;
if (skeletons.size() < kTopN &&
url_formatter::top_domains::IsEditDistanceCandidate(line)) {
const std::string skeleton = GetSkeleton(line, spoof_checker.get());
if (skeletons.find(skeleton) == skeletons.end()) {
skeletons.insert(skeleton);
}
}
if (keywords.size() < kTopN) {
std::string keyword;
base::TrimString(
url_formatter::top_domains::HostnameWithoutRegistry(line), ".",
&keyword);
CHECK(keyword.find('.') == std::string::npos);
if (keywords.find(keyword) == keywords.end()) {
keywords.insert(keyword);
}
}
count++;
const std::string skeleton = GetSkeleton(line, spoof_checker.get());
output += "\"" + skeleton + "\",\n";
}
CHECK_EQ(skeletons.size(), kTopN);
CHECK_EQ(keywords.size(), kTopN);
std::vector<std::string> sorted_skeletons(skeletons.begin(), skeletons.end());
std::sort(sorted_skeletons.begin(), sorted_skeletons.end());
std::vector<std::string> sorted_keywords(keywords.begin(), keywords.end());
std::sort(sorted_keywords.begin(), sorted_keywords.end());
std::string output =
R"(#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
namespace top500_domains {
const char* const kTop500EditDistanceSkeletons[500] = {
)";
for (const std::string& skeleton : sorted_skeletons) {
output += ("\"" + skeleton + "\"");
output += ",\n";
}
output += R"(};
} // namespace top500_domains
const char* const kTop500Keywords[500] = {
)";
for (const std::string& keyword : sorted_keywords) {
output += ("\"" + keyword + "\"");
output += ",\n";
}
output += R"(};
} // namespace top500_domains)";
base::FilePath output_path = base::FilePath::FromUTF8Unsafe(argv[2]);
if (base::WriteFile(output_path, output.c_str(),
static_cast<uint32_t>(output.size())) <= 0) {
......
......@@ -6,7 +6,9 @@
namespace top500_domains {
extern const char* const kTop500[500];
extern const char* const kTop500EditDistanceSkeletons[500];
extern const char* const kTop500Keywords[500];
} // namespace top500_domains
......
......@@ -17,6 +17,11 @@ const size_t kMinLengthForEditDistance = 5u;
} // namespace
// Reports whether |hostname| may be used for edit distance matching: it must
// be non-empty, and the part left after removing its registry must be at
// least kMinLengthForEditDistance characters long.
bool IsEditDistanceCandidate(const std::string& hostname) {
  // Bail out first so HostnameWithoutRegistry() is never called with an empty
  // hostname.
  if (hostname.empty()) {
    return false;
  }
  const std::string without_registry = HostnameWithoutRegistry(hostname);
  return without_registry.size() >= kMinLengthForEditDistance;
}
std::string HostnameWithoutRegistry(const std::string& hostname) {
DCHECK(!hostname.empty());
const size_t registry_size =
......@@ -27,11 +32,6 @@ std::string HostnameWithoutRegistry(const std::string& hostname) {
return hostname.substr(0, hostname.size() - registry_size);
}
// Reports whether |hostname| may be used for edit distance matching: it must
// be non-empty, and the part left after removing its registry must be at
// least kMinLengthForEditDistance characters long.
bool IsEditDistanceCandidate(const std::string& hostname) {
  // Bail out first so HostnameWithoutRegistry() is never called with an empty
  // hostname.
  if (hostname.empty()) {
    return false;
  }
  const std::string without_registry = HostnameWithoutRegistry(hostname);
  return without_registry.size() >= kMinLengthForEditDistance;
}
} // namespace top_domains
} // namespace url_formatter
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment