Commit f3718864 authored by Joe DeBlasio, committed by Commit Bot

[Top Domains] Permit top 500 list to be... less than 500 domains.

This CL makes some small changes to allow the top 500 list to be less
than 500 domains long. This allows us to remove domains that should no
longer be in the list without having to regenerate the whole thing.

This CL also incidentally fixes a bug wherein some domains were treated
as top-500 domains even though they weren't.

The arrays defined in the generated .cc file are still declared with 500
entries, even though fewer than 500 of them are now populated; the unused
slots are padded with blank entries. This avoids having to regenerate the
header file as well, although regenerating it is eventually the right
fix. I'm hoping we can merge this CL to M84, which necessitates simplicity
and safety over elegance.

Bug: 1083489,1083487
Change-Id: I45a5425cd4a8c4a9135d6f723b08d628f164f7df
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2204554
Commit-Queue: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#769507}
parent 3d535d67
......@@ -88,9 +88,16 @@ bool ShouldSuppressWarning(const GURL& url) {
} // namespace
ReputationService::ReputationService(Profile* profile)
: profile_(profile),
sensitive_keywords_(top500_domains::kTop500Keywords),
num_sensitive_keywords_(base::size(top500_domains::kTop500Keywords)) {}
: profile_(profile), sensitive_keywords_(top500_domains::kTop500Keywords) {
// kTop500Keywords can be padded at the end with blank entries.
for (num_sensitive_keywords_ = 0;
num_sensitive_keywords_ < base::size(top500_domains::kTop500Keywords);
++num_sensitive_keywords_) {
if (strlen(top500_domains::kTop500Keywords[num_sensitive_keywords_]) == 0) {
break;
}
}
}
ReputationService::~ReputationService() = default;
......
......@@ -75,6 +75,11 @@ std::string GetSimilarDomainFromTop500(
for (const std::string& navigated_skeleton : navigated_domain.skeletons) {
for (const char* const top_domain_skeleton :
top500_domains::kTop500EditDistanceSkeletons) {
// kTop500EditDistanceSkeletons may include blank entries.
if (strlen(top_domain_skeleton) == 0) {
continue;
}
if (!IsEditDistanceAtMostOne(base::UTF8ToUTF16(navigated_skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
continue;
......
......@@ -61,9 +61,8 @@ compiled_action("generate_top_domains_test_trie") {
]
outputs = [ "$target_gen_dir/test_domains-trie-inc.cc" ]
# Passing --for_testing flag marks only the first site with is_top_500.
args = rebase_path(inputs, root_build_dir) +
rebase_path(outputs, root_build_dir) + [ "--for_testing" ]
args =
rebase_path(inputs, root_build_dir) + rebase_path(outputs, root_build_dir)
}
executable("make_top_domain_list_variables") {
......
......@@ -498,7 +498,7 @@ zillow.com
zing.vn
znanija.com
zomato.com
###
###END_TOP_500###
01net.com
1000.menu
10086.cn
......
......@@ -509,6 +509,7 @@ zillow.corn, zillow.com
zing.vn, zing.vn
znanija.corn, znanija.com
zornato.corn, zomato.com
###END_TOP_500###
Olnet.corn, 01net.com
lOOO.rnenu, 1000.menu
lOO86.cn, 10086.cn
......
......@@ -20,6 +20,7 @@
// input file to generate 500 skeletons and 500 keywords.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
......@@ -39,7 +40,12 @@
namespace {
const size_t kTopN = 500;
// The size of the arrays generated in top500-domains-inc.cc. Must match that in
// top500_domains.h. If the file has fewer than kMaxEntries eligible top-500
// domains marked (e.g. because some are too short), the generated arrays may be
// padded with blank entries up to kMaxEntries.
const size_t kMaxEntries = 500;
const char* kTop500Separator = "###END_TOP_500###";
void PrintHelp() {
std::cout << "make_top_domain_list_for_edit_distance <input-file>"
......@@ -113,15 +119,20 @@ int main(int argc, char* argv[]) {
std::set<std::string> keywords;
for (std::string line : lines) {
if (skeletons.size() >= kTopN && keywords.size() >= kTopN) {
if (skeletons.size() >= kMaxEntries && keywords.size() >= kMaxEntries) {
break;
}
base::TrimWhitespaceASCII(line, base::TRIM_ALL, &line);
if (line == kTop500Separator) {
break;
}
if (line.empty() || line[0] == '#') {
continue;
}
if (skeletons.size() < kTopN &&
if (skeletons.size() < kMaxEntries &&
url_formatter::top_domains::IsEditDistanceCandidate(line)) {
const std::string skeleton = GetSkeleton(line, spoof_checker.get());
if (skeletons.find(skeleton) == skeletons.end()) {
......@@ -129,7 +140,7 @@ int main(int argc, char* argv[]) {
}
}
if (keywords.size() < kTopN) {
if (keywords.size() < kMaxEntries) {
std::string keywords_for_current_line =
url_formatter::top_domains::HostnameWithoutRegistry(line);
CHECK(keywords_for_current_line.find('.') == std::string::npos);
......@@ -141,15 +152,15 @@ int main(int argc, char* argv[]) {
keywords.insert(keyword);
}
if (keywords.size() >= kTopN) {
if (keywords.size() >= kMaxEntries) {
break;
}
}
}
}
CHECK_EQ(skeletons.size(), kTopN);
CHECK_EQ(keywords.size(), kTopN);
CHECK_LE(skeletons.size(), kMaxEntries);
CHECK_LE(keywords.size(), kMaxEntries);
std::vector<std::string> sorted_skeletons(skeletons.begin(), skeletons.end());
std::sort(sorted_skeletons.begin(), sorted_skeletons.end());
......@@ -157,27 +168,40 @@ int main(int argc, char* argv[]) {
std::vector<std::string> sorted_keywords(keywords.begin(), keywords.end());
std::sort(sorted_keywords.begin(), sorted_keywords.end());
std::string output =
std::ostringstream output_stream;
output_stream <<
R"(#include "components/url_formatter/spoof_checks/top_domains/top500_domains.h"
namespace top500_domains {
const char* const kTop500EditDistanceSkeletons[500] = {
const char* const kTop500EditDistanceSkeletons[)"
<< kMaxEntries << R"(] = {
)";
for (const std::string& skeleton : sorted_skeletons) {
output += ("\"" + skeleton + "\"");
output += ",\n";
output_stream << ("\"" + skeleton + "\"");
output_stream << ",\n";
}
output += R"(};
const char* const kTop500Keywords[500] = {
// Pad any remaining array slots with blank entries.
for (size_t i = skeletons.size(); i < kMaxEntries; ++i) {
output_stream << ("\"\",\n");
}
output_stream << R"(};
const char* const kTop500Keywords[)"
<< kMaxEntries << R"(] = {
)";
for (const std::string& keyword : sorted_keywords) {
output += ("\"" + keyword + "\"");
output += ",\n";
output_stream << ("\"" + keyword + "\"");
output_stream << ",\n";
}
// Pad any remaining array slots with blank entries.
for (size_t i = keywords.size(); i < kMaxEntries; ++i) {
output_stream << ("\"\",\n");
}
output += R"(};
output_stream << R"(};
} // namespace top500_domains)";
std::string output = output_stream.str();
base::FilePath output_path = base::FilePath::FromUTF8Unsafe(argv[2]);
if (base::WriteFile(output_path, output.c_str(),
static_cast<uint32_t>(output.size())) <= 0) {
......
......@@ -23,6 +23,8 @@
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
const char* kTop500Separator = "###END_TOP_500###";
base::FilePath GetPath(base::StringPiece basename) {
base::FilePath path;
base::PathService::Get(base::DIR_SOURCE_ROOT, &path);
......@@ -74,6 +76,13 @@ int GenerateSkeletons(const char* input_file_name,
size_t max_labels = 0;
std::string domain_with_max_labels;
while (std::getline(input, domain)) {
base::TrimWhitespaceASCII(domain, base::TRIM_ALL, &domain);
if (domain == kTop500Separator) {
output += std::string(kTop500Separator) + "\n";
continue;
}
if (domain[0] == '#')
continue;
......
d4000.com
###END_TOP_500###
digklmo68.com
digklmo68.co.uk
islkpx123.com
......
......@@ -10,6 +10,7 @@
# in components/url_formatter/url_formatter.cc.
d4OOO.corn, d4000.com
###END_TOP_500###
digklrno68.corn, digklmo68.com
digklrno68.co.uk, digklmo68.co.uk
islkpxl23.corn, islkpx123.com
......
......@@ -35,11 +35,12 @@ using url_formatter::top_domains::TopDomainStateGenerator;
namespace {
const char* kTop500Separator = "###END_TOP_500###";
// Print the command line help.
void PrintHelp() {
std::cout << "top_domain_generator <input-file>"
<< " <template-file> <output-file> [--for_testing] [--v=1]"
<< std::endl;
<< " <template-file> <output-file> [--v=1]" << std::endl;
}
void CheckName(const std::string& name) {
......@@ -91,15 +92,20 @@ int main(int argc, char* argv[]) {
return 1;
}
const bool for_testing = command_line.HasSwitch("for_testing");
std::vector<std::string> lines = base::SplitString(
input_text, "\n", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
bool is_top_500 = true;
TopDomainEntries entries;
std::set<std::string> skeletons;
for (std::string line : lines) {
base::TrimWhitespaceASCII(line, base::TRIM_ALL, &line);
if (line == kTop500Separator) {
is_top_500 = false;
continue;
}
if (line.empty() || line[0] == '#') {
continue;
}
......@@ -126,12 +132,7 @@ int main(int argc, char* argv[]) {
const GURL domain(std::string("http://") + tokens[1]);
entry->top_domain = domain.host();
// If testing, only mark the first site as "top 500".
if (for_testing) {
entry->is_top_500 = entries.size() < 1;
} else {
entry->is_top_500 = entries.size() < 500;
}
entry->is_top_500 = is_top_500;
CheckName(entry->skeleton);
CheckName(entry->top_domain);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment