Commit 9e93f982, authored by git-meacer.google.com, committed by Commit Bot

Use a huffman trie for top domain storage in url formatter

UrlFormatter uses ICU's spoof checker to determine lookalike domains that contain unicode confusables. It does this by extracting a skeleton string from the given domain representing its visual appearance. For example, google.com and googlé[.]com have the same skeleton string (google.corn).

In addition to this, we want to display a "Did you mean to go to..." UI for navigations involving IDN if the domain name matches a top 10K domain. In order to do that, we need to store the domains associated with ICU skeletons.

UrlFormatter currently uses a DAFSA to store the list of the skeletons of the top 10K domains. It doesn't and cannot store the actual domain in this list. To support this, this CL changes the underlying storage from DAFSA to the Huffman Trie used by net's preload list code.

It
- Generates the huffman trie from top domain list during compile time.
- Decodes the huffman trie during runtime in IDNSpoofChecker::SimilarToTopDomains.

The design doc for the preload list migration is here: https://docs.google.com/document/d/11rqIozUDaK6DvNeu436SL3Coj65J5vhD9HVftOi-RrA/edit

As mentioned in the doc, micro benchmarks indicate that binary size and speed are minimally impacted by this change (51KB additional size, 4 microseconds of additional time for each lookup).

Bug: 843361
Change-Id: If98b8161bf836fec6ba74e68587bd2159f4eb3d5
Reviewed-on: https://chromium-review.googlesource.com/1106539
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Reviewed-by: Nick Harper <nharper@chromium.org>
Reviewed-by: Peter Kasting <pkasting@chromium.org>
Cr-Commit-Position: refs/heads/master@{#572055}
parent 1bea478e
......@@ -25,8 +25,9 @@ static_library("url_formatter") {
deps = [
"//base",
"//base:i18n",
"//components/url_formatter/top_domains",
"//components/url_formatter/top_domains:generate_top_domains_trie",
"//net",
"//net:preload_decoder",
"//third_party/icu",
"//ui/gfx",
"//url",
......@@ -48,8 +49,9 @@ source_set("unit_tests") {
deps = [
":url_formatter",
"//base",
"//components/url_formatter/top_domains",
"//components/url_formatter/top_domains:generate_top_domains_test_trie",
"//net",
"//net:preload_decoder",
"//testing/gtest",
"//ui/gfx",
"//url",
......
......@@ -22,6 +22,45 @@ namespace url_formatter {
namespace {
// Decoder for the Huffman-encoded top domain trie. Trie keys are skeletons;
// the payload of each entry describes the top domain the skeleton belongs to.
//
// Entry layout, as consumed by ReadEntry():
//   1 bit   is_same_skeleton: 1 means the top domain equals the skeleton and
//           nothing else follows.
//   1 bit   has_com_suffix: 1 means ".com" must be appended to the decoded
//           characters.
//   n chars Huffman-coded characters of the top domain, terminated by
//           kEndOfTable.
class TopDomainPreloadDecoder : public net::extras::PreloadDecoder {
 public:
  using net::extras::PreloadDecoder::PreloadDecoder;
  ~TopDomainPreloadDecoder() override {}

  // Reads one entry from |reader|. The whole entry is always consumed so that
  // the caller can keep decoding after it. |*out_found| is set to true only
  // when |current_search_offset| is 0, i.e. when the complete |search| string
  // has been matched; an entry reached while part of |search| is still
  // unmatched is only a suffix match and must not count as a hit.
  // Returns false if the underlying data is truncated or malformed.
  bool ReadEntry(net::extras::PreloadDecoder::BitReader* reader,
                 const std::string& search,
                 size_t current_search_offset,
                 bool* out_found) override {
    bool is_same_skeleton;
    if (!reader->Next(&is_same_skeleton))
      return false;

    if (!is_same_skeleton) {
      bool has_com_suffix = false;
      if (!reader->Next(&has_com_suffix))
        return false;

      std::string top_domain;
      for (;;) {
        char c;
        // A failed decode leaves |c| unspecified; bail out instead of
        // looping on garbage.
        if (!huffman_decoder().Decode(reader, &c))
          return false;
        if (c == net::extras::PreloadDecoder::kEndOfTable)
          break;
        top_domain += c;
      }
      if (has_com_suffix)
        top_domain += ".com";

      if (current_search_offset == 0)
        DCHECK(!top_domain.empty());
    }

    if (current_search_offset == 0)
      *out_found = true;
    return true;
  }
};
void OnThreadTermination(void* regex_matcher) {
delete reinterpret_cast<icu::RegexMatcher*>(regex_matcher);
}
......@@ -32,13 +71,20 @@ base::ThreadLocalStorage::Slot& DangerousPatternTLS() {
return *dangerous_pattern_tls;
}
#include "components/url_formatter/top_domains/alexa_skeletons-inc.cc"
#include "components/url_formatter/top_domains/alexa_domains-trie-inc.cc"
// All the domains in the above file have 3 or fewer labels.
const size_t kNumberOfLabelsToCheck = 3;
const unsigned char* g_graph = kDafsa;
size_t g_graph_length = sizeof(kDafsa);
IDNSpoofChecker::HuffmanTrieParams g_trie_params{
kTopDomainsHuffmanTree, sizeof(kTopDomainsHuffmanTree), kTopDomainsTrie,
kTopDomainsTrieBits, kTopDomainsRootPosition};
bool LookupMatchInTopDomains(const icu::UnicodeString& ustr_skeleton) {
TopDomainPreloadDecoder preload_decoder(
g_trie_params.huffman_tree, g_trie_params.huffman_tree_size,
g_trie_params.trie, g_trie_params.trie_bits,
g_trie_params.trie_root_position);
std::string skeleton;
ustr_skeleton.toUTF8String(skeleton);
DCHECK_NE(skeleton.back(), '.');
......@@ -52,10 +98,15 @@ bool LookupMatchInTopDomains(const icu::UnicodeString& ustr_skeleton) {
while (labels.size() > 1) {
std::string partial_skeleton = base::JoinString(labels, ".");
if (net::LookupStringInFixedSet(
g_graph, g_graph_length, partial_skeleton.data(),
partial_skeleton.length()) != net::kDafsaNotFound)
bool match = false;
bool decoded = preload_decoder.Decode(partial_skeleton, &match);
DCHECK(decoded);
if (!decoded)
return false;
if (match)
return true;
labels.erase(labels.begin());
}
return false;
......@@ -439,15 +490,17 @@ void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {
uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);
}
void IDNSpoofChecker::RestoreTopDomainGraphToDefault() {
g_graph = kDafsa;
g_graph_length = sizeof(kDafsa);
// static
void IDNSpoofChecker::SetTrieParamsForTesting(
const HuffmanTrieParams& trie_params) {
g_trie_params = trie_params;
}
void IDNSpoofChecker::SetTopDomainGraph(base::StringPiece domain_graph) {
DCHECK_NE(0u, domain_graph.length());
g_graph = reinterpret_cast<const unsigned char*>(domain_graph.data());
g_graph_length = domain_graph.length();
// static
void IDNSpoofChecker::RestoreTrieParamsForTesting() {
g_trie_params = HuffmanTrieParams{
kTopDomainsHuffmanTree, sizeof(kTopDomainsHuffmanTree), kTopDomainsTrie,
kTopDomainsTrieBits, kTopDomainsRootPosition};
}
} // namespace url_formatter
......@@ -11,6 +11,8 @@
#include "base/gtest_prod_util.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece_forward.h"
#include "net/extras/preload_data/decoder.h"
#include "third_party/icu/source/common/unicode/uniset.h"
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/common/unicode/uversion.h"
......@@ -35,6 +37,13 @@ FORWARD_DECLARE_TEST(UrlFormatterTest, IDNToUnicode);
class IDNSpoofChecker {
public:
// Bundles the values handed to the trie decoder's constructor so that the
// data set used for lookups can be swapped as a unit (see
// SetTrieParamsForTesting / RestoreTrieParamsForTesting).
struct HuffmanTrieParams {
// Serialized Huffman tree and its size in bytes.
const uint8_t* huffman_tree;
size_t huffman_tree_size;
// Encoded trie data.
const uint8_t* trie;
// Length of the trie data, expressed in bits.
size_t trie_bits;
// Position of the trie root inside |trie|.
size_t trie_root_position;
};
IDNSpoofChecker();
~IDNSpoofChecker();
......@@ -62,8 +71,8 @@ class IDNSpoofChecker {
bool IsMadeOfLatinAlikeCyrillic(const icu::UnicodeString& label);
// Used for unit tests.
static void RestoreTopDomainGraphToDefault();
static void SetTopDomainGraph(base::StringPiece domain_graph);
static void SetTrieParamsForTesting(const HuffmanTrieParams& trie_params);
static void RestoreTrieParamsForTesting();
USpoofChecker* checker_;
icu::UnicodeSet deviation_characters_;
......
......@@ -2,26 +2,12 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
action_foreach("top_domains") {
script = "//net/tools/dafsa/make_dafsa.py"
sources = [
"alexa_skeletons.gperf",
"test_skeletons.gperf",
]
outputs = [
"${target_gen_dir}/{{source_name_part}}-inc.cc",
]
args = [
"{{source}}",
rebase_path("${target_gen_dir}/{{source_name_part}}-inc.cc",
root_build_dir),
]
}
import("//build/compiled_action.gni")
if (!is_ios && !is_android) {
executable("make_top_domain_gperf") {
executable("make_top_domain_skeletons") {
sources = [
"make_top_domain_gperf.cc",
"make_top_domain_skeletons.cc",
]
deps = [
......@@ -31,3 +17,56 @@ if (!is_ios && !is_android) {
]
}
}
# Tool used by the compiled_action targets in this file to turn a skeleton
# list plus a template into a generated "-trie-inc.cc" file.
executable("top_domain_generator") {
sources = [
"top_domain_generator.cc",
"top_domain_state_generator.cc",
"top_domain_state_generator.h",
"trie_entry.cc",
"trie_entry.h",
]
deps = [
"//base",
"//build/config:exe_and_shlib_deps",
"//net/tools/huffman_trie:huffman_trie_generator_sources",
]
# iOS builds additionally link against UIKit.
if (is_ios) {
libs = [ "UIKit.framework" ]
}
}
# Runs top_domain_generator over alexa_domains.skeletons and the trie template
# to produce alexa_domains-trie-inc.cc in the target's gen directory.
compiled_action("generate_top_domains_trie") {
tool = ":top_domain_generator"
# Inputs in order expected by the command line of the tool.
inputs = [
"//components/url_formatter/top_domains/alexa_domains.skeletons",
"//components/url_formatter/top_domains/top_domains_trie.template",
]
outputs = [
"$target_gen_dir/alexa_domains-trie-inc.cc",
]
# Command line: <input-file> <template-file> <output-file>.
args =
# Make sure the inputs are system-absolute, as base::File cannot open
# files with ".." components.
rebase_path(inputs, "", "/") + rebase_path(outputs, root_build_dir)
}
# TODO: Combine this and the previous one into a compiled_action_foreach target.
# Same as generate_top_domains_trie, but reads test_domains.skeletons and
# writes test_domains-trie-inc.cc.
compiled_action("generate_top_domains_test_trie") {
tool = ":top_domain_generator"
# Inputs in order expected by the command line of the tool.
inputs = [
"//components/url_formatter/top_domains/test_domains.skeletons",
"//components/url_formatter/top_domains/top_domains_trie.template",
]
outputs = [
"$target_gen_dir/test_domains-trie-inc.cc",
]
# Command line: <input-file> <template-file> <output-file>.
args =
# Make sure the inputs are system-absolute, as base::File cannot open
# files with ".." components.
rebase_path(inputs, "", "/") + rebase_path(outputs, root_build_dir)
}
This diff is collapsed.
......@@ -17,6 +17,7 @@
#include "base/path_service.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
......@@ -50,8 +51,9 @@ bool WriteToFile(const std::string& content, base::StringPiece basename) {
return succeeded;
}
int GenerateDasfa(const char* input_file_name,
const USpoofChecker* spoof_checker) {
int GenerateSkeletons(const char* input_file_name,
const char* output_file_name,
const USpoofChecker* spoof_checker) {
base::FilePath input_file = GetPath(input_file_name);
std::string input_content;
if (!base::ReadFileToString(input_file, &input_content)) {
......@@ -62,16 +64,17 @@ int GenerateDasfa(const char* input_file_name,
std::stringstream input(input_content);
std::string output =
R"(// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
R"(# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
// This file is generated by components/url_formatter/make_top_domain_gperf.cc
// DO NOT MANUALLY EDIT!
# This file is generated by
# components/url_formatter/make_top_domain_skeletons.cc
# DO NOT MANUALLY EDIT!
# Each entry is the skeleton of a top domain for the confusability check
# in components/url_formatter/url_formatter.cc.
// Each entry is the skeleton of a top domain for the confusability check
// in components/url_formatter/url_formatter.cc.
%%
)";
std::string domain;
......@@ -83,9 +86,9 @@ int GenerateDasfa(const char* input_file_name,
std::string skeleton = GetSkeleton(domain, spoof_checker);
if (skeleton.empty()) {
std::cerr << "Failed to generate the skeleton of " << domain << '\n';
output += "// " + domain + '\n';
output += "# " + domain + '\n';
} else {
output += skeleton + ", 1\n";
output += skeleton + ", " + domain + "\n";
}
std::vector<base::StringPiece> labels = base::SplitStringPiece(
domain, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
......@@ -95,13 +98,6 @@ int GenerateDasfa(const char* input_file_name,
}
}
output += "%%\n";
std::string output_file_name(input_file_name);
base::ReplaceSubstringsAfterOffset(&output_file_name, 0, "domain",
"skeleton");
base::ReplaceSubstringsAfterOffset(&output_file_name, 0, "list", "gperf");
if (!WriteToFile(output, output_file_name))
return 1;
......@@ -129,6 +125,8 @@ int main(int argc, const char** argv) {
<< u_errorName(status) << ".\n";
return 1;
}
GenerateDasfa("alexa_domains.list", spoof_checker.get());
GenerateDasfa("test_domains.list", spoof_checker.get());
GenerateSkeletons("alexa_domains.list", "alexa_domains.skeletons",
spoof_checker.get());
GenerateSkeletons("test_domains.list", "test_domains.skeletons",
spoof_checker.get());
}
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# This file is generated by components/url_formatter/make_top_domain_skeletons.cc
# DO NOT MANUALLY EDIT!
# Each entry is the skeleton of a top domain for the confusability check
# in components/url_formatter/url_formatter.cc.
digklrno68.corn, digklmo68.com
digklrno68.co.uk, digklmo68.co.uk
islkpxl23.corn, islkpx123.com
isikpxl23.corn, isikpx123.com
os345.corn, os345.com
woder.corn, woder.com
wrnhtb.corn, wmhtb.com
phktb.corn, phktb.com
pkawx.corn, pkawx.com
wrnnr.corn, wmnr.com
rf.corn, rf.com
cyxe.corn, cyxe.com
ldg.corn, ldg.com
idg.corn, idg.com
ig.corn, ig.com
ld.corn, ld.com
lgd.corn, 1gd.com
cegjo.corn, cegjo.com
wsws.corn, wsws.com
wsu.corn, wsu.com
wsou.corn, wsou.com
l23456789O.corn, 1234567890.com
aece.corn, aece.com
aen.corn, aen.com
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This binary generates a Huffman encoded trie from the top domain skeleton
// list. The keys of the trie are skeletons and the values are the corresponding
// top domains.
//
// The input is the list of (skeleton, domain) pairs. The output is written
// using the given template file.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/command_line.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "components/url_formatter/top_domains/top_domain_state_generator.h"
#include "components/url_formatter/top_domains/trie_entry.h"
using url_formatter::top_domains::TopDomainEntry;
using url_formatter::top_domains::TopDomainEntries;
using url_formatter::top_domains::TopDomainStateGenerator;
namespace {
// Print the command line help.
// Writes the one-line usage string to stdout and flushes it.
void PrintHelp() {
  static const char kUsage[] =
      "top_domain_generator <input-file> <template-file> <output-file> "
      "[--v=1]";
  std::cout << kUsage << std::endl;
}
// Aborts (via CHECK) if |name| contains any character other than ASCII
// letters, ASCII digits, '.', '-' or '_'. The failure message includes the
// offending |name|. An empty |name| passes, since the loop body never runs.
void CheckName(const std::string& name) {
for (char c : name) {
CHECK((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') || c == '.' || c == '-' || c == '_')
<< name << " has invalid characters.";
}
}
} // namespace
// Entry point. Usage:
//   top_domain_generator <input-file> <template-file> <output-file> [--v=1]
//
// Reads "skeleton, domain" pairs from <input-file>, builds the Huffman trie
// source using <template-file> and writes the result to <output-file>.
// Returns 0 on success and 1 on any error.
int main(int argc, char* argv[]) {
  base::CommandLine::Init(argc, argv);
  const base::CommandLine& command_line =
      *base::CommandLine::ForCurrentProcess();

  logging::LoggingSettings settings;
  settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
  logging::InitLogging(settings);

  // Collect the positional (non-switch) arguments as UTF-8 strings.
#if defined(OS_WIN)
  std::vector<std::string> args;
  base::CommandLine::StringVector wide_args = command_line.GetArgs();
  for (const auto& arg : wide_args) {
    args.push_back(base::WideToUTF8(arg));
  }
#else
  base::CommandLine::StringVector args = command_line.GetArgs();
#endif
  if (args.size() < 3) {
    PrintHelp();
    return 1;
  }

  // Paths are taken from |args| (the same list that was just validated)
  // rather than from raw argv, so that switches such as --v=1 may appear
  // anywhere on the command line without shifting the positional arguments.
  base::FilePath input_path = base::FilePath::FromUTF8Unsafe(args[0]);
  if (!base::PathExists(input_path)) {
    LOG(ERROR) << "Input path doesn't exist: " << input_path;
    return 1;
  }

  std::string input_text;
  if (!base::ReadFileToString(input_path, &input_text)) {
    LOG(ERROR) << "Could not read input file: " << input_path;
    return 1;
  }

  std::vector<std::string> lines = base::SplitString(
      input_text, "\n", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

  TopDomainEntries entries;
  std::set<std::string> skeletons;
  for (std::string line : lines) {
    base::TrimWhitespaceASCII(line, base::TRIM_ALL, &line);
    // Skip blank lines and '#' comments.
    if (line.empty() || line[0] == '#') {
      continue;
    }

    std::vector<std::string> tokens = base::SplitString(
        line, ",", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
    // Report the raw line: |tokens| may be empty here, so it cannot be
    // indexed safely in the failure message.
    CHECK_EQ(2u, tokens.size()) << "Invalid line: " << line;
    const std::string& skeleton = tokens[0];

    // insert() reports whether the skeleton was new. If another site already
    // produced the same skeleton, simply ignore this one, as we already have
    // a top domain corresponding to this skeleton.
    if (!skeletons.insert(skeleton).second) {
      continue;
    }

    // TODO: Should we lowercase these?
    auto entry = std::make_unique<TopDomainEntry>();
    entry->skeleton = skeleton;
    entry->top_domain = tokens[1];

    CheckName(entry->skeleton);
    CheckName(entry->top_domain);

    entries.push_back(std::move(entry));
  }

  base::FilePath template_path = base::FilePath::FromUTF8Unsafe(args[1]);
  if (!base::PathExists(template_path)) {
    LOG(ERROR) << "Template file doesn't exist: " << template_path;
    return 1;
  }
  template_path = base::MakeAbsoluteFilePath(template_path);

  std::string template_string;
  if (!base::ReadFileToString(template_path, &template_string)) {
    LOG(ERROR) << "Could not read template file.";
    return 1;
  }

  TopDomainStateGenerator generator;
  std::string output = generator.Generate(template_string, entries);
  if (output.empty()) {
    LOG(ERROR) << "Trie generation failed.";
    return 1;
  }

  base::FilePath output_path = base::FilePath::FromUTF8Unsafe(args[2]);
  if (base::WriteFile(output_path, output.c_str(),
                      static_cast<uint32_t>(output.size())) <= 0) {
    LOG(ERROR) << "Failed to write output: " << output_path;
    return 1;
  }

  return 0;
}
\ No newline at end of file
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_formatter/top_domains/top_domain_state_generator.h"
#include <cstdint>
#include <memory>
#include <string>
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "net/tools/huffman_trie/huffman/huffman_builder.h"
#include "net/tools/huffman_trie/trie/trie_bit_buffer.h"
#include "net/tools/huffman_trie/trie/trie_writer.h"
using net::huffman_trie::HuffmanRepresentationTable;
using net::huffman_trie::HuffmanBuilder;
using net::huffman_trie::TrieWriter;
namespace url_formatter {
namespace top_domains {
namespace {
static const char kNewLine[] = "\n";
static const char kIndent[] = " ";
// Substitutes |value| for the first "[[" + name + "]]" placeholder found in
// |*tpl|. Returns true if a placeholder was found and replaced, and false
// (leaving |*tpl| untouched) otherwise.
bool ReplaceTag(const std::string& name,
                const std::string& value,
                std::string* tpl) {
  const std::string placeholder = "[[" + name + "]]";
  const size_t where = tpl->find(placeholder);
  const bool present = (where != std::string::npos);
  if (present)
    tpl->replace(where, placeholder.length(), value);
  return present;
}
// Renders |bytes| as a C++ array initializer ("{ ... }") and returns it.
// Each byte is printed as "0x%02x,"; a new indented line is started after
// every 12th byte, and bytes on the same line are separated by one space.
std::string FormatVectorAsArray(const std::vector<uint8_t>& bytes) {
  constexpr size_t kBytesPerLine = 12;
  // Newline followed by a double indent: starts every line of byte values.
  const std::string line_start = std::string(kNewLine) + kIndent + kIndent;

  std::string result = "{" + line_start;
  const size_t count = bytes.size();
  for (size_t index = 0; index < count; ++index) {
    base::StringAppendF(&result, "0x%02x,", bytes[index]);

    // Nothing follows the final byte except the closing brace.
    const size_t written = index + 1;
    if (written == count)
      break;

    if (written % kBytesPerLine == 0)
      result.append(line_start);
    else
      result.append(" ");
  }

  result.append(kNewLine);
  result.append("}");
  return result;
}
// Builds a first-guess Huffman table by recording, for every entry, each
// character of its skeleton and of its top domain, plus one terminal value
// and one end-of-table value.
HuffmanRepresentationTable ApproximateHuffman(const TopDomainEntries& entries) {
  HuffmanBuilder builder;
  for (const auto& item : entries) {
    const std::string& skeleton = item->skeleton;
    for (size_t i = 0; i < skeleton.size(); ++i)
      builder.RecordUsage(skeleton[i]);

    const std::string& domain = item->top_domain;
    for (size_t i = 0; i < domain.size(); ++i)
      builder.RecordUsage(domain[i]);

    builder.RecordUsage(net::huffman_trie::kTerminalValue);
    builder.RecordUsage(net::huffman_trie::kEndOfTableValue);
  }
  return builder.ToTable();
}
} // namespace
// Construction and destruction are compiler-generated; the class declares no
// data members of its own.
TopDomainStateGenerator::TopDomainStateGenerator() = default;
TopDomainStateGenerator::~TopDomainStateGenerator() = default;
// Fills |preload_template| with the Huffman tree, the encoded trie, the trie
// length in bits and the trie root position computed from |entries|, and
// returns the result. Returns the empty string if writing the trie fails or
// if any of the four placeholders is missing from the template.
std::string TopDomainStateGenerator::Generate(
    const std::string& preload_template,
    const TopDomainEntries& entries) {
  std::string output = preload_template;

  // The trie generation process for the whole data is run twice, the first time
  // using an approximate Huffman table. During this first run, the correct
  // character frequencies are collected which are then used to calculate the
  // most space efficient Huffman table for the given inputs. This table is used
  // for the second run.
  HuffmanRepresentationTable approximate_table = ApproximateHuffman(entries);
  HuffmanBuilder huffman_builder;

  // Create trie entries for the first pass. |trie_entries| owns the objects;
  // |raw_trie_entries| is the non-owning view handed to the writer.
  std::vector<std::unique_ptr<TopDomainTrieEntry>> trie_entries;
  std::vector<net::huffman_trie::TrieEntry*> raw_trie_entries;
  for (const auto& entry : entries) {
    auto trie_entry = std::make_unique<TopDomainTrieEntry>(
        approximate_table, &huffman_builder, entry.get());
    raw_trie_entries.push_back(trie_entry.get());
    trie_entries.push_back(std::move(trie_entry));
  }

  TrieWriter writer(approximate_table, &huffman_builder);
  uint32_t root_position;
  if (!writer.WriteEntries(raw_trie_entries, &root_position))
    return std::string();

  HuffmanRepresentationTable optimal_table = huffman_builder.ToTable();
  TrieWriter new_writer(optimal_table, &huffman_builder);

  // Create trie entries using the optimal table for the second pass. The raw
  // pointers are dropped before the owning vector is cleared.
  raw_trie_entries.clear();
  trie_entries.clear();
  for (const auto& entry : entries) {
    auto trie_entry = std::make_unique<TopDomainTrieEntry>(
        optimal_table, &huffman_builder, entry.get());
    raw_trie_entries.push_back(trie_entry.get());
    trie_entries.push_back(std::move(trie_entry));
  }

  if (!new_writer.WriteEntries(raw_trie_entries, &root_position))
    return std::string();

  // Capture the position before Flush() is called on the writer.
  uint32_t new_length = new_writer.position();
  std::vector<uint8_t> huffman_tree = huffman_builder.ToVector();
  new_writer.Flush();

  // Every placeholder must be present; a template that lacks one would
  // otherwise yield output with data silently missing.
  const bool all_tags_replaced =
      ReplaceTag("HUFFMAN_TREE", FormatVectorAsArray(huffman_tree), &output) &&
      ReplaceTag("TOP_DOMAINS_TRIE", FormatVectorAsArray(new_writer.bytes()),
                 &output) &&
      ReplaceTag("TOP_DOMAINS_TRIE_BITS", base::NumberToString(new_length),
                 &output) &&
      ReplaceTag("TOP_DOMAINS_TRIE_ROOT", base::NumberToString(root_position),
                 &output);
  if (!all_tags_replaced)
    return std::string();

  return output;
}
} // namespace top_domains
} // namespace url_formatter
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_URL_FORMATTER_TOP_DOMAINS_TOP_DOMAIN_STATE_GENERATOR_H_
#define COMPONENTS_URL_FORMATTER_TOP_DOMAINS_TOP_DOMAIN_STATE_GENERATOR_H_
#include <string>
#include "components/url_formatter/top_domains/trie_entry.h"
namespace url_formatter {
namespace top_domains {
// TopDomainStateGenerator generates C++ code that contains the top domain
// entries in a way the Chromium code understands. The code that reads the
// output can be found in components/url_formatter/idn_spoof_checker.cc.
// The output gets compiled into the binary.
//
// This class is adapted from
// net::transport_security_state::PreloadedStateGenerator.
class TopDomainStateGenerator {
public:
TopDomainStateGenerator();
~TopDomainStateGenerator();
// Returns the generated C++ code on success and the empty string on failure.
// |template_string| is the template text whose "[[...]]" placeholders are
// filled in; |entries| are the (skeleton, top domain) pairs to encode.
std::string Generate(const std::string& template_string,
const TopDomainEntries& entries);
};
} // namespace top_domains
} // namespace url_formatter
#endif // COMPONENTS_URL_FORMATTER_TOP_DOMAINS_TOP_DOMAIN_STATE_GENERATOR_H_
// kTopDomainsHuffmanTree describes a Huffman tree. The nodes of the tree are
// pairs of uint8s. The last node in the array is the root of the tree. Each pair
// is two uint8_t values, the first is "left" and the second is "right". If a
// uint8_t value has the MSB set then it represents a literal leaf value.
// Otherwise it's a pointer to the n'th element of the array.
static const uint8_t kTopDomainsHuffmanTree[] = [[HUFFMAN_TREE]];
static const uint8_t kTopDomainsTrie[] = [[TOP_DOMAINS_TRIE]];
static const unsigned kTopDomainsTrieBits = [[TOP_DOMAINS_TRIE_BITS]];
static const unsigned kTopDomainsRootPosition = [[TOP_DOMAINS_TRIE_ROOT]];
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_formatter/top_domains/trie_entry.h"
#include "base/strings/string_util.h"
#include "net/tools/huffman_trie/trie/trie_bit_buffer.h"
#include "net/tools/huffman_trie/trie/trie_writer.h"
namespace url_formatter {
namespace top_domains {
// Stores the three arguments without copying what they refer to:
// |huffman_table| is kept as a reference, |huffman_builder| and |entry| as
// raw pointers.
// NOTE(review): all three presumably need to outlive this object - confirm
// against callers.
TopDomainTrieEntry::TopDomainTrieEntry(
const net::huffman_trie::HuffmanRepresentationTable& huffman_table,
net::huffman_trie::HuffmanBuilder* huffman_builder,
TopDomainEntry* entry)
: huffman_table_(huffman_table),
huffman_builder_(huffman_builder),
entry_(entry) {}
// Nothing to release: the entry only holds a reference and raw pointers.
TopDomainTrieEntry::~TopDomainTrieEntry() = default;
// Returns a copy of the wrapped entry's skeleton.
std::string TopDomainTrieEntry::name() const {
return entry_->skeleton;
}
// Serializes this entry's payload into |writer|:
//   - a single 1 bit if the top domain is identical to the skeleton;
//   - otherwise a 0 bit, a bit telling whether a trailing ".com" was
//     stripped, the remaining characters of the top domain, and finally
//     kEndOfTableValue as terminator.
// Always returns true.
bool TopDomainTrieEntry::WriteEntry(
    net::huffman_trie::TrieBitBuffer* writer) const {
  const bool same_as_skeleton = (entry_->skeleton == entry_->top_domain);
  if (same_as_skeleton) {
    writer->WriteBit(1);
    return true;
  }
  writer->WriteBit(0);

  // With the current top 10,000 domains, stripping the ".com" suffix reduces
  // the additional binary size required for the trie from 71 kB to 59 kB.
  std::string remainder = entry_->top_domain;
  const bool has_com_suffix = base::EndsWith(
      remainder, ".com", base::CompareCase::INSENSITIVE_ASCII);
  if (has_com_suffix) {
    writer->WriteBit(1);
    remainder = remainder.substr(0, remainder.size() - 4);
  } else {
    writer->WriteBit(0);
  }

  for (size_t i = 0; i < remainder.size(); ++i)
    writer->WriteChar(remainder[i], huffman_table_, huffman_builder_);
  writer->WriteChar(net::huffman_trie::kEndOfTableValue, huffman_table_,
                    huffman_builder_);
  return true;
}
} // namespace top_domains
} // namespace url_formatter
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_URL_FORMATTER_TOP_DOMAINS_TRIE_ENTRY_H_
#define COMPONENTS_URL_FORMATTER_TOP_DOMAINS_TRIE_ENTRY_H_
#include <string>
#include <vector>
#include "net/tools/huffman_trie/huffman/huffman_builder.h"
#include "net/tools/huffman_trie/trie_entry.h"
namespace url_formatter {
namespace top_domains {
// One (skeleton, top domain) pair read from the input list.
struct TopDomainEntry {
// Skeleton string; returned by TopDomainTrieEntry::name().
std::string skeleton;
// The top domain associated with |skeleton|.
std::string top_domain;
};
// net::huffman_trie::TrieEntry implementation that wraps a TopDomainEntry.
class TopDomainTrieEntry : public net::huffman_trie::TrieEntry {
public:
// None of the arguments are copied or owned; see the member comments below.
explicit TopDomainTrieEntry(
const net::huffman_trie::HuffmanRepresentationTable& huffman_table,
net::huffman_trie::HuffmanBuilder* huffman_builder,
TopDomainEntry* entry);
~TopDomainTrieEntry() override;
// huffman_trie::TrieEntry:
std::string name() const override;
bool WriteEntry(net::huffman_trie::TrieBitBuffer* writer) const override;
private:
// Held by reference, not copied.
const net::huffman_trie::HuffmanRepresentationTable& huffman_table_;
// Not owned.
net::huffman_trie::HuffmanBuilder* huffman_builder_;
// Not owned.
TopDomainEntry* entry_;
};
using TopDomainEntries = std::vector<std::unique_ptr<TopDomainEntry>>;
} // namespace top_domains
} // namespace url_formatter
#endif // COMPONENTS_URL_FORMATTER_TOP_DOMAINS_TRIE_ENTRY_H_
......@@ -993,14 +993,18 @@ void CheckAdjustedOffsets(const std::string& url_string,
}
namespace test {
#include "components/url_formatter/top_domains/test_skeletons-inc.cc"
#include "components/url_formatter/top_domains/test_domains-trie-inc.cc"
}
} // namespace
TEST(UrlFormatterTest, IDNToUnicode) {
IDNSpoofChecker::SetTopDomainGraph(base::StringPiece(
reinterpret_cast<const char*>(test::kDafsa), sizeof(test::kDafsa)));
IDNSpoofChecker::HuffmanTrieParams trie_params{
test::kTopDomainsHuffmanTree, sizeof(test::kTopDomainsHuffmanTree),
test::kTopDomainsTrie, test::kTopDomainsTrieBits,
test::kTopDomainsRootPosition};
IDNSpoofChecker::SetTrieParamsForTesting(trie_params);
for (size_t i = 0; i < arraysize(idn_cases); i++) {
base::string16 output(IDNToUnicode(idn_cases[i].input));
base::string16 expected(idn_cases[i].unicode_allowed
......@@ -1009,7 +1013,7 @@ TEST(UrlFormatterTest, IDNToUnicode) {
EXPECT_EQ(expected, output) << "input # " << i << ": \""
<< idn_cases[i].input << "\"";
}
IDNSpoofChecker::RestoreTopDomainGraphToDefault();
IDNSpoofChecker::RestoreTrieParamsForTesting();
}
TEST(UrlFormatterTest, FormatUrl) {
......
......@@ -269,8 +269,6 @@ component("net") {
"der/parser.h",
"der/tag.cc",
"der/tag.h",
"extras/preload_data/decoder.cc",
"extras/preload_data/decoder.h",
"http/http_auth_challenge_tokenizer.cc",
"http/http_auth_challenge_tokenizer.h",
"http/http_auth_scheme.cc",
......@@ -2133,6 +2131,7 @@ source_set("net_deps") {
public_deps = [
":constants",
":net_resources",
":preload_decoder",
"//base",
"//net/base/registry_controlled_domains",
"//third_party/protobuf:protobuf_lite",
......@@ -2266,6 +2265,16 @@ if (!is_proto_quic) {
}
}
static_library("preload_decoder") {
sources = [
"extras/preload_data/decoder.cc",
"extras/preload_data/decoder.h",
]
deps = [
"//base",
]
}
if (!is_ios) {
executable("dump_cache") {
testonly = true
......
......@@ -5,52 +5,6 @@
#include "net/extras/preload_data/decoder.h"
#include "base/logging.h"
namespace {
// HuffmanDecoder is a very simple Huffman reader. The input Huffman tree is
// simply encoded as a series of two-byte structures. The first byte determines
// the "0" pointer for that node and the second the "1" pointer. Each byte
// either has the MSB set, in which case the bottom 7 bits are the value for
// that position, or else the bottom seven bits contain the index of a node.
//
// The tree is decoded by walking rather than a table-driven approach.
class HuffmanDecoder {
public:
HuffmanDecoder(const uint8_t* tree, size_t tree_bytes)
: tree_(tree), tree_bytes_(tree_bytes) {}
bool Decode(net::extras::PreloadDecoder::BitReader* reader, char* out) {
const uint8_t* current = &tree_[tree_bytes_ - 2];
for (;;) {
bool bit;
if (!reader->Next(&bit)) {
return false;
}
uint8_t b = current[bit];
if (b & 0x80) {
*out = static_cast<char>(b & 0x7f);
return true;
}
unsigned offset = static_cast<unsigned>(b) * 2;
DCHECK_LT(offset, tree_bytes_);
if (offset >= tree_bytes_) {
return false;
}
current = &tree_[offset];
}
}
private:
const uint8_t* const tree_;
const size_t tree_bytes_;
};
} // namespace
namespace net {
namespace extras {
......@@ -131,26 +85,49 @@ bool PreloadDecoder::BitReader::Seek(size_t offset) {
return true;
}
// Stores the pointer to the serialized tree and its size in bytes; the tree
// data itself is not copied.
PreloadDecoder::HuffmanDecoder::HuffmanDecoder(const uint8_t* tree,
size_t tree_bytes)
: tree_(tree), tree_bytes_(tree_bytes) {}
// Decodes one character by walking the tree, consuming one bit from |reader|
// per step. On success stores the character in |*out| and returns true.
// Returns false, leaving |*out| unmodified, if |reader| runs out of bits or a
// node index points outside the tree.
bool PreloadDecoder::HuffmanDecoder::Decode(PreloadDecoder::BitReader* reader,
char* out) const {
// Start at the last two-byte node of the array.
// NOTE(review): assumes tree_bytes_ >= 2 - confirm callers guarantee this.
const uint8_t* current = &tree_[tree_bytes_ - 2];
for (;;) {
bool bit;
if (!reader->Next(&bit)) {
return false;
}
// The bit selects the first (0) or second (1) byte of the current node.
uint8_t b = current[bit];
// MSB set: this is a leaf and the low seven bits are the decoded value.
if (b & 0x80) {
*out = static_cast<char>(b & 0x7f);
return true;
}
// Otherwise the byte is a node index; nodes are two bytes wide.
unsigned offset = static_cast<unsigned>(b) * 2;
DCHECK_LT(offset, tree_bytes_);
// Checked again outside the DCHECK so an out-of-range index fails cleanly.
if (offset >= tree_bytes_) {
return false;
}
current = &tree_[offset];
}
}
PreloadDecoder::PreloadDecoder(const uint8_t* huffman_tree,
size_t huffman_tree_size,
const uint8_t* trie,
size_t trie_bits,
size_t trie_root_position)
: huffman_tree_(huffman_tree),
huffman_tree_size_(huffman_tree_size),
trie_(trie),
trie_bits_(trie_bits),
: huffman_decoder_(huffman_tree, huffman_tree_size),
bit_reader_(trie, trie_bits),
trie_root_position_(trie_root_position) {}
PreloadDecoder::~PreloadDecoder() {}
bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
HuffmanDecoder huffman(huffman_tree_, huffman_tree_size_);
BitReader reader(trie_, trie_bits_);
size_t bit_offset = trie_root_position_;
static const char kEndOfString = 0;
static const char kEndOfTable = 127;
*out_found = false;
// current_search_offset contains one more than the index of the current
......@@ -161,13 +138,13 @@ bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
for (;;) {
// Seek to the desired location.
if (!reader.Seek(bit_offset)) {
if (!bit_reader_.Seek(bit_offset)) {
return false;
}
// Decode the unary length of the common prefix.
size_t prefix_length;
if (!reader.Unary(&prefix_length)) {
if (!bit_reader_.Unary(&prefix_length)) {
return false;
}
......@@ -179,7 +156,7 @@ bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
}
char c;
if (!huffman.Decode(&reader, &c)) {
if (!huffman_decoder_.Decode(&bit_reader_, &c)) {
return false;
}
if (search[current_search_offset - 1] != c) {
......@@ -194,7 +171,7 @@ bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
// Next is the dispatch table.
for (;;) {
char c;
if (!huffman.Decode(&reader, &c)) {
if (!huffman_decoder_.Decode(&bit_reader_, &c)) {
return false;
}
if (c == kEndOfTable) {
......@@ -203,7 +180,8 @@ bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
}
if (c == kEndOfString) {
if (!ReadEntry(&reader, search, current_search_offset, out_found)) {
if (!ReadEntry(&bit_reader_, search, current_search_offset,
out_found)) {
return false;
}
if (current_search_offset == 0) {
......@@ -223,8 +201,8 @@ bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
// The first offset is backwards from the current position.
uint32_t jump_delta_bits;
uint32_t jump_delta;
if (!reader.Read(5, &jump_delta_bits) ||
!reader.Read(jump_delta_bits, &jump_delta)) {
if (!bit_reader_.Read(5, &jump_delta_bits) ||
!bit_reader_.Read(jump_delta_bits, &jump_delta)) {
return false;
}
......@@ -237,19 +215,19 @@ bool PreloadDecoder::Decode(const std::string& search, bool* out_found) {
} else {
// Subsequent offsets are forward from the target of the first offset.
uint32_t is_long_jump;
if (!reader.Read(1, &is_long_jump)) {
if (!bit_reader_.Read(1, &is_long_jump)) {
return false;
}
uint32_t jump_delta;
if (!is_long_jump) {
if (!reader.Read(7, &jump_delta)) {
if (!bit_reader_.Read(7, &jump_delta)) {
return false;
}
} else {
uint32_t jump_delta_bits;
if (!reader.Read(4, &jump_delta_bits) ||
!reader.Read(jump_delta_bits + 8, &jump_delta)) {
if (!bit_reader_.Read(4, &jump_delta_bits) ||
!bit_reader_.Read(jump_delta_bits + 8, &jump_delta)) {
return false;
}
}
......
......@@ -5,8 +5,12 @@
#ifndef NET_EXTRAS_PRELOAD_DATA_DECODER_H_
#define NET_EXTRAS_PRELOAD_DATA_DECODER_H_
#include <stdint.h>
#include <string>
#include "base/macros.h"
namespace net {
namespace extras {
......@@ -16,6 +20,9 @@ namespace extras {
// they are interested in.
class PreloadDecoder {
public:
// These must match the values in net/tools/huffman_trie/trie/trie_writer.h.
enum : char { kEndOfString = 0, kEndOfTable = 127 };
// BitReader is a class that allows a bytestring to be read bit-by-bit.
class BitReader {
public:
......@@ -51,6 +58,29 @@ class PreloadDecoder {
// num_bits_used_ contains the number of bits of |current_byte_| that have
// been read.
unsigned num_bits_used_;
DISALLOW_COPY_AND_ASSIGN(BitReader);
};
// HuffmanDecoder is a very simple Huffman reader. The input Huffman tree is
// simply encoded as a series of two-byte structures. The first byte
// determines the "0" pointer for that node and the second the "1" pointer.
// Each byte either has the MSB set, in which case the bottom 7 bits are the
// value for that position, or else the bottom seven bits contain the index of
// a node.
//
// The tree is decoded by walking rather than a table-driven approach.
class HuffmanDecoder {
public:
// |tree| is the encoded Huffman tree and |tree_bytes| its length in bytes.
// NOTE(review): the constructor body is not visible here; presumably it only
// stores the pointer, in which case |tree| must outlive this object --
// confirm against the definition.
HuffmanDecoder(const uint8_t* tree, size_t tree_bytes);
// Reads bits from |reader|, walking the tree from its last two-byte node
// until a byte with the MSB set is reached, then writes that byte's low
// seven bits to |*out| and returns true. Returns false if |reader| has no
// more bits or a node index falls outside the tree.
bool Decode(PreloadDecoder::BitReader* reader, char* out) const;
private:
// Start of the encoded tree; neither the pointer nor the bytes are modified
// through this member.
const uint8_t* const tree_;
// Size of the encoded tree in bytes, used to bounds-check node indices.
const size_t tree_bytes_;
DISALLOW_COPY_AND_ASSIGN(HuffmanDecoder);
};
PreloadDecoder(const uint8_t* huffman_tree,
......@@ -84,17 +114,21 @@ class PreloadDecoder {
// value always comes before an entry for '.'.
bool Decode(const std::string& search, bool* out_found);
protected:
virtual bool ReadEntry(BitReader* reader,
const std::string& search,
size_t current_search_offset,
bool* out_found) = 0;
const HuffmanDecoder& huffman_decoder() const { return huffman_decoder_; }
private:
const uint8_t* huffman_tree_;
const size_t huffman_tree_size_;
const uint8_t* trie_;
const size_t trie_bits_;
HuffmanDecoder huffman_decoder_;
BitReader bit_reader_;
const size_t trie_root_position_;
DISALLOW_COPY_AND_ASSIGN(PreloadDecoder);
};
} // namespace extras
......
......@@ -2,8 +2,6 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
assert(current_toolchain == host_toolchain)
source_set("huffman_trie_generator_sources") {
sources = [
"bit_writer.cc",
......
......@@ -123,7 +123,7 @@ bool TrieWriter::WriteDispatchTables(ReversedEntries::iterator start,
uint32_t* position) {
DCHECK(start != end) << "No entries passed to WriteDispatchTables";
huffman_trie::TrieBitBuffer writer;
TrieBitBuffer writer;
std::vector<uint8_t> prefix = LongestCommonPrefix(start, end);
for (size_t i = 0; i < prefix.size(); ++i) {
......
......@@ -20,8 +20,8 @@ enum : uint8_t { kTerminalValue = 0, kEndOfTableValue = 127 };
class TrieWriter {
public:
TrieWriter(const huffman_trie::HuffmanRepresentationTable& huffman_table,
huffman_trie::HuffmanBuilder* huffman_builder);
TrieWriter(const HuffmanRepresentationTable& huffman_table,
HuffmanBuilder* huffman_builder);
~TrieWriter();
// Constructs a trie containing all |entries|. The output is written to
......@@ -40,18 +40,25 @@ class TrieWriter {
// complete.
const std::vector<uint8_t>& bytes() const { return buffer_.bytes(); }
protected:
const HuffmanRepresentationTable& huffman_table() const {
return huffman_table_;
}
HuffmanBuilder* huffman_builder() { return huffman_builder_; }
private:
bool WriteDispatchTables(ReversedEntries::iterator start,
ReversedEntries::iterator end,
uint32_t* position);
huffman_trie::BitWriter buffer_;
const huffman_trie::HuffmanRepresentationTable& huffman_table_;
huffman_trie::HuffmanBuilder* huffman_builder_;
BitWriter buffer_;
const HuffmanRepresentationTable& huffman_table_;
HuffmanBuilder* huffman_builder_;
};
} // namespace huffman_trie
} // namespace net
#endif // NET_TOOLS_TRANSPORT_SECURITY_STATE_GENERATOR_TRIE_TRIE_WRITER_H_
#endif // NET_TOOLS_HUFFMAN_TRIE_TRIE_TRIE_WRITER_H_
......@@ -20,6 +20,8 @@ class TrieEntry {
TrieEntry();
virtual ~TrieEntry();
// The name to be used when inserting the entry to the trie. E.g. for HSTS
// preload list, this is the hostname.
virtual std::string name() const = 0;
virtual bool WriteEntry(huffman_trie::TrieBitBuffer* writer) const = 0;
};
......@@ -28,6 +30,8 @@ class TrieEntry {
// of raw pointers instead.
using TrieEntries = std::vector<TrieEntry*>;
// ReversedEntry points to a TrieEntry and contains the reversed name for
// that entry. This is used to construct the trie.
struct ReversedEntry {
ReversedEntry(std::vector<uint8_t> reversed_name, const TrieEntry* entry);
~ReversedEntry();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment