Commit 84d0bef3 authored by meacer's avatar meacer Committed by Commit Bot

IDN spoof checks: Refactor skeleton generation to support IDN in top domain list

Skeleton strings are used to detect confusable hostnames. They are generated in
two places:
1. During build time in make_top_domain_skeletons: This binary takes a list of
hostnames and generates their skeletons to be embedded statically to the Chrome
binary.
2. During runtime in idn_spoof_checker.cc: This class generates skeletons of a
hostname to compare against a list of known skeleton strings. Before generating
a skeleton string, this class applies a few additional transformations such as
diacritic removal to the hostname to be able to detect more confusable
hostnames.

This CL extracts the skeleton generation in IDN spoof checker code to a separate
file so that make_top_domain_skeletons can apply the same transformations to
input hostnames when building the static top domain list.

This CL also modifies top_domain_generator binary which generates the actual
trie to be embedded to the binary. The trie currently doesn't allow non-ASCII
characters in its fields. This CL stores unicode hostnames in punycode to
overcome this restriction. Unicode hostnames may still have non-ASCII skeleton
strings, and top_domain_generator still doesn't support that. However, the
current top domain list doesn't have any IDN so this isn't a blocking issue.

Bug: 1040607
Change-Id: I40c654152025d910cbeb8ba32bff5b7835f00104
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1992011
Reviewed-by: Christopher Thompson <cthomp@chromium.org>
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Commit-Queue: Mustafa Emre Acer <meacer@chromium.org>
Cr-Commit-Position: refs/heads/master@{#729987}
parent b82bdd6d
......@@ -8,6 +8,17 @@ if (is_android) {
import("//build/config/android/rules.gni")
}
jumbo_static_library("skeleton_generator") {
sources = [
"spoof_checks/skeleton_generator.cc",
"spoof_checks/skeleton_generator.h",
]
deps = [
"//base",
"//base:i18n",
]
}
jumbo_static_library("url_formatter") {
sources = [
"elide_url.cc",
......@@ -31,6 +42,7 @@ jumbo_static_library("url_formatter") {
deps = [
"//base",
"//base:i18n",
"//components/url_formatter:skeleton_generator",
"//components/url_formatter/spoof_checks/top_domains:common",
"//components/url_formatter/spoof_checks/top_domains:generate_top_domains_trie",
"//net",
......
......@@ -285,86 +285,11 @@ IDNSpoofChecker::IDNSpoofChecker() {
icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00fe\\u00f0]"), status);
icelandic_characters_.freeze();
// Used for diacritics-removal before the skeleton calculation. Add
// "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
// removal; NFC".
// TODO(jshin): Revisit "ł > l; ø > o" mapping.
UParseError parse_error;
diacritic_remover_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("DropAcc"),
icu::UnicodeString::fromUTF8("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
" ł > l; ø > o; đ > d;"),
UTRANS_FORWARD, parse_error, status));
// Supplement the Unicode confusable list by the following mapping.
// NOTE: Adding a digit-lookalike? Add it to digit_lookalikes_ above, too.
// - {U+00E6 (æ), U+04D5 (ӕ)} => "ae"
// - {U+03FC (ϼ), U+048F (ҏ)} => p
// - {U+0127 (ħ), U+043D (н), U+045B (ћ), U+04A3 (ң), U+04A5 (ҥ),
// U+04C8 (ӈ), U+04CA (ӊ), U+050B (ԋ), U+0527 (ԧ), U+0529 (ԩ)} => h
// - {U+0138 (ĸ), U+03BA (κ), U+043A (к), U+049B (қ), U+049D (ҝ),
// U+049F (ҟ), U+04A1(ҡ), U+04C4 (ӄ), U+051F (ԟ)} => k
// - {U+014B (ŋ), U+043F (п), U+0525 (ԥ), U+0E01 (ก), U+05D7 (ח)} => n
// - U+0153 (œ) => "ce"
// TODO: see https://crbug.com/843352 for further work on
// U+0525 and U+0153.
// - {U+0167 (ŧ), U+0442 (т), U+04AD (ҭ), U+050F (ԏ), U+4E03 (七),
// U+4E05 (丅), U+4E06 (丆), U+4E01 (丁)} => t
// - {U+0185 (ƅ), U+044C (ь), U+048D (ҍ), U+0432 (в)} => b
// - {U+03C9 (ω), U+0448 (ш), U+0449 (щ), U+0E1E (พ),
// U+0E1F (ฟ), U+0E9E (ພ), U+0E9F (ຟ)} => w
// - {U+043C (м), U+04CE (ӎ)} => m
// - {U+0454 (є), U+04BD (ҽ), U+04BF (ҿ), U+1054 (ၔ)} => e
// - U+0491 (ґ) => r
// - {U+0493 (ғ), U+04FB (ӻ)} => f
// - {U+04AB (ҫ), U+1004 (င)} => c
// - {U+04B1 (ұ), U+4E2B (丫)} => y
// - {U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ), U+4E42 (乂)} => x
// - {U+0503 (ԃ), U+10EB (ძ)} => d
// - {U+050D (ԍ), U+100c (ဌ)} => g
// - {U+0D1F (ട), U+0E23 (ร), U+0EA3 (ຣ), U+0EAE (ຮ)} => s
// - U+1042 (၂) => j
// - {U+0966 (०), U+09E6 (০), U+0A66 (੦), U+0AE6 (૦), U+0B30 (ଠ),
// U+0B66 (୦), U+0CE6 (೦)} => o,
// - {U+09ED (৭), U+0A67 (੧), U+0AE7 (૧)} => q,
// - {U+0E1A (บ), U+0E9A (ບ)} => u,
// - {U+03B8 (θ)} => 0,
// - {U+0968 (२), U+09E8 (২), U+0A68 (੨), U+0A68 (੨), U+0AE8 (૨),
// U+0ce9 (೩), U+0ced (೭), U+0577 (շ)} => 2,
// - {U+0437 (з), U+0499 (ҙ), U+04E1 (ӡ), U+0909 (उ), U+0993 (ও),
// U+0A24 (ਤ), U+0A69 (੩), U+0AE9 (૩), U+0C69 (౩),
// U+1012 (ဒ), U+10D5 (ვ), U+10DE (პ)} => 3
// - {U+0A6B (੫), U+4E29 (丩), U+3110 (ㄐ)} => 4,
// - U+0573 (ճ) => 6
// - {U+09EA (৪), U+0A6A (੪), U+0b6b (୫)} => 8,
// - {U+0AED (૭), U+0b68 (୨), U+0C68 (౨)} => 9,
// Map a few dashes that ICU doesn't map. These are already blocked by ICU,
// but mapping them allows us to detect same skeletons.
// - {U+2014 (—), U+4E00 (一), U+2015 (―), U+23EA (⸺), U+2E3B (⸻)} => -,
extra_confusable_mapper_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("ExtraConf"),
icu::UnicodeString::fromUTF8(
"[æӕ] > ae; [ϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;"
"[ĸκкқҝҟҡӄԟ] > k; [ŋпԥกח] > n; œ > ce;"
"[ŧтҭԏ七丅丆丁] > t; [ƅьҍв] > b; [ωшщพฟພຟ] > w;"
"[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;"
"[ҫင] > c; [ұ丫] > y; [χҳӽӿ乂] > x;"
"[ԃძ] > d; [ԍဌ] > g; [ടรຣຮ] > s; ၂ > j;"
"[०০੦૦ଠ୦೦] > o;"
"[৭੧૧] > q;"
"[บບ] > u;"
"[θ] > 0;"
"[२২੨੨૨೩೭շ] > 2;"
"[зҙӡउওਤ੩૩౩ဒვპ] > 3;"
"[੫丩ㄐ] > 4;"
"[ճ] > 6;"
"[৪੪୫] > 8;"
"[૭୨౨] > 9;"
"[—一―⸺⸻] > \\-;"),
UTRANS_FORWARD, parse_error, status));
DCHECK(U_SUCCESS(status))
<< "Spoofchecker initalization failed due to an error: "
<< u_errorName(status);
skeleton_generator_ = std::make_unique<SkeletonGenerator>(checker_);
}
IDNSpoofChecker::~IDNSpoofChecker() {
......@@ -561,47 +486,7 @@ TopDomainEntry IDNSpoofChecker::GetSimilarTopDomain(
}
Skeletons IDNSpoofChecker::GetSkeletons(base::StringPiece16 hostname) {
Skeletons skeletons;
size_t hostname_length = hostname.length() - (hostname.back() == '.' ? 1 : 0);
icu::UnicodeString host(FALSE, hostname.data(), hostname_length);
// If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
// there is no point in getting rid of diacritics because combining marks
// attached to non-LGC characters are already blocked.
if (lgc_letters_n_ascii_.span(host, 0, USET_SPAN_CONTAINED) == host.length())
diacritic_remover_->transliterate(host);
extra_confusable_mapper_->transliterate(host);
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString ustr_skeleton;
// Map U+04CF (ӏ) to lowercase L in addition to what uspoof_getSkeleton does
// (mapping it to lowercase I).
int32_t u04cf_pos;
if ((u04cf_pos = host.indexOf(0x4CF)) != -1) {
icu::UnicodeString host_alt(host);
size_t length = host_alt.length();
char16_t* buffer = host_alt.getBuffer(-1);
for (char16_t* uc = buffer + u04cf_pos; uc < buffer + length; ++uc) {
if (*uc == 0x4CF)
*uc = 0x6C; // Lowercase L
}
host_alt.releaseBuffer(length);
uspoof_getSkeletonUnicodeString(checker_, 0, host_alt, ustr_skeleton,
&status);
if (U_SUCCESS(status)) {
std::string skeleton;
ustr_skeleton.toUTF8String(skeleton);
skeletons.insert(skeleton);
}
}
uspoof_getSkeletonUnicodeString(checker_, 0, host, ustr_skeleton, &status);
if (U_SUCCESS(status)) {
std::string skeleton;
ustr_skeleton.toUTF8String(skeleton);
skeletons.insert(skeleton);
}
return skeletons;
return skeleton_generator_->GetSkeletons(hostname);
}
TopDomainEntry IDNSpoofChecker::LookupSkeletonInTopDomains(
......
......@@ -12,6 +12,7 @@
#include "base/gtest_prod_util.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece_forward.h"
#include "components/url_formatter/spoof_checks/skeleton_generator.h"
#include "net/extras/preload_data/decoder.h"
#include "third_party/icu/source/common/unicode/uniset.h"
......@@ -21,7 +22,6 @@
// 'icu' does not work. Use U_ICU_NAMESPACE.
namespace U_ICU_NAMESPACE {
class Transliterator;
class UnicodeString;
} // namespace U_ICU_NAMESPACE
......@@ -143,8 +143,8 @@ class IDNSpoofChecker {
icu::UnicodeSet digit_lookalikes_;
icu::UnicodeSet lgc_letters_n_ascii_;
icu::UnicodeSet icelandic_characters_;
std::unique_ptr<icu::Transliterator> diacritic_remover_;
std::unique_ptr<icu::Transliterator> extra_confusable_mapper_;
std::unique_ptr<SkeletonGenerator> skeleton_generator_;
// List of scripts containing whole-script-confusable information.
std::vector<std::unique_ptr<WholeScriptConfusable>> wholescriptconfusables_;
......
......@@ -1236,6 +1236,9 @@ const IDNTestCase kIdnCases[] = {
{"xn--mlca6ab.com", L"டபடப.com", kUnsafe},
// Whole-script-confusable in Telugu:
{"xn--brcaabbb.com", L"౧౦౧౦౧౦.com", kUnsafe},
// IDN domain matching an IDN top-domain (fóó.com)
{"xn--fo-5ja.com", L"fóo.com", kUnsafe},
};
namespace test {
......
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_formatter/spoof_checks/skeleton_generator.h"
#include "base/memory/ptr_util.h"
#include "base/strings/string_piece.h"
#include "third_party/icu/source/i18n/unicode/regex.h"
#include "third_party/icu/source/i18n/unicode/translit.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
// Builds the ICU helpers used by GetSkeletons(): a diacritic-removing
// transliterator, the Latin/Greek/Cyrillic character set that gates it, and a
// transliterator for extra confusable mappings. |checker| is stored as-is and
// later handed to uspoof_getSkeletonUnicodeString().
SkeletonGenerator::SkeletonGenerator(const USpoofChecker* checker)
: checker_(checker) {
UErrorCode status = U_ZERO_ERROR;
// Used for diacritics-removal before the skeleton calculation. Add
// "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
// removal; NFC".
// TODO(jshin): Revisit "ł > l; ø > o" mapping.
UParseError parse_error;
diacritic_remover_ = base::WrapUnique(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("DropAcc"),
icu::UnicodeString::fromUTF8("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
" ł > l; ø > o; đ > d;"),
UTRANS_FORWARD, parse_error, status));
// This set is used to determine whether or not to apply a slow
// transliteration to remove diacritics to a given hostname before the
// confusable skeleton calculation for comparison with top domain names. If
// it has any character outside the set, the expensive step will be skipped
// because it cannot match any of top domain names.
// The last range ([\u0300-\u0339]) is a shorthand for
// "[:Identifier_Status=Allowed:] & [:Script_Extensions=Inherited:] -
// [\\u200C\\u200D]". The latter is a subset of the former but it does not
// matter because hostnames with characters outside the latter set would be
// rejected in an earlier step.
lgc_letters_n_ascii_ = icu::UnicodeSet(
UNICODE_STRING_SIMPLE("[[:Latin:][:Greek:][:Cyrillic:][0-9\\u002e_"
"\\u002d][\\u0300-\\u0339]]"),
status);
// Frozen right after construction; GetSkeletons() only queries it via span().
lgc_letters_n_ascii_.freeze();
// Supplement the Unicode confusable list by the following mapping.
// NOTE: Adding a digit-lookalike? Add it to digit_lookalikes_ in
// idn_spoof_checker.cc, too.
// - {U+00E6 (æ), U+04D5 (ӕ)} => "ae"
// - {U+03FC (ϼ), U+048F (ҏ)} => p
// - {U+0127 (ħ), U+043D (н), U+045B (ћ), U+04A3 (ң), U+04A5 (ҥ),
// U+04C8 (ӈ), U+04CA (ӊ), U+050B (ԋ), U+0527 (ԧ), U+0529 (ԩ)} => h
// - {U+0138 (ĸ), U+03BA (κ), U+043A (к), U+049B (қ), U+049D (ҝ),
// U+049F (ҟ), U+04A1(ҡ), U+04C4 (ӄ), U+051F (ԟ)} => k
// - {U+014B (ŋ), U+043F (п), U+0525 (ԥ), U+0E01 (ก), U+05D7 (ח)} => n
// - U+0153 (œ) => "ce"
// TODO: see https://crbug.com/843352 for further work on
// U+0525 and U+0153.
// - {U+0167 (ŧ), U+0442 (т), U+04AD (ҭ), U+050F (ԏ), U+4E03 (七),
// U+4E05 (丅), U+4E06 (丆), U+4E01 (丁)} => t
// - {U+0185 (ƅ), U+044C (ь), U+048D (ҍ), U+0432 (в)} => b
// - {U+03C9 (ω), U+0448 (ш), U+0449 (щ), U+0E1E (พ),
// U+0E1F (ฟ), U+0E9E (ພ), U+0E9F (ຟ)} => w
// - {U+043C (м), U+04CE (ӎ)} => m
// - {U+0454 (є), U+04BD (ҽ), U+04BF (ҿ), U+1054 (ၔ)} => e
// - U+0491 (ґ) => r
// - {U+0493 (ғ), U+04FB (ӻ)} => f
// - {U+04AB (ҫ), U+1004 (င)} => c
// - {U+04B1 (ұ), U+4E2B (丫)} => y
// - {U+03C7 (χ), U+04B3 (ҳ), U+04FD (ӽ), U+04FF (ӿ), U+4E42 (乂)} => x
// - {U+0503 (ԃ), U+10EB (ძ)} => d
// - {U+050D (ԍ), U+100c (ဌ)} => g
// - {U+0D1F (ട), U+0E23 (ร), U+0EA3 (ຣ), U+0EAE (ຮ)} => s
// - U+1042 (၂) => j
// - {U+0966 (०), U+09E6 (০), U+0A66 (੦), U+0AE6 (૦), U+0B30 (ଠ),
// U+0B66 (୦), U+0CE6 (೦)} => o,
// - {U+09ED (৭), U+0A67 (੧), U+0AE7 (૧)} => q,
// - {U+0E1A (บ), U+0E9A (ບ)} => u,
// - {U+03B8 (θ)} => 0,
// - {U+0968 (२), U+09E8 (২), U+0A68 (੨), U+0A68 (੨), U+0AE8 (૨),
// U+0ce9 (೩), U+0ced (೭), U+0577 (շ)} => 2,
// - {U+0437 (з), U+0499 (ҙ), U+04E1 (ӡ), U+0909 (उ), U+0993 (ও),
// U+0A24 (ਤ), U+0A69 (੩), U+0AE9 (૩), U+0C69 (౩),
// U+1012 (ဒ), U+10D5 (ვ), U+10DE (პ)} => 3
// - {U+0A6B (੫), U+4E29 (丩), U+3110 (ㄐ)} => 4,
// - U+0573 (ճ) => 6
// - {U+09EA (৪), U+0A6A (੪), U+0b6b (୫)} => 8,
// - {U+0AED (૭), U+0b68 (୨), U+0C68 (౨)} => 9,
// Map a few dashes that ICU doesn't map. These are already blocked by ICU,
// but mapping them allows us to detect same skeletons.
// - {U+2014 (—), U+4E00 (一), U+2015 (―), U+2E3A (⸺), U+2E3B (⸻)} => -,
extra_confusable_mapper_ =
base::WrapUnique(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("ExtraConf"),
icu::UnicodeString::fromUTF8(
"[æӕ] > ae; [ϼҏ] > p; [ħнћңҥӈӊԋԧԩ] > h;"
"[ĸκкқҝҟҡӄԟ] > k; [ŋпԥกח] > n; œ > ce;"
"[ŧтҭԏ七丅丆丁] > t; [ƅьҍв] > b; [ωшщพฟພຟ] > w;"
"[мӎ] > m; [єҽҿၔ] > e; ґ > r; [ғӻ] > f;"
"[ҫင] > c; [ұ丫] > y; [χҳӽӿ乂] > x;"
"[ԃძ] > d; [ԍဌ] > g; [ടรຣຮ] > s; ၂ > j;"
"[०০੦૦ଠ୦೦] > o;"
"[৭੧૧] > q;"
"[บບ] > u;"
"[θ] > 0;"
"[२২੨੨૨೩೭շ] > 2;"
"[зҙӡउওਤ੩૩౩ဒვპ] > 3;"
"[੫丩ㄐ] > 4;"
"[ճ] > 6;"
"[৪੪୫] > 8;"
"[૭୨౨] > 9;"
"[—一―⸺⸻] > \\-;"),
UTRANS_FORWARD, parse_error, status));
// The same |status| is threaded through every ICU call above and is only
// inspected once, here, after all of them have run.
DCHECK(U_SUCCESS(status))
<< "Skeleton generator initalization failed due to an error: "
<< u_errorName(status);
}
// The header only forward-declares icu::Transliterator; defining the
// destructor here, where translit.h is included, lets the unique_ptr members
// be destroyed with the complete type in view.
SkeletonGenerator::~SkeletonGenerator() = default;
// Returns the set of confusable skeletons for |hostname|.
//
// A single trailing dot is ignored. Before asking ICU for the skeleton, the
// hostname is run through diacritic removal (only when it consists solely of
// Latin/Greek/Cyrillic letters and [0-9._-]) and through the extra confusable
// mapping. If the result contains U+04CF (ӏ), a second skeleton is generated
// with that character replaced by a lowercase L.
//
// Returns an empty set when |hostname| is empty or consists only of a trailing
// dot, and omits any skeleton whose ICU computation fails.
Skeletons SkeletonGenerator::GetSkeletons(base::StringPiece16 hostname) {
  Skeletons skeletons;

  // Strip one trailing dot. The emptiness check must come first: calling
  // back() on an empty StringPiece is undefined behaviour.
  if (!hostname.empty() && hostname.back() == '.')
    hostname.remove_suffix(1);
  if (hostname.empty())
    return skeletons;

  icu::UnicodeString host(FALSE, hostname.data(),
                          static_cast<int32_t>(hostname.length()));

  // If input has any characters outside Latin-Greek-Cyrillic and [0-9._-],
  // there is no point in getting rid of diacritics because combining marks
  // attached to non-LGC characters are already blocked.
  if (lgc_letters_n_ascii_.span(host, 0, USET_SPAN_CONTAINED) == host.length())
    diacritic_remover_->transliterate(host);
  extra_confusable_mapper_->transliterate(host);

  // Computes the ICU skeleton of |input| and records it on success. Each call
  // uses its own error code so that a failure for one variant does not
  // suppress the skeleton of the other.
  const auto add_skeleton = [this, &skeletons](const icu::UnicodeString& input) {
    UErrorCode status = U_ZERO_ERROR;
    icu::UnicodeString ustr_skeleton;
    uspoof_getSkeletonUnicodeString(checker_, 0, input, ustr_skeleton,
                                    &status);
    if (U_SUCCESS(status)) {
      std::string skeleton;
      ustr_skeleton.toUTF8String(skeleton);
      skeletons.insert(skeleton);
    }
  };

  // Map U+04CF (ӏ) to lowercase L in addition to what uspoof_getSkeleton does
  // (mapping it to lowercase I).
  const int32_t u04cf_pos = host.indexOf(0x4CF);
  if (u04cf_pos != -1) {
    icu::UnicodeString host_alt(host);
    const int32_t length = host_alt.length();
    for (int32_t i = u04cf_pos; i < length; ++i) {
      if (host_alt.charAt(i) == 0x4CF)
        host_alt.setCharAt(i, 0x6C);  // Lowercase L
    }
    add_skeleton(host_alt);
  }

  add_skeleton(host);
  return skeletons;
}
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_
#define COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_
#include <memory>
#include <string>
#include "base/containers/flat_set.h"
#include "base/strings/string_piece_forward.h"
#include "third_party/icu/source/common/unicode/uniset.h"
// 'icu' does not work. Use U_ICU_NAMESPACE.
namespace U_ICU_NAMESPACE {
class Transliterator;
} // namespace U_ICU_NAMESPACE
struct USpoofChecker;
using Skeletons = base::flat_set<std::string>;
// This class generates skeleton strings from hostnames. Skeletons are a
// transformation of the input string. Two hostnames are confusable if their
// skeletons are identical. See http://unicode.org/reports/tr39/ for more
// information.
// This class uses ICU to generate skeletons. Before passing the input to ICU,
// it performs additional transformations (diacritic removal and extra
// confusable mapping of certain characters) so that more confusable hostnames
// can be detected than would be by using plain ICU API.
class SkeletonGenerator {
 public:
  // |checker| is stored but not owned: this class never closes it, and it is
  // used on every GetSkeletons() call, so it must stay valid for as long as
  // this object is in use.
  explicit SkeletonGenerator(const USpoofChecker* checker);
  ~SkeletonGenerator();

  // Returns the set of skeletons for the |hostname|. For IDN, |hostname| must
  // already be decoded to unicode.
  Skeletons GetSkeletons(base::StringPiece16 hostname);

 private:
  // Characters for which diacritic removal is attempted (Latin, Greek,
  // Cyrillic letters plus a few ASCII characters and combining marks).
  icu::UnicodeSet lgc_letters_n_ascii_;
  // Strips diacritics before the skeleton is computed.
  std::unique_ptr<icu::Transliterator> diacritic_remover_;
  // Maps extra confusable characters that ICU's own tables do not cover.
  std::unique_ptr<icu::Transliterator> extra_confusable_mapper_;
  // Not owned; supplied by the creator of this object.
  const USpoofChecker* checker_;
};
#endif // COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_
......@@ -13,6 +13,7 @@ if (!is_ios && !is_android) {
deps = [
"//base",
"//base:i18n",
"//components/url_formatter:skeleton_generator",
"//third_party/icu",
]
}
......@@ -28,7 +29,9 @@ executable("top_domain_generator") {
]
deps = [
"//base",
"//base:i18n",
"//net/tools/huffman_trie:huffman_trie_generator_sources",
"//url:url",
]
if (is_ios) {
libs = [ "UIKit.framework" ]
......
......@@ -6,6 +6,10 @@
make_top_domain_skeletons. See http://go/chrome-top-domains-update for update
instructions.
This list can contain ASCII and unicode domains. Unicode domains should not be
encoded in punycode.
* `domains.skeletons`
The checked-in output of make_top_domain_skeletons. Processed during the
......@@ -26,9 +30,16 @@
Generated output of test_domains.list along with domains.skeletons
by make_top_domain_skeletons.
* `top_domain_generator.cc`
Generates the Huffman encoded Trie containing a map of skeletons to top
domains. For now, the skeletons must be ASCII. Unicode domains are supported
but they are written as punycode to the trie.
* `top_domain_list_variable_builder.cc` / `top500_domains.h`
`top_domain_list_variable_builder.cc` is run at compile time to generate information about the top 500 domains
(currently, skeletons and keywords are created from these domains). This
information is then embedded directly into the chrome binary, and can be
accessed via the variables in the top500_domains namespace.
`top_domain_list_variable_builder.cc` is run at compile time to generate
information about the top 500 domains (currently, skeletons and keywords are
created from these domains). This information is then embedded directly into
the chrome binary, and can be accessed via the variables in the top500_domains
namespace.
......@@ -17,22 +17,12 @@
#include "base/path_service.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/url_formatter/spoof_checks/skeleton_generator.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/utypes.h"
#include "third_party/icu/source/i18n/unicode/uspoof.h"
std::string GetSkeleton(const std::string& domain,
const USpoofChecker* spoof_checker) {
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString ustr_skeleton;
uspoof_getSkeletonUnicodeString(spoof_checker, 0 /* not used */,
icu::UnicodeString::fromUTF8(domain),
ustr_skeleton, &status);
std::string skeleton;
return U_SUCCESS(status) ? ustr_skeleton.toUTF8String(skeleton) : skeleton;
}
base::FilePath GetPath(base::StringPiece basename) {
base::FilePath path;
base::PathService::Get(base::DIR_SOURCE_ROOT, &path);
......@@ -78,19 +68,24 @@ int GenerateSkeletons(const char* input_file_name,
)";
SkeletonGenerator skeleton_generator(spoof_checker);
std::string domain;
size_t max_labels = 0;
std::string domain_with_max_labels;
while (std::getline(input, domain)) {
if (domain[0] == '#')
continue;
std::string skeleton = GetSkeleton(domain, spoof_checker);
if (skeleton.empty()) {
std::cerr << "Failed to generate the skeleton of " << domain << '\n';
output += "# " + domain + '\n';
} else {
const base::string16 domain16 = base::UTF8ToUTF16(domain);
const Skeletons skeletons = skeleton_generator.GetSkeletons(domain16);
DCHECK(!skeletons.empty()) << "Failed to generate skeletons of " << domain;
for (const std::string& skeleton : skeletons) {
DCHECK(!skeleton.empty()) << "Empty skeleton for " << domain;
output += skeleton + ", " + domain + "\n";
}
std::vector<base::StringPiece> labels = base::SplitStringPiece(
domain, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
if (labels.size() > max_labels) {
......
......@@ -34,3 +34,5 @@ qq.com
nn.com
# A domain with the same skeleton as itself:
test.net
# Unicode domain:
fóó.com
......@@ -44,3 +44,4 @@ oo.corn, oo.com
qq.corn, qq.com
nn.corn, nn.com
test.net, test.net
foo.corn, fóó.com
......@@ -17,6 +17,7 @@
#include "base/command_line.h"
#include "base/files/file_util.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/path_service.h"
#include "base/strings/string_number_conversions.h"
......@@ -26,6 +27,7 @@
#include "build/build_config.h"
#include "components/url_formatter/spoof_checks/top_domains/top_domain_state_generator.h"
#include "components/url_formatter/spoof_checks/top_domains/trie_entry.h"
#include "url/gurl.h"
using url_formatter::top_domains::TopDomainEntries;
using url_formatter::top_domains::TopDomainEntry;
......@@ -60,6 +62,8 @@ int main(int argc, char* argv[]) {
logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
logging::InitLogging(settings);
base::i18n::InitializeICU();
#if defined(OS_WIN)
std::vector<std::string> args;
base::CommandLine::StringVector wide_args = command_line.GetArgs();
......@@ -107,18 +111,22 @@ int main(int argc, char* argv[]) {
CHECK_EQ(2u, tokens.size()) << "Invalid line: " << tokens[0];
const std::string skeleton = tokens[0];
if (skeletons.find(skeleton) != skeletons.end()) {
// Another site has the same skeleton. Simply ignore, as we already have a
// top domain corresponding to this skeleton.
continue;
}
// Another site has the same skeleton. This is low probability so stop now.
CHECK(skeletons.find(skeleton) == skeletons.end())
<< "A domain with the same skeleton is already in the list ("
<< skeleton << ").";
skeletons.insert(skeleton);
// TODO: Should we lowercase these?
entry->skeleton = skeleton;
entry->top_domain = tokens[1];
// If testing, only mark the first 5 sites as "top 500".
// There might be unicode domains in the list. Store them in punycode in the
// trie.
const GURL domain(std::string("http://") + tokens[1]);
entry->top_domain = domain.host();
// If testing, only mark the first site as "top 500".
if (for_testing) {
entry->is_top_500 = entries.size() < 1;
} else {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment