Commit bfe0b8fe authored by Behnood Momenzadeh, committed by Commit Bot

Add skeletons without separators to top domains' skeletons.

For target embedding matching, top domains could be matched when they
are embedded without token separators (e.g. googlecom). In order to be
able to match them, we need to generate skeletons of top domains without
token separators. This change adds these skeletons to the skeleton list and
adds a flag to |TopDomainEntry| that shows whether the match was found via a
skeleton without separators or via a regular skeleton.

Change-Id: Id1387ef438f1ae0c73190c0b1cad913f6ad51b4a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2205517
Reviewed-by: Joe DeBlasio <jdeblasio@chromium.org>
Reviewed-by: Mustafa Emre Acer <meacer@chromium.org>
Commit-Queue: Behnood Momenzadeh <behnoodm@google.com>
Cr-Commit-Position: refs/heads/master@{#771185}
parent bf16ce62
......@@ -24,6 +24,15 @@ namespace url_formatter {
namespace {
// Returns the number of bits needed to represent |input|, i.e. the 1-based
// position of its highest set bit. Returns 0 when |input| is 0.
uint8_t BitLength(uint32_t input) {
  uint8_t bit_count = 0;
  for (uint32_t remaining = input; remaining != 0; remaining >>= 1) {
    ++bit_count;
  }
  return bit_count;
}
class TopDomainPreloadDecoder : public net::extras::PreloadDecoder {
public:
using net::extras::PreloadDecoder::PreloadDecoder;
......@@ -33,14 +42,24 @@ class TopDomainPreloadDecoder : public net::extras::PreloadDecoder {
const std::string& search,
size_t current_search_offset,
bool* out_found) override {
// Make sure the assigned bit length is enough to encode all SkeletonType
// values.
DCHECK_EQ(kSkeletonTypeBitLength,
BitLength(url_formatter::SkeletonType::kMaxValue));
bool is_same_skeleton;
if (!reader->Next(&is_same_skeleton))
return false;
TopDomainEntry top_domain;
if (!reader->Next(&top_domain.is_top_500))
return false;
uint32_t skeletontype_value;
if (!reader->Read(kSkeletonTypeBitLength, &skeletontype_value))
return false;
top_domain.skeleton_type =
static_cast<url_formatter::SkeletonType>(skeletontype_value);
if (is_same_skeleton) {
top_domain.domain = search;
} else {
......@@ -56,7 +75,6 @@ class TopDomainPreloadDecoder : public net::extras::PreloadDecoder {
if (has_com_suffix)
top_domain.domain += ".com";
}
if (current_search_offset == 0) {
*out_found = true;
DCHECK(!top_domain.domain.empty());
......@@ -538,7 +556,8 @@ Skeletons IDNSpoofChecker::GetSkeletons(base::StringPiece16 hostname) {
}
TopDomainEntry IDNSpoofChecker::LookupSkeletonInTopDomains(
const std::string& skeleton) {
const std::string& skeleton,
SkeletonType skeleton_type) {
DCHECK(!skeleton.empty());
// There are no other guarantees about a skeleton string such as not including
// a dot. Skeleton of certain characters are dots (e.g. "۰" (U+06F0)).
......@@ -554,7 +573,11 @@ TopDomainEntry IDNSpoofChecker::LookupSkeletonInTopDomains(
labels.begin() + labels.size() - kNumberOfLabelsToCheck);
}
while (labels.size() > 1) {
while (labels.size() > 0) {
// A full skeleton needs at least two labels to match.
if (labels.size() == 1 && skeleton_type == SkeletonType::kFull) {
break;
}
std::string partial_skeleton = base::JoinString(labels, ".");
bool match = false;
bool decoded = preload_decoder.Decode(partial_skeleton, &match);
......
......@@ -33,12 +33,31 @@ FORWARD_DECLARE_TEST(UrlFormatterTest, IDNToUnicode);
using Skeletons = base::flat_set<std::string>;
// The |SkeletonType| and |TopDomainEntry| are mirrored in trie_entry.h. These
// are used to insert and read nodes from the Trie.
// The type of skeleton in the trie node.
enum SkeletonType {
  // The skeleton represents the full domain (e.g. google.corn).
  kFull = 0,
  // The skeleton represents the domain with '.'s and '-'s removed (e.g.
  // googlecorn).
  kSeparatorsRemoved = 1,
  // Max value used to determine the number of different types. Update this and
  // |kSkeletonTypeBitLength| when new SkeletonTypes are added.
  kMaxValue = kSeparatorsRemoved
};

// Number of bits used to encode a |SkeletonType| value in the trie.
const uint8_t kSkeletonTypeBitLength = 1;

// Catch at compile time a new SkeletonType that no longer fits in the encoded
// width, instead of silently truncating it when it is read back.
static_assert(static_cast<uint32_t>(SkeletonType::kMaxValue) <
                  (1u << kSkeletonTypeBitLength),
              "kSkeletonTypeBitLength is too small to encode every "
              "SkeletonType value");

// Represents a top domain entry in the trie.
struct TopDomainEntry {
  // The domain name.
  std::string domain;
  // True if the domain is in the top 500.
  bool is_top_500 = false;
  // Type of the skeleton stored in the trie node. Defaults to kFull so that a
  // default-constructed entry never carries an indeterminate value.
  SkeletonType skeleton_type = SkeletonType::kFull;
};
// A helper class for IDN Spoof checking, used to ensure that no IDN input is
......@@ -84,7 +103,11 @@ class IDNSpoofChecker {
Skeletons GetSkeletons(base::StringPiece16 hostname);
// Returns a top domain from the top 10K list matching the given |skeleton|.
TopDomainEntry LookupSkeletonInTopDomains(const std::string& skeleton);
// If |skeleton_type| is SkeletonType::kSeparatorsRemoved, the skeleton will be
// compared against skeletons without '.' and '-'s as well.
TopDomainEntry LookupSkeletonInTopDomains(
const std::string& skeleton,
SkeletonType skeleton_type = SkeletonType::kFull);
// Used for unit tests.
static void SetTrieParamsForTesting(const HuffmanTrieParams& trie_params);
......
......@@ -1369,12 +1369,21 @@ TEST_F(IDNSpoofCheckerTest, LookupSkeletonInTopDomains) {
IDNSpoofChecker().LookupSkeletonInTopDomains("d4OOO.corn");
EXPECT_EQ("d4000.com", entry.domain);
EXPECT_TRUE(entry.is_top_500);
EXPECT_EQ(entry.skeleton_type, SkeletonType::kFull);
}
{
TopDomainEntry entry = IDNSpoofChecker().LookupSkeletonInTopDomains(
"d4OOOcorn", SkeletonType::kSeparatorsRemoved);
EXPECT_EQ("d4000.com", entry.domain);
EXPECT_TRUE(entry.is_top_500);
EXPECT_EQ(entry.skeleton_type, SkeletonType::kSeparatorsRemoved);
}
{
TopDomainEntry entry =
IDNSpoofChecker().LookupSkeletonInTopDomains("digklrno68.corn");
EXPECT_EQ("digklmo68.com", entry.domain);
EXPECT_FALSE(entry.is_top_500);
EXPECT_EQ(entry.skeleton_type, SkeletonType::kFull);
}
}
......@@ -1385,6 +1394,14 @@ TEST(IDNSpoofCheckerNoFixtureTest, LookupSkeletonInTopDomains) {
IDNSpoofChecker().LookupSkeletonInTopDomains("google.corn");
EXPECT_EQ("google.com", entry.domain);
EXPECT_TRUE(entry.is_top_500);
EXPECT_EQ(entry.skeleton_type, SkeletonType::kFull);
}
{
TopDomainEntry entry = IDNSpoofChecker().LookupSkeletonInTopDomains(
"googlecorn", SkeletonType::kSeparatorsRemoved);
EXPECT_EQ("google.com", entry.domain);
EXPECT_TRUE(entry.is_top_500);
EXPECT_EQ(entry.skeleton_type, SkeletonType::kSeparatorsRemoved);
}
{
// This is data dependent, must be updated when the top domain list
......@@ -1393,6 +1410,7 @@ TEST(IDNSpoofCheckerNoFixtureTest, LookupSkeletonInTopDomains) {
IDNSpoofChecker().LookupSkeletonInTopDomains("google.sk");
EXPECT_EQ("google.sk", entry.domain);
EXPECT_FALSE(entry.is_top_500);
EXPECT_EQ(entry.skeleton_type, SkeletonType::kFull);
}
}
......
......@@ -15,6 +15,7 @@
#include "base/i18n/icu_util.h"
#include "base/numerics/safe_conversions.h"
#include "base/path_service.h"
#include "base/strings/string16.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
......@@ -44,6 +45,31 @@ bool WriteToFile(const std::string& content, base::StringPiece basename) {
return succeeded;
}
// Builds the output rows for a top 500 |domain|. One row of the form
// "<full skeleton>, <skeleton without separators>, <domain>\n" is emitted for
// every pairing of an entry of |skeletons| with an entry of
// |no_separators_skeletons|.
// NOTE(review): when either set holds more than one entry, the same skeleton
// shows up on several rows -- confirm that the consumer of this file accepts
// repeated skeletons.
std::string GenerateTop500OutputLine(const Skeletons& skeletons,
                                     const Skeletons& no_separators_skeletons,
                                     const std::string& domain) {
  std::string rows;
  for (const std::string& full_skeleton : skeletons) {
    for (const std::string& stripped_skeleton : no_separators_skeletons) {
      DCHECK(!full_skeleton.empty()) << "Empty skeleton for " << domain;
      DCHECK(!stripped_skeleton.empty())
          << "Empty without separator skeleton for " << domain;
      rows.append(full_skeleton);
      rows.append(", ");
      rows.append(stripped_skeleton);
      rows.append(", ");
      rows.append(domain);
      rows.append("\n");
    }
  }
  return rows;
}
// Builds the output rows for a |domain| outside the top 500: one
// "<full skeleton>, <domain>\n" row per entry of |skeletons|.
std::string GenerateTop5kOutputLine(const Skeletons& skeletons,
                                    const std::string& domain) {
  std::string rows;
  for (const std::string& full_skeleton : skeletons) {
    DCHECK(!full_skeleton.empty()) << "Empty skeleton for " << domain;
    rows.append(full_skeleton);
    rows.append(", ");
    rows.append(domain);
    rows.append("\n");
  }
  return rows;
}
int GenerateSkeletons(const char* input_file_name,
const char* output_file_name,
const USpoofChecker* spoof_checker) {
......@@ -55,6 +81,10 @@ int GenerateSkeletons(const char* input_file_name,
return 1;
}
// These characters are used to separate labels in a hostname. We generate
// skeletons of top 500 domains without these separators as well. These
// skeletons could be used in lookalike heuristics such as Target Embedding.
base::string16 kLabelSeparators = base::UTF8ToUTF16(".-");
std::stringstream input(input_content);
std::string output =
R"(# Copyright 2018 The Chromium Authors. All rights reserved.
......@@ -65,6 +95,15 @@ int GenerateSkeletons(const char* input_file_name,
# components/url_formatter/spoof_checks/make_top_domain_skeletons.cc
# DO NOT MANUALLY EDIT!
# This list contains top 500 domains followed by the top 5000 domains. These are
# separated by ###END_TOP_500### line.
# For top 500 domains, each row has three columns: full skeleton, skeleton
# without label separators (e.g. '.' and '-'), and the domain itself.
# For top 5000 domains, each row has two columns: full skeleton and the domain
# itself.
# Each entry is the skeleton of a top domain for the confusability check
# in components/url_formatter/url_formatter.cc.
......@@ -75,11 +114,13 @@ int GenerateSkeletons(const char* input_file_name,
std::string domain;
size_t max_labels = 0;
std::string domain_with_max_labels;
bool is_top_500 = true;
while (std::getline(input, domain)) {
base::TrimWhitespaceASCII(domain, base::TRIM_ALL, &domain);
if (domain == kTop500Separator) {
output += std::string(kTop500Separator) + "\n";
is_top_500 = false;
continue;
}
......@@ -90,9 +131,21 @@ int GenerateSkeletons(const char* input_file_name,
const Skeletons skeletons = skeleton_generator.GetSkeletons(domain16);
DCHECK(!skeletons.empty()) << "Failed to generate skeletons of " << domain;
for (const std::string& skeleton : skeletons) {
DCHECK(!skeleton.empty()) << "Empty skeleton for " << domain;
output += skeleton + ", " + domain + "\n";
// Generate skeletons for domains without their separators (e.g. googlecom).
// These skeletons are used in target embedding lookalikes.
base::string16 domain16_with_no_separators;
base::ReplaceChars(domain16, kLabelSeparators, base::string16(),
&domain16_with_no_separators);
const Skeletons no_separators_skeletons =
skeleton_generator.GetSkeletons(domain16_with_no_separators);
DCHECK(!no_separators_skeletons.empty())
<< "No skeletons generated for " << domain16_with_no_separators;
if (is_top_500) {
output +=
GenerateTop500OutputLine(skeletons, no_separators_skeletons, domain);
} else {
output += GenerateTop5kOutputLine(skeletons, domain);
}
std::vector<base::StringPiece> labels = base::SplitStringPiece(
......
......@@ -6,10 +6,19 @@
# components/url_formatter/spoof_checks/make_top_domain_skeletons.cc
# DO NOT MANUALLY EDIT!
# This list contains top 500 domains followed by the top 5000 domains. These are
# separated by ###END_TOP_500### line.
# For top 500 domains, each row has three columns: full skeleton, skeleton
# without label separators (e.g. '.' and '-'), and the domain itself.
# For top 5000 domains, each row has two columns: full skeleton and the domain
# itself.
# Each entry is the skeleton of a top domain for the confusability check
# in components/url_formatter/url_formatter.cc.
d4OOO.corn, d4000.com
d4OOO.corn, d4OOOcorn, d4000.com
###END_TOP_500###
digklrno68.corn, digklmo68.com
digklrno68.co.uk, digklmo68.co.uk
......
......@@ -51,6 +51,37 @@ void CheckName(const std::string& name) {
}
}
// Builds the trie entry that maps |skeleton| to the top domain |hostname|.
//
// |skeleton_type| and |is_top_500| are stored on the entry unchanged.
// |all_skeletons| collects every skeleton handled so far; the process is
// terminated via CHECK if |skeleton| is already present in it. Both the
// skeleton and the resulting host name are validated with CheckName().
std::unique_ptr<TopDomainEntry> MakeEntry(
    const std::string& hostname,
    const std::string& skeleton,
    url_formatter::SkeletonType skeleton_type,
    bool is_top_500,
    std::set<std::string>* all_skeletons) {
  // Another site has the same skeleton. This is low probability so stop now.
  // insert() reports whether the skeleton was new, so a single lookup both
  // detects the duplicate and records the skeleton.
  const bool is_new_skeleton = all_skeletons->insert(skeleton).second;
  CHECK(is_new_skeleton)
      << "A domain with the same skeleton is already in the list (" << skeleton
      << ").";

  auto entry = std::make_unique<TopDomainEntry>();

  // TODO: Should we lowercase these?
  entry->skeleton = skeleton;

  // There might be unicode domains in the list. Store them in punycode in
  // the trie.
  const GURL domain(std::string("http://") + hostname);
  entry->top_domain = domain.host();

  entry->is_top_500 = is_top_500;
  entry->skeleton_type = skeleton_type;

  CheckName(entry->skeleton);
  CheckName(entry->top_domain);

  return entry;
}
} // namespace
int main(int argc, char* argv[]) {
......@@ -97,7 +128,7 @@ int main(int argc, char* argv[]) {
bool is_top_500 = true;
TopDomainEntries entries;
std::set<std::string> skeletons;
std::set<std::string> all_skeletons;
for (std::string line : lines) {
base::TrimWhitespaceASCII(line, base::TRIM_ALL, &line);
......@@ -109,35 +140,27 @@ int main(int argc, char* argv[]) {
if (line.empty() || line[0] == '#') {
continue;
}
auto entry = std::make_unique<TopDomainEntry>();
std::vector<std::string> tokens = base::SplitString(
line, ",", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
CHECK_EQ(2u, tokens.size()) << "Invalid line: " << tokens[0];
const std::string skeleton = tokens[0];
// Another site has the same skeleton. This is low proability so stop now.
CHECK(skeletons.find(skeleton) == skeletons.end())
<< "A domain with the same skeleton is already in the list ("
<< skeleton << ").";
skeletons.insert(skeleton);
// TODO: Should we lowercase these?
entry->skeleton = skeleton;
// There might be unicode domains in the list. Store them in punycode in the
// trie.
const GURL domain(std::string("http://") + tokens[1]);
entry->top_domain = domain.host();
entry->is_top_500 = is_top_500;
CheckName(entry->skeleton);
CheckName(entry->top_domain);
entries.push_back(std::move(entry));
// Top 500 domains will have full skeletons as well as skeletons without
// label separators (e.g. '.' and '-').
if (is_top_500) {
CHECK_EQ(3u, tokens.size()) << "Invalid line: " << tokens[0];
entries.push_back(MakeEntry(tokens[2], tokens[0],
url_formatter::SkeletonType::kFull,
/*is_top_500=*/true, &all_skeletons));
entries.push_back(MakeEntry(
tokens[2], tokens[1], url_formatter::SkeletonType::kSeparatorsRemoved,
/*is_top_500=*/true, &all_skeletons));
} else {
CHECK_EQ(2u, tokens.size()) << "Invalid line: " << tokens[0];
entries.push_back(MakeEntry(tokens[1], tokens[0],
url_formatter::SkeletonType::kFull,
/*is_top_500=*/false, &all_skeletons));
}
}
base::FilePath template_path = base::FilePath::FromUTF8Unsafe(args[1]);
......
......@@ -9,6 +9,15 @@
namespace url_formatter {
// Counts how many bits are required to encode |input|: the value is shifted
// right until nothing is left, counting one bit per shift. Yields 0 for 0.
uint8_t BitLength(uint32_t input) {
  uint8_t width = 0;
  for (; input != 0; input >>= 1) {
    ++width;
  }
  return width;
}
namespace top_domains {
TopDomainTrieEntry::TopDomainTrieEntry(
......@@ -27,13 +36,20 @@ std::string TopDomainTrieEntry::name() const {
bool TopDomainTrieEntry::WriteEntry(
net::huffman_trie::TrieBitBuffer* writer) const {
// Make sure the assigned bit length is enough to encode all SkeletonType
// values.
DCHECK_EQ(kSkeletonTypeBitLength,
BitLength(url_formatter::SkeletonType::kMaxValue));
if (entry_->skeleton == entry_->top_domain) {
writer->WriteBit(1);
writer->WriteBit(entry_->is_top_500 ? 1 : 0);
writer->WriteBits(entry_->skeleton_type, kSkeletonTypeBitLength);
return true;
}
writer->WriteBit(0);
writer->WriteBit(entry_->is_top_500 ? 1 : 0);
writer->WriteBits(entry_->skeleton_type, kSkeletonTypeBitLength);
std::string top_domain = entry_->top_domain;
// With the current top 10,000 domains, this optimization reduces the
......
......@@ -13,12 +13,30 @@
namespace url_formatter {
// The |SkeletonType| and |TopDomainEntry| are mirrored in trie_entry.h. These
// are used to insert and read nodes from the Trie.
// The type of skeleton in the trie node. This type is encoded by
// |kSkeletonTypeBitLength| bits in the trie.
enum SkeletonType {
// The skeleton represents the full domain (e.g. google.corn).
kFull = 0,
// The skeleton represents the domain with '.'s and '-'s removed (e.g.
// googlecorn).
kSeparatorsRemoved = 1,
// Max value used to determine the number of different types. Update this and
// |kSkeletonTypeBitLength| when new SkeletonTypes are added.
kMaxValue = kSeparatorsRemoved
};
const uint8_t kSkeletonTypeBitLength = 1;
namespace top_domains {
// A top domain together with one of its skeletons, used to insert nodes into
// the trie.
struct TopDomainEntry {
  // Skeleton that the entry is stored under.
  std::string skeleton;
  // Host name of the top domain that |skeleton| belongs to.
  std::string top_domain;
  // True if the domain is in the top 500. Initialized so that a
  // default-constructed entry never holds an indeterminate value.
  bool is_top_500 = false;
  // Type of |skeleton|. Defaults to kFull for the same reason.
  SkeletonType skeleton_type = SkeletonType::kFull;
};
class TopDomainTrieEntry : public net::huffman_trie::TrieEntry {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment