Commit d7f04496 authored by rsleevi@chromium.org

Reduce footprint of registry controlled domain table

The perfect hash table containing all registry controlled domains is
replaced by a compact graph (a dafsa) to reduce binary size and PSS
of the running process. Size of the new structure is about 33kB
compared to 380kB for the perfect hash table.

Patch by Olle Liljenzin <ollel@opera.com>, originally at
https://codereview.chromium.org/197183002/

BUG=370672
R=brettw@chromium.org

Review URL: https://codereview.chromium.org/270363003

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@270039 0039d316-1c4b-4281-b951-d872f2087c98
parent 28e06318
......@@ -82,12 +82,13 @@ component("net") {
include_dirs = []
deps = [
":net_resources",
":net_resources",
"//base",
"//base:i18n",
"//base/third_party/dynamic_annotations",
"//crypto",
"//crypto:platform",
"//net/base/registry_controlled_domains:registry_controlled_domains",
"//sdch",
"//third_party/icu",
"//third_party/zlib",
......
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Converts each listed .gperf domain list into a generated "<name>-inc.cc"
# file by running make_dafsa.py once per source file.
action_foreach("registry_controlled_domains") {
# Generator script; invoked separately for every entry in |sources|.
script = "//net/tools/tld_cleanup/make_dafsa.py"
# The first entry is the full effective-TLD list; the unittestN files are
# presumably the small fixtures consumed by the unit tests.
sources = [
"effective_tld_names.gperf",
"effective_tld_names_unittest1.gperf",
"effective_tld_names_unittest2.gperf",
"effective_tld_names_unittest3.gperf",
"effective_tld_names_unittest4.gperf",
"effective_tld_names_unittest5.gperf",
"effective_tld_names_unittest6.gperf",
]
# One output per source: {{source_name_part}} is the source file name
# without its extension, so foo.gperf produces foo-inc.cc in the gen dir.
outputs = [
"${target_gen_dir}/{{source_name_part}}-inc.cc"
]
# The script receives the input path followed by the output path. The output
# path is rebased to the build directory, mirroring the pattern in |outputs|.
args = [
"{{source}}",
rebase_path("${target_gen_dir}/{{source_name_part}}-inc.cc", root_build_dir)
]
}
/* C++ code produced by gperf version 3.0.3 */
/* Command-line: gperf -a -L C++ -C -c -o -t -k '*' -NFindDomain -ZPerfect_Hash_Test1 -P -K name_offset -Q stringpool1 -D effective_tld_names_unittest1.gperf */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>."
#endif
#line 1 "effective_tld_names_unittest1.gperf"
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Test file used by registry_controlled_domain_unittest.
// We edit this file manually, then run
// gperf -a -L "C++" -C -c -o -t -k '*' -NFindDomain -ZPerfect_Hash_Test1 -P -K name_offset -Q stringpool1 -D effective_tld_names_unittest1.gperf > effective_tld_names_unittest1.cc
// to generate the perfect hashmap.
#line 10 "effective_tld_names_unittest1.gperf"
// One entry of the effective-TLD rule table.
struct DomainRule {
  int name_offset;  // Byte offset of the rule's name inside the string pool.
  int type;         // 1: exception, 2: wildcard, 4: private
};

// Limits computed by gperf for the 11 keywords of this table. FindDomain()
// uses the length and hash bounds to reject impossible inputs early.
#define TOTAL_KEYWORDS 11
#define MIN_WORD_LENGTH 1
#define MAX_WORD_LENGTH 11
#define MIN_HASH_VALUE 1
#define MAX_HASH_VALUE 17
/* maximum key range = 17, duplicates = 0 */

// gperf-generated perfect hash over the test domain list.
class Perfect_Hash_Test1
{
private:
  static inline unsigned int hash (const char *str, unsigned int len);
public:
  static const struct DomainRule *FindDomain (const char *str, unsigned int len);
};

// Computes the hash of the first |len| bytes of |str|: |len| plus the
// associated value of every byte. The caller must guarantee
// 1 <= len <= MAX_WORD_LENGTH; the switch reads str[0] .. str[len - 1].
//
// Note: the obsolete 'register' storage-class specifier emitted by gperf has
// been dropped; it is ill-formed as of C++17 and never affected behaviour.
inline unsigned int
Perfect_Hash_Test1::hash (const char *str, unsigned int len)
{
  static const unsigned char asso_values[] =
    {
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 0, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 0, 0, 0,
      18, 5, 0, 18, 18, 0, 0, 18, 18, 0,
      5, 0, 0, 18, 0, 18, 5, 18, 0, 18,
      18, 18, 0, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
      18, 18, 18, 18, 18, 18
    };
  int hval = len;

  // Every case falls through so that all bytes below index |len| contribute.
  switch (hval)
    {
      default:
        hval += asso_values[(unsigned char)str[10]];
      /*FALLTHROUGH*/
      case 10:
        hval += asso_values[(unsigned char)str[9]];
      /*FALLTHROUGH*/
      case 9:
        hval += asso_values[(unsigned char)str[8]];
      /*FALLTHROUGH*/
      case 8:
        hval += asso_values[(unsigned char)str[7]];
      /*FALLTHROUGH*/
      case 7:
        hval += asso_values[(unsigned char)str[6]];
      /*FALLTHROUGH*/
      case 6:
        hval += asso_values[(unsigned char)str[5]];
      /*FALLTHROUGH*/
      case 5:
        hval += asso_values[(unsigned char)str[4]];
      /*FALLTHROUGH*/
      case 4:
        hval += asso_values[(unsigned char)str[3]];
      /*FALLTHROUGH*/
      case 3:
        hval += asso_values[(unsigned char)str[2]];
      /*FALLTHROUGH*/
      case 2:
        hval += asso_values[(unsigned char)str[1]];
      /*FALLTHROUGH*/
      case 1:
        hval += asso_values[(unsigned char)str[0]];
        break;
    }
  return hval;
}

// Layout of the string pool: one NUL-terminated array per keyword, stored
// back to back so that a rule can refer to its name by byte offset.
struct stringpool1_t
  {
    char stringpool1_str0[sizeof("c")];
    char stringpool1_str1[sizeof("jp")];
    char stringpool1_str2[sizeof("b.c")];
    char stringpool1_str3[sizeof("ac.jp")];
    char stringpool1_str4[sizeof("bar.jp")];
    char stringpool1_str5[sizeof("no")];
    char stringpool1_str6[sizeof("baz.bar.jp")];
    char stringpool1_str7[sizeof("bar.baz.com")];
    char stringpool1_str8[sizeof("priv.no")];
    char stringpool1_str9[sizeof("pref.bar.jp")];
    char stringpool1_str10[sizeof("private")];
  };
static const struct stringpool1_t stringpool1_contents =
  {
    "c",
    "jp",
    "b.c",
    "ac.jp",
    "bar.jp",
    "no",
    "baz.bar.jp",
    "bar.baz.com",
    "priv.no",
    "pref.bar.jp",
    "private"
  };
#define stringpool1 ((const char *) &stringpool1_contents)

// Looks up the first |len| bytes of |str| in the rule table.
// Returns a pointer to the matching DomainRule, or 0 when |len| is outside
// [MIN_WORD_LENGTH, MAX_WORD_LENGTH] or the bytes do not match the keyword
// their hash selects. Only the first |len| bytes of |str| are examined.
const struct DomainRule *
Perfect_Hash_Test1::FindDomain (const char *str, unsigned int len)
{
  // Each name_offset is the offset of the keyword's member in stringpool1_t.
  static const struct DomainRule wordlist[] =
    {
#line 21 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str0, 2},
#line 15 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str1, 0},
#line 22 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str2, 1},
#line 16 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str3, 0},
#line 17 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str4, 2},
#line 23 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str5, 0},
#line 18 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str6, 2},
#line 20 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str7, 0},
#line 24 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str8, 4},
#line 19 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str9, 1},
#line 25 "effective_tld_names_unittest1.gperf"
      {(int)(long)&((struct stringpool1_t *)0)->stringpool1_str10, 4}
    };

  // Maps a hash value to an index into |wordlist|; -1 marks an unused slot.
  static const signed char lookup[] =
    {
      -1,  0,  1,  2, -1,  3,  4,  5, -1, -1,  6,  7,  8, -1,
      -1, -1,  9, 10
    };

  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
    {
      int key = hash (str, len);

      if (key <= MAX_HASH_VALUE && key >= 0)
        {
          int index = lookup[key];

          if (index >= 0)
            {
              const char *s = wordlist[index].name_offset + stringpool1;

              // The hash is only collision-free for table members, so the
              // candidate must still be compared byte by byte.
              if (*str == *s && !strncmp (str + 1, s + 1, len - 1) && s[len] == '\0')
                return &wordlist[index];
            }
        }
    }
  return 0;
}
#line 26 "effective_tld_names_unittest1.gperf"
/* C++ code produced by gperf version 3.0.3 */
/* Command-line: gperf -a -L C++ -C -c -o -t -k '*' -NFindDomain -ZPerfect_Hash_Test2 -P -K name_offset -Q stringpool2 -D -T effective_tld_names_unittest2.gperf */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
&& ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
&& ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
&& ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
&& ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
&& ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
&& ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
&& ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646. */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>."
#endif
#line 1 "effective_tld_names_unittest2.gperf"
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Test file used by registry_controlled_domain_unittest.
// We edit this file manually, then run
// gperf -a -L "C++" -C -c -o -t -k '*' -NFindDomain -ZPerfect_Hash_Test2 -P -K name_offset -Q stringpool2 -D -T effective_tld_names_unittest2.gperf > effective_tld_names_unittest2.cc
// to generate the perfect hashmap.
// Limits computed by gperf for the 2 keywords of this table. FindDomain()
// uses the length and hash bounds to reject impossible inputs early.
#define TOTAL_KEYWORDS 2
#define MIN_WORD_LENGTH 2
#define MAX_WORD_LENGTH 6
#define MIN_HASH_VALUE 2
#define MAX_HASH_VALUE 6
/* maximum key range = 5, duplicates = 0 */

// gperf-generated perfect hash over the second test domain list. The
// DomainRule type is not defined here (gperf was run with -T); it has to be
// declared before this code is included.
class Perfect_Hash_Test2
{
private:
  static inline unsigned int hash (const char *str, unsigned int len);
public:
  static const struct DomainRule *FindDomain (const char *str, unsigned int len);
};

// Computes the hash of the first |len| bytes of |str|: |len| plus the
// associated value of every byte. The caller must guarantee
// 1 <= len <= MAX_WORD_LENGTH; the switch reads str[0] .. str[len - 1].
//
// Note: the obsolete 'register' storage-class specifier emitted by gperf has
// been dropped; it is ill-formed as of C++17 and never affected behaviour.
inline unsigned int
Perfect_Hash_Test2::hash (const char *str, unsigned int len)
{
  static const unsigned char asso_values[] =
    {
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 0, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 0, 0, 7,
      7, 7, 7, 7, 7, 7, 0, 7, 7, 7,
      7, 7, 0, 7, 0, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
      7, 7, 7, 7, 7, 7
    };
  int hval = len;

  // Every case falls through so that all bytes below index |len| contribute.
  switch (hval)
    {
      default:
        hval += asso_values[(unsigned char)str[5]];
      /*FALLTHROUGH*/
      case 5:
        hval += asso_values[(unsigned char)str[4]];
      /*FALLTHROUGH*/
      case 4:
        hval += asso_values[(unsigned char)str[3]];
      /*FALLTHROUGH*/
      case 3:
        hval += asso_values[(unsigned char)str[2]];
      /*FALLTHROUGH*/
      case 2:
        hval += asso_values[(unsigned char)str[1]];
      /*FALLTHROUGH*/
      case 1:
        hval += asso_values[(unsigned char)str[0]];
        break;
    }
  return hval;
}

// Layout of the string pool: one NUL-terminated array per keyword, stored
// back to back so that a rule can refer to its name by byte offset.
struct stringpool2_t
  {
    char stringpool2_str0[sizeof("jp")];
    char stringpool2_str1[sizeof("bar.jp")];
  };
static const struct stringpool2_t stringpool2_contents =
  {
    "jp",
    "bar.jp"
  };
#define stringpool2 ((const char *) &stringpool2_contents)

// Looks up the first |len| bytes of |str| in the rule table.
// Returns a pointer to the matching DomainRule, or 0 when |len| is outside
// [MIN_WORD_LENGTH, MAX_WORD_LENGTH] or the bytes do not match the keyword
// their hash selects. Only the first |len| bytes of |str| are examined.
const struct DomainRule *
Perfect_Hash_Test2::FindDomain (const char *str, unsigned int len)
{
  // Each name_offset is the offset of the keyword's member in stringpool2_t.
  static const struct DomainRule wordlist[] =
    {
#line 15 "effective_tld_names_unittest2.gperf"
      {(int)(long)&((struct stringpool2_t *)0)->stringpool2_str0, 0},
#line 16 "effective_tld_names_unittest2.gperf"
      {(int)(long)&((struct stringpool2_t *)0)->stringpool2_str1, 0}
    };

  // Maps a hash value to an index into |wordlist|; -1 marks an unused slot.
  static const signed char lookup[] =
    {
      -1, -1,  0, -1, -1, -1,  1
    };

  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
    {
      int key = hash (str, len);

      if (key <= MAX_HASH_VALUE && key >= 0)
        {
          int index = lookup[key];

          if (index >= 0)
            {
              const char *s = wordlist[index].name_offset + stringpool2;

              // The hash is only collision-free for table members, so the
              // candidate must still be compared byte by byte.
              if (*str == *s && !strncmp (str + 1, s + 1, len - 1) && s[len] == '\0')
                return &wordlist[index];
            }
        }
    }
  return 0;
}
#line 17 "effective_tld_names_unittest2.gperf"
This file is for testing 2 byte offsets
%%
0____________________________________________________________________________________________________0, 0
1____________________________________________________________________________________________________1, 4
2____________________________________________________________________________________________________2, 0
3____________________________________________________________________________________________________3, 4
4____________________________________________________________________________________________________4, 0
5____________________________________________________________________________________________________5, 4
6____________________________________________________________________________________________________6, 0
7____________________________________________________________________________________________________7, 4
8____________________________________________________________________________________________________8, 0
9____________________________________________________________________________________________________9, 4
%%
This file is for testing joined prefixes
%%
ai, 0
bj, 4
aak, 0
bbl, 4
aaaam, 0
bbbbn, 0
%%
This file is for testing joined suffixes
%%
ia, 0
jb, 4
kaa, 0
lbb, 4
maaaa, 0
nbbbb, 0
%%
......@@ -53,26 +53,151 @@
#include "url/gurl.h"
#include "url/url_parse.h"
#include "effective_tld_names.cc"
namespace net {
namespace registry_controlled_domains {
namespace {
#include "net/base/registry_controlled_domains/effective_tld_names-inc.cc"
// See make_dafsa.py for documentation of the generated dafsa byte array.
const unsigned char* g_graph = kDafsa;
size_t g_graph_length = sizeof(kDafsa);
const int kNotFound = -1;
const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;
const FindDomainPtr kDefaultFindDomainFunction = Perfect_Hash::FindDomain;
// Decodes the offset entry stored at |*pos| and adds its value to |*offset|.
//
// The two bits selected by 0x60 in the first byte choose the encoding:
//   0x60 -> three bytes, value is the low 5 bits followed by two more bytes;
//   0x40 -> two bytes, value is the low 5 bits followed by one more byte;
//   else -> one byte, value is the low 6 bits.
// If the top bit (0x80) of the first byte is set, this was the last entry of
// the list and |*pos| is moved to |end|, so that the next call returns false.
// Otherwise |*pos| is advanced past the bytes just decoded.
//
// Returns false, leaving everything untouched, once |*pos| has reached |end|.
bool GetNextOffset(const unsigned char** pos, const unsigned char* end,
                   const unsigned char** offset) {
  const unsigned char* const cursor = *pos;
  if (cursor == end)
    return false;

  // An offset is always followed by at least a node to skip over and a
  // destination node, and no object is smaller than one byte. Hence at least
  // three bytes must remain whenever an offset is read.
  CHECK_LT(cursor + 2, end);

  const unsigned char head = cursor[0];
  size_t encoded_size = 1;
  if ((head & 0x60) == 0x60) {  // Three byte offset
    *offset += ((head & 0x1F) << 16) | (cursor[1] << 8) | cursor[2];
    encoded_size = 3;
  } else if ((head & 0x60) == 0x40) {  // Two byte offset
    *offset += ((head & 0x1F) << 8) | cursor[1];
    encoded_size = 2;
  } else {  // One byte offset
    *offset += head & 0x3F;
  }

  const bool is_last_entry = (head & 0x80) != 0;
  *pos = is_last_entry ? end : cursor + encoded_size;
  return true;
}
// Tells whether the byte at |offset| carries the end-of-label marker, i.e.
// has its top bit (0x80) set. |offset| must lie before |end|.
bool IsEOL(const unsigned char* offset, const unsigned char* end) {
  CHECK_LT(offset, end);
  const unsigned char node = *offset;
  const bool has_end_marker = (node & 0x80) != 0;
  return has_end_marker;
}
// Compares the byte at |offset| with the first character of |key|.
// This is the variant for characters that are not the last one in a label,
// so the byte is compared as is. |offset| must lie before |end|.
bool IsMatch(const unsigned char* offset, const unsigned char* end,
             const char* key) {
  CHECK_LT(offset, end);
  const int node_value = *offset;
  const int key_value = *key;
  return node_value == key_value;
}
// Compares the byte at |offset| with the first character of |key|.
// This is the variant for the last character of a label: such a byte is
// stored with its top bit (0x80) set, so the key character is compared with
// that bit added. |offset| must lie before |end|.
bool IsEndCharMatch(const unsigned char* offset, const unsigned char* end,
                    const char* key) {
  CHECK_LT(offset, end);
  const int expected = *key | 0x80;
  return *offset == expected;
}
// 'stringpool' is defined as a macro by the gperf-generated
// "effective_tld_names.cc". Provide a real constant value for it instead.
const char* const kDefaultStringPool = stringpool;
#undef stringpool
// Tries to interpret the byte at |offset| as a return value node.
// Such a node has its three top bits equal to 100 (byte & 0xE0 == 0x80); its
// low four bits are then stored in |*return_value| and true is returned.
// For any other byte |*return_value| is left untouched and false is returned.
// |offset| must lie before |end|.
bool GetReturnValue(const unsigned char* offset, const unsigned char* end,
                    int* return_value) {
  CHECK_LT(offset, end);
  const unsigned char node = *offset;
  const bool is_return_node = (node & 0xE0) == 0x80;
  if (is_return_node)
    *return_value = node & 0x0F;
  return is_return_node;
}
FindDomainPtr g_find_domain_function = kDefaultFindDomainFunction;
const char* g_stringpool = kDefaultStringPool;
// Lookup a domain key in a byte array generated by make_dafsa.py.
// The rule type is returned if key is found, otherwise kNotFound is returned.
//
// |graph| and |length| describe the byte array, |key| and |key_length| the
// string to look for (it is delimited by length, not by a terminating NUL).
//
// The walk keeps three cursors: |pos| iterates over the offset list of the
// current node, |offset| points at the child node selected by the most
// recently read offset, and |key| points at the first unmatched character.
// For every child, as many label characters as possible are matched against
// the key; a full match either ends in a return value (key exhausted) or
// continues with the offset list that follows the label's end character.
int LookupString(const unsigned char* graph, size_t length, const char* key,
size_t key_length) {
const unsigned char* pos = graph;
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
// Each iteration inspects one child of the current node. The loop ends when
// the offset list is exhausted, which means none of the children matched.
while (GetNextOffset(&pos, end, &offset)) {
// Possible shapes of the child at |offset|:
// char <char>+ end_char offsets
// char <char>+ return value
// char end_char offsets
// char return value
// end_char offsets
// return_value
// Set once at least one character of this child's label has been matched.
bool did_consume = false;
if (key != key_end && !IsEOL(offset, end)) {
// Leading <char> is not a match. Don't dive into this child
if (!IsMatch(offset, end, key))
continue;
did_consume = true;
++offset;
++key;
// Possible matches at this point:
// <char>+ end_char offsets
// <char>+ return value
// end_char offsets
// return value
// Remove all remaining <char> nodes possible
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key))
return kNotFound;
++key;
++offset;
}
}
// Possible matches at this point:
// end_char offsets
// return_value
// If one or more <char> elements were consumed, a failure
// to match is terminal. Otherwise, try the next node.
if (key == key_end) {
// The whole key has been matched; it is present only if a return value
// node follows here.
int return_value;
if (GetReturnValue(offset, end, &return_value))
return return_value;
// The DAFSA guarantees that if the first char is a match, all
// remaining char elements MUST match if the key is truly present.
if (did_consume)
return kNotFound;
continue;
}
// Key characters remain, so the byte at |offset| has to be the label's
// end character and must match the next key character.
if (!IsEndCharMatch(offset, end, key)) {
if (did_consume)
return kNotFound; // Unexpected
continue;
}
++key;
// The offset list of the matched child starts right after its end
// character; continue the walk from there.
pos = ++offset; // Dive into child
}
return kNotFound; // No match
}
size_t GetRegistryLengthImpl(
const std::string& host,
......@@ -105,46 +230,40 @@ size_t GetRegistryLengthImpl(
return 0; // This can't have a registry + domain.
while (1) {
const char* domain_str = host.data() + curr_start;
int domain_length = host_check_len - curr_start;
const DomainRule* rule = g_find_domain_function(domain_str, domain_length);
// We need to compare the string after finding a match because the
// no-collisions of perfect hashing only refers to items in the set. Since
// we're searching for arbitrary domains, there could be collisions.
// Furthermore, if the apparent match is a private registry and we're not
// including those, it can't be an actual match.
if (rule) {
bool do_check = !(rule->type & kPrivateRule) ||
private_filter == INCLUDE_PRIVATE_REGISTRIES;
if (do_check && base::strncasecmp(domain_str,
g_stringpool + rule->name_offset,
domain_length) == 0) {
// Exception rules override wildcard rules when the domain is an exact
// match, but wildcards take precedence when there's a subdomain.
if (rule->type & kWildcardRule && (prev_start != std::string::npos)) {
// If prev_start == host_check_begin, then the host is the registry
// itself, so return 0.
return (prev_start == host_check_begin) ?
0 : (host.length() - prev_start);
}
size_t domain_length = host_check_len - curr_start;
int type = LookupString(g_graph, g_graph_length, domain_str, domain_length);
bool do_check =
type != kNotFound && (!(type & kPrivateRule) ||
private_filter == INCLUDE_PRIVATE_REGISTRIES);
// If the apparent match is a private registry and we're not including
// those, it can't be an actual match.
if (do_check) {
// Exception rules override wildcard rules when the domain is an exact
// match, but wildcards take precedence when there's a subdomain.
if (type & kWildcardRule && (prev_start != std::string::npos)) {
// If prev_start == host_check_begin, then the host is the registry
// itself, so return 0.
return (prev_start == host_check_begin) ? 0
: (host.length() - prev_start);
}
if (rule->type & kExceptionRule) {
if (next_dot == std::string::npos) {
// If we get here, we had an exception rule with no dots (e.g.
// "!foo"). This would only be valid if we had a corresponding
// wildcard rule, which would have to be "*". But we explicitly
// disallow that case, so this kind of rule is invalid.
NOTREACHED() << "Invalid exception rule";
return 0;
}
return host.length() - next_dot - 1;
if (type & kExceptionRule) {
if (next_dot == std::string::npos) {
// If we get here, we had an exception rule with no dots (e.g.
// "!foo"). This would only be valid if we had a corresponding
// wildcard rule, which would have to be "*". But we explicitly
// disallow that case, so this kind of rule is invalid.
NOTREACHED() << "Invalid exception rule";
return 0;
}
// If curr_start == host_check_begin, then the host is the registry
// itself, so return 0.
return (curr_start == host_check_begin) ?
0 : (host.length() - curr_start);
return host.length() - next_dot - 1;
}
// If curr_start == host_check_begin, then the host is the registry
// itself, so return 0.
return (curr_start == host_check_begin) ? 0
: (host.length() - curr_start);
}
if (next_dot >= host_check_len) // Catches std::string::npos as well.
......@@ -260,10 +379,16 @@ size_t GetRegistryLength(
return GetRegistryLengthImpl(canon_host, unknown_filter, private_filter);
}
void SetFindDomainFunctionAndStringPoolForTesting(FindDomainPtr function,
const char* stringpool) {
g_find_domain_function = function ? function : kDefaultFindDomainFunction;
g_stringpool = stringpool ? stringpool : kDefaultStringPool;
// Points the lookup at the built-in |kDafsa| table again, using its full
// size as the graph length.
void SetFindDomainGraph() {
  g_graph_length = sizeof(kDafsa);
  g_graph = kDafsa;
}
// Makes subsequent lookups use the |length| bytes starting at |domains|
// instead of the current graph. |domains| must be non-null and |length|
// non-zero; both conditions are enforced with CHECKs before anything is
// changed. Only the pointer is stored, the bytes are not copied.
void SetFindDomainGraph(const unsigned char* domains, size_t length) {
  CHECK(domains);
  CHECK_NE(length, 0u);

  g_graph_length = length;
  g_graph = domains;
}
} // namespace registry_controlled_domains
......
......@@ -226,11 +226,12 @@ NET_EXPORT size_t GetRegistryLength(const std::string& host,
typedef const struct DomainRule* (*FindDomainPtr)(const char *, unsigned int);
// Used for unit tests, so that a different perfect hash map from the full
// list is used. Set to NULL to use the Default function.
NET_EXPORT_PRIVATE void SetFindDomainFunctionAndStringPoolForTesting(
FindDomainPtr fn, const char* stringpool);
// Used for unit tests. Use default domains.
NET_EXPORT_PRIVATE void SetFindDomainGraph();
// Used for unit tests, so that a frozen list of domains is used.
NET_EXPORT_PRIVATE void SetFindDomainGraph(const unsigned char* domains,
size_t length);
} // namespace registry_controlled_domains
} // namespace net
......
......@@ -6,15 +6,26 @@
#include "testing/gtest/include/gtest/gtest.h"
#include "url/gurl.h"
#include "effective_tld_names_unittest1.cc"
static const char* const Perfect_Hash_Test1_stringpool = stringpool1;
#undef TOTAL_KEYWORDS
#undef MIN_WORD_LENGTH
#undef MAX_WORD_LENGTH
#undef MIN_HASH_VALUE
#undef MAX_HASH_VALUE
#include "effective_tld_names_unittest2.cc"
static const char* const Perfect_Hash_Test2_stringpool = stringpool2;
namespace {
namespace test1 {
#include "net/base/registry_controlled_domains/effective_tld_names_unittest1-inc.cc"
}
namespace test2 {
#include "net/base/registry_controlled_domains/effective_tld_names_unittest2-inc.cc"
}
namespace test3 {
#include "net/base/registry_controlled_domains/effective_tld_names_unittest3-inc.cc"
}
namespace test4 {
#include "net/base/registry_controlled_domains/effective_tld_names_unittest4-inc.cc"
}
namespace test5 {
#include "net/base/registry_controlled_domains/effective_tld_names_unittest5-inc.cc"
}
namespace test6 {
#include "net/base/registry_controlled_domains/effective_tld_names_unittest6-inc.cc"
}
} // namespace
namespace net {
namespace registry_controlled_domains {
......@@ -50,6 +61,12 @@ size_t GetRegistryLengthFromHost(
return GetRegistryLength(host, unknown_filter, EXCLUDE_PRIVATE_REGISTRIES);
}
size_t GetRegistryLengthFromHostIncludingPrivate(
const std::string& host,
UnknownRegistryFilter unknown_filter) {
return GetRegistryLength(host, unknown_filter, INCLUDE_PRIVATE_REGISTRIES);
}
bool CompareDomains(const std::string& url1, const std::string& url2) {
GURL g1 = GURL(url1);
GURL g2 = GURL(url2);
......@@ -60,17 +77,16 @@ bool CompareDomains(const std::string& url1, const std::string& url2) {
class RegistryControlledDomainTest : public testing::Test {
protected:
void UseDomainData(FindDomainPtr function, const char* const stringpool) {
SetFindDomainFunctionAndStringPoolForTesting(function, stringpool);
template <typename Graph>
void UseDomainData(const Graph& graph) {
SetFindDomainGraph(graph, sizeof(Graph));
}
virtual void TearDown() {
SetFindDomainFunctionAndStringPoolForTesting(NULL, NULL);
}
virtual void TearDown() { SetFindDomainGraph(); }
};
TEST_F(RegistryControlledDomainTest, TestGetDomainAndRegistry) {
UseDomainData(Perfect_Hash_Test1::FindDomain, Perfect_Hash_Test1_stringpool);
UseDomainData(test1::kDafsa);
// Test GURL version of GetDomainAndRegistry().
EXPECT_EQ("baz.jp", GetDomainFromURL("http://a.baz.jp/file.html")); // 1
......@@ -129,7 +145,7 @@ TEST_F(RegistryControlledDomainTest, TestGetDomainAndRegistry) {
}
TEST_F(RegistryControlledDomainTest, TestGetRegistryLength) {
UseDomainData(Perfect_Hash_Test1::FindDomain, Perfect_Hash_Test1_stringpool);
UseDomainData(test1::kDafsa);
// Test GURL version of GetRegistryLength().
EXPECT_EQ(2U, GetRegistryLengthFromURL("http://a.baz.jp/file.html",
......@@ -248,7 +264,7 @@ TEST_F(RegistryControlledDomainTest, TestGetRegistryLength) {
}
TEST_F(RegistryControlledDomainTest, TestSameDomainOrHost) {
UseDomainData(Perfect_Hash_Test2::FindDomain, Perfect_Hash_Test2_stringpool);
UseDomainData(test2::kDafsa);
EXPECT_TRUE(CompareDomains("http://a.b.bar.jp/file.html",
"http://a.b.bar.jp/file.html")); // b.bar.jp
......@@ -295,7 +311,7 @@ TEST_F(RegistryControlledDomainTest, TestDefaultData) {
}
TEST_F(RegistryControlledDomainTest, TestPrivateRegistryHandling) {
UseDomainData(Perfect_Hash_Test1::FindDomain, Perfect_Hash_Test1_stringpool);
UseDomainData(test1::kDafsa);
// Testing the same dataset for INCLUDE_PRIVATE_REGISTRIES and
// EXCLUDE_PRIVATE_REGISTRIES arguments.
......@@ -347,6 +363,138 @@ TEST_F(RegistryControlledDomainTest, TestPrivateRegistryHandling) {
INCLUDE_UNKNOWN_REGISTRIES));
}
TEST_F(RegistryControlledDomainTest, TestDafsaTwoByteOffsets) {
  UseDomainData(test3::kDafsa);

  // Looks up keys in a DAFSA that requires two byte offsets.
  // Every label in this DAFSA begins and ends with a unique character, which
  // makes it impossible to merge labels. Each inner node is about 100 bytes
  // and a one byte offset can add at most 64 bytes to the previous offset,
  // so the paths must go over two byte offsets.
  //
  // Every host is "a.b." followed by a 102 character label: one leading
  // character, 100 underscores and one trailing character.
  const std::string filler(100, '_');
  const std::string host6 = "a.b.6" + filler + "6";
  const std::string host7 = "a.b.7" + filler + "7";
  const std::string host_a8 = "a.b.a" + filler + "8";

  EXPECT_EQ(102U,
            GetRegistryLengthFromHost(host6, EXCLUDE_UNKNOWN_REGISTRIES));

  // This label is only recognised when private registries are included.
  EXPECT_EQ(0U, GetRegistryLengthFromHost(host7, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(102U,
            GetRegistryLengthFromHostIncludingPrivate(
                host7, EXCLUDE_UNKNOWN_REGISTRIES));

  // The first and last characters of this label do not belong together.
  EXPECT_EQ(0U,
            GetRegistryLengthFromHost(host_a8, EXCLUDE_UNKNOWN_REGISTRIES));
}
TEST_F(RegistryControlledDomainTest, TestDafsaThreeByteOffsets) {
  UseDomainData(test4::kDafsa);

  // Looks up keys in a DAFSA that requires three byte offsets.
  // Every label in this DAFSA begins and ends with unique characters, which
  // makes it impossible to merge labels. The byte array has a size of ~54k.
  // A two byte offset can add at most 8k to the previous offset. Since we can
  // only skip forward in memory, the nodes representing the return values
  // must be located near the end of the byte array. The probability that we
  // can reach a return value from an arbitrary inner node without using a
  // three byte offset is small (but not zero). The test is therefore repeated
  // with several keys, so that with a reasonable probability at least one of
  // the tested paths has to go over a three byte offset.
  //
  // Every host is "a.b." followed by a 104 character label: two leading
  // characters, 100 underscores and two trailing characters.
  const std::string filler(100, '_');
  const std::string host_z6 = "a.b.Z6" + filler + "Z6";
  const std::string host_z7 = "a.b.Z7" + filler + "Z7";
  const std::string host_za_z8 = "a.b.Za" + filler + "Z8";

  EXPECT_EQ(104U,
            GetRegistryLengthFromHost(host_z6, EXCLUDE_UNKNOWN_REGISTRIES));

  // This label is only recognised when private registries are included.
  EXPECT_EQ(0U,
            GetRegistryLengthFromHost(host_z7, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(104U,
            GetRegistryLengthFromHostIncludingPrivate(
                host_z7, EXCLUDE_UNKNOWN_REGISTRIES));

  // The beginning and the end of this label do not belong together.
  EXPECT_EQ(0U,
            GetRegistryLengthFromHost(host_za_z8, EXCLUDE_UNKNOWN_REGISTRIES));
}
TEST_F(RegistryControlledDomainTest, TestDafsaJoinedPrefixes) {
  UseDomainData(test5::kDafsa);

  // Looks up keys in a DAFSA with compressed prefixes.
  // This DAFSA is constructed from words with similar prefixes but distinct
  // suffixes. The DAFSA will then form a trie with the implicit source node
  // as root.

  // Two character labels.
  const char* host_ai = "a.b.ai";
  const char* host_bj = "a.b.bj";
  EXPECT_EQ(2U, GetRegistryLengthFromHost(host_ai, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(0U, GetRegistryLengthFromHost(host_bj, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(2U,
            GetRegistryLengthFromHostIncludingPrivate(
                host_bj, EXCLUDE_UNKNOWN_REGISTRIES));

  // Three character labels.
  const char* host_aak = "a.b.aak";
  const char* host_bbl = "a.b.bbl";
  EXPECT_EQ(3U,
            GetRegistryLengthFromHost(host_aak, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(0U,
            GetRegistryLengthFromHost(host_bbl, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(3U,
            GetRegistryLengthFromHostIncludingPrivate(
                host_bbl, EXCLUDE_UNKNOWN_REGISTRIES));

  // Labels that consist only of a shared prefix must not be found.
  const char* host_aaa = "a.b.aaa";
  const char* host_bbb = "a.b.bbb";
  EXPECT_EQ(0U,
            GetRegistryLengthFromHostIncludingPrivate(
                host_aaa, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(0U,
            GetRegistryLengthFromHostIncludingPrivate(
                host_bbb, EXCLUDE_UNKNOWN_REGISTRIES));

  // Five character labels.
  const char* host_aaaam = "a.b.aaaam";
  const char* host_bbbbn = "a.b.bbbbn";
  EXPECT_EQ(5U,
            GetRegistryLengthFromHost(host_aaaam, EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(5U,
            GetRegistryLengthFromHost(host_bbbbn, EXCLUDE_UNKNOWN_REGISTRIES));
}
TEST_F(RegistryControlledDomainTest, TestDafsaJoinedSuffixes) {
  // Looks up keys in a DAFSA whose suffixes have been compressed. The test
  // data consists of words that share similar suffixes but start with
  // distinct prefixes, so the resulting DAFSA is a trie rooted at the
  // implicit sink node.
  UseDomainData(test6::kDafsa);

  // Two-character words: "ia" is found by the default lookup, "jb" only when
  // the private-inclusive lookup is used.
  EXPECT_EQ(2U,
            GetRegistryLengthFromHost("a.b.ia", EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(0U,
            GetRegistryLengthFromHost("a.b.jb", EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(2U,
            GetRegistryLengthFromHostIncludingPrivate(
                "a.b.jb", EXCLUDE_UNKNOWN_REGISTRIES));

  // Three-character words: same pattern with "kaa" and "lbb".
  EXPECT_EQ(3U,
            GetRegistryLengthFromHost("a.b.kaa", EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(0U,
            GetRegistryLengthFromHost("a.b.lbb", EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(3U,
            GetRegistryLengthFromHostIncludingPrivate(
                "a.b.lbb", EXCLUDE_UNKNOWN_REGISTRIES));

  // "aaa" and "bbb" must not match at all, even with private registries
  // included.
  EXPECT_EQ(0U,
            GetRegistryLengthFromHostIncludingPrivate(
                "a.b.aaa", EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(0U,
            GetRegistryLengthFromHostIncludingPrivate(
                "a.b.bbb", EXCLUDE_UNKNOWN_REGISTRIES));

  // Five-character words are both found by the default lookup.
  EXPECT_EQ(5U,
            GetRegistryLengthFromHost("a.b.maaaa", EXCLUDE_UNKNOWN_REGISTRIES));
  EXPECT_EQ(5U,
            GetRegistryLengthFromHost("a.b.nbbbb", EXCLUDE_UNKNOWN_REGISTRIES));
}
} // namespace registry_controlled_domains
} // namespace net
......@@ -44,6 +44,42 @@
'net.gypi',
],
'targets': [
# Target that turns every listed .gperf domain list into a generated
# "<name>-inc.cc" file by running make_dafsa.py on it.
{
'target_name': 'net_derived_sources',
# 'none': the target only runs the rule below.
'type': 'none',
# One production list plus six unittest lists; each is fed to the 'dafsa'
# rule because it carries the 'gperf' extension.
'sources': [
'base/registry_controlled_domains/effective_tld_names.gperf',
'base/registry_controlled_domains/effective_tld_names_unittest1.gperf',
'base/registry_controlled_domains/effective_tld_names_unittest2.gperf',
'base/registry_controlled_domains/effective_tld_names_unittest3.gperf',
'base/registry_controlled_domains/effective_tld_names_unittest4.gperf',
'base/registry_controlled_domains/effective_tld_names_unittest5.gperf',
'base/registry_controlled_domains/effective_tld_names_unittest6.gperf',
],
'rules': [
{
'rule_name': 'dafsa',
'extension': 'gperf',
# Generated file lands under the shared intermediate dir, mirroring the
# source directory: net/<dir of input>/<input basename>-inc.cc.
'outputs': [
'<(SHARED_INTERMEDIATE_DIR)/net/<(RULE_INPUT_DIRNAME)/<(RULE_INPUT_ROOT)-inc.cc',
],
# Listing the generator script as an input makes the outputs depend on it
# in addition to the .gperf source itself.
'inputs': [
'tools/tld_cleanup/make_dafsa.py',
],
# Invocation: python make_dafsa.py <input .gperf> <output -inc.cc>.
# The output path must stay identical to the entry in 'outputs' above.
'action': [
'python',
'tools/tld_cleanup/make_dafsa.py',
'<(RULE_INPUT_PATH)',
'<(SHARED_INTERMEDIATE_DIR)/net/<(RULE_INPUT_DIRNAME)/<(RULE_INPUT_ROOT)-inc.cc',
],
},
],
# Targets depending on this one get the shared intermediate dir on their
# include path, so they can #include the generated -inc.cc files.
'direct_dependent_settings': {
'include_dirs': [
'<(SHARED_INTERMEDIATE_DIR)'
],
},
},
{
'target_name': 'net',
'type': '<(component)',
......@@ -58,6 +94,7 @@
'../third_party/icu/icu.gyp:icuuc',
'../third_party/zlib/zlib.gyp:zlib',
'../url/url.gyp:url_lib',
'net_derived_sources',
'net_resources',
],
'sources': [
......@@ -503,7 +540,8 @@
'../url/url.gyp:url_lib',
'http_server',
'net',
'net_test_support'
'net_derived_sources',
'net_test_support',
],
'sources': [
'<@(net_test_sources)',
......
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Chromium presubmit script for src/net/tools/tld_cleanup."""
def _RunMakeDafsaTests(input_api, output_api):
  """Runs the make_dafsa unittest if any related file has been modified.

  Args:
    input_api: presubmit input API object; provides the changed paths
        (LocalPaths), the presubmit directory, the Python executable and
        the Command/RunTests helpers.
    output_api: presubmit output API object; its PresubmitPromptWarning is
        used as the result type for a failing test run.

  Returns:
    A list of presubmit results: whatever input_api.RunTests() returns when
    the unittest is run, or an empty list when neither make_dafsa.py nor its
    unittest is part of the change.
  """
  files = ('net/tools/tld_cleanup/make_dafsa.py',
           'net/tools/tld_cleanup/make_dafsa_unittest.py')
  # Fetch the changed paths once instead of once per candidate file.
  local_paths = input_api.LocalPaths()
  if not any(f in local_paths for f in files):
    # Presubmit check functions must return a list or tuple of results; a
    # bare `return` would hand None back to the framework.
    return []
  test_path = input_api.os_path.join(input_api.PresubmitLocalPath(),
                                     'make_dafsa_unittest.py')
  cmd_name = 'make_dafsa_unittest'
  cmd = [input_api.python_executable, test_path]
  test_cmd = input_api.Command(
      name=cmd_name,
      cmd=cmd,
      kwargs={},
      message=output_api.PresubmitPromptWarning)
  return input_api.RunTests([test_cmd])


def CheckChangeOnUpload(input_api, output_api):
  """Presubmit entry point for upload; returns a list of results."""
  return _RunMakeDafsaTests(input_api, output_api)


def CheckChangeOnCommit(input_api, output_api):
  """Presubmit entry point for commit; returns a list of results."""
  return _RunMakeDafsaTests(input_api, output_api)
......@@ -20,12 +20,9 @@ When updating src/net/base/registry_controlled_domains/effective_tld_names.dat:
src/build/Debug. It will re-generate
src/net/base/registry_controlled_domains/effective_tld_names.gperf.
6. Run gperf on the new effective_tld_names.gperf:
pushd src/net/base/registry_controlled_domains;
gperf -a -L "C++" -C -c -o -t -k '*' -NFindDomain -P -K name_offset -D -m 10 \
effective_tld_names.gperf > effective_tld_names.cc;
popd;
It will produce a new effective_tld_names.cc.
6. Check in the updated effective_tld_names.dat, effective_tld_names.gperf
7. Check in the updated effective_tld_names.dat, effective_tld_names.gperf,
and effective_tld_names.cc together.
Note that gperf is no longer used for effective_tld_names, but when building
Chromium, the file effective_tld_names.gperf will be parsed by make_dafsa.py
to generate the file effective_tld_names-inc.cc, which is included in
registry_controlled_domain.cc.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment