Commit 4da5767e authored by pkalinnikov's avatar pkalinnikov Committed by Commit bot

Data structures for Safe Browsing subresource filtering rules.

The rules are defined in two formats:

 1) Protocol Buffers, which is used as the wire format for transferring the
 rules, and is intended to remain relatively stable.

 2) FlatBuffers, which is used internally as a compact representation of the
 rules, and which can be memory mapped into renderer processes directly.

This CL also fixes GYP files for flatc for iOS.

This is a reland of https://codereview.chromium.org/2086213003/#ps160001 now that the linking issue has been fixed by https://codereview.chromium.org/2094273003/.

BUG=609747

Review-Url: https://codereview.chromium.org/2140623002
Cr-Commit-Position: refs/heads/master@{#404654}
parent 35269b7d
# Copyright 2015 The Chromium Authors. All rights reserved.
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
{
'targets': [
{
{
# GN version: //components/subresource_filter/core/browser
'target_name': 'subresource_filter_core_browser',
'type': 'static_library',
......@@ -53,6 +53,8 @@
'type': 'static_library',
'dependencies': [
'../base/base.gyp:base',
'subresource_filter_core_common_ruleset_flatbuffer',
'subresource_filter_core_common_ruleset_proto',
],
'include_dirs': [
'..',
......@@ -71,6 +73,40 @@
'subresource_filter/core/common/string_splitter.h',
'subresource_filter/core/common/uint64_hasher.h',
],
'export_dependent_settings': [
'subresource_filter_core_common_ruleset_flatbuffer',
'subresource_filter_core_common_ruleset_proto',
],
},
{
# GN version: //components/subresource_filter/core/common/flat:flatbuffer
'target_name': 'subresource_filter_core_common_ruleset_flatbuffer',
'type': 'none',
'sources': [
# Note: sources list duplicated in GN build.
'subresource_filter/core/common/flat/rules.fbs',
],
'variables': {
'flatc_out_dir': 'components/subresource_filter/core/common/flat'
},
'includes': ['../third_party/flatbuffers/flatc.gypi'],
'dependencies': [
'<(DEPTH)/third_party/flatbuffers/flatbuffers.gyp:flatbuffers',
]
},
{
# GN version: //components/subresource_filter/core/common/proto:proto
'target_name': 'subresource_filter_core_common_ruleset_proto',
'type': 'static_library',
'sources': [
# Note: sources list duplicated in GN build.
'subresource_filter/core/common/proto/rules.proto',
],
'variables': {
'proto_in_dir': 'subresource_filter/core/common/proto',
'proto_out_dir': 'components/subresource_filter/core/common/proto',
},
'includes': [ '../build/protoc.gypi' ],
},
],
'conditions': [
......
# Copyright 2015 The Chromium Authors. All rights reserved.
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
......@@ -16,6 +16,12 @@ source_set("common") {
"string_splitter.h",
"uint64_hasher.h",
]
public_deps = [
"//components/subresource_filter/core/common/flat:flatbuffer",
"//components/subresource_filter/core/common/proto:proto",
]
deps = [
"//base",
]
......
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import("//third_party/flatbuffers/flatbuffer.gni")
# GN counterpart of the GYP target
# 'subresource_filter_core_common_ruleset_flatbuffer'.
flatbuffer("flatbuffer") {
sources = [
# Note: sources list duplicated in GYP build.
"rules.fbs",
]
}
namespace subresource_filter.flat;
// Corresponds to subresource_filter::proto::UrlPatternType.
// Note: the numeric values do NOT coincide with the proto enum. The proto
// reserves 0 for URL_PATTERN_TYPE_UNSPECIFIED (so SUBSTRING is 1 there), while
// this enum has no UNSPECIFIED value and SUBSTRING is its first value.
// Conversions must therefore map the values by name, not by number.
enum UrlPatternType : ubyte {
SUBSTRING,
WILDCARDED,
REGEXP,
}
// Corresponds to subresource_filter::proto::AnchorType.
// Note: the numeric values do NOT coincide with the proto enum. The proto
// reserves 0 for ANCHOR_TYPE_UNSPECIFIED (so ANCHOR_TYPE_NONE is 1 there),
// while this enum has no UNSPECIFIED value and NONE is its first value.
// Conversions must therefore map the values by name, not by number.
enum AnchorType : ubyte {
NONE,
BOUNDARY,
SUBDOMAIN,
}
// URL rule matching options. These correspond to multiple fields of
// subresource_filter::proto::UrlRule, but here, they are represented as flags
// of the same bitmask to allow for compact storage.
//
// Because of the |bit_flags| attribute, each enumerator denotes a distinct
// bit (1 << position) rather than a sequential number, so the values can be
// OR-ed together and stored in UrlRule::options.
enum OptionFlag : ubyte (bit_flags) {
IS_WHITELIST,
APPLIES_TO_FIRST_PARTY,
APPLIES_TO_THIRD_PARTY,
IS_MATCH_CASE,
}
// The flat representation of a single URL rule. For more details regarding the
// fields please see the comments to subresource_filter::proto::UrlRule.
table UrlRule {
// Rule matching options, a bitmask consisting of OptionFlags. Declared as a
// plain ubyte (not as OptionFlag) and has no explicit default.
options : ubyte;
// A bitmask of element types, same as proto::UrlRule::element_types. Enables
// all element types except POPUP by default: 2047 is
// proto::ELEMENT_TYPE_ALL (4095) with the proto::ELEMENT_TYPE_POPUP (2048)
// bit cleared.
element_types : ushort = 2047;
// A bitmask of activation types, same as proto::UrlRule::activation_types.
// Disables all activation types by default.
activation_types : ubyte = 0;
// Use SUBSTRING as default, since it's the most used pattern type. Same as
// the corresponding proto::UrlRule::url_pattern_type.
url_pattern_type : UrlPatternType = SUBSTRING;
// Use NONE as default, since most of the rules are not anchored.
anchor_left : AnchorType = NONE;
anchor_right : AnchorType = NONE;
// The list of domains to be included/excluded from the filter's affected set.
// If a particular string in the list starts with '~' then the respective
// domain is excluded, otherwise included. This replaces the separate
// |exclude| boolean of proto::DomainListItem.
domains : [string];
// A URL pattern in the format defined by |url_pattern_type|.
url_pattern : string;
// The compound Knuth-Morris-Pratt failure function corresponding to
// |url_pattern|. Used for SUBSTRING and WILDCARDED URL patterns only.
//
// The |url_pattern| is split into subpatterns separated by a '*' wildcard.
// Then for each subpattern a failure function of the KMP algorithm is built,
// with the caveat that if some subpattern contains at least one '^'
// placeholder, all the separator characters in this subpattern are
// considered equivalent, and the failure function subarray is prefixed with
// the value 1.
//
// The failure functions of subpatterns are stored sequentially in the
// |failure_function| array. Some subpatterns, however, will not have a
// corresponding failure function, e.g. the first subpattern if the rule's
// |anchor_left| is BOUNDARY.
failure_function : [ubyte];
}
// Contains an N-gram (acting as a key in a hash table) and a list of URL rules
// associated with that N-gram.
table NGramToRules {
// A string consisting of N (up to 8) non-special characters, which are stored
// in the lowest N non-zero bytes, lower bytes corresponding to later symbols.
// The limit of 8 follows from the 64-bit width of |ulong|, one byte per
// character.
ngram : ulong;
// The list of rules containing |ngram| as a substring of their URL pattern.
rule_list : [UrlRule];
}
// A data structure used to select only a handful of URL rule candidates that
// need to be matched against a certain resource URL.
table UrlPatternIndex {
// The N of an N-gram index. Note: |n| should be between 1 and 8, because an
// N-gram has to fit into the 8-byte NGramToRules::ngram key.
n : uint;
// A hash table with open addressing. The keys of the table are N-grams.
ngram_index : [NGramToRules];
// The slot that is pointed to by all empty slots of |ngram_index| hash table.
// Note: This is a workaround needed because null offsets are not allowed as
// elements of FlatBuffer arrays.
ngram_index_empty_slot : NGramToRules;
// A list storing the rules that don't contain any valid N-grams in their
// URL patterns. Contains all the REGEXP rules as well.
// TODO(pkalinnikov): Think about better implementation for the fallback
// index. Possibly make it a hash map and maybe merge it with the N-gram
// index, since we can treat any sequence of characters shorter than N as an
// N-gram with zero bytes used for padding.
fallback_rules : [UrlRule];
}
// The top-level data structure used to store URL rules. Blacklist and
// whitelist rules are kept in two separate indices.
table IndexedRuleset {
// The index of all blacklist URL rules.
blacklist_index : UrlPatternIndex;
// The index of all whitelist URL rules.
whitelist_index : UrlPatternIndex;
}
// IndexedRuleset is the root table of a serialized ruleset buffer.
root_type IndexedRuleset;
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import("//third_party/protobuf/proto_library.gni")
# GN counterpart of the GYP target
# 'subresource_filter_core_common_ruleset_proto'.
proto_library("proto") {
sources = [
# Note: sources list duplicated in GYP build.
"rules.proto",
]
}
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package subresource_filter.proto;
option java_package = "org.chromium.components.subresource_filter.proto";
// The type of a subresource filtering rule. RULE_TYPE_URL and RULE_TYPE_CSS
// are represented by the UrlRule and CssRule messages respectively;
// RULE_TYPE_COMMENT presumably corresponds to the Comment message.
enum RuleType {
RULE_TYPE_UNSPECIFIED = 0;
RULE_TYPE_COMMENT = 1; // Comment rule.
RULE_TYPE_URL = 2; // Network level filtering rule based on URL pattern.
RULE_TYPE_CSS = 3; // Element hiding rule based on a CSS selector.
};
// The format of a URL pattern.
// Note: subresource_filter::flat::UrlPatternType mirrors this enum but has no
// UNSPECIFIED value, so the numeric values of the two enums differ.
enum UrlPatternType {
URL_PATTERN_TYPE_UNSPECIFIED = 0;
// A pattern without special characters, e.g. "example.com".
URL_PATTERN_TYPE_SUBSTRING = 1;
// The pattern contains one or more wildcards, namely '*' and/or '^'
// characters. The '*' matches any sequence of characters, while the '^'
// matches a separator, i.e. anything but a letter, a digit, or one of [-._%].
URL_PATTERN_TYPE_WILDCARDED = 2;
// The pattern is a regular expression.
URL_PATTERN_TYPE_REGEXP = 3;
};
// Types of anchors that can be used to constrain where a URL pattern must
// begin/end in the URL in order to be considered a match.
// Note: subresource_filter::flat::AnchorType mirrors this enum but has no
// UNSPECIFIED value, so the numeric values of the two enums differ.
enum AnchorType {
ANCHOR_TYPE_UNSPECIFIED = 0;
// Acts like a '*' wildcard at the respective end of a pattern.
ANCHOR_TYPE_NONE = 1;
// The pattern must match from the start/until the end of the URL.
ANCHOR_TYPE_BOUNDARY = 2;
// The pattern must match starting with the TLD+n of the URL's domain, but the
// scheme and subdomains (if any) can be arbitrary.
ANCHOR_TYPE_SUBDOMAIN = 3;
};
// The types of subresource requests that a URL rule should be applied to.
// Note: Values are used as flags in a bitmask.
enum ElementType {
// Aliasing is required because ELEMENT_TYPE_MAX shares its value with the
// largest flag (currently ELEMENT_TYPE_POPUP).
option allow_alias = true;
ELEMENT_TYPE_UNSPECIFIED = 0;
ELEMENT_TYPE_OTHER = 1;
ELEMENT_TYPE_SCRIPT = 2;
ELEMENT_TYPE_IMAGE = 4;
ELEMENT_TYPE_STYLESHEET = 8;
ELEMENT_TYPE_OBJECT = 16;
ELEMENT_TYPE_XMLHTTPREQUEST = 32;
ELEMENT_TYPE_OBJECT_SUBREQUEST = 64;
ELEMENT_TYPE_SUBDOCUMENT = 128;
ELEMENT_TYPE_PING = 256;
ELEMENT_TYPE_MEDIA = 512;
ELEMENT_TYPE_FONT = 1024;
ELEMENT_TYPE_POPUP = 2048;
// NOTE: Keep these two values consistent with the values above.
// ELEMENT_TYPE_MAX is the largest single flag; ELEMENT_TYPE_ALL is the
// bitwise OR of all the flags (2 * ELEMENT_TYPE_MAX - 1).
ELEMENT_TYPE_MAX = 2048;
ELEMENT_TYPE_ALL = 4095;
};
// The options controlling whether or not to activate filtering for subresources
// of documents that match the URL pattern of the rule.
// Note: Values are used as flags in a bitmask.
enum ActivationType {
// Aliasing is required because ACTIVATION_TYPE_MAX shares its value with the
// largest flag (currently ACTIVATION_TYPE_GENERICBLOCK).
option allow_alias = true;
ACTIVATION_TYPE_UNSPECIFIED = 0;
ACTIVATION_TYPE_DOCUMENT = 1; // Disable all rules on the page.
ACTIVATION_TYPE_ELEMHIDE = 2; // Disable CSS rules on the page.
ACTIVATION_TYPE_GENERICHIDE = 4; // Disable generic CSS rules on the page.
ACTIVATION_TYPE_GENERICBLOCK = 8; // Disable generic URL rules on the page.
// NOTE: Keep these two values consistent with the values above.
// ACTIVATION_TYPE_MAX is the largest single flag; ACTIVATION_TYPE_ALL is the
// bitwise OR of all the flags (2 * ACTIVATION_TYPE_MAX - 1).
ACTIVATION_TYPE_MAX = 8;
ACTIVATION_TYPE_ALL = 15;
};
// The semantics of a rule. Defines how the rule relates to other rules and how
// it influences the filtering decision.
// NOTE(review): the flat representation appears to encode this as the
// flat::OptionFlag IS_WHITELIST bit -- confirm against the indexing code.
enum RuleSemantics {
RULE_SEMANTICS_UNSPECIFIED = 0;
// Matching subresource requests should be aborted, matching elements should
// be hidden.
RULE_SEMANTICS_BLACKLIST = 1;
// If the rule matches, it suppresses any matching RULE_SEMANTICS_BLACKLIST
// rule.
RULE_SEMANTICS_WHITELIST = 2;
}
// The type of relation between the source of the requested subresource and that
// of the document.
enum SourceType {
SOURCE_TYPE_UNSPECIFIED = 0;
SOURCE_TYPE_ANY = 1; // Doesn't matter.
SOURCE_TYPE_THIRD_PARTY = 2; // Requesting a third-party resource.
SOURCE_TYPE_FIRST_PARTY = 3; // Requesting a first-party resource.
}
// An item of the domain list.
// Note: in the flat representation (flat::UrlRule::domains) an item is a single
// string, and exclusion is expressed by a leading '~' instead of a boolean.
message DomainListItem {
// The UTF-8 representation of the domain, e.g. "subdomain.example.com".
optional string domain = 1;
// Defines whether the domain is excluded from the set of domains.
optional bool exclude = 2;
}
// A network level filtering rule based on a URL pattern. Corresponds to
// RULE_TYPE_URL.
message UrlRule {
// The semantics of the rule.
optional RuleSemantics semantics = 1;
// Restricts application of the rule to first-party/third-party requests.
optional SourceType source_type = 2;
// Stores the ElementTypes that the rule applies to as a bitwise OR of the
// corresponding ElementType values.
optional int32 element_types = 3;
// Stores the ActivationTypes associated with the rule as a bitwise OR of the
// corresponding ActivationType values.
optional int32 activation_types = 4;
// The list of domains to be included/excluded from the filter's affected set.
// The rule applies only to subresources of documents loaded from included
// domains (or subdomains thereof). If the list is empty, the rule is applied
// on documents from all domains.
// If |domains| is empty or has exceptions only, the rule is called generic.
// Otherwise it is called domain specific, i.e. it applies to a limited number
// of domains.
repeated DomainListItem domains = 5;
// The format of |url_pattern|.
optional UrlPatternType url_pattern_type = 6;
// Defines where the URL pattern must start in the URL in order to be
// considered a match. Never used with REGEXP patterns.
optional AnchorType anchor_left = 7;
// Defines where the URL pattern must end in the URL in order to be
// considered a match. Never used with REGEXP patterns. Never equal to
// ANCHOR_TYPE_SUBDOMAIN.
optional AnchorType anchor_right = 8;
// When set, the rule applies only to URLs that match |url_pattern| in a
// case-sensitive way.
optional bool match_case = 9;
// The URL pattern of the format prescribed by |url_pattern_type|.
optional string url_pattern = 10;
}
// Element hiding rule based on a CSS selector. Corresponds to RULE_TYPE_CSS.
// Unlike UrlRule, it carries no source type, element type or activation type
// constraints; only the domain list restricts where it applies.
message CssRule {
// The semantics of the rule.
optional RuleSemantics semantics = 1;
// The list of domains, same as UrlRule::domains.
repeated DomainListItem domains = 2;
// A CSS selector as specified in http://www.w3.org/TR/css3-selectors.
optional string css_selector = 3;
}
// A comment line.
message Comment {
// Comment text.
optional string text = 1;
// For special key-value comments, if any.
// NOTE(review): the schema does not say whether |text| is also populated
// when |key| and |value| are set -- confirm with the code producing these.
optional string key = 2;
optional string value = 3;
}
// A container for lists of non-comment rules collated by RuleType. Comment
// messages have no field here.
message FilteringRules {
// Rules of type RULE_TYPE_URL.
repeated UrlRule url_rules = 1;
// Rules of type RULE_TYPE_CSS.
repeated CssRule css_rules = 2;
}
......@@ -7,6 +7,7 @@
{
'target_name': 'compiler_files',
'type': 'static_library',
'toolsets': ['host'],
'include_dirs': [
'src/include',
],
......@@ -29,7 +30,8 @@
},
{
'target_name': 'flatbuffers',
'type': 'static_library',
'type': 'none',
'toolsets': ['host', 'target'],
'include_dirs': [
'src/include',
],
......@@ -46,6 +48,7 @@
{
'target_name': 'flatc',
'type': 'executable',
'toolsets': ['host'],
'dependencies': [
'compiler_files',
'flatbuffers',
......@@ -80,6 +83,7 @@
# cd third_party/flatbuffers/ && ../../out/Debug/flatbuffers_unittest
'target_name': 'flatbuffers_unittest',
'type': 'executable',
'toolsets': ['host'],
'dependencies': [
'compiler_files',
'flatbuffers'
......
......@@ -47,7 +47,7 @@
}
],
'dependencies': [
'<(DEPTH)/third_party/flatbuffers/flatbuffers.gyp:flatc',
'<(DEPTH)/third_party/flatbuffers/flatbuffers.gyp:flatc#host',
],
'include_dirs': [
'<(SHARED_INTERMEDIATE_DIR)/flatc_out',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment