Commit 54dfcd96 authored by Karan Bhatia's avatar Karan Bhatia Committed by Commit Bot

Subresource Filter: Disallow indexing rules with non-ascii characters.

While comparing a rule against a url, url_pattern_index assumes that the
provided pattern and domains are in canonical form (internationalized domains in
punycode and non-ascii characters in percent-escaped utf8). Document this as
part of url_pattern_index and disallow indexing rules with non-ascii patterns or
domains.

BUG=879778

Change-Id: I725c37cd837c862ed3f0024c132ce2acf9c79642
Reviewed-on: https://chromium-review.googlesource.com/1198490
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: default avatarCharlie Harrison <csharrison@chromium.org>
Cr-Commit-Position: refs/heads/master@{#589020}
parent 45e35ec4
......@@ -9,6 +9,7 @@
#include "base/logging.h"
#include "base/macros.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "components/subresource_filter/core/common/first_party_origin.h"
#include "components/url_pattern_index/proto/rules.pb.h"
#include "components/url_pattern_index/url_pattern.h"
......@@ -132,6 +133,85 @@ TEST_F(SubresourceFilterIndexedRulesetTest, SimpleWhitelist) {
EXPECT_TRUE(ShouldAllow("https://example.com?filter_out=true"));
}
// Ensure patterns containing non-ascii characters are disallowed.
TEST_F(SubresourceFilterIndexedRulesetTest, NonAsciiPatterns) {
// non-ascii character é.
std::string non_ascii = base::WideToUTF8(L"\u00E9");
ASSERT_FALSE(AddSimpleRule(non_ascii));
Finish();
EXPECT_TRUE(ShouldAllow("https://example.com/q=" + non_ascii));
}
// Ensure that specifying non-ascii characters in percent encoded form in
// patterns works.
TEST_F(SubresourceFilterIndexedRulesetTest, PercentEncodedPatterns) {
// Percent encoded form of é.
ASSERT_TRUE(AddSimpleRule("%C3%A9"));
Finish();
EXPECT_FALSE(
ShouldAllow("https://example.com/q=" + base::WideToUTF8(L"\u00E9")));
}
// Ensures that specifying patterns in punycode works for matching IDN domains.
TEST_F(SubresourceFilterIndexedRulesetTest, IDNHosts) {
// ҏӊԟҭв.com
const std::string punycode = "xn--b1a9p8c1e8r.com";
ASSERT_TRUE(AddSimpleRule(punycode));
Finish();
EXPECT_FALSE(ShouldAllow("https://" + punycode));
EXPECT_FALSE(ShouldAllow(
base::WideToUTF8(L"https://\x048f\x04ca\x051f\x04ad\x0432.com")));
}
// Ensure patterns containing non-ascii domains are disallowed.
TEST_F(SubresourceFilterIndexedRulesetTest, NonAsciiDomain) {
const char* kUrl = "http://example.com";
// ґғ.com
std::string non_ascii_domain = base::WideToUTF8(L"\x0491\x0493.com");
auto rule = MakeUrlRule(UrlPattern(kUrl, testing::kSubstring));
testing::AddDomains({non_ascii_domain}, &rule);
ASSERT_FALSE(AddUrlRule(rule));
rule = MakeUrlRule(UrlPattern(kUrl, testing::kSubstring));
std::string non_ascii_excluded_domain = "~" + non_ascii_domain;
testing::AddDomains({non_ascii_excluded_domain}, &rule);
ASSERT_FALSE(AddUrlRule(rule));
Finish();
}
// Ensure patterns with percent encoded hosts match correctly.
TEST_F(SubresourceFilterIndexedRulesetTest, PercentEncodedHostPattern) {
const char* kPercentEncodedHost = "http://%2C.com/";
ASSERT_TRUE(AddSimpleRule(kPercentEncodedHost));
Finish();
EXPECT_FALSE(ShouldAllow("http://,.com/"));
EXPECT_FALSE(ShouldAllow(kPercentEncodedHost));
}
// Verifies the behavior for rules having percent encoded domains.
TEST_F(SubresourceFilterIndexedRulesetTest, PercentEncodedDomain) {
const char* kUrl = "http://example.com";
std::string percent_encoded_host = "%2C.com";
auto rule = MakeUrlRule(UrlPattern(kUrl, testing::kSubstring));
testing::AddDomains({percent_encoded_host}, &rule);
ASSERT_TRUE(AddUrlRule(rule));
Finish();
// Note: This should actually fail. However url_pattern_index lower cases all
// domains. Hence it doesn't correctly deal with domains having escape
// characters which are percent-encoded in upper case by Chrome's url parser.
EXPECT_TRUE(ShouldAllow(kUrl, "http://" + percent_encoded_host));
EXPECT_TRUE(ShouldAllow(kUrl, "http://,.com"));
}
TEST_F(SubresourceFilterIndexedRulesetTest, SimpleBlacklistAndWhitelist) {
ASSERT_TRUE(AddSimpleRule("?filter="));
ASSERT_TRUE(AddSimpleWhitelistRule("whitelisted.com/?filter="));
......
......@@ -87,11 +87,15 @@ table UrlRule {
// The list of domains to be included/excluded from the filter's affected set.
// Should either be null or have at least a single element. The domains
// should be in lower-case and kept sorted as defined by
// url_pattern_index::CompareDomains.
// url_pattern_index::CompareDomains. The entries must consist of only ascii
// characters. Use punycode encoding for internationalized domains.
domains_included : [string];
domains_excluded : [string];
// A URL pattern in the format defined by |url_pattern_type|.
// A URL pattern in the format defined by |url_pattern_type|. This should
// only consist of ascii characters, since it's matched against a url where
// the host is encoded in the punycode format (in case of internationalized
// domains) and any other non-ascii characters are percent-escaped in utf-8.
url_pattern : string;
// An id which uniquely identifies the rule. Clients must ensure uniqueness if
......
......@@ -133,16 +133,16 @@ class UrlRuleFlatBufferConverter {
IsMeaningful();
}
// Returns whether the |rule| can be converted to its FlatBuffers equivalent.
// The conversion is not possible if the rule has attributes not supported by
// this client version.
bool is_convertible() const { return is_convertible_; }
// Writes the URL |rule| to the FlatBuffer using the |builder|, and returns
// the offset to the serialized rule.
// the offset to the serialized rule. Returns an empty offset in case the rule
// can't be converted. The conversion is not possible if the rule has
// attributes not supported by this client version.
UrlRuleOffset SerializeConvertedRule(
flatbuffers::FlatBufferBuilder* builder) const {
DCHECK(is_convertible());
if (!is_convertible_)
return UrlRuleOffset();
DCHECK_NE(rule_.url_pattern_type(), proto::URL_PATTERN_TYPE_REGEXP);
FlatDomainsOffset domains_included_offset;
FlatDomainsOffset domains_excluded_offset;
......@@ -155,12 +155,15 @@ class UrlRuleFlatBufferConverter {
domains_included.reserve(rule_.domains_size());
for (const auto& domain_list_item : rule_.domains()) {
// Note: The |domain| can have non-ASCII UTF-8 characters, but
// ToLowerASCII leaves these intact.
// TODO(pkalinnikov): Convert non-ASCII characters to lower case too.
// TODO(pkalinnikov): Possibly convert Punycode to IDN here or directly
// assume this is done in the proto::UrlRule.
const std::string& domain = domain_list_item.domain();
// Non-ascii characters in domains are unsupported.
if (!base::IsStringASCII(domain))
return UrlRuleOffset();
// Note: This is not always correct. Chrome's URL parser uses upper-case
// for percent encoded hosts. E.g. https://,.com is encoded as
// https://%2C.com.
auto offset = builder->CreateSharedString(
HasNoUpperAscii(domain) ? domain : base::ToLowerASCII(domain));
......@@ -190,6 +193,10 @@ class UrlRuleFlatBufferConverter {
}
}
// Non-ascii characters in patterns are unsupported.
if (!base::IsStringASCII(rule_.url_pattern()))
return UrlRuleOffset();
auto url_pattern_offset = builder->CreateString(rule_.url_pattern());
return flat::CreateUrlRule(
......@@ -349,9 +356,6 @@ UrlRuleOffset SerializeUrlRule(const proto::UrlRule& rule,
flatbuffers::FlatBufferBuilder* builder) {
DCHECK(builder);
UrlRuleFlatBufferConverter converter(rule);
if (!converter.is_convertible())
return UrlRuleOffset();
DCHECK_NE(rule.url_pattern_type(), proto::URL_PATTERN_TYPE_REGEXP);
return converter.SerializeConvertedRule(builder);
}
......@@ -376,6 +380,20 @@ void UrlPatternIndexBuilder::IndexUrlRule(UrlRuleOffset offset) {
const auto* rule = flatbuffers::GetTemporaryPointer(*flat_builder_, offset);
DCHECK(rule);
// Sanity check that the rule does not have fields with non-ascii characters.
#if DCHECK_IS_ON()
DCHECK(base::IsStringASCII(ToStringPiece(rule->url_pattern())));
if (rule->domains_included()) {
for (auto* domain : *rule->domains_included())
DCHECK(base::IsStringASCII(ToStringPiece(domain)));
}
if (rule->domains_excluded()) {
for (auto* domain : *rule->domains_excluded())
DCHECK(base::IsStringASCII(ToStringPiece(domain)));
}
#endif
NGram ngram = GetMostDistinctiveNGram(ToStringPiece(rule->url_pattern()));
if (ngram) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment