Subresource Filter: Disallow indexing rules with non-ascii characters.

While comparing a rule against a url, url_pattern_index assumes that the provided pattern and domains are in canonical form (internationalized domains in punycode and non-ascii characters in percent-escaped utf8). Document this as part of url_pattern_index and disallow indexing rules with non-ascii patterns or domains. BUG=879778 Change-Id: I725c37cd837c862ed3f0024c132ce2acf9c79642 Reviewed-on: https://chromium-review.googlesource.com/1198490 Commit-Queue: Karan Bhatia <karandeepb@chromium.org> Reviewed-by: Charlie Harrison <csharrison@chromium.org> Cr-Commit-Position: refs/heads/master@{#589020}

Subresource Filter: Disallow indexing rules with non-ascii characters.
While comparing a rule against a url, url_pattern_index assumes that the provided pattern and domains are in canonical form (internationalized domains in punycode and non-ascii characters in percent-escaped utf8). Document this as part of url_pattern_index and disallow indexing rules with non-ascii patterns or domains. BUG=879778 Change-Id: I725c37cd837c862ed3f0024c132ce2acf9c79642 Reviewed-on: https://chromium-review.googlesource.com/1198490 Commit-Queue: Karan Bhatia <karandeepb@chromium.org> Reviewed-by: Charlie Harrison <csharrison@chromium.org> Cr-Commit-Position: refs/heads/master@{#589020}
54dfcd96 · Karan Bhatia · Commit Bot · 45e35ec4 · 54dfcd96 · 54dfcd96
Commit 54dfcd96 authored Sep 05, 2018 by Karan Bhatia Committed by Commit Bot Sep 05, 2018
3 changed files
--- a/components/subresource_filter/core/common/indexed_ruleset_unittest.cc
+++ b/components/subresource_filter/core/common/indexed_ruleset_unittest.cc
@@ -9,6 +9,7 @@
 #include "base/logging.h"
 #include "base/macros.h"
 #include "base/strings/string_piece.h"
+#include "base/strings/utf_string_conversions.h"
 #include "components/subresource_filter/core/common/first_party_origin.h"
 #include "components/url_pattern_index/proto/rules.pb.h"
 #include "components/url_pattern_index/url_pattern.h"
@@ -132,6 +133,85 @@ TEST_F(SubresourceFilterIndexedRulesetTest, SimpleWhitelist) {
  EXPECT_TRUE(ShouldAllow("https://example.com?filter_out=true"));
 }

+// Ensure patterns containing non-ascii characters are disallowed.
+TEST_F(SubresourceFilterIndexedRulesetTest, NonAsciiPatterns) {
+  // non-ascii character é.
+  std::string non_ascii = base::WideToUTF8(L"\u00E9");
+  ASSERT_FALSE(AddSimpleRule(non_ascii));
+  Finish();
+
+  EXPECT_TRUE(ShouldAllow("https://example.com/q=" + non_ascii));
+}
+
+// Ensure that specifying non-ascii characters in percent encoded form in
+// patterns works.
+TEST_F(SubresourceFilterIndexedRulesetTest, PercentEncodedPatterns) {
+  // Percent encoded form of é.
+  ASSERT_TRUE(AddSimpleRule("%C3%A9"));
+  Finish();
+
+  EXPECT_FALSE(
+      ShouldAllow("https://example.com/q=" + base::WideToUTF8(L"\u00E9")));
+}
+
+// Ensures that specifying patterns in punycode works for matching IDN domains.
+TEST_F(SubresourceFilterIndexedRulesetTest, IDNHosts) {
+  // ҏӊԟҭв.com
+  const std::string punycode = "xn--b1a9p8c1e8r.com";
+  ASSERT_TRUE(AddSimpleRule(punycode));
+  Finish();
+
+  EXPECT_FALSE(ShouldAllow("https://" + punycode));
+  EXPECT_FALSE(ShouldAllow(
+      base::WideToUTF8(L"https://\x048f\x04ca\x051f\x04ad\x0432.com")));
+}
+
+// Ensure patterns containing non-ascii domains are disallowed.
+TEST_F(SubresourceFilterIndexedRulesetTest, NonAsciiDomain) {
+  const char* kUrl = "http://example.com";
+
+  // ґғ.com
+  std::string non_ascii_domain = base::WideToUTF8(L"\x0491\x0493.com");
+
+  auto rule = MakeUrlRule(UrlPattern(kUrl, testing::kSubstring));
+  testing::AddDomains({non_ascii_domain}, &rule);
+  ASSERT_FALSE(AddUrlRule(rule));
+
+  rule = MakeUrlRule(UrlPattern(kUrl, testing::kSubstring));
+  std::string non_ascii_excluded_domain = "~" + non_ascii_domain;
+  testing::AddDomains({non_ascii_excluded_domain}, &rule);
+  ASSERT_FALSE(AddUrlRule(rule));
+
+  Finish();
+}
+
+// Ensure patterns with percent encoded hosts match correctly.
+TEST_F(SubresourceFilterIndexedRulesetTest, PercentEncodedHostPattern) {
+  const char* kPercentEncodedHost = "http://%2C.com/";
+  ASSERT_TRUE(AddSimpleRule(kPercentEncodedHost));
+  Finish();
+
+  EXPECT_FALSE(ShouldAllow("http://,.com/"));
+  EXPECT_FALSE(ShouldAllow(kPercentEncodedHost));
+}
+
+// Verifies the behavior for rules having percent encoded domains.
+TEST_F(SubresourceFilterIndexedRulesetTest, PercentEncodedDomain) {
+  const char* kUrl = "http://example.com";
+  std::string percent_encoded_host = "%2C.com";
+
+  auto rule = MakeUrlRule(UrlPattern(kUrl, testing::kSubstring));
+  testing::AddDomains({percent_encoded_host}, &rule);
+  ASSERT_TRUE(AddUrlRule(rule));
+  Finish();
+
+  // Note: This should actually fail. However url_pattern_index lower cases all
+  // domains. Hence it doesn't correctly deal with domains having escape
+  // characters which are percent-encoded in upper case by Chrome's url parser.
+  EXPECT_TRUE(ShouldAllow(kUrl, "http://" + percent_encoded_host));
+  EXPECT_TRUE(ShouldAllow(kUrl, "http://,.com"));
+}
+
 TEST_F(SubresourceFilterIndexedRulesetTest, SimpleBlacklistAndWhitelist) {
  ASSERT_TRUE(AddSimpleRule("?filter="));
  ASSERT_TRUE(AddSimpleWhitelistRule("whitelisted.com/?filter="));

--- a/components/url_pattern_index/flat/url_pattern_index.fbs
+++ b/components/url_pattern_index/flat/url_pattern_index.fbs
@@ -87,11 +87,15 @@ table UrlRule {
  // The list of domains to be included/excluded from the filter's affected set.
  // Should either be null or have at least a single element. The domains
  // should be in lower-case and kept sorted as defined by
-  // url_pattern_index::CompareDomains.
+  // url_pattern_index::CompareDomains. The entries must consist of only ascii
+  // characters. Use punycode encoding for internationalized domains.
  domains_included : [string];
  domains_excluded : [string];

-  // A URL pattern in the format defined by |url_pattern_type|.
+  // A URL pattern in the format defined by |url_pattern_type|.  This should
+  // only consist of ascii characters, since it's matched against a url where
+  // the host is encoded in the punycode format (in case of internationalized
+  // domains) and any other non-ascii characters are percent-escaped in utf-8.
  url_pattern : string;

  // An id which uniquely identifies the rule. Clients must ensure uniqueness if

--- a/components/url_pattern_index/url_pattern_index.cc
+++ b/components/url_pattern_index/url_pattern_index.cc
@@ -133,16 +133,16 @@ class UrlRuleFlatBufferConverter {
                      IsMeaningful();
  }

-  // Returns whether the |rule| can be converted to its FlatBuffers equivalent.
-  // The conversion is not possible if the rule has attributes not supported by
-  // this client version.
-  bool is_convertible() const { return is_convertible_; }
-
  // Writes the URL |rule| to the FlatBuffer using the |builder|, and returns
-  // the offset to the serialized rule.
+  // the offset to the serialized rule. Returns an empty offset in case the rule
+  // can't be converted. The conversion is not possible if the rule has
+  // attributes not supported by this client version.
  UrlRuleOffset SerializeConvertedRule(
      flatbuffers::FlatBufferBuilder* builder) const {
-    DCHECK(is_convertible());
+    if (!is_convertible_)
+      return UrlRuleOffset();
+
+    DCHECK_NE(rule_.url_pattern_type(), proto::URL_PATTERN_TYPE_REGEXP);

    FlatDomainsOffset domains_included_offset;
    FlatDomainsOffset domains_excluded_offset;
@@ -155,12 +155,15 @@ class UrlRuleFlatBufferConverter {
      domains_included.reserve(rule_.domains_size());

      for (const auto& domain_list_item : rule_.domains()) {
-        // Note: The |domain| can have non-ASCII UTF-8 characters, but
-        // ToLowerASCII leaves these intact.
-        // TODO(pkalinnikov): Convert non-ASCII characters to lower case too.
-        // TODO(pkalinnikov): Possibly convert Punycode to IDN here or directly
-        // assume this is done in the proto::UrlRule.
        const std::string& domain = domain_list_item.domain();
+
+        // Non-ascii characters in domains are unsupported.
+        if (!base::IsStringASCII(domain))
+          return UrlRuleOffset();
+
+        // Note: This is not always correct. Chrome's URL parser uses upper-case
+        // for percent encoded hosts. E.g. https://,.com is encoded as
+        // https://%2C.com.
        auto offset = builder->CreateSharedString(
            HasNoUpperAscii(domain) ? domain : base::ToLowerASCII(domain));

@@ -190,6 +193,10 @@ class UrlRuleFlatBufferConverter {
      }
    }

+    // Non-ascii characters in patterns are unsupported.
+    if (!base::IsStringASCII(rule_.url_pattern()))
+      return UrlRuleOffset();
+
    auto url_pattern_offset = builder->CreateString(rule_.url_pattern());

    return flat::CreateUrlRule(
@@ -349,9 +356,6 @@ UrlRuleOffset SerializeUrlRule(const proto::UrlRule& rule,
                               flatbuffers::FlatBufferBuilder* builder) {
  DCHECK(builder);
  UrlRuleFlatBufferConverter converter(rule);
-  if (!converter.is_convertible())
-    return UrlRuleOffset();
-  DCHECK_NE(rule.url_pattern_type(), proto::URL_PATTERN_TYPE_REGEXP);
  return converter.SerializeConvertedRule(builder);
 }

@@ -376,6 +380,20 @@ void UrlPatternIndexBuilder::IndexUrlRule(UrlRuleOffset offset) {

  const auto* rule = flatbuffers::GetTemporaryPointer(*flat_builder_, offset);
  DCHECK(rule);
+
+// Sanity check that the rule does not have fields with non-ascii characters.
+#if DCHECK_IS_ON()
+  DCHECK(base::IsStringASCII(ToStringPiece(rule->url_pattern())));
+  if (rule->domains_included()) {
+    for (auto* domain : *rule->domains_included())
+      DCHECK(base::IsStringASCII(ToStringPiece(domain)));
+  }
+  if (rule->domains_excluded()) {
+    for (auto* domain : *rule->domains_excluded())
+      DCHECK(base::IsStringASCII(ToStringPiece(domain)));
+  }
+#endif
+
  NGram ngram = GetMostDistinctiveNGram(ToStringPiece(rule->url_pattern()));

  if (ngram) {