Commit a9c4e1d4 authored by Karan Bhatia's avatar Karan Bhatia Committed by Commit Bot

UrlPatternIndex: Implement case-insensitive matching.

This fixes an existing TODO in url_pattern_index to implement case-insensitive
matching. To do so:
  - Lower case the stored n grams. These n grams are used to find prospective
    matches for a particular url.
  - Modify URLPattern to handle case-insensitive matching. This involves
    lower-casing the url pattern and the url, before comparing them.

BUG=767605

Cq-Include-Trybots: master.tryserver.chromium.perf:obbs_fyi
Change-Id: I1377c216d61392c508280141d0d947a47a3ee494
Reviewed-on: https://chromium-review.googlesource.com/1212183Reviewed-by: default avatarCharlie Harrison <csharrison@chromium.org>
Reviewed-by: default avatarNed Nguyen <nednguyen@google.com>
Reviewed-by: default avatarIstiaque Ahmed <lazyboy@chromium.org>
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#590118}
parent a9970a07
......@@ -604,10 +604,8 @@ IN_PROC_BROWSER_TEST_P(DeclarativeNetRequestBrowserTest,
}
// Tests case sensitive url filters.
// TODO(crbug.com/767605): Enable once case-insensitive matching is implemented
// in url_pattern_index.
IN_PROC_BROWSER_TEST_P(DeclarativeNetRequestBrowserTest,
DISABLED_BlockRequests_IsUrlFilterCaseSensitive) {
BlockRequests_IsUrlFilterCaseSensitive) {
struct {
std::string url_filter;
size_t id;
......@@ -622,6 +620,7 @@ IN_PROC_BROWSER_TEST_P(DeclarativeNetRequestBrowserTest,
rule.id = rule_data.id;
rule.condition->is_url_filter_case_sensitive =
rule_data.is_url_filter_case_sensitive;
rule.condition->resource_types = std::vector<std::string>({"main_frame"});
rules.push_back(rule);
}
ASSERT_NO_FATAL_FAILURE(LoadExtensionWithRules(rules));
......
......@@ -55,12 +55,12 @@ VerifyStatus GetVerifyStatus(const uint8_t* buffer,
// static
// Keep this in sync with the version number in
// tools/perf/core/default_local_state.json.
const int RulesetIndexer::kIndexedFormatVersion = 20;
const int RulesetIndexer::kIndexedFormatVersion = 21;
// This static assert is meant to catch cases where
// url_pattern_index::kUrlPatternIndexFormatVersion is incremented without
// updating RulesetIndexer::kIndexedFormatVersion.
static_assert(url_pattern_index::kUrlPatternIndexFormatVersion == 1,
static_assert(url_pattern_index::kUrlPatternIndexFormatVersion == 2,
"kUrlPatternIndexFormatVersion has changed, make sure you've "
"also updated RulesetIndexer::kIndexedFormatVersion above.");
......
......@@ -109,8 +109,9 @@ table UrlRule {
// Contains an N-gram (acting as a key in a hash table) and a list of URL rules
// associated with that N-gram.
table NGramToRules {
// A string consisting of N (up to 8) non-special characters, which are stored
// in the lowest N non-zero bytes, lower bytes corresponding to later symbols.
// A string consisting of N (up to 8) ascii-only non-special characters, which
// are stored in the lowest N non-zero bytes, lower bytes corresponding to
// later symbols. These are lower-cased to support case-insensitive matching.
ngram : ulong;
// The list of rules containing |ngram| as a substring of their URL pattern.
......
......@@ -17,10 +17,12 @@
#include <stddef.h>
#include <algorithm>
#include <ostream>
#include "base/logging.h"
#include "base/numerics/checked_math.h"
#include "base/strings/string_util.h"
#include "components/url_pattern_index/flat/url_pattern_index_generated.h"
#include "components/url_pattern_index/fuzzy_pattern_matching.h"
#include "components/url_pattern_index/string_splitter.h"
......@@ -67,6 +69,10 @@ base::StringPiece ConvertString(const flatbuffers::String* string) {
: base::StringPiece();
}
bool HasAnyUpperAscii(base::StringPiece string) {
return std::any_of(string.begin(), string.end(), base::IsAsciiUpper<char>);
}
// Returns whether |position| within the |url| belongs to its |host| component
// and corresponds to the beginning of a (sub-)domain.
inline bool IsSubdomainAnchored(base::StringPiece url,
......@@ -138,95 +144,70 @@ size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
return base::StringPiece::npos;
}
} // namespace
UrlPattern::UrlPattern() = default;
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::UrlPatternType type)
: type_(type), url_pattern_(url_pattern) {}
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::AnchorType anchor_left,
proto::AnchorType anchor_right)
: type_(proto::URL_PATTERN_TYPE_WILDCARDED),
url_pattern_(url_pattern),
anchor_left_(anchor_left),
anchor_right_(anchor_right) {}
// Returns whether the given |url_pattern| matches the given |url_spec|.
// Compares the pattern the the url in a case-sensitive manner.
bool IsCaseSensitiveMatch(base::StringPiece url_pattern,
proto::AnchorType anchor_left,
proto::AnchorType anchor_right,
base::StringPiece url_spec,
url::Component url_host) {
DCHECK(!url_spec.empty());
UrlPattern::UrlPattern(const flat::UrlRule& rule)
: type_(ConvertUrlPatternType(rule.url_pattern_type())),
url_pattern_(ConvertString(rule.url_pattern())),
anchor_left_(ConvertAnchorType(rule.anchor_left())),
anchor_right_(ConvertAnchorType(rule.anchor_right())),
match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
UrlPattern::~UrlPattern() = default;
bool UrlPattern::MatchesUrl(const GURL& url) const {
// Note: Empty domains are also invalid.
DCHECK(url.is_valid());
DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
StringSplitter<IsWildcard> subpatterns(url_pattern_);
StringSplitter<IsWildcard> subpatterns(url_pattern);
auto subpattern_it = subpatterns.begin();
auto subpattern_end = subpatterns.end();
if (subpattern_it == subpattern_end) {
return anchor_left_ == proto::ANCHOR_TYPE_NONE ||
anchor_right_ == proto::ANCHOR_TYPE_NONE;
return anchor_left == proto::ANCHOR_TYPE_NONE ||
anchor_right == proto::ANCHOR_TYPE_NONE;
}
const base::StringPiece spec = url.possibly_invalid_spec();
const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;
DCHECK(!spec.empty());
base::StringPiece subpattern = *subpattern_it;
++subpattern_it;
// If there is only one |subpattern|, and it has a right anchor, then simply
// check that it is a suffix of the |spec|, and the left anchor is fulfilled.
// check that it is a suffix of the |url_spec|, and the left anchor is
// fulfilled.
if (subpattern_it == subpattern_end &&
anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
if (!EndsWithFuzzy(spec, subpattern))
anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
if (!EndsWithFuzzy(url_spec, subpattern))
return false;
if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)
return spec.size() == subpattern.size();
if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
DCHECK_LE(subpattern.size(), spec.size());
return url.has_host() &&
IsSubdomainAnchored(spec, host_part,
spec.size() - subpattern.size());
if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY)
return url_spec.size() == subpattern.size();
if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
DCHECK_LE(subpattern.size(), url_spec.size());
return url_host.is_nonempty() &&
IsSubdomainAnchored(url_spec, url_host,
url_spec.size() - subpattern.size());
}
return true;
}
// Otherwise, the first |subpattern| does not have to be a suffix. But it
// still can have a left anchor. Check and handle that.
base::StringPiece text = spec;
if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {
if (!StartsWithFuzzy(spec, subpattern))
base::StringPiece text = url_spec;
if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
if (!StartsWithFuzzy(url_spec, subpattern))
return false;
if (subpattern_it == subpattern_end) {
DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);
DCHECK_EQ(anchor_right, proto::ANCHOR_TYPE_NONE);
return true;
}
text.remove_prefix(subpattern.size());
} else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
if (!url.has_host())
} else if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
if (!url_host.is_nonempty())
return false;
const size_t match_begin =
FindSubdomainAnchoredSubpattern(spec, host_part, subpattern);
FindSubdomainAnchoredSubpattern(url_spec, url_host, subpattern);
if (match_begin == base::StringPiece::npos)
return false;
if (subpattern_it == subpattern_end) {
DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);
DCHECK_EQ(anchor_right, proto::ANCHOR_TYPE_NONE);
return true;
}
text.remove_prefix(match_begin + subpattern.size());
} else {
DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);
DCHECK_EQ(anchor_left, proto::ANCHOR_TYPE_NONE);
// Get back to the initial |subpattern|, process it in the loop below.
subpattern_it = subpatterns.begin();
}
......@@ -239,7 +220,7 @@ bool UrlPattern::MatchesUrl(const GURL& url) const {
DCHECK(!subpattern.empty());
if (++subpattern_it == subpattern_end &&
anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
break;
}
......@@ -249,10 +230,62 @@ bool UrlPattern::MatchesUrl(const GURL& url) const {
text.remove_prefix(match_position + subpattern.size());
}
return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY ||
return anchor_right != proto::ANCHOR_TYPE_BOUNDARY ||
EndsWithFuzzy(text, subpattern);
}
} // namespace
UrlPattern::UrlPattern() = default;
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::UrlPatternType type,
MatchCase match_case)
: type_(type), url_pattern_(url_pattern), match_case_(match_case) {}
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::AnchorType anchor_left,
proto::AnchorType anchor_right)
: type_(proto::URL_PATTERN_TYPE_WILDCARDED),
url_pattern_(url_pattern),
anchor_left_(anchor_left),
anchor_right_(anchor_right) {}
UrlPattern::UrlPattern(const flat::UrlRule& rule)
: type_(ConvertUrlPatternType(rule.url_pattern_type())),
url_pattern_(ConvertString(rule.url_pattern())),
anchor_left_(ConvertAnchorType(rule.anchor_left())),
anchor_right_(ConvertAnchorType(rule.anchor_right())),
match_case_(rule.options() & flat::OptionFlag_IS_MATCH_CASE
? MatchCase::kTrue
: MatchCase::kFalse) {}
UrlPattern::~UrlPattern() = default;
bool UrlPattern::MatchesUrl(const GURL& url) const {
// Note: Empty domains are also invalid.
DCHECK(url.is_valid());
DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
DCHECK(base::IsStringASCII(url_pattern_));
DCHECK(base::IsStringASCII(url.possibly_invalid_spec()));
if (match_case() || (!HasAnyUpperAscii(url_pattern_) &&
!HasAnyUpperAscii(url.possibly_invalid_spec()))) {
return IsCaseSensitiveMatch(url_pattern_, anchor_left_, anchor_right_,
url.possibly_invalid_spec(),
url.parsed_for_possibly_invalid_spec().host);
}
// For case-insensitive matching, convert both pattern and url to lower case.
const std::string lower_case_url_pattern = base::ToLowerASCII(url_pattern_);
const std::string lower_case_url_spec =
base::ToLowerASCII(url.possibly_invalid_spec());
return IsCaseSensitiveMatch(lower_case_url_pattern, anchor_left_,
anchor_right_, lower_case_url_spec,
url.parsed_for_possibly_invalid_spec().host);
}
std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
switch (pattern.anchor_left()) {
case proto::ANCHOR_TYPE_SUBDOMAIN:
......
......@@ -23,11 +23,17 @@ struct UrlRule; // The FlatBuffers version of UrlRule.
// of the UrlRule that owns it, and to match it against URLs.
class UrlPattern {
public:
enum class MatchCase {
kTrue,
kFalse,
};
UrlPattern();
// Creates a |url_pattern| of a certain |type|.
// Creates a |url_pattern| of a certain |type| and case-sensitivity.
UrlPattern(base::StringPiece url_pattern,
proto::UrlPatternType type = proto::URL_PATTERN_TYPE_WILDCARDED);
proto::UrlPatternType type = proto::URL_PATTERN_TYPE_WILDCARDED,
MatchCase match_case = MatchCase::kFalse);
// Creates a WILDCARDED |url_pattern| with the specified anchors.
UrlPattern(base::StringPiece url_pattern,
......@@ -43,7 +49,7 @@ class UrlPattern {
base::StringPiece url_pattern() const { return url_pattern_; }
proto::AnchorType anchor_left() const { return anchor_left_; }
proto::AnchorType anchor_right() const { return anchor_right_; }
bool match_case() const { return match_case_; }
bool match_case() const { return match_case_ == MatchCase::kTrue; }
// Returns whether the |url| matches the URL |pattern|. Requires the type of
// |this| pattern to be either SUBSTRING or WILDCARDED.
......@@ -63,8 +69,7 @@ class UrlPattern {
proto::AnchorType anchor_left_ = proto::ANCHOR_TYPE_NONE;
proto::AnchorType anchor_right_ = proto::ANCHOR_TYPE_NONE;
// TODO(pkalinnikov): Implement case-insensitive matching.
bool match_case_ = false;
MatchCase match_case_ = MatchCase::kFalse;
DISALLOW_COPY_AND_ASSIGN(UrlPattern);
};
......
......@@ -12,6 +12,7 @@
#include "base/logging.h"
#include "base/macros.h"
#include "base/numerics/safe_conversions.h"
#include "base/optional.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "components/url_pattern_index/ngram_extractor.h"
......@@ -98,8 +99,7 @@ base::StringPiece ToStringPiece(const flatbuffers::String* string) {
}
bool HasNoUpperAscii(base::StringPiece string) {
return std::none_of(string.begin(), string.end(),
[](char c) { return base::IsAsciiUpper(c); });
return std::none_of(string.begin(), string.end(), base::IsAsciiUpper<char>);
}
// Comparator to sort UrlRule. Sorts rules by descending order of rule priority.
......@@ -450,8 +450,11 @@ NGram UrlPatternIndexBuilder::GetMostDistinctiveNGram(
size_t min_list_size = std::numeric_limits<size_t>::max();
NGram best_ngram = 0;
// To support case-insensitive matching, lower case the n-grams for |pattern|.
DCHECK(base::IsStringASCII(pattern));
const std::string lower_case_pattern = base::ToLowerASCII(pattern);
auto ngrams = CreateNGramExtractor<kNGramSize, NGram>(
pattern, [](char c) { return c == '*' || c == '^'; });
lower_case_pattern, [](char c) { return c == '*' || c == '^'; });
for (uint64_t ngram : ngrams) {
const MutableUrlRuleList* rules = ngram_index_.Get(ngram);
......@@ -654,8 +657,14 @@ const flat::UrlRule* FindMatchInFlatUrlPatternIndex(
NGramHashTableProber prober;
// |hash_table| contains lower-cased n-grams. Lower case the url if necessary
// to find possible matches.
base::Optional<std::string> lower_case_url;
if (!HasNoUpperAscii(url.spec()))
lower_case_url = base::ToLowerASCII(url.spec());
auto ngrams = CreateNGramExtractor<kNGramSize, uint64_t>(
url.spec(), [](char) { return false; });
lower_case_url ? *lower_case_url : url.spec(),
[](char) { return false; });
auto get_max_priority_rule = [](const flat::UrlRule* lhs,
const flat::UrlRule* rhs) {
......
......@@ -66,7 +66,7 @@ int CompareDomains(base::StringPiece lhs_domain, base::StringPiece rhs_domain);
// Increase this value when introducing an incompatible change to the
// UrlPatternIndex schema (flat/url_pattern_index.fbs). url_pattern_index
// clients can use this as a signal to rebuild rulesets.
constexpr int kUrlPatternIndexFormatVersion = 1;
constexpr int kUrlPatternIndexFormatVersion = 2;
// The class used to construct an index over the URL patterns of a set of URL
// rules. The rules themselves need to be converted to FlatBuffers format by the
......
......@@ -131,6 +131,20 @@ TEST_F(UrlPatternIndexTest, NoRuleApplies) {
EXPECT_FALSE(FindMatch("http://example.com?k=v&filter_not"));
}
TEST_F(UrlPatternIndexTest, CaseSensitivity) {
ASSERT_TRUE(
AddUrlRule(MakeUrlRule(UrlPattern("case-INSENsitive", kSubstring))));
proto::UrlRule rule = MakeUrlRule(UrlPattern("case-sensitive"));
rule.set_match_case(true);
ASSERT_TRUE(AddUrlRule(rule));
Finish();
EXPECT_TRUE(FindMatch("http://abc.com/type=CASE-insEnsitIVe"));
EXPECT_TRUE(FindMatch("http://abc.com/type=case-INSENSITIVE"));
EXPECT_FALSE(FindMatch("http://abc.com?type=CASE-sensitive"));
EXPECT_TRUE(FindMatch("http://abc.com?type=case-sensitive"));
}
TEST_F(UrlPatternIndexTest, OneRuleWithoutMetaInfo) {
const struct {
UrlPattern url_pattern;
......
......@@ -14,10 +14,12 @@ namespace {
constexpr proto::AnchorType kAnchorNone = proto::ANCHOR_TYPE_NONE;
constexpr proto::AnchorType kBoundary = proto::ANCHOR_TYPE_BOUNDARY;
constexpr proto::AnchorType kSubdomain = proto::ANCHOR_TYPE_SUBDOMAIN;
constexpr UrlPattern::MatchCase kMatchCase = UrlPattern::MatchCase::kTrue;
constexpr UrlPattern::MatchCase kDonotMatchCase = UrlPattern::MatchCase::kFalse;
} // namespace
TEST(SubresourceFilterUrlPatternTest, MatchesUrl) {
TEST(UrlPatternTest, MatchesUrl) {
const struct {
UrlPattern url_pattern;
const char* url;
......@@ -122,6 +124,31 @@ TEST(SubresourceFilterUrlPatternTest, MatchesUrl) {
{{"/path", kSubdomain, kBoundary}, "http://a.com./path", true},
{{"^path", kSubdomain, kBoundary}, "http://a.com./path", true},
{{"path", kSubdomain, kBoundary}, "http://a.com./path", false},
// Case-sensitivity tests.
{{"path", proto::URL_PATTERN_TYPE_SUBSTRING, kDonotMatchCase},
"http://a.com/PaTh",
true},
{{"path", proto::URL_PATTERN_TYPE_SUBSTRING, kMatchCase},
"http://a.com/PaTh",
false},
{{"path", proto::URL_PATTERN_TYPE_SUBSTRING, kDonotMatchCase},
"http://a.com/path",
true},
{{"path", proto::URL_PATTERN_TYPE_SUBSTRING, kMatchCase},
"http://a.com/path",
true},
{{"abc*def^", proto::URL_PATTERN_TYPE_WILDCARDED, kMatchCase},
"http://a.com/abcxAdef/vo",
true},
{{"abc*def^", proto::URL_PATTERN_TYPE_WILDCARDED, kMatchCase},
"http://a.com/aBcxAdeF/vo",
false},
{{"abc*def^", proto::URL_PATTERN_TYPE_WILDCARDED, kDonotMatchCase},
"http://a.com/aBcxAdeF/vo",
true},
{{"abc*def^", proto::URL_PATTERN_TYPE_WILDCARDED, kDonotMatchCase},
"http://a.com/abcxAdef/vo",
true},
};
for (const auto& test_case : kTestCases) {
......
......@@ -96,7 +96,7 @@ TEST_F(IndexedRulesetFormatVersionTest, CheckVersionUpdated) {
EXPECT_EQ(StripCommentsAndWhitespace(kFlatbufferSchemaExpected),
StripCommentsAndWhitespace(flatbuffer_schema))
<< "Schema change detected; update this test and the schema version.";
EXPECT_EQ(1, GetIndexedRulesetFormatVersionForTesting())
EXPECT_EQ(2, GetIndexedRulesetFormatVersionForTesting())
<< "Update this test if you update the schema version.";
}
......
......@@ -47,12 +47,12 @@ namespace dnr_api = extensions::api::declarative_net_request;
// url_pattern_index.fbs. Whenever an extension with an indexed ruleset format
// version different from the one currently used by Chrome is loaded, the
// extension ruleset will be reindexed.
constexpr int kIndexedRulesetFormatVersion = 1;
constexpr int kIndexedRulesetFormatVersion = 2;
// This static assert is meant to catch cases where
// url_pattern_index::kUrlPatternIndexFormatVersion is incremented without
// updating kIndexedRulesetFormatVersion.
static_assert(url_pattern_index::kUrlPatternIndexFormatVersion == 1,
static_assert(url_pattern_index::kUrlPatternIndexFormatVersion == 2,
"kUrlPatternIndexFormatVersion has changed, make sure you've "
"also updated kIndexedRulesetFormatVersion above.");
......
......@@ -2,7 +2,7 @@
"subresource_filter": {
"ruleset_version": {
"content": "1000",
"format": 20
"format": 21
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment