Commit e39bc57a authored by Karan Bhatia, committed by Commit Bot

UrlMatcher: Initialize SubstringSetMatcher with patterns.

This CL changes SubstringSetMatcher to require patterns as part of its
constructor. This:

- Simplifies the interface and the contract with the client.
  SubstringSetMatcher doesn't refer to memory owned by client anymore.

- Reduces the need for storing the pattern vector in both
  SubstringSetMatcher and its clients, thereby reducing runtime memory
  usage.

Also, modernize the code a bit, e.g.
  - Don't take references to primitives.
  - Use for-each loops when possible.

This shouldn't introduce any behavior change.

BUG=974391

Change-Id: Ie6cf1e7c08507b7eaccff39626d43062383e1383
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2038075
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#739067}
parent c70b7e09
......@@ -7,6 +7,7 @@
#include <stddef.h>
#include <memory>
#include <utility>
#include "base/logging.h"
#include "base/strings/string_util.h"
......@@ -16,11 +17,8 @@
namespace url_matcher {
RegexSetMatcher::RegexSetMatcher() {}
RegexSetMatcher::~RegexSetMatcher() {
DeleteSubstringPatterns();
}
RegexSetMatcher::RegexSetMatcher() = default;
RegexSetMatcher::~RegexSetMatcher() = default;
void RegexSetMatcher::AddPatterns(
const std::vector<const StringPattern*>& regex_list) {
......@@ -97,22 +95,15 @@ void RegexSetMatcher::RebuildMatcher() {
std::vector<std::string> strings_to_match;
filtered_re2_->Compile(&strings_to_match);
substring_matcher_.reset(new SubstringSetMatcher);
DeleteSubstringPatterns();
std::vector<url_matcher::StringPattern> substring_patterns;
substring_patterns.reserve(strings_to_match.size());
// Build SubstringSetMatcher from |strings_to_match|.
// SubstringSetMatcher doesn't own its strings.
for (size_t i = 0; i < strings_to_match.size(); ++i) {
substring_patterns_.push_back(
std::make_unique<StringPattern>(strings_to_match[i], i));
}
std::vector<const StringPattern*> patterns;
for (const auto& pattern : substring_patterns_)
patterns.push_back(pattern.get());
substring_matcher_->RegisterPatterns(patterns);
}
for (size_t i = 0; i < strings_to_match.size(); ++i)
substring_patterns.emplace_back(std::move(strings_to_match[i]), i);
void RegexSetMatcher::DeleteSubstringPatterns() {
substring_patterns_.clear();
substring_matcher_ =
std::make_unique<SubstringSetMatcher>(substring_patterns);
}
} // namespace url_matcher
......@@ -61,9 +61,6 @@ class URL_MATCHER_EXPORT RegexSetMatcher {
// apparently not supported by FilteredRE2.
void RebuildMatcher();
// Clean up StringPatterns in |substring_patterns_|.
void DeleteSubstringPatterns();
// Mapping of regex StringPattern::IDs to regexes.
RegexMap regexes_;
// Mapping of RE2IDs from FilteredRE2 (which are assigned in order)
......@@ -72,10 +69,6 @@ class URL_MATCHER_EXPORT RegexSetMatcher {
std::unique_ptr<re2::FilteredRE2> filtered_re2_;
std::unique_ptr<SubstringSetMatcher> substring_matcher_;
// The substring patterns from FilteredRE2, which are used in
// |substring_matcher_| but whose lifetime is managed here.
std::vector<std::unique_ptr<StringPattern>> substring_patterns_;
};
} // namespace url_matcher
......
......@@ -26,6 +26,8 @@ bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
// Given the set of patterns, compute how many nodes the corresponding
// Aho-Corasick tree will have. Note that |patterns| needs to be sorted.
uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) {
DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
uint32_t result = 1u; // 1 for the root node.
if (patterns.empty())
return result;
......@@ -53,61 +55,46 @@ uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) {
return result;
}
} // namespace
//
// SubstringSetMatcher
//
SubstringSetMatcher::SubstringSetMatcher() {
RebuildAhoCorasickTree(SubstringPatternVector());
}
SubstringSetMatcher::~SubstringSetMatcher() {}
std::vector<const StringPattern*> GetVectorOfPointers(
const std::vector<StringPattern>& patterns) {
std::vector<const StringPattern*> pattern_pointers;
pattern_pointers.reserve(patterns.size());
void SubstringSetMatcher::RegisterPatterns(
const std::vector<const StringPattern*>& patterns) {
RegisterAndUnregisterPatterns(patterns,
std::vector<const StringPattern*>());
}
for (const StringPattern& pattern : patterns)
pattern_pointers.push_back(&pattern);
void SubstringSetMatcher::UnregisterPatterns(
const std::vector<const StringPattern*>& patterns) {
RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(),
patterns);
return pattern_pointers;
}
void SubstringSetMatcher::RegisterAndUnregisterPatterns(
const std::vector<const StringPattern*>& to_register,
const std::vector<const StringPattern*>& to_unregister) {
// Register patterns.
for (auto i = to_register.begin(); i != to_register.end(); ++i) {
DCHECK(patterns_.find((*i)->id()) == patterns_.end());
patterns_[(*i)->id()] = *i;
}
// Unregister patterns
for (auto i = to_unregister.begin(); i != to_unregister.end(); ++i) {
patterns_.erase((*i)->id());
}
// Now we compute the total number of tree nodes needed.
SubstringPatternVector sorted_patterns;
sorted_patterns.resize(patterns_.size());
} // namespace
size_t next = 0;
for (SubstringPatternMap::const_iterator i = patterns_.begin();
i != patterns_.end();
++i, ++next) {
sorted_patterns[next] = i->second;
// Convenience constructor taking the patterns by value-vector: converts
// |patterns| into a vector of pointers via GetVectorOfPointers() and delegates
// to the pointer-based constructor, which does the actual tree construction.
SubstringSetMatcher::SubstringSetMatcher(
const std::vector<StringPattern>& patterns)
: SubstringSetMatcher(GetVectorOfPointers(patterns)) {}
SubstringSetMatcher::SubstringSetMatcher(
std::vector<const StringPattern*> patterns) {
// Ensure there are no duplicate IDs.
#if DCHECK_IS_ON()
{
std::set<int> ids;
for (const StringPattern* pattern : patterns) {
CHECK(!base::Contains(ids, pattern->id()));
ids.insert(pattern->id());
}
}
#endif
std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(sorted_patterns));
RebuildAhoCorasickTree(sorted_patterns);
// Compute the total number of tree nodes needed.
std::sort(patterns.begin(), patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(patterns));
BuildAhoCorasickTree(patterns);
DCHECK_EQ(tree_.size(), TreeSize(patterns));
is_empty_ = patterns.empty() && tree_.size() == 1u;
}
SubstringSetMatcher::~SubstringSetMatcher() = default;
bool SubstringSetMatcher::Match(const std::string& text,
std::set<StringPattern::ID>* matches) const {
const size_t old_number_of_matches = matches->size();
......@@ -116,12 +103,12 @@ bool SubstringSetMatcher::Match(const std::string& text,
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
uint32_t current_node = 0;
for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
uint32_t edge_from_current = tree_[current_node].GetEdge(*i);
for (const char c : text) {
uint32_t edge_from_current = tree_[current_node].GetEdge(c);
while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
current_node != 0) {
current_node = tree_[current_node].failure();
edge_from_current = tree_[current_node].GetEdge(*i);
edge_from_current = tree_[current_node].GetEdge(c);
}
if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
current_node = edge_from_current;
......@@ -135,29 +122,21 @@ bool SubstringSetMatcher::Match(const std::string& text,
return old_number_of_matches != matches->size();
}
bool SubstringSetMatcher::IsEmpty() const {
// An empty tree consists of only the root node.
return patterns_.empty() && tree_.size() == 1u;
}
size_t SubstringSetMatcher::EstimateMemoryUsage() const {
return base::trace_event::EstimateMemoryUsage(tree_) +
base::trace_event::EstimateMemoryUsage(patterns_);
return base::trace_event::EstimateMemoryUsage(tree_);
}
void SubstringSetMatcher::RebuildAhoCorasickTree(
const SubstringPatternVector& sorted_patterns) {
tree_.clear();
void SubstringSetMatcher::BuildAhoCorasickTree(
const SubstringPatternVector& patterns) {
DCHECK(tree_.empty());
// Initialize root note of tree.
AhoCorasickNode root;
root.set_failure(0);
tree_.push_back(root);
// Initialize root node of tree.
tree_.emplace_back();
tree_.back().set_failure(0);
// Insert all patterns.
for (auto i = sorted_patterns.begin(); i != sorted_patterns.end(); ++i) {
InsertPatternIntoAhoCorasickTree(*i);
}
// Build the initial trie for all the patterns.
for (const StringPattern* pattern : patterns)
InsertPatternIntoAhoCorasickTree(pattern);
CreateFailureEdges();
}
......@@ -193,15 +172,12 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
}
void SubstringSetMatcher::CreateFailureEdges() {
typedef AhoCorasickNode::Edges Edges;
base::queue<uint32_t> queue;
AhoCorasickNode& root = tree_[0];
root.set_failure(0);
const Edges& root_edges = root.edges();
for (auto e = root_edges.begin(); e != root_edges.end(); ++e) {
const uint32_t& leads_to = e->second;
for (const auto& edge : root.edges()) {
const uint32_t leads_to = edge.second;
tree_[leads_to].set_failure(0);
queue.push(leads_to);
}
......@@ -209,10 +185,9 @@ void SubstringSetMatcher::CreateFailureEdges() {
while (!queue.empty()) {
AhoCorasickNode& current_node = tree_[queue.front()];
queue.pop();
for (auto e = current_node.edges().begin(); e != current_node.edges().end();
++e) {
const char& edge_label = e->first;
const uint32_t& leads_to = e->second;
for (const auto& edge : current_node.edges()) {
const char edge_label = edge.first;
const uint32_t leads_to = edge.second;
queue.push(leads_to);
uint32_t failure = current_node.failure();
......@@ -237,22 +212,13 @@ const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
: failure_(kNoSuchEdge) {}
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
const SubstringSetMatcher::AhoCorasickNode& other)
: edges_(other.edges_),
failure_(other.failure_),
matches_(other.matches_) {}
SubstringSetMatcher::AhoCorasickNode&
SubstringSetMatcher::AhoCorasickNode::operator=(
const SubstringSetMatcher::AhoCorasickNode& other) {
edges_ = other.edges_;
failure_ = other.failure_;
matches_ = other.matches_;
return *this;
}
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
default;
SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
operator=(AhoCorasickNode&& other) = default;
uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
auto i = edges_.find(c);
......
......@@ -23,32 +23,20 @@ namespace url_matcher {
// which string patterns occur in S.
class URL_MATCHER_EXPORT SubstringSetMatcher {
public:
SubstringSetMatcher();
// Registers all |patterns|. The same pattern cannot be registered twice and
// each pattern needs to have a unique ID.
SubstringSetMatcher(const std::vector<StringPattern>& patterns);
SubstringSetMatcher(std::vector<const StringPattern*> patterns);
~SubstringSetMatcher();
// Registers all |patterns|. The ownership remains with the caller.
// The same pattern cannot be registered twice and each pattern needs to have
// a unique ID.
// Ownership of the patterns remains with the caller.
void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
// Unregisters the passed |patterns|.
void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
// Analogous to RegisterPatterns and UnregisterPatterns but executes both
// operations in one step, which is cheaper in the execution.
void RegisterAndUnregisterPatterns(
const std::vector<const StringPattern*>& to_register,
const std::vector<const StringPattern*>& to_unregister);
// Matches |text| against all registered StringPatterns. Stores the IDs
// of matching patterns in |matches|. |matches| is not cleared before adding
// to it.
bool Match(const std::string& text,
std::set<StringPattern::ID>* matches) const;
// Returns true if this object retains no allocated data. Only for debugging.
bool IsEmpty() const;
// Returns true if this object retains no allocated data.
bool IsEmpty() const { return is_empty_; }
// Returns the estimated memory usage in bytes.
size_t EstimateMemoryUsage() const;
......@@ -94,8 +82,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
AhoCorasickNode();
~AhoCorasickNode();
AhoCorasickNode(const AhoCorasickNode& other);
AhoCorasickNode& operator=(const AhoCorasickNode& other);
AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(AhoCorasickNode&& other);
uint32_t GetEdge(char c) const;
void SetEdge(char c, uint32_t node);
......@@ -121,25 +109,20 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
Matches matches_;
};
typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
typedef std::vector<const StringPattern*> SubstringPatternVector;
// |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds
// |pattern->id()| to the set of matches. Ownership of |pattern| remains with
// the caller.
// |pattern->id()| to the set of matches.
void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
void CreateFailureEdges();
// Set of all registered StringPatterns. Used to regenerate the
// Aho-Corasick tree in case patterns are registered or unregistered.
SubstringPatternMap patterns_;
// The nodes of an Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_;
bool is_empty_ = true;
DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
};
......
......@@ -37,15 +37,6 @@ std::string GetString(size_t len) {
return std::string(pattern.begin(), pattern.end());
}
std::vector<const StringPattern*> GetVectorOfPointers(
const std::vector<StringPattern>& patterns) {
std::vector<const StringPattern*> pointers;
for (const StringPattern& pattern : patterns)
pointers.push_back(&pattern);
return pointers;
}
// Tests performance of SubstringSetMatcher for hundred thousand keys each of
// 100 characters.
TEST(SubstringSetMatcherPerfTest, HundredThousandKeys) {
......@@ -58,8 +49,7 @@ TEST(SubstringSetMatcherPerfTest, HundredThousandKeys) {
patterns.emplace_back(GetString(kPatternLen), i);
base::ElapsedTimer init_timer;
SubstringSetMatcher matcher;
matcher.RegisterPatterns(GetVectorOfPointers(patterns));
SubstringSetMatcher matcher(patterns);
base::TimeDelta init_time = init_timer.Elapsed();
// Match patterns against a string of 5000 characters.
......
......@@ -22,11 +22,9 @@ void TestOnePattern(const std::string& test_string,
std::string test =
"TestOnePattern(" + test_string + ", " + pattern + ", " +
(is_match ? "1" : "0") + ")";
std::vector<const StringPattern*> patterns;
StringPattern substring_pattern(pattern, 1);
patterns.push_back(&substring_pattern);
SubstringSetMatcher matcher;
matcher.RegisterPatterns(patterns);
std::vector<StringPattern> patterns;
patterns.emplace_back(pattern, 1);
SubstringSetMatcher matcher(patterns);
std::set<int> matches;
matcher.Match(test_string, &matches);
......@@ -56,8 +54,7 @@ void TestTwoPatterns(const std::string& test_string,
patterns.push_back(&substring_pattern_2);
patterns.push_back(&substring_pattern_1);
}
SubstringSetMatcher matcher;
matcher.RegisterPatterns(patterns);
SubstringSetMatcher matcher(patterns);
std::set<int> matches;
matcher.Match(test_string, &matches);
......@@ -123,50 +120,42 @@ TEST(SubstringSetMatcherTest, TestMatcher) {
TestTwoPatterns("abcde", std::string(), "abcdef", true, false);
}
TEST(SubstringSetMatcherTest, RegisterAndRemove) {
SubstringSetMatcher matcher;
TEST(SubstringSetMatcherTest, TestMatcher2) {
StringPattern pattern_1("a", 1);
StringPattern pattern_2("b", 2);
StringPattern pattern_3("c", 3);
std::vector<const StringPattern*> patterns;
patterns.push_back(&pattern_1);
matcher.RegisterPatterns(patterns);
patterns.clear();
patterns.push_back(&pattern_2);
patterns.push_back(&pattern_3);
matcher.RegisterPatterns(patterns);
std::vector<const StringPattern*> patterns = {&pattern_1, &pattern_2,
&pattern_3};
auto matcher = std::make_unique<SubstringSetMatcher>(patterns);
std::set<int> matches;
matcher.Match("abd", &matches);
matcher->Match("abd", &matches);
EXPECT_EQ(2u, matches.size());
EXPECT_TRUE(matches.end() != matches.find(1));
EXPECT_TRUE(matches.end() != matches.find(2));
patterns.clear();
patterns.push_back(&pattern_2);
matcher.UnregisterPatterns(patterns);
patterns = {&pattern_1, &pattern_3};
matcher = std::make_unique<SubstringSetMatcher>(patterns);
matches.clear();
matcher.Match("abd", &matches);
matcher->Match("abd", &matches);
EXPECT_EQ(1u, matches.size());
EXPECT_TRUE(matches.end() != matches.find(1));
EXPECT_TRUE(matches.end() == matches.find(2));
patterns.clear();
patterns.push_back(&pattern_1);
patterns.push_back(&pattern_3);
matcher.UnregisterPatterns(patterns);
EXPECT_TRUE(matcher.IsEmpty());
matcher = std::make_unique<SubstringSetMatcher>(
std::vector<const StringPattern*>());
EXPECT_TRUE(matcher->IsEmpty());
}
TEST(SubstringSetMatcherTest, TestEmptyMatcher) {
SubstringSetMatcher matcher;
std::vector<StringPattern> patterns;
SubstringSetMatcher matcher(patterns);
std::set<int> matches;
matcher.Match("abd", &matches);
EXPECT_TRUE(matches.empty());
EXPECT_TRUE(matcher.IsEmpty());
}
} // namespace url_matcher
......@@ -152,6 +152,10 @@ bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
}
bool IsMatcherEmpty(const std::unique_ptr<SubstringSetMatcher>& matcher) {
return !matcher || matcher->IsEmpty();
}
} // namespace
//
......@@ -823,14 +827,14 @@ std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
std::set<StringPattern::ID> matches;
std::string url_for_component_searches;
if (!full_url_matcher_.IsEmpty()) {
full_url_matcher_.Match(
if (!IsMatcherEmpty(full_url_matcher_)) {
full_url_matcher_->Match(
condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
}
if (!url_component_matcher_.IsEmpty()) {
if (!IsMatcherEmpty(url_component_matcher_)) {
url_for_component_searches =
condition_factory_.CanonicalizeURLForComponentSearches(url);
url_component_matcher_.Match(url_for_component_searches, &matches);
url_component_matcher_->Match(url_for_component_searches, &matches);
}
if (!regex_set_matcher_.IsEmpty()) {
regex_set_matcher_.Match(
......@@ -868,15 +872,12 @@ std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
}
bool URLMatcher::IsEmpty() const {
return condition_factory_.IsEmpty() &&
url_matcher_condition_sets_.empty() &&
substring_match_triggers_.empty() &&
full_url_matcher_.IsEmpty() &&
url_component_matcher_.IsEmpty() &&
regex_set_matcher_.IsEmpty() &&
origin_and_path_regex_set_matcher_.IsEmpty() &&
registered_full_url_patterns_.empty() &&
registered_url_component_patterns_.empty();
return condition_factory_.IsEmpty() && url_matcher_condition_sets_.empty() &&
substring_match_triggers_.empty() &&
IsMatcherEmpty(full_url_matcher_) &&
IsMatcherEmpty(url_component_matcher_) &&
regex_set_matcher_.IsEmpty() &&
origin_and_path_regex_set_matcher_.IsEmpty();
}
void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
......@@ -915,32 +916,13 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
}
}
// This is the set of patterns that were registered before this function
// is called.
std::set<const StringPattern*>& registered_patterns =
full_url_conditions ? registered_full_url_patterns_
: registered_url_component_patterns_;
// Add all patterns that are in new_patterns but not in registered_patterns.
std::vector<const StringPattern*> patterns_to_register =
base::STLSetDifference<std::vector<const StringPattern*> >(
new_patterns, registered_patterns);
// Remove all patterns that are in registered_patterns but not in
// new_patterns.
std::vector<const StringPattern*> patterns_to_unregister =
base::STLSetDifference<std::vector<const StringPattern*> >(
registered_patterns, new_patterns);
// Update the SubstringSetMatcher.
SubstringSetMatcher& url_matcher =
std::unique_ptr<SubstringSetMatcher>& url_matcher =
full_url_conditions ? full_url_matcher_ : url_component_matcher_;
url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
patterns_to_unregister);
// Update the set of registered_patterns for the next time this function
// is being called.
registered_patterns.swap(new_patterns);
url_matcher =
std::make_unique<SubstringSetMatcher>(std::vector<const StringPattern*>(
new_patterns.begin(), new_patterns.end()));
}
void URLMatcher::UpdateRegexSetMatcher() {
......
......@@ -417,12 +417,10 @@ class URL_MATCHER_EXPORT URLMatcher {
StringPatternTriggers;
StringPatternTriggers substring_match_triggers_;
SubstringSetMatcher full_url_matcher_;
SubstringSetMatcher url_component_matcher_;
std::unique_ptr<SubstringSetMatcher> full_url_matcher_;
std::unique_ptr<SubstringSetMatcher> url_component_matcher_;
RegexSetMatcher regex_set_matcher_;
RegexSetMatcher origin_and_path_regex_set_matcher_;
std::set<const StringPattern*> registered_full_url_patterns_;
std::set<const StringPattern*> registered_url_component_patterns_;
DISALLOW_COPY_AND_ASSIGN(URLMatcher);
};
......
......@@ -194,7 +194,7 @@ RegexRulesMatcher::GetBeforeRequestActionIgnoringAncestors(
}
void RegexRulesMatcher::InitializeMatcher() {
if (regex_list_->Length() == 0)
if (IsEmpty())
return;
for (const auto* regex_rule : *regex_list_) {
......@@ -240,19 +240,20 @@ void RegexRulesMatcher::InitializeMatcher() {
});
}));
// Convert |strings_to_match| to |filtered_re2_strings_to_match_| which stores
// a vector of url_matcher::StringPattern(s). This is necessary to use
// Convert |strings_to_match| to StringPatterns. This is necessary to use
// url_matcher::SubstringSetMatcher.
for (size_t i = 0; i < strings_to_match.size(); ++i) {
filtered_re2_strings_to_match_.emplace_back(std::move(strings_to_match[i]),
i);
}
std::vector<url_matcher::StringPattern> patterns;
patterns.reserve(strings_to_match.size());
for (size_t i = 0; i < strings_to_match.size(); ++i)
patterns.emplace_back(std::move(strings_to_match[i]), i);
std::vector<const url_matcher::StringPattern*> patterns;
for (const auto& pattern : filtered_re2_strings_to_match_)
patterns.push_back(&pattern);
substring_matcher_ =
std::make_unique<url_matcher::SubstringSetMatcher>(patterns);
}
substring_matcher_.RegisterPatterns(patterns);
// Returns true when |regex_list_| contains no entries, i.e. there are no regex
// rules for this matcher to evaluate.
bool RegexRulesMatcher::IsEmpty() const {
return regex_list_->Length() == 0;
}
const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches(
......@@ -261,16 +262,24 @@ const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches(
if (iter != params.potential_regex_matches.end())
return iter->second;
// Early out if this is an empty matcher.
if (IsEmpty()) {
auto result = params.potential_regex_matches.insert(
std::make_pair(this, std::vector<RegexRuleInfo>()));
return result.first->second;
}
// Compute the potential matches. FilteredRE2 requires the text to be lower
// cased first.
if (!params.lower_cased_url_spec)
params.lower_cased_url_spec = base::ToLowerASCII(params.url->spec());
// To pre-filter the set of regexes to match against |params|, we first need
// to compute the set of candidate strings in |filtered_re2_strings_to_match_|
// to compute the set of candidate strings tracked by |substring_matcher_|
// within |params.lower_cased_url_spec|.
std::set<int> candidate_ids_set;
substring_matcher_.Match(*params.lower_cased_url_spec, &candidate_ids_set);
DCHECK(substring_matcher_);
substring_matcher_->Match(*params.lower_cased_url_spec, &candidate_ids_set);
std::vector<int> candidate_ids_list(candidate_ids_set.begin(),
candidate_ids_set.end());
......
......@@ -5,6 +5,8 @@
#ifndef EXTENSIONS_BROWSER_API_DECLARATIVE_NET_REQUEST_REGEX_RULES_MATCHER_H_
#define EXTENSIONS_BROWSER_API_DECLARATIVE_NET_REQUEST_REGEX_RULES_MATCHER_H_
#include <memory>
#include "base/macros.h"
#include "components/url_matcher/substring_set_matcher.h"
#include "extensions/browser/api/declarative_net_request/ruleset_matcher_base.h"
......@@ -31,8 +33,7 @@ struct RegexRuleInfo {
// Initialization:
// 1. During initialization, we add each regex to the FilteredRE2 class.
// 2. We compile the FilteredRE2 object which returns us a set of substrings.
// These are stored in |filtered_re2_strings_to_match_| below. These are also
// added to |substring_matcher_| for use in #3 below.
// These are added to |substring_matcher_| for use in #3 below.
//
// Matching
// 3. Given a request url, we find the set of strings from #2. that are
......@@ -74,6 +75,9 @@ class RegexRulesMatcher final : public RulesetMatcherBase {
// Helper to build the necessary data structures for matching.
void InitializeMatcher();
// Returns true if this matcher doesn't correspond to any rules.
bool IsEmpty() const;
// Returns the potentially matching rules for the given request. A potentially
// matching rule is one whose metadata matches the given request |params| and
// which is not ruled out as a potential match by the |filtered_re2_| object.
......@@ -104,15 +108,11 @@ class RegexRulesMatcher final : public RulesetMatcherBase {
// |regex_list_|.
std::map<int, const flat::RegexRule*> re2_id_to_rules_map_;
// Candidate strings to match for each request, for pre-filtering. The ID of
// each url_matcher::StringPattern is its index within the vector. All the
// strings are lower-cased.
std::vector<url_matcher::StringPattern> filtered_re2_strings_to_match_;
// Structure for fast substring matching. Given a string S and a set of
// candidate strings, returns the sub-set of candidate strings that are a
// substring of S. Uses the Aho-Corasick algorithm internally.
url_matcher::SubstringSetMatcher substring_matcher_;
// substring of S. Uses the Aho-Corasick algorithm internally. Will be null
// iff IsEmpty() returns true.
std::unique_ptr<url_matcher::SubstringSetMatcher> substring_matcher_;
DISALLOW_COPY_AND_ASSIGN(RegexRulesMatcher);
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment