Commit e39bc57a, authored by Karan Bhatia, committed by Commit Bot

UrlMatcher: Initialize SubstringSetMatcher with patterns.

This CL changes SubstringSetMatcher to require patterns as part of its
constructor. This:

- Simplifies the interface and the contract with the client.
  SubstringSetMatcher doesn't refer to memory owned by client anymore.

- Reduces the need for storing the pattern vector in both
  SubstringSetMatcher and its clients, thereby reducing runtime memory
  usage.

Also, modernize the code a bit, e.g.
  - Don't take references to primitives.
  - Use for-each loops when possible.

This shouldn't introduce any behavior change.

BUG=974391

Change-Id: Ie6cf1e7c08507b7eaccff39626d43062383e1383
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2038075
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#739067}
parent c70b7e09
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <stddef.h> #include <stddef.h>
#include <memory> #include <memory>
#include <utility>
#include "base/logging.h" #include "base/logging.h"
#include "base/strings/string_util.h" #include "base/strings/string_util.h"
...@@ -16,11 +17,8 @@ ...@@ -16,11 +17,8 @@
namespace url_matcher { namespace url_matcher {
RegexSetMatcher::RegexSetMatcher() {} RegexSetMatcher::RegexSetMatcher() = default;
RegexSetMatcher::~RegexSetMatcher() = default;
RegexSetMatcher::~RegexSetMatcher() {
DeleteSubstringPatterns();
}
void RegexSetMatcher::AddPatterns( void RegexSetMatcher::AddPatterns(
const std::vector<const StringPattern*>& regex_list) { const std::vector<const StringPattern*>& regex_list) {
...@@ -97,22 +95,15 @@ void RegexSetMatcher::RebuildMatcher() { ...@@ -97,22 +95,15 @@ void RegexSetMatcher::RebuildMatcher() {
std::vector<std::string> strings_to_match; std::vector<std::string> strings_to_match;
filtered_re2_->Compile(&strings_to_match); filtered_re2_->Compile(&strings_to_match);
substring_matcher_.reset(new SubstringSetMatcher); std::vector<url_matcher::StringPattern> substring_patterns;
DeleteSubstringPatterns(); substring_patterns.reserve(strings_to_match.size());
// Build SubstringSetMatcher from |strings_to_match|. // Build SubstringSetMatcher from |strings_to_match|.
// SubstringSetMatcher doesn't own its strings. for (size_t i = 0; i < strings_to_match.size(); ++i)
for (size_t i = 0; i < strings_to_match.size(); ++i) { substring_patterns.emplace_back(std::move(strings_to_match[i]), i);
substring_patterns_.push_back(
std::make_unique<StringPattern>(strings_to_match[i], i));
}
std::vector<const StringPattern*> patterns;
for (const auto& pattern : substring_patterns_)
patterns.push_back(pattern.get());
substring_matcher_->RegisterPatterns(patterns);
}
void RegexSetMatcher::DeleteSubstringPatterns() { substring_matcher_ =
substring_patterns_.clear(); std::make_unique<SubstringSetMatcher>(substring_patterns);
} }
} // namespace url_matcher } // namespace url_matcher
...@@ -61,9 +61,6 @@ class URL_MATCHER_EXPORT RegexSetMatcher { ...@@ -61,9 +61,6 @@ class URL_MATCHER_EXPORT RegexSetMatcher {
// apparently not supported by FilteredRE2. // apparently not supported by FilteredRE2.
void RebuildMatcher(); void RebuildMatcher();
// Clean up StringPatterns in |substring_patterns_|.
void DeleteSubstringPatterns();
// Mapping of regex StringPattern::IDs to regexes. // Mapping of regex StringPattern::IDs to regexes.
RegexMap regexes_; RegexMap regexes_;
// Mapping of RE2IDs from FilteredRE2 (which are assigned in order) // Mapping of RE2IDs from FilteredRE2 (which are assigned in order)
...@@ -72,10 +69,6 @@ class URL_MATCHER_EXPORT RegexSetMatcher { ...@@ -72,10 +69,6 @@ class URL_MATCHER_EXPORT RegexSetMatcher {
std::unique_ptr<re2::FilteredRE2> filtered_re2_; std::unique_ptr<re2::FilteredRE2> filtered_re2_;
std::unique_ptr<SubstringSetMatcher> substring_matcher_; std::unique_ptr<SubstringSetMatcher> substring_matcher_;
// The substring patterns from FilteredRE2, which are used in
// |substring_matcher_| but whose lifetime is managed here.
std::vector<std::unique_ptr<StringPattern>> substring_patterns_;
}; };
} // namespace url_matcher } // namespace url_matcher
......
...@@ -26,6 +26,8 @@ bool ComparePatterns(const StringPattern* a, const StringPattern* b) { ...@@ -26,6 +26,8 @@ bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
// Given the set of patterns, compute how many nodes will the corresponding // Given the set of patterns, compute how many nodes will the corresponding
// Aho-Corasick tree have. Note that |patterns| need to be sorted. // Aho-Corasick tree have. Note that |patterns| need to be sorted.
uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) { uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) {
DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
uint32_t result = 1u; // 1 for the root node. uint32_t result = 1u; // 1 for the root node.
if (patterns.empty()) if (patterns.empty())
return result; return result;
...@@ -53,61 +55,46 @@ uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) { ...@@ -53,61 +55,46 @@ uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) {
return result; return result;
} }
} // namespace std::vector<const StringPattern*> GetVectorOfPointers(
const std::vector<StringPattern>& patterns) {
// std::vector<const StringPattern*> pattern_pointers;
// SubstringSetMatcher pattern_pointers.reserve(patterns.size());
//
SubstringSetMatcher::SubstringSetMatcher() {
RebuildAhoCorasickTree(SubstringPatternVector());
}
SubstringSetMatcher::~SubstringSetMatcher() {}
void SubstringSetMatcher::RegisterPatterns( for (const StringPattern& pattern : patterns)
const std::vector<const StringPattern*>& patterns) { pattern_pointers.push_back(&pattern);
RegisterAndUnregisterPatterns(patterns,
std::vector<const StringPattern*>());
}
void SubstringSetMatcher::UnregisterPatterns( return pattern_pointers;
const std::vector<const StringPattern*>& patterns) {
RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(),
patterns);
} }
void SubstringSetMatcher::RegisterAndUnregisterPatterns( } // namespace
const std::vector<const StringPattern*>& to_register,
const std::vector<const StringPattern*>& to_unregister) {
// Register patterns.
for (auto i = to_register.begin(); i != to_register.end(); ++i) {
DCHECK(patterns_.find((*i)->id()) == patterns_.end());
patterns_[(*i)->id()] = *i;
}
// Unregister patterns
for (auto i = to_unregister.begin(); i != to_unregister.end(); ++i) {
patterns_.erase((*i)->id());
}
// Now we compute the total number of tree nodes needed.
SubstringPatternVector sorted_patterns;
sorted_patterns.resize(patterns_.size());
size_t next = 0; SubstringSetMatcher::SubstringSetMatcher(
for (SubstringPatternMap::const_iterator i = patterns_.begin(); const std::vector<StringPattern>& patterns)
i != patterns_.end(); : SubstringSetMatcher(GetVectorOfPointers(patterns)) {}
++i, ++next) {
sorted_patterns[next] = i->second; SubstringSetMatcher::SubstringSetMatcher(
std::vector<const StringPattern*> patterns) {
// Ensure there are no duplicate IDs.
#if DCHECK_IS_ON()
{
std::set<int> ids;
for (const StringPattern* pattern : patterns) {
CHECK(!base::Contains(ids, pattern->id()));
ids.insert(pattern->id());
}
} }
#endif
std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); // Compute the total number of tree nodes needed.
tree_.reserve(TreeSize(sorted_patterns)); std::sort(patterns.begin(), patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(patterns));
RebuildAhoCorasickTree(sorted_patterns); BuildAhoCorasickTree(patterns);
DCHECK_EQ(tree_.size(), TreeSize(patterns));
is_empty_ = patterns.empty() && tree_.size() == 1u;
} }
SubstringSetMatcher::~SubstringSetMatcher() = default;
bool SubstringSetMatcher::Match(const std::string& text, bool SubstringSetMatcher::Match(const std::string& text,
std::set<StringPattern::ID>* matches) const { std::set<StringPattern::ID>* matches) const {
const size_t old_number_of_matches = matches->size(); const size_t old_number_of_matches = matches->size();
...@@ -116,12 +103,12 @@ bool SubstringSetMatcher::Match(const std::string& text, ...@@ -116,12 +103,12 @@ bool SubstringSetMatcher::Match(const std::string& text,
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
uint32_t current_node = 0; uint32_t current_node = 0;
for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { for (const char c : text) {
uint32_t edge_from_current = tree_[current_node].GetEdge(*i); uint32_t edge_from_current = tree_[current_node].GetEdge(c);
while (edge_from_current == AhoCorasickNode::kNoSuchEdge && while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
current_node != 0) { current_node != 0) {
current_node = tree_[current_node].failure(); current_node = tree_[current_node].failure();
edge_from_current = tree_[current_node].GetEdge(*i); edge_from_current = tree_[current_node].GetEdge(c);
} }
if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
current_node = edge_from_current; current_node = edge_from_current;
...@@ -135,29 +122,21 @@ bool SubstringSetMatcher::Match(const std::string& text, ...@@ -135,29 +122,21 @@ bool SubstringSetMatcher::Match(const std::string& text,
return old_number_of_matches != matches->size(); return old_number_of_matches != matches->size();
} }
bool SubstringSetMatcher::IsEmpty() const {
// An empty tree consists of only the root node.
return patterns_.empty() && tree_.size() == 1u;
}
size_t SubstringSetMatcher::EstimateMemoryUsage() const { size_t SubstringSetMatcher::EstimateMemoryUsage() const {
return base::trace_event::EstimateMemoryUsage(tree_) + return base::trace_event::EstimateMemoryUsage(tree_);
base::trace_event::EstimateMemoryUsage(patterns_);
} }
void SubstringSetMatcher::RebuildAhoCorasickTree( void SubstringSetMatcher::BuildAhoCorasickTree(
const SubstringPatternVector& sorted_patterns) { const SubstringPatternVector& patterns) {
tree_.clear(); DCHECK(tree_.empty());
// Initialize root note of tree. // Initialize root node of tree.
AhoCorasickNode root; tree_.emplace_back();
root.set_failure(0); tree_.back().set_failure(0);
tree_.push_back(root);
// Insert all patterns. // Build the initial trie for all the patterns.
for (auto i = sorted_patterns.begin(); i != sorted_patterns.end(); ++i) { for (const StringPattern* pattern : patterns)
InsertPatternIntoAhoCorasickTree(*i); InsertPatternIntoAhoCorasickTree(pattern);
}
CreateFailureEdges(); CreateFailureEdges();
} }
...@@ -193,15 +172,12 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( ...@@ -193,15 +172,12 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
} }
void SubstringSetMatcher::CreateFailureEdges() { void SubstringSetMatcher::CreateFailureEdges() {
typedef AhoCorasickNode::Edges Edges;
base::queue<uint32_t> queue; base::queue<uint32_t> queue;
AhoCorasickNode& root = tree_[0]; AhoCorasickNode& root = tree_[0];
root.set_failure(0); root.set_failure(0);
const Edges& root_edges = root.edges(); for (const auto& edge : root.edges()) {
for (auto e = root_edges.begin(); e != root_edges.end(); ++e) { const uint32_t leads_to = edge.second;
const uint32_t& leads_to = e->second;
tree_[leads_to].set_failure(0); tree_[leads_to].set_failure(0);
queue.push(leads_to); queue.push(leads_to);
} }
...@@ -209,10 +185,9 @@ void SubstringSetMatcher::CreateFailureEdges() { ...@@ -209,10 +185,9 @@ void SubstringSetMatcher::CreateFailureEdges() {
while (!queue.empty()) { while (!queue.empty()) {
AhoCorasickNode& current_node = tree_[queue.front()]; AhoCorasickNode& current_node = tree_[queue.front()];
queue.pop(); queue.pop();
for (auto e = current_node.edges().begin(); e != current_node.edges().end(); for (const auto& edge : current_node.edges()) {
++e) { const char edge_label = edge.first;
const char& edge_label = e->first; const uint32_t leads_to = edge.second;
const uint32_t& leads_to = e->second;
queue.push(leads_to); queue.push(leads_to);
uint32_t failure = current_node.failure(); uint32_t failure = current_node.failure();
...@@ -237,22 +212,13 @@ const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; ...@@ -237,22 +212,13 @@ const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
: failure_(kNoSuchEdge) {} : failure_(kNoSuchEdge) {}
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
const SubstringSetMatcher::AhoCorasickNode& other) default;
: edges_(other.edges_),
failure_(other.failure_), SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
matches_(other.matches_) {} operator=(AhoCorasickNode&& other) = default;
SubstringSetMatcher::AhoCorasickNode&
SubstringSetMatcher::AhoCorasickNode::operator=(
const SubstringSetMatcher::AhoCorasickNode& other) {
edges_ = other.edges_;
failure_ = other.failure_;
matches_ = other.matches_;
return *this;
}
uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
auto i = edges_.find(c); auto i = edges_.find(c);
......
...@@ -23,32 +23,20 @@ namespace url_matcher { ...@@ -23,32 +23,20 @@ namespace url_matcher {
// which string patterns occur in S. // which string patterns occur in S.
class URL_MATCHER_EXPORT SubstringSetMatcher { class URL_MATCHER_EXPORT SubstringSetMatcher {
public: public:
SubstringSetMatcher(); // Registers all |patterns|. The same pattern cannot be registered twice and
// each pattern needs to have a unique ID.
SubstringSetMatcher(const std::vector<StringPattern>& patterns);
SubstringSetMatcher(std::vector<const StringPattern*> patterns);
~SubstringSetMatcher(); ~SubstringSetMatcher();
// Registers all |patterns|. The ownership remains with the caller.
// The same pattern cannot be registered twice and each pattern needs to have
// a unique ID.
// Ownership of the patterns remains with the caller.
void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
// Unregisters the passed |patterns|.
void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
// Analogous to RegisterPatterns and UnregisterPatterns but executes both
// operations in one step, which is cheaper in the execution.
void RegisterAndUnregisterPatterns(
const std::vector<const StringPattern*>& to_register,
const std::vector<const StringPattern*>& to_unregister);
// Matches |text| against all registered StringPatterns. Stores the IDs // Matches |text| against all registered StringPatterns. Stores the IDs
// of matching patterns in |matches|. |matches| is not cleared before adding // of matching patterns in |matches|. |matches| is not cleared before adding
// to it. // to it.
bool Match(const std::string& text, bool Match(const std::string& text,
std::set<StringPattern::ID>* matches) const; std::set<StringPattern::ID>* matches) const;
// Returns true if this object retains no allocated data. Only for debugging. // Returns true if this object retains no allocated data.
bool IsEmpty() const; bool IsEmpty() const { return is_empty_; }
// Returns the estimated memory usage in bytes. // Returns the estimated memory usage in bytes.
size_t EstimateMemoryUsage() const; size_t EstimateMemoryUsage() const;
...@@ -94,8 +82,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -94,8 +82,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
AhoCorasickNode(); AhoCorasickNode();
~AhoCorasickNode(); ~AhoCorasickNode();
AhoCorasickNode(const AhoCorasickNode& other); AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(const AhoCorasickNode& other); AhoCorasickNode& operator=(AhoCorasickNode&& other);
uint32_t GetEdge(char c) const; uint32_t GetEdge(char c) const;
void SetEdge(char c, uint32_t node); void SetEdge(char c, uint32_t node);
...@@ -121,25 +109,20 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -121,25 +109,20 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
Matches matches_; Matches matches_;
}; };
typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
typedef std::vector<const StringPattern*> SubstringPatternVector; typedef std::vector<const StringPattern*> SubstringPatternVector;
// |sorted_patterns| is a copy of |patterns_| sorted by the pattern string. void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds // Inserts a path for |pattern->pattern()| into the tree and adds
// |pattern->id()| to the set of matches. Ownership of |pattern| remains with // |pattern->id()| to the set of matches.
// the caller.
void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
void CreateFailureEdges(); void CreateFailureEdges();
// Set of all registered StringPatterns. Used to regenerate the
// Aho-Corasick tree in case patterns are registered or unregistered.
SubstringPatternMap patterns_;
// The nodes of a Aho-Corasick tree. // The nodes of a Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_; std::vector<AhoCorasickNode> tree_;
bool is_empty_ = true;
DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
}; };
......
...@@ -37,15 +37,6 @@ std::string GetString(size_t len) { ...@@ -37,15 +37,6 @@ std::string GetString(size_t len) {
return std::string(pattern.begin(), pattern.end()); return std::string(pattern.begin(), pattern.end());
} }
std::vector<const StringPattern*> GetVectorOfPointers(
const std::vector<StringPattern>& patterns) {
std::vector<const StringPattern*> pointers;
for (const StringPattern& pattern : patterns)
pointers.push_back(&pattern);
return pointers;
}
// Tests performance of SubstringSetMatcher for hundred thousand keys each of // Tests performance of SubstringSetMatcher for hundred thousand keys each of
// 100 characters. // 100 characters.
TEST(SubstringSetMatcherPerfTest, HundredThousandKeys) { TEST(SubstringSetMatcherPerfTest, HundredThousandKeys) {
...@@ -58,8 +49,7 @@ TEST(SubstringSetMatcherPerfTest, HundredThousandKeys) { ...@@ -58,8 +49,7 @@ TEST(SubstringSetMatcherPerfTest, HundredThousandKeys) {
patterns.emplace_back(GetString(kPatternLen), i); patterns.emplace_back(GetString(kPatternLen), i);
base::ElapsedTimer init_timer; base::ElapsedTimer init_timer;
SubstringSetMatcher matcher; SubstringSetMatcher matcher(patterns);
matcher.RegisterPatterns(GetVectorOfPointers(patterns));
base::TimeDelta init_time = init_timer.Elapsed(); base::TimeDelta init_time = init_timer.Elapsed();
// Match patterns against a string of 5000 characters. // Match patterns against a string of 5000 characters.
......
...@@ -22,11 +22,9 @@ void TestOnePattern(const std::string& test_string, ...@@ -22,11 +22,9 @@ void TestOnePattern(const std::string& test_string,
std::string test = std::string test =
"TestOnePattern(" + test_string + ", " + pattern + ", " + "TestOnePattern(" + test_string + ", " + pattern + ", " +
(is_match ? "1" : "0") + ")"; (is_match ? "1" : "0") + ")";
std::vector<const StringPattern*> patterns; std::vector<StringPattern> patterns;
StringPattern substring_pattern(pattern, 1); patterns.emplace_back(pattern, 1);
patterns.push_back(&substring_pattern); SubstringSetMatcher matcher(patterns);
SubstringSetMatcher matcher;
matcher.RegisterPatterns(patterns);
std::set<int> matches; std::set<int> matches;
matcher.Match(test_string, &matches); matcher.Match(test_string, &matches);
...@@ -56,8 +54,7 @@ void TestTwoPatterns(const std::string& test_string, ...@@ -56,8 +54,7 @@ void TestTwoPatterns(const std::string& test_string,
patterns.push_back(&substring_pattern_2); patterns.push_back(&substring_pattern_2);
patterns.push_back(&substring_pattern_1); patterns.push_back(&substring_pattern_1);
} }
SubstringSetMatcher matcher; SubstringSetMatcher matcher(patterns);
matcher.RegisterPatterns(patterns);
std::set<int> matches; std::set<int> matches;
matcher.Match(test_string, &matches); matcher.Match(test_string, &matches);
...@@ -123,50 +120,42 @@ TEST(SubstringSetMatcherTest, TestMatcher) { ...@@ -123,50 +120,42 @@ TEST(SubstringSetMatcherTest, TestMatcher) {
TestTwoPatterns("abcde", std::string(), "abcdef", true, false); TestTwoPatterns("abcde", std::string(), "abcdef", true, false);
} }
TEST(SubstringSetMatcherTest, RegisterAndRemove) { TEST(SubstringSetMatcherTest, TestMatcher2) {
SubstringSetMatcher matcher;
StringPattern pattern_1("a", 1); StringPattern pattern_1("a", 1);
StringPattern pattern_2("b", 2); StringPattern pattern_2("b", 2);
StringPattern pattern_3("c", 3); StringPattern pattern_3("c", 3);
std::vector<const StringPattern*> patterns; std::vector<const StringPattern*> patterns = {&pattern_1, &pattern_2,
patterns.push_back(&pattern_1); &pattern_3};
matcher.RegisterPatterns(patterns); auto matcher = std::make_unique<SubstringSetMatcher>(patterns);
patterns.clear();
patterns.push_back(&pattern_2);
patterns.push_back(&pattern_3);
matcher.RegisterPatterns(patterns);
std::set<int> matches; std::set<int> matches;
matcher.Match("abd", &matches); matcher->Match("abd", &matches);
EXPECT_EQ(2u, matches.size()); EXPECT_EQ(2u, matches.size());
EXPECT_TRUE(matches.end() != matches.find(1)); EXPECT_TRUE(matches.end() != matches.find(1));
EXPECT_TRUE(matches.end() != matches.find(2)); EXPECT_TRUE(matches.end() != matches.find(2));
patterns.clear(); patterns = {&pattern_1, &pattern_3};
patterns.push_back(&pattern_2); matcher = std::make_unique<SubstringSetMatcher>(patterns);
matcher.UnregisterPatterns(patterns);
matches.clear(); matches.clear();
matcher.Match("abd", &matches); matcher->Match("abd", &matches);
EXPECT_EQ(1u, matches.size()); EXPECT_EQ(1u, matches.size());
EXPECT_TRUE(matches.end() != matches.find(1)); EXPECT_TRUE(matches.end() != matches.find(1));
EXPECT_TRUE(matches.end() == matches.find(2)); EXPECT_TRUE(matches.end() == matches.find(2));
patterns.clear(); matcher = std::make_unique<SubstringSetMatcher>(
patterns.push_back(&pattern_1); std::vector<const StringPattern*>());
patterns.push_back(&pattern_3); EXPECT_TRUE(matcher->IsEmpty());
matcher.UnregisterPatterns(patterns);
EXPECT_TRUE(matcher.IsEmpty());
} }
TEST(SubstringSetMatcherTest, TestEmptyMatcher) { TEST(SubstringSetMatcherTest, TestEmptyMatcher) {
SubstringSetMatcher matcher; std::vector<StringPattern> patterns;
SubstringSetMatcher matcher(patterns);
std::set<int> matches; std::set<int> matches;
matcher.Match("abd", &matches); matcher.Match("abd", &matches);
EXPECT_TRUE(matches.empty()); EXPECT_TRUE(matches.empty());
EXPECT_TRUE(matcher.IsEmpty());
} }
} // namespace url_matcher } // namespace url_matcher
...@@ -152,6 +152,10 @@ bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { ...@@ -152,6 +152,10 @@ bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
} }
bool IsMatcherEmpty(const std::unique_ptr<SubstringSetMatcher>& matcher) {
return !matcher || matcher->IsEmpty();
}
} // namespace } // namespace
// //
...@@ -823,14 +827,14 @@ std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( ...@@ -823,14 +827,14 @@ std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
std::set<StringPattern::ID> matches; std::set<StringPattern::ID> matches;
std::string url_for_component_searches; std::string url_for_component_searches;
if (!full_url_matcher_.IsEmpty()) { if (!IsMatcherEmpty(full_url_matcher_)) {
full_url_matcher_.Match( full_url_matcher_->Match(
condition_factory_.CanonicalizeURLForFullSearches(url), &matches); condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
} }
if (!url_component_matcher_.IsEmpty()) { if (!IsMatcherEmpty(url_component_matcher_)) {
url_for_component_searches = url_for_component_searches =
condition_factory_.CanonicalizeURLForComponentSearches(url); condition_factory_.CanonicalizeURLForComponentSearches(url);
url_component_matcher_.Match(url_for_component_searches, &matches); url_component_matcher_->Match(url_for_component_searches, &matches);
} }
if (!regex_set_matcher_.IsEmpty()) { if (!regex_set_matcher_.IsEmpty()) {
regex_set_matcher_.Match( regex_set_matcher_.Match(
...@@ -868,15 +872,12 @@ std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( ...@@ -868,15 +872,12 @@ std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
} }
bool URLMatcher::IsEmpty() const { bool URLMatcher::IsEmpty() const {
return condition_factory_.IsEmpty() && return condition_factory_.IsEmpty() && url_matcher_condition_sets_.empty() &&
url_matcher_condition_sets_.empty() && substring_match_triggers_.empty() &&
substring_match_triggers_.empty() && IsMatcherEmpty(full_url_matcher_) &&
full_url_matcher_.IsEmpty() && IsMatcherEmpty(url_component_matcher_) &&
url_component_matcher_.IsEmpty() && regex_set_matcher_.IsEmpty() &&
regex_set_matcher_.IsEmpty() && origin_and_path_regex_set_matcher_.IsEmpty();
origin_and_path_regex_set_matcher_.IsEmpty() &&
registered_full_url_patterns_.empty() &&
registered_url_component_patterns_.empty();
} }
void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
...@@ -915,32 +916,13 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { ...@@ -915,32 +916,13 @@ void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
} }
} }
// This is the set of patterns that were registered before this function
// is called.
std::set<const StringPattern*>& registered_patterns =
full_url_conditions ? registered_full_url_patterns_
: registered_url_component_patterns_;
// Add all patterns that are in new_patterns but not in registered_patterns.
std::vector<const StringPattern*> patterns_to_register =
base::STLSetDifference<std::vector<const StringPattern*> >(
new_patterns, registered_patterns);
// Remove all patterns that are in registered_patterns but not in
// new_patterns.
std::vector<const StringPattern*> patterns_to_unregister =
base::STLSetDifference<std::vector<const StringPattern*> >(
registered_patterns, new_patterns);
// Update the SubstringSetMatcher. // Update the SubstringSetMatcher.
SubstringSetMatcher& url_matcher = std::unique_ptr<SubstringSetMatcher>& url_matcher =
full_url_conditions ? full_url_matcher_ : url_component_matcher_; full_url_conditions ? full_url_matcher_ : url_component_matcher_;
url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
patterns_to_unregister);
// Update the set of registered_patterns for the next time this function url_matcher =
// is being called. std::make_unique<SubstringSetMatcher>(std::vector<const StringPattern*>(
registered_patterns.swap(new_patterns); new_patterns.begin(), new_patterns.end()));
} }
void URLMatcher::UpdateRegexSetMatcher() { void URLMatcher::UpdateRegexSetMatcher() {
......
...@@ -417,12 +417,10 @@ class URL_MATCHER_EXPORT URLMatcher { ...@@ -417,12 +417,10 @@ class URL_MATCHER_EXPORT URLMatcher {
StringPatternTriggers; StringPatternTriggers;
StringPatternTriggers substring_match_triggers_; StringPatternTriggers substring_match_triggers_;
SubstringSetMatcher full_url_matcher_; std::unique_ptr<SubstringSetMatcher> full_url_matcher_;
SubstringSetMatcher url_component_matcher_; std::unique_ptr<SubstringSetMatcher> url_component_matcher_;
RegexSetMatcher regex_set_matcher_; RegexSetMatcher regex_set_matcher_;
RegexSetMatcher origin_and_path_regex_set_matcher_; RegexSetMatcher origin_and_path_regex_set_matcher_;
std::set<const StringPattern*> registered_full_url_patterns_;
std::set<const StringPattern*> registered_url_component_patterns_;
DISALLOW_COPY_AND_ASSIGN(URLMatcher); DISALLOW_COPY_AND_ASSIGN(URLMatcher);
}; };
......
...@@ -194,7 +194,7 @@ RegexRulesMatcher::GetBeforeRequestActionIgnoringAncestors( ...@@ -194,7 +194,7 @@ RegexRulesMatcher::GetBeforeRequestActionIgnoringAncestors(
} }
void RegexRulesMatcher::InitializeMatcher() { void RegexRulesMatcher::InitializeMatcher() {
if (regex_list_->Length() == 0) if (IsEmpty())
return; return;
for (const auto* regex_rule : *regex_list_) { for (const auto* regex_rule : *regex_list_) {
...@@ -240,19 +240,20 @@ void RegexRulesMatcher::InitializeMatcher() { ...@@ -240,19 +240,20 @@ void RegexRulesMatcher::InitializeMatcher() {
}); });
})); }));
// Convert |strings_to_match| to |filtered_re2_strings_to_match_| which stores // Convert |strings_to_match| to StringPatterns. This is necessary to use
// a vector of url_matcher::StringPattern(s). This is necessary to use
// url_matcher::SubstringSetMatcher. // url_matcher::SubstringSetMatcher.
for (size_t i = 0; i < strings_to_match.size(); ++i) { std::vector<url_matcher::StringPattern> patterns;
filtered_re2_strings_to_match_.emplace_back(std::move(strings_to_match[i]), patterns.reserve(strings_to_match.size());
i);
} for (size_t i = 0; i < strings_to_match.size(); ++i)
patterns.emplace_back(std::move(strings_to_match[i]), i);
std::vector<const url_matcher::StringPattern*> patterns; substring_matcher_ =
for (const auto& pattern : filtered_re2_strings_to_match_) std::make_unique<url_matcher::SubstringSetMatcher>(patterns);
patterns.push_back(&pattern); }
substring_matcher_.RegisterPatterns(patterns); bool RegexRulesMatcher::IsEmpty() const {
return regex_list_->Length() == 0;
} }
const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches( const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches(
...@@ -261,16 +262,24 @@ const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches( ...@@ -261,16 +262,24 @@ const std::vector<RegexRuleInfo>& RegexRulesMatcher::GetPotentialMatches(
if (iter != params.potential_regex_matches.end()) if (iter != params.potential_regex_matches.end())
return iter->second; return iter->second;
// Early out if this is an empty matcher.
if (IsEmpty()) {
auto result = params.potential_regex_matches.insert(
std::make_pair(this, std::vector<RegexRuleInfo>()));
return result.first->second;
}
// Compute the potential matches. FilteredRE2 requires the text to be lower // Compute the potential matches. FilteredRE2 requires the text to be lower
// cased first. // cased first.
if (!params.lower_cased_url_spec) if (!params.lower_cased_url_spec)
params.lower_cased_url_spec = base::ToLowerASCII(params.url->spec()); params.lower_cased_url_spec = base::ToLowerASCII(params.url->spec());
// To pre-filter the set of regexes to match against |params|, we first need // To pre-filter the set of regexes to match against |params|, we first need
// to compute the set of candidate strings in |filtered_re2_strings_to_match_| // to compute the set of candidate strings tracked by |substring_matcher_|
// within |params.lower_cased_url_spec|. // within |params.lower_cased_url_spec|.
std::set<int> candidate_ids_set; std::set<int> candidate_ids_set;
substring_matcher_.Match(*params.lower_cased_url_spec, &candidate_ids_set); DCHECK(substring_matcher_);
substring_matcher_->Match(*params.lower_cased_url_spec, &candidate_ids_set);
std::vector<int> candidate_ids_list(candidate_ids_set.begin(), std::vector<int> candidate_ids_list(candidate_ids_set.begin(),
candidate_ids_set.end()); candidate_ids_set.end());
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef EXTENSIONS_BROWSER_API_DECLARATIVE_NET_REQUEST_REGEX_RULES_MATCHER_H_ #ifndef EXTENSIONS_BROWSER_API_DECLARATIVE_NET_REQUEST_REGEX_RULES_MATCHER_H_
#define EXTENSIONS_BROWSER_API_DECLARATIVE_NET_REQUEST_REGEX_RULES_MATCHER_H_ #define EXTENSIONS_BROWSER_API_DECLARATIVE_NET_REQUEST_REGEX_RULES_MATCHER_H_
#include <memory>
#include "base/macros.h" #include "base/macros.h"
#include "components/url_matcher/substring_set_matcher.h" #include "components/url_matcher/substring_set_matcher.h"
#include "extensions/browser/api/declarative_net_request/ruleset_matcher_base.h" #include "extensions/browser/api/declarative_net_request/ruleset_matcher_base.h"
...@@ -31,8 +33,7 @@ struct RegexRuleInfo { ...@@ -31,8 +33,7 @@ struct RegexRuleInfo {
// Initialization: // Initialization:
// 1. During initialization, we add each regex to the FilteredRE2 class. // 1. During initialization, we add each regex to the FilteredRE2 class.
// 2. We compile the FilteredRE2 object which returns us a set of substrings. // 2. We compile the FilteredRE2 object which returns us a set of substrings.
// These are stored in |filtered_re2_strings_to_match_| below. These are also // These are added to |substring_matcher_| for use in #3 below.
// added to |substring_matcher_| for use in #3 below.
// //
// Matching // Matching
// 3. Given a request url, we find the set of strings from #2. that are // 3. Given a request url, we find the set of strings from #2. that are
...@@ -74,6 +75,9 @@ class RegexRulesMatcher final : public RulesetMatcherBase { ...@@ -74,6 +75,9 @@ class RegexRulesMatcher final : public RulesetMatcherBase {
// Helper to build the necessary data structures for matching. // Helper to build the necessary data structures for matching.
void InitializeMatcher(); void InitializeMatcher();
// Returns true if this matcher doesn't correspond to any rules.
bool IsEmpty() const;
// Returns the potentially matching rules for the given request. A potentially // Returns the potentially matching rules for the given request. A potentially
// matching rule is one whose metadata matches the given request |params| and // matching rule is one whose metadata matches the given request |params| and
// which is not ruled out as a potential match by the |filtered_re2_| object. // which is not ruled out as a potential match by the |filtered_re2_| object.
...@@ -104,15 +108,11 @@ class RegexRulesMatcher final : public RulesetMatcherBase { ...@@ -104,15 +108,11 @@ class RegexRulesMatcher final : public RulesetMatcherBase {
// |regex_list_|. // |regex_list_|.
std::map<int, const flat::RegexRule*> re2_id_to_rules_map_; std::map<int, const flat::RegexRule*> re2_id_to_rules_map_;
// Candidate strings to match for each request, for pre-filtering. The ID of
// each url_matcher::StringPattern is its index within the vector. All the
// strings are lower-cased.
std::vector<url_matcher::StringPattern> filtered_re2_strings_to_match_;
// Structure for fast substring matching. Given a string S and a set of // Structure for fast substring matching. Given a string S and a set of
// candidate strings, returns the sub-set of candidate strings that are a // candidate strings, returns the sub-set of candidate strings that are a
// substring of S. Uses the Aho-Corasick algorithm internally. // substring of S. Uses the Aho-Corasick algorithm internally. Will be null
url_matcher::SubstringSetMatcher substring_matcher_; // iff IsEmpty() returns true.
std::unique_ptr<url_matcher::SubstringSetMatcher> substring_matcher_;
DISALLOW_COPY_AND_ASSIGN(RegexRulesMatcher); DISALLOW_COPY_AND_ASSIGN(RegexRulesMatcher);
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment