Commit b7a7b8d9 authored by vabr@chromium.org's avatar vabr@chromium.org

Revert 198871 "Speed improvements to SubstringSetMatcher"

Win debug trybots fire assertions on SubstringSetMatcherTest.TestMatcher and WebRequestRulesRegistryTest.GetMatchesCheckFulfilled

c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\algorithm(3657) : Assertion failed: invalid operator<
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\xstring(78) : Assertion failed: string iterator not dereferencable
c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\include\xstring(79) : Assertion failed: "Standard C++ Libraries Out of Range" && 0

> Speed improvements to SubstringSetMatcher
> 
> This is an attempt to reland https://codereview.chromium.org/14780003/ which was reverted as https://codereview.chromium.org/15045002/.
> Patch set 1 here is identical with p.s. 7 in https://codereview.chromium.org/14780003/. The failing buildbot was linux compile (dbg), so trying it here explicitly.
> 
> Original description:
> 
> The main one is coputing the Aho-Corasick tree size in advance.
> 
> Also contained are code clean-ups and minor optimisations, like removing HasEdge/GetEdge sequences, or adding const to aid compiler optimisations.
> 
> This was tested on a benchmark adding 20k+ patterns. It showed a reduction of the running time by 30%.
> 
> BUG=236368
> TBR=battre
> 
> Review URL: https://codereview.chromium.org/15059003

TBR=battre@chromium.org

Review URL: https://codereview.chromium.org/15027007

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@198874 0039d316-1c4b-4281-b951-d872f2087c98
parent 7a41154a
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include "extensions/common/matcher/substring_set_matcher.h" #include "extensions/common/matcher/substring_set_matcher.h"
#include <algorithm>
#include <queue> #include <queue>
#include "base/logging.h" #include "base/logging.h"
...@@ -12,51 +11,12 @@ ...@@ -12,51 +11,12 @@
namespace extensions { namespace extensions {
namespace {
// Compare StringPattern instances based on their string patterns.
bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
return a->pattern() <= b->pattern();
}
// Given the set of patterns, compute how many nodes will the corresponding
// Aho-Corasick tree have. Note that |patterns| need to be sorted.
uint32 TreeSize(const std::vector<const StringPattern*>& patterns) {
uint32 result = 1u; // 1 for the root node.
if (patterns.empty())
return result;
std::vector<const StringPattern*>::const_iterator last = patterns.begin();
std::vector<const StringPattern*>::const_iterator current = last + 1;
// For the first pattern, each letter is a label of an edge to a new node.
result += (*last)->pattern().size();
// For the subsequent patterns, only count the edges which were not counted
// yet. For this it suffices to test against the previous pattern, because the
// patterns are sorted.
for (; current != patterns.end(); ++last, ++current) {
const std::string& last_pattern = (*last)->pattern();
const std::string& current_pattern = (*current)->pattern();
const uint32 prefix_bound =
std::min(last_pattern.size(), current_pattern.size());
uint32 common_prefix = 0;
while (common_prefix < prefix_bound &&
last_pattern[common_prefix] == current_pattern[common_prefix])
++common_prefix;
result += current_pattern.size() - common_prefix;
}
return result;
}
} // namespace
// //
// SubstringSetMatcher // SubstringSetMatcher
// //
SubstringSetMatcher::SubstringSetMatcher() { SubstringSetMatcher::SubstringSetMatcher() {
RebuildAhoCorasickTree(SubstringPatternVector()); RebuildAhoCorasickTree();
} }
SubstringSetMatcher::~SubstringSetMatcher() {} SubstringSetMatcher::~SubstringSetMatcher() {}
...@@ -89,44 +49,27 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns( ...@@ -89,44 +49,27 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns(
patterns_.erase((*i)->id()); patterns_.erase((*i)->id());
} }
// Now we compute the total number of tree nodes needed. RebuildAhoCorasickTree();
SubstringPatternVector sorted_patterns;
sorted_patterns.resize(patterns_.size());
size_t next = 0;
for (SubstringPatternMap::const_iterator i = patterns_.begin();
i != patterns_.end();
++i, ++next) {
sorted_patterns[next] = i->second;
}
std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(sorted_patterns));
RebuildAhoCorasickTree(sorted_patterns);
} }
bool SubstringSetMatcher::Match(const std::string& text, bool SubstringSetMatcher::Match(const std::string& text,
std::set<StringPattern::ID>* matches) const { std::set<StringPattern::ID>* matches) const {
const size_t old_number_of_matches = matches->size(); size_t old_number_of_matches = matches->size();
// Handle patterns matching the empty string. // Handle patterns matching the empty string.
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
uint32 current_node = 0; int current_node = 0;
for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { size_t text_length = text.length();
uint32 edge_from_current = tree_[current_node].GetEdge(*i); for (size_t i = 0; i < text_length; ++i) {
while (edge_from_current == AhoCorasickNode::kNoSuchEdge && while (!tree_[current_node].HasEdge(text[i]) && current_node != 0)
current_node != 0) {
current_node = tree_[current_node].failure(); current_node = tree_[current_node].failure();
edge_from_current = tree_[current_node].GetEdge(*i); if (tree_[current_node].HasEdge(text[i])) {
} current_node = tree_[current_node].GetEdge(text[i]);
if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
current_node = edge_from_current;
matches->insert(tree_[current_node].matches().begin(), matches->insert(tree_[current_node].matches().begin(),
tree_[current_node].matches().end()); tree_[current_node].matches().end());
} else { } else {
DCHECK_EQ(0u, current_node); DCHECK_EQ(0, current_node);
} }
} }
...@@ -138,8 +81,7 @@ bool SubstringSetMatcher::IsEmpty() const { ...@@ -138,8 +81,7 @@ bool SubstringSetMatcher::IsEmpty() const {
return patterns_.empty() && tree_.size() == 1u; return patterns_.empty() && tree_.size() == 1u;
} }
void SubstringSetMatcher::RebuildAhoCorasickTree( void SubstringSetMatcher::RebuildAhoCorasickTree() {
const SubstringPatternVector& sorted_patterns) {
tree_.clear(); tree_.clear();
// Initialize root note of tree. // Initialize root note of tree.
...@@ -148,10 +90,9 @@ void SubstringSetMatcher::RebuildAhoCorasickTree( ...@@ -148,10 +90,9 @@ void SubstringSetMatcher::RebuildAhoCorasickTree(
tree_.push_back(root); tree_.push_back(root);
// Insert all patterns. // Insert all patterns.
for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); for (SubstringPatternSet::const_iterator i = patterns_.begin();
i != sorted_patterns.end(); i != patterns_.end(); ++i) {
++i) { InsertPatternIntoAhoCorasickTree(i->second);
InsertPatternIntoAhoCorasickTree(*i);
} }
CreateFailureEdges(); CreateFailureEdges();
...@@ -160,27 +101,26 @@ void SubstringSetMatcher::RebuildAhoCorasickTree( ...@@ -160,27 +101,26 @@ void SubstringSetMatcher::RebuildAhoCorasickTree(
void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
const StringPattern* pattern) { const StringPattern* pattern) {
const std::string& text = pattern->pattern(); const std::string& text = pattern->pattern();
const std::string::const_iterator text_end = text.end(); size_t text_length = text.length();
// Iterators on the tree and the text. // Iterators on the tree and the text.
uint32 current_node = 0; int current_node = 0;
std::string::const_iterator i = text.begin(); size_t text_pos = 0;
// Follow existing paths for as long as possible. // Follow existing paths for as long as possible.
uint32 edge_from_current = tree_[current_node].GetEdge(*i); while (text_pos < text_length &&
while (i != text_end && tree_[current_node].HasEdge(text[text_pos])) {
edge_from_current != AhoCorasickNode::kNoSuchEdge) { current_node = tree_[current_node].GetEdge(text[text_pos]);
current_node = edge_from_current; ++text_pos;
++i;
edge_from_current = tree_[current_node].GetEdge(*i);
} }
// Create new nodes if necessary. // Create new nodes if necessary.
while (i != text_end) { while (text_pos < text_length) {
tree_.push_back(AhoCorasickNode()); AhoCorasickNode new_node;
tree_[current_node].SetEdge(*i, tree_.size() - 1); tree_.push_back(new_node);
tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1);
current_node = tree_.size() - 1; current_node = tree_.size() - 1;
++i; ++text_pos;
} }
// Register match. // Register match.
...@@ -190,14 +130,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( ...@@ -190,14 +130,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
void SubstringSetMatcher::CreateFailureEdges() { void SubstringSetMatcher::CreateFailureEdges() {
typedef AhoCorasickNode::Edges Edges; typedef AhoCorasickNode::Edges Edges;
std::queue<uint32> queue; std::queue<int> queue;
AhoCorasickNode& root = tree_[0]; AhoCorasickNode& root = tree_[0];
root.set_failure(0); root.set_failure(0);
const Edges& root_edges = root.edges(); const AhoCorasickNode::Edges& root_edges = root.edges();
for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
++e) { ++e) {
const uint32& leads_to = e->second; int leads_to = e->second;
tree_[leads_to].set_failure(0); tree_[leads_to].set_failure(0);
queue.push(leads_to); queue.push(leads_to);
} }
...@@ -207,32 +147,24 @@ void SubstringSetMatcher::CreateFailureEdges() { ...@@ -207,32 +147,24 @@ void SubstringSetMatcher::CreateFailureEdges() {
queue.pop(); queue.pop();
for (Edges::const_iterator e = current_node.edges().begin(); for (Edges::const_iterator e = current_node.edges().begin();
e != current_node.edges().end(); ++e) { e != current_node.edges().end(); ++e) {
const char& edge_label = e->first; char edge_label = e->first;
const uint32& leads_to = e->second; int leads_to = e->second;
queue.push(leads_to); queue.push(leads_to);
uint32 failure = current_node.failure(); int failure = current_node.failure();
uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); while (!tree_[failure].HasEdge(edge_label) && failure != 0)
while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
failure != 0) {
failure = tree_[failure].failure(); failure = tree_[failure].failure();
edge_from_failure = tree_[failure].GetEdge(edge_label);
}
const uint32 follow_in_case_of_failure = int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ?
edge_from_failure != AhoCorasickNode::kNoSuchEdge tree_[failure].GetEdge(edge_label) : 0;
? edge_from_failure
: 0;
tree_[leads_to].set_failure(follow_in_case_of_failure); tree_[leads_to].set_failure(follow_in_case_of_failure);
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
} }
} }
} }
const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
: failure_(kNoSuchEdge) {} : failure_(-1) {}
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
...@@ -251,12 +183,16 @@ SubstringSetMatcher::AhoCorasickNode::operator=( ...@@ -251,12 +183,16 @@ SubstringSetMatcher::AhoCorasickNode::operator=(
return *this; return *this;
} }
uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const {
Edges::const_iterator i = edges_.find(c); return edges_.find(c) != edges_.end();
return i == edges_.end() ? kNoSuchEdge : i->second; }
int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
std::map<char, int>::const_iterator i = edges_.find(c);
return i == edges_.end() ? -1 : i->second;
} }
void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) {
edges_[c] = node; edges_[c] = node;
} }
......
...@@ -5,7 +5,6 @@ ...@@ -5,7 +5,6 @@
#ifndef EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_ #ifndef EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
#define EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_ #define EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
#include <limits>
#include <map> #include <map>
#include <set> #include <set>
#include <string> #include <string>
...@@ -81,22 +80,21 @@ class SubstringSetMatcher { ...@@ -81,22 +80,21 @@ class SubstringSetMatcher {
class AhoCorasickNode { class AhoCorasickNode {
public: public:
// Key: label of the edge, value: node index in |tree_| of parent class. // Key: label of the edge, value: node index in |tree_| of parent class.
typedef std::map<char, uint32> Edges; typedef std::map<char, int> Edges;
typedef std::set<StringPattern::ID> Matches; typedef std::set<StringPattern::ID> Matches;
static const uint32 kNoSuchEdge; // Represents an invalid node index.
AhoCorasickNode(); AhoCorasickNode();
~AhoCorasickNode(); ~AhoCorasickNode();
AhoCorasickNode(const AhoCorasickNode& other); AhoCorasickNode(const AhoCorasickNode& other);
AhoCorasickNode& operator=(const AhoCorasickNode& other); AhoCorasickNode& operator=(const AhoCorasickNode& other);
uint32 GetEdge(char c) const; bool HasEdge(char c) const;
void SetEdge(char c, uint32 node); int GetEdge(char c) const;
void SetEdge(char c, int node);
const Edges& edges() const { return edges_; } const Edges& edges() const { return edges_; }
uint32 failure() const { return failure_; } int failure() const { return failure_; }
void set_failure(uint32 failure) { failure_ = failure; } void set_failure(int failure) { failure_ = failure; }
void AddMatch(StringPattern::ID id); void AddMatch(StringPattern::ID id);
void AddMatches(const Matches& matches); void AddMatches(const Matches& matches);
...@@ -107,17 +105,13 @@ class SubstringSetMatcher { ...@@ -107,17 +105,13 @@ class SubstringSetMatcher {
Edges edges_; Edges edges_;
// Node index that failure edge leads to. // Node index that failure edge leads to.
uint32 failure_; int failure_;
// Identifiers of matches. // Identifiers of matches.
Matches matches_; Matches matches_;
}; };
typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap; void RebuildAhoCorasickTree();
typedef std::vector<const StringPattern*> SubstringPatternVector;
// |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds // Inserts a path for |pattern->pattern()| into the tree and adds
// |pattern->id()| to the set of matches. Ownership of |pattern| remains with // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
...@@ -127,7 +121,9 @@ class SubstringSetMatcher { ...@@ -127,7 +121,9 @@ class SubstringSetMatcher {
// Set of all registered StringPatterns. Used to regenerate the // Set of all registered StringPatterns. Used to regenerate the
// Aho-Corasick tree in case patterns are registered or unregistered. // Aho-Corasick tree in case patterns are registered or unregistered.
SubstringPatternMap patterns_; typedef std::map<StringPattern::ID, const StringPattern*>
SubstringPatternSet;
SubstringPatternSet patterns_;
// The nodes of a Aho-Corasick tree. // The nodes of a Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_; std::vector<AhoCorasickNode> tree_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment