Commit 64a22644 authored by Karan Bhatia, committed by Commit Bot

UrlMatcher: Store node pointers directly in SubstringSetMatcher::AhoCorasickNode.

This CL changes the AhoCorasickNode class to store pointers to other
nodes directly instead of their index in the trie array. This makes the
code clearer and reduces redundant indirection. Also, add some code
comments related to the algorithm.

This should have no behavior change.

BUG=974391

Change-Id: I2e1d89556fb064bdb1643ab7e10005d674bc8d6a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2037734
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#739181}
parent 5cd0f54e
...@@ -89,7 +89,11 @@ SubstringSetMatcher::SubstringSetMatcher( ...@@ -89,7 +89,11 @@ SubstringSetMatcher::SubstringSetMatcher(
std::sort(patterns.begin(), patterns.end(), ComparePatterns); std::sort(patterns.begin(), patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(patterns)); tree_.reserve(TreeSize(patterns));
BuildAhoCorasickTree(patterns); BuildAhoCorasickTree(patterns);
// Sanity check that no new allocations happened in the tree and our computed
// size was correct.
DCHECK_EQ(tree_.size(), TreeSize(patterns)); DCHECK_EQ(tree_.size(), TreeSize(patterns));
is_empty_ = patterns.empty() && tree_.size() == 1u; is_empty_ = patterns.empty() && tree_.size() == 1u;
} }
...@@ -100,22 +104,28 @@ bool SubstringSetMatcher::Match(const std::string& text, ...@@ -100,22 +104,28 @@ bool SubstringSetMatcher::Match(const std::string& text,
const size_t old_number_of_matches = matches->size(); const size_t old_number_of_matches = matches->size();
// Handle patterns matching the empty string. // Handle patterns matching the empty string.
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); const AhoCorasickNode* const root = &tree_[0];
matches->insert(root->matches().begin(), root->matches().end());
uint32_t current_node = 0; const AhoCorasickNode* current_node = root;
for (const char c : text) { for (const char c : text) {
uint32_t edge_from_current = tree_[current_node].GetEdge(c); AhoCorasickNode* child = current_node->GetEdge(c);
while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
current_node != 0) { // If the child node can't be found, progressively iterate over the longest
current_node = tree_[current_node].failure(); // proper suffix of the string represented by the current node.
edge_from_current = tree_[current_node].GetEdge(c); while (!child && current_node != root) {
current_node = current_node->failure();
child = current_node->GetEdge(c);
} }
if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
current_node = edge_from_current; if (child) {
matches->insert(tree_[current_node].matches().begin(), current_node = child;
tree_[current_node].matches().end()); matches->insert(current_node->matches().begin(),
current_node->matches().end());
} else { } else {
DCHECK_EQ(0u, current_node); // The empty string is the longest possible suffix of the current position
// of text in the trie.
DCHECK_EQ(root, current_node);
} }
} }
...@@ -131,8 +141,11 @@ void SubstringSetMatcher::BuildAhoCorasickTree( ...@@ -131,8 +141,11 @@ void SubstringSetMatcher::BuildAhoCorasickTree(
DCHECK(tree_.empty()); DCHECK(tree_.empty());
// Initialize root node of tree. // Initialize root node of tree.
// Sanity check that there's no reallocation on adding a new node since we
// take pointers to nodes in the |tree_|, which can be invalidated in case of
// a reallocation.
DCHECK_LT(tree_.size(), tree_.capacity());
tree_.emplace_back(); tree_.emplace_back();
tree_.back().set_failure(0);
// Build the initial trie for all the patterns. // Build the initial trie for all the patterns.
for (const StringPattern* pattern : patterns) for (const StringPattern* pattern : patterns)
...@@ -147,71 +160,88 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( ...@@ -147,71 +160,88 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
const std::string::const_iterator text_end = text.end(); const std::string::const_iterator text_end = text.end();
// Iterators on the tree and the text. // Iterators on the tree and the text.
uint32_t current_node = 0; AhoCorasickNode* current_node = &tree_[0];
std::string::const_iterator i = text.begin(); std::string::const_iterator i = text.begin();
// Follow existing paths for as long as possible. // Follow existing paths for as long as possible.
while (i != text_end) { while (i != text_end) {
uint32_t edge_from_current = tree_[current_node].GetEdge(*i); AhoCorasickNode* child = current_node->GetEdge(*i);
if (edge_from_current == AhoCorasickNode::kNoSuchEdge) if (!child)
break; break;
current_node = edge_from_current; current_node = child;
++i; ++i;
} }
// Create new nodes if necessary. // Create new nodes if necessary.
while (i != text_end) { while (i != text_end) {
tree_.push_back(AhoCorasickNode()); // Sanity check that there's no reallocation on adding a new node since we
tree_[current_node].SetEdge(*i, tree_.size() - 1); // take pointers to nodes in the |tree_|, which can be invalidated in case
current_node = tree_.size() - 1; // of a reallocation.
DCHECK_LT(tree_.size(), tree_.capacity());
tree_.emplace_back();
AhoCorasickNode* child = &tree_.back();
current_node->SetEdge(*i, child);
current_node = child;
++i; ++i;
} }
// Register match. // Register match.
tree_[current_node].AddMatch(pattern->id()); current_node->AddMatch(pattern->id());
} }
void SubstringSetMatcher::CreateFailureEdges() { void SubstringSetMatcher::CreateFailureEdges() {
base::queue<uint32_t> queue; base::queue<AhoCorasickNode*> queue;
AhoCorasickNode& root = tree_[0]; // Initialize the failure edges for |root| and its children.
root.set_failure(0); AhoCorasickNode* const root = &tree_[0];
for (const auto& edge : root.edges()) { root->SetFailure(root);
const uint32_t leads_to = edge.second;
tree_[leads_to].set_failure(0); for (const auto& edge : root->edges()) {
queue.push(leads_to); AhoCorasickNode* child = edge.second;
child->SetFailure(root);
queue.push(child);
} }
// Do a breadth first search over the trie to create failure edges. We
// maintain the invariant that any node in |queue| has had its |failure_| edge
// and |matches_| initialized.
while (!queue.empty()) { while (!queue.empty()) {
AhoCorasickNode& current_node = tree_[queue.front()]; AhoCorasickNode* current_node = queue.front();
queue.pop(); queue.pop();
for (const auto& edge : current_node.edges()) {
// Compute the failure edges of children using the failure edges of the
// current node.
for (const auto& edge : current_node->edges()) {
const char edge_label = edge.first; const char edge_label = edge.first;
const uint32_t leads_to = edge.second; AhoCorasickNode* child = edge.second;
queue.push(leads_to);
const AhoCorasickNode* failure_candidate_parent = current_node->failure();
uint32_t failure = current_node.failure(); const AhoCorasickNode* failure_candidate =
uint32_t edge_from_failure = tree_[failure].GetEdge(edge_label); failure_candidate_parent->GetEdge(edge_label);
while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
failure != 0) { while (!failure_candidate && failure_candidate_parent != root) {
failure = tree_[failure].failure(); failure_candidate_parent = failure_candidate_parent->failure();
edge_from_failure = tree_[failure].GetEdge(edge_label); failure_candidate = failure_candidate_parent->GetEdge(edge_label);
}
if (!failure_candidate) {
DCHECK_EQ(root, failure_candidate_parent);
// |failure_candidate| is null and we can't proceed further since we
// have reached the root. Hence the longest proper suffix of the string
// represented by this node is the empty string (represented by root).
failure_candidate = root;
} }
const uint32_t follow_in_case_of_failure = child->SetFailure(failure_candidate);
edge_from_failure != AhoCorasickNode::kNoSuchEdge ? edge_from_failure child->AddMatches(failure_candidate->matches());
: 0;
tree_[leads_to].set_failure(follow_in_case_of_failure); queue.push(child);
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
} }
} }
} }
const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
: failure_(kNoSuchEdge) {}
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default; SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) = SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
...@@ -220,15 +250,24 @@ SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) = ...@@ -220,15 +250,24 @@ SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode:: SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
operator=(AhoCorasickNode&& other) = default; operator=(AhoCorasickNode&& other) = default;
uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { SubstringSetMatcher::AhoCorasickNode*
SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
auto i = edges_.find(c); auto i = edges_.find(c);
return i == edges_.end() ? kNoSuchEdge : i->second; return i == edges_.end() ? nullptr : i->second;
} }
void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32_t node) { void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c,
AhoCorasickNode* node) {
DCHECK(node);
edges_[c] = node; edges_[c] = node;
} }
void SubstringSetMatcher::AhoCorasickNode::SetFailure(
const AhoCorasickNode* node) {
DCHECK(node);
failure_ = node;
}
void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
matches_.insert(id); matches_.insert(id);
} }
......
...@@ -74,23 +74,21 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -74,23 +74,21 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// It will make sense. Eventually. // It will make sense. Eventually.
class AhoCorasickNode { class AhoCorasickNode {
public: public:
// Key: label of the edge, value: node index in |tree_| of parent class. // Key: label of the edge, value: pointer to child node.
typedef std::map<char, uint32_t> Edges; typedef std::map<char, AhoCorasickNode*> Edges;
typedef std::set<StringPattern::ID> Matches; typedef std::set<StringPattern::ID> Matches;
static const uint32_t kNoSuchEdge; // Represents an invalid node index.
AhoCorasickNode(); AhoCorasickNode();
~AhoCorasickNode(); ~AhoCorasickNode();
AhoCorasickNode(AhoCorasickNode&& other); AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(AhoCorasickNode&& other); AhoCorasickNode& operator=(AhoCorasickNode&& other);
uint32_t GetEdge(char c) const; AhoCorasickNode* GetEdge(char c) const;
void SetEdge(char c, uint32_t node); void SetEdge(char c, AhoCorasickNode* node);
const Edges& edges() const { return edges_; } const Edges& edges() const { return edges_; }
uint32_t failure() const { return failure_; } const AhoCorasickNode* failure() const { return failure_; }
void set_failure(uint32_t failure) { failure_ = failure; } void SetFailure(const AhoCorasickNode* failure);
void AddMatch(StringPattern::ID id); void AddMatch(StringPattern::ID id);
void AddMatches(const Matches& matches); void AddMatches(const Matches& matches);
...@@ -102,8 +100,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -102,8 +100,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// Outgoing edges of current node. // Outgoing edges of current node.
Edges edges_; Edges edges_;
// Node index that failure edge leads to. // Node that failure edge leads to. Null when uninitialized.
uint32_t failure_; const AhoCorasickNode* failure_ = nullptr;
// Identifiers of matches. // Identifiers of matches.
Matches matches_; Matches matches_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment