Commit 64a22644 authored by Karan Bhatia, committed by Commit Bot

UrlMatcher: Store node pointers directly in SubstringSetMatcher::AhoCorasickNode.

This CL changes the AhoCorasickNode class to store pointers to other
nodes directly instead of their index in the trie array. This makes the
code clearer and reduces redundant indirection. Also, add some code
comments related to the algorithm.

This should have no behavior change.

BUG=974391

Change-Id: I2e1d89556fb064bdb1643ab7e10005d674bc8d6a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2037734
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#739181}
parent 5cd0f54e
......@@ -89,7 +89,11 @@ SubstringSetMatcher::SubstringSetMatcher(
std::sort(patterns.begin(), patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(patterns));
BuildAhoCorasickTree(patterns);
// Sanity check that no new allocations happened in the tree and our computed
// size was correct.
DCHECK_EQ(tree_.size(), TreeSize(patterns));
is_empty_ = patterns.empty() && tree_.size() == 1u;
}
......@@ -100,22 +104,28 @@ bool SubstringSetMatcher::Match(const std::string& text,
const size_t old_number_of_matches = matches->size();
// Handle patterns matching the empty string.
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
const AhoCorasickNode* const root = &tree_[0];
matches->insert(root->matches().begin(), root->matches().end());
uint32_t current_node = 0;
const AhoCorasickNode* current_node = root;
for (const char c : text) {
uint32_t edge_from_current = tree_[current_node].GetEdge(c);
while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
current_node != 0) {
current_node = tree_[current_node].failure();
edge_from_current = tree_[current_node].GetEdge(c);
AhoCorasickNode* child = current_node->GetEdge(c);
// If the child node can't be found, progressively iterate over the longest
// proper suffix of the string represented by the current node.
while (!child && current_node != root) {
current_node = current_node->failure();
child = current_node->GetEdge(c);
}
if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
current_node = edge_from_current;
matches->insert(tree_[current_node].matches().begin(),
tree_[current_node].matches().end());
if (child) {
current_node = child;
matches->insert(current_node->matches().begin(),
current_node->matches().end());
} else {
DCHECK_EQ(0u, current_node);
// The empty string is the longest possible suffix of the current position
// of text in the trie.
DCHECK_EQ(root, current_node);
}
}
......@@ -131,8 +141,11 @@ void SubstringSetMatcher::BuildAhoCorasickTree(
DCHECK(tree_.empty());
// Initialize root node of tree.
// Sanity check that there's no reallocation on adding a new node since we
// take pointers to nodes in the |tree_|, which can be invalidated in case of
// a reallocation.
DCHECK_LT(tree_.size(), tree_.capacity());
tree_.emplace_back();
tree_.back().set_failure(0);
// Build the initial trie for all the patterns.
for (const StringPattern* pattern : patterns)
......@@ -147,71 +160,88 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
const std::string::const_iterator text_end = text.end();
// Iterators on the tree and the text.
uint32_t current_node = 0;
AhoCorasickNode* current_node = &tree_[0];
std::string::const_iterator i = text.begin();
// Follow existing paths for as long as possible.
while (i != text_end) {
uint32_t edge_from_current = tree_[current_node].GetEdge(*i);
if (edge_from_current == AhoCorasickNode::kNoSuchEdge)
AhoCorasickNode* child = current_node->GetEdge(*i);
if (!child)
break;
current_node = edge_from_current;
current_node = child;
++i;
}
// Create new nodes if necessary.
while (i != text_end) {
tree_.push_back(AhoCorasickNode());
tree_[current_node].SetEdge(*i, tree_.size() - 1);
current_node = tree_.size() - 1;
// Sanity check that there's no reallocation on adding a new node since we
// take pointers to nodes in the |tree_|, which can be invalidated in case
// of a reallocation.
DCHECK_LT(tree_.size(), tree_.capacity());
tree_.emplace_back();
AhoCorasickNode* child = &tree_.back();
current_node->SetEdge(*i, child);
current_node = child;
++i;
}
// Register match.
tree_[current_node].AddMatch(pattern->id());
current_node->AddMatch(pattern->id());
}
void SubstringSetMatcher::CreateFailureEdges() {
base::queue<uint32_t> queue;
AhoCorasickNode& root = tree_[0];
root.set_failure(0);
for (const auto& edge : root.edges()) {
const uint32_t leads_to = edge.second;
tree_[leads_to].set_failure(0);
queue.push(leads_to);
base::queue<AhoCorasickNode*> queue;
// Initialize the failure edges for |root| and its children.
AhoCorasickNode* const root = &tree_[0];
root->SetFailure(root);
for (const auto& edge : root->edges()) {
AhoCorasickNode* child = edge.second;
child->SetFailure(root);
queue.push(child);
}
// Do a breadth first search over the trie to create failure edges. We
// maintain the invariant that any node in |queue| has had its |failure_| edge
// and |matches_| initialized.
while (!queue.empty()) {
AhoCorasickNode& current_node = tree_[queue.front()];
AhoCorasickNode* current_node = queue.front();
queue.pop();
for (const auto& edge : current_node.edges()) {
// Compute the failure edges of children using the failure edges of the
// current node.
for (const auto& edge : current_node->edges()) {
const char edge_label = edge.first;
const uint32_t leads_to = edge.second;
queue.push(leads_to);
uint32_t failure = current_node.failure();
uint32_t edge_from_failure = tree_[failure].GetEdge(edge_label);
while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
failure != 0) {
failure = tree_[failure].failure();
edge_from_failure = tree_[failure].GetEdge(edge_label);
AhoCorasickNode* child = edge.second;
const AhoCorasickNode* failure_candidate_parent = current_node->failure();
const AhoCorasickNode* failure_candidate =
failure_candidate_parent->GetEdge(edge_label);
while (!failure_candidate && failure_candidate_parent != root) {
failure_candidate_parent = failure_candidate_parent->failure();
failure_candidate = failure_candidate_parent->GetEdge(edge_label);
}
if (!failure_candidate) {
DCHECK_EQ(root, failure_candidate_parent);
// |failure_candidate| is null and we can't proceed further since we
// have reached the root. Hence the longest proper suffix of the string
// represented by this node is the empty string (represented by root).
failure_candidate = root;
}
const uint32_t follow_in_case_of_failure =
edge_from_failure != AhoCorasickNode::kNoSuchEdge ? edge_from_failure
: 0;
tree_[leads_to].set_failure(follow_in_case_of_failure);
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
child->SetFailure(failure_candidate);
child->AddMatches(failure_candidate->matches());
queue.push(child);
}
}
}
const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
: failure_(kNoSuchEdge) {}
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
......@@ -220,15 +250,24 @@ SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
operator=(AhoCorasickNode&& other) = default;
uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
SubstringSetMatcher::AhoCorasickNode*
SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
auto i = edges_.find(c);
return i == edges_.end() ? kNoSuchEdge : i->second;
return i == edges_.end() ? nullptr : i->second;
}
void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32_t node) {
void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c,
AhoCorasickNode* node) {
DCHECK(node);
edges_[c] = node;
}
void SubstringSetMatcher::AhoCorasickNode::SetFailure(
const AhoCorasickNode* node) {
DCHECK(node);
failure_ = node;
}
void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
matches_.insert(id);
}
......
......@@ -74,23 +74,21 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// It will make sense. Eventually.
class AhoCorasickNode {
public:
// Key: label of the edge, value: node index in |tree_| of parent class.
typedef std::map<char, uint32_t> Edges;
// Key: label of the edge, value: pointer to child node.
typedef std::map<char, AhoCorasickNode*> Edges;
typedef std::set<StringPattern::ID> Matches;
static const uint32_t kNoSuchEdge; // Represents an invalid node index.
AhoCorasickNode();
~AhoCorasickNode();
AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(AhoCorasickNode&& other);
uint32_t GetEdge(char c) const;
void SetEdge(char c, uint32_t node);
AhoCorasickNode* GetEdge(char c) const;
void SetEdge(char c, AhoCorasickNode* node);
const Edges& edges() const { return edges_; }
uint32_t failure() const { return failure_; }
void set_failure(uint32_t failure) { failure_ = failure; }
const AhoCorasickNode* failure() const { return failure_; }
void SetFailure(const AhoCorasickNode* failure);
void AddMatch(StringPattern::ID id);
void AddMatches(const Matches& matches);
......@@ -102,8 +100,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// Outgoing edges of current node.
Edges edges_;
// Node index that failure edge leads to.
uint32_t failure_;
// Node that failure edge leads to. Null when uninitialized.
const AhoCorasickNode* failure_ = nullptr;
// Identifiers of matches.
Matches matches_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment