Commit a702473f authored by Karan Bhatia, committed by Commit Bot

UrlMatcher: Store array index to refer to nodes in SubstringSetMatcher.

Currently as part of AhoCorasickNode, we store pointers to other nodes
as part of the edge map and failure and output links. On a 64 bit
machine, the size of a pointer would be 8 bytes, so replacing it with a
4 byte array index yields decreased memory usage.

However, doing this also has a drawback. Since we store array indices in
a uint32_t, the maximum size of the trie is limited by the maximum
possible value that a uint32_t can store, as opposed to
vector<AhoCorasickNode>::max_size().

Also, the current implementation of GetTreeSize ignores any possible
overflow in computing the tree size, which can lead to DCHECK failures
as well as incorrect results (it's not clear whether it can lead to
invalid memory access). Change it to explicitly crash if the computed
tree size overflows and can't be stored in a uint32_t.

On my local machine, this reduces the amount of memory for the
SubstringSetMatcher perf test from ~33 Mb to ~25 Mb.

BUG=974391

Change-Id: Ib2541b514b8993e414891b1f678e590f6292dc98
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2050099
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#740848}
parent 0cb28cba
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "base/containers/queue.h" #include "base/containers/queue.h"
#include "base/logging.h" #include "base/logging.h"
#include "base/numerics/checked_math.h"
#include "base/stl_util.h" #include "base/stl_util.h"
#include "base/trace_event/memory_usage_estimator.h" #include "base/trace_event/memory_usage_estimator.h"
...@@ -23,38 +24,6 @@ bool ComparePatterns(const StringPattern* a, const StringPattern* b) { ...@@ -23,38 +24,6 @@ bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
return a->pattern() < b->pattern(); return a->pattern() < b->pattern();
} }
// Given the set of patterns, compute how many nodes the corresponding
// Aho-Corasick tree will have. Note that |patterns| needs to be sorted (see
// ComparePatterns).
//
// The count is accumulated in 64 bits and CHECKed to fit into the uint32_t
// return type, so an oversized pattern set crashes deterministically instead
// of silently returning a wrapped (too small) node count.
uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) {
  DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));

  uint64_t result = 1u;  // 1 for the root node.
  if (patterns.empty())
    return static_cast<uint32_t>(result);

  auto last = patterns.begin();
  auto current = last + 1;
  // For the first pattern, each letter is a label of an edge to a new node.
  result += (*last)->pattern().size();

  // For the subsequent patterns, only count the edges which were not counted
  // yet. For this it suffices to test against the previous pattern, because the
  // patterns are sorted.
  for (; current != patterns.end(); ++last, ++current) {
    const std::string& last_pattern = (*last)->pattern();
    const std::string& current_pattern = (*current)->pattern();

    // Keep the prefix arithmetic in size_t: storing string sizes in a
    // uint32_t would truncate them for very long patterns.
    const size_t prefix_bound =
        std::min(last_pattern.size(), current_pattern.size());
    size_t common_prefix = 0;
    while (common_prefix < prefix_bound &&
           last_pattern[common_prefix] == current_pattern[common_prefix]) {
      ++common_prefix;
    }

    // |common_prefix| never exceeds |current_pattern.size()|, so this
    // subtraction cannot underflow.
    result += current_pattern.size() - common_prefix;
  }

  // Crash rather than hand back a truncated count.
  const uint32_t narrowed = static_cast<uint32_t>(result);
  CHECK_EQ(result, static_cast<uint64_t>(narrowed));
  return narrowed;
}
std::vector<const StringPattern*> GetVectorOfPointers( std::vector<const StringPattern*> GetVectorOfPointers(
const std::vector<StringPattern>& patterns) { const std::vector<StringPattern>& patterns) {
std::vector<const StringPattern*> pattern_pointers; std::vector<const StringPattern*> pattern_pointers;
...@@ -90,12 +59,12 @@ SubstringSetMatcher::SubstringSetMatcher( ...@@ -90,12 +59,12 @@ SubstringSetMatcher::SubstringSetMatcher(
// Compute the total number of tree nodes needed. // Compute the total number of tree nodes needed.
std::sort(patterns.begin(), patterns.end(), ComparePatterns); std::sort(patterns.begin(), patterns.end(), ComparePatterns);
tree_.reserve(TreeSize(patterns)); tree_.reserve(GetTreeSize(patterns));
BuildAhoCorasickTree(patterns); BuildAhoCorasickTree(patterns);
// Sanity check that no new allocations happened in the tree and our computed // Sanity check that no new allocations happened in the tree and our computed
// size was correct. // size was correct.
DCHECK_EQ(tree_.size(), TreeSize(patterns)); DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
is_empty_ = patterns.empty() && tree_.size() == 1u; is_empty_ = patterns.empty() && tree_.size() == 1u;
} }
...@@ -107,26 +76,26 @@ bool SubstringSetMatcher::Match(const std::string& text, ...@@ -107,26 +76,26 @@ bool SubstringSetMatcher::Match(const std::string& text,
const size_t old_number_of_matches = matches->size(); const size_t old_number_of_matches = matches->size();
// Handle patterns matching the empty string. // Handle patterns matching the empty string.
const AhoCorasickNode* const root = &tree_[0]; const AhoCorasickNode* const root = &tree_[kRootID];
root->AccumulateMatches(matches); AccumulateMatchesForNode(root, matches);
const AhoCorasickNode* current_node = root; const AhoCorasickNode* current_node = root;
for (const char c : text) { for (const char c : text) {
AhoCorasickNode* child = current_node->GetEdge(c); NodeID child = current_node->GetEdge(c);
// If the child not can't be found, progressively iterate over the longest // If the child not can't be found, progressively iterate over the longest
// proper suffix of the string represented by the current node. In a sense // proper suffix of the string represented by the current node. In a sense
// we are pruning prefixes from the text. // we are pruning prefixes from the text.
while (!child && current_node != root) { while (child == kInvalidNodeID && current_node != root) {
current_node = current_node->failure(); current_node = &tree_[current_node->failure()];
child = current_node->GetEdge(c); child = current_node->GetEdge(c);
} }
if (child) { if (child != kInvalidNodeID) {
// The string represented by |child| is the longest possible suffix of the // The string represented by |child| is the longest possible suffix of the
// current position of |text| in the trie. // current position of |text| in the trie.
current_node = child; current_node = &tree_[child];
current_node->AccumulateMatches(matches); AccumulateMatchesForNode(current_node, matches);
} else { } else {
// The empty string is the longest possible suffix of the current position // The empty string is the longest possible suffix of the current position
// of |text| in the trie. // of |text| in the trie.
...@@ -141,15 +110,49 @@ size_t SubstringSetMatcher::EstimateMemoryUsage() const { ...@@ -141,15 +110,49 @@ size_t SubstringSetMatcher::EstimateMemoryUsage() const {
return base::trace_event::EstimateMemoryUsage(tree_); return base::trace_event::EstimateMemoryUsage(tree_);
} }
// static
// Out-of-line definitions for the static constexpr members kInvalidNodeID and
// kRootID; their values are given at the in-class declarations.
// NOTE(review): presumably required because the constants are ODR-used (e.g.
// bound by reference inside DCHECK_NE) in a pre-C++17 build -- confirm.
constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
// Returns the number of nodes the Aho-Corasick trie built from |patterns|
// will contain, including the root. |patterns| must already be sorted with
// ComparePatterns. The count is kept in a base::CheckedNumeric<NodeID>, and
// ValueOrDie() is used to extract it, so a count that does not fit in NodeID
// is not returned as a wrapped value.
SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
    const std::vector<const StringPattern*>& patterns) const {
  DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));

  base::CheckedNumeric<NodeID> node_count = 1u;  // The root node.
  if (!patterns.empty()) {
    // Every character of the first pattern labels an edge to a fresh node.
    node_count += patterns.front()->pattern().size();

    // Because the patterns are sorted, each later pattern only contributes
    // the characters past the prefix it shares with its predecessor.
    for (size_t i = 1; i < patterns.size(); ++i) {
      const std::string& previous = patterns[i - 1]->pattern();
      const std::string& pattern = patterns[i]->pattern();

      const size_t limit = std::min(previous.size(), pattern.size());
      const auto first_difference = std::mismatch(
          previous.begin(), previous.begin() + limit, pattern.begin());
      const size_t shared_prefix =
          static_cast<size_t>(first_difference.first - previous.begin());

      node_count -= shared_prefix;
      node_count += pattern.size();
    }
  }
  return node_count.ValueOrDie();
}
void SubstringSetMatcher::BuildAhoCorasickTree( void SubstringSetMatcher::BuildAhoCorasickTree(
const SubstringPatternVector& patterns) { const SubstringPatternVector& patterns) {
DCHECK(tree_.empty()); DCHECK(tree_.empty());
// Initialize root node of tree. // Initialize root node of tree.
// Sanity check that there's no reallocation on adding a new node since we
// take pointers to nodes in the |tree_|, which can be invalidated in case of
// a reallocation.
DCHECK_LT(tree_.size(), tree_.capacity());
tree_.emplace_back(); tree_.emplace_back();
// Build the initial trie for all the patterns. // Build the initial trie for all the patterns.
...@@ -170,29 +173,23 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( ...@@ -170,29 +173,23 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
const std::string::const_iterator text_end = text.end(); const std::string::const_iterator text_end = text.end();
// Iterators on the tree and the text. // Iterators on the tree and the text.
AhoCorasickNode* current_node = &tree_[0]; AhoCorasickNode* current_node = &tree_[kRootID];
std::string::const_iterator i = text.begin(); std::string::const_iterator i = text.begin();
// Follow existing paths for as long as possible. // Follow existing paths for as long as possible.
while (i != text_end) { while (i != text_end) {
AhoCorasickNode* child = current_node->GetEdge(*i); NodeID child = current_node->GetEdge(*i);
if (!child) if (child == kInvalidNodeID)
break; break;
current_node = child; current_node = &tree_[child];
++i; ++i;
} }
// Create new nodes if necessary. // Create new nodes if necessary.
while (i != text_end) { while (i != text_end) {
// Sanity check that there's no reallocation on adding a new node since we
// take pointers to nodes in the |tree_|, which can be invalidated in case
// of a reallocation.
DCHECK_LT(tree_.size(), tree_.capacity());
tree_.emplace_back(); tree_.emplace_back();
current_node->SetEdge(*i, tree_.size() - 1);
AhoCorasickNode* child = &tree_.back(); current_node = &tree_.back();
current_node->SetEdge(*i, child);
current_node = child;
++i; ++i;
} }
...@@ -211,15 +208,15 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() { ...@@ -211,15 +208,15 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() {
// should probably be defined as null, but we assign it to the |root| to // should probably be defined as null, but we assign it to the |root| to
// simplify the code and have the invariant that the failure edge is always // simplify the code and have the invariant that the failure edge is always
// defined. // defined.
root->SetFailure(root); root->SetFailure(kRootID);
root->SetOutputLink(nullptr); root->SetOutputLink(kInvalidNodeID);
NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
const AhoCorasickNode* const root_output_link =
root->IsEndOfPattern() ? root : nullptr;
for (const auto& edge : root->edges()) { for (const auto& edge : root->edges()) {
AhoCorasickNode* child = edge.second; AhoCorasickNode* child = &tree_[edge.second];
child->SetFailure(root); child->SetFailure(kRootID);
child->SetOutputLink(root_output_link); child->SetOutputLink(root_output_link);
queue.push(child); queue.push(child);
} }
...@@ -235,33 +232,35 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() { ...@@ -235,33 +232,35 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() {
// of the current node. // of the current node.
for (const auto& edge : current_node->edges()) { for (const auto& edge : current_node->edges()) {
const char edge_label = edge.first; const char edge_label = edge.first;
AhoCorasickNode* child = edge.second; AhoCorasickNode* child = &tree_[edge.second];
const AhoCorasickNode* failure_candidate_parent = current_node->failure(); const AhoCorasickNode* failure_candidate_parent =
const AhoCorasickNode* failure_candidate = &tree_[current_node->failure()];
NodeID failure_candidate_id =
failure_candidate_parent->GetEdge(edge_label); failure_candidate_parent->GetEdge(edge_label);
while (failure_candidate_id == kInvalidNodeID &&
while (!failure_candidate && failure_candidate_parent != root) { failure_candidate_parent != root) {
failure_candidate_parent = failure_candidate_parent->failure(); failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
failure_candidate = failure_candidate_parent->GetEdge(edge_label); failure_candidate_id = failure_candidate_parent->GetEdge(edge_label);
} }
if (!failure_candidate) { if (failure_candidate_id == kInvalidNodeID) {
DCHECK_EQ(root, failure_candidate_parent); DCHECK_EQ(root, failure_candidate_parent);
// |failure_candidate| is null and we can't proceed further since we // |failure_candidate| is invalid and we can't proceed further since we
// have reached the root. Hence the longest proper suffix of this string // have reached the root. Hence the longest proper suffix of this string
// represented by this node is the empty string (represented by root). // represented by this node is the empty string (represented by root).
failure_candidate = root; failure_candidate_id = kRootID;
} }
child->SetFailure(failure_candidate); child->SetFailure(failure_candidate_id);
const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
// Now |failure_candidate| is |child|'s longest possible proper suffix in // Now |failure_candidate| is |child|'s longest possible proper suffix in
// the trie. We also know that since we are doing a breadth first search, // the trie. We also know that since we are doing a breadth first search,
// we would have established |failure_candidate|'s output link by now. // we would have established |failure_candidate|'s output link by now.
// Hence we can define |child|'s output link as follows: // Hence we can define |child|'s output link as follows:
child->SetOutputLink(failure_candidate->IsEndOfPattern() child->SetOutputLink(failure_candidate->IsEndOfPattern()
? failure_candidate ? failure_candidate_id
: failure_candidate->output_link()); : failure_candidate->output_link());
queue.push(child); queue.push(child);
...@@ -269,6 +268,22 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() { ...@@ -269,6 +268,22 @@ void SubstringSetMatcher::CreateFailureAndOutputEdges() {
} }
} }
void SubstringSetMatcher::AccumulateMatchesForNode(
const AhoCorasickNode* node,
std::set<StringPattern::ID>* matches) const {
DCHECK(matches);
if (node->IsEndOfPattern())
matches->insert(node->GetMatchID());
NodeID node_id = node->output_link();
while (node_id != kInvalidNodeID) {
node = &tree_[node_id];
matches->insert(node->GetMatchID());
node_id = node->output_link();
}
}
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() = default; SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() = default;
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default; SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() = default;
...@@ -278,43 +293,22 @@ SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) = ...@@ -278,43 +293,22 @@ SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) =
SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode:: SubstringSetMatcher::AhoCorasickNode& SubstringSetMatcher::AhoCorasickNode::
operator=(AhoCorasickNode&& other) = default; operator=(AhoCorasickNode&& other) = default;
SubstringSetMatcher::AhoCorasickNode* SubstringSetMatcher::NodeID SubstringSetMatcher::AhoCorasickNode::GetEdge(
SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { char c) const {
auto i = edges_.find(c); auto i = edges_.find(c);
return i == edges_.end() ? nullptr : i->second; return i == edges_.end() ? kInvalidNodeID : i->second;
} }
void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, NodeID node) {
AhoCorasickNode* node) { DCHECK_NE(kInvalidNodeID, node);
DCHECK(node);
edges_[c] = node; edges_[c] = node;
} }
void SubstringSetMatcher::AhoCorasickNode::SetFailure( void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
const AhoCorasickNode* node) { DCHECK_NE(kInvalidNodeID, node);
DCHECK(node);
failure_ = node; failure_ = node;
} }
// Stores |output_link| as this node's output link. A null link is allowed;
// a non-null link must point at a node that ends a pattern.
void SubstringSetMatcher::AhoCorasickNode::SetOutputLink(
    const AhoCorasickNode* output_link) {
  DCHECK(output_link == nullptr || output_link->IsEndOfPattern());
  output_link_ = output_link;
}
// Inserts into |matches| this node's match ID (if this node ends a pattern)
// followed by the match ID of every node on its output-link chain.
// |matches| must be non-null.
void SubstringSetMatcher::AhoCorasickNode::AccumulateMatches(
    std::set<StringPattern::ID>* matches) const {
  DCHECK(matches);

  if (IsEndOfPattern())
    matches->insert(GetMatchID());

  // Walk the chain of output links until it terminates in a null link.
  const AhoCorasickNode* linked = output_link();
  while (linked != nullptr) {
    matches->insert(linked->GetMatchID());
    linked = linked->output_link();
  }
}
size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const { size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
return base::trace_event::EstimateMemoryUsage(edges_); return base::trace_event::EstimateMemoryUsage(edges_);
} }
......
...@@ -56,6 +56,17 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -56,6 +56,17 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
size_t EstimateMemoryUsage() const; size_t EstimateMemoryUsage() const;
private: private:
// Represents the index of the node within |tree_|. It is specifically
// uint32_t so that we can be sure it takes up 4 bytes. If the computed size
// of |tree_| is larger than what can be stored within an uint32_t, there will
// be a CHECK failure.
using NodeID = uint32_t;
// This is the maximum possible size of |tree_| and hence can't be a valid ID.
static constexpr NodeID kInvalidNodeID = std::numeric_limits<NodeID>::max();
static constexpr NodeID kRootID = 0;
// A node of an Aho Corasick Tree. See // A node of an Aho Corasick Tree. See
// http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
// to understand the algorithm. // to understand the algorithm.
...@@ -89,21 +100,22 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -89,21 +100,22 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// It will make sense. Eventually. // It will make sense. Eventually.
class AhoCorasickNode { class AhoCorasickNode {
public: public:
using Edges = base::flat_map<char, AhoCorasickNode*>; // Map from edge label to NodeID.
using Edges = base::flat_map<char, NodeID>;
AhoCorasickNode(); AhoCorasickNode();
~AhoCorasickNode(); ~AhoCorasickNode();
AhoCorasickNode(AhoCorasickNode&& other); AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(AhoCorasickNode&& other); AhoCorasickNode& operator=(AhoCorasickNode&& other);
AhoCorasickNode* GetEdge(char c) const; NodeID GetEdge(char c) const;
void SetEdge(char c, AhoCorasickNode* node); void SetEdge(char c, NodeID node);
const Edges& edges() const { return edges_; } const Edges& edges() const { return edges_; }
void ShrinkEdges() { edges_.shrink_to_fit(); } void ShrinkEdges() { edges_.shrink_to_fit(); }
const AhoCorasickNode* failure() const { return failure_; } NodeID failure() const { return failure_; }
void SetFailure(const AhoCorasickNode* failure); void SetFailure(NodeID failure);
void SetMatchID(StringPattern::ID id) { void SetMatchID(StringPattern::ID id) {
DCHECK(!IsEndOfPattern()); DCHECK(!IsEndOfPattern());
...@@ -121,12 +133,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -121,12 +133,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
return match_id_; return match_id_;
} }
void SetOutputLink(const AhoCorasickNode* node); void SetOutputLink(NodeID node) { output_link_ = node; }
const AhoCorasickNode* output_link() const { return output_link_; } NodeID output_link() const { return output_link_; }
// Adds all pattern IDs to |matches| which are a suffix of the string
// represented by this node.
void AccumulateMatches(std::set<StringPattern::ID>* matches) const;
size_t EstimateMemoryUsage() const; size_t EstimateMemoryUsage() const;
...@@ -134,23 +142,28 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -134,23 +142,28 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// Outgoing edges of current node. // Outgoing edges of current node.
Edges edges_; Edges edges_;
// Node that failure edge leads to. The failure node corresponds to the node // Node index that failure edge leads to. The failure node corresponds to
// which represents the longest proper suffix (include empty string) of the // the node which represents the longest proper suffix (include empty
// string represented by this node. Must be valid, null when uninitialized. // string) of the string represented by this node. Must be valid, equal to
const AhoCorasickNode* failure_ = nullptr; // kInvalidNodeID when uninitialized.
NodeID failure_ = kInvalidNodeID;
// If valid, this node represents the end of a pattern. It stores the ID of // If valid, this node represents the end of a pattern. It stores the ID of
// the corresponding pattern. // the corresponding pattern.
StringPattern::ID match_id_ = StringPattern::kInvalidId; StringPattern::ID match_id_ = StringPattern::kInvalidId;
// Node that corresponds to the longest proper suffix (including empty // Node index that corresponds to the longest proper suffix (including empty
// suffix) of this node and which also represents the end of a pattern. Can // suffix) of this node and which also represents the end of a pattern. Can
// be null. // be invalid.
const AhoCorasickNode* output_link_ = nullptr; NodeID output_link_ = kInvalidNodeID;
}; };
using SubstringPatternVector = std::vector<const StringPattern*>; using SubstringPatternVector = std::vector<const StringPattern*>;
// Given the set of patterns, compute how many nodes will the corresponding
// Aho-Corasick tree have. Note that |patterns| need to be sorted.
NodeID GetTreeSize(const std::vector<const StringPattern*>& patterns) const;
void BuildAhoCorasickTree(const SubstringPatternVector& patterns); void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds // Inserts a path for |pattern->pattern()| into the tree and adds
...@@ -159,6 +172,11 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -159,6 +172,11 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
void CreateFailureAndOutputEdges(); void CreateFailureAndOutputEdges();
// Adds all pattern IDs to |matches| which are a suffix of the string
// represented by |node|.
void AccumulateMatchesForNode(const AhoCorasickNode* node,
std::set<StringPattern::ID>* matches) const;
// The nodes of a Aho-Corasick tree. // The nodes of a Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_; std::vector<AhoCorasickNode> tree_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment