Commit a702473f authored by Karan Bhatia, committed by Commit Bot

UrlMatcher: Store array index to refer to nodes in SubstringSetMatcher.

Currently, as part of AhoCorasickNode, we store pointers to other nodes
in the edge map and in the failure and output links. On a 64-bit
machine a pointer is 8 bytes, so replacing each pointer with a
4-byte array index reduces memory usage.

However, doing this also has a drawback. Since we store array indices in
a uint32_t, the maximum size of the trie is limited by the
maximum possible value that a uint32_t can store, as opposed to
vector<AhoCorasickNode>::max_size().

Also, the current implementation of GetTreeSize ignores any possible
overflow when computing the tree size, which can lead to DCHECK failures
as well as incorrect results (it's not clear whether it can also lead to
invalid memory access). Change it to explicitly crash if the computed tree
size overflows and can't be stored in a uint32_t.

On my local machine, this reduces the memory used by the
SubstringSetMatcher perf test from ~33 MB to ~25 MB.

BUG=974391

Change-Id: Ib2541b514b8993e414891b1f678e590f6292dc98
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2050099
Commit-Queue: Karan Bhatia <karandeepb@chromium.org>
Reviewed-by: Dominic Battré <battre@chromium.org>
Cr-Commit-Position: refs/heads/master@{#740848}
parent 0cb28cba
...@@ -56,6 +56,17 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -56,6 +56,17 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
size_t EstimateMemoryUsage() const; size_t EstimateMemoryUsage() const;
private: private:
// Represents the index of a node within |tree_|. It is specifically
// uint32_t so that we can be sure it takes up 4 bytes. If the computed size
// of |tree_| is larger than what can be stored in a uint32_t, there will
// be a CHECK failure.
using NodeID = uint32_t;
// This is the maximum possible size of |tree_| and hence can't be a valid ID.
static constexpr NodeID kInvalidNodeID = std::numeric_limits<NodeID>::max();
static constexpr NodeID kRootID = 0;
// A node of an Aho Corasick Tree. See // A node of an Aho Corasick Tree. See
// http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
// to understand the algorithm. // to understand the algorithm.
...@@ -89,21 +100,22 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -89,21 +100,22 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// It will make sense. Eventually. // It will make sense. Eventually.
class AhoCorasickNode { class AhoCorasickNode {
public: public:
using Edges = base::flat_map<char, AhoCorasickNode*>; // Map from edge label to NodeID.
using Edges = base::flat_map<char, NodeID>;
AhoCorasickNode(); AhoCorasickNode();
~AhoCorasickNode(); ~AhoCorasickNode();
AhoCorasickNode(AhoCorasickNode&& other); AhoCorasickNode(AhoCorasickNode&& other);
AhoCorasickNode& operator=(AhoCorasickNode&& other); AhoCorasickNode& operator=(AhoCorasickNode&& other);
AhoCorasickNode* GetEdge(char c) const; NodeID GetEdge(char c) const;
void SetEdge(char c, AhoCorasickNode* node); void SetEdge(char c, NodeID node);
const Edges& edges() const { return edges_; } const Edges& edges() const { return edges_; }
void ShrinkEdges() { edges_.shrink_to_fit(); } void ShrinkEdges() { edges_.shrink_to_fit(); }
const AhoCorasickNode* failure() const { return failure_; } NodeID failure() const { return failure_; }
void SetFailure(const AhoCorasickNode* failure); void SetFailure(NodeID failure);
void SetMatchID(StringPattern::ID id) { void SetMatchID(StringPattern::ID id) {
DCHECK(!IsEndOfPattern()); DCHECK(!IsEndOfPattern());
...@@ -121,12 +133,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -121,12 +133,8 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
return match_id_; return match_id_;
} }
void SetOutputLink(const AhoCorasickNode* node); void SetOutputLink(NodeID node) { output_link_ = node; }
const AhoCorasickNode* output_link() const { return output_link_; } NodeID output_link() const { return output_link_; }
// Adds all pattern IDs to |matches| which are a suffix of the string
// represented by this node.
void AccumulateMatches(std::set<StringPattern::ID>* matches) const;
size_t EstimateMemoryUsage() const; size_t EstimateMemoryUsage() const;
...@@ -134,23 +142,28 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -134,23 +142,28 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
// Outgoing edges of current node. // Outgoing edges of current node.
Edges edges_; Edges edges_;
// Node that failure edge leads to. The failure node corresponds to the node // Node index that failure edge leads to. The failure node corresponds to
// which represents the longest proper suffix (include empty string) of the // the node which represents the longest proper suffix (include empty
// string represented by this node. Must be valid, null when uninitialized. // string) of the string represented by this node. Must be valid, equal to
const AhoCorasickNode* failure_ = nullptr; // kInvalidNodeID when uninitialized.
NodeID failure_ = kInvalidNodeID;
// If valid, this node represents the end of a pattern. It stores the ID of // If valid, this node represents the end of a pattern. It stores the ID of
// the corresponding pattern. // the corresponding pattern.
StringPattern::ID match_id_ = StringPattern::kInvalidId; StringPattern::ID match_id_ = StringPattern::kInvalidId;
// Node that corresponds to the longest proper suffix (including empty // Node index that corresponds to the longest proper suffix (including empty
// suffix) of this node and which also represents the end of a pattern. Can // suffix) of this node and which also represents the end of a pattern. Can
// be null. // be invalid.
const AhoCorasickNode* output_link_ = nullptr; NodeID output_link_ = kInvalidNodeID;
}; };
using SubstringPatternVector = std::vector<const StringPattern*>; using SubstringPatternVector = std::vector<const StringPattern*>;
// Given the set of patterns, compute how many nodes the corresponding
// Aho-Corasick tree will have. Note that |patterns| needs to be sorted.
NodeID GetTreeSize(const std::vector<const StringPattern*>& patterns) const;
void BuildAhoCorasickTree(const SubstringPatternVector& patterns); void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds // Inserts a path for |pattern->pattern()| into the tree and adds
...@@ -159,6 +172,11 @@ class URL_MATCHER_EXPORT SubstringSetMatcher { ...@@ -159,6 +172,11 @@ class URL_MATCHER_EXPORT SubstringSetMatcher {
void CreateFailureAndOutputEdges(); void CreateFailureAndOutputEdges();
// Adds all pattern IDs to |matches| which are a suffix of the string
// represented by |node|.
void AccumulateMatchesForNode(const AhoCorasickNode* node,
std::set<StringPattern::ID>* matches) const;
// The nodes of a Aho-Corasick tree. // The nodes of a Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_; std::vector<AhoCorasickNode> tree_;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment