Commit 32fac919 authored by Stephen Nusko, committed by Commit Bot

Improve InterningIndex performance by switching to std::array.

Instead of MRUCache, which uses a sorted map under the hood, use two
std::arrays: one stores all the keys (very cache friendly when scanning
them) and lets us jump immediately to the matching value in the other
array.

This leads to a ~9% speed up (5% to 13% at the 95% confidence level). See the
"two_arrays tracing/tipOfTreeTracing" sheet in
https://docs.google.com/spreadsheets/d/1HIiyUQa7d8eLfMBBBm_7V60kqNABYoFbHpKdpwnYu78/

Total tracing overhead is ~59.8% (52% to 67% at the 95% confidence level). See the
"two_arrays tracing/tipOfTreeNotTracing" sheet in the doc above.

Bug: 1007611
Change-Id: Ie2584e0e3a13e34097021fd128387cfa4fabaeed
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1844822
Commit-Queue: Stephen Nusko <nuskos@chromium.org>
Reviewed-by: Stephen Nusko <nuskos@chromium.org>
Reviewed-by: Eric Seckler <eseckler@chromium.org>
Auto-Submit: Stephen Nusko <nuskos@chromium.org>
Cr-Commit-Position: refs/heads/master@{#703377}
parent 514fc11e
@@ -5,11 +5,11 @@
 #ifndef SERVICES_TRACING_PUBLIC_CPP_PERFETTO_INTERNING_INDEX_H_
 #define SERVICES_TRACING_PUBLIC_CPP_PERFETTO_INTERNING_INDEX_H_
 
+#include <array>
 #include <cstdint>
 #include <tuple>
 
 #include "base/component_export.h"
-#include "base/containers/mru_cache.h"
 
 namespace tracing {
@@ -17,7 +17,7 @@ namespace tracing {
 using InterningID = uint32_t;
 
 struct COMPONENT_EXPORT(TRACING_CPP) InterningIndexEntry {
-  InterningID id;
+  InterningID id = 0;
 
   // Whether the entry was emitted since the last reset of emitted state. If
   // |false|, the sink should (re)emit the entry in the current TracePacket.
@@ -25,29 +25,80 @@ struct COMPONENT_EXPORT(TRACING_CPP) InterningIndexEntry {
   // We don't remove entries on reset of emitted state, so that we can continue
   // to use their original IDs and avoid unnecessarily incrementing the ID
   // counter.
-  bool was_emitted;
+  bool was_emitted = false;
 };
 
 // Interning index that associates interned values with interning IDs. It can
 // track entries of different types within the same ID space, e.g. so that both
 // copied strings and pointers to static strings can co-exist in the same index.
-// Uses base::MRUCaches to track the ID associations while enforcing an upper
-// bound on the index size.
-template <typename... ValueTypes>
+//
+// The index will cache up to |N| values per ValueType after which it will start
+// replacing them in a FIFO basis. N must be a power of 2 for performance
+// reasons.
+template <size_t N, typename... ValueTypes>
 class COMPONENT_EXPORT(TRACING_CPP) InterningIndex {
  public:
+  // IndexCache is just a pair of std::arrays which are kept in sync with each
+  // other. This has an advantage over a std::array<pairs> since we can load a
+  // bunch of keys to search in one cache line without loading values.
   template <typename ValueType>
-  using IndexCache = base::MRUCache<ValueType, InterningIndexEntry>;
+  class IndexCache {
+   public:
+    IndexCache() {
+      // Assert that N is a power of two. This allows the "% N" in Insert() to
+      // compile to a bunch of bit shifts which improves performance over an
+      // arbitrary modulo division.
+      static_assert(
+          N && ((N & (N - 1)) == 0),
+          "InterningIndex requires that the cache size be a power of 2.");
+    }
+
+    void Clear() {
+      for (auto& val : keys_) {
+        val = ValueType{};
+      }
+    }
+
+    typename std::array<InterningIndexEntry, N>::iterator Find(
+        const ValueType& value) {
+      auto it = std::find(keys_.begin(), keys_.end(), value);
+      // If the find above returns it == keys_.end(), then (it - keys_.begin())
+      // will equal keys_.size() which is equal to values_.size(). So
+      // values_.begin() + values_.size() will be values_.end() and we've
+      // returned the correct value. This saves us checking a conditional in
+      // this function.
+      return values_.begin() + (it - keys_.begin());
+    }
+
+    typename std::array<InterningIndexEntry, N>::iterator Insert(
+        const ValueType& key,
+        InterningIndexEntry&& value) {
+      size_t new_position = current_index_++ % N;
+      keys_[new_position] = key;
+      values_[new_position] = std::move(value);
+      return values_.begin() + new_position;
+    }
+
+    typename std::array<InterningIndexEntry, N>::iterator begin() {
+      return values_.begin();
+    }
+    typename std::array<InterningIndexEntry, N>::iterator end() {
+      return values_.end();
+    }
+
+   private:
+    size_t current_index_ = 0;
+    std::array<ValueType, N> keys_{{}};
+    std::array<InterningIndexEntry, N> values_{{}};
+  };
 
-  // Construct a new index with caches for each of the ValueTypes. The cache
-  // size for the n-th ValueType will be limited to max_entry_counts[n] entries.
+  // Construct a new index with caches for each of the ValueTypes. Every cache
+  // is |N| elements.
   //
-  // For example, to construct an index containing at most 1000 char* pointers
-  // and 100 std::string objects:
-  //   InterningIndex<char*, std::string> index(1000, 100);
-  template <typename... SizeType>
-  InterningIndex(SizeType... max_entry_counts)
-      : entry_caches_(max_entry_counts...) {}
+  // For example, to construct an index containing at most 1024 char* pointers
+  // and 1024 std::string objects:
+  //   InterningIndex<1024, char*, std::string> index;
+  InterningIndex() = default;
 
   // Returns the entry for the given interned |value|, adding it to the index if
   // it didn't exist previously or was evicted from the index. Entries may be
@@ -60,14 +111,14 @@ class COMPONENT_EXPORT(TRACING_CPP) InterningIndex {
   InterningIndexEntry LookupOrAdd(const ValueType& value) {
     IndexCache<ValueType>& cache =
         std::get<IndexCache<ValueType>>(entry_caches_);
-    auto it = cache.Get(value);
+    auto it = cache.Find(value);
     if (it == cache.end()) {
-      it = cache.Put(value, InterningIndexEntry{next_id_++, false});
+      it = cache.Insert(value, InterningIndexEntry{next_id_++, false});
     }
-    bool was_emitted = it->second.was_emitted;
+    bool was_emitted = it->was_emitted;
     // The caller will (re)emit the entry, so mark it as emitted.
-    it->second.was_emitted = true;
-    return InterningIndexEntry{it->second.id, was_emitted};
+    it->was_emitted = true;
+    return InterningIndexEntry{it->id, was_emitted};
   }
 
   // Marks all entries as "not emitted", so that they will be reemitted when
@@ -103,7 +154,7 @@ class COMPONENT_EXPORT(TRACING_CPP) InterningIndex {
       cache.Clear();
     } else {
       for (auto& entry : cache) {
-        entry.second.was_emitted = false;
+        entry.was_emitted = false;
       }
     }
   }
...
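
For reference, a hedged usage sketch of the reworked InterningIndex defined in
the header above. The include path is inferred from the header guard, the 1024
size and value types mirror declarations later in this CL, and InternName is a
hypothetical helper, not part of the change. The cache size is now a
compile-time template parameter (a power of two), so the per-type constructor
arguments from the MRUCache-based version disappear.

// Usage sketch only; assumes the header path matches its include guard.
#include <string>

#include "services/tracing/public/cpp/perfetto/interning_index.h"

namespace tracing {

// Compile-time cache size (must be a power of two) plus the interned types.
using NameIndex = InterningIndex<1024, const char*, std::string>;

// Hypothetical helper: interns a static string and returns its ID.
InterningID InternName(NameIndex& index, const char* name) {
  // LookupOrAdd returns the entry's ID plus whether it has been emitted since
  // the last reset of emitted state; the caller (re)emits the interned data
  // when |was_emitted| is false.
  InterningIndexEntry entry = index.LookupOrAdd(name);
  return entry.id;
}

}  // namespace tracing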
@@ -169,12 +169,6 @@ TrackEventThreadLocalEventSink::TrackEventThreadLocalEventSink(
     : ThreadLocalEventSink(std::move(trace_writer),
                            session_id,
                            disable_interning),
-      // TODO(eseckler): Tune these values experimentally.
-      interned_event_categories_(1000),
-      interned_event_names_(1000, 100),
-      interned_annotation_names_(1000, 100),
-      interned_source_locations_(1000),
-      interned_log_message_bodies_(100),
       process_id_(TraceLog::GetInstance()->process_id()),
       thread_id_(static_cast<int>(base::PlatformThread::CurrentId())),
       privacy_filtering_enabled_(proto_writer_filtering_enabled) {
...
@@ -69,12 +69,13 @@ class COMPONENT_EXPORT(TRACING_CPP) TrackEventThreadLocalEventSink
   // TODO(eseckler): Make it possible to register new indexes for use from
   // TRACE_EVENT macros.
-  InterningIndex<const char*> interned_event_categories_;
-  InterningIndex<const char*, std::string> interned_event_names_;
-  InterningIndex<const char*, std::string> interned_annotation_names_;
-  InterningIndex<std::tuple<const char*, const char*, int>>
+  // TODO(eseckler): Tune the cache sizes experimentally.
+  InterningIndex<1024, const char*> interned_event_categories_;
+  InterningIndex<1024, const char*, std::string> interned_event_names_;
+  InterningIndex<1024, const char*, std::string> interned_annotation_names_;
+  InterningIndex<1024, std::tuple<const char*, const char*, int>>
       interned_source_locations_;
-  InterningIndex<std::string> interned_log_message_bodies_;
+  InterningIndex<128, std::string> interned_log_message_bodies_;
 
   static std::atomic<uint32_t> incremental_state_reset_id_;
...
@@ -78,14 +78,15 @@ class COMPONENT_EXPORT(TRACING_CPP) TracingSamplerProfiler {
   const base::PlatformThreadId sampled_thread_id_;
   base::Lock trace_writer_lock_;
   std::unique_ptr<perfetto::TraceWriter> trace_writer_;
-  InterningIndex<size_t> interned_callstacks_{1000};
-  InterningIndex<std::pair<std::string, std::string>,
+  InterningIndex<1024, size_t> interned_callstacks_{};
+  InterningIndex<1024,
+                 std::pair<std::string, std::string>,
                  std::pair<uintptr_t, std::string>>
-      interned_frames_{1000, 1000};
-  InterningIndex<std::string> interned_frame_names_{1000};
-  InterningIndex<std::string> interned_module_names_{1000};
-  InterningIndex<std::string> interned_module_ids_{1000};
-  InterningIndex<uintptr_t> interned_modules_{1000};
+      interned_frames_{};
+  InterningIndex<1024, std::string> interned_frame_names_{};
+  InterningIndex<1024, std::string> interned_module_names_{};
+  InterningIndex<1024, std::string> interned_module_ids_{};
+  InterningIndex<1024, uintptr_t> interned_modules_{};
   bool reset_incremental_state_ = true;
   uint32_t last_incremental_state_reset_id_ = 0;
   int32_t last_emitted_process_priority_ = -1;
...
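
The static_assert in IndexCache's constructor (interning_index.h above)
enforces a power-of-two size so that the "% N" in Insert() can be lowered to a
cheap bitwise operation instead of a general division. A small standalone
check of that equivalence, purely illustrative and not part of the CL:

// Illustration only: for N a power of two, x % N == x & (N - 1), which is why
// the cache size is restricted to powers of two.
#include <cstddef>

constexpr bool IsPowerOfTwo(size_t n) {
  return n && ((n & (n - 1)) == 0);
}

static_assert(IsPowerOfTwo(1024), "1024 is an acceptable cache size");
static_assert(!IsPowerOfTwo(1000), "1000 would trip the assert in IndexCache");
static_assert(12345u % 1024u == (12345u & (1024u - 1)),
              "modulo by a power of two reduces to a bit mask");

int main() {
  return 0;
}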