Commit 684f4185 authored by Alexei Filippov, committed by Commit Bot

[heap profiler] Make use of thread_local instead of base::TLS

The C++ thread_local is slightly faster and makes the code clearer.
Besides that, calls to TlsGetValue on Windows may alter the result of
GetLastError, thus changing the behavior of the underlying code.

BUG=920440

Change-Id: Ic89632f4a54f35d58b93cdecfffc68fc1a94dac1
Reviewed-on: https://chromium-review.googlesource.com/c/1461681
Reviewed-by: Vlad Tsyrklevich <vtsyrklevich@chromium.org>
Reviewed-by: Erik Chen <erikchen@chromium.org>
Commit-Queue: Alexei Filippov <alph@chromium.org>
Cr-Commit-Position: refs/heads/master@{#632819}
parent 21c34339
......@@ -19,87 +19,49 @@
#include "base/sampling_heap_profiler/lock_free_address_hash_set.h"
#include "build/build_config.h"
#if defined(OS_POSIX)
#if defined(OS_MACOSX) || defined(OS_ANDROID)
#include <pthread.h>
#endif
#if defined(OS_WIN)
#include <windows.h>
#endif
namespace base {
using allocator::AllocatorDispatch;
namespace {
// PoissonAllocationSampler cannot use ThreadLocalStorage, as during thread
// exiting when TLS storage is already released, there might be a call to
// |free| which would trigger the profiler hook and would make it access TLS.
// It instead uses OS primitives directly. As it only stores POD types it
// does not need thread exit callbacks.
#if defined(OS_WIN)
using TLSKey = DWORD;
// Allocates a fresh Win32 TLS index and stores it in |*key|.
// CHECK-fails when ::TlsAlloc() reports TLS_OUT_OF_INDEXES.
void TLSInit(TLSKey* key) {
  const TLSKey allocated_index = ::TlsAlloc();
  *key = allocated_index;
  CHECK_NE(TLS_OUT_OF_INDEXES, *key);
}
uintptr_t TLSGetValue(const TLSKey& key) {
return reinterpret_cast<uintptr_t>(::TlsGetValue(key));
}
void TLSSetValue(const TLSKey& key, uintptr_t value) {
::TlsSetValue(key, reinterpret_cast<LPVOID>(value));
}
#else // defined(OS_WIN)
using TLSKey = pthread_key_t;
// Creates a pthread TLS key in |*key| with no destructor callback.
// CHECK-fails when pthread_key_create() returns a non-zero error code.
void TLSInit(TLSKey* key) {
  const int result = pthread_key_create(key, /*destructor=*/nullptr);
  CHECK_EQ(0, result);
}
// Returns the calling thread's value for the pthread key |key|, converted
// from the stored pointer to an integer.
uintptr_t TLSGetValue(const TLSKey& key) {
  void* slot_contents = pthread_getspecific(key);
  return reinterpret_cast<uintptr_t>(slot_contents);
}
// Stores |value| in the calling thread's slot for the pthread key |key|.
// The result of pthread_setspecific() is not inspected.
void TLSSetValue(const TLSKey& key, uintptr_t value) {
  void* slot_contents = reinterpret_cast<void*>(value);
  pthread_setspecific(key, slot_contents);
}
#endif
#if defined(OS_MACOSX) || defined(OS_ANDROID)
// On MacOS the implementation of libmalloc sometimes calls malloc recursively,
// The macOS implementation of libmalloc sometimes calls malloc recursively,
// delegating allocations between zones. That causes our hooks to be called
// twice. The scoped guard allows us to detect that.
#if defined(OS_MACOSX)
//
// Besides that the implementations of thread_local on macOS and Android
// seem to allocate memory lazily on the first access to thread_local variables.
// Make use of pthread TLS instead of C++ thread_local there.
// NOTE(review): this listing comes from a diff view and appears to show the
// TLSGetValue/TLSSetValue/TLSKey lines next to the pthread_* lines that
// replace them — confirm against the checked-in file.
class ReentryGuard {
public:
// Records in |allowed_| whether the per-thread slot was unset on entry, then
// marks the slot as entered.
ReentryGuard() : allowed_(!TLSGetValue(entered_key_)) {
TLSSetValue(entered_key_, true);
ReentryGuard() : allowed_(!pthread_getspecific(entered_key_)) {
pthread_setspecific(entered_key_, reinterpret_cast<void*>(true));
}
// Clears the per-thread slot, but only when this guard was the one that
// found it unset.
~ReentryGuard() {
if (LIKELY(allowed_))
TLSSetValue(entered_key_, false);
pthread_setspecific(entered_key_, nullptr);
}
// True when the per-thread slot was unset at construction time.
operator bool() { return allowed_; }
// Creates the TLS key backing the guard; CHECK-fails if key creation reports
// an error.
static void Init() { TLSInit(&entered_key_); }
static void Init() {
int error = pthread_key_create(&entered_key_, nullptr);
CHECK(!error);
}
private:
// Whether the per-thread slot was unset when this guard was constructed.
bool allowed_;
// Key of the TLS slot that holds the per-thread "entered" flag.
static TLSKey entered_key_;
static pthread_key_t entered_key_;
};
TLSKey ReentryGuard::entered_key_;
pthread_key_t ReentryGuard::entered_key_;
#else
......@@ -111,21 +73,44 @@ class ReentryGuard {
#endif
TLSKey g_internal_reentry_guard;
const size_t kDefaultSamplingIntervalBytes = 128 * 1024;
// Notes on TLS usage:
//
// * There's no safe way to use TLS in malloc() as neither C++ thread_local nor
// pthread provides any guarantee on whether it allocates or not.
// * We think that we can safely use thread_local w/o re-entrancy guard because
// the compiler will use "tls static access model" for static builds of
// Chrome [https://www.uclibc.org/docs/tls.pdf].
// But there's no guarantee that this will stay true, and in practice
// it seems to have problems on macOS/Android. These platforms do allocate
// on the very first access to a thread_local on each thread.
// * Directly using/warming-up platform TLS seems to work on all platforms,
// but is also not guaranteed to stay true. We make use of it for reentrancy
// guards on macOS/Android.
// * We cannot use Windows Tls[GS]etValue API as it modifies the result of
// GetLastError.
//
// Android thread_local seems to be using __emutls_get_address from libgcc:
// https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
// macOS version is based on _tlv_get_addr from dyld:
// https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
// The guard protects against reentering on platforms other than macOS and
// Android.
thread_local bool g_internal_reentry_guard;
// Accumulated bytes towards sample thread local key.
TLSKey g_accumulated_bytes_tls;
thread_local intptr_t g_accumulated_bytes_tls;
// A boolean used to distinguish first allocation on a thread.
// false - first allocation on the thread.
// true - otherwise
// A boolean used to distinguish first allocation on a thread:
// false - first allocation on the thread;
// true - otherwise.
// Since g_accumulated_bytes_tls is initialized with zero the very first
// allocation on a thread would always trigger the sample, thus skewing the
// profile towards such allocations. To mitigate that we use the flag to
// ensure the first allocation is properly accounted.
TLSKey g_sampling_interval_initialized_tls;
thread_local bool g_sampling_interval_initialized_tls;
// Controls if sample intervals should not be randomized. Used for testing.
bool g_deterministic;
......@@ -314,18 +299,18 @@ void PartitionFreeHook(void* address) {
} // namespace
// Sets the per-thread g_internal_reentry_guard flag. DCHECKs that the flag was
// not already set, i.e. these scopes are not expected to nest on one thread.
// NOTE(review): the TLSGetValue/TLSSetValue lines and the plain-variable lines
// look like the before/after sides of the same diff hunk — confirm.
PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
DCHECK(!TLSGetValue(g_internal_reentry_guard));
TLSSetValue(g_internal_reentry_guard, true);
DCHECK(!g_internal_reentry_guard);
g_internal_reentry_guard = true;
}
// Clears the per-thread g_internal_reentry_guard flag. DCHECKs that the flag
// is still set when the scope ends.
// NOTE(review): the TLSGetValue/TLSSetValue lines and the plain-variable lines
// look like the before/after sides of the same diff hunk — confirm.
PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
DCHECK(TLSGetValue(g_internal_reentry_guard));
TLSSetValue(g_internal_reentry_guard, false);
DCHECK(g_internal_reentry_guard);
g_internal_reentry_guard = false;
}
// static
// Returns the current value of the per-thread g_internal_reentry_guard flag,
// which is set while a ScopedMuteThreadSamples is alive on this thread.
// NOTE(review): the two return statements look like the before/after sides of
// the same diff hunk — confirm.
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
return TLSGetValue(g_internal_reentry_guard);
return g_internal_reentry_guard;
}
PoissonAllocationSampler* PoissonAllocationSampler::instance_;
......@@ -343,9 +328,6 @@ PoissonAllocationSampler::PoissonAllocationSampler() {
void PoissonAllocationSampler::Init() {
static bool init_once = []() {
ReentryGuard::Init();
TLSInit(&g_internal_reentry_guard);
TLSInit(&g_accumulated_bytes_tls);
TLSInit(&g_sampling_interval_initialized_tls);
return true;
}();
ignore_result(init_once);
......@@ -427,11 +409,11 @@ void PoissonAllocationSampler::RecordAlloc(void* address,
const char* context) {
if (UNLIKELY(!g_running.load(std::memory_order_relaxed)))
return;
intptr_t accumulated_bytes = TLSGetValue(g_accumulated_bytes_tls) + size;
g_accumulated_bytes_tls += size;
intptr_t accumulated_bytes = g_accumulated_bytes_tls;
if (LIKELY(accumulated_bytes < 0))
TLSSetValue(g_accumulated_bytes_tls, accumulated_bytes);
else
instance_->DoRecordAlloc(accumulated_bytes, size, address, type, context);
return;
instance_->DoRecordAlloc(accumulated_bytes, size, address, type, context);
}
void PoissonAllocationSampler::DoRecordAlloc(intptr_t accumulated_bytes,
......@@ -452,10 +434,10 @@ void PoissonAllocationSampler::DoRecordAlloc(intptr_t accumulated_bytes,
++samples;
} while (accumulated_bytes >= 0);
TLSSetValue(g_accumulated_bytes_tls, accumulated_bytes);
g_accumulated_bytes_tls = accumulated_bytes;
if (UNLIKELY(!TLSGetValue(g_sampling_interval_initialized_tls))) {
TLSSetValue(g_sampling_interval_initialized_tls, true);
if (UNLIKELY(!g_sampling_interval_initialized_tls)) {
g_sampling_interval_initialized_tls = true;
// This is the very first allocation on the thread. It always produces an
// extra sample because g_accumulated_bytes_tls is initialized with zero
// due to TLS semantics.
......@@ -464,7 +446,7 @@ void PoissonAllocationSampler::DoRecordAlloc(intptr_t accumulated_bytes,
return;
}
if (UNLIKELY(TLSGetValue(g_internal_reentry_guard)))
if (UNLIKELY(ScopedMuteThreadSamples::IsMuted()))
return;
ScopedMuteThreadSamples no_reentrancy_scope;
......@@ -491,7 +473,7 @@ void PoissonAllocationSampler::RecordFree(void* address) {
}
void PoissonAllocationSampler::DoRecordFree(void* address) {
if (UNLIKELY(TLSGetValue(g_internal_reentry_guard)))
if (UNLIKELY(ScopedMuteThreadSamples::IsMuted()))
return;
ScopedMuteThreadSamples no_reentrancy_scope;
AutoLock lock(mutex_);
......
......@@ -9,6 +9,7 @@
#include "base/allocator/allocator_shim.h"
#include "base/debug/alias.h"
#include "base/rand_util.h"
#include "base/threading/simple_thread.h"
#include "build/build_config.h"
#include "testing/gtest/include/gtest/gtest.h"
......@@ -200,4 +201,40 @@ TEST_F(SamplingHeapProfilerTest, DISABLED_SequentialLargeSmallStats) {
});
}
// Manual micro-benchmark for the per-call cost of
// PoissonAllocationSampler::RecordAlloc and RecordFree. It issues a fixed
// number of synthetic allocations followed by the matching frees, then prints
// the average time per call. The addresses passed in are just the loop counter
// cast to a pointer; no memory is actually allocated.
//
// Reference results recorded with this benchmark:
// Platform TLS: alloc+free[ns]: 22.184 alloc[ns]: 8.910 free[ns]: 13.274
// thread_local: alloc+free[ns]: 18.353 alloc[ns]: 5.021 free[ns]: 13.331
TEST_F(SamplingHeapProfilerTest, MANUAL_SamplerMicroBenchmark) {
  // With the sampling interval of 100KB it happens to record ~ every 450th
  // allocation in the browser process. We model this pattern here.
  constexpr size_t kSamplingInterval = 100000;
  constexpr size_t kAllocationSize = kSamplingInterval / 450;
  // Compile-time constant, matching its k-prefixed name.
  constexpr int kNumAllocations = 50000000;

  SamplesCollector collector(0);
  auto* sampler = PoissonAllocationSampler::Get();
  sampler->SetSamplingInterval(kSamplingInterval);
  sampler->AddSamplesObserver(&collector);

  // Phase 1: record allocations. Counting starts at 1 so that no record uses
  // a null address.
  base::TimeTicks t0 = base::TimeTicks::Now();
  for (int i = 1; i <= kNumAllocations; ++i) {
    sampler->RecordAlloc(
        reinterpret_cast<void*>(static_cast<intptr_t>(i)), kAllocationSize,
        PoissonAllocationSampler::AllocatorType::kMalloc, nullptr);
  }

  // Phase 2: record the matching frees.
  base::TimeTicks t1 = base::TimeTicks::Now();
  for (int i = 1; i <= kNumAllocations; ++i)
    sampler->RecordFree(reinterpret_cast<void*>(static_cast<intptr_t>(i)));
  base::TimeTicks t2 = base::TimeTicks::Now();

  // Multiplying by 1. forces floating-point division for the per-call
  // averages.
  printf(
      "alloc+free[ns]: %.3f alloc[ns]: %.3f free[ns]: %.3f "
      "alloc+free[mln/s]: %.1f total[ms]: %.1f\n",
      (t2 - t0).InNanoseconds() * 1. / kNumAllocations,
      (t1 - t0).InNanoseconds() * 1. / kNumAllocations,
      (t2 - t1).InNanoseconds() * 1. / kNumAllocations,
      kNumAllocations / (t2 - t0).InMicrosecondsF(),
      (t2 - t0).InMillisecondsF());

  sampler->RemoveSamplesObserver(&collector);
}
} // namespace base
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment