Commit b956afc8 authored by Alexei Filippov, committed by Commit Bot

Sampling Heap Profiler: Use TLS for accumulated bytes.

The TLS version has the same performance as the lock-free version,
and also has the following benefits:
  - simpler code
  - handles multithreaded allocations with higher accuracy

It does, however, have a potential corner-case issue when there are lots
of short-lived threads, each allocating a small amount of memory.
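
For context, a minimal sketch of the per-thread accumulation scheme
(illustrative only: the real code keeps the counter in a
base::ThreadLocalStorage::Slot rather than a thread_local, and randomizes
each interval via GetNextSampleInterval(); OnAlloc, RecordSample, and
kMeanIntervalBytes are made-up names):

  #include <cstddef>
  #include <cstdint>

  constexpr intptr_t kMeanIntervalBytes = 128 * 1024;

  // Holds <bytes accumulated> - <bytes needed for the next sample>, so it
  // stays negative until a sample is due. A fresh thread starts at 0, so
  // its very first allocation crosses the threshold, which is the
  // short-lived-threads corner case noted above.
  thread_local intptr_t tls_accumulated_bytes = 0;

  void RecordSample(size_t /*total_estimate*/, size_t /*size*/) {}

  void OnAlloc(size_t size) {
    tls_accumulated_bytes += static_cast<intptr_t>(size);
    if (tls_accumulated_bytes < 0)
      return;  // Fast path: one TLS read, an add, and a compare.

    // Slow path: charge whole intervals until the counter goes negative
    // again; each charged interval corresponds to one recorded sample.
    size_t samples = 0;
    do {
      tls_accumulated_bytes -= kMeanIntervalBytes;
      ++samples;
    } while (tls_accumulated_bytes >= 0);
    RecordSample(samples * kMeanIntervalBytes, size);
  }

The fast path needs no atomics or locks, which is where the simpler code
and per-thread accuracy come from.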

BUG=803276,812262

Change-Id: Ie868f07b99559d8cc95d134eed6592bffe1f63aa
Reviewed-on: https://chromium-review.googlesource.com/944052
Commit-Queue: Alexei Filippov <alph@chromium.org>
Reviewed-by: Pavel Feldman <pfeldman@chromium.org>
Reviewed-by: Primiano Tucci <primiano@chromium.org>
Reviewed-by: Erik Chen <erikchen@chromium.org>
Cr-Commit-Position: refs/heads/master@{#542635}
parent e9605182
@@ -11,11 +11,12 @@
 #include "base/allocator/buildflags.h"
 #include "base/allocator/partition_allocator/partition_alloc.h"
 #include "base/atomicops.h"
-#include "base/debug/alias.h"
 #include "base/debug/stack_trace.h"
+#include "base/macros.h"
 #include "base/no_destructor.h"
 #include "base/partition_alloc_buildflags.h"
 #include "base/rand_util.h"
+#include "base/threading/thread_local_storage.h"
 #include "build/build_config.h"
 
 namespace base {
@@ -46,20 +47,12 @@ Atomic32 g_operations_in_flight;
 // When set to true, threads should not enter lock-free paths.
 Atomic32 g_fast_path_is_closed;
 
-// Number of bytes left to form the sample being collected.
-AtomicWord g_bytes_left;
-
-// Current sample size to be accumulated. Basically:
-// <bytes accumulated toward sample> == g_current_interval - g_bytes_left
-AtomicWord g_current_interval;
-
 // Sampling interval parameter, the mean value for intervals between samples.
 AtomicWord g_sampling_interval = kDefaultSamplingIntervalBytes;
 
 // Last generated sample ordinal number.
 uint32_t g_last_sample_ordinal = 0;
 
-SamplingHeapProfiler* g_sampling_heap_profiler_instance;
-
 void (*g_hooks_install_callback)();
 Atomic32 g_hooks_installed;
@@ -167,6 +160,12 @@ void PartitionFreeHook(void* address) {
 #endif  // BUILDFLAG(USE_PARTITION_ALLOC) && !defined(OS_NACL)
 
+ThreadLocalStorage::Slot& AccumulatedBytesTLS() {
+  static base::NoDestructor<base::ThreadLocalStorage::Slot>
+      accumulated_bytes_tls;
+  return *accumulated_bytes_tls;
+}
+
 }  // namespace
 
 SamplingHeapProfiler::Sample::Sample(size_t size,
@@ -178,14 +177,23 @@ SamplingHeapProfiler::Sample::Sample(const Sample&) = default;
 SamplingHeapProfiler::Sample::~Sample() = default;
 
+SamplingHeapProfiler* SamplingHeapProfiler::instance_;
+
 SamplingHeapProfiler::SamplingHeapProfiler() {
-  g_sampling_heap_profiler_instance = this;
+  instance_ = this;
+}
+
+// static
+void SamplingHeapProfiler::InitTLSSlot() {
+  // Preallocate the TLS slot early, so it can't cause reentrancy issues
+  // when sampling is started.
+  ignore_result(AccumulatedBytesTLS().Get());
 }
 
 // static
 void SamplingHeapProfiler::InstallAllocatorHooksOnce() {
   static bool hook_installed = InstallAllocatorHooks();
-  base::debug::Alias(&hook_installed);
+  ignore_result(hook_installed);
 }
 
 // static
@@ -193,7 +201,7 @@ bool SamplingHeapProfiler::InstallAllocatorHooks() {
 #if BUILDFLAG(USE_ALLOCATOR_SHIM)
   base::allocator::InsertAllocatorDispatch(&g_allocator_dispatch);
 #else
-  base::debug::Alias(&g_allocator_dispatch);
+  ignore_result(g_allocator_dispatch);
   DLOG(WARNING)
       << "base::allocator shims are not available for memory sampling.";
 #endif  // BUILDFLAG(USE_ALLOCATOR_SHIM)
@@ -225,10 +233,6 @@ void SamplingHeapProfiler::SetHooksInstallCallback(
 uint32_t SamplingHeapProfiler::Start() {
   InstallAllocatorHooksOnce();
-  size_t next_interval =
-      GetNextSampleInterval(base::subtle::Acquire_Load(&g_sampling_interval));
-  base::subtle::Release_Store(&g_current_interval, next_interval);
-  base::subtle::Release_Store(&g_bytes_left, next_interval);
   base::subtle::Barrier_AtomicIncrement(&g_running, 1);
   return g_last_sample_ordinal;
 }
@@ -276,39 +280,29 @@ void SamplingHeapProfiler::RecordAlloc(void* address,
   if (UNLIKELY(!base::subtle::NoBarrier_Load(&g_running)))
     return;
 
-  // Lock-free algorithm decreases number of bytes left to form a sample.
-  // The thread that makes it to reach zero is responsible for recording
-  // a sample.
-  AtomicWord bytes_left = base::subtle::NoBarrier_AtomicIncrement(
-      &g_bytes_left, -static_cast<AtomicWord>(size));
-  if (LIKELY(bytes_left > 0))
-    return;
-
-  // Return if g_bytes_left was already zero or below before we decreased it.
-  // That basically means that another thread in fact crossed the threshold.
-  if (LIKELY(bytes_left + static_cast<AtomicWord>(size) <= 0))
-    return;
-
-  // Only one thread that crossed the threshold is running the code below.
-  // It is going to be recording the sample.
-
-  size_t accumulated = base::subtle::Acquire_Load(&g_current_interval);
-  size_t next_interval =
-      GetNextSampleInterval(base::subtle::NoBarrier_Load(&g_sampling_interval));
-
-  // Make sure g_current_interval is set before updating g_bytes_left.
-  base::subtle::Release_Store(&g_current_interval, next_interval);
-
-  // Put the next sampling interval to g_bytes_left, thus allowing threads to
-  // start accumulating bytes towards the next sample.
-  // Simultaneously extract the current value (which is negative or zero)
-  // and take it into account when calculating the number of bytes
-  // accumulated for the current sample.
-  accumulated -=
-      base::subtle::NoBarrier_AtomicExchange(&g_bytes_left, next_interval);
-
-  g_sampling_heap_profiler_instance->DoRecordAlloc(accumulated, size, address,
-                                                   kSkipBaseAllocatorFrames);
+  // TODO(alph): On MacOS it may call the hook several times for a single
+  // allocation. Handle the case.
+
+  intptr_t accumulated_bytes =
+      reinterpret_cast<intptr_t>(AccumulatedBytesTLS().Get());
+  accumulated_bytes += size;
+  if (LIKELY(accumulated_bytes < 0)) {
+    AccumulatedBytesTLS().Set(reinterpret_cast<void*>(accumulated_bytes));
+    return;
+  }
+
+  size_t mean_interval = base::subtle::NoBarrier_Load(&g_sampling_interval);
+  size_t samples = accumulated_bytes / mean_interval;
+  accumulated_bytes %= mean_interval;
+
+  do {
+    accumulated_bytes -= GetNextSampleInterval(mean_interval);
+    ++samples;
+  } while (accumulated_bytes >= 0);
+
+  AccumulatedBytesTLS().Set(reinterpret_cast<void*>(accumulated_bytes));
+
+  instance_->DoRecordAlloc(samples * mean_interval, size, address, skip_frames);
 }
 
 void SamplingHeapProfiler::RecordStackTrace(Sample* sample,
@@ -331,8 +325,6 @@ void SamplingHeapProfiler::DoRecordAlloc(size_t total_allocated,
                                          size_t size,
                                          void* address,
                                          uint32_t skip_frames) {
-  // TODO(alph): It's better to use a recursive mutex and move the check
-  // inside the critical section.
   if (entered_.Get())
     return;
   base::AutoLock lock(mutex_);
@@ -364,10 +356,10 @@ void SamplingHeapProfiler::RecordFree(void* address) {
   bool maybe_sampled = true;  // Pessimistically assume allocation was sampled.
   base::subtle::Barrier_AtomicIncrement(&g_operations_in_flight, 1);
   if (LIKELY(!base::subtle::NoBarrier_Load(&g_fast_path_is_closed)))
-    maybe_sampled = g_sampling_heap_profiler_instance->samples_.count(address);
+    maybe_sampled = instance_->samples_.count(address);
   base::subtle::Barrier_AtomicIncrement(&g_operations_in_flight, -1);
   if (maybe_sampled)
-    g_sampling_heap_profiler_instance->DoRecordFree(address);
+    instance_->DoRecordFree(address);
 }
 
 void SamplingHeapProfiler::DoRecordFree(void* address) {
@@ -398,15 +390,19 @@ void SamplingHeapProfiler::SuppressRandomnessForTest(bool suppress) {
 void SamplingHeapProfiler::AddSamplesObserver(SamplesObserver* observer) {
   base::AutoLock lock(mutex_);
   CHECK(!entered_.Get());
+  entered_.Set(true);
   observers_.push_back(observer);
+  entered_.Set(false);
 }
 
 void SamplingHeapProfiler::RemoveSamplesObserver(SamplesObserver* observer) {
   base::AutoLock lock(mutex_);
   CHECK(!entered_.Get());
+  entered_.Set(true);
   auto it = std::find(observers_.begin(), observers_.end(), observer);
   CHECK(it != observers_.end());
   observers_.erase(it);
+  entered_.Set(false);
 }
 
 std::vector<SamplingHeapProfiler::Sample> SamplingHeapProfiler::GetSamples(
...
@@ -49,6 +49,10 @@ class BASE_EXPORT SamplingHeapProfiler {
     virtual void SampleRemoved(uint32_t id) = 0;
   };
 
+  // Must be called early during the process initialization. It creates and
+  // reserves a TLS slot.
+  static void InitTLSSlot();
+
   // This is an entry point for plugging in an external allocator.
   // Profiler will invoke the provided callback upon initialization.
   // The callback should install hooks onto the corresponding memory allocator
@@ -94,6 +98,8 @@ class BASE_EXPORT SamplingHeapProfiler {
   std::unordered_map<void*, Sample> samples_;
   std::vector<SamplesObserver*> observers_;
 
+  static SamplingHeapProfiler* instance_;
+
   friend class base::NoDestructor<SamplingHeapProfiler>;
 
   DISALLOW_COPY_AND_ASSIGN(SamplingHeapProfiler);
...
@@ -5,6 +5,7 @@
 #include "base/sampling_heap_profiler/sampling_heap_profiler.h"
 
 #include <stdlib.h>
+#include <cinttypes>
 
 #include "base/allocator/allocator_shim.h"
 #include "base/debug/alias.h"
@@ -46,6 +47,7 @@ class SamplesCollector : public SamplingHeapProfiler::SamplesObserver {
 };
 
 TEST_F(SamplingHeapProfilerTest, CollectSamples) {
+  SamplingHeapProfiler::InitTLSSlot();
   SamplesCollector collector(10000);
   SamplingHeapProfiler* profiler = SamplingHeapProfiler::GetInstance();
   profiler->SuppressRandomnessForTest(true);
@@ -96,10 +98,14 @@ class MyThread2 : public SimpleThread {
 };
 
 void CheckAllocationPattern(void (*allocate_callback)()) {
+  SamplingHeapProfiler::InitTLSSlot();
   SamplingHeapProfiler* profiler = SamplingHeapProfiler::GetInstance();
   profiler->SuppressRandomnessForTest(false);
   profiler->SetSamplingInterval(10240);
-  for (int i = 0; i < 40; ++i) {
+  base::TimeTicks t0 = base::TimeTicks::Now();
+  std::map<size_t, size_t> sums;
+  const int iterations = 40;
+  for (int i = 0; i < iterations; ++i) {
     uint32_t id = profiler->Start();
     allocate_callback();
     std::vector<SamplingHeapProfiler::Sample> samples =
@@ -110,11 +116,23 @@ void CheckAllocationPattern(void (*allocate_callback)()) {
       buckets[sample.size] += sample.total;
     }
     for (auto& it : buckets) {
-      if (it.first == 400 || it.first == 700 || it.first == 20480)
-        printf("%u,", static_cast<uint32_t>(it.second));
+      if (it.first != 400 && it.first != 700 && it.first != 20480)
+        continue;
+      sums[it.first] += it.second;
+      printf("%zu,", it.second);
     }
     printf("\n");
   }
+  printf("Time taken %" PRIu64 "ms\n",
+         (base::TimeTicks::Now() - t0).InMilliseconds());
+  for (auto sum : sums) {
+    intptr_t expected = sum.first * kNumberOfAllocations;
+    intptr_t actual = sum.second / iterations;
+    printf("%zu:\tmean: %zu\trelative error: %.2f%%\n", sum.first, actual,
+           100. * (actual - expected) / expected);
+  }
 }
 
 // Manual tests to check precision of the sampling profiler.
...
@@ -74,6 +74,7 @@ class BrowserMainRunnerImpl : public BrowserMainRunner {
 
     const base::TimeTicks start_time_step1 = base::TimeTicks::Now();
 
+    base::SamplingHeapProfiler::InitTLSSlot();
     if (parameters.command_line.HasSwitch(switches::kSamplingHeapProfiler)) {
       base::SamplingHeapProfiler* profiler =
           base::SamplingHeapProfiler::GetInstance();
...
@@ -94,6 +94,7 @@ int RendererMain(const MainFunctionParams& parameters) {
   const base::CommandLine& command_line = parameters.command_line;
 
+  base::SamplingHeapProfiler::InitTLSSlot();
   if (command_line.HasSwitch(switches::kSamplingHeapProfiler)) {
     base::SamplingHeapProfiler* profiler =
         base::SamplingHeapProfiler::GetInstance();
...