Commit 46e1b077 authored by siggi, committed by Commit bot

Implement a ScopedThreadHeapUsage class to allow profiling per-thread heap usage.

This uses the generic allocator shim to hook into heap allocations.
When heap tracking is disabled and unused, it incurs no runtime penalty. When heap tracking is enabled, there is a small accounting overhead on every allocation and free.

Instantiating a ScopedThreadHeapUsage instance carries O(1) cost, whether or not heap tracking is enabled.
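
For illustration only, a minimal usage sketch against the API this CL adds (Initialize(), EnableHeapTracking(), CurrentUsage()). DoWorkThatAllocates() is a hypothetical placeholder, and EnableHeapTracking() assumes a build with the experimental allocator shim:

  #include "base/debug/scoped_thread_heap_usage.h"

  #include "base/logging.h"

  void ProfileSomeWork() {
    // One-time setup, typically done early during process startup.
    base::debug::ScopedThreadHeapUsage::Initialize();
    base::debug::ScopedThreadHeapUsage::EnableHeapTracking();

    base::debug::ScopedThreadHeapUsage scope;
    DoWorkThatAllocates();  // Hypothetical workload.

    // Usage accumulated on this thread since |scope| was created.
    base::debug::ScopedThreadHeapUsage::ThreadAllocatorUsage usage =
        base::debug::ScopedThreadHeapUsage::CurrentUsage();
    LOG(INFO) << "alloc ops: " << usage.alloc_ops
              << " net bytes: " << (usage.alloc_bytes - usage.free_bytes);
  }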

BUG=644385

Review-Url: https://codereview.chromium.org/2163783003
Cr-Commit-Position: refs/heads/master@{#417601}
parent c1183b81
...@@ -291,6 +291,8 @@ component("base") {
    "debug/proc_maps_linux.h",
    "debug/profiler.cc",
    "debug/profiler.h",
    "debug/scoped_thread_heap_usage.cc",
    "debug/scoped_thread_heap_usage.h",
    "debug/stack_trace.cc",
    "debug/stack_trace.h",
    "debug/stack_trace_android.cc",
...@@ -1771,6 +1773,7 @@ test("base_unittests") {
    "debug/debugger_unittest.cc",
    "debug/leak_tracker_unittest.cc",
    "debug/proc_maps_linux_unittest.cc",
    "debug/scoped_thread_heap_usage_unittest.cc",
    "debug/stack_trace_unittest.cc",
    "debug/task_annotator_unittest.cc",
    "deferred_sequenced_task_runner_unittest.cc",
...
...@@ -56,12 +56,19 @@ struct AllocatorDispatch {
                      void* address,
                      size_t size);
  using FreeFn = void(const AllocatorDispatch* self, void* address);
  // Returns the best available estimate for the actual amount of memory
  // consumed by the allocation |address|. If possible, this should include
  // heap overhead or at least a decent estimate of the full cost of the
  // allocation. If no good estimate is possible, returns zero.
  using GetSizeEstimateFn = size_t(const AllocatorDispatch* self,
                                   void* address);

  AllocFn* const alloc_function;
  AllocZeroInitializedFn* const alloc_zero_initialized_function;
  AllocAlignedFn* const alloc_aligned_function;
  ReallocFn* const realloc_function;
  FreeFn* const free_function;
  GetSizeEstimateFn* const get_size_estimate_function;

  const AllocatorDispatch* next;
...
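
As a hedged illustration of the "zero means no estimate" contract documented above, a consumer of the dispatch chain can fall back to the requested size when no estimate is available. EstimateOrRequested() is a hypothetical helper, not part of this CL, though the heap-usage tracker below applies the same logic in RecordAlloc():

  size_t EstimateOrRequested(const base::allocator::AllocatorDispatch* next,
                             void* ptr,
                             size_t requested_size) {
    // get_size_estimate_function returns 0 when no good estimate is possible.
    size_t estimate = next->get_size_estimate_function(next, ptr);
    return estimate != 0 ? estimate : requested_size;
  }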
...@@ -4,6 +4,8 @@
#include "base/allocator/allocator_shim.h"

#include <malloc.h>

// This translation unit defines a default dispatch for the allocator shim which
// routes allocations to libc functions.
// The code here is strongly inspired from tcmalloc's libc_override_glibc.h.
...@@ -40,13 +42,20 @@ void GlibcFree(const AllocatorDispatch*, void* address) {
  __libc_free(address);
}

size_t GlibcGetSizeEstimate(const AllocatorDispatch*, void* address) {
  // TODO(siggi, primiano): malloc_usable_size may need redirection in the
  // presence of interposing shims that divert allocations.
  return malloc_usable_size(address);
}

}  // namespace

const AllocatorDispatch AllocatorDispatch::default_dispatch = {
    &GlibcMalloc,          /* alloc_function */
    &GlibcCalloc,          /* alloc_zero_initialized_function */
    &GlibcMemalign,        /* alloc_aligned_function */
    &GlibcRealloc,         /* realloc_function */
    &GlibcFree,            /* free_function */
    &GlibcGetSizeEstimate, /* get_size_estimate_function */
    nullptr,               /* next */
};
...@@ -45,13 +45,20 @@ void RealFree(const AllocatorDispatch*, void* address) {
  __real_free(address);
}

size_t RealSizeEstimate(const AllocatorDispatch*, void*) {
  // TODO(primiano): This should be redirected to malloc_usable_size or
  // the like.
  return 0;
}

}  // namespace

const AllocatorDispatch AllocatorDispatch::default_dispatch = {
    &RealMalloc,       /* alloc_function */
    &RealCalloc,       /* alloc_zero_initialized_function */
    &RealMemalign,     /* alloc_aligned_function */
    &RealRealloc,      /* realloc_function */
    &RealFree,         /* free_function */
    &RealSizeEstimate, /* get_size_estimate_function */
    nullptr,           /* next */
};
...@@ -31,15 +31,20 @@ void TCFree(const AllocatorDispatch*, void* address) {
  tc_free(address);
}

size_t TCGetSizeEstimate(const AllocatorDispatch*, void* address) {
  return tc_malloc_size(address);
}

}  // namespace

const AllocatorDispatch AllocatorDispatch::default_dispatch = {
    &TCMalloc,          /* alloc_function */
    &TCCalloc,          /* alloc_zero_initialized_function */
    &TCMemalign,        /* alloc_aligned_function */
    &TCRealloc,         /* realloc_function */
    &TCFree,            /* free_function */
    &TCGetSizeEstimate, /* get_size_estimate_function */
    nullptr,            /* next */
};

// In the case of tcmalloc we have also to route the diagnostic symbols,
...
...@@ -47,10 +47,19 @@ void DefaultWinHeapFreeImpl(const AllocatorDispatch*, void* address) {
  base::allocator::WinHeapFree(address);
}

size_t DefaultWinHeapGetSizeEstimateImpl(const AllocatorDispatch*,
                                         void* address) {
  return base::allocator::WinHeapGetSizeEstimate(address);
}

}  // namespace

const AllocatorDispatch AllocatorDispatch::default_dispatch = {
    &DefaultWinHeapMallocImpl,
    &DefaultWinHeapCallocImpl,
    &DefaultWinHeapMemalignImpl,
    &DefaultWinHeapReallocImpl,
    &DefaultWinHeapFreeImpl,
    &DefaultWinHeapGetSizeEstimateImpl,
    nullptr, /* next */
};
...@@ -13,6 +13,8 @@
#include <new.h>
#include <windows.h>

#include "base/bits.h"

namespace base {
namespace allocator {
...@@ -51,6 +53,27 @@ void* WinHeapRealloc(void* ptr, size_t size) {
  return nullptr;
}
size_t WinHeapGetSizeEstimate(void* ptr) {
  if (!ptr)
    return 0;

  // Get the user size of the allocation.
  size_t size = HeapSize(get_heap_handle(), 0, ptr);

  // Account for the 8-byte HEAP_HEADER preceding the block.
  size += 8;

// Round up to the nearest allocation granularity, which is 8 for
// 32 bit machines, and 16 for 64 bit machines.
#if defined(ARCH_CPU_64_BITS)
  const size_t kAllocationGranularity = 16;
#else
  const size_t kAllocationGranularity = 8;
#endif

  return base::bits::Align(size, kAllocationGranularity);
}
// Call the new handler, if one has been set.
// Returns true on successfully calling the handler, false otherwise.
bool WinCallNewHandler(size_t size) {
...
...@@ -21,9 +21,13 @@ extern bool g_is_win_shim_layer_initialized;
// Thin wrappers to implement the standard C allocation semantics on the
// CRT's Windows heap.
void* WinHeapMalloc(size_t size);
void WinHeapFree(void* ptr);
void* WinHeapRealloc(void* ptr, size_t size);

// Returns a lower-bound estimate for the full amount of memory consumed by
// the allocation |ptr|.
size_t WinHeapGetSizeEstimate(void* ptr);

// Call the new handler, if one has been set.
// Returns true on successfully calling the handler, false otherwise.
bool WinCallNewHandler(size_t size);
...
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/debug/scoped_thread_heap_usage.h"
#include <stdint.h>
#include <algorithm>
#include <type_traits>
#include "base/allocator/allocator_shim.h"
#include "base/allocator/features.h"
#include "base/logging.h"
#include "base/threading/thread_local_storage.h"
#include "build/build_config.h"
#if defined(OS_MACOSX) || defined(OS_IOS)
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif
namespace base {
namespace debug {
namespace {
using base::allocator::AllocatorDispatch;
ThreadLocalStorage::StaticSlot g_thread_allocator_usage = TLS_INITIALIZER;
ScopedThreadHeapUsage::ThreadAllocatorUsage* const kInitializingSentinel =
reinterpret_cast<ScopedThreadHeapUsage::ThreadAllocatorUsage*>(-1);
bool g_heap_tracking_enabled = false;
// Forward declared as it needs to delegate memory allocation to the next
// lower shim.
ScopedThreadHeapUsage::ThreadAllocatorUsage* GetOrCreateThreadUsage();
size_t GetAllocSizeEstimate(const AllocatorDispatch* next, void* ptr) {
if (ptr == nullptr)
return 0U;
return next->get_size_estimate_function(next, ptr);
}
void RecordAlloc(const AllocatorDispatch* next, void* ptr, size_t size) {
ScopedThreadHeapUsage::ThreadAllocatorUsage* usage = GetOrCreateThreadUsage();
if (usage == nullptr)
return;
usage->alloc_ops++;
size_t estimate = GetAllocSizeEstimate(next, ptr);
if (size && estimate) {
usage->alloc_bytes += estimate;
usage->alloc_overhead_bytes += estimate - size;
// Only keep track of the net number of bytes allocated in the scope if the
// size estimate function returns sane values, e.g. non-zero.
uint64_t allocated_bytes = usage->alloc_bytes - usage->free_bytes;
if (allocated_bytes > usage->max_allocated_bytes)
usage->max_allocated_bytes = allocated_bytes;
} else {
usage->alloc_bytes += size;
}
}
void RecordFree(const AllocatorDispatch* next, void* ptr) {
ScopedThreadHeapUsage::ThreadAllocatorUsage* usage = GetOrCreateThreadUsage();
if (usage == nullptr)
return;
size_t estimate = GetAllocSizeEstimate(next, ptr);
usage->free_ops++;
usage->free_bytes += estimate;
}
void* AllocFn(const AllocatorDispatch* self, size_t size) {
void* ret = self->next->alloc_function(self->next, size);
if (ret != nullptr)
RecordAlloc(self->next, ret, size);
return ret;
}
void* AllocZeroInitializedFn(const AllocatorDispatch* self,
size_t n,
size_t size) {
void* ret = self->next->alloc_zero_initialized_function(self->next, n, size);
if (ret != nullptr)
RecordAlloc(self->next, ret, size);
return ret;
}
void* AllocAlignedFn(const AllocatorDispatch* self,
size_t alignment,
size_t size) {
void* ret = self->next->alloc_aligned_function(self->next, alignment, size);
if (ret != nullptr)
RecordAlloc(self->next, ret, size);
return ret;
}
void* ReallocFn(const AllocatorDispatch* self, void* address, size_t size) {
if (address != nullptr)
RecordFree(self->next, address);
void* ret = self->next->realloc_function(self->next, address, size);
if (ret != nullptr && size != 0)
RecordAlloc(self->next, ret, size);
return ret;
}
void FreeFn(const AllocatorDispatch* self, void* address) {
if (address != nullptr)
RecordFree(self->next, address);
self->next->free_function(self->next, address);
}
size_t GetSizeEstimateFn(const AllocatorDispatch* self, void* address) {
return self->next->get_size_estimate_function(self->next, address);
}
// The allocator dispatch used to intercept heap operations.
AllocatorDispatch allocator_dispatch = {
&AllocFn, &AllocZeroInitializedFn, &AllocAlignedFn, &ReallocFn,
&FreeFn, &GetSizeEstimateFn, nullptr};
ScopedThreadHeapUsage::ThreadAllocatorUsage* GetOrCreateThreadUsage() {
ScopedThreadHeapUsage::ThreadAllocatorUsage* allocator_usage =
static_cast<ScopedThreadHeapUsage::ThreadAllocatorUsage*>(
g_thread_allocator_usage.Get());
if (allocator_usage == kInitializingSentinel)
return nullptr; // Re-entrancy case.
if (allocator_usage == nullptr) {
// Prevent reentrancy due to the allocation below.
g_thread_allocator_usage.Set(kInitializingSentinel);
allocator_usage = new ScopedThreadHeapUsage::ThreadAllocatorUsage;
memset(allocator_usage, 0, sizeof(*allocator_usage));
g_thread_allocator_usage.Set(allocator_usage);
}
return allocator_usage;
}
} // namespace
ScopedThreadHeapUsage::ScopedThreadHeapUsage() {
// Initialize must be called before creating instances of this class.
CHECK(g_thread_allocator_usage.initialized());
ThreadAllocatorUsage* usage = GetOrCreateThreadUsage();
usage_at_creation_ = *usage;
// Reset the stats for our current scope.
// The per-thread usage instance now tracks this scope's usage, while this
// instance persists the outer scope's usage stats. On destruction, this
// instance will restore the outer scope's usage stats with this scope's usage
// added.
memset(usage, 0, sizeof(*usage));
static_assert(std::is_pod<ThreadAllocatorUsage>::value, "Must be POD.");
}
ScopedThreadHeapUsage::~ScopedThreadHeapUsage() {
DCHECK(thread_checker_.CalledOnValidThread());
ThreadAllocatorUsage* usage = GetOrCreateThreadUsage();
// Update the outer max.
if (usage->max_allocated_bytes) {
uint64_t outer_net_alloc_bytes =
usage_at_creation_.alloc_bytes - usage_at_creation_.free_bytes;
usage->max_allocated_bytes =
std::max(usage_at_creation_.max_allocated_bytes,
outer_net_alloc_bytes + usage->max_allocated_bytes);
}
usage->alloc_ops += usage_at_creation_.alloc_ops;
usage->alloc_bytes += usage_at_creation_.alloc_bytes;
usage->alloc_overhead_bytes += usage_at_creation_.alloc_overhead_bytes;
usage->free_ops += usage_at_creation_.free_ops;
usage->free_bytes += usage_at_creation_.free_bytes;
}
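
To make the high-water-mark merge above concrete, a hypothetical walk-through with made-up numbers: suppose that at the inner scope's creation the outer scope had alloc_bytes = 1000 and free_bytes = 400 (a net of 600 bytes) with max_allocated_bytes = 700, and that the inner scope peaked at max_allocated_bytes = 300. On destruction the inner peak is rebased onto the outer scope's net usage, so the restored maximum becomes max(700, 600 + 300) = 900, and the cumulative op and byte counters from before the scope are simply added back in.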
ScopedThreadHeapUsage::ThreadAllocatorUsage
ScopedThreadHeapUsage::CurrentUsage() {
ThreadAllocatorUsage* usage = GetOrCreateThreadUsage();
return *usage;
}
void ScopedThreadHeapUsage::Initialize() {
if (!g_thread_allocator_usage.initialized()) {
g_thread_allocator_usage.Initialize([](void* allocator_usage) {
delete static_cast<ScopedThreadHeapUsage::ThreadAllocatorUsage*>(
allocator_usage);
});
}
}
void ScopedThreadHeapUsage::EnableHeapTracking() {
CHECK_EQ(false, g_heap_tracking_enabled) << "No double-enabling.";
g_heap_tracking_enabled = true;
#if BUILDFLAG(USE_EXPERIMENTAL_ALLOCATOR_SHIM)
base::allocator::InsertAllocatorDispatch(&allocator_dispatch);
#else
CHECK(false) << "Can't enable heap tracking without the shim.";
#endif // BUILDFLAG(USE_EXPERIMENTAL_ALLOCATOR_SHIM)
}
void ScopedThreadHeapUsage::DisableHeapTrackingForTesting() {
#if BUILDFLAG(USE_EXPERIMENTAL_ALLOCATOR_SHIM)
base::allocator::RemoveAllocatorDispatchForTesting(&allocator_dispatch);
#else
CHECK(false) << "Can't disable heap tracking without the shim.";
#endif // BUILDFLAG(USE_EXPERIMENTAL_ALLOCATOR_SHIM)
DCHECK_EQ(true, g_heap_tracking_enabled) << "Heap tracking not enabled.";
g_heap_tracking_enabled = false;
}
base::allocator::AllocatorDispatch*
ScopedThreadHeapUsage::GetDispatchForTesting() {
return &allocator_dispatch;
}
} // namespace debug
} // namespace base
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_DEBUG_SCOPED_THREAD_HEAP_USAGE_H_
#define BASE_DEBUG_SCOPED_THREAD_HEAP_USAGE_H_
#include <stdint.h>
#include "base/allocator/features.h"
#include "base/base_export.h"
#include "base/threading/thread_checker.h"
namespace base {
namespace allocator {
struct AllocatorDispatch;
} // namespace allocator
namespace debug {
// By keeping a tally on heap operations, it's possible to track:
// - the number of alloc/free operations, where a realloc is zero or one
// of each, depending on the input parameters (see man realloc).
// - the number of bytes allocated/freed.
// - the number of estimated bytes of heap overhead used.
// - the high-watermark amount of bytes allocated in the scope.
// This in turn allows measuring the memory usage and memory usage churn over
// a scope. Scopes must be cleanly nested, and each scope must be
// destroyed on the thread where it's created.
//
// Note that this depends on the capabilities of the underlying heap shim. If
// that shim cannot yield a size estimate for an allocation, it's not possible
// to keep track of overhead, freed bytes, and the allocation high-water mark.
class BASE_EXPORT ScopedThreadHeapUsage {
public:
struct ThreadAllocatorUsage {
// The cumulative number of allocation operations.
uint64_t alloc_ops;
// The cumulative number of allocated bytes. Where available, this is
// inclusive of heap padding and estimated or actual heap overhead.
uint64_t alloc_bytes;
// Where available, the cumulative number of heap padding and heap overhead
// bytes.
uint64_t alloc_overhead_bytes;
// The cumulative number of free operations.
uint64_t free_ops;
// The cumulative number of bytes freed.
// Only recorded if the underlying heap shim can return the size of an
// allocation.
uint64_t free_bytes;
// The maximal value of alloc_bytes - free_bytes seen for this thread.
// Only recorded if the underlying heap shim supports returning the size of
// an allocation.
uint64_t max_allocated_bytes;
};
ScopedThreadHeapUsage();
~ScopedThreadHeapUsage();
const ThreadAllocatorUsage& usage_at_creation() const {
return usage_at_creation_;
}
// Returns this thread's allocator usage from the creation of the innermost
// enclosing ScopedThreadHeapUsage instance, if any. Note that this includes
// the allocator usage of all inner scopes.
static ThreadAllocatorUsage CurrentUsage();
// Initializes the TLS machinery this class uses. Must be called before
// creating instances of this class.
static void Initialize();
// Enables the heap intercept. May only be called once, and only if the heap
// shim is available, e.g. if BUILDFLAG(USE_EXPERIMENTAL_ALLOCATOR_SHIM) is
// true.
static void EnableHeapTracking();
protected:
// Exposed for testing only - note that it's safe to re-EnableHeapTracking()
// after calling this function in tests.
static void DisableHeapTrackingForTesting();
// Exposed to allow testing the shim without inserting it in the allocator
// shim chain.
static base::allocator::AllocatorDispatch* GetDispatchForTesting();
private:
static void EnsureTLSInitialized();
ThreadChecker thread_checker_;
// The allocator usage captured at creation of this instance.
ThreadAllocatorUsage usage_at_creation_;
};
} // namespace debug
} // namespace base
#endif // BASE_DEBUG_SCOPED_THREAD_HEAP_USAGE_H_
\ No newline at end of file
...@@ -77,13 +77,19 @@ void HookFree(const AllocatorDispatch* self, void* address) {
  next->free_function(next, address);
}

size_t HookGetSizeEstimate(const AllocatorDispatch* self, void* address) {
  const AllocatorDispatch* const next = self->next;
  return next->get_size_estimate_function(next, address);
}

AllocatorDispatch g_allocator_hooks = {
    &HookAlloc,           /* alloc_function */
    &HookZeroInitAlloc,   /* alloc_zero_initialized_function */
    &HookllocAligned,     /* alloc_aligned_function */
    &HookRealloc,         /* realloc_function */
    &HookFree,            /* free_function */
    &HookGetSizeEstimate, /* get_size_estimate_function */
    nullptr,              /* next */
};

#endif  // BUILDFLAG(USE_EXPERIMENTAL_ALLOCATOR_SHIM)
...