Commit 0eacd613 authored by Benoit Lize, committed by Commit Bot

[PartitionAlloc] Add batched deallocation to ThreadCache::Purge().

Acquire the central lock once per bucket in ThreadCache::Purge(), rather
than once per freed entry. Beyond being faster, this also opens the door
to amortizing thread cache deallocation later.
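
To illustrate the pattern, here is a minimal, self-contained sketch of
batched purging (hypothetical names, not the actual PartitionAlloc API):
instead of taking the central lock once per freed entry, take it once per
bucket and drain the whole freelist under that single acquisition.

#include <cstddef>
#include <mutex>

struct FreelistEntry {
  FreelistEntry* next;
};

struct Bucket {
  FreelistEntry* freelist_head = nullptr;
  size_t count = 0;
};

std::mutex g_central_lock;  // Stands in for the central allocator's lock.

// Placeholder: returns one entry to the central allocator; the caller must
// already hold g_central_lock.
void CentralFreeLocked(FreelistEntry* /*entry*/) {}

void PurgeBucket(Bucket& bucket) {
  if (!bucket.count)
    return;  // Empty bucket: never touch the lock.

  // One acquisition covers the whole bucket, rather than one per entry.
  std::lock_guard<std::mutex> guard(g_central_lock);
  while (bucket.freelist_head) {
    FreelistEntry* entry = bucket.freelist_head;
    bucket.freelist_head = entry->next;
    CentralFreeLocked(entry);
    bucket.count--;
  }
}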

Bug: 998048
Change-Id: I95805fe28f8df557d5783684c172618a304985bd
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2540596
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Yuki Shiino <yukishiino@chromium.org>
Cr-Commit-Position: refs/heads/master@{#828219}
parent b8321dcd
@@ -276,8 +276,8 @@ struct BASE_EXPORT PartitionRoot {
   static uint16_t SizeToBucketIndex(size_t size);
 
   // Frees memory, with |ptr| as returned by |RawAlloc()|.
+  ALWAYS_INLINE void RawFree(void* ptr);
   ALWAYS_INLINE void RawFree(void* ptr, SlotSpan* slot_span);
-  static void RawFreeStatic(void* ptr);
 
   internal::ThreadCache* thread_cache_for_testing() const {
     return with_thread_cache ? internal::ThreadCache::Get() : nullptr;
@@ -378,6 +378,7 @@ struct BASE_EXPORT PartitionRoot {
       internal::SlotSpanMetadata<thread_safe>* slot_span,
       size_t requested_size) EXCLUSIVE_LOCKS_REQUIRED(lock_);
   void DecommitEmptySlotSpans() EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  ALWAYS_INLINE void RawFreeLocked(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   friend class internal::ThreadCache;
 };
@@ -693,6 +694,12 @@ ALWAYS_INLINE void PartitionRoot<thread_safe>::FreeNoHooksImmediate(
   RawFree(ptr, slot_span);
 }
 
+template <bool thread_safe>
+ALWAYS_INLINE void PartitionRoot<thread_safe>::RawFree(void* ptr) {
+  SlotSpan* slot_span = SlotSpan::FromPointerNoAlignmentCheck(ptr);
+  RawFree(ptr, slot_span);
+}
+
 template <bool thread_safe>
 ALWAYS_INLINE void PartitionRoot<thread_safe>::RawFree(void* ptr,
                                                        SlotSpan* slot_span) {
@@ -704,12 +711,12 @@ ALWAYS_INLINE void PartitionRoot<thread_safe>::RawFree(void* ptr,
   deferred_unmap.Run();
 }
 
-// static
 template <bool thread_safe>
-void PartitionRoot<thread_safe>::RawFreeStatic(void* ptr) {
+ALWAYS_INLINE void PartitionRoot<thread_safe>::RawFreeLocked(void* ptr) {
   SlotSpan* slot_span = SlotSpan::FromPointerNoAlignmentCheck(ptr);
-  auto* root = FromSlotSpan(slot_span);
-  root->RawFree(ptr, slot_span);
+  auto deferred_unmap = slot_span->Free(ptr);
+  PA_DCHECK(!deferred_unmap.ptr);  // Only used with bucketed allocations.
+  deferred_unmap.Run();
 }
 
 // static
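
The RawFree()/RawFreeLocked() pair above follows the usual "Locked" suffix
convention: the plain entry point takes the lock itself, while the Locked
variant asserts, via EXCLUSIVE_LOCKS_REQUIRED, that the caller already holds
it. A rough sketch of that split with illustrative names (not the real
PartitionAlloc types; the annotation is shown only as a comment):

#include <mutex>

class CentralAllocator {
 public:
  // Unlocked entry point: pays one lock acquisition per deallocation.
  void Free(void* ptr) {
    std::lock_guard<std::mutex> guard(lock_);
    FreeLocked(ptr);
  }

  // Precondition: |lock_| is already held by the caller. Chromium encodes
  // this contract with Clang's EXCLUSIVE_LOCKS_REQUIRED annotation so that
  // it is checked at compile time.
  void FreeLocked(void* /*ptr*/) {
    // ... return the entry to its slot span's freelist ...
  }

 private:
  std::mutex lock_;
};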
@@ -23,11 +23,6 @@ ThreadCacheRegistry g_instance;
 BASE_EXPORT PartitionTlsKey g_thread_cache_key;
 
 namespace {
-void DeleteThreadCache(void* tcache_ptr) {
-  reinterpret_cast<ThreadCache*>(tcache_ptr)->~ThreadCache();
-  PartitionRoot<ThreadSafe>::RawFreeStatic(tcache_ptr);
-}
-
 // Since |g_thread_cache_key| is shared, make sure that no more than one
 // PartitionRoot can use it.
 static std::atomic<bool> g_has_instance;
@@ -110,7 +105,7 @@ void ThreadCacheRegistry::PurgeAll() {
 void ThreadCache::Init(PartitionRoot<ThreadSafe>* root) {
   PA_CHECK(root->buckets[kBucketCount - 1].slot_size == kSizeThreshold);
 
-  bool ok = PartitionTlsCreate(&g_thread_cache_key, DeleteThreadCache);
+  bool ok = PartitionTlsCreate(&g_thread_cache_key, Delete);
   PA_CHECK(ok);
 
   // Make sure that only one PartitionRoot wants a thread cache.
@@ -160,6 +155,14 @@ ThreadCache::~ThreadCache() {
   Purge();
 }
 
+// static
+void ThreadCache::Delete(void* tcache_ptr) {
+  auto* tcache = reinterpret_cast<ThreadCache*>(tcache_ptr);
+  auto* root = tcache->root_;  // Read before destroying |tcache|.
+  tcache->~ThreadCache();
+  root->RawFree(tcache_ptr);
+}
+
 void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
   stats->alloc_count += stats_.alloc_count;
   stats->alloc_hits += stats_.alloc_hits;
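
ThreadCache::Delete() above is the destructor that Init() registers for the
TLS slot via PartitionTlsCreate(), so it runs at thread exit. Note the order:
|root_| is read before the in-place destructor call, because the object must
not be touched afterwards, and the raw storage is then released through the
root. A rough sketch of the same flow using the POSIX analogue,
pthread_key_create() (illustrative names; it assumes the cache was
placement-constructed in storage from ::operator new):

#include <pthread.h>
#include <new>

struct ThreadLocalCache {
  // ... per-thread freelists; the real ThreadCache purges in its destructor.
};

static pthread_key_t g_cache_key;

static void DeleteCache(void* ptr) {
  auto* cache = static_cast<ThreadLocalCache*>(ptr);
  cache->~ThreadLocalCache();  // Destroy in place...
  ::operator delete(ptr);      // ...then free the raw storage. The real code
                               // frees through root->RawFree(ptr) instead.
}

void InitCacheKey() {
  // DeleteCache() is invoked with the slot's value when a thread exits.
  pthread_key_create(&g_cache_key, &DeleteCache);
}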
@@ -190,12 +193,18 @@ void ThreadCache::SetShouldPurge() {
 void ThreadCache::Purge() {
   for (Bucket& bucket : buckets_) {
     size_t count = bucket.count;
+    if (!count)
+      continue;
+
+    // Acquire the lock once per bucket. This avoids holding it for too long,
+    // and frees from the same bucket are likely to hit the same cache lines
+    // in the central allocator.
+    internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
     while (bucket.freelist_head) {
       auto* entry = bucket.freelist_head;
       bucket.freelist_head = entry->GetNext();
-      PartitionRoot<ThreadSafe>::RawFreeStatic(entry);
+      root_->RawFreeLocked(entry);
       count--;
     }
     CHECK_EQ(0u, count);
@@ -162,6 +162,7 @@ class BASE_EXPORT ThreadCache {
  private:
   explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
+  static void Delete(void* thread_cache_ptr);
 
   struct Bucket {
     size_t count;
@@ -184,7 +185,7 @@ class BASE_EXPORT ThreadCache {
   std::atomic<bool> should_purge_;
   Bucket buckets_[kBucketCount];
   ThreadCacheStats stats_;
-  PartitionRoot<ThreadSafe>* root_;
+  PartitionRoot<ThreadSafe>* const root_;
 
   // Intrusive list since ThreadCacheRegistry::RegisterThreadCache() cannot
   // allocate.
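
One last detail: the placement of const in the change above matters.
PartitionRoot<ThreadSafe>* const root_ makes the pointer itself immutable
after construction, documenting that a ThreadCache is tied to a single root
for its whole lifetime; it is not a pointer to a const root. A short
illustration (hypothetical demo code):

void ConstPointerDemo() {
  int x = 0;
  int* const p = &x;  // Const pointer to mutable int.
  *p = 1;             // OK: the pointee can change.
  // p = &x;          // Error: |p| itself cannot be reseated.
}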