Commit 2a5ddc90 authored by Benoit Lize, committed by Commit Bot

[PartitionAlloc] Batch thread cache deallocations.

The thread cache has a hard limit on how many objects are cached per
bucket. Previously, every cache fill above the limit would go back to
the central allocator, so the following scenario:
1. 1000 malloc(10) calls
2. 1000 free(ptr) calls (of the previously allocated memory)

would acquire the lock 1000 - kMaxCountPerBucket times on the
deallocation path. By batching deallocations, the lock is only acquired
(1000 - kMaxCountPerBucket) / (kMaxCountPerBucket / 2)
  = 2000 / kMaxCountPerBucket - 2 times.
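
(For illustration only, assuming kMaxCountPerBucket were 128 -- the actual
value may differ -- the old path would acquire the lock 1000 - 128 = 872
times, while the batched path needs (1000 - 128) / 64, i.e. roughly 14
acquisitions.)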

It may also reduce cache footprint for these scenarios, since the bucket
is rarely full. Note that the allocation path is not affected; batching
on that side may be investigated as well, but not without data showing
that empty buckets are frequent.
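
For intuition, a minimal standalone sketch of the batching idea follows.
It uses hypothetical names and a stand-in std::mutex-protected central
allocator; it is not the actual PartitionAlloc code, only an illustration
of the pattern the change introduces (ClearBucket below is the real one).

  // Illustrative sketch only -- hypothetical names, not PartitionAlloc code.
  // A per-thread bucket caches freed objects; once it reaches its limit,
  // half of it is returned to the central allocator under one lock.
  #include <cstddef>
  #include <mutex>
  #include <vector>

  struct CentralAllocator {
    std::mutex lock;
    std::vector<void*> free_objects;

    // Must be called with |lock| held.
    void FreeLocked(void* ptr) { free_objects.push_back(ptr); }
  };

  class ThreadLocalBucket {
   public:
    static constexpr size_t kMaxCount = 128;  // Assumed limit, for illustration.

    explicit ThreadLocalBucket(CentralAllocator& central) : central_(central) {}

    void Free(void* ptr) {
      freelist_.push_back(ptr);
      // Batched deallocation: once the bucket is full, flush half of it at
      // once, amortizing one lock acquisition over kMaxCount / 2 objects.
      if (freelist_.size() >= kMaxCount)
        Clear(kMaxCount / 2);
    }

   private:
    // Drains the bucket down to |limit| objects under one lock acquisition.
    void Clear(size_t limit) {
      if (freelist_.size() <= limit)
        return;
      std::lock_guard<std::mutex> guard(central_.lock);
      while (freelist_.size() > limit) {
        central_.FreeLocked(freelist_.back());
        freelist_.pop_back();
      }
    }

    CentralAllocator& central_;
    std::vector<void*> freelist_;
  };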

Bug: 998048
Change-Id: I626205e28d7a082a12303a493ca504e9581fc43a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2546599
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Kentaro Hara <haraken@chromium.org>
Reviewed-by: Yuki Shiino <yukishiino@chromium.org>
Cr-Commit-Position: refs/heads/master@{#829208}
parent f8a14659
@@ -26,9 +26,7 @@ struct ThreadCacheStats {
   // Cache fill details:
   uint64_t cache_fill_count;
   uint64_t cache_fill_hits;
-  uint64_t cache_fill_misses;
-  uint64_t cache_fill_bucket_full;
-  uint64_t cache_fill_too_large;
+  uint64_t cache_fill_misses;  // Object too large.
 
   // Memory cost:
   uint64_t bucket_total_memory;
...
@@ -163,6 +163,26 @@ void ThreadCache::Delete(void* tcache_ptr) {
   root->RawFree(tcache_ptr);
 }
 
+void ThreadCache::ClearBucket(ThreadCache::Bucket& bucket, size_t limit) {
+  // Avoids acquiring the lock needlessly.
+  if (!bucket.count)
+    return;
+
+  // Acquire the lock once for the bucket. Allocations from the same bucket are
+  // likely to be hitting the same cache lines in the central allocator, and
+  // lock acquisitions can be expensive.
+  internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
+  while (bucket.count > limit) {
+    auto* entry = bucket.freelist_head;
+    PA_DCHECK(entry);
+    bucket.freelist_head = entry->GetNext();
+    root_->RawFreeLocked(entry);
+    bucket.count--;
+  }
+  PA_DCHECK(bucket.count == limit);
+}
+
 void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
   stats->alloc_count += stats_.alloc_count;
   stats->alloc_hits += stats_.alloc_hits;
@@ -174,8 +194,6 @@ void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
   stats->cache_fill_count += stats_.cache_fill_count;
   stats->cache_fill_hits += stats_.cache_fill_hits;
   stats->cache_fill_misses += stats_.cache_fill_misses;
-  stats->cache_fill_bucket_full += stats_.cache_fill_bucket_full;
-  stats->cache_fill_too_large += stats_.cache_fill_too_large;
 
   for (size_t i = 0; i < kBucketCount; i++) {
     stats->bucket_total_memory +=
@@ -191,25 +209,9 @@ void ThreadCache::SetShouldPurge() {
 }
 
 void ThreadCache::Purge() {
-  for (Bucket& bucket : buckets_) {
-    size_t count = bucket.count;
-    if (!count)
-      continue;
-
-    // Acquire the lock once per bucket. This avoids acquiring it for too long,
-    // and also allocations from the same bucket are likely to be hitting the
-    // same cache lines in the central allocator.
-    internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
-    while (bucket.freelist_head) {
-      auto* entry = bucket.freelist_head;
-      bucket.freelist_head = entry->GetNext();
-      root_->RawFreeLocked(entry);
-      count--;
-    }
-    CHECK_EQ(0u, count);
-    bucket.count = 0;
-  }
+  for (auto& bucket : buckets_)
+    ClearBucket(bucket, 0);
 
   should_purge_.store(false, std::memory_order_relaxed);
 }
...
@@ -159,14 +159,16 @@ class BASE_EXPORT ThreadCache {
   }
 
  private:
-  explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
-  static void Delete(void* thread_cache_ptr);
-
   struct Bucket {
     size_t count;
     PartitionFreelistEntry* freelist_head;
   };
 
+  explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
+  static void Delete(void* thread_cache_ptr);
+
+  // Empties the |bucket| until there are at most |limit| objects in it.
+  void ClearBucket(Bucket& bucket, size_t limit);
+
   // TODO(lizeb): Optimize the threshold.
   static constexpr size_t kSizeThreshold = 512;
   static constexpr size_t kBucketCount =
@@ -206,19 +208,12 @@ ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
   INCREMENT_COUNTER(stats_.cache_fill_count);
 
   if (bucket_index >= kBucketCount) {
-    INCREMENT_COUNTER(stats_.cache_fill_too_large);
     INCREMENT_COUNTER(stats_.cache_fill_misses);
     return false;
   }
 
   auto& bucket = buckets_[bucket_index];
-  if (bucket.count >= kMaxCountPerBucket) {
-    INCREMENT_COUNTER(stats_.cache_fill_bucket_full);
-    INCREMENT_COUNTER(stats_.cache_fill_misses);
-    return false;
-  }
-
   PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
 
   auto* entry = reinterpret_cast<PartitionFreelistEntry*>(address);
@@ -227,6 +222,12 @@ ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
   bucket.count++;
   INCREMENT_COUNTER(stats_.cache_fill_hits);
 
+  // Batched deallocation, amortizing lock acquisitions.
+  if (bucket.count >= kMaxCountPerBucket) {
+    ClearBucket(bucket, kMaxCountPerBucket / 2);
+  }
+
   return true;
 }
...
@@ -184,8 +184,6 @@ TEST_F(ThreadCacheTest, LargeAllocationsAreNotCached) {
                                             tcache->stats_.alloc_miss_too_large};
   DeltaCounter cache_fill_counter{tcache->stats_.cache_fill_count};
   DeltaCounter cache_fill_misses_counter{tcache->stats_.cache_fill_misses};
-  DeltaCounter cache_fill_too_large_counter{
-      tcache->stats_.cache_fill_too_large};
 
   FillThreadCacheAndReturnIndex(100 * 1024);
   tcache = g_root->thread_cache_for_testing();
@@ -193,7 +191,6 @@ TEST_F(ThreadCacheTest, LargeAllocationsAreNotCached) {
   EXPECT_EQ(1u, alloc_miss_too_large_counter.Delta());
   EXPECT_EQ(1u, cache_fill_counter.Delta());
   EXPECT_EQ(1u, cache_fill_misses_counter.Delta());
-  EXPECT_EQ(1u, cache_fill_too_large_counter.Delta());
 }
 #endif  // defined(PA_ENABLE_THREAD_CACHE_STATISTICS)
@@ -287,8 +284,6 @@ TEST_F(ThreadCacheTest, RecordStats) {
   DeltaCounter cache_fill_counter{tcache->stats_.cache_fill_count};
   DeltaCounter cache_fill_hits_counter{tcache->stats_.cache_fill_hits};
   DeltaCounter cache_fill_misses_counter{tcache->stats_.cache_fill_misses};
-  DeltaCounter cache_fill_bucket_full_counter{
-      tcache->stats_.cache_fill_bucket_full};
 
   // Cache has been purged, first allocation is a miss.
   void* data = g_root->Alloc(kTestSize, "");
@@ -304,19 +299,19 @@ TEST_F(ThreadCacheTest, RecordStats) {
   tcache->Purge();
   cache_fill_counter.Reset();
-  // Bucket full accounting.
+  // Bucket are never full, fill always succeeds.
   size_t bucket_index = FillThreadCacheAndReturnIndex(
       kTestSize, ThreadCache::kMaxCountPerBucket + 10);
 
   EXPECT_EQ(ThreadCache::kMaxCountPerBucket + 10, cache_fill_counter.Delta());
-  EXPECT_EQ(10u, cache_fill_bucket_full_counter.Delta());
-  EXPECT_EQ(10u, cache_fill_misses_counter.Delta());
+  EXPECT_EQ(0u, cache_fill_misses_counter.Delta());
 
   // Memory footprint.
   ThreadCacheStats stats;
   ThreadCacheRegistry::Instance().DumpStats(true, &stats);
-  EXPECT_EQ(
-      g_root->buckets[bucket_index].slot_size * ThreadCache::kMaxCountPerBucket,
-      stats.bucket_total_memory);
+  // Bucket was cleared (count halved, then refilled).
+  EXPECT_EQ(g_root->buckets[bucket_index].slot_size *
+                (ThreadCache::kMaxCountPerBucket / 2 + 10),
+            stats.bucket_total_memory);
   EXPECT_EQ(sizeof(ThreadCache), stats.metadata_overhead);
 }
...
@@ -227,9 +227,6 @@ void ReportPartitionAllocThreadCacheStats(MemoryAllocatorDump* dump,
   dump->AddScalar("cache_fill_count", "scalar", stats.cache_fill_count);
   dump->AddScalar("cache_fill_hits", "scalar", stats.cache_fill_hits);
   dump->AddScalar("cache_fill_misses", "scalar", stats.cache_fill_misses);
-  dump->AddScalar("cache_fill_bucket_full", "scalar",
-                  stats.cache_fill_bucket_full);
-  dump->AddScalar("cache_fill_too_large", "scalar", stats.cache_fill_too_large);
 
   dump->AddScalar("size", "bytes", stats.bucket_total_memory);
   dump->AddScalar("metadata_overhead", "bytes", stats.metadata_overhead);
...