Commit a8cac9ec authored by Benoit Lize, committed by Chromium LUCI CQ

[PartitionAlloc] Batch Thread cache allocations.

Allocate multiple objects at a time for the thread cache.
This is meant to amortize allocation costs.

Bug: 998048
Change-Id: I837216fcb3cb76302a6d09e7890b52313ebf8fa2
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2562302
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Bartek Nowierski <bartekn@chromium.org>
Cr-Commit-Position: refs/heads/master@{#832357}
parent ed7c5b6a
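
The idea behind the change, as a standalone sketch: serve allocations from a per-thread bucket and refill that bucket in batches, so a single acquisition of the central allocator's lock is amortized over many objects. This is a simplified illustration only; CentralAllocator, ThreadBucket, ThreadCacheAlloc, and the 128-object limit below are hypothetical stand-ins, not the PartitionAlloc types or values used in the diff.

// Minimal sketch of batched thread-cache filling (illustrative stand-ins,
// not the real PartitionAlloc code).
#include <cstddef>
#include <mutex>
#include <new>
#include <vector>

// Stand-in for the central allocator, whose lock every thread contends on.
struct CentralAllocator {
  std::mutex lock;
  // Must be called with |lock| held.
  void* AllocLocked(std::size_t size) { return ::operator new(size, std::nothrow); }
};

// Stand-in for one per-thread bucket caching objects of a single size.
struct ThreadBucket {
  std::vector<void*> cached;
  std::size_t limit = 128;  // Upper bound on cached objects (assumed value).
};

// Batched fill: one lock acquisition amortized over limit / 4 allocations,
// mirroring the fill policy described in the commit.
void FillBucket(CentralAllocator& central, ThreadBucket& bucket,
                std::size_t size) {
  const std::size_t count = bucket.limit / 4;
  std::lock_guard<std::mutex> guard(central.lock);  // Taken once, not per object.
  for (std::size_t i = 0; i < count; i++)
    bucket.cached.push_back(central.AllocLocked(size));
}

// Fast path: pop from the thread-local bucket; refill it on a miss.
void* ThreadCacheAlloc(CentralAllocator& central, ThreadBucket& bucket,
                       std::size_t size) {
  if (bucket.cached.empty())
    FillBucket(central, bucket, size);
  if (bucket.cached.empty())
    return nullptr;  // Central allocator could not provide memory.
  void* ptr = bucket.cached.back();
  bucket.cached.pop_back();
  return ptr;
}

The actual change below follows the same shape: FillBucket() takes root_->lock_ once and calls AllocFromBucket() up to limit / 4 times.
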
@@ -181,6 +181,62 @@ void ThreadCache::Delete(void* tcache_ptr) {
   root->RawFree(tcache_ptr);
 }
 
+void ThreadCache::FillBucket(size_t bucket_index) {
+  // Filling multiple elements from the central allocator at a time has several
+  // advantages:
+  // - Amortize lock acquisition
+  // - Increase hit rate
+  // - Can improve locality, as consecutive allocations from the central
+  //   allocator will likely return close addresses, especially early on.
+  //
+  // However, do not take too many items, to prevent memory bloat.
+  //
+  // Cache filling / purging policy:
+  // We aim at keeping the buckets neither empty nor full, while minimizing
+  // requests to the central allocator.
+  //
+  // For each bucket, there is a |limit| of how many cached objects there are in
+  // the bucket, so |count| < |limit| at all times.
+  // - Clearing: limit -> limit / 2
+  // - Filling: 0 -> limit / 4
+  //
+  // These thresholds are somewhat arbitrary, with these considerations:
+  // (1) Batched filling should not completely fill the bucket
+  // (2) Batched clearing should not completely clear the bucket
+  // (3) Batched filling should not be too eager
+  //
+  // If (1) and (2) do not hold, we risk oscillations of bucket filling /
+  // clearing which would greatly increase calls to the central allocator. (3)
+  // tries to keep memory usage low. So clearing half of the bucket, and filling
+  // a quarter of it are sensible defaults.
+  Bucket& bucket = buckets_[bucket_index];
+  int count = bucket.limit / 4;
+
+  size_t utilized_slot_size;
+  bool is_already_zeroed;
+
+  // Same as calling RawAlloc() |count| times, but acquires the lock only once.
+  internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
+  for (int i = 0; i < count; i++) {
+    // We allow the allocator to return nullptr, since filling the cache may
+    // safely fail, and the proper flag will be handled by the central
+    // allocator.
+    //
+    // |raw_size| is set to the slot size, as we don't know it. However, it is
+    // only used for direct-mapped allocations and single-slot ones anyway,
+    // which are not handled here.
+    void* ptr = root_->AllocFromBucket(
+        &root_->buckets[bucket_index], PartitionAllocReturnNull,
+        root_->buckets[bucket_index].slot_size /* raw_size */,
+        &utilized_slot_size, &is_already_zeroed);
+    // Central allocator is out of memory.
+    if (!ptr)
+      break;
+
+    PutInBucket(bucket, ptr);
+  }
+}
+
 void ThreadCache::ClearBucket(ThreadCache::Bucket& bucket, size_t limit) {
   // Avoids acquiring the lock needlessly.
   if (!bucket.count)
......
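
To make the fill and clear thresholds described in the FillBucket() comment above concrete, a small sketch assuming a per-bucket limit of 128 (an illustrative number; the actual limits are configured elsewhere in the thread cache):

#include <cstddef>

constexpr std::size_t kLimit = 128;               // Assumed |limit| for one bucket.
constexpr std::size_t kBatchFill = kLimit / 4;    // FillBucket() adds 32 objects.
constexpr std::size_t kClearTarget = kLimit / 2;  // ClearBucket() keeps 64 objects.

// Considerations (1) and (2) from the comment: a batched fill never fills the
// bucket, and a batched clear never empties it, so fills and clears cannot
// immediately trigger each other and oscillate.
static_assert(kBatchFill > 0 && kBatchFill < kLimit,
              "batched filling must not completely fill the bucket");
static_assert(kClearTarget > 0 && kClearTarget < kLimit,
              "batched clearing must not completely clear the bucket");
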
@@ -195,8 +195,11 @@ class BASE_EXPORT ThreadCache {
   explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
   static void Delete(void* thread_cache_ptr);
   void PurgeInternal();
+  // Fills a bucket from the central allocator.
+  void FillBucket(size_t bucket_index);
   // Empties the |bucket| until there are at most |limit| objects in it.
   void ClearBucket(Bucket& bucket, size_t limit);
+  ALWAYS_INLINE void PutInBucket(Bucket& bucket, void* ptr);
 
   // TODO(lizeb): Optimize the threshold.
   static constexpr size_t kSizeThreshold = 512;
@@ -208,7 +211,7 @@ class BASE_EXPORT ThreadCache {
       kBucketCount < kNumBuckets,
       "Cannot have more cached buckets than what the allocator supports");
 
-  std::atomic<bool> should_purge_;
+  std::atomic<bool> should_purge_{false};
   Bucket buckets_[kBucketCount];
   ThreadCacheStats stats_;
   PartitionRoot<ThreadSafe>* const root_;
@@ -232,9 +235,6 @@
 ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
                                                 size_t bucket_index) {
   PA_REENTRANCY_GUARD(is_in_thread_cache_);
-  if (UNLIKELY(should_purge_.load(std::memory_order_relaxed)))
-    PurgeInternal();
-
   INCREMENT_COUNTER(stats_.cache_fill_count);
 
   if (UNLIKELY(bucket_index >= kBucketCount)) {
@@ -246,18 +246,17 @@ ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
   PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
 
-  auto* entry = reinterpret_cast<PartitionFreelistEntry*>(address);
-  entry->SetNextForThreadCache(bucket.freelist_head);
-  bucket.freelist_head = entry;
-  bucket.count++;
+  PutInBucket(bucket, address);
 
   INCREMENT_COUNTER(stats_.cache_fill_hits);
 
   // Batched deallocation, amortizing lock acquisitions.
   if (UNLIKELY(bucket.count >= bucket.limit)) {
-    ClearBucket(bucket, bucket.limit >> 1);
+    ClearBucket(bucket, bucket.limit / 2);
   }
 
+  if (UNLIKELY(should_purge_.load(std::memory_order_relaxed)))
+    PurgeInternal();
+
   return true;
 }
@@ -272,25 +271,39 @@ ALWAYS_INLINE void* ThreadCache::GetFromCache(size_t bucket_index) {
   }
 
   auto& bucket = buckets_[bucket_index];
-  auto* result = bucket.freelist_head;
-  if (UNLIKELY(!result)) {
+  if (LIKELY(bucket.freelist_head)) {
+    INCREMENT_COUNTER(stats_.alloc_hits);
+  } else {
     PA_DCHECK(bucket.count == 0);
     INCREMENT_COUNTER(stats_.alloc_miss_empty);
     INCREMENT_COUNTER(stats_.alloc_misses);
-    return nullptr;
+
+    FillBucket(bucket_index);
+
+    // Very unlikely, means that the central allocator is out of memory. Let it
+    // deal with it (may return nullptr, may crash).
+    if (UNLIKELY(!bucket.freelist_head))
+      return nullptr;
   }
 
   PA_DCHECK(bucket.count != 0);
+  auto* result = bucket.freelist_head;
   auto* next = result->GetNext();
   PA_DCHECK(result != next);
   bucket.count--;
   PA_DCHECK(bucket.count != 0 || !next);
   bucket.freelist_head = next;
-  INCREMENT_COUNTER(stats_.alloc_hits);
   return result;
 }
 
+ALWAYS_INLINE void ThreadCache::PutInBucket(Bucket& bucket, void* ptr) {
+  auto* entry = reinterpret_cast<PartitionFreelistEntry*>(ptr);
+  entry->SetNextForThreadCache(bucket.freelist_head);
+  bucket.freelist_head = entry;
+  bucket.count++;
+}
+
 }  // namespace internal
 }  // namespace base
......
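
For reference, the intrusive freelist that the new PutInBucket() helper pushes onto, and that the GetFromCache() hit path pops from, as a simplified standalone sketch. FreelistEntry, Bucket, and TakeFromBucket below are plain stand-ins; the real PartitionFreelistEntry encodes its next pointer rather than storing it directly.

#include <cstddef>

// Each cached slot stores the pointer to the next free slot in its own
// (currently unused) memory, so the cache needs no side storage.
struct FreelistEntry {
  FreelistEntry* next;
};

struct Bucket {
  FreelistEntry* freelist_head = nullptr;
  std::size_t count = 0;
};

// Push a freed slot onto the bucket's freelist (assumes the slot is at least
// sizeof(FreelistEntry) bytes and suitably aligned).
inline void PutInBucket(Bucket& bucket, void* ptr) {
  auto* entry = static_cast<FreelistEntry*>(ptr);
  entry->next = bucket.freelist_head;
  bucket.freelist_head = entry;
  bucket.count++;
}

// Pop a cached slot, or return nullptr if the bucket is empty.
inline void* TakeFromBucket(Bucket& bucket) {
  FreelistEntry* result = bucket.freelist_head;
  if (!result)
    return nullptr;
  bucket.freelist_head = result->next;
  bucket.count--;
  return result;
}

Storing the next pointer inside the freed slot itself is what lets the thread cache track an arbitrary number of objects with only a head pointer and a count per bucket.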