Commit a8cac9ec authored by Benoit Lize, committed by Chromium LUCI CQ

[PartitionAlloc] Batch Thread cache allocations.

Allocate multiple objects at a time for the thread cache.
This is meant to amortize allocation costs.

Bug: 998048
Change-Id: I837216fcb3cb76302a6d09e7890b52313ebf8fa2
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2562302
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Bartek Nowierski <bartekn@chromium.org>
Cr-Commit-Position: refs/heads/master@{#832357}
parent ed7c5b6a
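
The idea behind the change, as a standalone sketch: serve allocations from a per-thread bucket and refill that bucket in batches, so a single acquisition of the central allocator's lock is amortized over many objects. This is a simplified illustration only; CentralAllocator, ThreadBucket, ThreadCacheAlloc, and the 128-object limit below are hypothetical stand-ins, not the PartitionAlloc types or values used in the diff.

// Minimal sketch of batched thread-cache filling (illustrative stand-ins,
// not the real PartitionAlloc code).
#include <cstddef>
#include <mutex>
#include <new>
#include <vector>

// Stand-in for the central allocator, whose lock every thread contends on.
struct CentralAllocator {
  std::mutex lock;
  // Must be called with |lock| held.
  void* AllocLocked(std::size_t size) { return ::operator new(size, std::nothrow); }
};

// Stand-in for one per-thread bucket caching objects of a single size.
struct ThreadBucket {
  std::vector<void*> cached;
  std::size_t limit = 128;  // Upper bound on cached objects (assumed value).
};

// Batched fill: one lock acquisition amortized over limit / 4 allocations,
// mirroring the fill policy described in the commit.
void FillBucket(CentralAllocator& central, ThreadBucket& bucket,
                std::size_t size) {
  const std::size_t count = bucket.limit / 4;
  std::lock_guard<std::mutex> guard(central.lock);  // Taken once, not per object.
  for (std::size_t i = 0; i < count; i++)
    bucket.cached.push_back(central.AllocLocked(size));
}

// Fast path: pop from the thread-local bucket; refill it on a miss.
void* ThreadCacheAlloc(CentralAllocator& central, ThreadBucket& bucket,
                       std::size_t size) {
  if (bucket.cached.empty())
    FillBucket(central, bucket, size);
  if (bucket.cached.empty())
    return nullptr;  // Central allocator could not provide memory.
  void* ptr = bucket.cached.back();
  bucket.cached.pop_back();
  return ptr;
}

The actual change below follows the same shape: FillBucket() takes root_->lock_ once and calls AllocFromBucket() up to limit / 4 times.
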
@@ -181,6 +181,62 @@ void ThreadCache::Delete(void* tcache_ptr) {
   root->RawFree(tcache_ptr);
 }
 
+void ThreadCache::FillBucket(size_t bucket_index) {
+  // Filling multiple elements from the central allocator at a time has several
+  // advantages:
+  // - Amortize lock acquisition
+  // - Increase hit rate
+  // - Can improve locality, as consecutive allocations from the central
+  //   allocator will likely return close addresses, especially early on.
+  //
+  // However, do not take too many items, to prevent memory bloat.
+  //
+  // Cache filling / purging policy:
+  // We aim at keeping the buckets neither empty nor full, while minimizing
+  // requests to the central allocator.
+  //
+  // For each bucket, there is a |limit| of how many cached objects there are in
+  // the bucket, so |count| < |limit| at all times.
+  // - Clearing: limit -> limit / 2
+  // - Filling: 0 -> limit / 4
+  //
+  // These thresholds are somewhat arbitrary, with these considerations:
+  // (1) Batched filling should not completely fill the bucket
+  // (2) Batched clearing should not completely clear the bucket
+  // (3) Batched filling should not be too eager
+  //
+  // If (1) and (2) do not hold, we risk oscillations of bucket filling /
+  // clearing which would greatly increase calls to the central allocator. (3)
+  // tries to keep memory usage low. So clearing half of the bucket, and filling
+  // a quarter of it are sensible defaults.
+  Bucket& bucket = buckets_[bucket_index];
+  int count = bucket.limit / 4;
+
+  size_t utilized_slot_size;
+  bool is_already_zeroed;
+
+  // Same as calling RawAlloc() |count| times, but acquires the lock only once.
+  internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
+  for (int i = 0; i < count; i++) {
+    // We allow the allocator to return nullptr, since filling the cache may
+    // safely fail, and the proper flag will be handled by the central
+    // allocator.
+    //
+    // |raw_size| is set to the slot size, as we don't know it. However, it is
+    // only used for direct-mapped allocations and single-slot ones anyway,
+    // which are not handled here.
+    void* ptr = root_->AllocFromBucket(
+        &root_->buckets[bucket_index], PartitionAllocReturnNull,
+        root_->buckets[bucket_index].slot_size /* raw_size */,
+        &utilized_slot_size, &is_already_zeroed);
+    // Central allocator is out of memory.
+    if (!ptr)
+      break;
+
+    PutInBucket(bucket, ptr);
+  }
+}
+
 void ThreadCache::ClearBucket(ThreadCache::Bucket& bucket, size_t limit) {
   // Avoids acquiring the lock needlessly.
   if (!bucket.count)
......
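
To make the fill and clear thresholds described in the FillBucket() comment above concrete, a small sketch assuming a per-bucket limit of 128 (an illustrative number; the actual limits are configured elsewhere in the thread cache):

#include <cstddef>

constexpr std::size_t kLimit = 128;               // Assumed |limit| for one bucket.
constexpr std::size_t kBatchFill = kLimit / 4;    // FillBucket() adds 32 objects.
constexpr std::size_t kClearTarget = kLimit / 2;  // ClearBucket() keeps 64 objects.

// Considerations (1) and (2) from the comment: a batched fill never fills the
// bucket, and a batched clear never empties it, so fills and clears cannot
// immediately trigger each other and oscillate.
static_assert(kBatchFill > 0 && kBatchFill < kLimit,
              "batched filling must not completely fill the bucket");
static_assert(kClearTarget > 0 && kClearTarget < kLimit,
              "batched clearing must not completely clear the bucket");
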
@@ -195,8 +195,11 @@ class BASE_EXPORT ThreadCache {
   explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
   static void Delete(void* thread_cache_ptr);
   void PurgeInternal();
+  // Fills a bucket from the central allocator.
+  void FillBucket(size_t bucket_index);
   // Empties the |bucket| until there are at most |limit| objects in it.
   void ClearBucket(Bucket& bucket, size_t limit);
+  ALWAYS_INLINE void PutInBucket(Bucket& bucket, void* ptr);
 
   // TODO(lizeb): Optimize the threshold.
   static constexpr size_t kSizeThreshold = 512;
@@ -208,7 +211,7 @@ class BASE_EXPORT ThreadCache {
       kBucketCount < kNumBuckets,
       "Cannot have more cached buckets than what the allocator supports");
 
-  std::atomic<bool> should_purge_;
+  std::atomic<bool> should_purge_{false};
   Bucket buckets_[kBucketCount];
   ThreadCacheStats stats_;
   PartitionRoot<ThreadSafe>* const root_;
@@ -232,9 +235,6 @@
 ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
                                                 size_t bucket_index) {
   PA_REENTRANCY_GUARD(is_in_thread_cache_);
-  if (UNLIKELY(should_purge_.load(std::memory_order_relaxed)))
-    PurgeInternal();
-
   INCREMENT_COUNTER(stats_.cache_fill_count);
 
   if (UNLIKELY(bucket_index >= kBucketCount)) {
@@ -246,18 +246,17 @@ ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
   PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
 
-  auto* entry = reinterpret_cast<PartitionFreelistEntry*>(address);
-  entry->SetNextForThreadCache(bucket.freelist_head);
-  bucket.freelist_head = entry;
-  bucket.count++;
+  PutInBucket(bucket, address);
 
   INCREMENT_COUNTER(stats_.cache_fill_hits);
 
   // Batched deallocation, amortizing lock acquisitions.
   if (UNLIKELY(bucket.count >= bucket.limit)) {
-    ClearBucket(bucket, bucket.limit >> 1);
+    ClearBucket(bucket, bucket.limit / 2);
   }
 
+  if (UNLIKELY(should_purge_.load(std::memory_order_relaxed)))
+    PurgeInternal();
+
   return true;
 }
@@ -272,25 +271,39 @@ ALWAYS_INLINE void* ThreadCache::GetFromCache(size_t bucket_index) {
   }
 
   auto& bucket = buckets_[bucket_index];
-  auto* result = bucket.freelist_head;
-  if (UNLIKELY(!result)) {
+  if (LIKELY(bucket.freelist_head)) {
+    INCREMENT_COUNTER(stats_.alloc_hits);
+  } else {
     PA_DCHECK(bucket.count == 0);
     INCREMENT_COUNTER(stats_.alloc_miss_empty);
     INCREMENT_COUNTER(stats_.alloc_misses);
-    return nullptr;
+
+    FillBucket(bucket_index);
+
+    // Very unlikely, means that the central allocator is out of memory. Let it
+    // deal with it (may return nullptr, may crash).
+    if (UNLIKELY(!bucket.freelist_head))
+      return nullptr;
   }
 
   PA_DCHECK(bucket.count != 0);
+  auto* result = bucket.freelist_head;
   auto* next = result->GetNext();
   PA_DCHECK(result != next);
   bucket.count--;
   PA_DCHECK(bucket.count != 0 || !next);
   bucket.freelist_head = next;
-  INCREMENT_COUNTER(stats_.alloc_hits);
   return result;
 }
 
+ALWAYS_INLINE void ThreadCache::PutInBucket(Bucket& bucket, void* ptr) {
+  auto* entry = reinterpret_cast<PartitionFreelistEntry*>(ptr);
+  entry->SetNextForThreadCache(bucket.freelist_head);
+  bucket.freelist_head = entry;
+  bucket.count++;
+}
+
 }  // namespace internal
 }  // namespace base
......
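
For reference, the intrusive freelist that the new PutInBucket() helper pushes onto, and that the GetFromCache() hit path pops from, as a simplified standalone sketch. FreelistEntry, Bucket, and TakeFromBucket below are plain stand-ins; the real PartitionFreelistEntry encodes its next pointer rather than storing it directly.

#include <cstddef>

// Each cached slot stores the pointer to the next free slot in its own
// (currently unused) memory, so the cache needs no side storage.
struct FreelistEntry {
  FreelistEntry* next;
};

struct Bucket {
  FreelistEntry* freelist_head = nullptr;
  std::size_t count = 0;
};

// Push a freed slot onto the bucket's freelist (assumes the slot is at least
// sizeof(FreelistEntry) bytes and suitably aligned).
inline void PutInBucket(Bucket& bucket, void* ptr) {
  auto* entry = static_cast<FreelistEntry*>(ptr);
  entry->next = bucket.freelist_head;
  bucket.freelist_head = entry;
  bucket.count++;
}

// Pop a cached slot, or return nullptr if the bucket is empty.
inline void* TakeFromBucket(Bucket& bucket) {
  FreelistEntry* result = bucket.freelist_head;
  if (!result)
    return nullptr;
  bucket.freelist_head = result->next;
  bucket.count--;
  return result;
}

Storing the next pointer inside the freed slot itself is what lets the thread cache track an arbitrary number of objects with only a head pointer and a count per bucket.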