Commit 2a5ddc90 authored by Benoit Lize, committed by Commit Bot

[PartitionAlloc] Batch thread cache deallocations.

The thread cache has a hard limit on how many objects are cached per
bucket. Previously, every cache fill above the limit would go back to
the central allocator, so the following scenario:
1. 1000 malloc(10) calls
2. 1000 free(ptr) calls (of the previously allocated memory)

would acquire the lock 1000 - kMaxCountPerBucket times on the
deallocation path. By batching deallocations, the lock is only acquired
(1000 - kMaxCountPerBucket) / (kMaxCountPerBucket / 2)
  = 2000 / kMaxCountPerBucket - 2 times.
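
(For illustration only, assuming kMaxCountPerBucket were 128 -- the actual
value may differ -- the old path would acquire the lock 1000 - 128 = 872
times, while the batched path needs (1000 - 128) / 64, i.e. roughly 14
acquisitions.)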

It may also reduce cache footprint for these scenarios, since the bucket
is rarely full. Note that the allocation path is not affected; batching
on that side may be investigated as well, but not without data showing
that empty buckets are frequent.
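
For intuition, a minimal standalone sketch of the batching idea follows.
It uses hypothetical names and a stand-in std::mutex-protected central
allocator; it is not the actual PartitionAlloc code, only an illustration
of the pattern the change introduces (ClearBucket below is the real one).

  // Illustrative sketch only -- hypothetical names, not PartitionAlloc code.
  // A per-thread bucket caches freed objects; once it reaches its limit,
  // half of it is returned to the central allocator under one lock.
  #include <cstddef>
  #include <mutex>
  #include <vector>

  struct CentralAllocator {
    std::mutex lock;
    std::vector<void*> free_objects;

    // Must be called with |lock| held.
    void FreeLocked(void* ptr) { free_objects.push_back(ptr); }
  };

  class ThreadLocalBucket {
   public:
    static constexpr size_t kMaxCount = 128;  // Assumed limit, for illustration.

    explicit ThreadLocalBucket(CentralAllocator& central) : central_(central) {}

    void Free(void* ptr) {
      freelist_.push_back(ptr);
      // Batched deallocation: once the bucket is full, flush half of it at
      // once, amortizing one lock acquisition over kMaxCount / 2 objects.
      if (freelist_.size() >= kMaxCount)
        Clear(kMaxCount / 2);
    }

   private:
    // Drains the bucket down to |limit| objects under one lock acquisition.
    void Clear(size_t limit) {
      if (freelist_.size() <= limit)
        return;
      std::lock_guard<std::mutex> guard(central_.lock);
      while (freelist_.size() > limit) {
        central_.FreeLocked(freelist_.back());
        freelist_.pop_back();
      }
    }

    CentralAllocator& central_;
    std::vector<void*> freelist_;
  };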

Bug: 998048
Change-Id: I626205e28d7a082a12303a493ca504e9581fc43a
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2546599
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Kentaro Hara <haraken@chromium.org>
Reviewed-by: Yuki Shiino <yukishiino@chromium.org>
Cr-Commit-Position: refs/heads/master@{#829208}
parent f8a14659
@@ -26,9 +26,7 @@ struct ThreadCacheStats {
   // Cache fill details:
   uint64_t cache_fill_count;
   uint64_t cache_fill_hits;
-  uint64_t cache_fill_misses;
-  uint64_t cache_fill_bucket_full;
-  uint64_t cache_fill_too_large;
+  uint64_t cache_fill_misses;  // Object too large.
 
   // Memory cost:
   uint64_t bucket_total_memory;
...
@@ -163,6 +163,26 @@ void ThreadCache::Delete(void* tcache_ptr) {
   root->RawFree(tcache_ptr);
 }
 
+void ThreadCache::ClearBucket(ThreadCache::Bucket& bucket, size_t limit) {
+  // Avoids acquiring the lock needlessly.
+  if (!bucket.count)
+    return;
+
+  // Acquire the lock once for the bucket. Allocations from the same bucket are
+  // likely to be hitting the same cache lines in the central allocator, and
+  // lock acquisitions can be expensive.
+  internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
+  while (bucket.count > limit) {
+    auto* entry = bucket.freelist_head;
+    PA_DCHECK(entry);
+    bucket.freelist_head = entry->GetNext();
+    root_->RawFreeLocked(entry);
+    bucket.count--;
+  }
+  PA_DCHECK(bucket.count == limit);
+}
+
 void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
   stats->alloc_count += stats_.alloc_count;
   stats->alloc_hits += stats_.alloc_hits;
@@ -174,8 +194,6 @@ void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
   stats->cache_fill_count += stats_.cache_fill_count;
   stats->cache_fill_hits += stats_.cache_fill_hits;
   stats->cache_fill_misses += stats_.cache_fill_misses;
-  stats->cache_fill_bucket_full += stats_.cache_fill_bucket_full;
-  stats->cache_fill_too_large += stats_.cache_fill_too_large;
 
   for (size_t i = 0; i < kBucketCount; i++) {
     stats->bucket_total_memory +=
@@ -191,25 +209,9 @@ void ThreadCache::SetShouldPurge() {
 }
 
 void ThreadCache::Purge() {
-  for (Bucket& bucket : buckets_) {
-    size_t count = bucket.count;
-    if (!count)
-      continue;
-
-    // Acquire the lock once per bucket. This avoids acquiring it for too long,
-    // and also allocations from the same bucket are likely to be hitting the
-    // same cache lines in the central allocator.
-    internal::ScopedGuard<internal::ThreadSafe> guard(root_->lock_);
-    while (bucket.freelist_head) {
-      auto* entry = bucket.freelist_head;
-      bucket.freelist_head = entry->GetNext();
-      root_->RawFreeLocked(entry);
-      count--;
-    }
-    CHECK_EQ(0u, count);
-    bucket.count = 0;
-  }
+  for (auto& bucket : buckets_)
+    ClearBucket(bucket, 0);
 
   should_purge_.store(false, std::memory_order_relaxed);
 }
...
@@ -159,14 +159,16 @@ class BASE_EXPORT ThreadCache {
   }
 
  private:
-  explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
-  static void Delete(void* thread_cache_ptr);
-
   struct Bucket {
     size_t count;
     PartitionFreelistEntry* freelist_head;
   };
 
+  explicit ThreadCache(PartitionRoot<ThreadSafe>* root);
+  static void Delete(void* thread_cache_ptr);
+
+  // Empties the |bucket| until there are at most |limit| objects in it.
+  void ClearBucket(Bucket& bucket, size_t limit);
+
   // TODO(lizeb): Optimize the threshold.
   static constexpr size_t kSizeThreshold = 512;
   static constexpr size_t kBucketCount =
@@ -206,19 +208,12 @@ ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
   INCREMENT_COUNTER(stats_.cache_fill_count);
 
   if (bucket_index >= kBucketCount) {
-    INCREMENT_COUNTER(stats_.cache_fill_too_large);
     INCREMENT_COUNTER(stats_.cache_fill_misses);
     return false;
   }
 
   auto& bucket = buckets_[bucket_index];
-  if (bucket.count >= kMaxCountPerBucket) {
-    INCREMENT_COUNTER(stats_.cache_fill_bucket_full);
-    INCREMENT_COUNTER(stats_.cache_fill_misses);
-    return false;
-  }
-
   PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
 
   auto* entry = reinterpret_cast<PartitionFreelistEntry*>(address);
@@ -227,6 +222,12 @@ ALWAYS_INLINE bool ThreadCache::MaybePutInCache(void* address,
   bucket.count++;
   INCREMENT_COUNTER(stats_.cache_fill_hits);
 
+  // Batched deallocation, amortizing lock acquisitions.
+  if (bucket.count >= kMaxCountPerBucket) {
+    ClearBucket(bucket, kMaxCountPerBucket / 2);
+  }
+
   return true;
 }
...
@@ -184,8 +184,6 @@ TEST_F(ThreadCacheTest, LargeAllocationsAreNotCached) {
                                             tcache->stats_.alloc_miss_too_large};
   DeltaCounter cache_fill_counter{tcache->stats_.cache_fill_count};
   DeltaCounter cache_fill_misses_counter{tcache->stats_.cache_fill_misses};
-  DeltaCounter cache_fill_too_large_counter{
-      tcache->stats_.cache_fill_too_large};
 
   FillThreadCacheAndReturnIndex(100 * 1024);
   tcache = g_root->thread_cache_for_testing();
@@ -193,7 +191,6 @@ TEST_F(ThreadCacheTest, LargeAllocationsAreNotCached) {
   EXPECT_EQ(1u, alloc_miss_too_large_counter.Delta());
   EXPECT_EQ(1u, cache_fill_counter.Delta());
   EXPECT_EQ(1u, cache_fill_misses_counter.Delta());
-  EXPECT_EQ(1u, cache_fill_too_large_counter.Delta());
 }
 #endif  // defined(PA_ENABLE_THREAD_CACHE_STATISTICS)
@@ -287,8 +284,6 @@ TEST_F(ThreadCacheTest, RecordStats) {
   DeltaCounter cache_fill_counter{tcache->stats_.cache_fill_count};
   DeltaCounter cache_fill_hits_counter{tcache->stats_.cache_fill_hits};
   DeltaCounter cache_fill_misses_counter{tcache->stats_.cache_fill_misses};
-  DeltaCounter cache_fill_bucket_full_counter{
-      tcache->stats_.cache_fill_bucket_full};
 
   // Cache has been purged, first allocation is a miss.
   void* data = g_root->Alloc(kTestSize, "");
@@ -304,19 +299,19 @@ TEST_F(ThreadCacheTest, RecordStats) {
   tcache->Purge();
   cache_fill_counter.Reset();
-  // Bucket full accounting.
+  // Bucket are never full, fill always succeeds.
   size_t bucket_index = FillThreadCacheAndReturnIndex(
       kTestSize, ThreadCache::kMaxCountPerBucket + 10);
 
   EXPECT_EQ(ThreadCache::kMaxCountPerBucket + 10, cache_fill_counter.Delta());
-  EXPECT_EQ(10u, cache_fill_bucket_full_counter.Delta());
-  EXPECT_EQ(10u, cache_fill_misses_counter.Delta());
+  EXPECT_EQ(0u, cache_fill_misses_counter.Delta());
 
   // Memory footprint.
   ThreadCacheStats stats;
   ThreadCacheRegistry::Instance().DumpStats(true, &stats);
-  EXPECT_EQ(
-      g_root->buckets[bucket_index].slot_size * ThreadCache::kMaxCountPerBucket,
-      stats.bucket_total_memory);
+  // Bucket was cleared (count halved, then refilled).
+  EXPECT_EQ(g_root->buckets[bucket_index].slot_size *
+                (ThreadCache::kMaxCountPerBucket / 2 + 10),
+            stats.bucket_total_memory);
   EXPECT_EQ(sizeof(ThreadCache), stats.metadata_overhead);
 }
...
@@ -227,9 +227,6 @@ void ReportPartitionAllocThreadCacheStats(MemoryAllocatorDump* dump,
   dump->AddScalar("cache_fill_count", "scalar", stats.cache_fill_count);
   dump->AddScalar("cache_fill_hits", "scalar", stats.cache_fill_hits);
   dump->AddScalar("cache_fill_misses", "scalar", stats.cache_fill_misses);
-  dump->AddScalar("cache_fill_bucket_full", "scalar",
-                  stats.cache_fill_bucket_full);
-  dump->AddScalar("cache_fill_too_large", "scalar", stats.cache_fill_too_large);
 
   dump->AddScalar("size", "bytes", stats.bucket_total_memory);
   dump->AddScalar("metadata_overhead", "bytes", stats.metadata_overhead);
...