Commit f7757107 authored by Benoit Lize's avatar Benoit Lize Committed by Chromium LUCI CQ

[PartitionAlloc] Record and report batch fill rate.

The thread cache uses batched allocation from the central
allocator. To assess contention on the central allocator's main lock,
the number of batch fill requests is a better metric to track than the
cache hit rate.

This CL records this metric, and reports it in memory dumps and in UMA.

Bug: 998048
Change-Id: Ie6a833c7a0ece66d3c138757c8fb3f49a1690c7f
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2593630
Reviewed-by: Kentaro Hara <haraken@chromium.org>
Reviewed-by: ssid <ssid@chromium.org>
Reviewed-by: Jesse Doherty <jwd@chromium.org>
Commit-Queue: Benoit L <lizeb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#838015}
parent 41522c8a
......@@ -28,6 +28,8 @@ struct ThreadCacheStats {
uint64_t cache_fill_hits;
uint64_t cache_fill_misses; // Object too large.
uint64_t batch_fill_count; // Number of central allocator requests.
// Memory cost:
uint64_t bucket_total_memory;
uint64_t metadata_overhead;
......
......@@ -301,6 +301,8 @@ void ThreadCache::FillBucket(size_t bucket_index) {
// clearing which would greatly increase calls to the central allocator. (3)
// tries to keep memory usage low. So clearing half of the bucket, and filling
// a quarter of it are sensible defaults.
INCREMENT_COUNTER(stats_.batch_fill_count);
Bucket& bucket = buckets_[bucket_index];
int count = bucket.limit / kBatchFillRatio;
......@@ -388,6 +390,8 @@ void ThreadCache::ResetForTesting() {
stats_.cache_fill_hits = 0;
stats_.cache_fill_misses = 0;
stats_.batch_fill_count = 0;
stats_.bucket_total_memory = 0;
stats_.metadata_overhead = 0;
......@@ -407,9 +411,11 @@ void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
stats->cache_fill_hits += stats_.cache_fill_hits;
stats->cache_fill_misses += stats_.cache_fill_misses;
for (size_t i = 0; i < kBucketCount; i++) {
stats->batch_fill_count += stats_.batch_fill_count;
for (const Bucket& bucket : buckets_) {
stats->bucket_total_memory +=
buckets_[i].count * static_cast<size_t>(buckets_[i].slot_size);
bucket.count * static_cast<size_t>(bucket.slot_size);
}
stats->metadata_overhead += sizeof(*this);
}
......
......@@ -257,6 +257,8 @@ class BASE_EXPORT ThreadCache {
friend class ThreadCacheRegistry;
friend class ThreadCacheTest;
FRIEND_TEST_ALL_PREFIXES(ThreadCacheTest, Simple);
FRIEND_TEST_ALL_PREFIXES(ThreadCacheTest, MultipleObjectsCachedPerBucket);
FRIEND_TEST_ALL_PREFIXES(ThreadCacheTest, LargeAllocationsAreNotCached);
FRIEND_TEST_ALL_PREFIXES(ThreadCacheTest, MultipleThreadCaches);
FRIEND_TEST_ALL_PREFIXES(ThreadCacheTest, RecordStats);
......
......@@ -121,12 +121,13 @@ class ThreadCacheTest : public ::testing::Test {
};
TEST_F(ThreadCacheTest, Simple) {
void* ptr = g_root->Alloc(kSmallSize, "");
ASSERT_TRUE(ptr);
// There is a cache.
auto* tcache = g_root->thread_cache_for_testing();
EXPECT_TRUE(tcache);
DeltaCounter batch_fill_counter{tcache->stats_.batch_fill_count};
void* ptr = g_root->Alloc(kSmallSize, "");
ASSERT_TRUE(ptr);
uint16_t index = PartitionRoot<ThreadSafe>::SizeToBucketIndex(kSmallSize);
EXPECT_EQ(kFillCountForSmallBucket - 1,
......@@ -141,6 +142,8 @@ TEST_F(ThreadCacheTest, Simple) {
// Allocated from the thread cache.
EXPECT_EQ(kFillCountForSmallBucket - 1,
tcache->bucket_count_for_testing(index));
EXPECT_EQ(1u, batch_fill_counter.Delta());
}
TEST_F(ThreadCacheTest, InexactSizeMatch) {
......@@ -167,11 +170,15 @@ TEST_F(ThreadCacheTest, InexactSizeMatch) {
}
TEST_F(ThreadCacheTest, MultipleObjectsCachedPerBucket) {
auto* tcache = g_root->thread_cache_for_testing();
DeltaCounter batch_fill_counter{tcache->stats_.batch_fill_count};
size_t bucket_index =
FillThreadCacheAndReturnIndex(kMediumSize, kFillCountForMediumBucket + 2);
auto* tcache = g_root->thread_cache_for_testing();
EXPECT_EQ(2 * kFillCountForMediumBucket,
tcache->bucket_count_for_testing(bucket_index));
// 2 batches, since there were more than |kFillCountForMediumBucket|
// allocations.
EXPECT_EQ(2u, batch_fill_counter.Delta());
}
TEST_F(ThreadCacheTest, ObjectsCachedCountIsLimited) {
......
......@@ -227,6 +227,8 @@ void ReportPartitionAllocThreadCacheStats(MemoryAllocatorDump* dump,
dump->AddScalar("cache_fill_hits", "scalar", stats.cache_fill_hits);
dump->AddScalar("cache_fill_misses", "scalar", stats.cache_fill_misses);
dump->AddScalar("batch_fill_count", "scalar", stats.batch_fill_count);
dump->AddScalar("size", "bytes", stats.bucket_total_memory);
dump->AddScalar("metadata_overhead", "bytes", stats.metadata_overhead);
}
......
......@@ -92,6 +92,13 @@ void PartitionStatsDumperImpl::PartitionDumpTotals(
all_thread_caches_stats.alloc_count);
base::UmaHistogramPercentage("Memory.PartitionAlloc.ThreadCache.HitRate",
hit_rate_percent);
int batch_fill_rate_percent =
static_cast<int>((100 * all_thread_caches_stats.batch_fill_count) /
all_thread_caches_stats.alloc_count);
base::UmaHistogramPercentage(
"Memory.PartitionAlloc.ThreadCache.BatchFillRate",
batch_fill_rate_percent);
}
if (thread_cache_stats.alloc_count) {
......@@ -101,6 +108,13 @@ void PartitionStatsDumperImpl::PartitionDumpTotals(
base::UmaHistogramPercentage(
"Memory.PartitionAlloc.ThreadCache.HitRate.MainThread",
hit_rate_percent);
int batch_fill_rate_percent =
static_cast<int>((100 * thread_cache_stats.batch_fill_count) /
thread_cache_stats.alloc_count);
base::UmaHistogramPercentage(
"Memory.PartitionAlloc.ThreadCache.BatchFillRate.MainThread",
batch_fill_rate_percent);
}
}
}
......
......@@ -30,11 +30,20 @@ TEST(PartitionAllocMemoryDumpProviderTest, Simple) {
1);
histogram_tester.ExpectTotalCount(
"Memory.PartitionAlloc.ThreadCache.HitRate.MainThread", 1);
histogram_tester.ExpectTotalCount(
"Memory.PartitionAlloc.ThreadCache.BatchFillRate", 1);
histogram_tester.ExpectTotalCount(
"Memory.PartitionAlloc.ThreadCache.HitRate.MainThread", 1);
#else
histogram_tester.ExpectTotalCount("Memory.PartitionAlloc.ThreadCache.HitRate",
0);
histogram_tester.ExpectTotalCount(
"Memory.PartitionAlloc.ThreadCache.HitRate.MainThread", 0);
histogram_tester.ExpectTotalCount(
"Memory.PartitionAlloc.ThreadCache.BatchFillRate", 0);
histogram_tester.ExpectTotalCount(
"Memory.PartitionAlloc.ThreadCache.BatchFillRate.MainThread", 0);
#endif // !BUILDFLAG(USE_PARTITION_ALLOC_AS_MALLOC) &&
// defined(PA_THREAD_CACHE_SUPPORTED) &&
// !defined(MEMORY_TOOL_REPLACES_ALLOCATOR)
......
......@@ -1995,6 +1995,22 @@ reviews. Googlers can read more about this at go/gwsq-gerrit.
<summary>Throughput of a ParkableString disk write.</summary>
</histogram>
<histogram name="Memory.PartitionAlloc.ThreadCache.BatchFillRate{ThreadType}"
units="%" expires_after="M92">
<owner>lizeb@chromium.org</owner>
<owner>bartekn@chromium.org</owner>
<summary>
  Fraction of PartitionAlloc's thread cache allocation requests that
  required a batch fill, that is, cache misses that had to touch the
  central allocator. Recorded during memory dumps, at the same time as the
  Memory.*.PartitionAlloc.* histograms. Data is collected for
  {ThreadType}.
</summary>
<token key="ThreadType">
<variant name="" summary="all threads"/>
<variant name=".MainThread" summary="the main thread only"/>
</token>
</histogram>
<histogram name="Memory.PartitionAlloc.ThreadCache.HitRate{ThreadType}"
units="%" expires_after="M92">
<owner>lizeb@chromium.org</owner>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment