CpuTimeMetrics: Correct for idle times in approx time_in_state metric

As it turns out, the per-core time_in_state data from /sys/devices/system/cpu/cpuX/cpufreq/stats/time_in_state is wall time rather than CPU time and includes time the cores spend in idle states. Consequently, the data we currently report into Power.Approx* metrics isn't correct. We have to correct for the idle time when computing each core type + frequency state's proportional part of the execution. This patch adds a way to do so by polling per-core idle times from /sys/devices/system/cpu/cpuX/cpuidle/stateY/time (via a helper in base::CPU) and updates the CpuTimeMetrics code to account for idle time. Bug: 1081760 Change-Id: I7b8168b47b1e4d118bc7569aebb07952fa78693e Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2462088 Commit-Queue: Eric Seckler <eseckler@chromium.org> Reviewed-by: Daniel Cheng <dcheng@chromium.org> Reviewed-by: Bo <boliu@chromium.org> Reviewed-by: Sami Kyöstilä <skyostil@chromium.org> Cr-Commit-Position: refs/heads/master@{#816509}

CpuTimeMetrics: Correct for idle times in approx time_in_state metric
As it turns out, the per-core time_in_state data from /sys/devices/system/cpu/cpuX/cpufreq/stats/time_in_state is wall time rather than CPU time and includes time the cores spend in idle states. Consequently, the data we currently report into Power.Approx* metrics isn't correct. We have to correct for the idle time when computing each core type + frequency state's proportional part of the execution. This patch adds a way to do so by polling per-core idle times from /sys/devices/system/cpu/cpuX/cpuidle/stateY/time (via a helper in base::CPU) and updates the CpuTimeMetrics code to account for idle time. Bug: 1081760 Change-Id: I7b8168b47b1e4d118bc7569aebb07952fa78693e Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2462088 Commit-Queue: Eric Seckler <eseckler@chromium.org> Reviewed-by: Daniel Cheng <dcheng@chromium.org> Reviewed-by: Bo <boliu@chromium.org> Reviewed-by: Sami Kyöstilä <skyostil@chromium.org> Cr-Commit-Position: refs/heads/master@{#816509}
e8542484 · Eric Seckler · Commit Bot · f86efafa · e8542484 · e8542484
Commit e8542484 authored Oct 13, 2020 by Eric Seckler Committed by Commit Bot Oct 13, 2020
Show whitespace changes
Inline Side-by-side

Showing with 262 additions and 60 deletions

base/cpu.cc base/cpu.cc +52 -0

base/cpu.h base/cpu.h +13 -2

content/common/android/cpu_time_metrics.cc content/common/android/cpu_time_metrics.cc +197 -58

No files found.
--- a/base/cpu.cc
+++ b/base/cpu.cc
@@ -335,6 +335,8 @@ constexpr char kTimeInStatePath[] =
    "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state";
 constexpr char kPhysicalPackageIdPath[] =
    "/sys/devices/system/cpu/cpu%d/topology/physical_package_id";
+constexpr char kCoreIdleStateTimePath[] =
+    "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/time";

 bool SupportsTimeInState() {
  // Reading from time_in_state doesn't block (it amounts to reading a struct
@@ -388,6 +390,16 @@ bool ParseTimeInState(const std::string& content,
  return true;
 }

+bool SupportsCoreIdleTimes() {
+  // Reading from the cpuidle driver doesn't block.
+  ThreadRestrictions::ScopedAllowIO allow_io;
+  // Check if the path for the idle time in state 0 for core 0 is readable.
+  FilePath idle_state0_path(
+      StringPrintf(kCoreIdleStateTimePath, /*core_index=*/0, /*idle_state=*/0));
+  ScopedFILE file_stream(OpenFile(idle_state0_path, "rb"));
+  return static_cast<bool>(file_stream);
+}
+
 std::vector<CPU::CoreType> GuessCoreTypes() {
  // Try to guess the CPU architecture and cores of each cluster by comparing
  // the maximum frequencies of the available (online and offline) cores.
@@ -531,6 +543,46 @@ bool CPU::GetTimeInState(TimeInState& time_in_state) {

  return true;
 }
+
+// static
+bool CPU::GetCumulativeCoreIdleTimes(CoreIdleTimes& idle_times) {
+  idle_times.clear();
+
+  // The kernel may not support the cpufreq-stats driver.
+  static const bool kSupportsIdleTimes = SupportsCoreIdleTimes();
+  if (!kSupportsIdleTimes)
+    return false;
+
+  // Reading from the cpuidle driver doesn't block.
+  ThreadRestrictions::ScopedAllowIO allow_io;
+
+  int num_cpus = SysInfo::NumberOfProcessors();
+
+  bool success = false;
+  for (int core_index = 0; core_index < num_cpus; ++core_index) {
+    std::string content;
+    TimeDelta idle_time;
+
+    // The number of idle states is system/CPU dependent, so we increment and
+    // try to read each state until we fail.
+    for (int state_index = 0;; ++state_index) {
+      auto path = StringPrintf(kCoreIdleStateTimePath, core_index, state_index);
+      uint64_t idle_state_time = 0;
+      if (!ReadFileToString(FilePath(path), &content))
+        break;
+      StringToUint64(content, &idle_state_time);
+      idle_time += TimeDelta::FromMicroseconds(idle_state_time);
+    }
+
+    idle_times.push_back(idle_time);
+
+    // At least one of the cores should have some idle time, otherwise we report
+    // a failure.
+    success |= idle_time > base::TimeDelta();
+  }
+
+  return success;
+}
 #endif  // defined(OS_LINUX) || defined(OS_CHROMEOS) || defined(OS_ANDROID) ||
        // defined(OS_AIX)


--- a/base/cpu.h
+++ b/base/cpu.h
@@ -98,11 +98,11 @@ class BASE_EXPORT CPU final {
    CPU::CoreType core_type;      // type of the cores in this cluster.
    uint32_t cluster_core_index;  // index of the first core in the cluster.
    uint64_t core_frequency_khz;
-    TimeDelta cumulative_cpu_time;
+    TimeDelta cumulative_time;
  };
  using TimeInState = std::vector<TimeInStateEntry>;

-  // Emits the device's cumulative CPU usage split by CPU cluster frequency
+  // For each CPU core, emits the cumulative time spent in different frequency
  // states into the output parameter (replacing its current contents). One
  // entry in the output parameter is added for each cluster core index
  // + frequency state combination with a non-zero CPU time value. Returns false
@@ -113,6 +113,17 @@ class BASE_EXPORT CPU final {
  // NOTE: Currently only supported on Linux/Android, and only on kernels with
  // cpufreq-stats driver.
  static bool GetTimeInState(TimeInState&);
+
+  // For each CPU core, emits the total cumulative wall time spent in any idle
+  // state into the output parameter (replacing its current contents). Returns
+  // false on failure. We return the usage via an output parameter to allow
+  // reuse of TimeInState's std::vector by the caller, e.g. to avoid allocations
+  // between repeated calls to this method.
+  //
+  // NOTE: Currently only supported on Linux/Android, and only on kernels with
+  // cpuidle driver.
+  using CoreIdleTimes = std::vector<TimeDelta>;
+  static bool GetCumulativeCoreIdleTimes(CoreIdleTimes&);
 #endif  // defined(OS_LINUX) || defined(OS_CHROMEOS) || defined(OS_ANDROID) ||
        // defined(OS_AIX)


--- a/content/common/android/cpu_time_metrics.cc
+++ b/content/common/android/cpu_time_metrics.cc
@@ -19,6 +19,7 @@
 #include "base/metrics/histogram_macros.h"
 #include "base/no_destructor.h"
 #include "base/process/process_metrics.h"
+#include "base/run_loop.h"
 #include "base/sequence_checker.h"
 #include "base/strings/pattern.h"
 #include "base/strings/string_util.h"
@@ -310,6 +311,9 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
      reporting_interval_ = kReportAfterEveryNTasksOtherProcess;
    }
    DETACH_FROM_SEQUENCE(thread_pool_);
+    // Post a first collection to capture initial values for calculation of
+    // delta values in subsequent passes.
+    PostCollectionTask();
  }

  // base::TaskObserver implementation:
@@ -326,6 +330,12 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
      return;
    task_counter_++;
    if (task_counter_ == reporting_interval_) {
+      PostCollectionTask();
+      task_counter_ = 0;
+    }
+  }
+
+  void PostCollectionTask() {
    // PostTask() applies a barrier, so this will be applied before the thread
    // pool task executes and sets |collection_in_progress_| back to false.
    collection_in_progress_.store(true, std::memory_order_relaxed);
@@ -334,8 +344,6 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
        base::BindOnce(
            &ProcessCpuTimeTaskObserver::CollectAndReportCpuTimeOnThreadPool,
            base::Unretained(this)));
-      task_counter_ = 0;
-    }
  }

  void CollectAndReportCpuTimeOnThreadPool() {
@@ -362,53 +370,7 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
    // kernels. This breakdown approximates Chrome's total per
    // core-type/frequency usage by splitting the process's CPU time across
    // cores/frequencies according to global per-core time_in_state values.
-    if (base::CPU::GetTimeInState(time_in_state_)) {
-      // Compute the total CPU time delta since the last cycle across all
-      // clusters and frequencies, so that we can compute proportional deltas in
-      // the second loop below.
-      base::TimeDelta total_cumulative;
-      for (const base::CPU::TimeInStateEntry& entry : time_in_state_) {
-        total_cumulative += entry.cumulative_cpu_time;
-      }
-
-      base::TimeDelta total_delta =
-          total_cumulative - total_reported_time_in_state_;
-      total_reported_time_in_state_ = total_cumulative;
-
-      if (process_cpu_time_delta > base::TimeDelta() &&
-          total_delta > base::TimeDelta()) {
-        for (const base::CPU::TimeInStateEntry& entry : time_in_state_) {
-          DCHECK_GT(approximate_time_in_state_reporters_.size(),
-                    static_cast<size_t>(entry.core_type));
-          std::unique_ptr<TimeInStateReporter>& reporter =
-              approximate_time_in_state_reporters_[static_cast<size_t>(
-                  entry.core_type)];
-          if (!reporter) {
-            reporter = std::make_unique<TimeInStateReporter>(
-                process_type_, entry.core_type, /*is_approximate=*/true);
-          }
-
-          // Compute delta since last cycle per entry.
-          uint32_t frequency_mhz = entry.core_frequency_khz / 1000;
-          base::TimeDelta& reported_time =
-              reported_time_in_state_[std::make_tuple(
-                  entry.core_type, entry.cluster_core_index, frequency_mhz)];
-          base::TimeDelta time_delta =
-              entry.cumulative_cpu_time - reported_time;
-          reported_time = entry.cumulative_cpu_time;
-
-          if (time_delta > base::TimeDelta()) {
-            // Scale the process's cpu time by each cluster/frequency pair's
-            // relative proportion of execution time. We scale by a double value
-            // to avoid integer overflow in the presence of large time_delta values.
-            uint64_t delta_us = process_cpu_time_delta.InMicroseconds() *
-                                (time_delta.InMicroseconds() /
-                                 static_cast<double>(total_delta.InMicroseconds()));
-            reporter->AddMicroseconds(frequency_mhz, delta_us);
-          }
-        }
-      }
-    }
+    CollectAndReportApproxTimeInState(process_cpu_time_delta);

    // Also report a breakdown by thread type.
    base::TimeDelta unattributed_delta = process_cpu_time_delta;
@@ -506,6 +468,13 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
    collection_in_progress_.store(false, std::memory_order_relaxed);
  }

+  void WaitForCollectionForTesting() const {
+    base::RunLoop run_loop;
+    // Post the QuitClosure to execute after any pending collection.
+    task_runner_->PostTask(FROM_HERE, run_loop.QuitClosure());
+    run_loop.Run();
+  }
+
 private:
  using ClusterFrequency = std::tuple<base::CPU::CoreType,
                                      uint32_t /*cluster_core_index*/,
@@ -539,6 +508,170 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
    return GetThreadTypeFromName(name);
  }

+  void CollectAndReportApproxTimeInState(
+      base::TimeDelta process_cpu_time_delta) {
+    if (!base::CPU::GetTimeInState(time_in_state_) || time_in_state_.empty() ||
+        !base::CPU::GetCumulativeCoreIdleTimes(core_idle_times_)) {
+      return;
+    }
+
+    if (core_idle_times_.size() > reported_core_idle_times_.size())
+      reported_core_idle_times_.resize(core_idle_times_.size());
+
+    // Compute the wall time delta since the last cycle, so that we can
+    // compute active times per core type below. cpuidle and time_in_state
+    // information tick with CLOCK_MONOTONIC, so we use base::TimeTicks (also
+    // CLOCK_MONOTONIC) as a reference here.
+    base::TimeTicks now = base::TimeTicks::Now();
+    base::TimeDelta wall_time_delta = now - last_time_in_state_walltime_;
+    last_time_in_state_walltime_ = now;
+
+    // Convert core_idle_times_ to delta values.
+    for (uint32_t core_index = 0; core_index < core_idle_times_.size();
+         ++core_index) {
+      base::TimeDelta absolute_idle_time = core_idle_times_[core_index];
+      core_idle_times_[core_index] -= reported_core_idle_times_[core_index];
+      reported_core_idle_times_[core_index] = absolute_idle_time;
+    }
+
+    // Compute total active time during this cycle.
+    base::TimeDelta total_active_time;
+    for (base::TimeDelta core_idle_time : core_idle_times_) {
+      base::TimeDelta active_time = wall_time_delta - core_idle_time;
+      if (active_time > base::TimeDelta())
+        total_active_time += active_time;
+    }
+
+    // Because time_in_state_ includes idle time and reports values for a
+    // single core of each cluster, we work out how much CPU time to attribute
+    // to each cluster and frequency through the following approximation:
+    // (1) For each cluster, we compute how much of the execution happened on
+    //     cores of the cluster vs. cores on other clusters
+    //     (current_cluster_proportion).
+    // (2) We assume that the time spent in idle will be mostly in the lower
+    //     frequency band of the cluster. For that reason, we subtract the
+    //     average idle time of a cluster's core from the cluster's first
+    //     entries (lowest frequencies) in time_in_state.
+    // (3) For the remaining frequencies, we calculate the proportion of
+    //     active time the cluster spent in each frequency
+    //     (frequency_proportion).
+    // (4) Finally, we split the process's CPU time across clusters and
+    //     frequencies based on current_cluster_proportion and
+    //     frequency_proportion.
+
+    uint32_t last_core_index = -1;
+    uint32_t next_core_index = -1;
+    // Idle time of the current cluster that hasn't been attributed to a
+    // specific frequency state yet, for (2).
+    base::TimeDelta current_cluster_unattributed_idle_wall_time;
+    // Average wall time the current cluster's cores were active for, for (3).
+    base::TimeDelta current_cluster_active_wall_time;
+    // Proportion of the current cluster's core's cumulative active time of
+    // the total active time across all cores, for (1).
+    double current_cluster_proportion = 0;
+
+    for (size_t state_index = 0; state_index < time_in_state_.size();
+         ++state_index) {
+      const base::CPU::TimeInStateEntry& entry = time_in_state_[state_index];
+      DCHECK_GT(approximate_time_in_state_reporters_.size(),
+                static_cast<size_t>(entry.core_type));
+      std::unique_ptr<TimeInStateReporter>& reporter =
+          approximate_time_in_state_reporters_[static_cast<size_t>(
+              entry.core_type)];
+      if (!reporter) {
+        reporter = std::make_unique<TimeInStateReporter>(
+            process_type_, entry.core_type, /*is_approximate=*/true);
+      }
+
+      // Compute delta since last cycle per entry.
+      uint32_t frequency_mhz = entry.core_frequency_khz / 1000;
+      base::TimeDelta& reported_time = reported_time_in_state_[std::make_tuple(
+          entry.core_type, entry.cluster_core_index, frequency_mhz)];
+      base::TimeDelta time_delta = entry.cumulative_time - reported_time;
+      reported_time = entry.cumulative_time;
+
+      // In the first cycle (wall_time_delta == now), we can't really trust
+      // active_time_per_core (because we don't know the absolute time
+      // domain of cpuidle values), so skip reporting.
+      bool first_cycle = wall_time_delta == (now - base::TimeTicks());
+
+      if (first_cycle || time_delta <= base::TimeDelta() ||
+          process_cpu_time_delta <= base::TimeDelta()) {
+        continue;
+      }
+
+      if (last_core_index != entry.cluster_core_index) {
+        // This is the first entry for a new cluster. Find the next
+        // cluster's first core and compute the cluster's active/idle wall
+        // time (3) and proportion of total execution time (1).
+        next_core_index = FindNextClusterCoreIndex(state_index);
+
+        base::TimeDelta cluster_active_time;
+        base::TimeDelta cluster_idle_time;
+        for (size_t core_index = entry.cluster_core_index;
+             core_index < next_core_index; ++core_index) {
+          cluster_idle_time += core_idle_times_[core_index];
+          cluster_active_time += wall_time_delta - core_idle_times_[core_index];
+        }
+
+        size_t num_cores = next_core_index - entry.cluster_core_index;
+        current_cluster_active_wall_time = cluster_active_time / num_cores;
+        current_cluster_unattributed_idle_wall_time =
+            cluster_idle_time / num_cores;
+
+        // (1) Proportion of execution on this cluster's cores vs others.
+        current_cluster_proportion = 0;
+        if (total_active_time > base::TimeDelta())
+          current_cluster_proportion = cluster_active_time / total_active_time;
+
+        last_core_index = entry.cluster_core_index;
+      }
+
+      // (2) Assign the cluster's idle wall time to the first entries, i.e.
+      // lowest frequencies.
+      if (time_delta < current_cluster_unattributed_idle_wall_time) {
+        // Attribute this frequency state entirely to idle time, skip it.
+        current_cluster_unattributed_idle_wall_time -= time_delta;
+        continue;
+      } else if (current_cluster_unattributed_idle_wall_time >
+                 base::TimeDelta()) {
+        time_delta -= current_cluster_unattributed_idle_wall_time;
+        current_cluster_unattributed_idle_wall_time = base::TimeDelta();
+      }
+
+      // (3) Proportion of active wall time that this cluster spent in the
+      // frequency state.
+      double frequency_proportion = 0;
+      if (current_cluster_active_wall_time > base::TimeDelta())
+        frequency_proportion = time_delta / current_cluster_active_wall_time;
+
+      // (4) Scale the process's cpu time by the cluster/frequency pair's
+      // relative proportion of execution time. Note that we calculate
+      // double values for the proportions above first to avoid integer
+      // overflow in the presence of large time_delta values.
+      uint64_t delta_us = process_cpu_time_delta.InMicroseconds() *
+                          frequency_proportion * current_cluster_proportion;
+
+      reporter->AddMicroseconds(frequency_mhz, delta_us);
+    }
+  }
+
+  // Returns the core index of the first core of the next cluster after the
+  // cluster of the given entry in |time_in_state_|. Returns max core_index + 1
+  // if no further clusters exist.
+  size_t FindNextClusterCoreIndex(size_t state_index) {
+    for (size_t next_state_index = state_index;
+         next_state_index < time_in_state_.size(); ++next_state_index) {
+      const auto& next_entry = time_in_state_[next_state_index];
+      if (next_entry.cluster_core_index !=
+          time_in_state_[state_index].cluster_core_index) {
+        return next_entry.cluster_core_index;
+      }
+    }
+    // No further clusters, return max core index + 1.
+    return core_idle_times_.size();
+  }
+
  // Sample CPU time after a certain number of main-thread task to balance
  // overhead of sampling and loss at process termination.
  static constexpr int kReportAfterEveryNTasksPersistentProcess = 500;
@@ -566,11 +699,13 @@ class ProcessCpuTimeTaskObserver : public base::TaskObserver {
      approximate_time_in_state_reporters_ = {};
  base::flat_map<ClusterFrequency, base::TimeDelta /*time_in_state*/>
      reported_time_in_state_;
-  base::TimeDelta total_reported_time_in_state_;
+  base::CPU::CoreIdleTimes reported_core_idle_times_;
+  base::TimeTicks last_time_in_state_walltime_;
  // Stored as instance variables to avoid allocation churn.
  base::ProcessMetrics::CPUUsagePerThread cumulative_thread_times_;
  base::ProcessMetrics::TimeInStatePerThread time_in_state_per_thread_;
  base::CPU::TimeInState time_in_state_;
+  base::CPU::CoreIdleTimes core_idle_times_;

  // Accessed on both sequences.
  std::atomic<bool> collection_in_progress_;
@@ -589,8 +724,12 @@ void SetupCpuTimeMetrics() {
 }

 void SampleCpuTimeMetricsForTesting() {
-  ProcessCpuTimeTaskObserver::GetInstance()
-      ->CollectAndReportCpuTimeOnThreadPool();
+  auto* instance = ProcessCpuTimeTaskObserver::GetInstance();
+  // Make sure no collection is currently in progress (this may happen if
+  // GetInstance() above initializes the task observer).
+  instance->WaitForCollectionForTesting();  // IN-TEST
+  instance->PostCollectionTask();
+  instance->WaitForCollectionForTesting();  // IN-TEST
 }

 }  // namespace content