Commit 939bda0c authored by Gabriel Marin's avatar Gabriel Marin Committed by Commit Bot

Collect cycles precise=3 where available.

Precise attribution at the instruction level helps to better understand hotspots
in the assembly view.

Updated a few unit tests to check for the right UMA metric being incremented.

BUG=b:168312716

Change-Id: I537395d5ec6714c75b4224765ebc247a6ba00ff3
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2424517
Reviewed-by: Gabriel Marin <gmx@chromium.org>
Reviewed-by: George Burgess <gbiv@chromium.org>
Commit-Queue: Gabriel Marin <gmx@chromium.org>
Cr-Commit-Position: refs/heads/master@{#809884}
parent f8b9de2b
...@@ -87,6 +87,12 @@ void ExtractVersionNumbers(const std::string& version, ...@@ -87,6 +87,12 @@ void ExtractVersionNumbers(const std::string& version,
bugfix_version); bugfix_version);
} }
// Returns whether a micro-architecture supports the cycles:ppp event.
bool MicroarchitectureHasCyclesPPPEvent(const std::string& uarch) {
  // Micro-architectures known to support precise=3 cycle sampling.
  static const char* const kCyclesPPPUarchs[] = {
      "Goldmont", "GoldmontPlus", "Broadwell", "Kabylake", "Tigerlake"};
  for (const char* const supported : kCyclesPPPUarchs) {
    if (uarch == supported)
      return true;
  }
  return false;
}
// Returns if a micro-architecture supports LBR callgraph profiling. // Returns if a micro-architecture supports LBR callgraph profiling.
bool MicroarchitectureHasLBRCallgraph(const std::string& uarch) { bool MicroarchitectureHasLBRCallgraph(const std::string& uarch) {
return uarch == "Haswell" || uarch == "Broadwell" || uarch == "Skylake" || return uarch == "Haswell" || uarch == "Broadwell" || uarch == "Skylake" ||
...@@ -103,6 +109,16 @@ bool KernelReleaseHasLBRCallgraph(const std::string& release) { ...@@ -103,6 +109,16 @@ bool KernelReleaseHasLBRCallgraph(const std::string& release) {
// Hopefully we never need a space in a command argument. // Hopefully we never need a space in a command argument.
const char kPerfCommandDelimiter[] = " "; const char kPerfCommandDelimiter[] = " ";
// Collect precise=3 (:ppp) cycle events on microarchitectures that support it.
// Flat (no callgraph) precise cycles profile; -c gives the sampling period.
const char kPerfCyclesPPPCmd[] = "perf record -a -e cycles:ppp -c 1000003";
// Precise cycles with frame-pointer (-g) callchains.
const char kPerfFPCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -g -c 4000037";
// Precise cycles with LBR-based callchains.
const char kPerfLBRCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -c 4000037 --call-graph lbr";
// Collect default (imprecise) cycle events everywhere else.
const char kPerfCyclesCmd[] = "perf record -a -e cycles -c 1000003"; const char kPerfCyclesCmd[] = "perf record -a -e cycles -c 1000003";
const char kPerfFPCallgraphCmd[] = "perf record -a -e cycles -g -c 4000037"; const char kPerfFPCallgraphCmd[] = "perf record -a -e cycles -g -c 4000037";
...@@ -156,6 +172,9 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64( ...@@ -156,6 +172,9 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
const char* itlb_miss_cycles_cmd = kPerfITLBMissCyclesCmdIvyBridge; const char* itlb_miss_cycles_cmd = kPerfITLBMissCyclesCmdIvyBridge;
const char* dtlb_miss_cycles_cmd = kPerfDTLBMissCyclesCmdIvyBridge; const char* dtlb_miss_cycles_cmd = kPerfDTLBMissCyclesCmdIvyBridge;
const char* lbr_cmd = kPerfLBRCmd; const char* lbr_cmd = kPerfLBRCmd;
const char* cycles_cmd = kPerfCyclesCmd;
const char* fp_callgraph_cmd = kPerfFPCallgraphCmd;
const char* lbr_callgraph_cmd = kPerfLBRCallgraphCmd;
if (cpu_uarch == "Skylake" || cpu_uarch == "Kabylake" || if (cpu_uarch == "Skylake" || cpu_uarch == "Kabylake" ||
cpu_uarch == "Tigerlake" || cpu_uarch == "GoldmontPlus") { cpu_uarch == "Tigerlake" || cpu_uarch == "GoldmontPlus") {
...@@ -171,6 +190,28 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64( ...@@ -171,6 +190,28 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
cpu_uarch == "Goldmont" || cpu_uarch == "GoldmontPlus") { cpu_uarch == "Goldmont" || cpu_uarch == "GoldmontPlus") {
lbr_cmd = kPerfLBRCmdAtom; lbr_cmd = kPerfLBRCmdAtom;
} }
if (MicroarchitectureHasCyclesPPPEvent(cpu_uarch)) {
cycles_cmd = kPerfCyclesPPPCmd;
fp_callgraph_cmd = kPerfFPCallgraphPPPCmd;
lbr_callgraph_cmd = kPerfLBRCallgraphPPPCmd;
}
cmds.emplace_back(WeightAndValue(50.0, cycles_cmd));
// Haswell and newer big Intel cores support LBR callstack profiling. This
// requires kernel support, which was added in kernel 4.4, and it was
// backported to kernel 3.18. Collect LBR callstack profiling where
// supported in addition to FP callchains. The former works with binaries
// compiled with frame pointers disabled, but it only captures callchains
// after profiling is enabled, so it's likely missing the lower frames of
// the callstack.
if (MicroarchitectureHasLBRCallgraph(cpu_uarch) &&
KernelReleaseHasLBRCallgraph(cpuid.release)) {
cmds.emplace_back(WeightAndValue(10.0, fp_callgraph_cmd));
cmds.emplace_back(WeightAndValue(10.0, lbr_callgraph_cmd));
} else {
cmds.emplace_back(WeightAndValue(20.0, fp_callgraph_cmd));
}
if (cpu_uarch == "IvyBridge" || cpu_uarch == "Haswell" || if (cpu_uarch == "IvyBridge" || cpu_uarch == "Haswell" ||
cpu_uarch == "Broadwell" || cpu_uarch == "SandyBridge" || cpu_uarch == "Broadwell" || cpu_uarch == "SandyBridge" ||
...@@ -178,30 +219,15 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64( ...@@ -178,30 +219,15 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
cpu_uarch == "Tigerlake" || cpu_uarch == "Silvermont" || cpu_uarch == "Tigerlake" || cpu_uarch == "Silvermont" ||
cpu_uarch == "Airmont" || cpu_uarch == "Goldmont" || cpu_uarch == "Airmont" || cpu_uarch == "Goldmont" ||
cpu_uarch == "GoldmontPlus") { cpu_uarch == "GoldmontPlus") {
cmds.push_back(WeightAndValue(50.0, kPerfCyclesCmd)); cmds.emplace_back(WeightAndValue(15.0, lbr_cmd));
// Haswell and newer big Intel cores support LBR callstack profiling. This cmds.emplace_back(WeightAndValue(5.0, itlb_miss_cycles_cmd));
// requires kernel support, which was added in kernel 4.4, and it was cmds.emplace_back(WeightAndValue(5.0, dtlb_miss_cycles_cmd));
// backported to kernel 3.18. Collect LBR callstack profiling where
// supported in addition to FP callchains. The former works with binaries
// compiled with frame pointers disabled, but it only captures callchains
// after profiling is enabled, so it's likely missing the lower frames of
// the callstack.
if (MicroarchitectureHasLBRCallgraph(cpu_uarch) &&
KernelReleaseHasLBRCallgraph(cpuid.release)) {
cmds.push_back(WeightAndValue(10.0, kPerfFPCallgraphCmd));
cmds.push_back(WeightAndValue(10.0, kPerfLBRCallgraphCmd));
} else {
cmds.push_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
}
cmds.push_back(WeightAndValue(15.0, lbr_cmd));
cmds.push_back(WeightAndValue(5.0, itlb_miss_cycles_cmd));
cmds.push_back(WeightAndValue(5.0, dtlb_miss_cycles_cmd));
// Only Goldmont and GoldmontPlus support precise events on last level cache // Only Goldmont and GoldmontPlus support precise events on last level cache
// misses. // misses.
if (cpu_uarch == "Goldmont" || cpu_uarch == "GoldmontPlus") { if (cpu_uarch == "Goldmont" || cpu_uarch == "GoldmontPlus") {
cmds.push_back(WeightAndValue(5.0, kPerfLLCMissesPreciseCmd)); cmds.emplace_back(WeightAndValue(5.0, kPerfLLCMissesPreciseCmd));
} else { } else {
cmds.push_back(WeightAndValue(5.0, kPerfLLCMissesCmd)); cmds.emplace_back(WeightAndValue(5.0, kPerfLLCMissesCmd));
} }
return cmds; return cmds;
} }
...@@ -209,12 +235,11 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64( ...@@ -209,12 +235,11 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
// non-Intel CPUs such as AMD, since the event code provided for LLC is // non-Intel CPUs such as AMD, since the event code provided for LLC is
// Intel specific. // Intel specific.
if (cpuid.vendor=="GenuineIntel"){ if (cpuid.vendor=="GenuineIntel"){
cmds.push_back(WeightAndValue(75.0, kPerfCyclesCmd)); cmds.emplace_back(WeightAndValue(25.0, cycles_cmd));
cmds.push_back(WeightAndValue(5.0, kPerfLLCMissesCmd)); cmds.emplace_back(WeightAndValue(5.0, kPerfLLCMissesCmd));
} else { } else {
cmds.push_back(WeightAndValue(80.0, kPerfCyclesCmd)); cmds.emplace_back(WeightAndValue(30.0, cycles_cmd));
} }
cmds.push_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
return cmds; return cmds;
} }
...@@ -246,13 +271,13 @@ std::vector<RandomSelector::WeightAndValue> GetDefaultCommandsForCpu( ...@@ -246,13 +271,13 @@ std::vector<RandomSelector::WeightAndValue> GetDefaultCommandsForCpu(
if (cpuid.arch == "x86" || // 32-bit x86, or... if (cpuid.arch == "x86" || // 32-bit x86, or...
cpuid.arch == "armv7l" || // ARM32 cpuid.arch == "armv7l" || // ARM32
cpuid.arch == "aarch64") { // ARM64 cpuid.arch == "aarch64") { // ARM64
cmds.push_back(WeightAndValue(80.0, kPerfCyclesCmd)); cmds.emplace_back(WeightAndValue(80.0, kPerfCyclesCmd));
cmds.push_back(WeightAndValue(20.0, kPerfFPCallgraphCmd)); cmds.emplace_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
return cmds; return cmds;
} }
// Unknown CPUs // Unknown CPUs
cmds.push_back(WeightAndValue(1.0, kPerfCyclesCmd)); cmds.emplace_back(WeightAndValue(1.0, kPerfCyclesCmd));
return cmds; return cmds;
} }
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "base/metrics/field_trial.h" #include "base/metrics/field_trial.h"
#include "base/task/post_task.h" #include "base/task/post_task.h"
#include "base/test/bind_test_util.h" #include "base/test/bind_test_util.h"
#include "base/test/metrics/histogram_tester.h"
#include "chrome/browser/metrics/perf/cpu_identity.h" #include "chrome/browser/metrics/perf/cpu_identity.h"
#include "chrome/browser/metrics/perf/windowed_incognito_observer.h" #include "chrome/browser/metrics/perf/windowed_incognito_observer.h"
#include "components/variations/variations_associated_data.h" #include "components/variations/variations_associated_data.h"
...@@ -32,6 +33,11 @@ const char kPerfCyclesCmd[] = "perf record -a -e cycles -c 1000003"; ...@@ -32,6 +33,11 @@ const char kPerfCyclesCmd[] = "perf record -a -e cycles -c 1000003";
const char kPerfFPCallgraphCmd[] = "perf record -a -e cycles -g -c 4000037"; const char kPerfFPCallgraphCmd[] = "perf record -a -e cycles -g -c 4000037";
const char kPerfLBRCallgraphCmd[] = const char kPerfLBRCallgraphCmd[] =
"perf record -a -e cycles -c 4000037 --call-graph lbr"; "perf record -a -e cycles -c 4000037 --call-graph lbr";
// Precise (:ppp) variants of the cycle-event commands, mirrored here so the
// tests can compare against the exact strings the collector emits.
const char kPerfCyclesPPPCmd[] = "perf record -a -e cycles:ppp -c 1000003";
const char kPerfFPCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -g -c 4000037";
const char kPerfLBRCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -c 4000037 --call-graph lbr";
const char kPerfLBRCmd[] = "perf record -a -e r20c4 -b -c 200011"; const char kPerfLBRCmd[] = "perf record -a -e r20c4 -b -c 200011";
const char kPerfLBRCmdAtom[] = "perf record -a -e rc4 -b -c 300001"; const char kPerfLBRCmdAtom[] = "perf record -a -e rc4 -b -c 300001";
const char kPerfITLBMissCyclesCmdIvyBridge[] = const char kPerfITLBMissCyclesCmdIvyBridge[] =
...@@ -150,6 +156,7 @@ class TestPerfCollector : public PerfCollector { ...@@ -150,6 +156,7 @@ class TestPerfCollector : public PerfCollector {
public: public:
TestPerfCollector() = default; TestPerfCollector() = default;
using MetricCollector::CollectionAttemptStatus;
using MetricCollector::CollectPerfDataAfterSessionRestore; using MetricCollector::CollectPerfDataAfterSessionRestore;
using MetricCollector::OnJankStarted; using MetricCollector::OnJankStarted;
using MetricCollector::OnJankStopped; using MetricCollector::OnJankStopped;
...@@ -264,11 +271,15 @@ TEST_F(PerfCollectorTest, NoCollectionWhenProfileCacheFull) { ...@@ -264,11 +271,15 @@ TEST_F(PerfCollectorTest, NoCollectionWhenProfileCacheFull) {
EXPECT_TRUE(perf_collector_->IsRunning()); EXPECT_TRUE(perf_collector_->IsRunning());
// Pretend the cache is full. // Pretend the cache is full.
perf_collector_->AddCachedDataDelta(4 * 1024 * 1024); perf_collector_->AddCachedDataDelta(4 * 1024 * 1024);
base::HistogramTester histogram_tester;
// Advance the clock by a periodic collection interval. We shouldn't find a // Advance the clock by a periodic collection interval. We shouldn't find a
// profile because the cache is full. // profile because the cache is full.
task_environment_.FastForwardBy(kPeriodicCollectionInterval); task_environment_.FastForwardBy(kPeriodicCollectionInterval);
EXPECT_TRUE(cached_profile_data_.empty()); EXPECT_TRUE(cached_profile_data_.empty());
histogram_tester.ExpectUniqueSample(
"ChromeOS.CWP.CollectPerf",
TestPerfCollector::CollectionAttemptStatus::NOT_READY_TO_COLLECT, 1);
} }
// Simulate opening and closing of incognito window in between calls to // Simulate opening and closing of incognito window in between calls to
...@@ -304,6 +315,7 @@ TEST_F(PerfCollectorTest, IncognitoWindowOpened) { ...@@ -304,6 +315,7 @@ TEST_F(PerfCollectorTest, IncognitoWindowOpened) {
EXPECT_GT(profile1.cpu_max_frequency_mhz_size(), 0); EXPECT_GT(profile1.cpu_max_frequency_mhz_size(), 0);
cached_profile_data_.clear(); cached_profile_data_.clear();
base::HistogramTester histogram_tester;
sampled_profile = std::make_unique<SampledProfile>(); sampled_profile = std::make_unique<SampledProfile>();
sampled_profile->set_trigger_event(SampledProfile::RESUME_FROM_SUSPEND); sampled_profile->set_trigger_event(SampledProfile::RESUME_FROM_SUSPEND);
// An incognito window opens. // An incognito window opens.
...@@ -314,6 +326,9 @@ TEST_F(PerfCollectorTest, IncognitoWindowOpened) { ...@@ -314,6 +326,9 @@ TEST_F(PerfCollectorTest, IncognitoWindowOpened) {
task_environment_.RunUntilIdle(); task_environment_.RunUntilIdle();
EXPECT_TRUE(cached_profile_data_.empty()); EXPECT_TRUE(cached_profile_data_.empty());
histogram_tester.ExpectUniqueSample(
"ChromeOS.CWP.CollectPerf",
TestPerfCollector::CollectionAttemptStatus::INCOGNITO_LAUNCHED, 1);
sampled_profile = std::make_unique<SampledProfile>(); sampled_profile = std::make_unique<SampledProfile>();
sampled_profile->set_trigger_event(SampledProfile::RESUME_FROM_SUSPEND); sampled_profile->set_trigger_event(SampledProfile::RESUME_FROM_SUSPEND);
...@@ -483,10 +498,10 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_Tigerlake) { ...@@ -483,10 +498,10 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_Tigerlake) {
std::vector<RandomSelector::WeightAndValue> cmds = std::vector<RandomSelector::WeightAndValue> cmds =
internal::GetDefaultCommandsForCpu(cpuid); internal::GetDefaultCommandsForCpu(cpuid);
ASSERT_GE(cmds.size(), 3UL); ASSERT_GE(cmds.size(), 3UL);
EXPECT_EQ(cmds[0].value, kPerfCyclesCmd); EXPECT_EQ(cmds[0].value, kPerfCyclesPPPCmd);
// We have both FP and LBR based callstacks. // We have both FP and LBR based callstacks.
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphCmd); EXPECT_EQ(cmds[1].value, kPerfFPCallgraphPPPCmd);
EXPECT_EQ(cmds[2].value, kPerfLBRCallgraphCmd); EXPECT_EQ(cmds[2].value, kPerfLBRCallgraphPPPCmd);
auto found = auto found =
std::find_if(cmds.begin(), cmds.end(), std::find_if(cmds.begin(), cmds.end(),
[](const RandomSelector::WeightAndValue& cmd) -> bool { [](const RandomSelector::WeightAndValue& cmd) -> bool {
...@@ -516,13 +531,13 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_Goldmont) { ...@@ -516,13 +531,13 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_Goldmont) {
std::vector<RandomSelector::WeightAndValue> cmds = std::vector<RandomSelector::WeightAndValue> cmds =
internal::GetDefaultCommandsForCpu(cpuid); internal::GetDefaultCommandsForCpu(cpuid);
ASSERT_GE(cmds.size(), 2UL); ASSERT_GE(cmds.size(), 2UL);
EXPECT_EQ(cmds[0].value, kPerfCyclesCmd); EXPECT_EQ(cmds[0].value, kPerfCyclesPPPCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphCmd); EXPECT_EQ(cmds[1].value, kPerfFPCallgraphPPPCmd);
// No LBR callstacks because the microarchitecture doesn't support it. // No LBR callstacks because the microarchitecture doesn't support it.
auto found = auto found =
std::find_if(cmds.begin(), cmds.end(), std::find_if(cmds.begin(), cmds.end(),
[](const RandomSelector::WeightAndValue& cmd) -> bool { [](const RandomSelector::WeightAndValue& cmd) -> bool {
return cmd.value == kPerfLBRCallgraphCmd; return cmd.value == kPerfLBRCallgraphPPPCmd;
}); });
EXPECT_EQ(cmds.end(), found); EXPECT_EQ(cmds.end(), found);
found = std::find_if(cmds.begin(), cmds.end(), found = std::find_if(cmds.begin(), cmds.end(),
...@@ -553,13 +568,13 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_GoldmontPlus) { ...@@ -553,13 +568,13 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_GoldmontPlus) {
std::vector<RandomSelector::WeightAndValue> cmds = std::vector<RandomSelector::WeightAndValue> cmds =
internal::GetDefaultCommandsForCpu(cpuid); internal::GetDefaultCommandsForCpu(cpuid);
ASSERT_GE(cmds.size(), 2UL); ASSERT_GE(cmds.size(), 2UL);
EXPECT_EQ(cmds[0].value, kPerfCyclesCmd); EXPECT_EQ(cmds[0].value, kPerfCyclesPPPCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphCmd); EXPECT_EQ(cmds[1].value, kPerfFPCallgraphPPPCmd);
// No LBR callstacks because the microarchitecture doesn't support it. // No LBR callstacks because the microarchitecture doesn't support it.
auto found = auto found =
std::find_if(cmds.begin(), cmds.end(), std::find_if(cmds.begin(), cmds.end(),
[](const RandomSelector::WeightAndValue& cmd) -> bool { [](const RandomSelector::WeightAndValue& cmd) -> bool {
return cmd.value == kPerfLBRCallgraphCmd; return cmd.value == kPerfLBRCallgraphPPPCmd;
}); });
EXPECT_EQ(cmds.end(), found); EXPECT_EQ(cmds.end(), found);
found = std::find_if(cmds.begin(), cmds.end(), found = std::find_if(cmds.begin(), cmds.end(),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment