Commit 939bda0c authored by Gabriel Marin's avatar Gabriel Marin Committed by Commit Bot

Collect cycles precise=3 where available.

Precise attribution at instruction level helps to better understand hotspots
in assembly view.

Updated a few unit tests to check for the right UMA metric being incremented.

BUG=b:168312716

Change-Id: I537395d5ec6714c75b4224765ebc247a6ba00ff3
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2424517
Reviewed-by: Gabriel Marin <gmx@chromium.org>
Reviewed-by: George Burgess <gbiv@chromium.org>
Commit-Queue: Gabriel Marin <gmx@chromium.org>
Cr-Commit-Position: refs/heads/master@{#809884}
parent f8b9de2b
......@@ -87,6 +87,12 @@ void ExtractVersionNumbers(const std::string& version,
bugfix_version);
}
// Returns true when the given Intel micro-architecture supports sampling the
// cycles event with the highest precision modifier (cycles:ppp, precise=3).
bool MicroarchitectureHasCyclesPPPEvent(const std::string& uarch) {
  if (uarch == "Broadwell" || uarch == "Goldmont")
    return true;
  if (uarch == "GoldmontPlus" || uarch == "Kabylake")
    return true;
  return uarch == "Tigerlake";
}
// Returns if a micro-architecture supports LBR callgraph profiling.
bool MicroarchitectureHasLBRCallgraph(const std::string& uarch) {
return uarch == "Haswell" || uarch == "Broadwell" || uarch == "Skylake" ||
......@@ -103,6 +109,16 @@ bool KernelReleaseHasLBRCallgraph(const std::string& release) {
// Hopefully we never need a space in a command argument.
const char kPerfCommandDelimiter[] = " ";
// Collect precise=3 (:ppp) cycle events on microarchitectures that support it.
// Flat cycles profile, system-wide (-a), sampled every 1000003 cycles.
const char kPerfCyclesPPPCmd[] = "perf record -a -e cycles:ppp -c 1000003";
// Precise cycles with frame-pointer callchains (-g), sampled every 4000037
// cycles. Callgraph commands use a longer period than the flat profile.
const char kPerfFPCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -g -c 4000037";
// Precise cycles with LBR-assisted callchains; chosen only where the uarch
// and kernel support LBR callgraph profiling (see the *HasLBRCallgraph
// predicates above).
const char kPerfLBRCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -c 4000037 --call-graph lbr";
// Collect default (imprecise) cycle events everywhere else.
const char kPerfCyclesCmd[] = "perf record -a -e cycles -c 1000003";
const char kPerfFPCallgraphCmd[] = "perf record -a -e cycles -g -c 4000037";
......@@ -156,6 +172,9 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
const char* itlb_miss_cycles_cmd = kPerfITLBMissCyclesCmdIvyBridge;
const char* dtlb_miss_cycles_cmd = kPerfDTLBMissCyclesCmdIvyBridge;
const char* lbr_cmd = kPerfLBRCmd;
const char* cycles_cmd = kPerfCyclesCmd;
const char* fp_callgraph_cmd = kPerfFPCallgraphCmd;
const char* lbr_callgraph_cmd = kPerfLBRCallgraphCmd;
if (cpu_uarch == "Skylake" || cpu_uarch == "Kabylake" ||
cpu_uarch == "Tigerlake" || cpu_uarch == "GoldmontPlus") {
......@@ -171,6 +190,28 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
cpu_uarch == "Goldmont" || cpu_uarch == "GoldmontPlus") {
lbr_cmd = kPerfLBRCmdAtom;
}
if (MicroarchitectureHasCyclesPPPEvent(cpu_uarch)) {
cycles_cmd = kPerfCyclesPPPCmd;
fp_callgraph_cmd = kPerfFPCallgraphPPPCmd;
lbr_callgraph_cmd = kPerfLBRCallgraphPPPCmd;
}
cmds.emplace_back(WeightAndValue(50.0, cycles_cmd));
// Haswell and newer big Intel cores support LBR callstack profiling. This
// requires kernel support, which was added in kernel 4.4, and it was
// backported to kernel 3.18. Collect LBR callstack profiling where
// supported in addition to FP callchains. The former works with binaries
// compiled with frame pointers disabled, but it only captures callchains
// after profiling is enabled, so it's likely missing the lower frames of
// the callstack.
if (MicroarchitectureHasLBRCallgraph(cpu_uarch) &&
KernelReleaseHasLBRCallgraph(cpuid.release)) {
cmds.emplace_back(WeightAndValue(10.0, fp_callgraph_cmd));
cmds.emplace_back(WeightAndValue(10.0, lbr_callgraph_cmd));
} else {
cmds.emplace_back(WeightAndValue(20.0, fp_callgraph_cmd));
}
if (cpu_uarch == "IvyBridge" || cpu_uarch == "Haswell" ||
cpu_uarch == "Broadwell" || cpu_uarch == "SandyBridge" ||
......@@ -178,30 +219,15 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
cpu_uarch == "Tigerlake" || cpu_uarch == "Silvermont" ||
cpu_uarch == "Airmont" || cpu_uarch == "Goldmont" ||
cpu_uarch == "GoldmontPlus") {
cmds.push_back(WeightAndValue(50.0, kPerfCyclesCmd));
// Haswell and newer big Intel cores support LBR callstack profiling. This
// requires kernel support, which was added in kernel 4.4, and it was
// backported to kernel 3.18. Collect LBR callstack profiling where
// supported in addition to FP callchains. The former works with binaries
// compiled with frame pointers disabled, but it only captures callchains
// after profiling is enabled, so it's likely missing the lower frames of
// the callstack.
if (MicroarchitectureHasLBRCallgraph(cpu_uarch) &&
KernelReleaseHasLBRCallgraph(cpuid.release)) {
cmds.push_back(WeightAndValue(10.0, kPerfFPCallgraphCmd));
cmds.push_back(WeightAndValue(10.0, kPerfLBRCallgraphCmd));
} else {
cmds.push_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
}
cmds.push_back(WeightAndValue(15.0, lbr_cmd));
cmds.push_back(WeightAndValue(5.0, itlb_miss_cycles_cmd));
cmds.push_back(WeightAndValue(5.0, dtlb_miss_cycles_cmd));
cmds.emplace_back(WeightAndValue(15.0, lbr_cmd));
cmds.emplace_back(WeightAndValue(5.0, itlb_miss_cycles_cmd));
cmds.emplace_back(WeightAndValue(5.0, dtlb_miss_cycles_cmd));
// Only Goldmont and GoldmontPlus support precise events on last level cache
// misses.
if (cpu_uarch == "Goldmont" || cpu_uarch == "GoldmontPlus") {
cmds.push_back(WeightAndValue(5.0, kPerfLLCMissesPreciseCmd));
cmds.emplace_back(WeightAndValue(5.0, kPerfLLCMissesPreciseCmd));
} else {
cmds.push_back(WeightAndValue(5.0, kPerfLLCMissesCmd));
cmds.emplace_back(WeightAndValue(5.0, kPerfLLCMissesCmd));
}
return cmds;
}
......@@ -209,12 +235,11 @@ const std::vector<RandomSelector::WeightAndValue> GetDefaultCommands_x86_64(
// non-Intel CPUs such as AMD, since the event code provided for LLC is
// Intel specific.
if (cpuid.vendor=="GenuineIntel"){
cmds.push_back(WeightAndValue(75.0, kPerfCyclesCmd));
cmds.push_back(WeightAndValue(5.0, kPerfLLCMissesCmd));
cmds.emplace_back(WeightAndValue(25.0, cycles_cmd));
cmds.emplace_back(WeightAndValue(5.0, kPerfLLCMissesCmd));
} else {
cmds.push_back(WeightAndValue(80.0, kPerfCyclesCmd));
cmds.emplace_back(WeightAndValue(30.0, cycles_cmd));
}
cmds.push_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
return cmds;
}
......@@ -246,13 +271,13 @@ std::vector<RandomSelector::WeightAndValue> GetDefaultCommandsForCpu(
if (cpuid.arch == "x86" || // 32-bit x86, or...
cpuid.arch == "armv7l" || // ARM32
cpuid.arch == "aarch64") { // ARM64
cmds.push_back(WeightAndValue(80.0, kPerfCyclesCmd));
cmds.push_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
cmds.emplace_back(WeightAndValue(80.0, kPerfCyclesCmd));
cmds.emplace_back(WeightAndValue(20.0, kPerfFPCallgraphCmd));
return cmds;
}
// Unknown CPUs
cmds.push_back(WeightAndValue(1.0, kPerfCyclesCmd));
cmds.emplace_back(WeightAndValue(1.0, kPerfCyclesCmd));
return cmds;
}
......
......@@ -16,6 +16,7 @@
#include "base/metrics/field_trial.h"
#include "base/task/post_task.h"
#include "base/test/bind_test_util.h"
#include "base/test/metrics/histogram_tester.h"
#include "chrome/browser/metrics/perf/cpu_identity.h"
#include "chrome/browser/metrics/perf/windowed_incognito_observer.h"
#include "components/variations/variations_associated_data.h"
......@@ -32,6 +33,11 @@ const char kPerfCyclesCmd[] = "perf record -a -e cycles -c 1000003";
const char kPerfFPCallgraphCmd[] = "perf record -a -e cycles -g -c 4000037";
const char kPerfLBRCallgraphCmd[] =
"perf record -a -e cycles -c 4000037 --call-graph lbr";
const char kPerfCyclesPPPCmd[] = "perf record -a -e cycles:ppp -c 1000003";
const char kPerfFPCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -g -c 4000037";
const char kPerfLBRCallgraphPPPCmd[] =
"perf record -a -e cycles:ppp -c 4000037 --call-graph lbr";
const char kPerfLBRCmd[] = "perf record -a -e r20c4 -b -c 200011";
const char kPerfLBRCmdAtom[] = "perf record -a -e rc4 -b -c 300001";
const char kPerfITLBMissCyclesCmdIvyBridge[] =
......@@ -150,6 +156,7 @@ class TestPerfCollector : public PerfCollector {
public:
TestPerfCollector() = default;
using MetricCollector::CollectionAttemptStatus;
using MetricCollector::CollectPerfDataAfterSessionRestore;
using MetricCollector::OnJankStarted;
using MetricCollector::OnJankStopped;
......@@ -264,11 +271,15 @@ TEST_F(PerfCollectorTest, NoCollectionWhenProfileCacheFull) {
EXPECT_TRUE(perf_collector_->IsRunning());
// Pretend the cache is full.
perf_collector_->AddCachedDataDelta(4 * 1024 * 1024);
base::HistogramTester histogram_tester;
// Advance the clock by a periodic collection interval. We shouldn't find a
// profile because the cache is full.
task_environment_.FastForwardBy(kPeriodicCollectionInterval);
EXPECT_TRUE(cached_profile_data_.empty());
histogram_tester.ExpectUniqueSample(
"ChromeOS.CWP.CollectPerf",
TestPerfCollector::CollectionAttemptStatus::NOT_READY_TO_COLLECT, 1);
}
// Simulate opening and closing of incognito window in between calls to
......@@ -304,6 +315,7 @@ TEST_F(PerfCollectorTest, IncognitoWindowOpened) {
EXPECT_GT(profile1.cpu_max_frequency_mhz_size(), 0);
cached_profile_data_.clear();
base::HistogramTester histogram_tester;
sampled_profile = std::make_unique<SampledProfile>();
sampled_profile->set_trigger_event(SampledProfile::RESUME_FROM_SUSPEND);
// An incognito window opens.
......@@ -314,6 +326,9 @@ TEST_F(PerfCollectorTest, IncognitoWindowOpened) {
task_environment_.RunUntilIdle();
EXPECT_TRUE(cached_profile_data_.empty());
histogram_tester.ExpectUniqueSample(
"ChromeOS.CWP.CollectPerf",
TestPerfCollector::CollectionAttemptStatus::INCOGNITO_LAUNCHED, 1);
sampled_profile = std::make_unique<SampledProfile>();
sampled_profile->set_trigger_event(SampledProfile::RESUME_FROM_SUSPEND);
......@@ -483,10 +498,10 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_Tigerlake) {
std::vector<RandomSelector::WeightAndValue> cmds =
internal::GetDefaultCommandsForCpu(cpuid);
ASSERT_GE(cmds.size(), 3UL);
EXPECT_EQ(cmds[0].value, kPerfCyclesCmd);
EXPECT_EQ(cmds[0].value, kPerfCyclesPPPCmd);
// We have both FP and LBR based callstacks.
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphCmd);
EXPECT_EQ(cmds[2].value, kPerfLBRCallgraphCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphPPPCmd);
EXPECT_EQ(cmds[2].value, kPerfLBRCallgraphPPPCmd);
auto found =
std::find_if(cmds.begin(), cmds.end(),
[](const RandomSelector::WeightAndValue& cmd) -> bool {
......@@ -516,13 +531,13 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_Goldmont) {
std::vector<RandomSelector::WeightAndValue> cmds =
internal::GetDefaultCommandsForCpu(cpuid);
ASSERT_GE(cmds.size(), 2UL);
EXPECT_EQ(cmds[0].value, kPerfCyclesCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphCmd);
EXPECT_EQ(cmds[0].value, kPerfCyclesPPPCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphPPPCmd);
// No LBR callstacks because the microarchitecture doesn't support it.
auto found =
std::find_if(cmds.begin(), cmds.end(),
[](const RandomSelector::WeightAndValue& cmd) -> bool {
return cmd.value == kPerfLBRCallgraphCmd;
return cmd.value == kPerfLBRCallgraphPPPCmd;
});
EXPECT_EQ(cmds.end(), found);
found = std::find_if(cmds.begin(), cmds.end(),
......@@ -553,13 +568,13 @@ TEST_F(PerfCollectorTest, DefaultCommandsBasedOnUarch_GoldmontPlus) {
std::vector<RandomSelector::WeightAndValue> cmds =
internal::GetDefaultCommandsForCpu(cpuid);
ASSERT_GE(cmds.size(), 2UL);
EXPECT_EQ(cmds[0].value, kPerfCyclesCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphCmd);
EXPECT_EQ(cmds[0].value, kPerfCyclesPPPCmd);
EXPECT_EQ(cmds[1].value, kPerfFPCallgraphPPPCmd);
// No LBR callstacks because the microarchitecture doesn't support it.
auto found =
std::find_if(cmds.begin(), cmds.end(),
[](const RandomSelector::WeightAndValue& cmd) -> bool {
return cmd.value == kPerfLBRCallgraphCmd;
return cmd.value == kPerfLBRCallgraphPPPCmd;
});
EXPECT_EQ(cmds.end(), found);
found = std::find_if(cmds.begin(), cmds.end(),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment