Commit 21aa90b6 authored by Olivier Li's avatar Olivier Li Committed by Commit Bot

Introduce the HangWatcher.NumberOfHungThreadsDuringWatchWindow metric.

Bug: 1135528
Change-Id: I5d1a406854e5b6e7ef7a6c2421198c1358a2497c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2499423Reviewed-by: default avatarIlya Sherman <isherman@chromium.org>
Reviewed-by: default avatarWeilun Shi <sweilun@chromium.org>
Reviewed-by: default avatarFrançois Doray <fdoray@chromium.org>
Commit-Queue: Oliver Li <olivierli@chromium.org>
Cr-Commit-Position: refs/heads/master@{#825465}
parent e292898e
......@@ -9,6 +9,7 @@
#include "base/bind.h"
#include "base/callback_helpers.h"
#include "base/containers/flat_map.h"
#include "base/debug/alias.h"
#include "base/debug/crash_logging.h"
#include "base/debug/dump_without_crashing.h"
......@@ -48,6 +49,35 @@ std::atomic<bool> g_use_hang_watcher{false};
std::atomic<bool> g_hang_watch_workers{false};
std::atomic<bool> g_hang_watch_io_thread{false};
std::atomic<bool> g_hang_watch_ui_thread{false};
// Emits the hung thread count histogram. |count| is the number of threads
// of type |thread_type| that were hung or became hung during the last
// monitoring window. This function should be invoked for each thread type
// encountered on each call to Monitor().
void LogHungThreadCountHistogram(HangWatcher::ThreadType thread_type,
int count) {
constexpr int kMaxHungThreadCount = 100;
switch (thread_type) {
case HangWatcher::ThreadType::kIOThread:
UMA_HISTOGRAM_EXACT_LINEAR(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
"IOThread",
count, kMaxHungThreadCount);
break;
case HangWatcher::ThreadType::kUIThread:
UMA_HISTOGRAM_EXACT_LINEAR(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
"UIThread",
count, kMaxHungThreadCount);
break;
case HangWatcher::ThreadType::kThreadPoolThread:
UMA_HISTOGRAM_EXACT_LINEAR(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow.BrowserProcess."
"ThreadPool",
count, kMaxHungThreadCount);
break;
}
}
}
constexpr const char* kThreadName = "HangWatcher";
......@@ -424,6 +454,17 @@ HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
bool all_threads_marked = true;
bool found_deadline_before_ignore_threshold = false;
// Use an std::array to store the hang counts to avoid allocations. The
// numerical values of the HangWatcher::ThreadType enum is used to index into
// the array. A |kInvalidHangCount| is used to signify there were no threads
// of the type found.
constexpr size_t kHangCountArraySize =
static_cast<std::size_t>(base::HangWatcher::ThreadType::kMax) + 1;
std::array<int, kHangCountArraySize> hung_counts_per_thread_type;
constexpr int kInvalidHangCount = -1;
hung_counts_per_thread_type.fill(kInvalidHangCount);
// Copy hung thread information.
for (const auto& watch_state : watch_states) {
uint64_t flags;
......@@ -441,8 +482,18 @@ HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
continue;
}
// If a thread type is monitored and did not hang it still needs to be
// logged as a zero count;
const size_t hang_count_index =
static_cast<size_t>(watch_state.get()->thread_type());
if (hung_counts_per_thread_type[hang_count_index] == kInvalidHangCount) {
hung_counts_per_thread_type[hang_count_index] = 0;
}
// Only copy hung threads.
if (deadline <= now) {
++hung_counts_per_thread_type[hang_count_index];
// Attempt to mark the thread as needing to stay within its current
// HangWatchScopeEnabled until capture is complete.
bool thread_marked = watch_state->SetShouldBlockOnHang(flags, deadline);
......@@ -461,6 +512,15 @@ HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
}
}
// Log the hung thread counts to histograms for each thread type if any thread
// of the type were found.
for (size_t i = 0; i < kHangCountArraySize; ++i) {
const int hang_count = hung_counts_per_thread_type[i];
if (hang_count != kInvalidHangCount)
LogHungThreadCountHistogram(static_cast<HangWatcher::ThreadType>(i),
hang_count);
}
// Two cases can invalidate this snapshot and prevent the capture of the hang.
//
// 1. Some threads could not be marked for blocking so this snapshot isn't
......
......@@ -141,7 +141,7 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
kIOThread = 0,
kUIThread = 1,
kThreadPoolThread = 2,
kThreadForTesting = 3
kMax = kThreadPoolThread
};
// The first invocation of the constructor will set the global instance
......
......@@ -16,6 +16,7 @@
#include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h"
#include "base/test/bind.h"
#include "base/test/metrics/histogram_tester.h"
#include "base/test/simple_test_tick_clock.h"
#include "base/test/task_environment.h"
#include "base/test/test_timeouts.h"
......@@ -27,6 +28,9 @@
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
using testing::ElementsAre;
using testing::IsEmpty;
namespace base {
namespace {
......@@ -57,7 +61,7 @@ class BlockingThread : public DelegateSimpleThread::Delegate {
// happens on the right thread.
base::ScopedClosureRunner unregister_closure =
base::HangWatcher::GetInstance()->RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
HangWatchScopeEnabled scope(timeout_);
wait_until_entered_scope_.Signal();
......@@ -189,7 +193,7 @@ TEST_F(
ScopeDisabledCreateScopeDisabledDestroyScopeEnabledCreateScopeEnabledDestroy) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
{
HangWatchScopeEnabled expires_instantly(base::TimeDelta{});
......@@ -217,7 +221,7 @@ TEST_F(
ScopeEnabledCreateScopeDisabledCreateScopeEnabledDestroyScopeDisabledDestroy) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
base::Optional<HangWatchScopeDisabled> disabler;
......@@ -247,7 +251,7 @@ TEST_F(
ScopeDisabledCreateScopeEnabledCreateScopeDisabledDestroyScopeEnabledDestroy) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
base::Optional<HangWatchScopeDisabled> disabler;
......@@ -277,7 +281,7 @@ TEST_F(
ScopeDisabledCreateScopeEnabledCreateScopeEnabledDestroyScopeDisabledDestroy) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
// De-activate hang watching,
HangWatchScopeDisabled disabler;
......@@ -299,7 +303,7 @@ TEST_F(
TEST_F(HangWatcherTest, ScopeCreateTempCreateTempDestroyScopeDestroy) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
{
// Start a HangWatchScopeEnabled that expires right away. Then advance
// time to make sure a hang is detected.
......@@ -325,7 +329,7 @@ TEST_F(
ScopeEnabledCreateScopeDisabledCreateScopeDisabledDestroyScopeEnabledDestroy) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
{
// Start a HangWatchScopeEnabled that expires right away. Then advance
// time to make sure a hang is detected.
......@@ -349,7 +353,7 @@ TEST_F(
TEST_F(HangWatcherTest, ScopeDisabledObjectInnerScope) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
// Start a HangWatchScopeEnabled that expires right away. Then advance
// time to make sure a hang is detected.
......@@ -377,7 +381,7 @@ TEST_F(HangWatcherTest, ScopeDisabledObjectInnerScope) {
TEST_F(HangWatcherTest, NewScopeAfterDisabling) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
// Start a HangWatchScopeEnabled that expires right away. Then advance
// time to make sure a hang is detected.
......@@ -409,7 +413,7 @@ TEST_F(HangWatcherTest, NestedScopes) {
// threaded.
auto current_hang_watch_state =
base::internal::HangWatchState::CreateHangWatchStateForCurrentThread(
HangWatcher::ThreadType::kThreadForTesting);
HangWatcher::ThreadType::kThreadPoolThread);
ASSERT_FALSE(current_hang_watch_state->IsOverDeadline());
base::TimeTicks original_deadline = current_hang_watch_state->GetDeadline();
......@@ -446,6 +450,71 @@ TEST_F(HangWatcherTest, NestedScopes) {
ASSERT_EQ(current_hang_watch_state->GetDeadline(), original_deadline);
}
TEST_F(HangWatcherBlockingThreadTest, HistogramsLoggedOnHang) {
base::HistogramTester histogram_tester;
StartBlockedThread();
// Simulate hang.
task_environment_.FastForwardBy(kHangTime);
// First monitoring catches the hang and emits the histogram.
MonitorHangs();
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.ThreadPool"),
ElementsAre(base::Bucket(/*min=*/1, /*count=*/1)));
// Reset to attempt capture again.
hang_event_.Reset();
monitor_event_.Reset();
// Hang is logged again even it would not trigger a crash dump.
MonitorHangs();
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.ThreadPool"),
ElementsAre(base::Bucket(/*min=*/1, /*count=*/2)));
// Thread types that are not monitored should not get any samples.
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.IOThread"),
IsEmpty());
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.UIThread"),
IsEmpty());
JoinThread();
}
TEST_F(HangWatcherBlockingThreadTest, HistogramsLoggedWithoutHangs) {
base::HistogramTester histogram_tester;
StartBlockedThread();
// No hang to catch so nothing is recorded.
MonitorHangs();
ASSERT_FALSE(hang_event_.IsSignaled());
// A thread of type ThreadForTesting was monitored but didn't hang. This is
// logged.
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.ThreadPool"),
ElementsAre(base::Bucket(/*min=*/0, /*count=*/1)));
// Thread types that are not monitored should not get any samples.
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.IOThread"),
IsEmpty());
EXPECT_THAT(histogram_tester.GetAllSamples(
"HangWatcher.NumberOfHungThreadsDuringWatchWindow."
"BrowserProcess.UIThread"),
IsEmpty());
JoinThread();
}
TEST_F(HangWatcherBlockingThreadTest, Hang) {
StartBlockedThread();
......@@ -577,7 +646,7 @@ TEST_F(HangWatcherSnapshotTest, NonActionableReport) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
{
// Start a HangWatchScopeEnabled that expires right away. Ensures that
// the first monitor will detect a hang.
......@@ -625,7 +694,7 @@ TEST_F(HangWatcherSnapshotTest, DISABLED_HungThreadIDs) {
// Register the main test thread for hang watching.
auto unregister_thread_closure = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
BlockingThread blocking_thread(&monitor_event_, base::TimeDelta{});
blocking_thread.StartAndWaitForScopeEntered();
......@@ -780,7 +849,7 @@ TEST_F(HangWatcherPeriodicMonitoringTest, PeriodicCallsTakePlace) {
// Register a thread,
unregister_thread_closure_ = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
run_loop.Run();
......@@ -809,7 +878,7 @@ TEST_F(HangWatcherPeriodicMonitoringTest, NoMonitorOnOverSleep) {
// Register a thread.
unregister_thread_closure_ = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
// Unblock the test thread. All waits were perceived as oversleeping so all
// monitoring was inhibited.
......@@ -850,7 +919,7 @@ class HangWatchScopeEnabledBlockingTest : public testing::Test {
// Register the test main thread for hang watching.
unregister_thread_closure_ = hang_watcher_.RegisterThread(
base::HangWatcher::ThreadType::kThreadForTesting);
base::HangWatcher::ThreadType::kThreadPoolThread);
}
HangWatchScopeEnabledBlockingTest(
......
......@@ -26,6 +26,34 @@ reviews. Googlers can read more about this at go/gwsq-gerrit.
<variant name="RendererProcess"/>
</variants>
<variants name="ThreadType">
<variant name="IOThread"/>
<variant name="ThreadPool"/>
<variant name="UIThread"/>
</variants>
<histogram
name="HangWatcher.NumberOfHungThreadsDuringWatchWindow.{ProcessType}.{ThreadType}"
units="threads" expires_after="2021-10-23">
<owner>olivierli@chromium.org</owner>
<owner>chrome-catan@google.com</owner>
<summary>
HangWatcher is a class that periodically watches threads to see if they are
making progress or if they are hung. This metric is recorded when the
HangWatcher checks for hangs. It counts the number of threads of type
{ThreadType} that were hung or became hung during the monitoring period in
processes of type {ProcessType}. The primary purpose of HangWatcher to
upload crash reports on hang. Hangs that last for N monitoring periods do
not produce N crash reports. Every subsequent report after the first one is
ignored if it pertains to a subset of the same hangs. In contrast this
metric is recorded after every single monitoring period. This gives more
weight to hangs that last for longer and also accounts for cascading hangs
that start as a side-effect of the first recorded one.
</summary>
<token key="ProcessType" variants="ProcessType"/>
<token key="ThreadType" variants="ThreadType"/>
</histogram>
<histogram name="HangWatcher.SleepDrift.{ProcessType}" units="microseconds"
expires_after="2021-10-23">
<owner>olivierli@chromium.org</owner>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment