Commit 9d2ac622 authored by Olivier Li's avatar Olivier Li Committed by Commit Bot

Add monitoring loop to HangWatcher

Bug: 1034046
Change-Id: Id471b00549b34887039dcddb3d4c22a6d7f0a9fc
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2033428
Commit-Queue: Oliver Li <olivierli@chromium.org>
Reviewed-by: default avatarFrançois Doray <fdoray@chromium.org>
Cr-Commit-Position: refs/heads/master@{#738388}
parent 4b299a40
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "base/callback_helpers.h" #include "base/callback_helpers.h"
#include "base/no_destructor.h" #include "base/no_destructor.h"
#include "base/synchronization/lock.h" #include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h"
#include "base/threading/thread_checker.h" #include "base/threading/thread_checker.h"
#include "base/threading/thread_restrictions.h" #include "base/threading/thread_restrictions.h"
#include "base/time/time.h" #include "base/time/time.h"
...@@ -21,6 +22,16 @@ namespace { ...@@ -21,6 +22,16 @@ namespace {
HangWatcher* g_instance = nullptr; HangWatcher* g_instance = nullptr;
} }
constexpr const char* kThreadName = "HangWatcher";
// The time that the HangWatcher thread will sleep for between calls to
// Monitor(). Increasing or decreasing this does not modify the type of hangs
// that can be detected. It instead increases the probability that a call to
// Monitor() will happen at the right time to catch a hang. This has to be
// balanced with power/cpu use concerns as busy looping would catch amost all
// hangs but present unacceptable overhead.
const base::TimeDelta kMonitoringPeriod = base::TimeDelta::FromSeconds(10);
HangWatchScope::HangWatchScope(TimeDelta timeout) { HangWatchScope::HangWatchScope(TimeDelta timeout) {
internal::HangWatchState* current_hang_watch_state = internal::HangWatchState* current_hang_watch_state =
internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get(); internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
...@@ -61,15 +72,59 @@ HangWatchScope::~HangWatchScope() { ...@@ -61,15 +72,59 @@ HangWatchScope::~HangWatchScope() {
} }
HangWatcher::HangWatcher(RepeatingClosure on_hang_closure) HangWatcher::HangWatcher(RepeatingClosure on_hang_closure)
: on_hang_closure_(std::move(on_hang_closure)) { : monitor_period_(kMonitoringPeriod),
monitor_event_(WaitableEvent::ResetPolicy::AUTOMATIC,
WaitableEvent::InitialState::NOT_SIGNALED),
on_hang_closure_(std::move(on_hang_closure)),
thread_(this, kThreadName) {
// |thread_checker_| should not be bound to the constructing thread.
DETACH_FROM_THREAD(thread_checker_);
DCHECK(!g_instance); DCHECK(!g_instance);
g_instance = this; g_instance = this;
Start();
} }
HangWatcher::~HangWatcher() { HangWatcher::~HangWatcher() {
DCHECK_EQ(g_instance, this); DCHECK_EQ(g_instance, this);
DCHECK(watch_states_.empty()); DCHECK(watch_states_.empty());
g_instance = nullptr; g_instance = nullptr;
Stop();
}
void HangWatcher::Start() {
thread_.Start();
}
void HangWatcher::Stop() {
keep_monitoring_.store(false);
monitor_event_.Signal();
thread_.Join();
}
bool HangWatcher::IsWatchListEmpty() {
AutoLock auto_lock(watch_state_lock_);
return watch_states_.empty();
}
void HangWatcher::Run() {
// Monitor() should only run on |thread_|. Bind |thread_checker_| here to make
// sure of that.
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
while (keep_monitoring_) {
// If there is nothing to watch sleep until there is.
if (IsWatchListEmpty()) {
monitor_event_.Wait();
} else {
Monitor();
}
if (keep_monitoring_) {
// Sleep until next scheduled monitoring.
monitor_event_.TimedWait(monitor_period_);
}
}
} }
// static // static
...@@ -83,11 +138,18 @@ ScopedClosureRunner HangWatcher::RegisterThread() { ...@@ -83,11 +138,18 @@ ScopedClosureRunner HangWatcher::RegisterThread() {
watch_states_.push_back( watch_states_.push_back(
internal::HangWatchState::CreateHangWatchStateForCurrentThread()); internal::HangWatchState::CreateHangWatchStateForCurrentThread());
// Now that there is a thread to monitor we wake the HangWatcher thread.
if (watch_states_.size() == 1) {
monitor_event_.Signal();
}
return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread, return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread,
Unretained(HangWatcher::GetInstance()))); Unretained(HangWatcher::GetInstance())));
} }
void HangWatcher::Monitor() { void HangWatcher::Monitor() {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
bool must_invoke_hang_closure = false; bool must_invoke_hang_closure = false;
{ {
AutoLock auto_lock(watch_state_lock_); AutoLock auto_lock(watch_state_lock_);
...@@ -104,6 +166,23 @@ void HangWatcher::Monitor() { ...@@ -104,6 +166,23 @@ void HangWatcher::Monitor() {
// to prevent lock reentrancy. // to prevent lock reentrancy.
on_hang_closure_.Run(); on_hang_closure_.Run();
} }
if (after_monitor_closure_for_testing_) {
after_monitor_closure_for_testing_.Run();
}
}
void HangWatcher::SetAfterMonitorClosureForTesting(
base::RepeatingClosure closure) {
after_monitor_closure_for_testing_ = std::move(closure);
}
void HangWatcher::SetMonitoringPeriodForTesting(base::TimeDelta period) {
monitor_period_ = period;
}
void HangWatcher::SignalMonitorEventForTesting() {
monitor_event_.Signal();
} }
void HangWatcher::UnregisterThread() { void HangWatcher::UnregisterThread() {
...@@ -129,11 +208,20 @@ namespace internal { ...@@ -129,11 +208,20 @@ namespace internal {
// |deadline_| starts at Max() to avoid validation problems // |deadline_| starts at Max() to avoid validation problems
// when setting the first legitimate value. // when setting the first legitimate value.
HangWatchState::HangWatchState() : deadline_(TimeTicks::Max()) {} HangWatchState::HangWatchState() : deadline_(TimeTicks::Max()) {
// There should not exist a state object for this thread already.
DCHECK(!GetHangWatchStateForCurrentThread()->Get());
// Bind the new instance to this thread.
GetHangWatchStateForCurrentThread()->Set(this);
}
HangWatchState::~HangWatchState() { HangWatchState::~HangWatchState() {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), this);
GetHangWatchStateForCurrentThread()->Set(nullptr);
#if DCHECK_IS_ON() #if DCHECK_IS_ON()
// Destroying the HangWatchState should not be done if there are live // Destroying the HangWatchState should not be done if there are live
// HangWatchScopes. // HangWatchScopes.
...@@ -144,18 +232,13 @@ HangWatchState::~HangWatchState() { ...@@ -144,18 +232,13 @@ HangWatchState::~HangWatchState() {
// static // static
std::unique_ptr<HangWatchState> std::unique_ptr<HangWatchState>
HangWatchState::CreateHangWatchStateForCurrentThread() { HangWatchState::CreateHangWatchStateForCurrentThread() {
// There should not exist a state object for this thread already.
DCHECK(!GetHangWatchStateForCurrentThread()->Get());
// Allocate a watch state object for this thread. // Allocate a watch state object for this thread.
std::unique_ptr<HangWatchState> hang_state = std::unique_ptr<HangWatchState> hang_state =
std::make_unique<HangWatchState>(); std::make_unique<HangWatchState>();
// Bind the new instance to this thread.
GetHangWatchStateForCurrentThread()->Set(hang_state.get());
// Setting the thread local worked. // Setting the thread local worked.
DCHECK(GetHangWatchStateForCurrentThread()->Get()); DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), hang_state.get());
// Transfer ownership to caller. // Transfer ownership to caller.
return hang_state; return hang_state;
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "base/atomicops.h" #include "base/atomicops.h"
#include "base/callback.h" #include "base/callback.h"
#include "base/callback_helpers.h" #include "base/callback_helpers.h"
#include "base/threading/simple_thread.h"
#include "base/threading/thread_checker.h" #include "base/threading/thread_checker.h"
#include "base/threading/thread_local.h" #include "base/threading/thread_local.h"
#include "base/time/time.h" #include "base/time/time.h"
...@@ -65,10 +66,10 @@ class BASE_EXPORT HangWatchScope { ...@@ -65,10 +66,10 @@ class BASE_EXPORT HangWatchScope {
}; };
// Monitors registered threads for hangs by inspecting their associated // Monitors registered threads for hangs by inspecting their associated
// HangWatchStates for deadline overruns. Only one instance of HangWatcher can // HangWatchStates for deadline overruns. This happens at a regular interval on
// exist at a time within a single process. This instance must outlive all // a separate thread. Only one instance of HangWatcher can exist at a time
// monitored threads. // within a single process. This instance must outlive all monitored threads.
class BASE_EXPORT HangWatcher { class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
public: public:
// The first invocation of the constructor will set the global instance // The first invocation of the constructor will set the global instance
// accessible through GetInstance(). This means that only one instance can // accessible through GetInstance(). This means that only one instance can
...@@ -76,7 +77,7 @@ class BASE_EXPORT HangWatcher { ...@@ -76,7 +77,7 @@ class BASE_EXPORT HangWatcher {
explicit HangWatcher(RepeatingClosure on_hang_closure); explicit HangWatcher(RepeatingClosure on_hang_closure);
// Clears the global instance for the class. // Clears the global instance for the class.
~HangWatcher(); ~HangWatcher() override;
HangWatcher(const HangWatcher&) = delete; HangWatcher(const HangWatcher&) = delete;
HangWatcher& operator=(const HangWatcher&) = delete; HangWatcher& operator=(const HangWatcher&) = delete;
...@@ -87,21 +88,65 @@ class BASE_EXPORT HangWatcher { ...@@ -87,21 +88,65 @@ class BASE_EXPORT HangWatcher {
// Sets up the calling thread to be monitored for threads. Returns a // Sets up the calling thread to be monitored for threads. Returns a
// ScopedClosureRunner that unregisters the thread. This closure has to be // ScopedClosureRunner that unregisters the thread. This closure has to be
// called from the registered thread before it's joined. // called from the registered thread before it's joined.
ScopedClosureRunner RegisterThread() WARN_UNUSED_RESULT; ScopedClosureRunner RegisterThread()
LOCKS_EXCLUDED(watch_state_lock_) WARN_UNUSED_RESULT;
// Inspects the state of all registered threads to check if they are hung. // Choose a closure to be run at the end of each call to Monitor(). Use only
void Monitor(); // for testing.
void SetAfterMonitorClosureForTesting(base::RepeatingClosure closure);
// Set a monitoring period other than the default. Use only for
// testing.
void SetMonitoringPeriodForTesting(base::TimeDelta period);
// Force the monitoring loop to resume and evaluate whether to continue.
// This can trigger a call to Monitor() or not depending on why the
// HangWatcher thread is sleeping. Use only for testing.
void SignalMonitorEventForTesting();
private: private:
THREAD_CHECKER(thread_checker_);
// Inspects the state of all registered threads to check if they are hung and
// invokes the appropriate closure if so.
void Monitor();
// Call Run() on the HangWatcher thread.
void Start();
// Stop all monitoring and join the HangWatcher thread.
void Stop();
// Run the loop that periodically monitors the registered thread at a
// set time interval.
void Run() override;
base::TimeDelta monitor_period_;
// Indicates whether Run() should return after the next monitoring.
std::atomic<bool> keep_monitoring_{true};
// Use to make the HangWatcher thread wake or sleep to schedule the
// appropriate monitoring frequency.
WaitableEvent monitor_event_;
bool IsWatchListEmpty() LOCKS_EXCLUDED(watch_state_lock_);
// Stops hang watching on the calling thread by removing the entry from the // Stops hang watching on the calling thread by removing the entry from the
// watch list. // watch list.
void UnregisterThread(); void UnregisterThread() LOCKS_EXCLUDED(watch_state_lock_);
const RepeatingClosure on_hang_closure_; const RepeatingClosure on_hang_closure_;
Lock watch_state_lock_; Lock watch_state_lock_;
std::vector<std::unique_ptr<internal::HangWatchState>> watch_states_ std::vector<std::unique_ptr<internal::HangWatchState>> watch_states_
GUARDED_BY(watch_state_lock_); GUARDED_BY(watch_state_lock_);
base::DelegateSimpleThread thread_;
base::RepeatingClosure after_monitor_closure_for_testing_;
FRIEND_TEST_ALL_PREFIXES(HangWatcherTest, NestedScopes);
}; };
// Classes here are exposed in the header only for testing. They are not // Classes here are exposed in the header only for testing. They are not
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "base/callback_helpers.h" #include "base/callback_helpers.h"
#include "base/synchronization/lock.h" #include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h" #include "base/synchronization/waitable_event.h"
#include "base/test/bind_test_util.h"
#include "base/test/task_environment.h" #include "base/test/task_environment.h"
#include "base/threading/platform_thread.h" #include "base/threading/platform_thread.h"
#include "base/time/time.h" #include "base/time/time.h"
...@@ -64,7 +65,16 @@ class HangWatcherTest : public testing::Test { ...@@ -64,7 +65,16 @@ class HangWatcherTest : public testing::Test {
: hang_watcher_(std::make_unique<HangWatcher>( : hang_watcher_(std::make_unique<HangWatcher>(
base::BindRepeating(&WaitableEvent::Signal, base::BindRepeating(&WaitableEvent::Signal,
base::Unretained(&hang_event_)))), base::Unretained(&hang_event_)))),
thread_(&unblock_thread_) {} thread_(&unblock_thread_) {
hang_watcher_->SetAfterMonitorClosureForTesting(base::BindRepeating(
&WaitableEvent::Signal, base::Unretained(&monitor_event_)));
}
void SetUp() override {
// We're not testing the monitoring loop behavior in this test so we want to
// trigger monitoring manually.
hang_watcher_->SetMonitoringPeriodForTesting(base::TimeDelta::Max());
}
void StartBlockedThread() { void StartBlockedThread() {
// Thread has not run yet. // Thread has not run yet.
...@@ -75,11 +85,20 @@ class HangWatcherTest : public testing::Test { ...@@ -75,11 +85,20 @@ class HangWatcherTest : public testing::Test {
ASSERT_TRUE(PlatformThread::Create(0, &thread_, &handle)); ASSERT_TRUE(PlatformThread::Create(0, &thread_, &handle));
thread_.WaitUntilScopeEntered(); thread_.WaitUntilScopeEntered();
// Thread registration triggered a call to HangWatcher::Monitor() which
// signaled |monitor_event_|. Reset it so it's ready for waiting later on.
monitor_event_.Reset();
} }
void MonitorHangsAndJoinThread() { void MonitorHangsAndJoinThread() {
// Try to detect a hang if any. // HangWatcher::Monitor() should not be set which would mean a call to
HangWatcher::GetInstance()->Monitor(); // HangWatcher::Monitor() happened and was unacounted for.
ASSERT_FALSE(monitor_event_.IsSignaled());
// Triger a monitoring on HangWatcher thread and verify results.
hang_watcher_->SignalMonitorEventForTesting();
monitor_event_.Wait();
unblock_thread_.Signal(); unblock_thread_.Signal();
...@@ -91,25 +110,39 @@ class HangWatcherTest : public testing::Test { ...@@ -91,25 +110,39 @@ class HangWatcherTest : public testing::Test {
} }
protected: protected:
// Used to wait for monitoring. Will be signaled by the HangWatcher thread and
// so needs to outlive it.
WaitableEvent monitor_event_;
// Signaled from the HangWatcher thread when a hang is detected. Needs to
// outlive the HangWatcher thread.
WaitableEvent hang_event_;
std::unique_ptr<HangWatcher> hang_watcher_; std::unique_ptr<HangWatcher> hang_watcher_;
// Used exclusively for MOCK_TIME. No tasks will be run on the environment. // Used exclusively for MOCK_TIME. No tasks will be run on the environment.
test::TaskEnvironment task_environment_{ test::TaskEnvironment task_environment_{
test::TaskEnvironment::TimeSource::MOCK_TIME}; test::TaskEnvironment::TimeSource::MOCK_TIME};
PlatformThreadHandle handle; // Used to unblock the monitored thread. Signaled from the test main thread.
WaitableEvent unblock_thread_; WaitableEvent unblock_thread_;
PlatformThreadHandle handle;
BlockingThread thread_; BlockingThread thread_;
// Signaled when a hang is detected.
WaitableEvent hang_event_;
}; };
} // namespace } // namespace
TEST_F(HangWatcherTest, NoScopes) { TEST_F(HangWatcherTest, NoRegisteredThreads) {
HangWatcher::GetInstance()->Monitor(); ASSERT_FALSE(monitor_event_.IsSignaled());
// Signal to advance the Run() loop.
base::HangWatcher::GetInstance()->SignalMonitorEventForTesting();
// Monitoring should just not happen when there are no registered threads.
// Wait a while to make sure it does not.
ASSERT_FALSE(monitor_event_.TimedWait(base::TimeDelta::FromSeconds(1)));
ASSERT_FALSE(hang_event_.IsSignaled()); ASSERT_FALSE(hang_event_.IsSignaled());
} }
...@@ -171,4 +204,64 @@ TEST_F(HangWatcherTest, NoHang) { ...@@ -171,4 +204,64 @@ TEST_F(HangWatcherTest, NoHang) {
ASSERT_FALSE(hang_event_.IsSignaled()); ASSERT_FALSE(hang_event_.IsSignaled());
} }
// |HangWatcher| relies on |WaitableEvent::TimedWait| to schedule monitoring
// which cannot be tested using MockTime. Some tests will have to actually wait
// in real time before observing results but the TimeDeltas used are chosen to
// minimize flakiness as much as possible.
class HangWatcherRealTimeTest : public testing::Test {
public:
HangWatcherRealTimeTest()
: hang_watcher_(std::make_unique<HangWatcher>(
base::BindRepeating(&WaitableEvent::Signal,
base::Unretained(&hang_event_)))) {}
protected:
std::unique_ptr<HangWatcher> hang_watcher_;
// Signaled when a hang is detected.
WaitableEvent hang_event_;
std::atomic<int> monitor_count_{0};
base::ScopedClosureRunner unregister_thread_closure_;
};
TEST_F(HangWatcherRealTimeTest, PeriodicCallsCount) {
// These values are chosen to execute fast enough while running the unit tests
// but be large enough to buffer against clock precision problems.
const base::TimeDelta kMonitoringPeriod(
base::TimeDelta::FromMilliseconds(100));
const base::TimeDelta kExecutionTime = kMonitoringPeriod * 5;
// HangWatcher::Monitor() will run once right away on thread registration.
// We want to make sure it runs at least once more from being scheduled.
constexpr int kMinimumMonitorCount = 2;
// Some amount of extra monitoring can happen but it has to be of the right
// order of magnitude. Otherwise it could indicate a problem like some code
// signaling the Thread to wake up excessivelly.
const int kMaximumMonitorCount = 2 * (kExecutionTime / kMonitoringPeriod);
auto increment_monitor_count = [this]() { ++monitor_count_; };
hang_watcher_->SetMonitoringPeriodForTesting(kMonitoringPeriod);
hang_watcher_->SetAfterMonitorClosureForTesting(
base::BindLambdaForTesting(increment_monitor_count));
hang_event_.TimedWait(kExecutionTime);
// No thread ever registered so no monitoring took place at all.
ASSERT_EQ(monitor_count_.load(), 0);
unregister_thread_closure_ = hang_watcher_->RegisterThread();
hang_event_.TimedWait(kExecutionTime);
ASSERT_GE(monitor_count_.load(), kMinimumMonitorCount);
ASSERT_LE(monitor_count_.load(), kMaximumMonitorCount);
// No monitored scope means no possible hangs.
ASSERT_FALSE(hang_event_.IsSignaled());
}
} // namespace base } // namespace base
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment