Commit 2ce332a5 authored by Olivier Li's avatar Olivier Li Committed by Commit Bot

HangWatcher: Save hung thread IDs in crash keys on hang recording.

This requires some concept of watch state "snapshot" since we don't want
to record the same hangs twice. Most of the complexity in the CL is
related to that concept since the actual creation of the crash key is
rather simple.

Bug: 1034046
Change-Id: I41f910143c9a268451d1c88ce6684f680f873aad
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2106486
Commit-Queue: Oliver Li <olivierli@google.com>
Reviewed-by: default avatarFrançois Doray <fdoray@chromium.org>
Cr-Commit-Position: refs/heads/master@{#760505}
parent 1cc58db7
...@@ -10,14 +10,18 @@ ...@@ -10,14 +10,18 @@
#include "base/bind.h" #include "base/bind.h"
#include "base/callback_helpers.h" #include "base/callback_helpers.h"
#include "base/debug/crash_logging.h"
#include "base/debug/dump_without_crashing.h" #include "base/debug/dump_without_crashing.h"
#include "base/feature_list.h" #include "base/feature_list.h"
#include "base/no_destructor.h" #include "base/no_destructor.h"
#include "base/strings/string_number_conversions.h"
#include "base/synchronization/lock.h" #include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h" #include "base/synchronization/waitable_event.h"
#include "base/threading/platform_thread.h"
#include "base/threading/thread_checker.h" #include "base/threading/thread_checker.h"
#include "base/threading/thread_restrictions.h" #include "base/threading/thread_restrictions.h"
#include "base/time/time.h" #include "base/time/time.h"
#include "build/build_config.h"
namespace base { namespace base {
...@@ -46,6 +50,8 @@ HangWatchScope::HangWatchScope(TimeDelta timeout) { ...@@ -46,6 +50,8 @@ HangWatchScope::HangWatchScope(TimeDelta timeout) {
internal::HangWatchState* current_hang_watch_state = internal::HangWatchState* current_hang_watch_state =
internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get(); internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get();
DCHECK(timeout >= base::TimeDelta()) << "Negative timeouts are invalid.";
// TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are // TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are
// monitored. Thread is not monitored, noop. // monitored. Thread is not monitored, noop.
if (!current_hang_watch_state) { if (!current_hang_watch_state) {
...@@ -146,6 +152,10 @@ void HangWatcher::Run() { ...@@ -146,6 +152,10 @@ void HangWatcher::Run() {
should_monitor_.Wait(); should_monitor_.Wait();
} else { } else {
Monitor(); Monitor();
if (after_monitor_closure_for_testing_) {
after_monitor_closure_for_testing_.Run();
}
} }
if (keep_monitoring_.load(std::memory_order_relaxed)) { if (keep_monitoring_.load(std::memory_order_relaxed)) {
...@@ -186,34 +196,156 @@ ScopedClosureRunner HangWatcher::RegisterThread() { ...@@ -186,34 +196,156 @@ ScopedClosureRunner HangWatcher::RegisterThread() {
Unretained(HangWatcher::GetInstance()))); Unretained(HangWatcher::GetInstance())));
} }
void HangWatcher::Monitor() { base::TimeTicks HangWatcher::WatchStateSnapShot::GetHighestDeadline() const {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); DCHECK(!hung_watch_state_copies_.empty());
// Since entries are sorted in increasing order the last entry is the largest
// one.
return hung_watch_state_copies_.back().deadline;
}
bool must_invoke_hang_closure = false; HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
{ const HangWatchStates& watch_states,
AutoLock auto_lock(watch_state_lock_); base::TimeTicks snapshot_time,
for (const auto& watch_state : watch_states_) { base::TimeTicks previous_latest_expired_deadline)
if (watch_state->IsOverDeadline()) { : snapshot_time_(snapshot_time) {
must_invoke_hang_closure = true; // Initial copy of the values.
break; for (const auto& watch_state : watch_states) {
} base::TimeTicks deadline = watch_state.get()->GetDeadline();
// Some hangs that were already recorded are still live, snapshot is
// not actionable.
if (deadline <= previous_latest_expired_deadline) {
hung_watch_state_copies_.clear();
return;
}
// Only copy hung threads.
if (deadline <= snapshot_time) {
hung_watch_state_copies_.push_back(
WatchStateCopy{deadline, watch_state.get()->GetThreadID()});
} }
} }
if (must_invoke_hang_closure) { // Sort |hung_watch_state_copies_| by order of decreasing hang severity so the
capture_in_progress.store(true, std::memory_order_relaxed); // most severe hang is first in the list.
base::AutoLock scope_lock(capture_lock_); std::sort(hung_watch_state_copies_.begin(), hung_watch_state_copies_.end(),
[](const WatchStateCopy& lhs, const WatchStateCopy& rhs) {
return lhs.deadline < rhs.deadline;
});
}
// Invoke the closure outside the scope of |watch_state_lock_| HangWatcher::WatchStateSnapShot::WatchStateSnapShot(
// to prevent lock reentrancy. const WatchStateSnapShot& other) = default;
on_hang_closure_.Run();
capture_in_progress.store(false, std::memory_order_relaxed); HangWatcher::WatchStateSnapShot::~WatchStateSnapShot() = default;
}
std::string HangWatcher::WatchStateSnapShot::PrepareHungThreadListCrashKey()
const {
// Build a crash key string that contains the ids of the hung threads.
constexpr char kSeparator{'|'};
std::string list_of_hung_thread_ids;
if (after_monitor_closure_for_testing_) { // Add as many thread ids to the crash key as possible.
after_monitor_closure_for_testing_.Run(); for (const WatchStateCopy& copy : hung_watch_state_copies_) {
std::string fragment = base::NumberToString(copy.thread_id) + kSeparator;
if (list_of_hung_thread_ids.size() + fragment.size() <
static_cast<std::size_t>(debug::CrashKeySize::Size256)) {
list_of_hung_thread_ids += fragment;
} else {
// Respect the by priority ordering of thread ids in the crash key by
// stopping the construction as soon as one does not fit. This avoids
// including lesser priority ids while omitting more important ones.
break;
}
} }
return list_of_hung_thread_ids;
}
HangWatcher::WatchStateSnapShot HangWatcher::GrabWatchStateSnapshotForTesting()
const {
WatchStateSnapShot snapshot(watch_states_, base::TimeTicks::Now(),
latest_expired_deadline_);
return snapshot;
}
void HangWatcher::Monitor() {
DCHECK_CALLED_ON_VALID_THREAD(thread_checker_);
AutoLock auto_lock(watch_state_lock_);
// If all threads unregistered since this function was invoked there's
// nothing to do anymore.
if (watch_states_.empty())
return;
const base::TimeTicks now = base::TimeTicks::Now();
// See if any thread hung. We're holding |watch_state_lock_| so threads
// can't register or unregister but their deadline still can change
// atomically. This is fine. Detecting a hang is generally best effort and
// if a thread resumes from hang in the time it takes to move on to
// capturing then its ID will be absent from the crash keys.
bool any_thread_hung = std::any_of(
watch_states_.cbegin(), watch_states_.cend(),
[this, now](const std::unique_ptr<internal::HangWatchState>& state) {
base::TimeTicks deadline = state->GetDeadline();
return deadline > latest_expired_deadline_ && deadline < now;
});
// If at least a thread is hung we need to capture.
if (any_thread_hung)
CaptureHang(now);
}
void HangWatcher::CaptureHang(base::TimeTicks capture_time) {
capture_in_progress.store(true, std::memory_order_relaxed);
base::AutoLock scope_lock(capture_lock_);
WatchStateSnapShot watch_state_snapshot(watch_states_, capture_time,
latest_expired_deadline_);
// The hung thread(s) could detected at the start of Monitor() could have
// moved on from their scopes. If that happened and there are no more hung
// threads then abort capture.
std::string list_of_hung_thread_ids =
watch_state_snapshot.PrepareHungThreadListCrashKey();
if (list_of_hung_thread_ids.empty())
return;
#if not defined(OS_NACL)
static debug::CrashKeyString* crash_key = AllocateCrashKeyString(
"list-of-hung-threads", debug::CrashKeySize::Size256);
debug::ScopedCrashKeyString list_of_hung_threads_crash_key_string(
crash_key, list_of_hung_thread_ids);
#endif
// To avoid capturing more than one hang that blames a subset of the same
// threads it's necessary to keep track of what is the furthest deadline
// that contributed to declaring a hang. Only once
// all threads have deadlines past this point can we be sure that a newly
// discovered hang is not directly related.
// Example:
// **********************************************************************
// Timeline A : L------1-------2----------3-------4----------N-----------
// Timeline B : -------2----------3-------4----------L----5------N-------
// Timeline C : L----------------------------5------6----7---8------9---N
// **********************************************************************
// In the example when a Monitor() happens during timeline A
// |latest_expired_deadline_| (L) is at time zero and deadlines (1-4)
// are before Now() (N) . A hang is captured and L is updated. During
// the next Monitor() (timeline B) a new deadline is over but we can't
// capture a hang because deadlines 2-4 are still live and already counted
// toward a hang. During a third monitor (timeline C) all live deadlines
// are now after L and a second hang can be recorded.
base::TimeTicks latest_expired_deadline =
watch_state_snapshot.GetHighestDeadline();
on_hang_closure_.Run();
// Update after running the actual capture.
latest_expired_deadline_ = latest_expired_deadline;
capture_in_progress.store(false, std::memory_order_relaxed);
} }
void HangWatcher::SetAfterMonitorClosureForTesting( void HangWatcher::SetAfterMonitorClosureForTesting(
...@@ -262,7 +394,7 @@ namespace internal { ...@@ -262,7 +394,7 @@ namespace internal {
// |deadline_| starts at Max() to avoid validation problems // |deadline_| starts at Max() to avoid validation problems
// when setting the first legitimate value. // when setting the first legitimate value.
HangWatchState::HangWatchState() { HangWatchState::HangWatchState() : thread_id_(PlatformThread::CurrentId()) {
// There should not exist a state object for this thread already. // There should not exist a state object for this thread already.
DCHECK(!GetHangWatchStateForCurrentThread()->Get()); DCHECK(!GetHangWatchStateForCurrentThread()->Get());
...@@ -330,6 +462,10 @@ HangWatchState::GetHangWatchStateForCurrentThread() { ...@@ -330,6 +462,10 @@ HangWatchState::GetHangWatchStateForCurrentThread() {
return hang_watch_state.get(); return hang_watch_state.get();
} }
PlatformThreadId HangWatchState::GetThreadID() const {
return thread_id_;
}
} // namespace internal } // namespace internal
} // namespace base } // namespace base
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include "base/callback_helpers.h" #include "base/callback_helpers.h"
#include "base/feature_list.h" #include "base/feature_list.h"
#include "base/synchronization/lock.h" #include "base/synchronization/lock.h"
#include "base/thread_annotations.h"
#include "base/threading/platform_thread.h"
#include "base/threading/simple_thread.h" #include "base/threading/simple_thread.h"
#include "base/threading/thread_checker.h" #include "base/threading/thread_checker.h"
#include "base/threading/thread_local.h" #include "base/threading/thread_local.h"
...@@ -86,7 +88,9 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate { ...@@ -86,7 +88,9 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
// The first invocation of the constructor will set the global instance // The first invocation of the constructor will set the global instance
// accessible through GetInstance(). This means that only one instance can // accessible through GetInstance(). This means that only one instance can
// exist at a time. // exist at a time. No locks owned by this class can be acquired in the
// closure. This indirectly means that registering a thread or creating a
// HangWatchScope from within the closure creates a deadlock.
explicit HangWatcher(RepeatingClosure on_hang_closure); explicit HangWatcher(RepeatingClosure on_hang_closure);
// Clears the global instance for the class. // Clears the global instance for the class.
...@@ -109,7 +113,9 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate { ...@@ -109,7 +113,9 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
LOCKS_EXCLUDED(watch_state_lock_) WARN_UNUSED_RESULT; LOCKS_EXCLUDED(watch_state_lock_) WARN_UNUSED_RESULT;
// Choose a closure to be run at the end of each call to Monitor(). Use only // Choose a closure to be run at the end of each call to Monitor(). Use only
// for testing. // for testing. No locks owned by this class can be acquired in the closure.
// This indirectly means that registering a thread or creating a
// HangWatchScope from within the closure creates a deadlock.
void SetAfterMonitorClosureForTesting(base::RepeatingClosure closure); void SetAfterMonitorClosureForTesting(base::RepeatingClosure closure);
// Set a monitoring period other than the default. Use only for // Set a monitoring period other than the default. Use only for
...@@ -129,9 +135,56 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate { ...@@ -129,9 +135,56 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
private: private:
THREAD_CHECKER(thread_checker_); THREAD_CHECKER(thread_checker_);
using HangWatchStates =
std::vector<std::unique_ptr<internal::HangWatchState>>;
// Used to save a snapshots of the state of hang watching during capture.
// Only the state of hung threads is retained.
class BASE_EXPORT WatchStateSnapShot {
public:
struct WatchStateCopy {
base::TimeTicks deadline;
base::PlatformThreadId thread_id;
};
// Construct the snapshot from provided data. |snapshot_time| can be
// different than now() to be coherent with other operations recently done
// on |watch_states|. |previous_latest_expired_deadline| is the highest
// deadline that contributed to the detection of hang for the associated
// HangWatcher.
WatchStateSnapShot(const HangWatchStates& watch_states,
base::TimeTicks snapshot_time,
base::TimeTicks previous_latest_expired_deadline);
WatchStateSnapShot(const WatchStateSnapShot& other);
~WatchStateSnapShot();
// Returns a string that contains the ids of the hung threads separated by a
// '|'. The size of the string is capped at debug::CrashKeySize::Size256. If
// no threads are hung returns an empty string.
std::string PrepareHungThreadListCrashKey() const;
// Return the highest deadline included in this snapshot.
base::TimeTicks GetHighestDeadline() const;
private:
base::TimeTicks snapshot_time_;
std::vector<WatchStateCopy> hung_watch_state_copies_;
};
// Return a watch state snapshot taken Now() to be inspected in tests.
// NO_THREAD_SAFETY_ANALYSIS is needed because the analyzer can't figure out
// that calls to this function done from |on_hang_closure_| are properly
// locked.
WatchStateSnapShot GrabWatchStateSnapshotForTesting() const
NO_THREAD_SAFETY_ANALYSIS;
// Inspects the state of all registered threads to check if they are hung and // Inspects the state of all registered threads to check if they are hung and
// invokes the appropriate closure if so. // invokes the appropriate closure if so.
void Monitor(); void Monitor() LOCKS_EXCLUDED(watch_state_lock_);
// Record the hang and perform the necessary housekeeping before and after.
void CaptureHang(base::TimeTicks capture_time)
EXCLUSIVE_LOCKS_REQUIRED(watch_state_lock_) LOCKS_EXCLUDED(capture_lock_);
// Call Run() on the HangWatcher thread. // Call Run() on the HangWatcher thread.
void Start(); void Start();
...@@ -168,10 +221,18 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate { ...@@ -168,10 +221,18 @@ class BASE_EXPORT HangWatcher : public DelegateSimpleThread::Delegate {
base::RepeatingClosure after_monitor_closure_for_testing_; base::RepeatingClosure after_monitor_closure_for_testing_;
base::Lock capture_lock_; base::Lock capture_lock_ ACQUIRED_AFTER(watch_state_lock_);
std::atomic<bool> capture_in_progress{false}; std::atomic<bool> capture_in_progress{false};
// The highest valued deadline that ever participated in the detection of a
// hang by HangWatcher. Used to make sure no two captures cover the same
// hangs. Initialized to 0 to make sure that once the thread gets going
// it's in the past since no capture took place yet. See comment in
// CaptureHang().
base::TimeTicks latest_expired_deadline_{base::TimeTicks()};
FRIEND_TEST_ALL_PREFIXES(HangWatcherTest, NestedScopes); FRIEND_TEST_ALL_PREFIXES(HangWatcherTest, NestedScopes);
FRIEND_TEST_ALL_PREFIXES(HangWatcherSnapshotTest, HungThreadIDs);
}; };
// Classes here are exposed in the header only for testing. They are not // Classes here are exposed in the header only for testing. They are not
...@@ -205,7 +266,7 @@ class BASE_EXPORT HangWatchState { ...@@ -205,7 +266,7 @@ class BASE_EXPORT HangWatchState {
// store the value. To test if the deadline has expired use IsOverDeadline(). // store the value. To test if the deadline has expired use IsOverDeadline().
TimeTicks GetDeadline() const; TimeTicks GetDeadline() const;
// Atomically sets the deadline to a new value and return the previous value. // Atomically sets the deadline to a new value.
void SetDeadline(TimeTicks deadline); void SetDeadline(TimeTicks deadline);
// Tests whether the associated thread's execution has gone over the deadline. // Tests whether the associated thread's execution has gone over the deadline.
...@@ -219,6 +280,8 @@ class BASE_EXPORT HangWatchState { ...@@ -219,6 +280,8 @@ class BASE_EXPORT HangWatchState {
HangWatchScope* GetCurrentHangWatchScope(); HangWatchScope* GetCurrentHangWatchScope();
#endif #endif
PlatformThreadId GetThreadID() const;
private: private:
// The thread that creates the instance should be the class that updates // The thread that creates the instance should be the class that updates
// the deadline. // the deadline.
...@@ -228,6 +291,8 @@ class BASE_EXPORT HangWatchState { ...@@ -228,6 +291,8 @@ class BASE_EXPORT HangWatchState {
// reaches the value contained in it this constistutes a hang. // reaches the value contained in it this constistutes a hang.
std::atomic<TimeTicks> deadline_{base::TimeTicks::Max()}; std::atomic<TimeTicks> deadline_{base::TimeTicks::Max()};
const PlatformThreadId thread_id_;
#if DCHECK_IS_ON() #if DCHECK_IS_ON()
// Used to keep track of the current HangWatchScope and detect improper usage. // Used to keep track of the current HangWatchScope and detect improper usage.
// Scopes should always be destructed in reverse order from the one they were // Scopes should always be destructed in reverse order from the one they were
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment