Commit 5b3db8d8 authored by kylechar's avatar kylechar Committed by Commit Bot

Add GPU process mojo error handler

There is a recurring issue where bugs in browser process code introduce
deserialization errors on the main viz browser-to-GPU process message
pipe. The message pipe is closed and the GPU process crashes trying to
reinitialize the viz thread. Without a way to differentiate message pipe
closure due to a deserialization error vs the other side closing it, the
crash only happens when the browser tries to reinitialize the viz thread
after what it thinks is a GPU crash. There is a very little information
available in these crash reports, the message pipe is shared by multiple
interfaces and figuring out what IPC message the error occurred in is a
guessing game.

There is a global error handler that triggers for any mojo
deserialization errors which includes a string containing the interface
name and a message identifier. This CL adds a handler which stores the
error string. If the GPU process crashes due to reinitializing the viz
thread within 5 seconds after the handler fired, it populates a crash
key with that error message. While it's not guaranteed the last
deserialization error was what caused the viz thread to get
reinitialized, deserialization errors are infrequent enough it shouldn't
be an issue.

Bug: 1075495
Change-Id: I3a762fd5e9d50c0c1c301b3179f08ce892ff4912
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2356546
Commit-Queue: kylechar <kylechar@chromium.org>
Reviewed-by: default avatarKen Rockot <rockot@google.com>
Cr-Commit-Position: refs/heads/master@{#798645}
parent f4ccdef6
...@@ -10,9 +10,13 @@ ...@@ -10,9 +10,13 @@
#include "base/bind.h" #include "base/bind.h"
#include "base/feature_list.h" #include "base/feature_list.h"
#include "base/message_loop/message_pump_type.h" #include "base/message_loop/message_pump_type.h"
#include "base/no_destructor.h"
#include "base/power_monitor/power_monitor.h" #include "base/power_monitor/power_monitor.h"
#include "base/power_monitor/power_monitor_source.h" #include "base/power_monitor/power_monitor_source.h"
#include "base/single_thread_task_runner.h" #include "base/single_thread_task_runner.h"
#include "base/synchronization/lock.h"
#include "base/task/thread_pool.h"
#include "base/time/time.h"
#include "base/trace_event/memory_dump_manager.h" #include "base/trace_event/memory_dump_manager.h"
#include "build/build_config.h" #include "build/build_config.h"
#include "components/ui_devtools/buildflags.h" #include "components/ui_devtools/buildflags.h"
...@@ -21,12 +25,81 @@ ...@@ -21,12 +25,81 @@
#include "gpu/ipc/service/gpu_init.h" #include "gpu/ipc/service/gpu_init.h"
#include "gpu/ipc/service/gpu_watchdog_thread.h" #include "gpu/ipc/service/gpu_watchdog_thread.h"
#include "media/gpu/buildflags.h" #include "media/gpu/buildflags.h"
#include "mojo/public/cpp/bindings/scoped_message_error_crash_key.h"
#include "mojo/public/cpp/system/functions.h"
#include "services/metrics/public/cpp/delegating_ukm_recorder.h" #include "services/metrics/public/cpp/delegating_ukm_recorder.h"
#include "services/metrics/public/cpp/mojo_ukm_recorder.h" #include "services/metrics/public/cpp/mojo_ukm_recorder.h"
#include "third_party/skia/include/core/SkFontLCDConfig.h" #include "third_party/skia/include/core/SkFontLCDConfig.h"
namespace { namespace {
// Singleton class to store error strings from global mojo error handler. It
// will store error strings for kTimeout seconds and can be used to set a crash
// key if the GPU process is going to crash due to a deserialization error.
// TODO(kylechar): This can be removed after tracking down all outstanding
// deserialization errors in messages sent from the browser to GPU on the viz
// message pipe.
class MojoErrorTracker {
public:
static constexpr base::TimeDelta kTimeout = base::TimeDelta::FromSeconds(5);
static MojoErrorTracker& Get() {
static base::NoDestructor<MojoErrorTracker> tracker;
return *tracker;
}
void OnError(const std::string& error) {
{
base::AutoLock locked(lock_);
error_ = error;
error_time_ = base::TimeTicks::Now();
}
// Once the error is old enough we will no longer use it in a crash key we
// can reset the string storage.
base::ThreadPool::PostDelayedTask(
FROM_HERE,
base::BindOnce(&MojoErrorTracker::Reset, base::Unretained(this)),
kTimeout);
}
// This will initialize the scoped crash key in |key| with the last mojo
// error message if the last mojo error happened within kTimeout seconds.
void MaybeSetCrashKeyWithRecentError(
base::Optional<mojo::debug::ScopedMessageErrorCrashKey>& key) {
base::AutoLock locked(lock_);
if (!HasErrorTimedOut())
key.emplace(error_);
}
private:
friend class base::NoDestructor<MojoErrorTracker>;
MojoErrorTracker() = default;
bool HasErrorTimedOut() const EXCLUSIVE_LOCKS_REQUIRED(lock_) {
return base::TimeTicks::Now() - error_time_ > kTimeout;
}
void Reset() {
base::AutoLock locked(lock_);
// If another mojo error happened since this task was scheduled we shouldn't
// reset the error string yet.
if (!HasErrorTimedOut())
return;
error_.clear();
error_.shrink_to_fit();
}
base::Lock lock_;
std::string error_ GUARDED_BY(lock_);
base::TimeTicks error_time_ GUARDED_BY(lock_);
};
// static
constexpr base::TimeDelta MojoErrorTracker::kTimeout;
std::unique_ptr<base::Thread> CreateAndStartIOThread() { std::unique_ptr<base::Thread> CreateAndStartIOThread() {
// TODO(sad): We do not need the IO thread once gpu has a separate process. // TODO(sad): We do not need the IO thread once gpu has a separate process.
// It should be possible to use |main_task_runner_| for doing IO tasks. // It should be possible to use |main_task_runner_| for doing IO tasks.
...@@ -63,6 +136,12 @@ VizMainImpl::VizMainImpl(Delegate* delegate, ...@@ -63,6 +136,12 @@ VizMainImpl::VizMainImpl(Delegate* delegate,
gpu_thread_task_runner_(base::ThreadTaskRunnerHandle::Get()) { gpu_thread_task_runner_(base::ThreadTaskRunnerHandle::Get()) {
DCHECK(gpu_init_); DCHECK(gpu_init_);
if (!gpu_init_->gpu_info().in_process_gpu) {
mojo::SetDefaultProcessErrorHandler(
base::BindRepeating(&MojoErrorTracker::OnError,
base::Unretained(&MojoErrorTracker::Get())));
}
// TODO(crbug.com/609317): Remove this when Mus Window Server and GPU are // TODO(crbug.com/609317): Remove this when Mus Window Server and GPU are
// split into separate processes. Until then this is necessary to be able to // split into separate processes. Until then this is necessary to be able to
// run Mushrome (chrome with mus) with Mus running in the browser process. // run Mushrome (chrome with mus) with Mus running in the browser process.
...@@ -238,7 +317,16 @@ void VizMainImpl::CreateFrameSinkManagerInternal( ...@@ -238,7 +317,16 @@ void VizMainImpl::CreateFrameSinkManagerInternal(
// FrameSinkManagerImpl, so just do a hard CHECK rather than crashing down the // FrameSinkManagerImpl, so just do a hard CHECK rather than crashing down the
// road so that all crash reports caused by this issue look the same and have // road so that all crash reports caused by this issue look the same and have
// the same signature. https://crbug.com/928845 // the same signature. https://crbug.com/928845
CHECK(!task_executor_); if (task_executor_) {
// If the global mojo error handler callback ran recently, we've cached the
// error string and will initialize |crash_key| to contain it before
// intentionally crashing. The deserialization error that caused the mojo
// error handler to run was probably, but not 100% guaranteed, the error
// that caused the main viz browser-to-GPU message pipe close.
base::Optional<mojo::debug::ScopedMessageErrorCrashKey> crash_key;
MojoErrorTracker::Get().MaybeSetCrashKeyWithRecentError(crash_key);
CHECK(!task_executor_);
}
task_executor_ = std::make_unique<gpu::GpuInProcessThreadService>( task_executor_ = std::make_unique<gpu::GpuInProcessThreadService>(
this, gpu_thread_task_runner_, gpu_service_->GetGpuScheduler(), this, gpu_thread_task_runner_, gpu_service_->GetGpuScheduler(),
gpu_service_->sync_point_manager(), gpu_service_->mailbox_manager(), gpu_service_->sync_point_manager(), gpu_service_->mailbox_manager(),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment