Commit 48c4d507 authored by Aaron Colwell's avatar Aaron Colwell Committed by Commit Bot

Fix NestedMessagePumpAndroid to avoid nested calls into the looper.

This fixes flakiness in a variety of browser tests on Android caused
by nested calls to Java_NestedSystemMessageHandler_dispatchOneMessage().
This method calls into looper code that is not reentrant safe which
results in crashes. This change refactors the MessagePumpUI &
NestedMessagePumpAndroid logic so that work can be dispatched without
the looper code on the call stack. The code detects whether work events
are being dispatched while inside the looper and defers that work until
the Java_NestedSystemMessageHandler_dispatchOneMessage() call returns.
This allows us to avoid making reentrant calls into the looper while
still allowing nested RunLoops to work properly.

- Splits MessagePumpUI::OnDelayedLooperCallback() and
  MessagePumpUI::OnNonDelayedLooperCallback() into 2 pieces so the
  dispatching of work can be separated from the looper callback.
- Added deferred work tracking logic to NestedMessagePumpAndroid so
  that it can keep track of what events occurred while the looper was
  running and dispatch work once the looper returns.
- Added logic to save/restore the delegate and quit status so that
  quitting one Run() level does not quit all levels. This makes the
  code more consistent with other MessagePump implementations.
- Enable tests on Android that were previously disabled because of
  flakiness caused by crashing in the looper.


Bug: 1110497, 1129592
Change-Id: I4cfb8804275045b6fb4ab2b0d89cdc20fdb8ea4c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2511790Reviewed-by: default avatarNasko Oskov <nasko@chromium.org>
Reviewed-by: default avatardanakj <danakj@chromium.org>
Commit-Queue: Aaron Colwell <acolwell@chromium.org>
Cr-Commit-Position: refs/heads/master@{#827283}
parent 925d863b
......@@ -82,6 +82,9 @@ STACK_ALIGN int DelayedLooperCallback(int fd, int events, void* data) {
return 1; // continue listening for events
}
// A bit added to the |non_delayed_fd_| to keep it signaled when we yield to
// native tasks below.
constexpr uint64_t kTryNativeTasksBeforeIdleBit = uint64_t(1) << 32;
} // namespace
MessagePumpForUI::MessagePumpForUI()
......@@ -150,7 +153,10 @@ void MessagePumpForUI::OnDelayedLooperCallback() {
// the timerfd, and they both run on the same thread as this callback, so
// there are no obvious timing or multi-threading related issues.
DPCHECK(ret >= 0 || errno == EAGAIN);
DoDelayedLooperWork();
}
void MessagePumpForUI::DoDelayedLooperWork() {
delayed_scheduled_time_.reset();
Delegate::NextWorkInfo next_work_info = delegate_->DoWork();
......@@ -182,10 +188,6 @@ void MessagePumpForUI::OnNonDelayedLooperCallback() {
if (ShouldQuit())
return;
// A bit added to the |non_delayed_fd_| to keep it signaled when we yield to
// native tasks below.
constexpr uint64_t kTryNativeTasksBeforeIdleBit = uint64_t(1) << 32;
// We're about to process all the work requested by ScheduleWork().
// MessagePump users are expected to do their best not to invoke
// ScheduleWork() again before DoWork() returns a non-immediate
......@@ -193,15 +195,18 @@ void MessagePumpForUI::OnNonDelayedLooperCallback() {
// resetting its contents to 0 should be okay. The value currently stored
// should be greater than 0 since work having been scheduled is the reason
// we're here. See http://man7.org/linux/man-pages/man2/eventfd.2.html
uint64_t pre_work_value = 0;
int ret = read(non_delayed_fd_, &pre_work_value, sizeof(pre_work_value));
uint64_t value = 0;
int ret = read(non_delayed_fd_, &value, sizeof(value));
DPCHECK(ret >= 0);
DCHECK_GT(pre_work_value, 0U);
DCHECK_GT(value, 0U);
bool do_idle_work = value == kTryNativeTasksBeforeIdleBit;
DoNonDelayedLooperWork(do_idle_work);
}
// Note: We can't skip DoWork() even if
// |pre_work_value == kTryNativeTasksBeforeIdleBit| here (i.e. no additional
// ScheduleWork() since yielding to native) as delayed tasks might have come
// in and we need to re-sample |next_work_info|.
void MessagePumpForUI::DoNonDelayedLooperWork(bool do_idle_work) {
// Note: We can't skip DoWork() even if |do_idle_work| is true here (i.e. no
// additional ScheduleWork() since yielding to native) as delayed tasks might
// have come in and we need to re-sample |next_work_info|.
// Runs all application tasks scheduled to run.
Delegate::NextWorkInfo next_work_info;
......@@ -220,15 +225,8 @@ void MessagePumpForUI::OnNonDelayedLooperCallback() {
// Before declaring this loop idle, yield to native tasks and arrange to be
// called again (unless we're already in that second call).
if (pre_work_value != kTryNativeTasksBeforeIdleBit) {
// Note: This write() is racing with potential ScheduleWork() calls. This is
// fine as write() is adding this bit, not overwriting the existing value,
// and as such racing ScheduleWork() calls would merely add 1 to the lower
// bits and we would find |pre_work_value != kTryNativeTasksBeforeIdleBit|
// in the next cycle again, retrying this.
ret = write(non_delayed_fd_, &kTryNativeTasksBeforeIdleBit,
sizeof(kTryNativeTasksBeforeIdleBit));
DPCHECK(ret >= 0);
if (!do_idle_work) {
ScheduleWorkInternal(/*do_idle_work=*/true);
return;
}
......@@ -241,7 +239,7 @@ void MessagePumpForUI::OnNonDelayedLooperCallback() {
// SchedulerWork() but still keep the system non-idle (e.g., the Java Handler
// API). It would be better to add an API to query the presence of native
// tasks instead of relying on yielding once + kTryNativeTasksBeforeIdleBit.
DCHECK_EQ(pre_work_value, kTryNativeTasksBeforeIdleBit);
DCHECK(do_idle_work);
if (ShouldQuit())
return;
......@@ -309,16 +307,26 @@ void MessagePumpForUI::Quit() {
}
void MessagePumpForUI::ScheduleWork() {
// Write (add) 1 to the eventfd. This tells the Looper to wake up and call our
// callback, allowing us to run tasks. This also allows us to detect, when we
// clear the fd, whether additional work was scheduled after we finished
// performing work, but before we cleared the fd, as we'll read back >=2
// instead of 1 in that case.
// See the eventfd man pages
ScheduleWorkInternal(/*do_idle_work=*/false);
}
void MessagePumpForUI::ScheduleWorkInternal(bool do_idle_work) {
// Write (add) |value| to the eventfd. This tells the Looper to wake up and
// call our callback, allowing us to run tasks. This also allows us to detect,
// when we clear the fd, whether additional work was scheduled after we
// finished performing work, but before we cleared the fd, as we'll read back
// >=2 instead of 1 in that case. See the eventfd man pages
// (http://man7.org/linux/man-pages/man2/eventfd.2.html) for details on how
// the read and write APIs for this file descriptor work, specifically without
// EFD_SEMAPHORE.
uint64_t value = 1;
// Note: Calls with |do_idle_work| set to true may race with potential calls
// where the parameter is false. This is fine as write() is adding |value|,
// not overwriting the existing value, and as such racing calls would merely
// have their values added together. Since idle work is only executed when the
// value read equals kTryNativeTasksBeforeIdleBit, a race would prevent idle
// work from being run and trigger another call to this method with
// |do_idle_work| set to true.
uint64_t value = do_idle_work ? kTryNativeTasksBeforeIdleBit : 1;
int ret = write(non_delayed_fd_, &value, sizeof(value));
DPCHECK(ret >= 0);
}
......@@ -352,4 +360,12 @@ void MessagePumpForUI::QuitWhenIdle(base::OnceClosure callback) {
ScheduleWork();
}
MessagePump::Delegate* MessagePumpForUI::SetDelegate(Delegate* delegate) {
return std::exchange(delegate_, delegate);
}
bool MessagePumpForUI::SetQuit(bool quit) {
return std::exchange(quit_, quit);
}
} // namespace base
......@@ -57,10 +57,13 @@ class BASE_EXPORT MessagePumpForUI : public MessagePump {
void OnNonDelayedLooperCallback();
protected:
void SetDelegate(Delegate* delegate) { delegate_ = delegate; }
void ResetShouldQuit() { quit_ = false; }
Delegate* SetDelegate(Delegate* delegate);
bool SetQuit(bool quit);
virtual void DoDelayedLooperWork();
virtual void DoNonDelayedLooperWork(bool do_idle_work);
private:
void ScheduleWorkInternal(bool do_idle_work);
void DoIdleWork();
// Unlike other platforms, we don't control the message loop as it's
......
......@@ -96,8 +96,7 @@ class MessagePumpForUIStub : public base::MessagePumpForUI {
if (g_state->run_depth > 1) {
RunNested(delegate);
} else {
ResetShouldQuit();
SetQuit(false);
SetDelegate(delegate);
// Pump the loop once in case we're starting off idle as ALooper_pollOnce
......
......@@ -7577,17 +7577,8 @@ IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
EvalJs(main_frame_2, "localStorage.getItem('visibilitychange_storage')"));
}
// Calling script from RenderFrameHostChanged is flaky on Android, so disable
// the test for now.
// TODO(crbug.com/1110497): Enable the test.
#if defined(OS_ANDROID)
#define MAYBE_NavigationAfterPagehideHistogram \
DISABLED_NavigationAfterPagehideHistogram
#else
#define MAYBE_NavigationAfterPagehideHistogram NavigationAfterPagehideHistogram
#endif
IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
MAYBE_NavigationAfterPagehideHistogram) {
NavigationAfterPagehideHistogram) {
ASSERT_TRUE(embedded_test_server()->Start());
GURL url_a1(embedded_test_server()->GetURL("a.com", "/title1.html"));
GURL url_a2(embedded_test_server()->GetURL("a.com", "/title2.html"));
......@@ -7613,18 +7604,8 @@ IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
ActionAfterPagehide::kNavigation, 1);
}
// Calling script from RenderFrameHostChanged is flaky on Android, so disable
// the test for now.
// TODO(crbug.com/1110497): Enable the test.
#if defined(OS_ANDROID)
#define MAYBE_PostMessageAfterPagehideHistogram \
DISABLED_PostMessageAfterPagehideHistogram
#else
#define MAYBE_PostMessageAfterPagehideHistogram \
PostMessageAfterPagehideHistogram
#endif
IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
MAYBE_PostMessageAfterPagehideHistogram) {
PostMessageAfterPagehideHistogram) {
ASSERT_TRUE(embedded_test_server()->Start());
GURL url_a1(embedded_test_server()->GetURL("a.com", "/title1.html"));
GURL url_a2(embedded_test_server()->GetURL("a.com", "/title2.html"));
......@@ -7678,18 +7659,8 @@ IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
}
}
// Calling script from RenderFrameHostChanged is flaky on Android, so disable
// the test for now.
// TODO(crbug.com/1110497): Enable the test.
#if defined(OS_ANDROID)
#define MAYBE_PostMessageAfterPagehideHistogramSubframe \
DISABLED_PostMessageAfterPagehideHistogramSubframe
#else
#define MAYBE_PostMessageAfterPagehideHistogramSubframe \
PostMessageAfterPagehideHistogramSubframe
#endif
IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
MAYBE_PostMessageAfterPagehideHistogramSubframe) {
PostMessageAfterPagehideHistogramSubframe) {
ASSERT_TRUE(embedded_test_server()->Start());
GURL url_a1(embedded_test_server()->GetURL(
"a.com", "/cross_site_iframe_factory.html?a(a)"));
......@@ -7759,18 +7730,8 @@ IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
}
}
// Calling script from RenderFrameHostChanged is flaky on Android, so disable
// the test for now.
// TODO(crbug.com/1110497): Enable the test.
#if defined(OS_ANDROID)
#define MAYBE_StorageModificationAfterPagehideHistogram \
DISABLED_StorageModificationAfterPagehideHistogram
#else
#define MAYBE_StorageModificationAfterPagehideHistogram \
StorageModificationAfterPagehideHistogram
#endif
IN_PROC_BROWSER_TEST_P(ProactivelySwapBrowsingInstancesSameSiteTest,
MAYBE_StorageModificationAfterPagehideHistogram) {
StorageModificationAfterPagehideHistogram) {
ASSERT_TRUE(embedded_test_server()->Start());
GURL url_a1(embedded_test_server()->GetURL("a.com", "/title1.html"));
GURL url_a2(embedded_test_server()->GetURL("a.com", "/title2.html"));
......@@ -8589,22 +8550,13 @@ IN_PROC_BROWSER_TEST_P(RenderFrameHostManagerTest,
SetBrowserClientForTesting(old_client);
}
// TODO(crbug.com/1110497): flaky on Android builders since 2020-07-28.
#if defined(OS_ANDROID)
#define MAYBE_NavigationRacesWithCommitInUnassignedSiteInstance \
DISABLED_NavigationRacesWithCommitInUnassignedSiteInstance
#else
#define MAYBE_NavigationRacesWithCommitInUnassignedSiteInstance \
NavigationRacesWithCommitInUnassignedSiteInstance
#endif
// Check that when a navigation to a URL that doesn't require assigning a site
// URL is in progress, another navigation can't reuse the same process in the
// meantime. Such reuse previously led to a renderer kill when the siteless
// URL later committed; a real-world example of the siteless URL was
// chrome-native://newtab. See https://crbug.com/970046.
IN_PROC_BROWSER_TEST_P(
RenderFrameHostManagerTest,
MAYBE_NavigationRacesWithCommitInUnassignedSiteInstance) {
IN_PROC_BROWSER_TEST_P(RenderFrameHostManagerTest,
NavigationRacesWithCommitInUnassignedSiteInstance) {
ASSERT_TRUE(embedded_test_server()->Start());
// Set up a URL for which ShouldAssignSiteForURL will return false. The
......@@ -8781,14 +8733,6 @@ class RenderFrameHostManagerDefaultProcessTest
DISALLOW_COPY_AND_ASSIGN(RenderFrameHostManagerDefaultProcessTest);
};
// TODO(crbug.com/1110497): flaky on Android builders since 2020-07-28.
#if defined(OS_ANDROID)
#define MAYBE_NavigationRacesWithSitelessCommitInDefaultProcess \
DISABLED_NavigationRacesWithSitelessCommitInDefaultProcess
#else
#define MAYBE_NavigationRacesWithSitelessCommitInDefaultProcess \
NavigationRacesWithSitelessCommitInDefaultProcess
#endif
// Ensure that the default process can be used for URLs that don't assign a site
// to the SiteInstance, when Site Isolation is not enabled.
// 1. Visit foo.com.
......@@ -8798,9 +8742,8 @@ class RenderFrameHostManagerDefaultProcessTest
// https://crbug.com/838348.)
// All navigations should use the default process, and we should not crash.
// See https://crbug.com/977956.
IN_PROC_BROWSER_TEST_P(
RenderFrameHostManagerDefaultProcessTest,
MAYBE_NavigationRacesWithSitelessCommitInDefaultProcess) {
IN_PROC_BROWSER_TEST_P(RenderFrameHostManagerDefaultProcessTest,
NavigationRacesWithSitelessCommitInDefaultProcess) {
// This test is designed to run without strict site isolation.
if (AreAllSitesIsolatedForTesting())
return;
......
......@@ -5,6 +5,7 @@
#include "content/public/test/nested_message_pump_android.h"
#include "base/android/jni_android.h"
#include "base/auto_reset.h"
#include "content/public/test/android/test_support_content_jni_headers/NestedSystemMessageHandler_jni.h"
namespace content {
......@@ -19,25 +20,42 @@ NestedMessagePumpAndroid::~NestedMessagePumpAndroid() = default;
// isn't re-entrant safe. Instead, we keep the entire run loop in Java (in
// MessageQueue.next()).
void NestedMessagePumpAndroid::Run(base::MessagePump::Delegate* delegate) {
auto* env = base::android::AttachCurrentThread();
SetDelegate(delegate);
ResetShouldQuit();
ScheduleWork();
// Preserve delegate and quit state of the current run loop so it can be
// restored after the loop below has completed.
auto* old_delegate = SetDelegate(delegate);
bool old_quit = SetQuit(false);
ScheduleWork();
while (!ShouldQuit()) {
// Dispatch the first available Java message and one or more C++ messages as
// a side effect. If there are no Java messages available, this call will
// block until one becomes available (while continuing to process C++ work).
bool ret = Java_NestedSystemMessageHandler_dispatchOneMessage(env);
CHECK(ret) << "Error running java message loop, tests will likely fail.";
RunJavaSystemMessageHandler();
// Handle deferred work if necessary.
// Note: |deferred_work_type_| and |deferred_do_idle_work_| must be reset
// here because another Run() loop can be triggered when we dispatch work.
CHECK(!ShouldDeferWork());
auto work_type = std::exchange(deferred_work_type_, kNone);
auto do_idle_work = std::exchange(deferred_do_idle_work_, true);
switch (work_type) {
case kNone:
// Do nothing.
break;
case kDelayed:
base::MessagePumpForUI::DoDelayedLooperWork();
break;
case kNonDelayed:
base::MessagePumpForUI::DoNonDelayedLooperWork(do_idle_work);
break;
}
}
// Restore old run loop state.
SetDelegate(old_delegate);
SetQuit(old_quit);
}
void NestedMessagePumpAndroid::Quit() {
// Wake up the Java message dispatcher to exit the loop in Run().
auto* env = base::android::AttachCurrentThread();
Java_NestedSystemMessageHandler_postQuitMessage(env);
MessagePumpForUI::Quit();
QuitJavaSystemMessageHandler();
SetQuit(true);
}
// Since this pump allows Run() to be called on the UI thread, there's no need
......@@ -45,4 +63,68 @@ void NestedMessagePumpAndroid::Quit() {
// the top level run loop from being considered a nested one.
void NestedMessagePumpAndroid::Attach(Delegate*) {}
void NestedMessagePumpAndroid::DoDelayedLooperWork() {
if (ShouldDeferWork()) {
switch (deferred_work_type_) {
case kNone:
deferred_work_type_ = kDelayed;
break;
case kDelayed:
// Do nothing. We are already going to process the next delayed work.
break;
case kNonDelayed:
// Do nothing. The immediate work will be processed and then a
// request to process immediate and idle work will be made. This
// will unconditionally process the next work item which should be
// the delayed item.
break;
}
QuitJavaSystemMessageHandler();
return;
}
base::MessagePumpForUI::DoDelayedLooperWork();
}
void NestedMessagePumpAndroid::DoNonDelayedLooperWork(bool do_idle_work) {
if (ShouldDeferWork()) {
deferred_work_type_ = kNonDelayed;
// Only do idle work if we are consistently asked to do it. If there
// is a request to defer the idle work, then we will honor that over
// any previous request to do idle work. Once the non-delayed work is
// dispatched, another request to do the idle work will be made.
deferred_do_idle_work_ &= do_idle_work;
QuitJavaSystemMessageHandler();
return;
}
base::MessagePumpForUI::DoNonDelayedLooperWork(do_idle_work);
}
void NestedMessagePumpAndroid::RunJavaSystemMessageHandler() {
auto* env = base::android::AttachCurrentThread();
CHECK(!quit_message_handler_);
CHECK(!inside_run_message_handler_);
base::AutoReset<bool> auto_reset(&inside_run_message_handler_, true);
// Dispatch the first available Java message and one or more C++ messages as
// a side effect. If there are no Java messages available, this call will
// block until one becomes available (while continuing to process C++ work).
bool ret = Java_NestedSystemMessageHandler_dispatchOneMessage(env);
CHECK(ret) << "Error running java message loop, tests will likely fail.";
quit_message_handler_ = false;
}
void NestedMessagePumpAndroid::QuitJavaSystemMessageHandler() {
if (!inside_run_message_handler_ || quit_message_handler_)
return;
quit_message_handler_ = true;
// Wake up the Java message dispatcher to exit dispatchOneMessage().
auto* env = base::android::AttachCurrentThread();
Java_NestedSystemMessageHandler_postQuitMessage(env);
}
} // namespace content
......@@ -21,8 +21,39 @@ class NestedMessagePumpAndroid : public base::MessagePumpForUI {
void Run(Delegate*) override;
void Quit() override;
void Attach(Delegate*) override;
void DoDelayedLooperWork() override;
void DoNonDelayedLooperWork(bool do_idle_work) override;
private:
// Returns true if the work should be deferred.
bool ShouldDeferWork() const { return inside_run_message_handler_; }
// Calls Java_NestedSystemMessageHandler_dispatchOneMessage() to service the
// native looper, dispatch a pending Java message, or wait until a Java
// message or looper event arrives.
void RunJavaSystemMessageHandler();
// Sends a signal that causes the current RunMessageHandler() call to return
// as soon as possible.
void QuitJavaSystemMessageHandler();
// Tracks whether a RunMessageHandler() call is currently on the stack.
bool inside_run_message_handler_ = false;
// Tracks whether a signal has been sent to trigger the current
// RunMessageHandler() call to return.
bool quit_message_handler_ = false;
// Keeps track of the work that needs to be dispatched after
// RunJavaSystemMessageHandler() returns.
enum DeferredWorkType {
kNone,
kDelayed,
kNonDelayed,
};
DeferredWorkType deferred_work_type_ = kNone;
bool deferred_do_idle_work_ = true;
DISALLOW_COPY_AND_ASSIGN(NestedMessagePumpAndroid);
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment