Commit d6ce9d7c authored by Benoit Lize, committed by Commit Bot

[PartitionAlloc] Replace SpinLock with a spinning lock on Linux kernels.

PartitionAlloc's main lock is the per-PartitionRoot lock. It has to be
fast on the common uncontended path, and better than a simple spinlock
when it's contended.

Past attempts to replace it with base::Lock regressed both low- and
high-level benchmarks. This commit introduces SpinningFutex, a
futex()-based lock that spins in userspace (like SpinLock) before calling
into the kernel (like base::Lock).

It is intended to be as fast as SpinLock in the uncontended case, and
better behaved under contention. Since it relies on the Linux-specific
futex() system call, it is only available on Linux-based kernels for now.

Bug: 1125866, 1125999, 1061437
Change-Id: I1e0cf23beb48c32518d0ffc293d1a9326bc2d7bc
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2409902
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Kentaro Hara <haraken@chromium.org>
Cr-Commit-Position: refs/heads/master@{#808343}
parent d1c780ab
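
Condensed overview: the new lock is the classic three-state futex mutex (cf.
Ulrich Drepper's "Futexes Are Tricky") with userspace spinning bolted on the
front. The sketch below is not the exact Chromium code: it hard-codes an x86
PAUSE where the CL uses the YIELD_PROCESSOR macro, and omits errno handling.
The full implementation is in spinning_futex_linux.{h,cc} in the diff below.

// Sketch of a spin-then-futex() lock, assuming Linux and x86 (GCC/Clang).
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <atomic>
#include <cstdint>

class SpinningFutexSketch {
 public:
  void Acquire() {
    // Spin in userspace first: cheap when uncontended, or when the holder
    // releases quickly.
    for (int tries = 0; tries < kSpinCount; tries++) {
      int32_t expected = kUnlocked;
      if (state_.compare_exchange_strong(expected, kLockedUncontended,
                                         std::memory_order_acquire,
                                         std::memory_order_relaxed)) {
        return;
      }
      __builtin_ia32_pause();  // x86 PAUSE; the CL uses YIELD_PROCESSOR.
    }
    // Contended: mark the lock contended, then sleep in the kernel until a
    // releasing thread wakes us. Loop, as another thread may grab the lock
    // between the wakeup and the exchange.
    while (state_.exchange(kLockedContended, std::memory_order_acquire) !=
           kUnlocked) {
      syscall(SYS_futex, &state_, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
              kLockedContended, nullptr, nullptr, 0);
    }
  }

  void Release() {
    // Only make the (comparatively expensive) syscall if a waiter may be
    // sleeping in the kernel.
    if (state_.exchange(kUnlocked, std::memory_order_release) ==
        kLockedContended) {
      syscall(SYS_futex, &state_, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
              1 /* wake a single waiter */, nullptr, nullptr, 0);
    }
  }

 private:
  static constexpr int kUnlocked = 0;
  static constexpr int kLockedUncontended = 1;
  static constexpr int kLockedContended = 2;
  static constexpr int kSpinCount = 10;
  std::atomic<int32_t> state_{kUnlocked};
};
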
base/BUILD.gn:

@@ -1794,6 +1794,7 @@ component("base") {
     "allocator/partition_allocator/random.h",
     "allocator/partition_allocator/thread_cache.cc",
     "allocator/partition_allocator/thread_cache.h",
+    "allocator/partition_allocator/yield_processor.h",
   ]
   if (is_win) {
     sources +=
@@ -1806,6 +1807,13 @@ component("base") {
       "allocator/partition_allocator/page_allocator_internals_fuchsia.h",
     ]
   }
+  if (is_linux || is_android) {
+    sources += [
+      "allocator/partition_allocator/spinning_futex_linux.cc",
+      "allocator/partition_allocator/spinning_futex_linux.h",
+    ]
+  }
 }
 }
@@ -3217,6 +3225,7 @@ test("base_unittests") {
     "allocator/partition_allocator/memory_reclaimer_unittest.cc",
     "allocator/partition_allocator/page_allocator_unittest.cc",
     "allocator/partition_allocator/partition_alloc_unittest.cc",
+    "allocator/partition_allocator/partition_lock_unittest.cc",
     "allocator/partition_allocator/thread_cache_unittest.cc",
   ]
 }
...
base/allocator/partition_allocator/partition_lock.cc:

@@ -4,8 +4,7 @@
 #include "base/allocator/partition_allocator/partition_lock.h"
-#if !DCHECK_IS_ON()
+#include "base/allocator/partition_allocator/yield_processor.h"
 #include "base/threading/platform_thread.h"
 #if defined(OS_WIN)
@@ -14,48 +13,13 @@
 #include <sched.h>
 #endif
-// The YIELD_PROCESSOR macro wraps an architecture-specific instruction that
-// informs the processor we're in a busy wait, so it can handle the branch more
-// intelligently and e.g. reduce power to our core or give more resources to the
-// other hyper-thread on this core. See the following for context:
-// https://software.intel.com/en-us/articles/benefitting-power-and-performance-sleep-loops
-//
 // The YIELD_THREAD macro tells the OS to relinquish our quantum. This is
 // basically a worst-case fallback, and if you're hitting it with any frequency
 // you really should be using a proper lock (such as |base::Lock|) rather than
 // these spinlocks.
 #if defined(OS_WIN)
-#define YIELD_PROCESSOR YieldProcessor()
 #define YIELD_THREAD SwitchToThread()
 #elif defined(OS_POSIX) || defined(OS_FUCHSIA)
-#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_X86)
-#define YIELD_PROCESSOR __asm__ __volatile__("pause")
-#elif (defined(ARCH_CPU_ARMEL) && __ARM_ARCH >= 6) || defined(ARCH_CPU_ARM64)
-#define YIELD_PROCESSOR __asm__ __volatile__("yield")
-#elif defined(ARCH_CPU_MIPSEL)
-// The MIPS32 docs state that the PAUSE instruction is a no-op on older
-// architectures (first added in MIPS32r2). To avoid assembler errors when
-// targeting pre-r2, we must encode the instruction manually.
-#define YIELD_PROCESSOR __asm__ __volatile__(".word 0x00000140")
-#elif defined(ARCH_CPU_MIPS64EL) && __mips_isa_rev >= 2
-// Don't bother using .word here, since r2 is the lowest mips64 revision that
-// Chromium supports.
-#define YIELD_PROCESSOR __asm__ __volatile__("pause")
-#elif defined(ARCH_CPU_PPC64_FAMILY)
-#define YIELD_PROCESSOR __asm__ __volatile__("or 31,31,31")
-#elif defined(ARCH_CPU_S390_FAMILY)
-// Just do nothing.
-#define YIELD_PROCESSOR ((void)0)
-#endif  // ARCH
-#ifndef YIELD_PROCESSOR
-#warning "Processor yield not supported on this architecture."
-#define YIELD_PROCESSOR ((void)0)
-#endif
 #define YIELD_THREAD sched_yield()
 #else  // Other OS
@@ -102,5 +66,3 @@ void SpinLock::AcquireSlow() {
 }  // namespace internal
 }  // namespace base
-#endif  // !DCHECK_IS_ON()
base/allocator/partition_allocator/partition_lock.h:

@@ -10,10 +10,14 @@
 #include "base/allocator/buildflags.h"
 #include "base/no_destructor.h"
-#include "base/synchronization/lock.h"
+#include "base/thread_annotations.h"
 #include "base/threading/platform_thread.h"
 #include "build/build_config.h"
+#if defined(OS_LINUX) || defined(OS_ANDROID)
+#include "base/allocator/partition_allocator/spinning_futex_linux.h"
+#endif
 namespace base {
 namespace internal {
@@ -53,7 +57,6 @@ class SCOPED_LOCKABLE ScopedUnlockGuard {
   MaybeSpinLock<thread_safe>& lock_;
 };
-#if !DCHECK_IS_ON()
 // Spinlock. Do not use, to be removed. crbug.com/1061437.
 class BASE_EXPORT SpinLock {
  public:
@@ -80,7 +83,6 @@ class BASE_EXPORT SpinLock {
   std::atomic_int lock_{0};
 };
-#endif  // !DCHECK_IS_ON()
 template <>
 class LOCKABLE MaybeSpinLock<true> {
@@ -134,14 +136,9 @@ class LOCKABLE MaybeSpinLock<true> {
   }
  private:
-#if DCHECK_IS_ON()
-  // NoDestructor to avoid issues with the "static destruction order fiasco".
-  //
-  // This also means that we leak a lock when a partition is destructed. This
-  // will in practice only show in some tests, as partitions are not destructed
-  // in regular use. In addition, on most platforms, base::Lock doesn't allocate
-  // memory and neither does the OS library, and the destructor is a no-op.
-  base::NoDestructor<base::Lock> lock_;
+  // DCHECK_IS_ON() for now to check stability before performance.
+#if DCHECK_IS_ON() && (defined(OS_LINUX) || defined(OS_ANDROID))
+  base::NoDestructor<SpinningFutex> lock_;
 #else
   // base::Lock is slower on the fast path than SpinLock, hence we still use it
   // on non-DCHECK() builds. crbug.com/1125999
...
base/allocator/partition_allocator/partition_lock_unittest.cc (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/allocator/partition_allocator/partition_lock.h"
#include "base/bind.h"
#include "base/callback.h"
#include "base/macros.h"
#include "base/test/bind_test_util.h"
#include "base/test/gtest_util.h"
#include "base/threading/platform_thread.h"
#include "base/time/time.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace base {
namespace internal {
namespace {
class LambdaThreadDelegate : public PlatformThread::Delegate {
public:
explicit LambdaThreadDelegate(RepeatingClosure f) : f_(f) {}
void ThreadMain() override { f_.Run(); }
private:
RepeatingClosure f_;
};
TEST(SpinLockTest, Simple) {
MaybeSpinLock<true> lock;
lock.Lock();
lock.Unlock();
}
MaybeSpinLock<true> g_lock;
TEST(SpinLockTest, StaticLockStartsUnlocked) {
g_lock.Lock();
g_lock.Unlock();
}
TEST(SpinLockTest, Contended) {
int counter = 0; // *Not* atomic.
std::vector<PlatformThreadHandle> thread_handles;
constexpr int iterations_per_thread = 1000000;
constexpr int num_threads = 4;
MaybeSpinLock<true> lock;
MaybeSpinLock<true> start_lock;
LambdaThreadDelegate delegate{BindLambdaForTesting([&]() {
start_lock.Lock();
start_lock.Unlock();
for (int i = 0; i < iterations_per_thread; i++) {
lock.Lock();
counter++;
lock.Unlock();
}
})};
start_lock.Lock(); // Make sure that the threads compete, by waiting until
// all of them have at least been created.
for (int i = 0; i < num_threads; i++) {
PlatformThreadHandle handle;
PlatformThread::Create(0, &delegate, &handle);
thread_handles.push_back(handle);
}
start_lock.Unlock();
for (int i = 0; i < num_threads; i++) {
PlatformThread::Join(thread_handles[i]);
}
EXPECT_EQ(iterations_per_thread * num_threads, counter);
}
TEST(SpinLockTest, SlowThreads) {
int counter = 0; // *Not* atomic.
std::vector<PlatformThreadHandle> thread_handles;
constexpr int iterations_per_thread = 100;
constexpr int num_threads = 4;
MaybeSpinLock<true> lock;
MaybeSpinLock<true> start_lock;
LambdaThreadDelegate delegate{BindLambdaForTesting([&]() {
start_lock.Lock();
start_lock.Unlock();
for (int i = 0; i < iterations_per_thread; i++) {
lock.Lock();
counter++;
// Hold the lock for a while, to force futex()-based locks to sleep.
PlatformThread::Sleep(TimeDelta::FromMilliseconds(1));
lock.Unlock();
}
})};
start_lock.Lock(); // Make sure that the threads compete, by waiting until
// all of them have at least been created.
for (int i = 0; i < num_threads; i++) {
PlatformThreadHandle handle;
PlatformThread::Create(0, &delegate, &handle);
thread_handles.push_back(handle);
}
start_lock.Unlock();
for (int i = 0; i < num_threads; i++) {
PlatformThread::Join(thread_handles[i]);
}
EXPECT_EQ(iterations_per_thread * num_threads, counter);
}
} // namespace
} // namespace internal
} // namespace base
base/allocator/partition_allocator/spinning_futex_linux.cc (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/allocator/partition_allocator/spinning_futex_linux.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "build/build_config.h"
#if defined(OS_LINUX) || defined(OS_ANDROID)
#include <errno.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
namespace base {
namespace internal {
void SpinningFutex::FutexWait() {
// Save and restore errno.
int saved_errno = errno;
  // Don't check the return value, as we will not be awakened by a timeout,
  // since none is specified.
//
// Ignoring the return value doesn't impact correctness, as this acts as an
// immediate wakeup. For completeness, the possible errors for FUTEX_WAIT are:
// - EACCES: state_ is not readable. Should not happen.
// - EAGAIN: the value is not as expected, that is not |kLockedContended|, in
// which case retrying the loop is the right behavior.
// - EINTR: signal, looping is the right behavior.
// - EINVAL: invalid argument.
//
// Note: not checking the return value is the approach used in bionic and
// glibc as well.
//
// Will return immediately if |state_| is no longer equal to
// |kLockedContended|. Otherwise, sleeps and wakes up when |state_| may not be
// |kLockedContended| anymore. Note that even without spurious wakeups, the
// value of |state_| is not guaranteed when this returns, as another thread
// may get the lock before we get to run.
int err = syscall(SYS_futex, &state_, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
kLockedContended, nullptr, nullptr, 0);
if (err) {
    // These are programming errors; check them.
PA_DCHECK(errno != EACCES);
PA_DCHECK(errno != EINVAL);
}
errno = saved_errno;
}
void SpinningFutex::FutexWake() {
int saved_errno = errno;
long retval = syscall(SYS_futex, &state_, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
1 /* wake up a single waiter */, nullptr, nullptr, 0);
PA_CHECK(retval != -1);
errno = saved_errno;
}
void SpinningFutex::LockSlow() {
  // If this thread gets awakened but another one got the lock first, then go
  // back to sleep. See comments in |FutexWait()| for why a loop is required.
while (state_.exchange(kLockedContended, std::memory_order_acquire) !=
kUnlocked) {
FutexWait();
}
}
} // namespace internal
} // namespace base
#endif // defined(OS_LINUX) || defined(OS_ANDROID)
base/allocator/partition_allocator/spinning_futex_linux.h (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_FUTEX_LINUX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_FUTEX_LINUX_H_
#include <atomic>
#include "base/allocator/partition_allocator/yield_processor.h"
#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "build/build_config.h"
#if !(defined(OS_LINUX) || defined(OS_ANDROID))
#error "Not supported"
#endif
namespace base {
namespace internal {
// Simple spinning futex lock. It will spin in user space a set number of times
// before going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
// compare-and-swap. Short waits do not go into the kernel. Good behavior in
// low contention cases.
// - base::Lock: Good behavior in case of contention.
//
// We don't rely on base::Lock, which we could make spin (by calling Try() in a
// loop), as its performance is below that of a custom spinlock, as seen in
// high-level benchmarks. Instead, this implements a simple non-recursive mutex
// on top of the futex() syscall. The main difference between this and a libc
// implementation is that it only supports the simplest path: private (to a
// process), non-recursive mutexes with no priority inheritance and no timed
// waits.
//
// As a useful side-effect for the allocator, this code does not allocate;
// locks are small, with a constexpr constructor and no destructor.
class BASE_EXPORT SpinningFutex {
public:
inline constexpr SpinningFutex();
ALWAYS_INLINE void Acquire();
ALWAYS_INLINE void Release();
void AssertAcquired() const {} // Not supported.
private:
void LockSlow();
void FutexWait();
void FutexWake();
static constexpr int kUnlocked = 0;
static constexpr int kLockedUncontended = 1;
static constexpr int kLockedContended = 2;
// Same as SpinLock, not scientifically calibrated.
static constexpr int kSpinCount = 10;
std::atomic<int32_t> state_{kUnlocked};
};
ALWAYS_INLINE void SpinningFutex::Acquire() {
int tries = 0;
// Busy-waiting is inlined, which is fine as long as we have few callers. This
// is only used for the partition lock, so this is the case.
do {
int expected = kUnlocked;
if (LIKELY(state_.compare_exchange_strong(expected, kLockedUncontended,
std::memory_order_acquire,
std::memory_order_relaxed))) {
return;
}
YIELD_PROCESSOR;
tries++;
} while (tries < kSpinCount);
LockSlow();
}
inline constexpr SpinningFutex::SpinningFutex() = default;
ALWAYS_INLINE void SpinningFutex::Release() {
if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
kLockedContended)) {
// |kLockedContended|: there is a waiter to wake up.
//
    // Here there is a window where the lock is unlocked, since we just set it
    // to |kUnlocked| above, meaning that another thread can grab the lock
    // in-between now and |FutexWake()| waking up a waiter. Aside from
    // potentially reduced fairness, this is not an issue, as the
    // newly-awakened thread will check whether the lock is still free.
//
// There is a small pessimization here though: if we have a single waiter,
// then when it wakes up, the lock will be set to |kLockedContended|, so
// when this waiter releases the lock, it will needlessly call
// |FutexWake()|, even though there are no waiters. This is supported by the
// kernel, and is what bionic (Android's libc) also does.
FutexWake();
}
}
} // namespace internal
} // namespace base
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_FUTEX_LINUX_H_
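
For illustration, a hypothetical direct use of the lock (a sketch; in the CL
itself it is only reached through MaybeSpinLock<true> in partition_lock.h, as
the unit tests above do; |g_counter_lock| and IncrementCounter() are made up):

#include "base/allocator/partition_allocator/spinning_futex_linux.h"

namespace {
// constexpr-constructible, no destructor: safe as a global in the allocator.
base::internal::SpinningFutex g_counter_lock;
int g_counter = 0;  // Guarded by |g_counter_lock|.
}  // namespace

void IncrementCounter() {
  g_counter_lock.Acquire();  // Spins up to kSpinCount times, then futex-waits.
  g_counter++;
  g_counter_lock.Release();  // FutexWake() only if a waiter may be sleeping.
}
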
base/allocator/partition_allocator/yield_processor.h (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_YIELD_PROCESSOR_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_YIELD_PROCESSOR_H_
#include "build/build_config.h"
// The YIELD_PROCESSOR macro wraps an architecture-specific instruction that
// informs the processor we're in a busy wait, so it can handle the branch more
// intelligently and e.g. reduce power to our core or give more resources to the
// other hyper-thread on this core. See the following for context:
// https://software.intel.com/en-us/articles/benefitting-power-and-performance-sleep-loops
#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_X86)
#define YIELD_PROCESSOR __asm__ __volatile__("pause")
#elif (defined(ARCH_CPU_ARMEL) && __ARM_ARCH >= 6) || defined(ARCH_CPU_ARM64)
#define YIELD_PROCESSOR __asm__ __volatile__("yield")
#elif defined(ARCH_CPU_MIPSEL)
// The MIPS32 docs state that the PAUSE instruction is a no-op on older
// architectures (first added in MIPS32r2). To avoid assembler errors when
// targeting pre-r2, we must encode the instruction manually.
#define YIELD_PROCESSOR __asm__ __volatile__(".word 0x00000140")
#elif defined(ARCH_CPU_MIPS64EL) && __mips_isa_rev >= 2
// Don't bother using .word here, since r2 is the lowest mips64 revision that
// Chromium supports.
#define YIELD_PROCESSOR __asm__ __volatile__("pause")
#elif defined(ARCH_CPU_PPC64_FAMILY)
#define YIELD_PROCESSOR __asm__ __volatile__("or 31,31,31")
#elif defined(ARCH_CPU_S390_FAMILY)
// Just do nothing.
#define YIELD_PROCESSOR ((void)0)
#endif // ARCH
#ifndef YIELD_PROCESSOR
#warning "Processor yield not supported on this architecture."
#define YIELD_PROCESSOR ((void)0)
#endif
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_YIELD_PROCESSOR_H_
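
The macro is consumed in busy-wait loops such as SpinningFutex::Acquire()
above. A minimal standalone sketch (SpinUntilTrue is a hypothetical helper,
not part of the CL):

#include <atomic>

#include "base/allocator/partition_allocator/yield_processor.h"

// Spin until |flag| becomes true, hinting to the CPU that this is a busy wait
// so it can save power or yield resources to the sibling hyper-thread.
inline void SpinUntilTrue(const std::atomic<bool>& flag) {
  while (!flag.load(std::memory_order_acquire))
    YIELD_PROCESSOR;
}
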