Commit d6ce9d7c authored by Benoit Lize, committed by Commit Bot

[PartitionAlloc] Replace SpinLock with a spinning lock on Linux kernels.

PartitionAlloc's main lock is the per-PartitionRoot lock. It has to be
fast on the common uncontended path, and better than a simple spinlock
when it's contended.

Past attempts to replace it with base::Lock regressed both low- and
high-level benchmarks. This commit introduces SpinningFutex, a
futex()-based lock that spins in userspace (like SpinLock) before calling
into the kernel (like base::Lock).

It is intended to be as fast as SpinLock in the uncontended case, and
better behaved under contention. Since it relies on the Linux-specific
futex() system call, it is only available on Linux-based kernels for now.

Bug: 1125866, 1125999, 1061437
Change-Id: I1e0cf23beb48c32518d0ffc293d1a9326bc2d7bc
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2409902
Commit-Queue: Benoit L <lizeb@chromium.org>
Reviewed-by: Kentaro Hara <haraken@chromium.org>
Cr-Commit-Position: refs/heads/master@{#808343}
parent d1c780ab
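
Condensed overview: the new lock is the classic three-state futex mutex (cf.
Ulrich Drepper's "Futexes Are Tricky") with userspace spinning bolted on the
front. The sketch below is not the exact Chromium code: it hard-codes an x86
PAUSE where the CL uses the YIELD_PROCESSOR macro, and omits errno handling.
The full implementation is in spinning_futex_linux.{h,cc} in the diff below.

// Sketch of a spin-then-futex() lock, assuming Linux and x86 (GCC/Clang).
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <atomic>
#include <cstdint>

class SpinningFutexSketch {
 public:
  void Acquire() {
    // Spin in userspace first: cheap when uncontended, or when the holder
    // releases quickly.
    for (int tries = 0; tries < kSpinCount; tries++) {
      int32_t expected = kUnlocked;
      if (state_.compare_exchange_strong(expected, kLockedUncontended,
                                         std::memory_order_acquire,
                                         std::memory_order_relaxed)) {
        return;
      }
      __builtin_ia32_pause();  // x86 PAUSE; the CL uses YIELD_PROCESSOR.
    }
    // Contended: mark the lock contended, then sleep in the kernel until a
    // releasing thread wakes us. Loop, as another thread may grab the lock
    // between the wakeup and the exchange.
    while (state_.exchange(kLockedContended, std::memory_order_acquire) !=
           kUnlocked) {
      syscall(SYS_futex, &state_, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
              kLockedContended, nullptr, nullptr, 0);
    }
  }

  void Release() {
    // Only make the (comparatively expensive) syscall if a waiter may be
    // sleeping in the kernel.
    if (state_.exchange(kUnlocked, std::memory_order_release) ==
        kLockedContended) {
      syscall(SYS_futex, &state_, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
              1 /* wake a single waiter */, nullptr, nullptr, 0);
    }
  }

 private:
  static constexpr int kUnlocked = 0;
  static constexpr int kLockedUncontended = 1;
  static constexpr int kLockedContended = 2;
  static constexpr int kSpinCount = 10;
  std::atomic<int32_t> state_{kUnlocked};
};
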
base/BUILD.gn:

@@ -1794,6 +1794,7 @@ component("base") {
     "allocator/partition_allocator/random.h",
     "allocator/partition_allocator/thread_cache.cc",
     "allocator/partition_allocator/thread_cache.h",
+    "allocator/partition_allocator/yield_processor.h",
   ]
   if (is_win) {
     sources +=
@@ -1806,6 +1807,13 @@ component("base") {
       "allocator/partition_allocator/page_allocator_internals_fuchsia.h",
     ]
   }
+  if (is_linux || is_android) {
+    sources += [
+      "allocator/partition_allocator/spinning_futex_linux.cc",
+      "allocator/partition_allocator/spinning_futex_linux.h",
+    ]
+  }
 }
 }
@@ -3217,6 +3225,7 @@ test("base_unittests") {
     "allocator/partition_allocator/memory_reclaimer_unittest.cc",
     "allocator/partition_allocator/page_allocator_unittest.cc",
     "allocator/partition_allocator/partition_alloc_unittest.cc",
+    "allocator/partition_allocator/partition_lock_unittest.cc",
     "allocator/partition_allocator/thread_cache_unittest.cc",
   ]
 }
...
base/allocator/partition_allocator/partition_lock.cc:

@@ -4,8 +4,7 @@
 #include "base/allocator/partition_allocator/partition_lock.h"
-#if !DCHECK_IS_ON()
+#include "base/allocator/partition_allocator/yield_processor.h"
 #include "base/threading/platform_thread.h"
 #if defined(OS_WIN)
@@ -14,48 +13,13 @@
 #include <sched.h>
 #endif
-// The YIELD_PROCESSOR macro wraps an architecture-specific instruction that
-// informs the processor we're in a busy wait, so it can handle the branch more
-// intelligently and e.g. reduce power to our core or give more resources to the
-// other hyper-thread on this core. See the following for context:
-// https://software.intel.com/en-us/articles/benefitting-power-and-performance-sleep-loops
-//
 // The YIELD_THREAD macro tells the OS to relinquish our quantum. This is
 // basically a worst-case fallback, and if you're hitting it with any frequency
 // you really should be using a proper lock (such as |base::Lock|) rather than
 // these spinlocks.
 #if defined(OS_WIN)
-#define YIELD_PROCESSOR YieldProcessor()
 #define YIELD_THREAD SwitchToThread()
 #elif defined(OS_POSIX) || defined(OS_FUCHSIA)
-#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_X86)
-#define YIELD_PROCESSOR __asm__ __volatile__("pause")
-#elif (defined(ARCH_CPU_ARMEL) && __ARM_ARCH >= 6) || defined(ARCH_CPU_ARM64)
-#define YIELD_PROCESSOR __asm__ __volatile__("yield")
-#elif defined(ARCH_CPU_MIPSEL)
-// The MIPS32 docs state that the PAUSE instruction is a no-op on older
-// architectures (first added in MIPS32r2). To avoid assembler errors when
-// targeting pre-r2, we must encode the instruction manually.
-#define YIELD_PROCESSOR __asm__ __volatile__(".word 0x00000140")
-#elif defined(ARCH_CPU_MIPS64EL) && __mips_isa_rev >= 2
-// Don't bother using .word here, since r2 is the lowest mips64 revision that
-// Chromium supports.
-#define YIELD_PROCESSOR __asm__ __volatile__("pause")
-#elif defined(ARCH_CPU_PPC64_FAMILY)
-#define YIELD_PROCESSOR __asm__ __volatile__("or 31,31,31")
-#elif defined(ARCH_CPU_S390_FAMILY)
-// Just do nothing.
-#define YIELD_PROCESSOR ((void)0)
-#endif  // ARCH
-#ifndef YIELD_PROCESSOR
-#warning "Processor yield not supported on this architecture."
-#define YIELD_PROCESSOR ((void)0)
-#endif
 #define YIELD_THREAD sched_yield()
 #else  // Other OS
@@ -102,5 +66,3 @@ void SpinLock::AcquireSlow() {
 }  // namespace internal
 }  // namespace base
-#endif  // !DCHECK_IS_ON()
base/allocator/partition_allocator/partition_lock.h:

@@ -10,10 +10,14 @@
 #include "base/allocator/buildflags.h"
 #include "base/no_destructor.h"
-#include "base/synchronization/lock.h"
+#include "base/thread_annotations.h"
 #include "base/threading/platform_thread.h"
 #include "build/build_config.h"
+#if defined(OS_LINUX) || defined(OS_ANDROID)
+#include "base/allocator/partition_allocator/spinning_futex_linux.h"
+#endif
 namespace base {
 namespace internal {
@@ -53,7 +57,6 @@ class SCOPED_LOCKABLE ScopedUnlockGuard {
   MaybeSpinLock<thread_safe>& lock_;
 };
-#if !DCHECK_IS_ON()
 // Spinlock. Do not use, to be removed. crbug.com/1061437.
 class BASE_EXPORT SpinLock {
  public:
@@ -80,7 +83,6 @@ class BASE_EXPORT SpinLock {
   std::atomic_int lock_{0};
 };
-#endif  // !DCHECK_IS_ON()
 template <>
 class LOCKABLE MaybeSpinLock<true> {
@@ -134,14 +136,9 @@ class LOCKABLE MaybeSpinLock<true> {
   }
  private:
-#if DCHECK_IS_ON()
-  // NoDestructor to avoid issues with the "static destruction order fiasco".
-  //
-  // This also means that we leak a lock when a partition is destructed. This
-  // will in practice only show in some tests, as partitions are not destructed
-  // in regular use. In addition, on most platforms, base::Lock doesn't allocate
-  // memory and neither does the OS library, and the destructor is a no-op.
-  base::NoDestructor<base::Lock> lock_;
+  // DCHECK_IS_ON() for now to check stability before performance.
+#if DCHECK_IS_ON() && (defined(OS_LINUX) || defined(OS_ANDROID))
+  base::NoDestructor<SpinningFutex> lock_;
 #else
   // base::Lock is slower on the fast path than SpinLock, hence we still use it
   // on non-DCHECK() builds. crbug.com/1125999
...
base/allocator/partition_allocator/partition_lock_unittest.cc (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/allocator/partition_allocator/partition_lock.h"
#include "base/bind.h"
#include "base/callback.h"
#include "base/macros.h"
#include "base/test/bind_test_util.h"
#include "base/test/gtest_util.h"
#include "base/threading/platform_thread.h"
#include "base/time/time.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace base {
namespace internal {
namespace {
class LambdaThreadDelegate : public PlatformThread::Delegate {
public:
explicit LambdaThreadDelegate(RepeatingClosure f) : f_(f) {}
void ThreadMain() override { f_.Run(); }
private:
RepeatingClosure f_;
};
TEST(SpinLockTest, Simple) {
MaybeSpinLock<true> lock;
lock.Lock();
lock.Unlock();
}
MaybeSpinLock<true> g_lock;
TEST(SpinLockTest, StaticLockStartsUnlocked) {
g_lock.Lock();
g_lock.Unlock();
}
TEST(SpinLockTest, Contended) {
int counter = 0; // *Not* atomic.
std::vector<PlatformThreadHandle> thread_handles;
constexpr int iterations_per_thread = 1000000;
constexpr int num_threads = 4;
MaybeSpinLock<true> lock;
MaybeSpinLock<true> start_lock;
LambdaThreadDelegate delegate{BindLambdaForTesting([&]() {
start_lock.Lock();
start_lock.Unlock();
for (int i = 0; i < iterations_per_thread; i++) {
lock.Lock();
counter++;
lock.Unlock();
}
})};
start_lock.Lock(); // Make sure that the threads compete, by waiting until
// all of them have at least been created.
for (int i = 0; i < num_threads; i++) {
PlatformThreadHandle handle;
PlatformThread::Create(0, &delegate, &handle);
thread_handles.push_back(handle);
}
start_lock.Unlock();
for (int i = 0; i < num_threads; i++) {
PlatformThread::Join(thread_handles[i]);
}
EXPECT_EQ(iterations_per_thread * num_threads, counter);
}
TEST(SpinLockTest, SlowThreads) {
int counter = 0; // *Not* atomic.
std::vector<PlatformThreadHandle> thread_handles;
constexpr int iterations_per_thread = 100;
constexpr int num_threads = 4;
MaybeSpinLock<true> lock;
MaybeSpinLock<true> start_lock;
LambdaThreadDelegate delegate{BindLambdaForTesting([&]() {
start_lock.Lock();
start_lock.Unlock();
for (int i = 0; i < iterations_per_thread; i++) {
lock.Lock();
counter++;
// Hold the lock for a while, to force futex()-based locks to sleep.
PlatformThread::Sleep(TimeDelta::FromMilliseconds(1));
lock.Unlock();
}
})};
start_lock.Lock(); // Make sure that the threads compete, by waiting until
// all of them have at least been created.
for (int i = 0; i < num_threads; i++) {
PlatformThreadHandle handle;
PlatformThread::Create(0, &delegate, &handle);
thread_handles.push_back(handle);
}
start_lock.Unlock();
for (int i = 0; i < num_threads; i++) {
PlatformThread::Join(thread_handles[i]);
}
EXPECT_EQ(iterations_per_thread * num_threads, counter);
}
} // namespace
} // namespace internal
} // namespace base
base/allocator/partition_allocator/spinning_futex_linux.cc (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/allocator/partition_allocator/spinning_futex_linux.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "build/build_config.h"
#if defined(OS_LINUX) || defined(OS_ANDROID)
#include <errno.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
namespace base {
namespace internal {
void SpinningFutex::FutexWait() {
// Save and restore errno.
int saved_errno = errno;
  // Don't check the return value, as we will not be awakened by a timeout,
  // since none is specified.
//
// Ignoring the return value doesn't impact correctness, as this acts as an
// immediate wakeup. For completeness, the possible errors for FUTEX_WAIT are:
// - EACCES: state_ is not readable. Should not happen.
// - EAGAIN: the value is not as expected, that is not |kLockedContended|, in
// which case retrying the loop is the right behavior.
// - EINTR: signal, looping is the right behavior.
// - EINVAL: invalid argument.
//
// Note: not checking the return value is the approach used in bionic and
// glibc as well.
//
// Will return immediately if |state_| is no longer equal to
// |kLockedContended|. Otherwise, sleeps and wakes up when |state_| may not be
// |kLockedContended| anymore. Note that even without spurious wakeups, the
// value of |state_| is not guaranteed when this returns, as another thread
// may get the lock before we get to run.
int err = syscall(SYS_futex, &state_, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
kLockedContended, nullptr, nullptr, 0);
if (err) {
    // These are programming errors; check them.
PA_DCHECK(errno != EACCES);
PA_DCHECK(errno != EINVAL);
}
errno = saved_errno;
}
void SpinningFutex::FutexWake() {
int saved_errno = errno;
long retval = syscall(SYS_futex, &state_, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
1 /* wake up a single waiter */, nullptr, nullptr, 0);
PA_CHECK(retval != -1);
errno = saved_errno;
}
void SpinningFutex::LockSlow() {
  // If this thread gets awakened but another one got the lock first, then go
  // back to sleep. See comments in |FutexWait()| for why a loop is required.
while (state_.exchange(kLockedContended, std::memory_order_acquire) !=
kUnlocked) {
FutexWait();
}
}
} // namespace internal
} // namespace base
#endif // defined(OS_LINUX) || defined(OS_ANDROID)
base/allocator/partition_allocator/spinning_futex_linux.h (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_FUTEX_LINUX_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_FUTEX_LINUX_H_
#include <atomic>
#include "base/allocator/partition_allocator/yield_processor.h"
#include "base/base_export.h"
#include "base/compiler_specific.h"
#include "build/build_config.h"
#if !(defined(OS_LINUX) || defined(OS_ANDROID))
#error "Not supported"
#endif
namespace base {
namespace internal {
// Simple spinning futex lock. It will spin in user space a set number of times
// before going into the kernel to sleep.
//
// This is intended to give "the best of both worlds" between a SpinLock and
// base::Lock:
// - SpinLock: Inlined fast path, no external function calls, just
// compare-and-swap. Short waits do not go into the kernel. Good behavior in
// low contention cases.
// - base::Lock: Good behavior in case of contention.
//
// We don't rely on base::Lock, which we could make spin (by calling Try() in a
// loop), as its performance is below that of a custom spinlock, as seen in
// high-level benchmarks. Instead, this implements a simple non-recursive mutex
// on top of the futex() syscall. The main difference between this and a libc
// implementation is that it only supports the simplest path: private (to a
// process), non-recursive mutexes with no priority inheritance and no timed
// waits.
//
// As a useful side-effect for the allocator, this code does not allocate;
// locks are small, with a constexpr constructor and no destructor.
class BASE_EXPORT SpinningFutex {
public:
inline constexpr SpinningFutex();
ALWAYS_INLINE void Acquire();
ALWAYS_INLINE void Release();
void AssertAcquired() const {} // Not supported.
private:
void LockSlow();
void FutexWait();
void FutexWake();
static constexpr int kUnlocked = 0;
static constexpr int kLockedUncontended = 1;
static constexpr int kLockedContended = 2;
// Same as SpinLock, not scientifically calibrated.
static constexpr int kSpinCount = 10;
std::atomic<int32_t> state_{kUnlocked};
};
ALWAYS_INLINE void SpinningFutex::Acquire() {
int tries = 0;
// Busy-waiting is inlined, which is fine as long as we have few callers. This
// is only used for the partition lock, so this is the case.
do {
int expected = kUnlocked;
if (LIKELY(state_.compare_exchange_strong(expected, kLockedUncontended,
std::memory_order_acquire,
std::memory_order_relaxed))) {
return;
}
YIELD_PROCESSOR;
tries++;
} while (tries < kSpinCount);
LockSlow();
}
inline constexpr SpinningFutex::SpinningFutex() = default;
ALWAYS_INLINE void SpinningFutex::Release() {
if (UNLIKELY(state_.exchange(kUnlocked, std::memory_order_release) ==
kLockedContended)) {
// |kLockedContended|: there is a waiter to wake up.
//
    // Here there is a window where the lock is unlocked, since we just set it
    // to |kUnlocked| above, meaning that another thread can grab the lock
    // in-between now and |FutexWake()| waking up a waiter. Aside from
    // potentially reduced fairness, this is not an issue, as the
    // newly-awakened thread will check whether the lock is still free.
//
// There is a small pessimization here though: if we have a single waiter,
// then when it wakes up, the lock will be set to |kLockedContended|, so
// when this waiter releases the lock, it will needlessly call
// |FutexWake()|, even though there are no waiters. This is supported by the
// kernel, and is what bionic (Android's libc) also does.
FutexWake();
}
}
} // namespace internal
} // namespace base
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_SPINNING_FUTEX_LINUX_H_
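
For illustration, a hypothetical direct use of the lock (a sketch; in the CL
itself it is only reached through MaybeSpinLock<true> in partition_lock.h, as
the unit tests above do; |g_counter_lock| and IncrementCounter() are made up):

#include "base/allocator/partition_allocator/spinning_futex_linux.h"

namespace {
// constexpr-constructible, no destructor: safe as a global in the allocator.
base::internal::SpinningFutex g_counter_lock;
int g_counter = 0;  // Guarded by |g_counter_lock|.
}  // namespace

void IncrementCounter() {
  g_counter_lock.Acquire();  // Spins up to kSpinCount times, then futex-waits.
  g_counter++;
  g_counter_lock.Release();  // FutexWake() only if a waiter may be sleeping.
}
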
base/allocator/partition_allocator/yield_processor.h (new file):

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_YIELD_PROCESSOR_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_YIELD_PROCESSOR_H_
#include "build/build_config.h"
// The YIELD_PROCESSOR macro wraps an architecture-specific instruction that
// informs the processor we're in a busy wait, so it can handle the branch more
// intelligently and e.g. reduce power to our core or give more resources to the
// other hyper-thread on this core. See the following for context:
// https://software.intel.com/en-us/articles/benefitting-power-and-performance-sleep-loops
#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_X86)
#define YIELD_PROCESSOR __asm__ __volatile__("pause")
#elif (defined(ARCH_CPU_ARMEL) && __ARM_ARCH >= 6) || defined(ARCH_CPU_ARM64)
#define YIELD_PROCESSOR __asm__ __volatile__("yield")
#elif defined(ARCH_CPU_MIPSEL)
// The MIPS32 docs state that the PAUSE instruction is a no-op on older
// architectures (first added in MIPS32r2). To avoid assembler errors when
// targeting pre-r2, we must encode the instruction manually.
#define YIELD_PROCESSOR __asm__ __volatile__(".word 0x00000140")
#elif defined(ARCH_CPU_MIPS64EL) && __mips_isa_rev >= 2
// Don't bother using .word here, since r2 is the lowest mips64 revision that
// Chromium supports.
#define YIELD_PROCESSOR __asm__ __volatile__("pause")
#elif defined(ARCH_CPU_PPC64_FAMILY)
#define YIELD_PROCESSOR __asm__ __volatile__("or 31,31,31")
#elif defined(ARCH_CPU_S390_FAMILY)
// Just do nothing.
#define YIELD_PROCESSOR ((void)0)
#endif // ARCH
#ifndef YIELD_PROCESSOR
#warning "Processor yield not supported on this architecture."
#define YIELD_PROCESSOR ((void)0)
#endif
#endif // BASE_ALLOCATOR_PARTITION_ALLOCATOR_YIELD_PROCESSOR_H_
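
The macro is consumed in busy-wait loops such as SpinningFutex::Acquire()
above. A minimal standalone sketch (SpinUntilTrue is a hypothetical helper,
not part of the CL):

#include <atomic>

#include "base/allocator/partition_allocator/yield_processor.h"

// Spin until |flag| becomes true, hinting to the CPU that this is a busy wait
// so it can save power or yield resources to the sibling hyper-thread.
inline void SpinUntilTrue(const std::atomic<bool>& flag) {
  while (!flag.load(std::memory_order_acquire))
    YIELD_PROCESSOR;
}
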