Add a short random delay to scheduled policy fetches

This change adds a small (0 to 5 minutes) delay to scheduled policy fetches in order to alleviate server-side traffic spikes. The delay is randomly set at policy initialization time in the CloudPolicyRefreshScheduler constructor. To avoid causing issues with ChromeOS' synchronous policy fetches, we skip adding the delay in some cases. Bug: 963910 TEST: updated logic of component_unittest and one unit_tests Change-Id: Ic02ac2ef18e7e7b03ac95bdfab19e946c2f46324 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1998789Reviewed-by: Pavol Marko <pmarko@chromium.org> Reviewed-by: Owen Min <zmin@chromium.org> Commit-Queue: proberge <proberge@chromium.org> Cr-Commit-Position: refs/heads/master@{#732976}

Add a short random delay to scheduled policy fetches
This change adds a small (0 to 5 minutes) delay to scheduled policy fetches in order to alleviate server-side traffic spikes. The delay is randomly set at policy initialization time in the CloudPolicyRefreshScheduler constructor. To avoid causing issues with ChromeOS' synchronous policy fetches, we skip adding the delay in some cases. Bug: 963910 TEST: updated logic of component_unittest and one unit_tests Change-Id: Ic02ac2ef18e7e7b03ac95bdfab19e946c2f46324 Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1998789Reviewed-by: Pavol Marko <pmarko@chromium.org> Reviewed-by: Owen Min <zmin@chromium.org> Commit-Queue: proberge <proberge@chromium.org> Cr-Commit-Position: refs/heads/master@{#732976}
665fc90a · proberge · Commit Bot · 2e2dbb66 · 665fc90a · 665fc90a
Commit 665fc90a authored Jan 17, 2020 by proberge Committed by Commit Bot Jan 17, 2020
4 changed files
--- a/chrome/browser/chromeos/policy/user_cloud_policy_manager_chromeos_unittest.cc
+++ b/chrome/browser/chromeos/policy/user_cloud_policy_manager_chromeos_unittest.cc
@@ -1248,7 +1248,8 @@ TEST_P(UserCloudPolicyManagerChromeOSChildTest, RefreshScheduler) {
  // of the test will work incorrectly and should be updated.
  const int iterations = 3;
  base::TimeDelta refresh_delay = base::TimeDelta::FromMilliseconds(
-      manager_->core()->refresh_scheduler()->GetActualRefreshDelay());
+      manager_->core()->refresh_scheduler()->GetActualRefreshDelay() +
+      manager_->core()->refresh_scheduler()->GetSaltDelayForTesting());
  ASSERT_GT(refresh_delay, iterations * token_lifetime);

  // Advancing the clock will trigger delivery of new tokens. It should not

--- a/components/policy/core/common/cloud/cloud_policy_refresh_scheduler.cc
+++ b/components/policy/core/common/cloud/cloud_policy_refresh_scheduler.cc
@@ -11,6 +11,7 @@
 #include "base/bind_helpers.h"
 #include "base/logging.h"
 #include "base/metrics/histogram_macros.h"
+#include "base/rand_util.h"
 #include "base/sequenced_task_runner.h"
 #include "build/build_config.h"
 #include "components/policy/core/common/cloud/cloud_policy_constants.h"
@@ -36,6 +37,8 @@ const int64_t CloudPolicyRefreshScheduler::kRefreshDelayMinMs =
    30 * 60 * 1000;  // 30 minutes.
 const int64_t CloudPolicyRefreshScheduler::kRefreshDelayMaxMs =
    7 * 24 * 60 * 60 * 1000;  // 1 week.
+const int64_t CloudPolicyRefreshScheduler::kRandomSaltDelayMaxValueMs =
+    5 * 60 * 1000;  // 5 minutes.

 #else

@@ -53,6 +56,8 @@ const int64_t CloudPolicyRefreshScheduler::kRefreshDelayMinMs =
    30 * 60 * 1000;  // 30 minutes.
 const int64_t CloudPolicyRefreshScheduler::kRefreshDelayMaxMs =
    24 * 60 * 60 * 1000;  // 1 day.
+const int64_t CloudPolicyRefreshScheduler::kRandomSaltDelayMaxValueMs =
+    5 * 60 * 1000;  // 5 minutes.

 #endif

@@ -69,6 +74,8 @@ CloudPolicyRefreshScheduler::CloudPolicyRefreshScheduler(
      network_connection_tracker_(network_connection_tracker_getter.Run()),
      error_retry_delay_ms_(kInitialErrorRetryDelayMs),
      refresh_delay_ms_(kDefaultRefreshDelayMs),
+      refresh_delay_salt_ms_(static_cast<int64_t>(
+          base::RandGenerator(kRandomSaltDelayMaxValueMs))),
      invalidations_available_(false),
      creation_time_(base::Time::NowFromSystemTime()) {
  client_->AddObserver(this);
@@ -364,7 +371,13 @@ void CloudPolicyRefreshScheduler::RefreshAfter(int delta_ms) {
  const base::TimeDelta time_ticks_delay =
      std::max((last_refresh_ticks_ + delta) - base::TimeTicks::Now(),
               base::TimeDelta());
-  const base::TimeDelta delay = std::min(system_delay, time_ticks_delay);
+  base::TimeDelta delay = std::min(system_delay, time_ticks_delay);
+
+  // Unless requesting an immediate refresh, add a delay to the scheduled policy
+  // refresh in order to spread out server load.
+  if (!delay.is_zero())
+    delay += base::TimeDelta::FromMilliseconds(refresh_delay_salt_ms_);
+
  refresh_callback_.Reset(base::BindOnce(
      &CloudPolicyRefreshScheduler::PerformRefresh, base::Unretained(this)));
  task_runner_->PostDelayedTask(FROM_HERE, refresh_callback_.callback(), delay);

--- a/components/policy/core/common/cloud/cloud_policy_refresh_scheduler.h
+++ b/components/policy/core/common/cloud/cloud_policy_refresh_scheduler.h
@@ -36,6 +36,7 @@ class POLICY_EXPORT CloudPolicyRefreshScheduler
  static const int64_t kUnmanagedRefreshDelayMs;
  static const int64_t kWithInvalidationsRefreshDelayMs;
  static const int64_t kInitialErrorRetryDelayMs;
+  static const int64_t kRandomSaltDelayMaxValueMs;

  // Refresh delay bounds.
  static const int64_t kRefreshDelayMinMs;
@@ -62,6 +63,9 @@ class POLICY_EXPORT CloudPolicyRefreshScheduler
  // invalidations are available or not).
  int64_t GetActualRefreshDelay() const;

+  // For testing: get the value randomly assigned to refresh_delay_salt_ms_.
+  int64_t GetSaltDelayForTesting() const { return refresh_delay_salt_ms_; }
+
  // Schedules a refresh to be performed immediately.
  void RefreshSoon();

@@ -106,8 +110,9 @@ class POLICY_EXPORT CloudPolicyRefreshScheduler
  // Triggers a policy refresh.
  void PerformRefresh();

-  // Schedules a policy refresh to happen no later than |delta_ms| msecs after
-  // |last_refresh_| or |last_refresh_ticks_| whichever is sooner.
+  // Schedules a policy refresh to happen no later than |delta_ms| +
+  // |refresh_delay_salt_ms_| msecs after |last_refresh_| or
+  // |last_refresh_ticks_| whichever is sooner.
  void RefreshAfter(int delta_ms);

  // Cancels the scheduled policy refresh.
@@ -153,6 +158,10 @@ class POLICY_EXPORT CloudPolicyRefreshScheduler
  // The refresh delay.
  int64_t refresh_delay_ms_;

+  // A randomly-generated (between 0 and |kRandomSaltDelayMaxValueMs|) delay
+  // added to all non-immediately scheduled refresh requests.
+  const int64_t refresh_delay_salt_ms_;
+
  // Whether the invalidations service is available and receiving notifications
  // of policy updates.
  bool invalidations_available_;

--- a/components/policy/core/common/cloud/cloud_policy_refresh_scheduler_unittest.cc
+++ b/components/policy/core/common/cloud/cloud_policy_refresh_scheduler_unittest.cc
@@ -93,19 +93,28 @@ class CloudPolicyRefreshSchedulerTest : public testing::Test {
    return task_runner_->FinalPendingTaskDelay();
  }

-  void CheckTiming(int64_t expected_delay_ms) const {
-    CheckTimingWithAge(base::TimeDelta::FromMilliseconds(expected_delay_ms),
+  void CheckTiming(CloudPolicyRefreshScheduler* const scheduler,
+                   int64_t expected_delay_ms) const {
+    CheckTimingWithAge(scheduler,
+                       base::TimeDelta::FromMilliseconds(expected_delay_ms),
                       base::TimeDelta());
  }

  // Checks that the latest refresh scheduled used an offset of
  // |offset_from_last_refresh| from the time of the previous refresh.
  // |cache_age| is how old the cache was when the refresh was issued.
-  void CheckTimingWithAge(const base::TimeDelta& offset_from_last_refresh,
+  void CheckTimingWithAge(CloudPolicyRefreshScheduler* const scheduler,
+                          const base::TimeDelta& offset_from_last_refresh,
                          const base::TimeDelta& cache_age) const {
    EXPECT_TRUE(task_runner_->HasPendingTask());
    base::Time now(base::Time::NowFromSystemTime());
    base::TimeTicks now_ticks(base::TimeTicks::Now());
+    base::TimeDelta offset_since_refresh_plus_salt = offset_from_last_refresh;
+    // The salt is only applied for non-immediate scheduled refreshes.
+    if (!offset_from_last_refresh.is_zero()) {
+      offset_since_refresh_plus_salt += base::TimeDelta::FromMilliseconds(
+          scheduler->GetSaltDelayForTesting());
+    }
    // |last_update_| was updated and then a refresh was scheduled at time S,
    // so |last_update_| is a bit before that.
    // Now is a bit later, N.
@@ -129,24 +138,25 @@ class CloudPolicyRefreshSchedulerTest : public testing::Test {
    //
    // |last_update_| was a bit before S, so if
    // elapsed = now - |last_update_| then the delay is more than
-    // |offset_from_last_refresh| - elapsed.
+    // |offset_since_refresh_plus_salt| - elapsed.
    //
-    // The delay is also less than offset_from_last_refresh, because some time
-    // already elapsed. Additionally, if the cache was already considered old
-    // when the schedule was performed then its age at that time has been
+    // The delay is also less than offset_since_refresh_plus_salt, because some
+    // time already elapsed. Additionally, if the cache was already considered
+    // old when the schedule was performed then its age at that time has been
    // discounted from the delay. So the delay is a bit less than
-    // |offset_from_last_refresh - cache_age|.
+    // |offset_since_refresh_plus_salt - cache_age|.
    // The logic of time based on TimeTicks is added to be on the safe side,
    // since CloudPolicyRefreshScheduler implementation is based on both, the
    // system time and the time in TimeTicks.
    base::TimeDelta system_delta = (now - last_update_);
    base::TimeDelta ticks_delta = (now_ticks - last_update_ticks_);
-    EXPECT_GE(GetLastDelay(),
-              offset_from_last_refresh - std::max(system_delta, ticks_delta));
-    EXPECT_LE(GetLastDelay(), offset_from_last_refresh - cache_age);
+    EXPECT_GE(GetLastDelay(), offset_since_refresh_plus_salt -
+                                  std::max(system_delta, ticks_delta));
+    EXPECT_LE(GetLastDelay(), offset_since_refresh_plus_salt - cache_age);
  }

-  void CheckInitialRefresh(bool with_invalidations) const {
+  void CheckInitialRefresh(CloudPolicyRefreshScheduler* const scheduler,
+                           bool with_invalidations) const {
 #if defined(OS_ANDROID) || defined(OS_IOS)
    // The mobile platforms take the cache age into account for the initial
    // fetch. Usually the cache age is ignored for the initial refresh, but on
@@ -155,7 +165,7 @@ class CloudPolicyRefreshSchedulerTest : public testing::Test {
        with_invalidations
            ? CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs
            : kPolicyRefreshRate);
-    CheckTimingWithAge(rate,
+    CheckTimingWithAge(scheduler, rate,
                       base::TimeDelta::FromMinutes(kInitialCacheAgeMinutes));
 #else
    // Other platforms refresh immediately.
@@ -194,7 +204,8 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InitialRefreshUnmanaged) {
  store_.policy_->set_state(em::PolicyData::UNMANAGED);
  std::unique_ptr<CloudPolicyRefreshScheduler> scheduler(
      CreateRefreshScheduler());
-  CheckTiming(CloudPolicyRefreshScheduler::kUnmanagedRefreshDelayMs);
+  CheckTiming(scheduler.get(),
+              CloudPolicyRefreshScheduler::kUnmanagedRefreshDelayMs);
  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
  task_runner_->RunUntilIdle();
@@ -204,7 +215,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InitialRefreshManagedNotYetFetched) {
  std::unique_ptr<CloudPolicyRefreshScheduler> scheduler(
      CreateRefreshScheduler());
  EXPECT_TRUE(task_runner_->HasPendingTask());
-  CheckInitialRefresh(false);
+  CheckInitialRefresh(scheduler.get(), false);
  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
  task_runner_->RunUntilIdle();
@@ -216,7 +227,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InitialRefreshManagedAlreadyFetched) {
                    em::PolicyFetchResponse());
  std::unique_ptr<CloudPolicyRefreshScheduler> scheduler(
      CreateRefreshScheduler());
-  CheckTiming(kPolicyRefreshRate);
+  CheckTiming(scheduler.get(), kPolicyRefreshRate);
  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
  task_runner_->RunUntilIdle();
@@ -251,20 +262,20 @@ TEST_F(CloudPolicyRefreshSchedulerTest, RefreshSoonOverriding) {

  // The refresh scheduled for soon overrides the previously scheduled refresh.
  scheduler->RefreshSoon();
-  CheckTiming(0);
+  CheckTiming(scheduler.get(), 0);

  // The refresh scheduled for soon is not overridden by the change of the
  // desired refresh delay.
  const int64_t kNewPolicyRefreshRate = 12 * 60 * 60 * 1000;
  scheduler->SetDesiredRefreshDelay(kNewPolicyRefreshRate);
-  CheckTiming(0);
+  CheckTiming(scheduler.get(), 0);

  // The refresh scheduled for soon is not overridden by the notification on the
  // already fetched policy.
  client_.SetPolicy(dm_protocol::kChromeUserPolicyType, std::string(),
                    em::PolicyFetchResponse());
  store_.NotifyStoreLoaded();
-  CheckTiming(0);
+  CheckTiming(scheduler.get(), 0);

  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
@@ -273,7 +284,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, RefreshSoonOverriding) {

  // The next refresh is scheduled according to the normal rate.
  client_.NotifyPolicyFetched();
-  CheckTiming(kNewPolicyRefreshRate);
+  CheckTiming(scheduler.get(), kNewPolicyRefreshRate);
 }

 TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsAvailable) {
@@ -292,7 +303,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsAvailable) {
            scheduler->GetActualRefreshDelay());
  EXPECT_EQ(3u, task_runner_->NumPendingTasks());

-  CheckInitialRefresh(true);
+  CheckInitialRefresh(scheduler.get(), true);

  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
@@ -305,7 +316,8 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsAvailable) {

  // The next refresh has been scheduled using a lower refresh rate.
  EXPECT_EQ(1u, task_runner_->NumPendingTasks());
-  CheckTiming(CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
+  CheckTiming(scheduler.get(),
+              CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
 }

 TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsNotAvailable) {
@@ -323,7 +335,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsNotAvailable) {
  }

  // This scheduled the initial refresh.
-  CheckInitialRefresh(false);
+  CheckInitialRefresh(scheduler.get(), false);
  EXPECT_EQ(kPolicyRefreshRate, scheduler->GetActualRefreshDelay());

  // Perform that fetch now.
@@ -338,7 +350,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsNotAvailable) {

  // The next refresh has been scheduled at the normal rate.
  EXPECT_EQ(1u, task_runner_->NumPendingTasks());
-  CheckTiming(kPolicyRefreshRate);
+  CheckTiming(scheduler.get(), kPolicyRefreshRate);
 }

 TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsOffAndOn) {
@@ -357,7 +369,8 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsOffAndOn) {
  client_.NotifyPolicyFetched();

  // The next refresh has been scheduled using a lower refresh rate.
-  CheckTiming(CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
+  CheckTiming(scheduler.get(),
+              CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);

  // If the service goes down and comes back up before the timeout then a
  // refresh is rescheduled at the lower rate again; after executing all
@@ -367,7 +380,8 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsOffAndOn) {
  // The next refresh has been scheduled using a lower refresh rate.
  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
-  CheckTiming(CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
+  CheckTiming(scheduler.get(),
+              CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
  task_runner_->RunPendingTasks();
  Mock::VerifyAndClearExpectations(&client_);
 }
@@ -389,7 +403,8 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsDisconnected) {

  // The next refresh has been scheduled using a lower refresh rate.
  // Flush that task.
-  CheckTiming(CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
+  CheckTiming(scheduler.get(),
+              CloudPolicyRefreshScheduler::kWithInvalidationsRefreshDelayMs);
  EXPECT_CALL(*service_.get(), RefreshPolicy(_)).Times(1);
  EXPECT_CALL(client_, FetchPolicy()).Times(1);
  task_runner_->RunPendingTasks();
@@ -398,7 +413,7 @@ TEST_F(CloudPolicyRefreshSchedulerTest, InvalidationsDisconnected) {
  // If the service goes down then the refresh scheduler falls back on the
  // default polling rate.
  scheduler->SetInvalidationServiceAvailability(false);
-  CheckTiming(kPolicyRefreshRate);
+  CheckTiming(scheduler.get(), kPolicyRefreshRate);
 }

 TEST_F(CloudPolicyRefreshSchedulerTest, OnConnectionChangedUnregistered) {
@@ -446,7 +461,7 @@ class CloudPolicyRefreshSchedulerSteadyStateTest
    CloudPolicyRefreshSchedulerTest::SetUp();
    SetLastUpdateToNow();
    client_.NotifyPolicyFetched();
-    CheckTiming(kPolicyRefreshRate);
+    CheckTiming(refresh_scheduler_.get(), kPolicyRefreshRate);
  }

  std::unique_ptr<CloudPolicyRefreshScheduler> refresh_scheduler_;
@@ -454,7 +469,7 @@ class CloudPolicyRefreshSchedulerSteadyStateTest

 TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnPolicyFetched) {
  client_.NotifyPolicyFetched();
-  CheckTiming(kPolicyRefreshRate);
+  CheckTiming(refresh_scheduler_.get(), kPolicyRefreshRate);
 }

 TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnRegistrationStateChanged) {
@@ -470,7 +485,7 @@ TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnRegistrationStateChanged) {

 TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnStoreLoaded) {
  store_.NotifyStoreLoaded();
-  CheckTiming(kPolicyRefreshRate);
+  CheckTiming(refresh_scheduler_.get(), kPolicyRefreshRate);
 }

 TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnStoreError) {
@@ -482,15 +497,17 @@ TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnStoreError) {
 TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, RefreshDelayChange) {
  const int delay_short_ms = 5 * 60 * 1000;
  refresh_scheduler_->SetDesiredRefreshDelay(delay_short_ms);
-  CheckTiming(CloudPolicyRefreshScheduler::kRefreshDelayMinMs);
+  CheckTiming(refresh_scheduler_.get(),
+              CloudPolicyRefreshScheduler::kRefreshDelayMinMs);

  const int delay_ms = 12 * 60 * 60 * 1000;
  refresh_scheduler_->SetDesiredRefreshDelay(delay_ms);
-  CheckTiming(delay_ms);
+  CheckTiming(refresh_scheduler_.get(), delay_ms);

  const int delay_long_ms = 20 * 24 * 60 * 60 * 1000;
  refresh_scheduler_->SetDesiredRefreshDelay(delay_long_ms);
-  CheckTiming(CloudPolicyRefreshScheduler::kRefreshDelayMaxMs);
+  CheckTiming(refresh_scheduler_.get(),
+              CloudPolicyRefreshScheduler::kRefreshDelayMaxMs);
 }

 TEST_F(CloudPolicyRefreshSchedulerSteadyStateTest, OnConnectionChanged) {
@@ -540,7 +557,7 @@ TEST_P(CloudPolicyRefreshSchedulerClientErrorTest, OnClientError) {
  int64_t expected_delay_ms = GetParam().expected_delay_ms;
  client_.NotifyClientError();
  if (expected_delay_ms >= 0) {
-    CheckTiming(expected_delay_ms);
+    CheckTiming(refresh_scheduler_.get(), expected_delay_ms);

    // Check whether exponential backoff is working as expected and capped at
    // the regular refresh rate (if applicable).
@@ -548,7 +565,8 @@ TEST_P(CloudPolicyRefreshSchedulerClientErrorTest, OnClientError) {
      expected_delay_ms *= GetParam().backoff_factor;
      SetLastUpdateToNow();
      client_.NotifyClientError();
-      CheckTiming(std::max(std::min(expected_delay_ms, kPolicyRefreshRate),
+      CheckTiming(refresh_scheduler_.get(),
+                  std::max(std::min(expected_delay_ms, kPolicyRefreshRate),
                           GetParam().expected_delay_ms));
    } while (GetParam().backoff_factor > 1 &&
             expected_delay_ms <= kPolicyRefreshRate);