Commit 4494c5d6 authored by tim@chromium.org's avatar tim@chromium.org

sync: raise initial backoff interval from 1 second to 5 minutes.

Long term, we'd like to drop this further (i.e. 1 minute) and trust THROTTLED responses in the event of widespread backend outages.
We'd like to run some stress tests of that infrastructure (at scale / under load), but in the meantime we want to address the problem
for beta channel users / M22.  We chose 5 minutes, which is a little more aggressive than the recommended value, but we believe is
safe -- happy to discuss offline.

This patch is very low risk (stability wise). although 5 minutes is a long time to wait for users who hit this due to transient failures.
Current data suggests 500s occur around 0.02% of the time, and although backoff isn't limited to 500s it's unlikely to get hit frequently,
and the user experience doesn't break down significantly.

Effectiveness wise, We can run an experiment once it hits dev channel by returning 500s and watching for expected drops in QPS.

BUG=139733,142029

Review URL: https://chromiumcodereview.appspot.com/10826194

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@151216 0039d316-1c4b-4281-b951-d872f2087c98
parent 74d42bb4
...@@ -55,6 +55,7 @@ ...@@ -55,6 +55,7 @@
#include "net/url_request/url_request_status.h" #include "net/url_request/url_request_status.h"
#include "sync/notifier/p2p_notifier.h" #include "sync/notifier/p2p_notifier.h"
#include "sync/protocol/sync.pb.h" #include "sync/protocol/sync.pb.h"
#include "sync/engine/sync_scheduler_impl.h"
using content::BrowserThread; using content::BrowserThread;
...@@ -169,6 +170,9 @@ void SyncTest::SetUp() { ...@@ -169,6 +170,9 @@ void SyncTest::SetUp() {
Encryptor::UseMockKeychain(true); Encryptor::UseMockKeychain(true);
#endif #endif
// TODO(tim): Use command line flag.
syncer::SyncSchedulerImpl::ForceShortInitialBackoffRetry();
// Yield control back to the InProcessBrowserTest framework. // Yield control back to the InProcessBrowserTest framework.
InProcessBrowserTest::SetUp(); InProcessBrowserTest::SetUp();
} }
......
...@@ -31,6 +31,12 @@ using sessions::SyncSourceInfo; ...@@ -31,6 +31,12 @@ using sessions::SyncSourceInfo;
using sync_pb::GetUpdatesCallerInfo; using sync_pb::GetUpdatesCallerInfo;
namespace { namespace {
// For integration tests only. Override initial backoff value.
// TODO(tim): Remove this egregiousness, use command line flag and plumb
// through. Done this way to reduce diffs in hotfix.
static bool g_force_short_retry = false;
bool ShouldRequestEarlyExit(const SyncProtocolError& error) { bool ShouldRequestEarlyExit(const SyncProtocolError& error) {
switch (error.error_type) { switch (error.error_type) {
case SYNC_SUCCESS: case SYNC_SUCCESS:
...@@ -897,6 +903,43 @@ void SyncSchedulerImpl::RestartWaiting() { ...@@ -897,6 +903,43 @@ void SyncSchedulerImpl::RestartWaiting() {
this, &SyncSchedulerImpl::DoCanaryJob); this, &SyncSchedulerImpl::DoCanaryJob);
} }
namespace {
// TODO(tim): Move this function to syncer_error.h.
// Return true if the command in question was attempted and did not complete
// successfully.
bool IsError(SyncerError error) {
return error != UNSET && error != SYNCER_OK;
}
} // namespace
// static
void SyncSchedulerImpl::ForceShortInitialBackoffRetry() {
g_force_short_retry = true;
}
TimeDelta SyncSchedulerImpl::GetInitialBackoffDelay(
const sessions::ModelNeutralState& state) const {
// TODO(tim): Remove this, provide integration-test-only mechanism
// for override.
if (g_force_short_retry) {
return TimeDelta::FromSeconds(kInitialBackoffShortRetrySeconds);
}
if (IsError(state.last_get_key_result))
return TimeDelta::FromSeconds(kInitialBackoffRetrySeconds);
// Note: If we received a MIGRATION_DONE on download updates, then commit
// should not have taken place. Moreover, if we receive a MIGRATION_DONE
// on commit, it means that download updates succeeded. Therefore, we only
// need to check if either code is equal to SERVER_RETURN_MIGRATION_DONE,
// and not if there were any more serious errors requiring the long retry.
if (state.last_download_updates_result == SERVER_RETURN_MIGRATION_DONE ||
state.commit_result == SERVER_RETURN_MIGRATION_DONE) {
return TimeDelta::FromSeconds(kInitialBackoffShortRetrySeconds);
}
return TimeDelta::FromSeconds(kInitialBackoffRetrySeconds);
}
void SyncSchedulerImpl::HandleContinuationError( void SyncSchedulerImpl::HandleContinuationError(
const SyncSessionJob& old_job) { const SyncSessionJob& old_job) {
DCHECK_EQ(MessageLoop::current(), sync_loop_); DCHECK_EQ(MessageLoop::current(), sync_loop_);
...@@ -907,7 +950,9 @@ void SyncSchedulerImpl::HandleContinuationError( ...@@ -907,7 +950,9 @@ void SyncSchedulerImpl::HandleContinuationError(
} }
TimeDelta length = delay_provider_->GetDelay( TimeDelta length = delay_provider_->GetDelay(
IsBackingOff() ? wait_interval_->length : TimeDelta::FromSeconds(1)); IsBackingOff() ? wait_interval_->length :
GetInitialBackoffDelay(
old_job.session->status_controller().model_neutral_state()));
SDVLOG(2) << "In handle continuation error with " SDVLOG(2) << "In handle continuation error with "
<< SyncSessionJob::GetPurposeString(old_job.purpose) << SyncSessionJob::GetPurposeString(old_job.purpose)
......
...@@ -77,6 +77,11 @@ class SyncSchedulerImpl : public SyncScheduler { ...@@ -77,6 +77,11 @@ class SyncSchedulerImpl : public SyncScheduler {
// TODO(tim): Look at URLRequestThrottlerEntryInterface. // TODO(tim): Look at URLRequestThrottlerEntryInterface.
static base::TimeDelta GetRecommendedDelay(const base::TimeDelta& base_delay); static base::TimeDelta GetRecommendedDelay(const base::TimeDelta& base_delay);
// For integration tests only. Override initial backoff value.
// TODO(tim): Remove this, use command line flag and plumb through. Done
// this way to reduce diffs in hotfix.
static void ForceShortInitialBackoffRetry();
private: private:
enum JobProcessDecision { enum JobProcessDecision {
// Indicates we should continue with the current job. // Indicates we should continue with the current job.
...@@ -143,6 +148,7 @@ class SyncSchedulerImpl : public SyncScheduler { ...@@ -143,6 +148,7 @@ class SyncSchedulerImpl : public SyncScheduler {
FRIEND_TEST_ALL_PREFIXES(SyncSchedulerWhiteboxTest, FRIEND_TEST_ALL_PREFIXES(SyncSchedulerWhiteboxTest,
ContinueNudgeWhileExponentialBackOff); ContinueNudgeWhileExponentialBackOff);
FRIEND_TEST_ALL_PREFIXES(SyncSchedulerTest, TransientPollFailure); FRIEND_TEST_ALL_PREFIXES(SyncSchedulerTest, TransientPollFailure);
FRIEND_TEST_ALL_PREFIXES(SyncSchedulerTest, GetInitialBackoffDelay);
// A component used to get time delays associated with exponential backoff. // A component used to get time delays associated with exponential backoff.
// Encapsulated into a class to facilitate testing. // Encapsulated into a class to facilitate testing.
...@@ -228,6 +234,11 @@ class SyncSchedulerImpl : public SyncScheduler { ...@@ -228,6 +234,11 @@ class SyncSchedulerImpl : public SyncScheduler {
// Helper to ScheduleNextSync in case of consecutive sync errors. // Helper to ScheduleNextSync in case of consecutive sync errors.
void HandleContinuationError(const SyncSessionJob& old_job); void HandleContinuationError(const SyncSessionJob& old_job);
// Helper to calculate the initial value for exponential backoff.
// See possible values and comments in polling_constants.h.
base::TimeDelta GetInitialBackoffDelay(
const sessions::ModelNeutralState& state) const;
// Determines if it is legal to run |job| by checking current // Determines if it is legal to run |job| by checking current
// operational mode, backoff or throttling, freshness // operational mode, backoff or throttling, freshness
// (so we don't make redundant syncs), and connection. // (so we don't make redundant syncs), and connection.
......
...@@ -928,7 +928,7 @@ TEST_F(SyncSchedulerTest, BackoffElevation) { ...@@ -928,7 +928,7 @@ TEST_F(SyncSchedulerTest, BackoffElevation) {
.WillRepeatedly(DoAll(Invoke(sessions::test_util::SimulateCommitFailed), .WillRepeatedly(DoAll(Invoke(sessions::test_util::SimulateCommitFailed),
RecordSyncShareMultiple(&r, kMinNumSamples))); RecordSyncShareMultiple(&r, kMinNumSamples)));
const TimeDelta first = TimeDelta::FromSeconds(1); const TimeDelta first = TimeDelta::FromSeconds(kInitialBackoffRetrySeconds);
const TimeDelta second = TimeDelta::FromMilliseconds(2); const TimeDelta second = TimeDelta::FromMilliseconds(2);
const TimeDelta third = TimeDelta::FromMilliseconds(3); const TimeDelta third = TimeDelta::FromMilliseconds(3);
const TimeDelta fourth = TimeDelta::FromMilliseconds(4); const TimeDelta fourth = TimeDelta::FromMilliseconds(4);
...@@ -959,6 +959,38 @@ TEST_F(SyncSchedulerTest, BackoffElevation) { ...@@ -959,6 +959,38 @@ TEST_F(SyncSchedulerTest, BackoffElevation) {
EXPECT_GE(r.times[4] - r.times[3], fifth); EXPECT_GE(r.times[4] - r.times[3], fifth);
} }
TEST_F(SyncSchedulerTest, GetInitialBackoffDelay) {
sessions::ModelNeutralState state;
state.last_get_key_result = SYNC_SERVER_ERROR;
EXPECT_EQ(kInitialBackoffRetrySeconds,
scheduler()->GetInitialBackoffDelay(state).InSeconds());
state.last_get_key_result = UNSET;
state.last_download_updates_result = SERVER_RETURN_MIGRATION_DONE;
EXPECT_EQ(kInitialBackoffShortRetrySeconds,
scheduler()->GetInitialBackoffDelay(state).InSeconds());
state.last_download_updates_result = SERVER_RETURN_TRANSIENT_ERROR;
EXPECT_EQ(kInitialBackoffRetrySeconds,
scheduler()->GetInitialBackoffDelay(state).InSeconds());
state.last_download_updates_result = SERVER_RESPONSE_VALIDATION_FAILED;
EXPECT_EQ(kInitialBackoffRetrySeconds,
scheduler()->GetInitialBackoffDelay(state).InSeconds());
state.last_download_updates_result = SYNCER_OK;
// Note that updating credentials triggers a canary job, trumping
// the initial delay, but in theory we still expect this function to treat
// it like any other error in the system (except migration).
state.commit_result = SERVER_RETURN_INVALID_CREDENTIAL;
EXPECT_EQ(kInitialBackoffRetrySeconds,
scheduler()->GetInitialBackoffDelay(state).InSeconds());
state.commit_result = SERVER_RETURN_MIGRATION_DONE;
EXPECT_EQ(kInitialBackoffShortRetrySeconds,
scheduler()->GetInitialBackoffDelay(state).InSeconds());
}
// Test that things go back to normal once a retry makes forward progress. // Test that things go back to normal once a retry makes forward progress.
TEST_F(SyncSchedulerTest, BackoffRelief) { TEST_F(SyncSchedulerTest, BackoffRelief) {
SyncShareRecords r; SyncShareRecords r;
......
...@@ -22,5 +22,13 @@ const int64 kMaxBackoffSeconds = 60 * 60 * 4; // 4 hours. ...@@ -22,5 +22,13 @@ const int64 kMaxBackoffSeconds = 60 * 60 * 4; // 4 hours.
// Backoff interval randomization factor. // Backoff interval randomization factor.
const int kBackoffRandomizationFactor = 2; const int kBackoffRandomizationFactor = 2;
} // namespace syncer // After a failure contacting sync servers, specifies how long to wait before
// reattempting and entering exponential backoff if consecutive failures
// occur.
const int kInitialBackoffRetrySeconds = 60 * 5; // 5 minutes.
// Similar to kInitialBackoffRetrySeconds above, but only to be used in
// certain exceptional error cases, such as MIGRATION_DONE.
const int kInitialBackoffShortRetrySeconds = 1;
} // namespace syncer
...@@ -15,6 +15,8 @@ extern const int64 kDefaultShortPollIntervalSeconds; ...@@ -15,6 +15,8 @@ extern const int64 kDefaultShortPollIntervalSeconds;
extern const int64 kDefaultLongPollIntervalSeconds; extern const int64 kDefaultLongPollIntervalSeconds;
SYNC_EXPORT extern const int64 kMaxBackoffSeconds; SYNC_EXPORT extern const int64 kMaxBackoffSeconds;
SYNC_EXPORT extern const int kBackoffRandomizationFactor; SYNC_EXPORT extern const int kBackoffRandomizationFactor;
SYNC_EXPORT_PRIVATE extern const int kInitialBackoffRetrySeconds;
SYNC_EXPORT_PRIVATE extern const int kInitialBackoffShortRetrySeconds;
} // namespace syncer } // namespace syncer
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment