Commit 5286517f authored by Yuwei Huang's avatar Yuwei Huang Committed by Commit Bot

[remoting][FTL] Fix heartbeat sender backoff bug

There is a bug in heartbeat sender's backoff logic such that the
backoff is not capped to a maximum retry interval. This CL:

* Fixes the bug by switching to use BackoffEntry
* Adds unittests
* Adds LOG to help future debugging

Change-Id: Idd737d2c8edfec326adf394eef5ee6857d757287
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1636829
Commit-Queue: Joe Downing <joedow@chromium.org>
Reviewed-by: default avatarJoe Downing <joedow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#664637}
parent 15209a68
......@@ -39,12 +39,37 @@ constexpr base::TimeDelta kMinimumHeartbeatInterval =
base::TimeDelta::FromMinutes(3);
constexpr base::TimeDelta kHeartbeatResponseTimeout =
base::TimeDelta::FromSeconds(30);
constexpr base::TimeDelta kResendDelay = base::TimeDelta::FromSeconds(10);
constexpr base::TimeDelta kResendDelayOnHostNotFound =
base::TimeDelta::FromSeconds(10);
const int kMaxResendOnHostNotFoundCount = 12; // 2 minutes (12 x 10 seconds).
const net::BackoffEntry::Policy kBackoffPolicy = {
// Number of initial errors (in sequence) to ignore before applying
// exponential back-off rules.
0,
// Initial delay for exponential back-off in ms. (10s)
10000,
// Factor by which the waiting time will be multiplied.
2,
// Fuzzing percentage. ex: 10% will spread requests randomly
// between 90%-100% of the calculated time.
0.5,
// Maximum amount of time we are willing to delay our request in ms. (10m)
600000,
// Time to keep an entry from being discarded even when it
// has no significant state, -1 to never discard.
-1,
// Starts with initial delay.
false,
};
using HeartbeatResponseCallback =
base::OnceCallback<void(const grpc::Status&,
const apis::v1::HeartbeatResponse&)>;
......@@ -128,7 +153,8 @@ HeartbeatSender::HeartbeatSender(
signal_strategy_(signal_strategy),
host_key_pair_(host_key_pair),
client_(std::make_unique<HeartbeatClient>(oauth_token_getter)),
log_to_server_(log_to_server) {
log_to_server_(log_to_server),
backoff_(&kBackoffPolicy) {
DCHECK(host_key_pair_.get());
DCHECK(log_to_server_);
......@@ -233,7 +259,7 @@ void HeartbeatSender::OnResponse(const grpc::Status& status,
if (status.ok()) {
heartbeat_succeeded_ = true;
failed_heartbeat_count_ = 0;
backoff_.Reset();
// Notify listener of the first successful heartbeat.
if (on_heartbeat_successful_callback_) {
......@@ -247,7 +273,7 @@ void HeartbeatSender::OnResponse(const grpc::Status& status,
return;
}
} else {
failed_heartbeat_count_++;
backoff_.InformOfRequest(false);
}
if (status.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
......@@ -261,7 +287,7 @@ void HeartbeatSender::OnResponse(const grpc::Status& status,
// exit.
if (status.error_code() == grpc::StatusCode::NOT_FOUND &&
(heartbeat_succeeded_ ||
(failed_heartbeat_count_ > kMaxResendOnHostNotFoundCount))) {
(backoff_.failure_count() > kMaxResendOnHostNotFoundCount))) {
if (on_unknown_host_id_error_) {
std::move(on_unknown_host_id_error_).Run();
}
......@@ -287,8 +313,10 @@ void HeartbeatSender::OnResponse(const grpc::Status& status,
// TODO(yuweih): Handle sequence ID mismatch case. This is not implemented
// in the server yet.
default:
delay = pow(2.0, failed_heartbeat_count_) * (1 + base::RandDouble()) *
kResendDelay;
delay = backoff_.GetTimeUntilRelease();
LOG(ERROR) << "Heartbeat failed due to unexpected error: "
<< status.error_code() << ", " << status.error_message()
<< ". Will retry in " << delay;
break;
}
......
......@@ -14,6 +14,7 @@
#include "base/memory/ref_counted.h"
#include "base/sequence_checker.h"
#include "base/timer/timer.h"
#include "net/base/backoff_entry.h"
#include "remoting/base/grpc_support/grpc_channel.h"
#include "remoting/base/rsa_key_pair.h"
#include "remoting/proto/remoting/v1/directory_messages.pb.h"
......@@ -127,11 +128,10 @@ class HeartbeatSender final : public SignalStrategy::Listener {
base::OneShotTimer heartbeat_timer_;
int failed_heartbeat_count_ = 0;
net::BackoffEntry backoff_;
int sequence_id_ = 0;
bool heartbeat_succeeded_ = false;
int timed_out_heartbeats_count_ = 0;
// Fields to send and indicate completion of sending host-offline-reason.
std::string host_offline_reason_;
......
......@@ -147,6 +147,10 @@ class HeartbeatSenderTest : public testing::Test, public LogToServer {
protected:
HeartbeatSender* heartbeat_sender() { return heartbeat_sender_.get(); }
const net::BackoffEntry& GetBackoff() const {
return heartbeat_sender_->backoff_;
}
base::test::ScopedTaskEnvironment scoped_task_environment_{
base::test::ScopedTaskEnvironment::MainThreadType::MOCK_TIME};
test::GrpcTestServer<MockDirectoryService> test_server_;
......@@ -419,4 +423,43 @@ TEST_F(HeartbeatSenderTest, SendHeartbeatLogEntryOnHeartbeat) {
ASSERT_EQ(1u, received_log_entries_.size());
}
TEST_F(HeartbeatSenderTest, FailedToHeartbeat_Backoff) {
base::RunLoop run_loop;
EXPECT_CALL(*test_server_, Heartbeat(_, _, _))
.WillOnce([&](grpc::ServerContext*,
const apis::v1::HeartbeatRequest* request,
apis::v1::HeartbeatResponse* response) {
EXPECT_EQ(0, GetBackoff().failure_count());
ValidateHeartbeat(*request, /* ftl */ true, /* xmpp */ true, 0);
return grpc::Status(grpc::StatusCode::UNAVAILABLE, "unavailable");
})
.WillOnce([&](grpc::ServerContext*,
const apis::v1::HeartbeatRequest* request,
apis::v1::HeartbeatResponse* response) {
EXPECT_EQ(1, GetBackoff().failure_count());
ValidateHeartbeat(*request, /* ftl */ true, /* xmpp */ true, 1);
return grpc::Status(grpc::StatusCode::UNAVAILABLE, "unavailable");
})
.WillOnce([&](grpc::ServerContext*,
const apis::v1::HeartbeatRequest* request,
apis::v1::HeartbeatResponse* response) {
EXPECT_EQ(2, GetBackoff().failure_count());
ValidateHeartbeat(*request, /* ftl */ true, /* xmpp */ true, 2);
return grpc::Status::OK;
});
EXPECT_CALL(mock_heartbeat_successful_callback_, Run()).WillOnce([&]() {
run_loop.Quit();
});
base::SequencedTaskRunnerHandle::Get()->PostTask(
FROM_HERE, base::BindLambdaForTesting([&]() {
ftl_signal_strategy_->Connect();
xmpp_signal_strategy_->Connect();
}));
run_loop.Run();
ASSERT_EQ(0, GetBackoff().failure_count());
}
} // namespace remoting
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment