Commit e264f250 authored by zea@chromium.org's avatar zea@chromium.org

[GCM] Track connection failures properly

This patch consolidates the reset logic and adds a reason for the reset,
for use in histograms. Additionally, connection uptime and overall success
rate are now tracked as well.

BUG=284553

Review URL: https://codereview.chromium.org/160703002

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@251186 0039d316-1c4b-4281-b951-d872f2087c98
parent f5f3905c
......@@ -23,6 +23,18 @@ class GCM_EXPORT ConnectionFactory {
typedef base::Callback<void(mcs_proto::LoginRequest* login_request)>
BuildLoginRequestCallback;
// Reasons for triggering a connection reset. Note that these enums are
// consumed by a histogram, so ordering should not be modified.
enum ConnectionResetReason {
LOGIN_FAILURE, // Login response included an error.
CLOSE_COMMAND, // Received a close command.
HEARTBEAT_FAILURE, // Heartbeat was not acknowledged in time.
SOCKET_FAILURE, // net::Socket error.
// Count of total number of connection reset reasons. All new reset reasons
// should be added above this line.
CONNECTION_RESET_COUNT,
};
ConnectionFactory();
virtual ~ConnectionFactory();
......@@ -62,7 +74,7 @@ class GCM_EXPORT ConnectionFactory {
// event forced a reset (e.g. server sends a close connection response).
// If the last connection was made within kConnectionResetWindowSecs, the old
// backoff is restored, else a new backoff kicks off.
virtual void SignalConnectionReset() = 0;
virtual void SignalConnectionReset(ConnectionResetReason reason) = 0;
};
} // namespace gcm
......
......@@ -29,6 +29,15 @@ const int kReadTimeoutMs = 30000; // 30 seconds.
// as if it was transient).
const int kConnectionResetWindowSecs = 10; // 10 seconds.
// Decides whether the last login was within kConnectionResetWindowSecs of now
// or not.
bool ShouldRestorePreviousBackoff(const base::TimeTicks& login_time,
const base::TimeTicks& now_ticks) {
return !login_time.is_null() &&
now_ticks - login_time <=
base::TimeDelta::FromSeconds(kConnectionResetWindowSecs);
}
} // namespace
ConnectionFactoryImpl::ConnectionFactoryImpl(
......@@ -41,6 +50,7 @@ ConnectionFactoryImpl::ConnectionFactoryImpl(
network_session_(network_session),
net_log_(net_log),
connecting_(false),
logging_in_(false),
weak_ptr_factory_(this) {
}
......@@ -98,21 +108,52 @@ bool ConnectionFactoryImpl::IsEndpointReachable() const {
!connecting_;
}
void ConnectionFactoryImpl::SignalConnectionReset() {
void ConnectionFactoryImpl::SignalConnectionReset(
ConnectionResetReason reason) {
// A failure can trigger multiple resets, so no need to do anything if a
// connection is already in progress.
if (connecting_)
return; // Already attempting to reconnect.
return;
UMA_HISTOGRAM_ENUMERATION("GCM.ConnectionResetReason",
reason,
CONNECTION_RESET_COUNT);
if (!last_login_time_.is_null()) {
UMA_HISTOGRAM_CUSTOM_TIMES("GCM.ConnectionUpTime",
NowTicks() - last_login_time_,
base::TimeDelta::FromSeconds(1),
base::TimeDelta::FromHours(24),
50);
// |last_login_time_| will be reset below, before attempting the new
// connection.
}
if (connection_handler_)
connection_handler_->Reset();
if (!backoff_reset_time_.is_null() &&
NowTicks() - backoff_reset_time_ <=
base::TimeDelta::FromSeconds(kConnectionResetWindowSecs)) {
if (socket_handle_.socket() && socket_handle_.socket()->IsConnected())
socket_handle_.socket()->Disconnect();
socket_handle_.Reset();
if (logging_in_) {
// Failures prior to login completion just reuse the existing backoff entry.
logging_in_ = false;
backoff_entry_->InformOfRequest(false);
} else if (reason == LOGIN_FAILURE ||
ShouldRestorePreviousBackoff(last_login_time_, NowTicks())) {
// Failures due to login, or within the reset window of a login, restore
// the backoff entry that was saved off at login completion time.
backoff_entry_.swap(previous_backoff_);
backoff_entry_->InformOfRequest(false);
} else {
// We shouldn't be in backoff in thise case.
DCHECK(backoff_entry_->CanDiscard());
}
backoff_reset_time_ = base::TimeTicks();
previous_backoff_->Reset();
// At this point the last login time has been consumed or deemed irrelevant,
// reset it.
last_login_time_ = base::TimeTicks();
Connect();
}
......@@ -143,10 +184,8 @@ void ConnectionFactoryImpl::OnIPAddressChanged() {
}
void ConnectionFactoryImpl::ConnectImpl() {
DCHECK(!connection_handler_->CanSendMessage());
if (socket_handle_.socket() && socket_handle_.socket()->IsConnected())
socket_handle_.socket()->Disconnect();
socket_handle_.Reset();
DCHECK(connecting_);
DCHECK(!socket_handle_.socket());
// TODO(zea): resolve proxies.
net::ProxyInfo proxy_info;
......@@ -190,6 +229,8 @@ base::TimeTicks ConnectionFactoryImpl::NowTicks() {
}
void ConnectionFactoryImpl::OnConnectDone(int result) {
UMA_HISTOGRAM_BOOLEAN("GCM.ConnectionSuccessRate", (result == net::OK));
if (result != net::OK) {
LOG(ERROR) << "Failed to connect to MCS endpoint with error " << result;
backoff_entry_->InformOfRequest(false);
......@@ -198,31 +239,29 @@ void ConnectionFactoryImpl::OnConnectDone(int result) {
return;
}
DVLOG(1) << "MCS endpoint socket connection success, starting handshake.";
connecting_ = false;
logging_in_ = true;
DVLOG(1) << "MCS endpoint socket connection success, starting login.";
InitHandler();
}
void ConnectionFactoryImpl::ConnectionHandlerCallback(int result) {
if (result == net::OK) {
// Handshake succeeded, reset the backoff.
connecting_ = false;
backoff_reset_time_ = NowTicks();
previous_backoff_.swap(backoff_entry_);
backoff_entry_->Reset();
DCHECK(!connecting_);
if (result != net::OK) {
// TODO(zea): Consider how to handle errors that may require some sort of
// user intervention (login page, etc.).
UMA_HISTOGRAM_SPARSE_SLOWLY("GCM.ConnectionDisconnectErrorCode", result);
SignalConnectionReset(SOCKET_FAILURE);
return;
}
if (!connecting_)
UMA_HISTOGRAM_SPARSE_SLOWLY("GCM.ConnectionDisconnectErrorCode", result);
if (connection_handler_)
connection_handler_->Reset();
// TODO(zea): Consider how to handle errors that may require some sort of
// user intervention (login page, etc.).
LOG(ERROR) << "Connection reset with error " << result;
backoff_entry_->InformOfRequest(false);
Connect();
// Handshake complete, reset backoff. If the login failed with an error,
// the client should invoke SignalConnectionReset(LOGIN_FAILURE), which will
// restore the previous backoff.
last_login_time_ = NowTicks();
previous_backoff_.swap(backoff_entry_);
backoff_entry_->Reset();
logging_in_ = false;
}
} // namespace gcm
......@@ -8,6 +8,7 @@
#include "google_apis/gcm/engine/connection_factory.h"
#include "base/memory/weak_ptr.h"
#include "base/time/time.h"
#include "google_apis/gcm/protocol/mcs.pb.h"
#include "net/base/backoff_entry.h"
#include "net/base/network_change_notifier.h"
......@@ -44,7 +45,7 @@ class GCM_EXPORT ConnectionFactoryImpl :
virtual void Connect() OVERRIDE;
virtual bool IsEndpointReachable() const OVERRIDE;
virtual base::TimeTicks NextRetryAttempt() const OVERRIDE;
virtual void SignalConnectionReset() OVERRIDE;
virtual void SignalConnectionReset(ConnectionResetReason reason) OVERRIDE;
// NetworkChangeNotifier observer implementations.
virtual void OnConnectionTypeChanged(
......@@ -91,17 +92,26 @@ class GCM_EXPORT ConnectionFactoryImpl :
net::NetLog* const net_log_;
// The handle to the socket for the current connection, if one exists.
net::ClientSocketHandle socket_handle_;
// Connection attempt backoff policy.
// Current backoff entry.
scoped_ptr<net::BackoffEntry> backoff_entry_;
// Backoff policy from previous backoff attempt.
// Backoff entry from previous connection attempt. Updated on each login
// completion.
scoped_ptr<net::BackoffEntry> previous_backoff_;
base::TimeTicks backoff_reset_time_;
// Whether a connection attempt is currently in progress or we're in backoff
// waiting until the next connection attempt. |!connecting_| denotes
// steady state with an active connection.
bool connecting_;
// Whether login successfully completed after the connection was established.
// If a connection reset happens while attempting to log in, the current
// backoff entry is reused (after incrementing with a new failure).
bool logging_in_;
// The time of the last login completion. Used for calculating whether to
// restore a previous backoff entry and for measuring uptime.
base::TimeTicks last_login_time_;
// The current connection handler, if one exists.
scoped_ptr<ConnectionHandlerImpl> connection_handler_;
......
......@@ -351,7 +351,7 @@ TEST_F(ConnectionFactoryImplTest, FailViaSignalReset) {
factory()->Connect();
EXPECT_TRUE(factory()->NextRetryAttempt().is_null());
factory()->SignalConnectionReset();
factory()->SignalConnectionReset(ConnectionFactory::SOCKET_FAILURE);
EXPECT_FALSE(factory()->NextRetryAttempt().is_null());
EXPECT_FALSE(factory()->GetConnectionHandler()->CanSendMessage());
}
......@@ -365,13 +365,13 @@ TEST_F(ConnectionFactoryImplTest, IgnoreResetWhileConnecting) {
factory()->Connect();
EXPECT_TRUE(factory()->NextRetryAttempt().is_null());
factory()->SignalConnectionReset();
factory()->SignalConnectionReset(ConnectionFactory::SOCKET_FAILURE);
base::TimeTicks retry_time = factory()->NextRetryAttempt();
EXPECT_FALSE(retry_time.is_null());
const int kNumAttempts = 5;
for (int i = 0; i < kNumAttempts; ++i)
factory()->SignalConnectionReset();
factory()->SignalConnectionReset(ConnectionFactory::SOCKET_FAILURE);
EXPECT_EQ(retry_time, factory()->NextRetryAttempt());
}
......@@ -395,7 +395,7 @@ TEST_F(ConnectionFactoryImplTest, SignalResetRestoresBackoff) {
WaitForConnections();
EXPECT_TRUE(factory()->NextRetryAttempt().is_null());
factory()->SignalConnectionReset();
factory()->SignalConnectionReset(ConnectionFactory::SOCKET_FAILURE);
EXPECT_FALSE(factory()->GetConnectionHandler()->CanSendMessage());
EXPECT_NE(retry_time, factory()->NextRetryAttempt());
retry_time = factory()->NextRetryAttempt();
......@@ -410,7 +410,7 @@ TEST_F(ConnectionFactoryImplTest, SignalResetRestoresBackoff) {
WaitForConnections();
EXPECT_TRUE(factory()->NextRetryAttempt().is_null());
factory()->SignalConnectionReset();
factory()->SignalConnectionReset(ConnectionFactory::SOCKET_FAILURE);
EXPECT_NE(retry_time, factory()->NextRetryAttempt());
retry_time = factory()->NextRetryAttempt();
EXPECT_FALSE(retry_time.is_null());
......
......@@ -10,7 +10,9 @@
namespace gcm {
FakeConnectionFactory::FakeConnectionFactory() {
FakeConnectionFactory::FakeConnectionFactory()
: reconnect_pending_(false),
delay_reconnect_(false) {
}
FakeConnectionFactory::~FakeConnectionFactory() {
......@@ -43,8 +45,12 @@ base::TimeTicks FakeConnectionFactory::NextRetryAttempt() const {
return base::TimeTicks();
}
void FakeConnectionFactory::SignalConnectionReset() {
Connect();
void FakeConnectionFactory::SignalConnectionReset(
ConnectionResetReason reason) {
if (!delay_reconnect_)
Connect();
else
reconnect_pending_ = true;
}
} // namespace gcm
......@@ -28,13 +28,26 @@ class FakeConnectionFactory : public ConnectionFactory {
virtual void Connect() OVERRIDE;
virtual bool IsEndpointReachable() const OVERRIDE;
virtual base::TimeTicks NextRetryAttempt() const OVERRIDE;
virtual void SignalConnectionReset() OVERRIDE;
virtual void SignalConnectionReset(ConnectionResetReason reason) OVERRIDE;
// Whether a connection reset has been triggered and is yet to run.
bool reconnect_pending() const { return reconnect_pending_; }
// Whether connection resets should be handled immediately or delayed until
// release.
void set_delay_reconnect(bool should_delay) {
delay_reconnect_ = should_delay;
}
private:
scoped_ptr<FakeConnectionHandler> connection_handler_;
BuildLoginRequestCallback request_builder_;
// Logic for handling connection resets.
bool reconnect_pending_;
bool delay_reconnect_;
DISALLOW_COPY_AND_ASSIGN(FakeConnectionFactory);
};
......
......@@ -40,6 +40,7 @@ FakeConnectionHandler::~FakeConnectionHandler() {
void FakeConnectionHandler::Init(const mcs_proto::LoginRequest& login_request,
net::StreamSocket* socket) {
ASSERT_GE(expected_outgoing_messages_.size(), 1U);
EXPECT_EQ(expected_outgoing_messages_.front().SerializeAsString(),
login_request.SerializeAsString());
expected_outgoing_messages_.pop_front();
......
......@@ -606,6 +606,9 @@ void MCSClient::HandlePacketFromWire(
state_ = UNINITIALIZED;
DVLOG(1) << " Error code: " << login_response->error().code();
DVLOG(1) << " Error message: " << login_response->error().message();
LOG(ERROR) << "Failed to log in to GCM, resetting connection.";
connection_factory_->SignalConnectionReset(
ConnectionFactory::LOGIN_FAILURE);
mcs_error_callback_.Run();
return;
}
......@@ -656,7 +659,8 @@ void MCSClient::HandlePacketFromWire(
case kCloseTag:
LOG(ERROR) << "Received close command, resetting connection.";
state_ = LOADED;
connection_factory_->SignalConnectionReset();
connection_factory_->SignalConnectionReset(
ConnectionFactory::CLOSE_COMMAND);
return;
case kIqStanzaTag: {
DCHECK_GE(stream_id_in_, 1U);
......@@ -805,7 +809,8 @@ MCSClient::PersistentId MCSClient::GetNextPersistentId() {
}
void MCSClient::OnConnectionResetByHeartbeat() {
connection_factory_->SignalConnectionReset();
connection_factory_->SignalConnectionReset(
ConnectionFactory::HEARTBEAT_FAILURE);
}
void MCSClient::NotifyMessageSendStatus(
......
......@@ -274,15 +274,18 @@ TEST_F(MCSClientTest, LoginSuccess) {
EXPECT_EQ(kLoginResponseTag, received_message()->tag());
}
// Encounter a server error during the login attempt.
// Encounter a server error during the login attempt. Should trigger a
// reconnect.
TEST_F(MCSClientTest, FailLogin) {
BuildMCSClient();
InitializeClient();
GetFakeHandler()->set_fail_login(true);
connection_factory()->set_delay_reconnect(true);
LoginClient(std::vector<std::string>());
EXPECT_FALSE(connection_factory()->IsEndpointReachable());
EXPECT_FALSE(init_success());
EXPECT_FALSE(received_message());
EXPECT_TRUE(connection_factory()->reconnect_pending());
}
// Send a message without RMQ support.
......
......@@ -5505,6 +5505,24 @@ other types of suffix sets.
<summary>Net error results from GCM connection attempts.</summary>
</histogram>
<histogram name="GCM.ConnectionResetReason" enum="GCMConnectionResetReason">
<summary>Reasons for GCM connection resets.</summary>
</histogram>
<histogram name="GCM.ConnectionSuccessRate">
<summary>
GCM connection success rate. Does not take into account login success. See
GCM.ConnectionFailureErrorCode for a breakdown of connection failure
reasons.
</summary>
</histogram>
<histogram name="GCM.ConnectionUpTime" units="milliseconds">
<summary>
Time (from login until reset) that a GCM connection was active.
</summary>
</histogram>
<histogram name="GCM.NumThrottledApps">
<summary>
Number of applications hitting GCM per-app outstanding message limits at
......@@ -26468,6 +26486,13 @@ other types of suffix sets.
<int value="6" label="Zero ID or token"/>
</enum>
<enum name="GCMConnectionResetReason" type="int">
<int value="0" label="Login failure"/>
<int value="1" label="Close command"/>
<int value="2" label="Heartbeat failure"/>
<int value="3" label="Socket failure"/>
</enum>
<enum name="GCMRegistrationRequestStatus" type="int">
<int value="0" label="Success (this is not logged currently)"/>
<int value="1" label="Invalid parameters"/>
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment