Commit 881f1317 authored by Ned Nguyen's avatar Ned Nguyen Committed by Commit Bot

Add retry logic to results_dashboard uploading

In this first attempt, we retry uploading the whole perf dashboard data
(JSON-serializable) for any recoverable error.

A subsequent CL may improve this further by splitting the perf dashboard data
into smaller parts and retrying only the parts whose upload failed.

This CL also increases the expiration of the oauth token generated for each perf benchmark's data to 30 minutes (used to be 1 minute by default).

Bug: 864565
Cq-Include-Trybots: master.tryserver.chromium.perf:obbs_fyi
Change-Id: If4cb959cc62c5da3baa4a36a5ae91c10ca6d4d02
Reviewed-on: https://chromium-review.googlesource.com/1148228
Commit-Queue: Ned Nguyen <nednguyen@google.com>
Reviewed-by: default avatarSimon Hatch <simonhatch@chromium.org>
Cr-Commit-Position: refs/heads/master@{#577682}
parent e28b1035
......@@ -13,17 +13,21 @@ import tempfile
@contextlib.contextmanager
def with_access_token(service_account_json, append):
def with_access_token(
service_account_json, prefix, token_expiration_in_minutes):
"""Yields an access token for the service account.
Args:
service_account_json: The path to the service account JSON file.
prefix: the prefix of the token file
token_expiration_in_minutes: the integer expiration time of the token
"""
fd, path = tempfile.mkstemp(suffix='.json', prefix=append)
fd, path = tempfile.mkstemp(suffix='.json', prefix=prefix)
try:
args = ['luci-auth', 'token']
if service_account_json:
args += ['-service-account-json', service_account_json]
args += ['-lifetime', '%im' % token_expiration_in_minutes]
subprocess.check_call(args, stdout=fd)
os.close(fd)
fd = None
......
......@@ -52,7 +52,8 @@ class SendResultsFatalException(SendResultException):
pass
def SendResults(data, url, send_as_histograms=False, oauth_token=None):
def SendResults(data, url, send_as_histograms=False, oauth_token=None,
num_retries=3):
"""Sends results to the Chrome Performance Dashboard.
This function tries to send the given data to the dashboard.
......@@ -61,72 +62,80 @@ def SendResults(data, url, send_as_histograms=False, oauth_token=None):
data: The data to try to send. Must be JSON-serializable.
url: Performance Dashboard URL (including schema).
send_as_histograms: True if result is to be sent to /add_histograms.
oauth_token: string; used for flushing oauth uploads from cache.
oauth_token: string; used for flushing oauth uploads from cache. Note that
client is responsible for making sure that the oauth_token doesn't expire
when using this API.
num_retries: Number of times to retry uploading to the perf dashboard upon
recoverable error.
"""
start = time.time()
# Send all the results from this run and the previous cache to the
# dashboard.
fatal_error, errors = _SendResultsToDashboard(
data, url, oauth_token, is_histogramset=send_as_histograms)
errors, all_data_uploaded = _SendResultsToDashboard(
data, url, oauth_token, is_histogramset=send_as_histograms,
num_retries=num_retries)
print 'Time spent sending results to %s: %s' % (url, time.time() - start)
# Print any errors; if there was a fatal error, it should be an exception.
for error in errors:
print error
if fatal_error:
return False
return True
for err in errors:
print err
return all_data_uploaded
def _SendResultsToDashboard(dashboard_data, url, oauth_token, is_histogramset):
def _SendResultsToDashboard(
dashboard_data, url, oauth_token, is_histogramset, num_retries):
"""Tries to send perf dashboard data to |url|.
Args:
perf_results_file_path: A file name.
url: The instance URL to which to post results.
oauth_token: An oauth token to use for histogram uploads. Might be None.
num_retries: Number of time to retry uploading to the perf dashboard upon
recoverable error.
Returns:
A pair (fatal_error, errors), where fatal_error is a boolean indicating
whether there there was a major error and the step should fail, and errors
is a list of error strings.
A tuple (errors, all_data_uploaded), whereas:
errors is a list of error strings.
all_data_uploaded is a boolean indicating whether all perf data was
succesfully uploaded.
"""
fatal_error = False
errors = []
all_data_uploaded = False
data_type = ('histogram' if is_histogramset else 'chartjson')
print 'Sending %s result to dashboard.' % data_type
try:
if is_histogramset:
# TODO(eakuefner): Remove this discard logic once all bots use
# histograms.
if oauth_token is None:
raise SendResultsFatalException(ERROR_NO_OAUTH_TOKEN)
_SendHistogramJson(url, json.dumps(dashboard_data), oauth_token)
else:
_SendResultsJson(url, json.dumps(dashboard_data))
except SendResultsRetryException as e:
# TODO(crbug.com/864565): retry sending to the perf dashboard with this
# error.
error = 'Error while uploading %s data: %s' % (data_type, str(e))
fatal_error = True
errors.append(error)
except SendResultsFatalException as e:
error = 'Error uploading %s data: %s' % (data_type, str(e))
errors.append(error)
fatal_error = True
except Exception:
error = 'Unexpected error while uploading %s data: %s' % (
data_type, traceback.format_exc())
errors.append(error)
fatal_error = True
return fatal_error, errors
for i in xrange(1, num_retries + 1):
try:
print 'Sending %s result to dashboard (attempt %i out of %i).' % (
data_type, i, num_retries)
if is_histogramset:
# TODO(eakuefner): Remove this discard logic once all bots use
# histograms.
if oauth_token is None:
raise SendResultsFatalException(ERROR_NO_OAUTH_TOKEN)
_SendHistogramJson(url, json.dumps(dashboard_data), oauth_token)
else:
_SendResultsJson(url, json.dumps(dashboard_data))
all_data_uploaded = True
break
except SendResultsRetryException as e:
error = 'Error while uploading %s data: %s' % (data_type, str(e))
errors.append(error)
except SendResultsFatalException as e:
error = 'Error uploading %s data: %s' % (data_type, str(e))
errors.append(error)
break
except Exception:
error = 'Unexpected error while uploading %s data: %s' % (
data_type, traceback.format_exc())
errors.append(error)
break
return errors, all_data_uploaded
def MakeHistogramSetWithDiagnostics(histograms_file,
......
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import unittest
import mock
from core import results_dashboard
class ResultsDashboardTest(unittest.TestCase):
  """Tests for the retry behavior of results_dashboard.SendResults."""

  def setUp(self):
    # NOTE: 'oath' is a pre-existing typo for 'oauth'; the value is an opaque
    # fake token, so the name is kept for fidelity with the original tests.
    self.fake_oath = 'fake-oath'
    self.perf_data = {'foo': 1, 'bar': 2}
    self.dashboard_url = 'https://chromeperf.appspot.com'

  def testRetryForSendResultRetryException(self):
    # A recoverable error should be retried up to num_retries times, and the
    # overall upload reported as failed.
    def raise_retry_exception(url, histogramset_json, oauth_token):
      del url, histogramset_json, oauth_token  # unused
      raise results_dashboard.SendResultsRetryException('Should retry')

    with mock.patch('core.results_dashboard._SendHistogramJson',
                    side_effect=raise_retry_exception) as m:
      upload_result = results_dashboard.SendResults(
          self.perf_data, self.dashboard_url, send_as_histograms=True,
          oauth_token=self.fake_oath, num_retries=5)
      self.assertFalse(upload_result)
      self.assertEqual(m.call_count, 5)

  def testNoRetryForSendResultFatalException(self):
    # A fatal error must not be retried: one attempt, failed upload.
    def raise_retry_exception(url, histogramset_json, oauth_token):
      del url, histogramset_json, oauth_token  # unused
      raise results_dashboard.SendResultsFatalException('Do not retry')

    with mock.patch('core.results_dashboard._SendHistogramJson',
                    side_effect=raise_retry_exception) as m:
      upload_result = results_dashboard.SendResults(
          self.perf_data, self.dashboard_url, send_as_histograms=True,
          oauth_token=self.fake_oath, num_retries=5)
      self.assertFalse(upload_result)
      self.assertEqual(m.call_count, 1)

  def testNoRetryForSuccessfulSendResult(self):
    # A successful upload should not trigger any extra attempts.
    with mock.patch('core.results_dashboard._SendHistogramJson') as m:
      upload_result = results_dashboard.SendResults(
          self.perf_data, self.dashboard_url, send_as_histograms=True,
          oauth_token=self.fake_oath, num_retries=5)
      self.assertTrue(upload_result)
      self.assertEqual(m.call_count, 1)

  def testNoRetryAfterSucessfulSendResult(self):
    # Retrying must stop as soon as an attempt succeeds: two failures
    # followed by a success means exactly three calls.
    counter = [0]

    def raise_retry_exception_first_two_times(
        url, histogramset_json, oauth_token):
      del url, histogramset_json, oauth_token  # unused
      counter[0] += 1
      if counter[0] <= 2:
        raise results_dashboard.SendResultsRetryException('Please retry')

    with mock.patch('core.results_dashboard._SendHistogramJson',
                    side_effect=raise_retry_exception_first_two_times) as m:
      upload_result = results_dashboard.SendResults(
          self.perf_data, self.dashboard_url, send_as_histograms=True,
          oauth_token=self.fake_oath, num_retries=5)
      self.assertTrue(upload_result)
      self.assertEqual(m.call_count, 3)
......@@ -205,10 +205,11 @@ def _handle_benchmarks_shard_map(benchmarks_shard_map_file, extra_links):
def _get_benchmark_name(directory):
return basename(directory).replace(" benchmark", "")
def process_perf_results(output_json, configuration_name,
service_account_file,
build_properties, task_output_dir,
smoke_test_mode):
service_account_file,
build_properties, task_output_dir,
smoke_test_mode):
"""Process perf results.
Consists of merging the json-test-format output, uploading the perf test
......@@ -352,7 +353,8 @@ def _upload_individual(
# We generate an oauth token for every benchmark upload in the event
# the token could time out, see crbug.com/854162
with oauth_api.with_access_token(
service_account_file, ("%s_tok" % benchmark_name)) as oauth_file:
service_account_file, ('%s_tok' % benchmark_name),
token_expiration_in_minutes=30) as oauth_file:
with open(output_json_file, 'w') as oj:
upload_fail = _upload_perf_results(
results_filename,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment