Commit bdfab875 authored by robertocn's avatar robertocn Committed by Commit bot

Obtain the confidence score based on the test results of the last known good and first known bad revisions.

BUG=448817

Review URL: https://codereview.chromium.org/850013004

Cr-Commit-Position: refs/heads/master@{#314257}
parent 36dcef3d
......@@ -86,6 +86,10 @@ MAX_LINUX_BUILD_TIME = 14400
# The confidence percentage we require to consider the initial range a
# regression based on the test results of the initial good and bad revisions.
REGRESSION_CONFIDENCE = 80
# How many times to repeat the test on the last known good and first known bad
# revisions in order to assess a more accurate confidence score in the
# regression culprit.
BORDER_REVISIONS_EXTRA_RUNS = 2
# Patch template to add a new file, DEPS.sha under src folder.
# This file contains SHA1 value of the DEPS changes made while bisecting
......@@ -1272,7 +1276,7 @@ class BisectPerformanceMetrics(object):
def RunPerformanceTestAndParseResults(
self, command_to_run, metric, reset_on_first_run=False,
upload_on_last_run=False, results_label=None):
upload_on_last_run=False, results_label=None, test_run_multiplier=1):
"""Runs a performance test on the current revision and parses the results.
Args:
......@@ -1285,6 +1289,8 @@ class BisectPerformanceMetrics(object):
results_label: A value for the option flag --results-label.
The arguments reset_on_first_run, upload_on_last_run and results_label
are all ignored if the test is not a Telemetry test.
test_run_multiplier: Factor by which to multiply the number of test runs
and the timeout period specified in self.opts.
Returns:
(values dict, 0) if --debug_ignore_perf_test was passed.
......@@ -1326,7 +1332,8 @@ class BisectPerformanceMetrics(object):
metric_values = []
output_of_all_runs = ''
for i in xrange(self.opts.repeat_test_count):
repeat_count = self.opts.repeat_test_count * test_run_multiplier
for i in xrange(repeat_count):
# Can ignore the return code since if the tests fail, it won't return 0.
current_args = copy.copy(args)
if is_telemetry:
......@@ -1368,7 +1375,8 @@ class BisectPerformanceMetrics(object):
metric_values.append(return_code)
elapsed_minutes = (time.time() - start_time) / 60.0
if elapsed_minutes >= self.opts.max_time_minutes:
time_limit = self.opts.max_time_minutes * test_run_multiplier
if elapsed_minutes >= time_limit:
break
if metric and len(metric_values) == 0:
......@@ -1473,7 +1481,8 @@ class BisectPerformanceMetrics(object):
return False
def RunTest(self, revision, depot, command, metric, skippable=False,
skip_sync=False, create_patch=False, force_build=False):
skip_sync=False, create_patch=False, force_build=False,
test_run_multiplier=1):
"""Performs a full sync/build/run of the specified revision.
Args:
......@@ -1484,6 +1493,8 @@ class BisectPerformanceMetrics(object):
skip_sync: Skip the sync step.
create_patch: Create a patch with any locally modified files.
force_build: Force a local build.
test_run_multiplier: Factor by which to multiply the given number of runs
and the set timeout period.
Returns:
On success, a tuple containing the results of the performance test.
......@@ -1525,7 +1536,8 @@ class BisectPerformanceMetrics(object):
command = self.GetCompatibleCommand(command, revision, depot)
# Run the command and get the results.
results = self.RunPerformanceTestAndParseResults(command, metric)
results = self.RunPerformanceTestAndParseResults(
command, metric, test_run_multiplier=test_run_multiplier)
# Restore build output directory once the tests are done, to avoid
# any discrepancies.
......@@ -2439,6 +2451,9 @@ class BisectPerformanceMetrics(object):
self.printer.PrintPartialResults(bisect_state)
bisect_utils.OutputAnnotationStepClosed()
self._ConfidenceExtraTestRuns(min_revision_state, max_revision_state,
command_to_run, metric)
results = BisectResults(bisect_state, self.depot_registry, self.opts,
self.warnings)
......@@ -2452,6 +2467,21 @@ class BisectPerformanceMetrics(object):
'[%s..%s]' % (good_revision, bad_revision))
return BisectResults(error=error)
def _ConfidenceExtraTestRuns(self, good_state, bad_state, command_to_run,
                             metric):
  """Reruns the tests on the revisions bordering the regression range.

  Runs the performance test again on the last known good and first known
  bad revisions (with BORDER_REVISIONS_EXTRA_RUNS times the usual repeat
  count) and merges the new sample values into each revision state, so
  that the confidence score computed from these two revisions is based on
  more data points.

  Only runs when the two states actually disagree (one passed, one did
  not) and neither of them was skipped or failed to build.

  Args:
    good_state: RevisionState for the last known good revision.
    bad_state: RevisionState for the first known bad revision.
    command_to_run: The performance test command to execute.
    metric: The metric to parse out of the test results.
  """
  if (bool(good_state.passed) != bool(bad_state.passed)
      and good_state.passed not in ('Skipped', 'Build Failed')
      and bad_state.passed not in ('Skipped', 'Build Failed')):
    for state in (good_state, bad_state):
      run_results = self.RunTest(
          state.revision,
          state.depot,
          command_to_run,
          metric,
          test_run_multiplier=BORDER_REVISIONS_EXTRA_RUNS)
      # NOTE(review): the extra samples are appended to the existing ones
      # rather than replacing them — confirm that merging (extend) is the
      # intended aggregation here.
      state.value['values'].extend(run_results[0]['values'])
def _IsPlatformSupported():
"""Checks that this platform and build system are supported.
......
......@@ -230,6 +230,20 @@ class BisectResults(object):
@staticmethod
def FindBreakingRevRange(revision_states):
"""Finds the last known good and first known bad revisions.
Note that since revision_states is expected to be in reverse chronological
order, the last known good revision is the first revision in the list that
has the passed property set to 1, therefore the name
`first_working_revision`. The inverse applies to `last_broken_revision`.
Args:
revision_states: A list of RevisionState instances.
Returns:
A tuple containing the two revision states at the border. (Last
known good and first known bad.)
"""
first_working_revision = None
last_broken_revision = None
......@@ -287,10 +301,13 @@ class BisectResults(object):
[working_mean, broken_mean]) /
max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
# Give a "confidence" in the bisect. At the moment we use how distinct the
# values are before and after the last broken revision, and how noisy the
# overall graph is.
confidence_params = (sum(working_means, []), sum(broken_means, []))
# Give a "confidence" in the bisect. Currently, we consider the values of
# only the revisions at the breaking range (last known good and first known
# bad); see the note in the docstring for FindBreakingRevRange.
confidence_params = (
sum([first_working_rev.value['values']], []),
sum([last_broken_rev.value['values']], [])
)
confidence = cls.ConfidenceScore(*confidence_params)
bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
......
......@@ -158,6 +158,58 @@ class BisectResultsTest(unittest.TestCase):
self.assertEqual(revision_states[2], results.first_working_revision)
self.assertEqual(revision_states[1], results.last_broken_revision)
def testCorrectlyFindsBreakingRangeNotInOrder(self):
  """Breaking range is found even when pass/fail results are interleaved."""
  states = self.mock_bisect_state.mock_revision_states
  for index, outcome in enumerate([0, 1, 0, 1, 1]):
    states[index].passed = outcome
  results = BisectResults(self.mock_bisect_state, self.mock_depot_registry,
                          self.mock_opts, self.mock_warnings)
  self.assertEqual(states[1], results.first_working_revision)
  self.assertEqual(states[2], results.last_broken_revision)
def testCorrectlyFindsBreakingRangeIncompleteBisect(self):
  """Untested ('?') revisions are skipped when locating the breaking range."""
  states = self.mock_bisect_state.mock_revision_states
  for index, outcome in enumerate([0, 0, '?', 1, 1]):
    states[index].passed = outcome
  results = BisectResults(self.mock_bisect_state, self.mock_depot_registry,
                          self.mock_opts, self.mock_warnings)
  self.assertEqual(states[3], results.first_working_revision)
  self.assertEqual(states[1], results.last_broken_revision)
def testFindBreakingRangeAllPassed(self):
  """With every revision passing there is no broken revision to report."""
  states = self.mock_bisect_state.mock_revision_states
  for state in states[:5]:
    state.passed = 1
  results = BisectResults(self.mock_bisect_state, self.mock_depot_registry,
                          self.mock_opts, self.mock_warnings)
  self.assertEqual(states[0], results.first_working_revision)
  self.assertIsNone(results.last_broken_revision)
def testFindBreakingRangeNonePassed(self):
  """With every revision failing there is no working revision to report."""
  states = self.mock_bisect_state.mock_revision_states
  for state in states[:5]:
    state.passed = 0
  results = BisectResults(self.mock_bisect_state, self.mock_depot_registry,
                          self.mock_opts, self.mock_warnings)
  self.assertIsNone(results.first_working_revision)
  self.assertEqual(states[4], results.last_broken_revision)
def testCorrectlyComputesRegressionStatistics(self):
revision_states = self.mock_bisect_state.mock_revision_states
revision_states[0].passed = 0
......@@ -227,9 +279,9 @@ class BisectResultsTest(unittest.TestCase):
def testWarningForTooLowConfidence(self):
revision_states = self.mock_bisect_state.mock_revision_states
revision_states[2].value = {'values': [95, 100, 90]}
revision_states[3].value = {'values': [95, 100, 90]}
revision_states[4].value = {'values': [95, 100, 90]}
revision_states[2].value = {'values': [95, 90, 90]}
revision_states[3].value = {'values': [95, 90, 90]}
revision_states[4].value = {'values': [95, 90, 90]}
results = BisectResults(self.mock_bisect_state, self.mock_depot_registry,
self.mock_opts, self.mock_warnings)
self.assertGreater(results.confidence, 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment