Return 0 for confidence when there's only one good or bad rev.

BUG=

Review URL: https://codereview.chromium.org/463743002

Cr-Commit-Position: refs/heads/master@{#288982}
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@288982 0039d316-1c4b-4281-b951-d872f2087c98
parent efbb4076
@@ -171,6 +171,9 @@ MAX_MAC_BUILD_TIME = 14400
 MAX_WIN_BUILD_TIME = 14400
 MAX_LINUX_BUILD_TIME = 14400
 
+# The confidence percentage at which confidence can be considered "high".
+HIGH_CONFIDENCE = 95
+
 # Patch template to add a new file, DEPS.sha under src folder.
 # This file contains SHA1 value of the DEPS changes made while bisecting
 # dependency repositories. This patch send along with DEPS patch to tryserver.
@@ -191,9 +194,9 @@ BISECT_MODE_MEAN = 'mean'
 BISECT_MODE_STD_DEV = 'std_dev'
 BISECT_MODE_RETURN_CODE = 'return_code'
 
-# The perf dashboard specifically looks for the string
-# "Estimated Confidence: 95%" to decide whether or not to cc the author(s).
-# If you change this, please update the perf dashboard as well.
+# The perf dashboard looks for a string like "Estimated Confidence: 95%"
+# to decide whether or not to cc the author(s). If you change this, please
+# update the perf dashboard as well.
 RESULTS_BANNER = """
 ===== BISECT JOB RESULTS =====
 Status: %(status)s
@@ -280,12 +283,18 @@ def ConfidenceScore(good_results_lists, bad_results_lists):
   Returns:
     A number in the range [0, 100].
   """
-  if not good_results_lists or not bad_results_lists:
+  # If there's only one item in either list, then only one revision was
+  # classified good or bad; that isn't enough evidence to make a decision.
+  # If an empty list was passed, that also implies zero confidence.
+  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
     return 0.0
 
   # Flatten the lists of results lists.
   sample1 = sum(good_results_lists, [])
   sample2 = sum(bad_results_lists, [])
 
+  # If either list contains only empty lists (this is unexpected and normally
+  # shouldn't happen), we also want to return 0.
   if not sample1 or not sample2:
     return 0.0
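
For illustration, here is a minimal standalone sketch of the new early-return behavior. _GateConfidence is a hypothetical helper used only in this example; the real ConfidenceScore goes on to run a statistical comparison on the flattened samples instead of returning None.

def _GateConfidence(good_results_lists, bad_results_lists):
  """Returns 0.0 when there isn't enough evidence, or None to keep going."""
  # Fewer than two revisions classified on either side: no decision possible.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0
  # Flatten the per-revision result lists into two samples.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])
  # Either side contained only empty lists: nothing to compare.
  if not sample1 or not sample2:
    return 0.0
  return None

# A single good revision now forces confidence to 0.0.
assert _GateConfidence([[1.0, 1.1]], [[1.4], [1.5]]) == 0.0
# With two or more revisions per side, the full computation would run.
assert _GateConfidence([[1.0], [1.1]], [[1.4], [1.5]]) is None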
@@ -2889,7 +2898,7 @@ class BisectPerformanceMetrics(object):
     if not results_dict['confidence']:
       return None
     confidence_status = 'Successful with %(level)s confidence%(warning)s.'
-    if results_dict['confidence'] >= 95:
+    if results_dict['confidence'] >= HIGH_CONFIDENCE:
       level = 'high'
     else:
       level = 'low'
@@ -3173,18 +3182,13 @@ class BisectPerformanceMetrics(object):
     if self.opts.repeat_test_count == 1:
       self.warnings.append('Tests were only set to run once. This may '
                            'be insufficient to get meaningful results.')
-    if results_dict['confidence'] < 100:
-      if results_dict['confidence']:
-        self.warnings.append(
-            'Confidence is less than 100%. There could be other candidates '
-            'for this regression. Try bisecting again with increased '
-            'repeat_count or on a sub-metric that shows the regression more '
-            'clearly.')
-      else:
-        self.warnings.append(
-            'Confidence is 0%. Try bisecting again on another platform, with '
-            'increased repeat_count or on a sub-metric that shows the '
-            'regression more clearly.')
+    if 0 < results_dict['confidence'] < HIGH_CONFIDENCE:
+      self.warnings.append('Confidence is not high. Try bisecting again '
+                           'with increased repeat_count, larger range, or '
+                           'on another metric.')
+    if not results_dict['confidence']:
+      self.warnings.append('Confidence score is 0%. Try bisecting again on '
+                           'another platform or another metric.')
 
   def FormatAndPrintResults(self, bisect_results):
     """Prints the results from a bisection run in a readable format.
...
@@ -25,8 +25,11 @@ class BisectPerfRegressionTest(unittest.TestCase):
       bad_values: First list of numbers.
       good_values: Second list of numbers.
     """
-    # ConfidenceScore takes a list of lists but these lists are flattened.
-    confidence = bisect_perf_module.ConfidenceScore([bad_values], [good_values])
+    # ConfidenceScore takes a list of lists but these lists are flattened
+    # inside the function.
+    confidence = bisect_perf_module.ConfidenceScore(
+        [[v] for v in bad_values],
+        [[v] for v in good_values])
     self.assertEqual(score, confidence)
 
   def testConfidenceScore_ZeroConfidence(self):
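
The wrapping matters because each inner list now represents one revision's results: passing a single flat list per side would trip the new single-revision guard and always yield 0.0, while wrapping every value in its own one-element list flattens back to the same samples. A small self-contained sketch of why the test helper changes its call shape:

bad_values = [1.1, 1.2, 1.1, 1.2]

# Old style: one "revision" holding all measurements.
old_style = [bad_values]               # [[1.1, 1.2, 1.1, 1.2]]
# New style: one single-measurement "revision" per value.
new_style = [[v] for v in bad_values]  # [[1.1], [1.2], [1.1], [1.2]]

# Both flatten to the same sample, so the statistics are unchanged...
assert sum(old_style, []) == sum(new_style, [])
# ...but only the new style has more than one revision per side, so it is
# not short-circuited to 0.0 by the new length check in ConfidenceScore.
assert len(old_style) == 1 and len(new_style) == len(bad_values)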
@@ -45,10 +48,9 @@ class BisectPerfRegressionTest(unittest.TestCase):
     self._AssertConfidence(99.9, [1, 1, 1, 1], [1.2, 1.2, 1.2, 1.2])
     self._AssertConfidence(99.9, [1, 1, 1, 1], [1.01, 1.01, 1.01, 1.01])
 
-  def testConfidenceScore_ImbalancedSampleSize(self):
-    # The second set of numbers only contains one number, so confidence is low.
-    self._AssertConfidence(
-        80.0, [1.1, 1.2, 1.1, 1.2, 1.0, 1.3, 1.2, 1.3],[1.4])
+  def testConfidenceScore_UnbalancedSampleSize(self):
+    # The second set of numbers only contains one number, so confidence is 0.
+    self._AssertConfidence(0.0, [1.1, 1.2, 1.1, 1.2, 1.0, 1.3, 1.2], [1.4])
 
   def testConfidenceScore_EmptySample(self):
     # Confidence is zero if either or both samples are empty.