Return 0 for confidence when there's only one good or bad rev.

BUG=

Review URL: https://codereview.chromium.org/463743002

Cr-Commit-Position: refs/heads/master@{#288982}
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@288982 0039d316-1c4b-4281-b951-d872f2087c98
parent efbb4076
@@ -171,6 +171,9 @@ MAX_MAC_BUILD_TIME = 14400
 MAX_WIN_BUILD_TIME = 14400
 MAX_LINUX_BUILD_TIME = 14400
 
+# The confidence percentage at which confidence can be considered "high".
+HIGH_CONFIDENCE = 95
+
 # Patch template to add a new file, DEPS.sha under src folder.
 # This file contains SHA1 value of the DEPS changes made while bisecting
 # dependency repositories. This patch send along with DEPS patch to tryserver.
@@ -191,9 +194,9 @@ BISECT_MODE_MEAN = 'mean'
 BISECT_MODE_STD_DEV = 'std_dev'
 BISECT_MODE_RETURN_CODE = 'return_code'
 
-# The perf dashboard specifically looks for the string
-# "Estimated Confidence: 95%" to decide whether or not to cc the author(s).
-# If you change this, please update the perf dashboard as well.
+# The perf dashboard looks for a string like "Estimated Confidence: 95%"
+# to decide whether or not to cc the author(s). If you change this, please
+# update the perf dashboard as well.
 RESULTS_BANNER = """
 ===== BISECT JOB RESULTS =====
 Status: %(status)s
@@ -280,12 +283,18 @@ def ConfidenceScore(good_results_lists, bad_results_lists):
   Returns:
     A number in the range [0, 100].
   """
-  if not good_results_lists or not bad_results_lists:
+  # If there's only one item in either list, then only one revision was
+  # classified good or bad; that isn't enough evidence to make a decision.
+  # If an empty list was passed, that also implies zero confidence.
+  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
     return 0.0
 
   # Flatten the lists of results lists.
   sample1 = sum(good_results_lists, [])
   sample2 = sum(bad_results_lists, [])
 
+  # If either list contains only empty lists (this is unexpected and normally
+  # shouldn't happen), we also want to return 0.
   if not sample1 or not sample2:
     return 0.0
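
For illustration, here is a minimal standalone sketch of the new early-return behavior. _GateConfidence is a hypothetical helper used only in this example; the real ConfidenceScore goes on to run a statistical comparison on the flattened samples instead of returning None.

def _GateConfidence(good_results_lists, bad_results_lists):
  """Returns 0.0 when there isn't enough evidence, or None to keep going."""
  # Fewer than two revisions classified on either side: no decision possible.
  if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
    return 0.0
  # Flatten the per-revision result lists into two samples.
  sample1 = sum(good_results_lists, [])
  sample2 = sum(bad_results_lists, [])
  # Either side contained only empty lists: nothing to compare.
  if not sample1 or not sample2:
    return 0.0
  return None

# A single good revision now forces confidence to 0.0.
assert _GateConfidence([[1.0, 1.1]], [[1.4], [1.5]]) == 0.0
# With two or more revisions per side, the full computation would run.
assert _GateConfidence([[1.0], [1.1]], [[1.4], [1.5]]) is None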
@@ -2889,7 +2898,7 @@ class BisectPerformanceMetrics(object):
     if not results_dict['confidence']:
       return None
     confidence_status = 'Successful with %(level)s confidence%(warning)s.'
-    if results_dict['confidence'] >= 95:
+    if results_dict['confidence'] >= HIGH_CONFIDENCE:
       level = 'high'
     else:
       level = 'low'
@@ -3173,18 +3182,13 @@ class BisectPerformanceMetrics(object):
     if self.opts.repeat_test_count == 1:
       self.warnings.append('Tests were only set to run once. This may '
                            'be insufficient to get meaningful results.')
-    if results_dict['confidence'] < 100:
-      if results_dict['confidence']:
-        self.warnings.append(
-            'Confidence is less than 100%. There could be other candidates '
-            'for this regression. Try bisecting again with increased '
-            'repeat_count or on a sub-metric that shows the regression more '
-            'clearly.')
-      else:
-        self.warnings.append(
-            'Confidence is 0%. Try bisecting again on another platform, with '
-            'increased repeat_count or on a sub-metric that shows the '
-            'regression more clearly.')
+    if 0 < results_dict['confidence'] < HIGH_CONFIDENCE:
+      self.warnings.append('Confidence is not high. Try bisecting again '
+                           'with increased repeat_count, larger range, or '
+                           'on another metric.')
+    if not results_dict['confidence']:
+      self.warnings.append('Confidence score is 0%. Try bisecting again on '
+                           'another platform or another metric.')
 
   def FormatAndPrintResults(self, bisect_results):
     """Prints the results from a bisection run in a readable format.
...
@@ -25,8 +25,11 @@ class BisectPerfRegressionTest(unittest.TestCase):
       bad_values: First list of numbers.
       good_values: Second list of numbers.
     """
-    # ConfidenceScore takes a list of lists but these lists are flattened.
-    confidence = bisect_perf_module.ConfidenceScore([bad_values], [good_values])
+    # ConfidenceScore takes a list of lists but these lists are flattened
+    # inside the function.
+    confidence = bisect_perf_module.ConfidenceScore(
+        [[v] for v in bad_values],
+        [[v] for v in good_values])
     self.assertEqual(score, confidence)
 
   def testConfidenceScore_ZeroConfidence(self):
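
The wrapping matters because each inner list now represents one revision's results: passing a single flat list per side would trip the new single-revision guard and always yield 0.0, while wrapping every value in its own one-element list flattens back to the same samples. A small self-contained sketch of why the test helper changes its call shape:

bad_values = [1.1, 1.2, 1.1, 1.2]

# Old style: one "revision" holding all measurements.
old_style = [bad_values]               # [[1.1, 1.2, 1.1, 1.2]]
# New style: one single-measurement "revision" per value.
new_style = [[v] for v in bad_values]  # [[1.1], [1.2], [1.1], [1.2]]

# Both flatten to the same sample, so the statistics are unchanged...
assert sum(old_style, []) == sum(new_style, [])
# ...but only the new style has more than one revision per side, so it is
# not short-circuited to 0.0 by the new length check in ConfidenceScore.
assert len(old_style) == 1 and len(new_style) == len(bad_values)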
@@ -45,10 +48,9 @@ class BisectPerfRegressionTest(unittest.TestCase):
     self._AssertConfidence(99.9, [1, 1, 1, 1], [1.2, 1.2, 1.2, 1.2])
     self._AssertConfidence(99.9, [1, 1, 1, 1], [1.01, 1.01, 1.01, 1.01])
 
-  def testConfidenceScore_ImbalancedSampleSize(self):
-    # The second set of numbers only contains one number, so confidence is low.
-    self._AssertConfidence(
-        80.0, [1.1, 1.2, 1.1, 1.2, 1.0, 1.3, 1.2, 1.3],[1.4])
+  def testConfidenceScore_UnbalancedSampleSize(self):
+    # The second set of numbers only contains one number, so confidence is 0.
+    self._AssertConfidence(0.0, [1.1, 1.2, 1.1, 1.2, 1.0, 1.3, 1.2], [1.4])
 
   def testConfidenceScore_EmptySample(self):
     # Confidence is zero if either or both samples are empty.