Commit 5068d536 authored by behdad, committed by Commit Bot

Stop failures as a result of noise using control test

The Main_30fps_impl_60fps story in the representative perf test is marked
as a control test: if it fails because of high noise, the failures of the
other tests are invalidated (the run is treated as unreliable), which
results in fewer false positives.

Bug: chromium:1029846
Change-Id: I50c8e41108d584f2f5cbcf095be50dc09228406b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2025510
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: Sadrul Chowdhury <sadrul@chromium.org>
Reviewed-by: Caleb Rouleau <crouleau@chromium.org>
Cr-Commit-Position: refs/heads/master@{#738735}
parent 2d7e2dd9
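
As context for the diff below, here is a minimal, self-contained sketch of the control-story gating this change adds. It mirrors the ResultRecorder logic in the script but is simplified: the real methods also take a benchmark name and update the JSON test results, and the class name here is invented for illustration.

# Simplified sketch of the control-story gating; not the full script.
class ResultRecorderSketch(object):
  def __init__(self):
    self.return_code = 0
    self.fails = 0
    self._failed_stories = set()
    # Control stories that failed only because of high noise.
    self._noisy_control_stories = set()

  def add_failure(self, name, is_control=False):
    self._failed_stories.add(name)
    self.fails += 1
    if is_control:
      self._noisy_control_stories.add(name)

  @property
  def is_control_stories_noisy(self):
    return len(self._noisy_control_stories) > 0

  def get_return_code(self):
    # A noisy control story invalidates the run instead of failing it.
    if self.fails > 0 and not self.is_control_stories_noisy:
      self.return_code = 1
    return self.return_code


recorder = ResultRecorderSketch()
recorder.add_failure('ie_chalkboard')
recorder.add_failure('main_30fps_impl_60fps', is_control=True)
# The run is reported as unreliable rather than failing.
assert recorder.get_return_code() == 0
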
@@ -50,7 +50,8 @@
     },
     "main_30fps_impl_60fps": {
       "ci_095": 0.121,
-      "avg": 16.689
+      "avg": 16.689,
+      "control": true
     },
     "css_value_type_shadow": {
       "ci_095": 1.306,
@@ -96,7 +97,8 @@
     },
     "main_30fps_impl_60fps": {
       "ci_095": 0.185,
-      "avg": 16.695
+      "avg": 16.695,
+      "control": true
     },
     "ie_chalkboard": {
       "ci_095": 43.55,
@@ -142,7 +144,8 @@
     },
     "main_30fps_impl_60fps": {
       "ci_095": 0.603,
-      "avg": 18.009
+      "avg": 18.009,
+      "control": true
     },
     "new_tilings": {
       "ci_095": 1.008,
@@ -44,30 +44,39 @@ class ResultRecorder(object):
     self.output = {}
     self.return_code = 0
     self._failed_stories = set()
+    self._noisy_control_stories = set()
+    # Set of _noisy_control_stories keeps track of control tests which failed
+    # because of high noise values.
 
   def set_tests(self, output):
     self.output = output
-    self.fails = 0
-    if 'FAIL' in output['num_failures_by_type']:
-      self.fails = output['num_failures_by_type']['FAIL']
-    self.tests = self.fails + output['num_failures_by_type']['PASS']
+    self.fails = output['num_failures_by_type'].get('FAIL', 0)
+    self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0)
 
-  def add_failure(self, name, benchmark):
+  def add_failure(self, name, benchmark, is_control=False):
     self.output['tests'][benchmark][name]['actual'] = 'FAIL'
     self.output['tests'][benchmark][name]['is_unexpected'] = True
     self._failed_stories.add(name)
     self.fails += 1
+    if is_control:
+      self._noisy_control_stories.add(name)
 
-  def remove_failure(self, name, benchmark):
+  def remove_failure(self, name, benchmark, is_control=False):
     self.output['tests'][benchmark][name]['actual'] = 'PASS'
     self.output['tests'][benchmark][name]['is_unexpected'] = False
     self._failed_stories.remove(name)
     self.fails -= 1
+    if is_control:
+      self._noisy_control_stories.remove(name)
 
   @property
   def failed_stories(self):
     return self._failed_stories
 
+  @property
+  def is_control_stories_noisy(self):
+    return len(self._noisy_control_stories) > 0
+
   def get_output(self, return_code):
     self.output['seconds_since_epoch'] = time.time() - self.start_time
     self.output['num_failures_by_type']['PASS'] = self.tests - self.fails
@@ -82,7 +91,8 @@ class ResultRecorder(object):
     print('[ PASSED ] ' + tests(self.tests - self.fails) + '.')
     if self.fails > 0:
       print('[ FAILED ] ' + tests(self.fails) + '.')
-      self.return_code = 1
+      if not self.is_control_stories_noisy:
+        self.return_code = 1
 
     return (self.output, self.return_code)
@@ -159,7 +169,9 @@ def compare_values(values_per_story, upper_limit_data, benchmark,
         print(('[ FAILED ] {}/{} frame_times has higher noise ({:.3f}) ' +
           'compared to upper limit ({:.3f})').format(
           benchmark, story_name, measured_ci, upper_limit_ci))
-        result_recorder.add_failure(story_name, benchmark)
+        result_recorder.add_failure(story_name, benchmark,
+          is_control_story(upper_limit_data[story_name]))
       elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
         print(('[ FAILED ] {}/{} higher average frame_times({:.3f}) compared' +
           ' to upper limit ({:.3f})').format(
@@ -204,6 +216,9 @@ def replace_arg_values(args, key_value_pairs):
       args[index+1] = value
   return args
 
+def is_control_story(story_data):
+  return ('control' in story_data and story_data['control'] == True)
+
 def main():
   overall_return_code = 0
   options = parse_arguments()
@@ -282,7 +297,8 @@ def main():
         for story_name in result_recorder.failed_stories.copy():
           if story_name not in re_run_result_recorder.failed_stories:
-            result_recorder.remove_failure(story_name, benchmark)
+            result_recorder.remove_failure(story_name, benchmark,
+              is_control_story(upper_limit_data[platform][story_name]))
 
       (
         finalOut,
@@ -294,6 +310,10 @@ def main():
     with open(options.isolated_script_test_output, 'w') as outputFile:
       json.dump(finalOut, outputFile, indent=4)
 
+  if result_recorder.is_control_stories_noisy:
+    assert overall_return_code == 0
+    print('Control story has high noise. These runs are not reliable!')
+
   return overall_return_code
 
 def parse_arguments():
@@ -27,7 +27,8 @@ UPPER_LIMIT_DATA_SAMPLE = {
     },
     'story_4': {
       'ci_095': 10,
-      'avg': 10
+      'avg': 10,
+      'control': True,
     },
     'story_5': {
       'ci_095': 20,
@@ -132,6 +133,8 @@ class TestRepresentativePerfScript(unittest.TestCase):
                    result_recorder)
     self.assertEquals(result_recorder.tests, 2)
     self.assertEquals(result_recorder.failed_stories, set(['story_2']))
+    (_, overall_return_code) = result_recorder.get_output(0)
+    self.assertEquals(overall_return_code, 1)
 
   def test_compare_values_2(self):
     values_per_story = {
@@ -141,7 +144,7 @@ class TestRepresentativePerfScript(unittest.TestCase):
       },
       'story_3': { # Two of the runs have acceptable CI but high averages.
         'averages': [10, 13],
-        'ci_095': [1.0, 1.4, 1.2],
+        'ci_095': [14, 16, 12]
       },
       'story_4': { # All runs have high noise.
         'averages': [],
@@ -167,3 +170,6 @@ class TestRepresentativePerfScript(unittest.TestCase):
     self.assertEquals(result_recorder.tests, 5)
     self.assertEquals(result_recorder.failed_stories,
                       set(['story_3', 'story_4', 'story_5']))
+    self.assertTrue(result_recorder.is_control_stories_noisy)
+    (_, overall_return_code) = result_recorder.get_output(0)
+    self.assertEquals(overall_return_code, 0)