Commit 0bd20fee authored by behdad, committed by Commit Bot

Low cpu_wall_time_ratio can invalidate failures

Use cpu_wall_time_ratio in the representative perf test logic to invalidate failures that had a lower-than-usual cpu_wall_time_ratio, in order to reduce flaky failures.
As a result, only stories whose frame_times exceed the limit and whose cpu_wall_time_ratio is acceptable will fail.
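
The resulting pass/fail rule, as a minimal Python sketch (is_real_failure is a hypothetical helper, not a function in this change; names mirror the diff below, and the AVG_ERROR_MARGIN value is illustrative, not necessarily the script's actual constant):

    AVG_ERROR_MARGIN = 1.1  # Assumed tolerance, for illustration only.

    def is_real_failure(measured_avg, upper_limit_avg,
                        measured_cpu_ratio, lower_limit_cpu_ratio):
      # Within the frame_times limit: not a failure at all.
      if measured_avg <= upper_limit_avg * AVG_ERROR_MARGIN:
        return False
      # Above the limit, but the run got less CPU time than expected:
      # treat the slow frames as noise and invalidate the failure.
      if measured_cpu_ratio < lower_limit_cpu_ratio:
        return False
      return True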

Bug: chromium:1052361
Change-Id: I294361375f91096a32e226f57dc61450926fab0b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2292811
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: John Chen <johnchen@chromium.org>
Cr-Commit-Position: refs/heads/master@{#789189}
parent 2a39758f
@@ -180,8 +180,8 @@ class RenderingRepresentativePerfTest(object):
     values_per_story = {}
     for row in csv_obj:
       # For now only frame_times is used for testing representatives'
-      # performance.
-      if row['name'] != METRIC_NAME:
+      # performance and cpu_wall_time_ratio is used for validation.
+      if row['name'] != METRIC_NAME and row['name'] != 'cpu_wall_time_ratio':
         continue
       story_name = row['stories']
       if (story_name not in self.upper_limit_data):
@@ -189,13 +189,16 @@ class RenderingRepresentativePerfTest(object):
       if story_name not in values_per_story:
         values_per_story[story_name] = {
             'averages': [],
-            'ci_095': []
+            'ci_095': [],
+            'cpu_wall_time_ratio': []
         }
-      if (row['avg'] == '' or row['count'] == 0):
-        continue
-      values_per_story[story_name]['ci_095'].append(float(row['ci_095']))
-      values_per_story[story_name]['averages'].append(float(row['avg']))
+      if row['name'] == METRIC_NAME and row['avg'] != '' and row['count'] != 0:
+        values_per_story[story_name]['ci_095'].append(float(row['ci_095']))
+        values_per_story[story_name]['averages'].append(float(row['avg']))
+      elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '':
+        values_per_story[story_name]['cpu_wall_time_ratio'].append(
+            float(row['avg']))
     return values_per_story
@@ -219,10 +222,14 @@ class RenderingRepresentativePerfTest(object):
         self.result_recorder[rerun].add_failure(story_name, self.benchmark)
         continue
-      upper_limit_avg = self.upper_limit_data[story_name]['avg']
-      upper_limit_ci = self.upper_limit_data[story_name]['ci_095']
+      upper_limits = self.upper_limit_data
+      upper_limit_avg = upper_limits[story_name]['avg']
+      upper_limit_ci = upper_limits[story_name]['ci_095']
+      lower_limit_cpu_ratio = upper_limits[story_name]['cpu_wall_time_ratio']
       measured_avg = np.mean(np.array(values_per_story[story_name]['averages']))
       measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095']))
+      measured_cpu_ratio = np.mean(np.array(
+          values_per_story[story_name]['cpu_wall_time_ratio']))
       if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
           self.is_control_story(story_name)):
@@ -233,10 +240,16 @@ class RenderingRepresentativePerfTest(object):
         self.result_recorder[rerun].add_failure(
             story_name, self.benchmark, True)
       elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
-        print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' +
-               ' to upper limit ({:.3f})').format(self.benchmark, story_name,
-               METRIC_NAME, measured_avg, upper_limit_avg))
-        self.result_recorder[rerun].add_failure(story_name, self.benchmark)
+        if (measured_cpu_ratio >= lower_limit_cpu_ratio):
+          print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' +
+                 ' to upper limit ({:.3f})').format(self.benchmark, story_name,
+                 METRIC_NAME, measured_avg, upper_limit_avg))
+          self.result_recorder[rerun].add_failure(story_name, self.benchmark)
+        else:
+          print(('[ OK ] {}/{} higher average {}({:.3f}) compared ' +
+                 'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio'
+                 ).format(self.benchmark, story_name, METRIC_NAME, measured_avg,
+                 upper_limit_avg))
       else:
         print(('[ OK ] {}/{} lower average {}({:.3f}) compared ' +
                'to upper limit({:.3f}).').format(self.benchmark, story_name,
...
@@ -18,32 +18,39 @@ BENCHMARK = 'rendering.desktop'
 UPPER_LIMIT_DATA_SAMPLE = {
     'story_1': {
         'ci_095': 10,
-        'avg': 20
+        'avg': 20,
+        'cpu_wall_time_ratio': 0.4,
     },
     'story_2': {
         'ci_095': 10,
-        'avg': 16
+        'avg': 16,
+        'cpu_wall_time_ratio': 0.3,
     },
     'story_3': {
         'ci_095': 10,
-        'avg': 10
+        'avg': 10,
+        'cpu_wall_time_ratio': 0.5,
     },
     'story_4': {
         'ci_095': 10,
         'avg': 10,
+        'cpu_wall_time_ratio': 0.5,
         'control': True,
     },
     'story_5': {
         'ci_095': 20,
-        'avg': 10
+        'avg': 10,
+        'cpu_wall_time_ratio': 0.5,
     },
     'story_6': {
         'ci_095': 20,
-        'avg': 10
+        'avg': 10,
+        'cpu_wall_time_ratio': 0.5,
     },
     'story_7': {
         'ci_095': 20,
         'avg': 10,
+        'cpu_wall_time_ratio': 0.5,
         'experimental': True,
     },
 }
@@ -96,14 +103,18 @@ class TestRepresentativePerfScript(unittest.TestCase):
   def test_parse_csv_results(self):
     csv_obj = create_sample_input([
         ['story_1', 'frame_times', 16, 10, 1.5],
+        ['story_1', 'cpu_wall_time_ratio', 0.5, 1, 1],
         ['story_2', 'latency', 10, 8, 4],  # Record for a different metric.
         ['story_3', 'frame_times', 8, 20, 2],
+        ['story_3', 'frame_times', 7, 20, 15],
+        ['story_3', 'frame_times', 12, 20, 16],
+        ['story_3', 'cpu_wall_time_ratio', 0.3, 1, 1],
+        ['story_3', 'cpu_wall_time_ratio', 0.7, 1, 1],
+        ['story_3', 'cpu_wall_time_ratio', '', 0, 1],
         ['story_4', 'frame_times', '', 10, 1],  # Record with no avg.
         ['story_5', 'frame_times', 12, 0, 3],  # Record with count of 0.
         ['story_6', 'frame_times', 12, 40, 40],  # High noise record.
         ['story_8', 'frame_times', 12, 40, 4],
-        ['story_3', 'frame_times', 7, 20, 15],
-        ['story_3', 'frame_times', 12, 20, 16]
     ])
     perf_test = perf_test_initializer()
@@ -114,10 +125,13 @@ class TestRepresentativePerfScript(unittest.TestCase):
     self.assertEquals(len(values_per_story), 5)
     self.assertEquals(values_per_story['story_1']['averages'], [16.0])
     self.assertEquals(values_per_story['story_1']['ci_095'], [1.5])
+    self.assertEquals(values_per_story['story_1']['cpu_wall_time_ratio'], [0.5])
     # Record with avg 12 has high noise.
     self.assertEquals(values_per_story['story_3']['averages'], [8.0, 7.0, 12.0])
     self.assertEquals(values_per_story['story_3']['ci_095'], [2.0, 15.0, 16.0])
+    self.assertEquals(values_per_story['story_3']['cpu_wall_time_ratio'],
+                      [0.3, 0.7])
     self.assertEquals(len(values_per_story['story_4']['averages']), 0)
     self.assertEquals(len(values_per_story['story_4']['ci_095']), 0)
@@ -131,46 +145,59 @@ class TestRepresentativePerfScript(unittest.TestCase):
         'story_1': {
             'averages': [16.0, 17.0, 21.0],
             'ci_095': [2.0, 15.0, 16.0],
+            'cpu_wall_time_ratio': [0.5, 0.52, 0.57]
         },
         'story_2': {
             'averages': [16.0, 17.0, 22.0],
             'ci_095': [1.0, 1.4, 1.2],
+            'cpu_wall_time_ratio': [0.3, 0.3, 0.3]
+        },
+        'story_3': {
+            'averages': [20.0, 15.0, 22.0],
+            'ci_095': [1.0, 0.8, 1.2],
+            'cpu_wall_time_ratio': [0.5, 0.5, 0.49]
         }
     }
-    sample_perf_results = create_sample_perf_results(['story_1', 'story_2'], [],
-                                                     BENCHMARK)
+    sample_perf_results = create_sample_perf_results(
+        ['story_1', 'story_2', 'story_3'], [], BENCHMARK)
     rerun = False
     perf_test = perf_test_initializer()
     perf_test.result_recorder[rerun].set_tests(sample_perf_results)
     perf_test.compare_values(values_per_story, rerun)
     result_recorder = perf_test.result_recorder[rerun]
-    self.assertEquals(result_recorder.tests, 2)
+    self.assertEquals(result_recorder.tests, 3)
+    # The failure for story_3 is invalidated (low cpu_wall_time_ratio)
     self.assertEquals(result_recorder.failed_stories, set(['story_2']))
     (output, overall_return_code) = result_recorder.get_output(0)
     self.assertEquals(overall_return_code, 1)
     self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 1)
     self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
     self.assertEquals(output['tests'][BENCHMARK]['story_2']['actual'], 'FAIL')
+    self.assertEquals(output['tests'][BENCHMARK]['story_3']['actual'], 'PASS')

   def test_compare_values_2(self):
     values_per_story = {
         'story_1': {
             'averages': [16.0, 17.0, 21.0],
             'ci_095': [2.0, 15.0, 16.0],
+            'cpu_wall_time_ratio': [0.45, 0.42],
         },
         'story_3': {  # Two of the runs have acceptable CI but high averages.
             'averages': [10, 13],
-            'ci_095': [14, 16, 12]
+            'ci_095': [14, 16, 12],
+            'cpu_wall_time_ratio': [0.5, 0.52],
         },
         'story_4': {  # All runs have high noise.
             'averages': [],
             'ci_095': [16, 17, 18],
+            'cpu_wall_time_ratio': [],
         },
         'story_5': {  # No recorded values.
             'averages': [],
             'ci_095': [],
+            'cpu_wall_time_ratio': [],
         }
     }
@@ -204,11 +231,13 @@ class TestRepresentativePerfScript(unittest.TestCase):
         'story_1': {
             'averages': [16.0, 17.0, 21.0],
             'ci_095': [2.0, 15.0, 16.0],
+            'cpu_wall_time_ratio': [0.45, 0.42, 0.44],
         },
         'story_7':
         {  # Experimental story with higher value than the upper limit.
             'averages': [20, 26],
-            'ci_095': [14, 16]
+            'ci_095': [14, 16],
+            'cpu_wall_time_ratio': [0.45, 0.42, 0.44],
         }
     }
...