Commit feffdfca authored by behdad, committed by Commit Bot

Use confidence interval only for control tests

In a previous change (crrev.com/2055047) we stopped using the confidence
intervals because of the high noise in them. However, the CI check is
still needed for the control test. This change adds a condition so that
the confidence interval is checked only for the control test, which
helps avoid flagging flaky failures.
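
As a rough illustration of the effect, here is a minimal, self-contained
sketch. is_noise_failure is a hypothetical helper invented for this
example; CI_ERROR_MARGIN and the mean-of-ci_095 computation mirror the
diff below.

    import numpy as np

    CI_ERROR_MARGIN = 1.5  # value introduced by this change

    def is_noise_failure(run_cis, upper_limit_ci, is_control):
      # Average the measured 95% confidence intervals across runs, then
      # compare against the recorded upper limit scaled by the margin.
      measured_ci = np.mean(np.array(run_cis))
      # After this change, only control stories can fail on noise alone.
      return is_control and measured_ci > upper_limit_ci * CI_ERROR_MARGIN

    # The same noisy runs no longer flag a regular story, but still flag
    # a control story: mean([2.0, 15.0, 16.0]) = 11.0 > 2.0 * 1.5 = 3.0.
    assert not is_noise_failure([2.0, 15.0, 16.0], 2.0, is_control=False)
    assert is_noise_failure([2.0, 15.0, 16.0], 2.0, is_control=True)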

TBR=crouleau@chromium.org

Bug: chromium:1052054
Change-Id: Ibbacb64229d06fc8b896cce28eb07d41bed62885
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2065112
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: Caleb Rouleau <crouleau@chromium.org>
Reviewed-by: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: Sadrul Chowdhury <sadrul@chromium.org>
Cr-Commit-Position: refs/heads/master@{#743238}
parent b87fe3a6
@@ -35,7 +35,7 @@ AVG_ERROR_MARGIN = 1.1
 # recorded range between upper and lower CIs. CI_ERROR_MARGIN is the maximum
 # acceptable ratio of calculated ci_095 to the recorded ones.
 # TODO(behdadb) crbug.com/1052054
-CI_ERROR_MARGIN = 30.0
+CI_ERROR_MARGIN = 1.5
 
 class ResultRecorder(object):
   def __init__(self):
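
To make the tightened margin concrete: with CI_ERROR_MARGIN = 1.5, a story
whose recorded upper-limit ci_095 is, say, 2.0 now fails the noise check once
its measured mean ci_095 exceeds 2.0 * 1.5 = 3.0, whereas the old margin of
30.0 tolerated anything up to 60.0. (The 2.0 is an illustrative value; the
actual limits come from the recorded upper_limit_data.)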
@@ -174,13 +174,12 @@ def compare_values(values_per_story, upper_limit_data, benchmark,
     measured_avg = np.mean(np.array(values_per_story[story_name]['averages']))
     measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095']))
-    if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN):
+    if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
+        is_control_story(upper_limit_data[story_name])):
       print(('[ FAILED ] {}/{} frame_times has higher noise ({:.3f}) ' +
              'compared to upper limit ({:.3f})').format(
                benchmark, story_name, measured_ci, upper_limit_ci))
-      result_recorder.add_failure(story_name, benchmark,
-                                  is_control_story(upper_limit_data[story_name]))
+      result_recorder.add_failure(story_name, benchmark, True)
     elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
       print(('[ FAILED ] {}/{} higher average frame_times({:.3f}) compared' +
              ' to upper limit ({:.3f})').format(
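Note that the third argument to add_failure (the is-control flag) becomes a
constant: since the noise branch is now only reachable for control stories,
the is_control_story lookup in the call is redundant and the call simply
passes True.
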
@@ -79,15 +79,15 @@ def create_sample_perf_results(passed_stories, failed_stories, benchmark):
 class TestRepresentativePerfScript(unittest.TestCase):
   def test_parse_csv_results(self):
     csv_obj = create_sample_input([
-      ['story_1', 'frame_times', 16, 10, 30],
-      ['story_2', 'latency', 10, 8, 80], # Record for a different metric.
-      ['story_3', 'frame_times', 8, 20, 40],
-      ['story_4', 'frame_times', '', 10, 20], # Record with no avg.
-      ['story_5', 'frame_times', 12, 0, 60], # Record with count of 0.
-      ['story_6', 'frame_times', 12, 40, 800], # High noise record.
-      ['story_7', 'frame_times', 12, 40, 90],
-      ['story_3', 'frame_times', 7, 20, 300],
-      ['story_3', 'frame_times', 12, 20, 320]
+      ['story_1', 'frame_times', 16, 10, 1.5],
+      ['story_2', 'latency', 10, 8, 4], # Record for a different metric.
+      ['story_3', 'frame_times', 8, 20, 2],
+      ['story_4', 'frame_times', '', 10, 1], # Record with no avg.
+      ['story_5', 'frame_times', 12, 0, 3], # Record with count of 0.
+      ['story_6', 'frame_times', 12, 40, 40], # High noise record.
+      ['story_7', 'frame_times', 12, 40, 4],
+      ['story_3', 'frame_times', 7, 20, 15],
+      ['story_3', 'frame_times', 12, 20, 16]
     ])
     values_per_story = perf_tests.parse_csv_results(csv_obj,
                                                     UPPER_LIMIT_DATA_SAMPLE)
@@ -95,11 +95,11 @@ class TestRepresentativePerfScript(unittest.TestCase):
     # All stories but story_2 & story_7.
     self.assertEquals(len(values_per_story), 5)
     self.assertEquals(values_per_story['story_1']['averages'], [16.0])
-    self.assertEquals(values_per_story['story_1']['ci_095'], [30])
+    self.assertEquals(values_per_story['story_1']['ci_095'], [1.5])
     # Record with avg 12 has high noise.
     self.assertEquals(values_per_story['story_3']['averages'], [8.0, 7.0])
-    self.assertEquals(values_per_story['story_3']['ci_095'], [40, 300, 320])
+    self.assertEquals(values_per_story['story_3']['ci_095'], [2.0, 15.0, 16.0])
     self.assertEquals(len(values_per_story['story_4']['averages']), 0)
     self.assertEquals(len(values_per_story['story_4']['ci_095']), 0)
@@ -108,17 +108,17 @@ class TestRepresentativePerfScript(unittest.TestCase):
     # High noise record will be filtered.
     self.assertEquals(len(values_per_story['story_6']['averages']), 0)
-    self.assertEquals(values_per_story['story_6']['ci_095'], [800.0])
+    self.assertEquals(values_per_story['story_6']['ci_095'], [40.0])
 
   def test_compare_values_1(self):
     values_per_story = {
       'story_1': {
         'averages': [16.0, 17.0, 21.0],
-        'ci_095': [40, 300, 320],
+        'ci_095': [2.0, 15.0, 16.0],
       },
       'story_2': {
         'averages': [16.0, 17.0, 22.0],
-        'ci_095': [20, 28, 24],
+        'ci_095': [1.0, 1.4, 1.2],
       }
     }
     benchmark = 'rendering.desktop'
@@ -143,15 +143,15 @@ class TestRepresentativePerfScript(unittest.TestCase):
     values_per_story = {
       'story_1': {
         'averages': [16.0, 17.0, 21.0],
-        'ci_095': [40, 300, 320],
+        'ci_095': [2.0, 15.0, 16.0],
       },
       'story_3': { # Two of the runs have acceptable CI but high averages.
         'averages': [10, 13],
-        'ci_095': [280, 320, 240]
+        'ci_095': [14, 16, 12]
       },
       'story_4': { # All runs have high noise.
         'averages': [],
-        'ci_095': [320, 340, 360],
+        'ci_095': [16, 17, 18],
      },
       'story_5': { # No recorded values.
         'averages': [],