Commit feffdfca authored by behdad, committed by Commit Bot

Use confidence interval only for control tests

In a previous change (crrev.com/2055047) we stopped using the confidence
intervals because of the high noise in them. However, the CI check is
still needed for the control test. This change adds a condition so that
the confidence interval is checked only for the control test, which
helps avoid flagging flaky failures.
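
As a rough illustration of the effect, here is a minimal, self-contained
sketch. is_noise_failure is a hypothetical helper invented for this
example; CI_ERROR_MARGIN and the mean-of-ci_095 computation mirror the
diff below.

    import numpy as np

    CI_ERROR_MARGIN = 1.5  # value introduced by this change

    def is_noise_failure(run_cis, upper_limit_ci, is_control):
      # Average the measured 95% confidence intervals across runs, then
      # compare against the recorded upper limit scaled by the margin.
      measured_ci = np.mean(np.array(run_cis))
      # After this change, only control stories can fail on noise alone.
      return is_control and measured_ci > upper_limit_ci * CI_ERROR_MARGIN

    # The same noisy runs no longer flag a regular story, but still flag
    # a control story: mean([2.0, 15.0, 16.0]) = 11.0 > 2.0 * 1.5 = 3.0.
    assert not is_noise_failure([2.0, 15.0, 16.0], 2.0, is_control=False)
    assert is_noise_failure([2.0, 15.0, 16.0], 2.0, is_control=True)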

TBR=crouleau@chromium.org

Bug: chromium:1052054
Change-Id: Ibbacb64229d06fc8b896cce28eb07d41bed62885
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2065112
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: Caleb Rouleau <crouleau@chromium.org>
Reviewed-by: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: Sadrul Chowdhury <sadrul@chromium.org>
Cr-Commit-Position: refs/heads/master@{#743238}
parent b87fe3a6
@@ -35,7 +35,7 @@ AVG_ERROR_MARGIN = 1.1
 # recorded range between upper and lower CIs. CI_ERROR_MARGIN is the maximum
 # acceptable ratio of calculated ci_095 to the recorded ones.
 # TODO(behdadb) crbug.com/1052054
-CI_ERROR_MARGIN = 30.0
+CI_ERROR_MARGIN = 1.5
 
 class ResultRecorder(object):
   def __init__(self):
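
To make the tightened margin concrete: with CI_ERROR_MARGIN = 1.5, a story
whose recorded upper-limit ci_095 is, say, 2.0 now fails the noise check once
its measured mean ci_095 exceeds 2.0 * 1.5 = 3.0, whereas the old margin of
30.0 tolerated anything up to 60.0. (The 2.0 is an illustrative value; the
actual limits come from the recorded upper_limit_data.)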
@@ -174,13 +174,12 @@ def compare_values(values_per_story, upper_limit_data, benchmark,
     measured_avg = np.mean(np.array(values_per_story[story_name]['averages']))
     measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095']))
-    if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN):
+    if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
+        is_control_story(upper_limit_data[story_name])):
       print(('[ FAILED ] {}/{} frame_times has higher noise ({:.3f}) ' +
              'compared to upper limit ({:.3f})').format(
                benchmark, story_name, measured_ci, upper_limit_ci))
-      result_recorder.add_failure(story_name, benchmark,
-                                  is_control_story(upper_limit_data[story_name]))
+      result_recorder.add_failure(story_name, benchmark, True)
     elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
       print(('[ FAILED ] {}/{} higher average frame_times({:.3f}) compared' +
              ' to upper limit ({:.3f})').format(
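Note that the third argument to add_failure (the is-control flag) becomes a
constant: since the noise branch is now only reachable for control stories,
the is_control_story lookup in the call is redundant and the call simply
passes True.
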
@@ -79,15 +79,15 @@ def create_sample_perf_results(passed_stories, failed_stories, benchmark):
 class TestRepresentativePerfScript(unittest.TestCase):
   def test_parse_csv_results(self):
     csv_obj = create_sample_input([
-      ['story_1', 'frame_times', 16, 10, 30],
-      ['story_2', 'latency', 10, 8, 80], # Record for a different metric.
-      ['story_3', 'frame_times', 8, 20, 40],
-      ['story_4', 'frame_times', '', 10, 20], # Record with no avg.
-      ['story_5', 'frame_times', 12, 0, 60], # Record with count of 0.
-      ['story_6', 'frame_times', 12, 40, 800], # High noise record.
-      ['story_7', 'frame_times', 12, 40, 90],
-      ['story_3', 'frame_times', 7, 20, 300],
-      ['story_3', 'frame_times', 12, 20, 320]
+      ['story_1', 'frame_times', 16, 10, 1.5],
+      ['story_2', 'latency', 10, 8, 4], # Record for a different metric.
+      ['story_3', 'frame_times', 8, 20, 2],
+      ['story_4', 'frame_times', '', 10, 1], # Record with no avg.
+      ['story_5', 'frame_times', 12, 0, 3], # Record with count of 0.
+      ['story_6', 'frame_times', 12, 40, 40], # High noise record.
+      ['story_7', 'frame_times', 12, 40, 4],
+      ['story_3', 'frame_times', 7, 20, 15],
+      ['story_3', 'frame_times', 12, 20, 16]
     ])
     values_per_story = perf_tests.parse_csv_results(csv_obj,
                                                     UPPER_LIMIT_DATA_SAMPLE)
@@ -95,11 +95,11 @@ class TestRepresentativePerfScript(unittest.TestCase):
     # All stories but story_2 & story_7.
     self.assertEquals(len(values_per_story), 5)
     self.assertEquals(values_per_story['story_1']['averages'], [16.0])
-    self.assertEquals(values_per_story['story_1']['ci_095'], [30])
+    self.assertEquals(values_per_story['story_1']['ci_095'], [1.5])
     # Record with avg 12 has high noise.
     self.assertEquals(values_per_story['story_3']['averages'], [8.0, 7.0])
-    self.assertEquals(values_per_story['story_3']['ci_095'], [40, 300, 320])
+    self.assertEquals(values_per_story['story_3']['ci_095'], [2.0, 15.0, 16.0])
     self.assertEquals(len(values_per_story['story_4']['averages']), 0)
     self.assertEquals(len(values_per_story['story_4']['ci_095']), 0)
@@ -108,17 +108,17 @@ class TestRepresentativePerfScript(unittest.TestCase):
     # High noise record will be filtered.
     self.assertEquals(len(values_per_story['story_6']['averages']), 0)
-    self.assertEquals(values_per_story['story_6']['ci_095'], [800.0])
+    self.assertEquals(values_per_story['story_6']['ci_095'], [40.0])
 
   def test_compare_values_1(self):
     values_per_story = {
       'story_1': {
         'averages': [16.0, 17.0, 21.0],
-        'ci_095': [40, 300, 320],
+        'ci_095': [2.0, 15.0, 16.0],
       },
       'story_2': {
         'averages': [16.0, 17.0, 22.0],
-        'ci_095': [20, 28, 24],
+        'ci_095': [1.0, 1.4, 1.2],
       }
     }
     benchmark = 'rendering.desktop'
@@ -143,15 +143,15 @@ class TestRepresentativePerfScript(unittest.TestCase):
     values_per_story = {
       'story_1': {
         'averages': [16.0, 17.0, 21.0],
-        'ci_095': [40, 300, 320],
+        'ci_095': [2.0, 15.0, 16.0],
       },
       'story_3': { # Two of the runs have acceptable CI but high averages.
         'averages': [10, 13],
-        'ci_095': [280, 320, 240]
+        'ci_095': [14, 16, 12]
       },
       'story_4': { # All runs have high noise.
         'averages': [],
-        'ci_095': [320, 340, 360],
+        'ci_095': [16, 17, 18],
      },
       'story_5': { # No recorded values.
         'averages': [],