Commit 5068d536 authored by behdad, committed by Commit Bot

Stop failures caused by noise by using a control test

The main_30fps_impl_60fps story of the representative perf tests is
marked as a control test. If this story fails because of high noise,
the whole run is treated as unreliable: failures of the other stories
do not fail the test, resulting in fewer false positives.

Bug: chromium:1029846
Change-Id: I50c8e41108d584f2f5cbcf095be50dc09228406b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2025510
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: Sadrul Chowdhury <sadrul@chromium.org>
Reviewed-by: Caleb Rouleau <crouleau@chromium.org>
Cr-Commit-Position: refs/heads/master@{#738735}
parent 2d7e2dd9
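In short, the gating now works as follows: a story whose upper-limit entry
carries "control": true is expected to be stable, so when it fails the ci_095
(noise) check the whole run is considered too noisy to trust, and none of the
failures in that run are allowed to set a failing return code. The sketch
below illustrates only this decision; it is a simplified stand-in for the
ResultRecorder logic in the diff, not the harness code itself, and the
function name and sample story names are illustrative.

# Minimal sketch of the gating decision described above (illustrative only;
# the real logic lives in ResultRecorder.get_output(), shown in the diff).
def overall_return_code(failed_stories, noisy_control_stories):
  # failed_stories: stories whose frame_times exceeded their upper limits.
  # noisy_control_stories: control stories that failed the ci_095 check.
  if noisy_control_stories:
    # The run itself is too noisy to trust, so no failure is reported.
    print('Control story has high noise. These runs are not reliable!')
    return 0
  return 1 if failed_stories else 0

# Only the control story is noisy: the run still passes.
assert overall_return_code({'main_30fps_impl_60fps'},
                           {'main_30fps_impl_60fps'}) == 0
# A regular story regressed and the control story is quiet: the run fails.
assert overall_return_code({'new_tilings'}, set()) == 1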
@@ -50,7 +50,8 @@
   },
   "main_30fps_impl_60fps": {
     "ci_095": 0.121,
-    "avg": 16.689
+    "avg": 16.689,
+    "control": true
   },
   "css_value_type_shadow": {
     "ci_095": 1.306,
@@ -96,7 +97,8 @@
   },
   "main_30fps_impl_60fps": {
     "ci_095": 0.185,
-    "avg": 16.695
+    "avg": 16.695,
+    "control": true
   },
   "ie_chalkboard": {
     "ci_095": 43.55,
@@ -142,7 +144,8 @@
   },
   "main_30fps_impl_60fps": {
     "ci_095": 0.603,
-    "avg": 18.009
+    "avg": 18.009,
+    "control": true
  },
   "new_tilings": {
     "ci_095": 1.008,
@@ -44,30 +44,39 @@ class ResultRecorder(object):
     self.output = {}
     self.return_code = 0
     self._failed_stories = set()
+    self._noisy_control_stories = set()
+    # Set of _noisy_control_stories keeps track of control tests which failed
+    # because of high noise values.

   def set_tests(self, output):
     self.output = output
     self.fails = 0
-    if 'FAIL' in output['num_failures_by_type']:
-      self.fails = output['num_failures_by_type']['FAIL']
-    self.tests = self.fails + output['num_failures_by_type']['PASS']
+    self.fails = output['num_failures_by_type'].get('FAIL', 0)
+    self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0)

-  def add_failure(self, name, benchmark):
+  def add_failure(self, name, benchmark, is_control=False):
     self.output['tests'][benchmark][name]['actual'] = 'FAIL'
     self.output['tests'][benchmark][name]['is_unexpected'] = True
     self._failed_stories.add(name)
     self.fails += 1
+    if is_control:
+      self._noisy_control_stories.add(name)

-  def remove_failure(self, name, benchmark):
+  def remove_failure(self, name, benchmark, is_control=False):
     self.output['tests'][benchmark][name]['actual'] = 'PASS'
     self.output['tests'][benchmark][name]['is_unexpected'] = False
     self._failed_stories.remove(name)
     self.fails -= 1
+    if is_control:
+      self._noisy_control_stories.remove(name)

   @property
   def failed_stories(self):
     return self._failed_stories

+  @property
+  def is_control_stories_noisy(self):
+    return len(self._noisy_control_stories) > 0
+
   def get_output(self, return_code):
     self.output['seconds_since_epoch'] = time.time() - self.start_time
     self.output['num_failures_by_type']['PASS'] = self.tests - self.fails
@@ -82,6 +91,7 @@ class ResultRecorder(object):
     print('[ PASSED ] ' + tests(self.tests - self.fails) + '.')
     if self.fails > 0:
       print('[ FAILED ] ' + tests(self.fails) + '.')
-      self.return_code = 1
+      if not self.is_control_stories_noisy:
+        self.return_code = 1

     return (self.output, self.return_code)
@@ -159,7 +169,9 @@ def compare_values(values_per_story, upper_limit_data, benchmark,
       print(('[ FAILED ] {}/{} frame_times has higher noise ({:.3f}) ' +
             'compared to upper limit ({:.3f})').format(
               benchmark, story_name, measured_ci,upper_limit_ci))
-      result_recorder.add_failure(story_name, benchmark)
+      result_recorder.add_failure(story_name, benchmark,
+          is_control_story(upper_limit_data[story_name]))
     elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
       print(('[ FAILED ] {}/{} higher average frame_times({:.3f}) compared' +
             ' to upper limit ({:.3f})').format(
@@ -204,6 +216,9 @@ def replace_arg_values(args, key_value_pairs):
       args[index+1] = value
   return args

+def is_control_story(story_data):
+  return ('control' in story_data and story_data['control'] == True)
+
 def main():
   overall_return_code = 0
   options = parse_arguments()
@@ -282,7 +297,8 @@ def main():
       for story_name in result_recorder.failed_stories.copy():
         if story_name not in re_run_result_recorder.failed_stories:
-          result_recorder.remove_failure(story_name, benchmark)
+          result_recorder.remove_failure(story_name, benchmark,
+              is_control_story(upper_limit_data[platform][story_name]))

   (
     finalOut,
@@ -294,6 +310,10 @@ def main():
   with open(options.isolated_script_test_output, 'w') as outputFile:
     json.dump(finalOut, outputFile, indent=4)

+  if result_recorder.is_control_stories_noisy:
+    assert overall_return_code == 0
+    print('Control story has high noise. These runs are not reliable!')
+
   return overall_return_code

 def parse_arguments():
@@ -27,7 +27,8 @@ UPPER_LIMIT_DATA_SAMPLE = {
   },
   'story_4': {
     'ci_095': 10,
-    'avg': 10
+    'avg': 10,
+    'control': True,
   },
   'story_5': {
     'ci_095': 20,
@@ -132,6 +133,8 @@ class TestRepresentativePerfScript(unittest.TestCase):
                                     result_recorder)
     self.assertEquals(result_recorder.tests, 2)
     self.assertEquals(result_recorder.failed_stories, set(['story_2']))
+    (_, overall_return_code) = result_recorder.get_output(0)
+    self.assertEquals(overall_return_code, 1)

   def test_compare_values_2(self):
     values_per_story = {
@@ -141,7 +144,7 @@ class TestRepresentativePerfScript(unittest.TestCase):
       },
       'story_3': { # Two of the runs have acceptable CI but high averages.
         'averages': [10, 13],
-        'ci_095': [1.0, 1.4, 1.2],
+        'ci_095': [14, 16, 12]
       },
       'story_4': { # All runs have high noise.
         'averages': [],
@@ -167,3 +170,6 @@ class TestRepresentativePerfScript(unittest.TestCase):
     self.assertEquals(result_recorder.tests, 5)
     self.assertEquals(result_recorder.failed_stories,
                       set(['story_3', 'story_4', 'story_5']))
+    self.assertTrue(result_recorder.is_control_stories_noisy)
+    (_, overall_return_code) = result_recorder.get_output(0)
+    self.assertEquals(overall_return_code, 0)