Commit 29130140 authored by behdad, committed by Commit Bot

Failure invalidation of rep_perf tests should show green status

This change reports a test as passing when its failure is invalidated, and adds an invalidation_reason field to output.json recording why the failure was invalidated.
The two invalidation reasons used are:
- Noisy control test
- Low cpu_wall_time_ratio


Bug: chromium:1106934
Change-Id: Ib9217352fc8e779588a67093acaba2538b97a263
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2315539
Reviewed-by: John Chen <johnchen@chromium.org>
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#791317}
parent 1ff3c501
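
As a rough illustration of the net effect on output.json (not part of the diff below; the benchmark and story names here are placeholders, and only the 'actual', 'is_unexpected', and 'invalidation_reason' keys are taken from the change itself), an invalidated failure ends up recorded like this:

# Sketch of an invalidated failure in output.json after this change.
# remove_failure() flips 'actual' back to 'PASS' and clears 'is_unexpected';
# add_invalidation_reason() records why the failure was discarded.
invalidated_entry = {
    'tests': {
        'rendering.desktop': {            # benchmark name: illustrative only
            'story_1': {                  # story name: illustrative only
                'actual': 'PASS',
                'is_unexpected': False,
                # Either 'Noisy control test' or 'Low cpu_wall_time_ratio'.
                'invalidation_reason': 'Noisy control test',
            },
        },
    },
    'num_failures_by_type': {'PASS': 1, 'FAIL': 0},
}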
@@ -64,20 +64,26 @@ class ResultRecorder(object):
     if is_control:
       self._noisy_control_stories.add(name)
 
-  def remove_failure(self, name, benchmark, is_control=False):
+  def remove_failure(self, name, benchmark, is_control=False,
+                     invalidation_reason=None):
     self.output['tests'][benchmark][name]['actual'] = 'PASS'
     self.output['tests'][benchmark][name]['is_unexpected'] = False
     self._failed_stories.remove(name)
     self.fails -= 1
     if is_control:
       self._noisy_control_stories.remove(name)
+    if invalidation_reason:
+      self.add_invalidation_reason(name, benchmark, invalidation_reason)
 
   def invalidate_failures(self, benchmark):
     # The method is for invalidating the failures in case of noisy control test
-    for story in self._failed_stories:
+    for story in self._failed_stories.copy():
       print(story + ' [Invalidated Failure]: The story failed but was ' +
             'invalidated as a result of noisy control test.')
-      self.output['tests'][benchmark][story]['is_unexpected'] = False
+      self.remove_failure(story, benchmark, False, 'Noisy control test')
+
+  def add_invalidation_reason(self, name, benchmark, reason):
+    self.output['tests'][benchmark][name]['invalidation_reason'] = reason
 
   @property
   def failed_stories(self):
@@ -90,10 +96,7 @@ class ResultRecorder(object):
 
   def get_output(self, return_code):
     self.output['seconds_since_epoch'] = time.time() - self.start_time
     self.output['num_failures_by_type']['PASS'] = self.tests - self.fails
-    if self.fails > 0 and not self.is_control_stories_noisy:
-      self.output['num_failures_by_type']['FAIL'] = self.fails
-    else:
-      self.output['num_failures_by_type']['FAIL'] = 0
+    self.output['num_failures_by_type']['FAIL'] = self.fails
     if return_code == 1:
       self.output['interrupted'] = True
@@ -101,7 +104,7 @@ class ResultRecorder(object):
     tests = lambda n: plural(n, 'test', 'tests')
 
     print('[ PASSED ] ' + tests(self.tests - self.fails) + '.')
-    if self.fails > 0 and not self.is_control_stories_noisy:
+    if self.fails > 0:
       print('[ FAILED ] ' + tests(self.fails) + '.')
       self.return_code = 1
@@ -250,6 +253,8 @@ class RenderingRepresentativePerfTest(object):
               'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio'
             ).format(self.benchmark, story_name, METRIC_NAME, measured_avg,
                      upper_limit_avg))
+          self.result_recorder[rerun].add_invalidation_reason(
+              story_name, self.benchmark, 'Low cpu_wall_time_ratio')
         else:
           print(('[ OK ] {}/{} lower average {}({:.3f}) compared ' +
                  'to upper limit({:.3f}).').format(self.benchmark, story_name,
@@ -388,4 +393,4 @@ if __name__ == '__main__':
     'compile_targets': main_compile_targets,
   }
   sys.exit(common.run_script(sys.argv[1:], funcs))
   sys.exit(main())
\ No newline at end of file
@@ -219,14 +219,74 @@ class TestRepresentativePerfScript(unittest.TestCase):
     result_recorder.invalidate_failures(BENCHMARK)
     (output, overall_return_code) = result_recorder.get_output(0)
 
-    self.assertEquals(overall_return_code, 0)
-    self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 0)
-    self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
-    self.assertEquals(output['tests'][BENCHMARK]['story_3']['actual'], 'FAIL')
-    self.assertEquals(output['tests'][BENCHMARK]['story_4']['actual'], 'FAIL')
+    self.assertEquals(overall_return_code, 1)
+    self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 1)
+    self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
+    self.assertEquals(output['tests'][BENCHMARK]['story_2']['actual'], 'FAIL')
+    self.assertEquals(output['tests'][BENCHMARK]['story_3']['actual'], 'PASS')
+    self.assertEquals(output['tests'][BENCHMARK]['story_4']['actual'], 'PASS')
+
+  # Invalidating failure as a result of noisy control test
+  def test_compare_values_3(self):
+    values_per_story = {
+      'story_1': {
+        'averages': [16.0, 17.0, 21.0],
+        'ci_095': [2.0, 15.0, 16.0],
+        'cpu_wall_time_ratio': [0.45, 0.42],
+      },
+      'story_3': { # Two of the runs have acceptable CI but high averages.
+        'averages': [10, 13],
+        'ci_095': [14, 16, 12],
+        'cpu_wall_time_ratio': [0.5, 0.52],
+      },
+      'story_4': { # All runs have high noise.
+        'averages': [],
+        'ci_095': [16, 17, 18],
+        'cpu_wall_time_ratio': [],
+      },
+      'story_5': { # No recorded values.
+        'averages': [],
+        'ci_095': [],
+        'cpu_wall_time_ratio': [],
+      }
+    }
+    sample_perf_results = create_sample_perf_results(
+      ['story_1', 'story_3', 'story_4', 'story_5'], [], BENCHMARK)
+
+    rerun = True
+    perf_test = perf_test_initializer()
+    perf_test.result_recorder[rerun].set_tests(sample_perf_results)
+    self.assertEquals(perf_test.result_recorder[rerun].fails, 0)
+
+    perf_test.compare_values(values_per_story, rerun)
+    result_recorder = perf_test.result_recorder[rerun]
+    self.assertEquals(result_recorder.tests, 4)
+    self.assertEquals(result_recorder.failed_stories,
+                      set(['story_3', 'story_4', 'story_5']))
+    self.assertTrue(result_recorder.is_control_stories_noisy)
+
+    result_recorder.invalidate_failures(BENCHMARK)
+    (output, overall_return_code) = result_recorder.get_output(0)
+
+    self.assertEquals(overall_return_code, 0)
+    self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 0)
+    self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
+    self.assertEquals(output['tests'][BENCHMARK]['story_3']['actual'], 'PASS')
+    self.assertEquals(output['tests'][BENCHMARK]['story_4']['actual'], 'PASS')
+    self.assertEquals(output['tests'][BENCHMARK]['story_5']['actual'], 'PASS')
+    self.assertEquals(
+        output['tests'][BENCHMARK]['story_3']['invalidation_reason'],
+        'Noisy control test')
+    self.assertEquals(
+        output['tests'][BENCHMARK]['story_4']['invalidation_reason'],
+        'Noisy control test')
+    self.assertEquals(
+        output['tests'][BENCHMARK]['story_5']['invalidation_reason'],
+        'Noisy control test')
 
   # Experimental stories should not fail the test
-  def test_compare_values_3(self):
+  def test_compare_values_4(self):
     values_per_story = {
       'story_1': {
         'averages': [16.0, 17.0, 21.0],
@@ -254,10 +314,42 @@ class TestRepresentativePerfScript(unittest.TestCase):
     self.assertEquals(result_recorder.tests, 2)
     self.assertEquals(result_recorder.failed_stories, set([]))
 
+    result_recorder.invalidate_failures(BENCHMARK)
     (output, overall_return_code) = result_recorder.get_output(0)
 
     self.assertEquals(overall_return_code, 0)
     self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 0)
     self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
     self.assertEquals(output['tests'][BENCHMARK]['story_7']['actual'], 'PASS')
 
+  # Low cpu_wall_time_ratio invalidates the failure
+  def test_compare_values_5(self):
+    values_per_story = {
+      'story_1': {
+        'averages': [26.0, 27.0, 21.0],
+        'ci_095': [2.0, 15.0, 16.0],
+        'cpu_wall_time_ratio': [0.35, 0.42, 0.34],
+        # Higher avg than upper limit with low cpu_wall_time_ratio
+      }
+    }
+    sample_perf_results = create_sample_perf_results(['story_1'], [], BENCHMARK)
+
+    rerun = False
+    perf_test = perf_test_initializer()
+    perf_test.result_recorder[rerun].set_tests(sample_perf_results)
+    self.assertEquals(perf_test.result_recorder[rerun].fails, 0)
+
+    perf_test.compare_values(values_per_story, rerun)
+    result_recorder = perf_test.result_recorder[rerun]
+    self.assertEquals(result_recorder.tests, 1)
+    self.assertEquals(result_recorder.failed_stories, set([]))
+
+    result_recorder.invalidate_failures(BENCHMARK)
+    (output, overall_return_code) = result_recorder.get_output(0)
+
+    self.assertEquals(overall_return_code, 0)
+    self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 0)
+    self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
+    self.assertEquals(
+        output['tests'][BENCHMARK]['story_1']['invalidation_reason'],
+        'Low cpu_wall_time_ratio')
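
Because invalidated failures are now reported as green, a downstream consumer can still tell an invalidated pass apart from a genuine one by checking the new field. A minimal sketch, assuming the output.json layout written by ResultRecorder above (the file path here is only an example):

import json

# Hypothetical path; use wherever the recorder's output.json is written on the bot.
with open('output.json') as f:
    results = json.load(f)

for benchmark, stories in results['tests'].items():
    for story, result in stories.items():
        reason = result.get('invalidation_reason')
        if result['actual'] == 'PASS' and reason:
            # Story passed only because its failure was invalidated
            # ('Noisy control test' or 'Low cpu_wall_time_ratio').
            print('{}/{} invalidated: {}'.format(benchmark, story, reason))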