Commit 0bd20fee authored by behdad, committed by Commit Bot

Low cpu_wall_time_ratio can invalidate failures

Use cpu_wall_time_ratio in the representative perf test logic to invalidate failures that had a lower-than-usual cpu_wall_time_ratio, in order to reduce flaky failures.
As a result, only stories whose frame_times exceed the limit and whose cpu_wall_time_ratio is acceptable will fail.

Bug: chromium:1052361
Change-Id: I294361375f91096a32e226f57dc61450926fab0b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2292811
Commit-Queue: Behdad Bakhshinategh <behdadb@chromium.org>
Reviewed-by: John Chen <johnchen@chromium.org>
Cr-Commit-Position: refs/heads/master@{#789189}
parent 2a39758f
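The decision this CL introduces can be sketched as a small standalone function. This is an illustration only: evaluate_story, its arguments, and the margin value are hypothetical, not the script's API; the real AVG_ERROR_MARGIN is defined in the script and is not shown in these hunks.

import numpy as np

AVG_ERROR_MARGIN = 1.1  # illustrative placeholder; the script defines the real margin

def evaluate_story(measured, limits):
  # measured: the per-story lists built by parse_csv_results
  #           ('averages', 'ci_095', 'cpu_wall_time_ratio')
  # limits:   one entry of the upper-limit data for that story
  measured_avg = np.mean(np.array(measured['averages']))
  measured_cpu_ratio = np.mean(np.array(measured['cpu_wall_time_ratio']))
  if measured_avg <= limits['avg'] * AVG_ERROR_MARGIN:
    return 'OK'  # frame_times within the allowed upper limit
  if measured_cpu_ratio >= limits['cpu_wall_time_ratio']:
    return 'FAIL'  # genuinely slow frames while the CPU was busy enough
  return 'OK (invalidated: low cpu_wall_time_ratio)'  # likely a starved or noisy run

In other words, a high frame_times average is now reported as a failure only when the run was also busy enough; otherwise the result is treated as too unreliable to fail on.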
@@ -180,8 +180,8 @@ class RenderingRepresentativePerfTest(object):
values_per_story = {}
for row in csv_obj:
# For now only frame_times is used for testing representatives'
-# performance.
-if row['name'] != METRIC_NAME:
+# performance and cpu_wall_time_ratio is used for validation.
+if row['name'] != METRIC_NAME and row['name'] != 'cpu_wall_time_ratio':
continue
story_name = row['stories']
if (story_name not in self.upper_limit_data):
@@ -189,13 +189,16 @@ class RenderingRepresentativePerfTest(object):
if story_name not in values_per_story:
values_per_story[story_name] = {
'averages': [],
-'ci_095': []
+'ci_095': [],
+'cpu_wall_time_ratio': []
}
-if (row['avg'] == '' or row['count'] == 0):
-continue
-values_per_story[story_name]['ci_095'].append(float(row['ci_095']))
-values_per_story[story_name]['averages'].append(float(row['avg']))
+if row['name'] == METRIC_NAME and row['avg'] != '' and row['count'] != 0:
+values_per_story[story_name]['ci_095'].append(float(row['ci_095']))
+values_per_story[story_name]['averages'].append(float(row['avg']))
+elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '':
+values_per_story[story_name]['cpu_wall_time_ratio'].append(
+float(row['avg']))
return values_per_story
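To make the new bucketing concrete, here is a self-contained illustration of what the loop above accumulates for one story. The dict rows mirror the unit-test input later in this change; they are not the script's real CSV objects, and the variable names are hypothetical.

sample_rows = [
    {'stories': 'story_1', 'name': 'frame_times', 'avg': 16, 'count': 10, 'ci_095': 1.5},
    {'stories': 'story_1', 'name': 'cpu_wall_time_ratio', 'avg': 0.5, 'count': 1, 'ci_095': 1},
]
values = {'averages': [], 'ci_095': [], 'cpu_wall_time_ratio': []}
for row in sample_rows:
  if row['name'] == 'frame_times' and row['avg'] != '' and row['count'] != 0:
    # frame_times rows feed both the average and the confidence interval lists
    values['ci_095'].append(float(row['ci_095']))
    values['averages'].append(float(row['avg']))
  elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '':
    # cpu_wall_time_ratio rows are collected separately, for validation only
    values['cpu_wall_time_ratio'].append(float(row['avg']))
print(values)  # {'averages': [16.0], 'ci_095': [1.5], 'cpu_wall_time_ratio': [0.5]}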
@@ -219,10 +222,14 @@ class RenderingRepresentativePerfTest(object):
self.result_recorder[rerun].add_failure(story_name, self.benchmark)
continue
-upper_limit_avg = self.upper_limit_data[story_name]['avg']
-upper_limit_ci = self.upper_limit_data[story_name]['ci_095']
+upper_limits = self.upper_limit_data
+upper_limit_avg = upper_limits[story_name]['avg']
+upper_limit_ci = upper_limits[story_name]['ci_095']
+lower_limit_cpu_ratio = upper_limits[story_name]['cpu_wall_time_ratio']
measured_avg = np.mean(np.array(values_per_story[story_name]['averages']))
measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095']))
+measured_cpu_ratio = np.mean(np.array(
+values_per_story[story_name]['cpu_wall_time_ratio']))
if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
self.is_control_story(story_name)):
@@ -233,10 +240,16 @@ class RenderingRepresentativePerfTest(object):
self.result_recorder[rerun].add_failure(
story_name, self.benchmark, True)
elif (measured_avg > upper_limit_avg * AVG_ERROR_MARGIN):
-print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' +
-' to upper limit ({:.3f})').format(self.benchmark, story_name,
-METRIC_NAME, measured_avg, upper_limit_avg))
-self.result_recorder[rerun].add_failure(story_name, self.benchmark)
+if (measured_cpu_ratio >= lower_limit_cpu_ratio):
+print(('[ FAILED ] {}/{} higher average {}({:.3f}) compared' +
+' to upper limit ({:.3f})').format(self.benchmark, story_name,
+METRIC_NAME, measured_avg, upper_limit_avg))
+self.result_recorder[rerun].add_failure(story_name, self.benchmark)
+else:
+print(('[ OK ] {}/{} higher average {}({:.3f}) compared ' +
+'to upper limit({:.3f}). Invalidated for low cpu_wall_time_ratio'
+).format(self.benchmark, story_name, METRIC_NAME, measured_avg,
+upper_limit_avg))
else:
print(('[ OK ] {}/{} lower average {}({:.3f}) compared ' +
'to upper limit({:.3f}).').format(self.benchmark, story_name,
@@ -18,32 +18,39 @@ BENCHMARK = 'rendering.desktop'
UPPER_LIMIT_DATA_SAMPLE = {
'story_1': {
'ci_095': 10,
-'avg': 20
+'avg': 20,
+'cpu_wall_time_ratio': 0.4,
},
'story_2': {
'ci_095': 10,
-'avg': 16
+'avg': 16,
+'cpu_wall_time_ratio': 0.3,
},
'story_3': {
'ci_095': 10,
-'avg': 10
+'avg': 10,
+'cpu_wall_time_ratio': 0.5,
},
'story_4': {
'ci_095': 10,
'avg': 10,
+'cpu_wall_time_ratio': 0.5,
'control': True,
},
'story_5': {
'ci_095': 20,
-'avg': 10
+'avg': 10,
+'cpu_wall_time_ratio': 0.5,
},
'story_6': {
'ci_095': 20,
-'avg': 10
+'avg': 10,
+'cpu_wall_time_ratio': 0.5,
},
'story_7': {
'ci_095': 20,
'avg': 10,
+'cpu_wall_time_ratio': 0.5,
'experimental': True,
},
}
@@ -96,14 +103,18 @@ class TestRepresentativePerfScript(unittest.TestCase):
def test_parse_csv_results(self):
csv_obj = create_sample_input([
['story_1', 'frame_times', 16, 10, 1.5],
+['story_1', 'cpu_wall_time_ratio', 0.5, 1, 1],
['story_2', 'latency', 10, 8, 4], # Record for a different metric.
['story_3', 'frame_times', 8, 20, 2],
+['story_3', 'frame_times', 7, 20, 15],
+['story_3', 'frame_times', 12, 20, 16],
+['story_3', 'cpu_wall_time_ratio', 0.3, 1, 1],
+['story_3', 'cpu_wall_time_ratio', 0.7, 1, 1],
+['story_3', 'cpu_wall_time_ratio', '', 0, 1],
['story_4', 'frame_times', '', 10, 1], # Record with no avg.
['story_5', 'frame_times', 12, 0, 3], # Record with count of 0.
['story_6', 'frame_times', 12, 40, 40], # High noise record.
['story_8', 'frame_times', 12, 40, 4],
-['story_3', 'frame_times', 7, 20, 15],
-['story_3', 'frame_times', 12, 20, 16]
])
perf_test = perf_test_initializer()
@@ -114,10 +125,13 @@ class TestRepresentativePerfScript(unittest.TestCase):
self.assertEquals(len(values_per_story), 5)
self.assertEquals(values_per_story['story_1']['averages'], [16.0])
self.assertEquals(values_per_story['story_1']['ci_095'], [1.5])
+self.assertEquals(values_per_story['story_1']['cpu_wall_time_ratio'], [0.5])
# Record with avg 12 has high noise.
self.assertEquals(values_per_story['story_3']['averages'], [8.0, 7.0, 12.0])
self.assertEquals(values_per_story['story_3']['ci_095'], [2.0, 15.0, 16.0])
+self.assertEquals(values_per_story['story_3']['cpu_wall_time_ratio'],
+[0.3, 0.7])
self.assertEquals(len(values_per_story['story_4']['averages']), 0)
self.assertEquals(len(values_per_story['story_4']['ci_095']), 0)
@@ -131,46 +145,59 @@ class TestRepresentativePerfScript(unittest.TestCase):
'story_1': {
'averages': [16.0, 17.0, 21.0],
'ci_095': [2.0, 15.0, 16.0],
+'cpu_wall_time_ratio': [0.5, 0.52, 0.57]
},
'story_2': {
'averages': [16.0, 17.0, 22.0],
'ci_095': [1.0, 1.4, 1.2],
+'cpu_wall_time_ratio': [0.3, 0.3, 0.3]
},
'story_3': {
'averages': [20.0, 15.0, 22.0],
'ci_095': [1.0, 0.8, 1.2],
+'cpu_wall_time_ratio': [0.5, 0.5, 0.49]
}
}
-sample_perf_results = create_sample_perf_results(['story_1', 'story_2'], [],
-BENCHMARK)
+sample_perf_results = create_sample_perf_results(
+['story_1', 'story_2', 'story_3'], [], BENCHMARK)
rerun = False
perf_test = perf_test_initializer()
perf_test.result_recorder[rerun].set_tests(sample_perf_results)
perf_test.compare_values(values_per_story, rerun)
result_recorder = perf_test.result_recorder[rerun]
-self.assertEquals(result_recorder.tests, 2)
+self.assertEquals(result_recorder.tests, 3)
+# The failure for story_3 is invalidated (low cpu_wall_time_ratio)
self.assertEquals(result_recorder.failed_stories, set(['story_2']))
(output, overall_return_code) = result_recorder.get_output(0)
self.assertEquals(overall_return_code, 1)
self.assertEquals(output['num_failures_by_type'].get('FAIL', 0), 1)
self.assertEquals(output['tests'][BENCHMARK]['story_1']['actual'], 'PASS')
self.assertEquals(output['tests'][BENCHMARK]['story_2']['actual'], 'FAIL')
+self.assertEquals(output['tests'][BENCHMARK]['story_3']['actual'], 'PASS')
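# Worked numbers behind the PASS above (a sketch; the exact AVG_ERROR_MARGIN
# constant lives in the script and is only assumed here to be slightly above 1):
#   story_3: mean(averages) = mean([20.0, 15.0, 22.0]) = 19.0, far above its
#            upper-limit avg of 10, so it would normally fail; but
#            mean(cpu_wall_time_ratio) = mean([0.5, 0.5, 0.49]) ~ 0.497 is
#            below the 0.5 lower limit, so the failure is invalidated -> PASS.
#   story_2: mean(averages) ~ 18.33 is above its limit of 16, and
#            mean(cpu_wall_time_ratio) = 0.3 >= the 0.3 lower limit -> FAIL.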
def test_compare_values_2(self):
values_per_story = {
'story_1': {
'averages': [16.0, 17.0, 21.0],
'ci_095': [2.0, 15.0, 16.0],
+'cpu_wall_time_ratio': [0.45, 0.42],
},
'story_3': { # Two of the runs have acceptable CI but high averages.
'averages': [10, 13],
-'ci_095': [14, 16, 12]
+'ci_095': [14, 16, 12],
+'cpu_wall_time_ratio': [0.5, 0.52],
},
'story_4': { # All runs have high noise.
'averages': [],
'ci_095': [16, 17, 18],
+'cpu_wall_time_ratio': [],
},
'story_5': { # No recorded values.
'averages': [],
'ci_095': [],
+'cpu_wall_time_ratio': [],
}
}
@@ -204,11 +231,13 @@ class TestRepresentativePerfScript(unittest.TestCase):
'story_1': {
'averages': [16.0, 17.0, 21.0],
'ci_095': [2.0, 15.0, 16.0],
+'cpu_wall_time_ratio': [0.45, 0.42, 0.44],
},
'story_7':
{ # Experimental story with higher value than the upper limit.
'averages': [20, 26],
-'ci_095': [14, 16]
+'ci_095': [14, 16],
+'cpu_wall_time_ratio': [0.45, 0.42, 0.44],
}
}