Refactor CalculateConfidence and add unit test.

This version is a little bit different from the last; main differences:

 - I saw that "CalculateBounds" was returning the minimum and maximum of the means of the input lists, so I thought it would be easier to understand if this was done by mapping CalculateMean and then directly making a tuple with min/max.
 - Since CalculateTruncatedMean(xs, 0) is just the mean, and this is used in several places, I extracted a convenience function CalculateMean (a rough sketch of these helpers follows this list).
 - CalculateConfidence doesn't refer to self, so I thought it was reasonable to move it up to be a top-level function with the other stats functions.
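
For reference, here is a rough sketch of those two pieces; the real code is in the diff below:

    def CalculateMean(values):
      """Calculates the arithmetic mean of a list of values."""
      return CalculateTruncatedMean(values, 0.0)

    # Bounds of a group: per-list means, then a (min, max) tuple.
    means_good = map(CalculateMean, good_results_lists)
    bounds_good = (min(means_good), max(means_good))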

BUG=

Review URL: https://codereview.chromium.org/220113012

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@262924 0039d316-1c4b-4281-b951-d872f2087c98
@@ -189,13 +189,20 @@ def _AddAdditionalDepotInfo(depot_info):
def CalculateTruncatedMean(data_set, truncate_percent):
"""Calculates the truncated mean of a set of values.
Note that this isn't just the mean of the set of values with the highest
and lowest values discarded; the non-discarded values are also weighted
differently, depending on how many values are discarded.
Args:
data_set: Set of values to use in calculation.
truncate_percent: The % from the upper/lower portions of the data set to
discard, expressed as a value in [0, 1].
data_set: Non-empty list of values.
truncate_percent: The % from the upper and lower portions of the data set
to discard, expressed as a value in [0, 1].
Returns:
The truncated mean as a float.
Raises:
TypeError: The data set was empty after discarding values.
"""
if len(data_set) > 2:
data_set = sorted(data_set)
@@ -224,14 +231,61 @@ def CalculateTruncatedMean(data_set, truncate_percent):
return truncated_mean
def CalculateStandardDeviation(v):
if len(v) == 1:
def CalculateMean(values):
"""Calculates the arithmetic mean of a list of values."""
return CalculateTruncatedMean(values, 0.0)
def CalculateConfidence(good_results_lists, bad_results_lists):
"""Calculates a confidence percentage.
This is calculated based on how distinct the "good" and "bad" values are,
and how noisy the results are. More precisely, the confidence is the quotient
of the difference between the closest values across the good and bad groups
and the sum of the standard deviations of the good and bad groups.
TODO(qyearsley): Replace this confidence function with a function that
uses a Student's t-test. The confidence would be (1 - p-value), where
p-value is the probability of obtaining the given set of good and bad
values just by chance.
Args:
good_results_lists: A list of lists of "good" result numbers.
bad_results_lists: A list of lists of "bad" result numbers.
Returns:
A number in the range [0, 100].
"""
# Get the distance between the two groups.
means_good = map(CalculateMean, good_results_lists)
means_bad = map(CalculateMean, bad_results_lists)
bounds_good = (min(means_good), max(means_good))
bounds_bad = (min(means_bad), max(means_bad))
dist_between_groups = min(
math.fabs(bounds_bad[1] - bounds_good[0]),
math.fabs(bounds_bad[0] - bounds_good[1]))
# Get the sum of the standard deviations of the two groups.
good_results_flattened = sum(good_results_lists, [])
bad_results_flattened = sum(bad_results_lists, [])
stddev_good = CalculateStandardDeviation(good_results_flattened)
stddev_bad = CalculateStandardDeviation(bad_results_flattened)
stddev_sum = stddev_good + stddev_bad
confidence = dist_between_groups / (max(0.0001, stddev_sum))
confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
return confidence
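# Worked example of the confidence formula above (illustrative numbers, not
# taken from any real bisect run): with good_results_lists = [[0, 1], [1, 2]]
# and bad_results_lists = [[6, 7], [7, 8]], the per-list means are [0.5, 1.5]
# and [6.5, 7.5], so the closest values across the two groups are 1.5 and 6.5
# and dist_between_groups = 5.0. Each flattened group has a sample standard
# deviation of about 0.8165, so 5.0 / 1.633 is roughly 3.06, which is clamped
# to 1.0, giving a confidence of 100.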
def CalculateStandardDeviation(values):
"""Calculates the sample standard deviation of the given list of values."""
if len(values) == 1:
return 0.0
mean = CalculateTruncatedMean(v, 0.0)
variances = [float(x) - mean for x in v]
variances = [x * x for x in variances]
variance = reduce(lambda x, y: float(x) + float(y), variances) / (len(v) - 1)
mean = CalculateMean(values)
differences_from_mean = [float(x) - mean for x in values]
squared_differences = [float(x * x) for x in differences_from_mean]
variance = sum(squared_differences) / (len(values) - 1)
std_dev = math.sqrt(variance)
return std_dev
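# For example, for the values [0, 1, 1, 2]: the mean is 1.0, the squared
# differences are [1, 0, 0, 1], the sample variance is 2 / 3, and the standard
# deviation is sqrt(2 / 3), roughly 0.8165 (the value used in the unit test
# added below).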
@@ -253,13 +307,14 @@ def CalculatePooledStandardError(work_sets):
return 0.0
def CalculateStandardError(v):
if len(v) <= 1:
def CalculateStandardError(values):
"""Calculates the standard error of a list of values."""
if len(values) <= 1:
return 0.0
std_dev = CalculateStandardDeviation(v)
std_dev = CalculateStandardDeviation(values)
return std_dev / math.sqrt(len(v))
return std_dev / math.sqrt(len(values))
def IsStringFloat(string_to_check):
@@ -3022,11 +3077,9 @@ class BisectPerformanceMetrics(object):
if current_values:
current_values = current_values['values']
if previous_values:
confidence = self._CalculateConfidence(previous_values,
[current_values])
mean_of_prev_runs = CalculateTruncatedMean(
sum(previous_values, []), 0)
mean_of_current_runs = CalculateTruncatedMean(current_values, 0)
confidence = CalculateConfidence(previous_values, [current_values])
mean_of_prev_runs = CalculateMean(sum(previous_values, []))
mean_of_current_runs = CalculateMean(current_values)
# Check that the potential regression is in the same direction as
# the overall regression. If the mean of the previous runs < the
@@ -3043,34 +3096,6 @@ class BisectPerformanceMetrics(object):
previous_id = current_id
return other_regressions
def _CalculateConfidence(self, working_means, broken_means):
bounds_working = []
bounds_broken = []
for m in working_means:
current_mean = CalculateTruncatedMean(m, 0)
if bounds_working:
bounds_working[0] = min(current_mean, bounds_working[0])
bounds_working[1] = max(current_mean, bounds_working[0])
else:
bounds_working = [current_mean, current_mean]
for m in broken_means:
current_mean = CalculateTruncatedMean(m, 0)
if bounds_broken:
bounds_broken[0] = min(current_mean, bounds_broken[0])
bounds_broken[1] = max(current_mean, bounds_broken[0])
else:
bounds_broken = [current_mean, current_mean]
dist_between_groups = min(math.fabs(bounds_broken[1] - bounds_working[0]),
math.fabs(bounds_broken[0] - bounds_working[1]))
working_mean = sum(working_means, [])
broken_mean = sum(broken_means, [])
len_working_group = CalculateStandardDeviation(working_mean)
len_broken_group = CalculateStandardDeviation(broken_mean)
confidence = (dist_between_groups / (
max(0.0001, (len_broken_group + len_working_group ))))
confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
return confidence
def _GetResultsDict(self, revision_data, revision_data_sorted):
# Find range where it possibly broke.
@@ -3106,8 +3131,8 @@ class BisectPerformanceMetrics(object):
broken_mean = sum(broken_means, [])
# Calculate the approximate size of the regression
mean_of_bad_runs = CalculateTruncatedMean(broken_mean, 0.0)
mean_of_good_runs = CalculateTruncatedMean(working_mean, 0.0)
mean_of_bad_runs = CalculateMean(broken_mean)
mean_of_good_runs = CalculateMean(working_mean)
regression_size = math.fabs(max(mean_of_good_runs, mean_of_bad_runs) /
max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0 - 100.0
@@ -3119,7 +3144,7 @@ class BisectPerformanceMetrics(object):
# Give a "confidence" in the bisect. At the moment we use how distinct the
# values are before and after the last broken revision, and how noisy the
# overall graph is.
confidence = self._CalculateConfidence(working_means, broken_means)
confidence = CalculateConfidence(working_means, broken_means)
culprit_revisions = []
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import unittest
# Special import necessary because filename contains dash characters.
bisect_perf_module = __import__('bisect-perf-regression')
class BisectPerfRegressionTest(unittest.TestCase):
"""Test case for top-level functions in the bisect-perf-regrssion module."""
def setUp(self):
"""Sets up the test environment before each test method."""
pass
def tearDown(self):
"""Cleans up the test environment after each test method."""
pass
def testCalculateTruncatedMeanRaisesError(self):
"""CalculateTrunctedMean raises an error when passed an empty list."""
with self.assertRaises(TypeError):
bisect_perf_module.CalculateTruncatedMean([], 0)
def testCalculateMeanSingleNum(self):
"""Tests the CalculateMean function with a single number."""
self.assertEqual(3.0, bisect_perf_module.CalculateMean([3]))
def testCalculateMeanShortList(self):
"""Tests the CalculateMean function with a short list."""
self.assertEqual(0.5, bisect_perf_module.CalculateMean([-3, 0, 1, 4]))
def testCalculateMeanCompareAlternateImplementation(self):
"""Tests CalculateMean by comparing against an alternate implementation."""
def AlternateMeanFunction(values):
"""Simple arithmetic mean function."""
return sum(values) / float(len(values))
test_values_lists = [[1], [5, 6.5, 1.2, 3], [-3, 0, 1, 4],
[-3, -1, 0.12, 0.752, 3.33, 8, 16, 32, 439]]
for values in test_values_lists:
self.assertEqual(
AlternateMeanFunction(values),
bisect_perf_module.CalculateMean(values))
def testCalculateConfidence(self):
"""Tests the confidence calculation."""
bad_values = [[0, 1], [1, 2]]
good_values = [[6, 7], [7, 8]]
# Closest means are mean(1, 2) and mean(6, 7).
distance = 6.5 - 1.5
# Sample standard deviation of [n-1, n, n, n+1] is approximately 0.8165.
stddev_sum = 0.8165 + 0.8165
# Expected confidence is an int in the range [0, 100].
expected_confidence = min(100, int(100 * distance / float(stddev_sum)))
self.assertEqual(
expected_confidence,
bisect_perf_module.CalculateConfidence(bad_values, good_values))
def testCalculateConfidence0(self):
"""Tests the confidence calculation when it's expected to be 0."""
bad_values = [[0, 1], [1, 2], [4, 5], [0, 2]]
good_values = [[4, 5], [6, 7], [7, 8]]
# Both groups have value lists with means of 4.5, which means distance
# between groups is zero, and thus confidence is zero.
self.assertEqual(
0, bisect_perf_module.CalculateConfidence(bad_values, good_values))
def testCalculateConfidence100(self):
"""Tests the confidence calculation when it's expected to be 100."""
bad_values = [[1, 1], [1, 1]]
good_values = [[1.2, 1.2], [1.2, 1.2]]
# Standard deviation in both groups is zero, so confidence is 100.
self.assertEqual(
100, bisect_perf_module.CalculateConfidence(bad_values, good_values))
if __name__ == '__main__':
unittest.main()