Commit 37cb4d23 authored by Stephan Stross's avatar Stephan Stross Committed by Commit Bot

Added flag support to comparative_tester.py

Bug: 839491

Change-Id: I9682eef59b6d1ec2157108a72976dc1b868575f9
Reviewed-on: https://chromium-review.googlesource.com/1110853
Commit-Queue: Stephan Stross <stephanstross@google.com>
Reviewed-by: default avatarSergey Ulanov <sergeyu@chromium.org>
Cr-Commit-Position: refs/heads/master@{#570118}
parent 25013557
......@@ -42,7 +42,9 @@ def AddCommonArgs(arg_parser):
common_args.add_argument('--ssh-config', '-F',
help='The path to the SSH configuration used for '
'connecting to the target device.')
common_args.add_argument('--include-system-logs', default=True, type=bool,
common_args.add_argument('--exclude-system-logs',
action='store_false',
dest='include_system_logs',
help='Do not show system log data.')
common_args.add_argument('--verbose', '-v', default=False,
action='store_true',
......
# Fuchsia OOM bug crbug.com/851698
-MessageLoopPerfTest.PostTaskRate/1_Posting_Thread
# crbug.com/848499
-WaitableEventPerfTest.Throughput
\ No newline at end of file
-WaitableEventPerfTest.Throughput
# Can cause hangs on SSH connection to test box
# https://fuchsia.atlassian.net/browse/NET-1010
-TaskObserverPerfTest.TaskPingPong
-JSONPerfTest.StressTest
\ No newline at end of file
# These two are disabled until crbug.com/851083 is fixed
-DiskCachePerfTest.CacheBackendPerformance
-DiskCachePerfTest.SimpleCacheBackendPerformance
\ No newline at end of file
-DiskCachePerfTest.SimpleCacheBackendPerformance
# Flaky test blocked until crbug.com/852937 is fixed
-URLRequestQuicPerfTest.TestGetRequest
# Too long-running for repeat execution
-SimpleIndexPerfTest.EvictionPerformance
\ No newline at end of file
......@@ -7,6 +7,8 @@
# Fuchsia devices and then compares their output to each other, extracting the
# relevant performance data from the output of gtest.
import argparse
import logging
import os
import re
import subprocess
......@@ -76,10 +78,9 @@ class TestTarget(object):
def ExecFuchsia(self, out_dir: str, run_locally: bool) -> str:
runner_name = "{}/bin/run_{}".format(out_dir, self._name)
command = [runner_name, self._filter_flag, "--include-system-logs=False"]
command = [runner_name, self._filter_flag, "--exclude-system-logs"]
if not run_locally:
command.append("-d")
command.append(self._filter_flag)
return RunCommand(command,
"Test {} failed on fuchsia!".format(self._target))
......@@ -166,11 +167,14 @@ def RunTest(target: TestTarget, run_locally: bool = False) -> None:
print("Wrote result files")
def RunGnForDirectory(dir_name: str, target_os: str) -> None:
def RunGnForDirectory(dir_name: str, target_os: str, is_debug: bool) -> None:
if not os.path.exists(dir_name):
os.makedirs(dir_name)
debug_str = str(is_debug).lower()
with open("{}/{}".format(dir_name, "args.gn"), "w") as args_file:
args_file.write("is_debug = false\n")
args_file.write("is_debug = {}\n".format(debug_str))
args_file.write("dcheck_always_on = false\n")
args_file.write("is_component_build = false\n")
args_file.write("use_goma = true\n")
......@@ -179,7 +183,8 @@ def RunGnForDirectory(dir_name: str, target_os: str) -> None:
subprocess.run(["gn", "gen", dir_name]).check_returncode()
def GenerateTestData(runs: int):
def GenerateTestData(do_config: bool, do_build: bool, num_reps: int,
is_debug: bool):
DIR_SOURCE_ROOT = os.path.abspath(
os.path.join(os.path.dirname(__file__), *([os.pardir] * 3)))
os.chdir(DIR_SOURCE_ROOT)
......@@ -192,22 +197,27 @@ def GenerateTestData(runs: int):
test_input = [] # type: List[TestTarget]
for target in target_spec.test_targets:
test_input.append(TestTarget(target))
print("Test targets collected:\n{}".format("\n".join(
print("Test targets collected:\n{}".format(",".join(
[test._target for test in test_input])))
RunGnForDirectory(linux_dir, "linux")
RunGnForDirectory(fuchsia_dir, "fuchsia")
# Build test targets in both output directories.
for directory in [linux_dir, fuchsia_dir]:
build_command = ["autoninja", "-C", directory] \
+ [test._target for test in test_input]
RunCommand(build_command,
"autoninja failed in directory {}".format(directory))
print("Builds completed.")
if do_config:
RunGnForDirectory(linux_dir, "linux", is_debug)
RunGnForDirectory(fuchsia_dir, "fuchsia", is_debug)
print("Ran GN")
elif is_debug:
logging.warning("The --is_debug flag is ignored unless --do_config is also \
specified")
if do_build:
# Build test targets in both output directories.
for directory in [linux_dir, fuchsia_dir]:
build_command = ["autoninja", "-C", directory] \
+ [test._target for test in test_input]
RunCommand(build_command,
"autoninja failed in directory {}".format(directory))
print("Builds completed.")
# Execute the tests, one at a time, per system, and collect their results.
for i in range(0, runs):
for i in range(0, num_reps):
print("Running Test set {}".format(i))
for test_target in test_input:
print("Running Target {}".format(test_target._name))
......@@ -218,7 +228,29 @@ def GenerateTestData(runs: int):
def main() -> int:
  """Parse command-line flags and kick off the comparative test run.

  Returns:
    0 on success (argparse exits the process itself on bad flags).
  """
  cmd_flags = argparse.ArgumentParser(
      description="Execute tests repeatedly and collect performance data.")
  cmd_flags.add_argument(
      "--do-config",
      action="store_true",
      help="WARNING: This flag over-writes args.gn in the directories "
      "configured. GN is executed before running the tests.")
  cmd_flags.add_argument(
      "--do-build",
      action="store_true",
      help="Build the tests before running them.")
  cmd_flags.add_argument(
      "--is-debug",
      action="store_true",
      help="This config-and-build cycle is a debug build")
  cmd_flags.add_argument(
      "--num-repetitions",
      type=int,
      default=1,
      help="The number of times to execute each test target.")
  # BUG FIX: parse_args() returns a Namespace holding the parsed values;
  # the previous code discarded it and read the flags off the
  # ArgumentParser object itself, which raised AttributeError at runtime.
  args = cmd_flags.parse_args()
  GenerateTestData(args.do_config, args.do_build, args.num_repetitions,
                   args.is_debug)
  return 0
......
#!/usr/bin/env python3
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""generate_perf_report.py is to be used after comparative_tester.py has been
executed and written some test data into the location specified by
target_spec.py. It writes to results_dir and reads all present test info from
raw_data_dir. Using this script should just be a matter of invoking it from
chromium/src while raw test data exists in raw_data_dir."""
import json
import logging
import math
import os
import sys
from typing import List, Dict, Set, Tuple, Optional, Any, TypeVar, Callable
import target_spec
from test_results import (TargetResult, ReadTargetFromJson, TestResult,
ResultLine)
class LineStats(object):
  """Aggregated statistics for a single perftest output line.

  Holds the mean, standard deviation, and coefficient of variation of the
  measurements collected for one line description, plus the measurement
  unit and the number of samples folded into the aggregate.
  """

  def __init__(self, desc: str, unit: str, time_avg: float, time_dev: float,
               cv: float, samples: int) -> None:
    self.desc = desc
    self.unit = unit
    self.time_avg = time_avg
    self.time_dev = time_dev
    self.cv = cv
    self.sample_num = samples

  def ToString(self) -> str:
    """Render the statistics as a one-line human-readable summary."""
    if self.sample_num <= 1:
      # A single sample has no meaningful deviation to report.
      return "{}: {:.5f} with only one sample".format(self.desc, self.time_avg)
    return "{}: {:.5f} σ={:.5f} {} with n={} cv={}".format(
        self.desc, self.time_avg, self.time_dev, self.unit, self.sample_num,
        self.cv)
def LineFromList(lines: List[ResultLine]) -> Optional[LineStats]:
  """Collapse repeated measurements of one result line into a LineStats.

  Every entry is assumed to share the description and unit of the first
  entry; the measurements themselves are aggregated with GenStats.
  """
  first = lines[0]
  avg, dev, cv = GenStats([entry.meas for entry in lines])
  return LineStats(first.desc, first.unit, avg, dev, cv, len(lines))
class TestStats(object):
  """Aggregated timing statistics for one gtest case across repeated runs."""

  def __init__(self, name: str, time_avg: float, time_dev: float, cv: float,
               samples: int, lines: List[LineStats]) -> None:
    self.name = name
    self.time_avg = time_avg
    self.time_dev = time_dev
    self.cv = cv
    self.sample_num = samples
    self.lines = lines

  def ToLines(self) -> List[str]:
    """Return a test summary line followed by indented per-line stats."""
    if self.sample_num > 1:
      summary = "{}: {:.5f} σ={:.5f}ms with n={} cv={}".format(
          self.name, self.time_avg, self.time_dev, self.sample_num, self.cv)
    else:
      # A single sample has no meaningful deviation to report.
      summary = "{}: {:.5f} with only one sample".format(
          self.name, self.time_avg)
    rendered = [summary]
    rendered.extend("  {}".format(stat.ToString()) for stat in self.lines)
    return rendered
def TestFromList(tests: List[TestResult]) -> TestStats:
  """Aggregate repeated runs of a single gtest case into a TestStats.

  Groups the per-run output lines by description so each line can be
  averaged independently, then aggregates the overall test times.
  """
  name = tests[0].name
  avg, dev, cv = GenStats([test.time for test in tests])
  grouped = {}  # type: Dict[str, List[ResultLine]]
  for test in tests:
    # Every entry must belong to the same test case.
    assert test.name == name
    for line in test.lines:
      grouped.setdefault(line.desc, []).append(line)
  aggregated = []
  for line_list in grouped.values():
    stats = LineFromList(line_list)
    if stats:
      aggregated.append(stats)
  return TestStats(name, avg, dev, cv, len(tests), aggregated)
class TargetStats(object):
  """Aggregated statistics for one test target across repeated runs."""

  def __init__(self, name: str, samples: int, tests: List[TestStats]) -> None:
    self.name = name
    self.sample_num = samples
    self.tests = tests

  def ToLines(self) -> List[str]:
    """Render the target header plus the indented lines of every test."""
    if self.sample_num > 1:
      rendered = ["{}: ".format(self.name)]
    else:
      rendered = ["{}: with only one sample".format(self.name)]
    for test in self.tests:
      rendered.extend("  {}".format(line) for line in test.ToLines())
    return rendered

  def __format__(self, format_spec):
    # Lets a TargetStats be dropped straight into str.format / f-strings.
    return "\n".join(self.ToLines())
def TargetFromList(results: List[TargetResult]) -> TargetStats:
  """Aggregate repeated runs of one target into a TargetStats.

  Groups tests by name so that each test case can be considered
  independently; this keeps average times accurate even when individual
  tests flake out of a run.

  Args:
    results: Non-empty list of results, all for the same target.

  Returns:
    A TargetStats aggregating every test case across all runs.
  """
  name = results[0].name
  sample_num = len(results)
  tests = {}  # type: Dict[str, List[TestResult]]
  for result in results:
    assert result.name == name
    for test in result.tests:
      # BUG FIX: the previous code initialized the list with the first
      # occurrence AND then unconditionally appended it again (missing
      # `else`), so the first run of every test was counted twice and
      # skewed all aggregates toward it.
      tests.setdefault(test.name, []).append(test)
  test_stats = [TestFromList(test_list) for test_list in tests.values()]
  return TargetStats(name, sample_num, test_stats)
def GenStats(corpus: List[float]) -> Tuple[float, float, float]:
  """Compute mean, population standard deviation, and coefficient of
  variation for a list of measurements.

  Args:
    corpus: Non-empty list of measurements.

  Returns:
    (avg, dev, cv) where dev is the population standard deviation and cv
    is dev / avg. cv is reported as 0.0 when avg is 0 (an all-zero
    corpus); the previous code raised ZeroDivisionError in that case.
  """
  avg = sum(corpus) / len(corpus)
  variance = sum((item - avg) ** 2 for item in corpus) / len(corpus)
  dev = math.sqrt(variance)
  # Robustness fix: guard the division so an all-zero corpus is reported
  # with cv = 0 instead of crashing the whole report generation.
  cv = dev / avg if avg else 0.0
  return avg, dev, cv
def DirectoryStats(directory: str) -> List[TargetStats]:
  """Read every JSON result file in |directory| and aggregate per target.

  Files produced by different runs of the same target share a name inside
  the JSON, so results are grouped by that name before aggregation.
  """
  grouped = {}  # type: Dict[str, List[TargetResult]]
  for file in os.listdir(directory):
    result = ReadTargetFromJson("{}/{}".format(directory, file))
    grouped.setdefault(result.name, []).append(result)
  return [TargetFromList(result_list) for result_list in grouped.values()]
def CompareTargets(linux: TargetStats, fuchsia: TargetStats) -> Dict[str, Any]:
  """Pair up the Linux and Fuchsia statistics for one target.

  Lines up the per-test statistics from both platforms by test name,
  compares each pair, and returns a JSON-serializable dictionary.
  """
  assert linux.name == fuchsia.name
  by_name = ZipListsByPredicate(linux.tests, fuchsia.tests,
                                lambda test: test.name)
  compared = MapDictValues(by_name, CompareTests)
  return {"name": linux.name, "tests": compared}
def CompareTests(linux: TestStats, fuchsia: TestStats) -> Dict[str, Any]:
  """Wrap the Linux and Fuchsia stats of one test case as a JSON-dumpable
  dict.

  At least one of the two sides must be present. Per-line statistics are
  paired and compared only when both sides are present; a warning is
  logged when the Fuchsia side is missing the test entirely.

  Returns:
    A dict holding the test name, paired line comparisons, and whichever
    platform averages/deviations are available.
  """
  assert linux is not None or fuchsia is not None
  paired_lines = {}  # type: Dict[str, Any]
  # Guard the pairing: with only one side present there is nothing to
  # line up (and .lines on None would crash).
  if linux is not None and fuchsia is not None:
    assert linux.name == fuchsia.name
    paired_lines = ZipListsByPredicate(linux.lines, fuchsia.lines,
                                       lambda line: line.desc)
    paired_lines = MapDictValues(paired_lines, CompareLines)
  result = {"lines": paired_lines, "unit": "ms"}  # type: Dict[str, Any]
  if linux:
    result["name"] = linux.name
    result["linux_avg"] = linux.time_avg
    result["linux_dev"] = linux.time_dev
    result["linux_cv"] = linux.cv
  if fuchsia is None:
    logging.warning("Fuchsia is missing test case {}".format(linux.name))
  else:
    result["name"] = fuchsia.name
    result["fuchsia_avg"] = fuchsia.time_avg
    result["fuchsia_dev"] = fuchsia.time_dev
    result["fuchsia_cv"] = fuchsia.cv
  # BUG FIX: the previous version built |result| but never returned it,
  # so every comparison was serialized as null by the caller.
  return result
def CompareLines(linux: LineStats, fuchsia: LineStats) -> Dict[str, Any]:
  """Wrap two LineStats objects as a JSON-dumpable comparison dict.

  At least one side must be present; when both are, their descriptions
  and units must match. A warning is logged whenever the Fuchsia side is
  missing a line that Linux produced. Fields for an absent side are simply
  left out of the returned dict.
  """
  assert linux is not None or fuchsia is not None
  result = {}  # type: Dict[str, Any]
  if linux is not None and fuchsia is not None:
    # Paired lines must describe the same measurement.
    assert linux.desc == fuchsia.desc
    assert linux.unit == fuchsia.unit
  if linux:
    result["desc"] = linux.desc
    result["unit"] = linux.unit
    result["linux_avg"] = linux.time_avg
    result["linux_dev"] = linux.time_dev
    result["linux_cv"] = linux.cv
  if fuchsia is None:
    logging.warning("Fuchsia is missing test line {}".format(linux.desc))
  else:
    result["desc"] = fuchsia.desc
    result["unit"] = fuchsia.unit
    result["fuchsia_avg"] = fuchsia.time_avg
    result["fuchsia_dev"] = fuchsia.time_dev
    result["fuchsia_cv"] = fuchsia.cv
  return result
T = TypeVar("T")
R = TypeVar("R")


def ZipListsByPredicate(left: List[T], right: List[T],
                        pred: Callable[[T], R]) -> Dict[R, Tuple[T, T]]:
  """Join two lists into a dict of pairs keyed by pred(item).

  Every item from |left| lands in slot 0 of its pair and every item from
  |right| in slot 1; an item with no counterpart in the other list is
  paired with None. The predicate must not map two items of the same list
  to one key.
  """
  pairs = {}  # type: Dict[R, Tuple[T, T]]
  for entry in left:
    key = pred(entry)
    # Keys must be unique within a single list.
    assert key not in pairs
    pairs[key] = entry, None
  for entry in right:
    key = pred(entry)
    if key in pairs:
      # Slot 1 is still None whenever the key came from |left|.
      left_entry, _ = pairs[key]
      pairs[key] = left_entry, entry
    else:
      pairs[key] = None, entry
  return pairs
U = TypeVar("U")
V = TypeVar("V")


def MapDictValues(dct: Dict[T, Tuple[R, U]],
                  predicate: Callable[[R, U], V]) -> Dict[T, V]:
  """Return a new dict with |predicate| applied to every value tuple.

  Each value of |dct| is a pair which is splatted into |predicate|; keys
  are carried over unchanged.
  """
  return {key: predicate(*pair) for key, pair in dct.items()}
def main():
  """Pair Linux and Fuchsia result files and write comparison JSONs.

  Reads raw results from the directories named in target_spec, matches
  targets by name, and writes one comparison file per target into
  target_spec.results_dir.
  """
  linux_stats = DirectoryStats(target_spec.raw_linux_dir)
  fuchsia_stats = DirectoryStats(target_spec.raw_fuchsia_dir)
  paired_targets = ZipListsByPredicate(linux_stats, fuchsia_stats,
                                       lambda target: target.name)
  for name, pair in paired_targets.items():
    comparison = CompareTargets(*pair)
    out_path = "{}/{}.json".format(target_spec.results_dir, name)
    with open(out_path, "w") as outfile:
      json.dump(comparison, outfile, indent=2)


if __name__ == "__main__":
  sys.exit(main())
......@@ -2,8 +2,6 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from typing import Dict
# Fields for use when working with a physical linux device connected locally
linux_device_ip = "192.168.42.32"
linux_device_user = "potat"
......@@ -18,8 +16,8 @@ fuchsia_out_dir = "out/fuchsia"
results_dir = "results"
# The location in src that stores the information from each comparative
# invocation of a perftest
raw_linux_dir = "results/linux_raw"
raw_fuchsia_dir = "results/fuchsia_raw"
raw_linux_dir = results_dir + "/linux_raw"
raw_fuchsia_dir = results_dir + "/fuchsia_raw"
# A list of test targets to deploy to both devices. Stick to *_perftests.
test_targets = [
......
......@@ -10,8 +10,10 @@ from typing import Any, Dict, List, Tuple, Optional
def UnitStringIsValid(unit: str) -> bool:
return (unit == "us/hop" or unit == "us/task" or unit == "ns/sample" or
unit == "ms" or unit == "s" or unit == "count")
accepted_units = [
"us/hop", "us/task", "ns/sample", "ms", "s", "count", "KB", "MB/s", "us"
]
return unit in accepted_units
class ResultLine(object):
......@@ -32,6 +34,10 @@ class ResultLine(object):
}
def ReadResultLineFromJson(dct: Dict[str, Any]):
return ResultLine(dct["description"], float(dct["measurement"]), dct["unit"])
def ResultLineFromStdout(line: str) -> Optional[ResultLine]:
if "pkgsvr" in line:
return None
......@@ -40,12 +46,12 @@ def ResultLineFromStdout(line: str) -> Optional[ResultLine]:
# for the line description, so at least 3 total
if len(chunks) < 3:
logging.warning("The line {} contains too few space-separated pieces to be "
"parsed as a ResultLine".format(line))
"parsed as a ResultLine".format(line))
return None
unit = chunks[-1]
if not UnitStringIsValid(unit):
logging.warning("The unit string parsed from {} was {}, which was not "
"expected".format(line, unit))
"expected".format(line, unit))
return None
try:
measure = float(chunks[-2])
......@@ -53,7 +59,7 @@ def ResultLineFromStdout(line: str) -> Optional[ResultLine]:
return ResultLine(desc, measure, unit)
except ValueError as e:
logging.warning("The chunk {} could not be parsed as a valid measurement "
"because of {}".format(chunks[-2], str(e)))
"because of {}".format(chunks[-2], str(e)))
return None
......@@ -75,27 +81,33 @@ class TestResult(object):
}
def ExtractCaseInfo(line: str) -> Tuple[str, float, str]:
def ReadTestFromJson(obj_dict: Dict[str, Any]) -> TestResult:
name = obj_dict["name"]
time = obj_dict["time_in_ms"]
lines = [ReadResultLineFromJson(line) for line in obj_dict["lines"]]
return TestResult(name, time, lines)
def ExtractTestInfo(line: str) -> Tuple[str, float]:
# Trim off the [ OK ] part of the line
trimmed = line.lstrip("[ OK ]").strip()
try:
test_name, rest = trimmed.split("(") # Isolate the measurement
except Exception as e:
err_text = "Could not extract the case name from {} because of error {}"\
.format(
rest, str(e))
.format(trimmed, str(e))
raise Exception(err_text)
try:
measure, units = rest.split(")")[0].split()
measure, _ = rest.split(")", 1)[0].split()
except Exception as e:
err_text = "Could not extract measure and units from {}\
because of error {}".format(rest, str(e))
raise Exception(err_text)
return test_name.strip(), float(measure), units.strip()
return test_name.strip(), float(measure)
def TaggedTestFromLines(lines: List[str]) -> TestResult:
test_name, time, units = ExtractCaseInfo(lines[-1])
test_name, time = ExtractTestInfo(lines[-1])
res_lines = []
for line in lines[:-1]:
res_line = ResultLineFromStdout(line)
......@@ -113,15 +125,13 @@ class TargetResult(object):
run.
"""
def __init__(self, name: str, time: float, tests: List[TestResult]) -> None:
def __init__(self, name: str, tests: List[TestResult]) -> None:
self.name = name
self.time = time
self.tests = tests
def ToJsonDict(self) -> Dict[str, Any]:
return {
"name": self.name,
"time_in_ms": self.time,
"tests": [test.ToJsonDict() for test in self.tests]
}
......@@ -130,6 +140,13 @@ class TargetResult(object):
json.dump(self.ToJsonDict(), outfile, indent=2)
def ReadTargetFromJson(path: str):
with open(path, "r") as json_file:
dct = json.load(json_file)
return TargetResult(
dct["name"], [ReadTestFromJson(test_dct) for test_dct in dct["tests"]])
def TargetResultFromStdout(lines: List[str], name: str) -> TargetResult:
"""TargetResultFromStdout attempts to associate GTest names to the lines of
output that they produce. Example input looks something like the following:
......@@ -166,7 +183,4 @@ def TargetResultFromStdout(lines: List[str], name: str) -> TargetResult:
test_cases = [
TaggedTestFromLines(test_lines) for test_lines in test_line_lists
]
target_time = 0 # type: float
for test in test_cases:
target_time += test.time
return TargetResult(name, target_time, test_cases)
return TargetResult(name, test_cases)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment