Add more debug logging to GPU trigger script.

Try to get to the bottom of situations where jobs seem to be going to an oversubscribed configuration. TBR=jmadill@chromium.org, jbudorick@chromium.org Bug: 807340 Cq-Include-Trybots: master.tryserver.chromium.android:android_optional_gpu_tests_rel;master.tryserver.chromium.linux:linux_optional_gpu_tests_rel;master.tryserver.chromium.mac:mac_optional_gpu_tests_rel;master.tryserver.chromium.win:win_optional_gpu_tests_rel Change-Id: I2161d42d980c9b5bc8a9acfbe74e11ad7fa037b0 Reviewed-on: https://chromium-review.googlesource.com/894450 Commit-Queue: Kenneth Russell <kbr@chromium.org> Reviewed-by: Kenneth Russell <kbr@chromium.org> Cr-Commit-Position: refs/heads/master@{#533230}

Add more debug logging to GPU trigger script.
Try to get to the bottom of situations where jobs seem to be going to an oversubscribed configuration. TBR=jmadill@chromium.org, jbudorick@chromium.org Bug: 807340 Cq-Include-Trybots: master.tryserver.chromium.android:android_optional_gpu_tests_rel;master.tryserver.chromium.linux:linux_optional_gpu_tests_rel;master.tryserver.chromium.mac:mac_optional_gpu_tests_rel;master.tryserver.chromium.win:win_optional_gpu_tests_rel Change-Id: I2161d42d980c9b5bc8a9acfbe74e11ad7fa037b0 Reviewed-on: https://chromium-review.googlesource.com/894450 Commit-Queue: Kenneth Russell <kbr@chromium.org> Reviewed-by: Kenneth Russell <kbr@chromium.org> Cr-Commit-Position: refs/heads/master@{#533230}
707aac9c · Kenneth Russell · Commit Bot · 65745cb4 · 707aac9c · 707aac9c
Commit 707aac9c authored Jan 31, 2018 by Kenneth Russell Committed by Commit Bot Jan 31, 2018
5 changed files
--- a/content/test/gpu/generate_buildbot_json.py
+++ b/content/test/gpu/generate_buildbot_json.py
@@ -2562,7 +2562,9 @@ def add_common_test_properties(test, tester_config):
      'args': [
        '--gpu-trigger-configs',
        json.dumps(tester_config['swarming_dimensions'] +
-                   tester_config.get('alternate_swarming_dimensions', []))
+                   tester_config.get('alternate_swarming_dimensions', [])),
+        '--gpu-trigger-script-verbose',
+        'True'
      ],
    }


--- a/content/test/gpu/trigger_gpu_test.py
+++ b/content/test/gpu/trigger_gpu_test.py
@@ -150,7 +150,7 @@ class GpuTestTriggerer(object):
    if not all(isinstance(entry, dict) for entry in self._gpu_configs):
      raise ValueError('GPU configurations must all be dictionaries')

-  def query_swarming_for_gpu_configs(self):
+  def query_swarming_for_gpu_configs(self, verbose):
    # Query Swarming to figure out which bots are available.
    for config in self._gpu_configs:
      values = []
@@ -172,7 +172,8 @@ class GpuTestTriggerer(object):
                                 '0',
                                 '--json',
                                 temp_file,
-                                 ('bots/count?%s' % query_arg)])
+                                 ('bots/count?%s' % query_arg)],
+                                verbose)
        if ret:
          raise Exception('Error running swarming.py')
        with open(temp_file) as fp:
@@ -182,10 +183,15 @@ class GpuTestTriggerer(object):
        # Be robust against errors in computation.
        available = max(0, count - int(query_result['busy']))
        self._bot_statuses.append({'total': count, 'available': available})
+        if verbose:
+          idx = len(self._bot_statuses) - 1
+          print 'GPU config %d: %s' % (idx, str(self._bot_statuses[idx]))
      finally:
        self.delete_temp_file(temp_file)
    # Sum up the total count of all bots.
    self._total_bots = sum(x['total'] for x in self._bot_statuses)
+    if verbose:
+      print 'Total bots: %d' % (self._total_bots)

  def remove_swarming_dimension(self, args, dimension):
    for i in xrange(len(args)):
@@ -196,7 +202,7 @@ class GpuTestTriggerer(object):
  def choose_random_int(self, max_num):
    return random.randint(1, max_num)

-  def pick_gpu_configuration(self):
+  def pick_gpu_configuration(self, verbose):
    # These are the rules used:
    # 1. If any configuration has bots available, pick the configuration with
    #    the most bots available.
@@ -216,6 +222,8 @@ class GpuTestTriggerer(object):
          max_val = avail
      self._bot_statuses[max_index]['available'] -= 1
      assert self._bot_statuses[max_index]['available'] >= 0
+      if verbose:
+        print 'Chose GPU config %d because bots were available' % (max_index)
      return max_index
    # Case 2.
    # We want to choose a bot uniformly at random from all of the bots specified
@@ -225,6 +233,8 @@ class GpuTestTriggerer(object):
    r = self.choose_random_int(self._total_bots)
    for i, status in enumerate(self._bot_statuses):
      if r <= status['total']:
+        if verbose:
+          print 'Chose GPU config %d stochastically' % (i)
        return i
      r -= status['total']
    raise Exception('Should not reach here')
@@ -247,7 +257,10 @@ class GpuTestTriggerer(object):
    with open(output_file, 'w') as f:
      json.dump(merged_json, f)

-  def run_swarming(self, args):
+  def run_swarming(self, args, verbose):
+    if verbose:
+      print 'Running Swarming with args:'
+      print str(args)
    return subprocess.call([sys.executable, SWARMING_PY] + args)

  def trigger_tasks(self, args, remaining):
@@ -262,8 +275,9 @@ class GpuTestTriggerer(object):
      Exit code for the script.
    """
    remaining = self.filter_swarming_py_path_arg(remaining)
+    verbose = args.gpu_trigger_script_verbose
    self.parse_gpu_configs(args)
-    self.query_swarming_for_gpu_configs()
+    self.query_swarming_for_gpu_configs(verbose)

    # In the remaining arguments, find the Swarming dimensions that are
    # specified by the GPU configs and remove them, because for each shard,
@@ -282,14 +296,14 @@ class GpuTestTriggerer(object):
      # 1. Pick which GPU configuration to use.
      # 2. Insert that GPU configuration's dimensions as command line
      #    arguments, and invoke "swarming.py trigger".
-      gpu_index = self.pick_gpu_configuration()
+      gpu_index = self.pick_gpu_configuration(verbose)
      # Holds the results of the swarming.py trigger call.
      try:
        json_temp = self.make_temp_file(prefix='trigger_gpu_test',
                                        suffix='.json')
        args_to_pass = self.modify_args(filtered_remaining_args, gpu_index, i,
                                        args.shards, json_temp)
-        ret = self.run_swarming(args_to_pass)
+        ret = self.run_swarming(args_to_pass, verbose)
        if ret:
          sys.stderr.write('Failed to trigger a task, aborting\n')
          return ret
@@ -318,6 +332,8 @@ def main():
                      help='The GPU configurations to trigger tasks on, in the'
                      ' form of a JSON array of dictionaries. At least one'
                      ' entry in this dictionary is required.')
+  parser.add_argument('--gpu-trigger-script-verbose', type=bool, default=False,
+                      help='Turn on verbose logging')
  parser.add_argument('--dump-json', required=True,
                      help='(Swarming Trigger Script API) Where to dump the'
                      ' resulting json which indicates which tasks were'

--- a/content/test/gpu/trigger_gpu_test_unittest.py
+++ b/content/test/gpu/trigger_gpu_test_unittest.py
@@ -53,11 +53,11 @@ class FakeTriggerer(trigger_gpu_test.GpuTestTriggerer):
  def parse_gpu_configs(self, args):
    pass

-  def query_swarming_for_gpu_configs(self):
+  def query_swarming_for_gpu_configs(self, verbose):
    # Sum up the total count of all bots.
    self._total_bots = sum(x['total'] for x in self._bot_statuses)

-  def run_swarming(self, args):
+  def run_swarming(self, args, verbose):
    self._swarming_runs.append(args)


@@ -119,6 +119,7 @@ class UnitTest(unittest.TestCase):
    args = Args()
    args.shards = 2
    args.dump_json = 'output.json'
+    args.gpu_trigger_script_verbose = False
    triggerer.trigger_tasks(
      args,
      [

--- a/testing/buildbot/chromium.gpu.fyi.json
+++ b/testing/buildbot/chromium.gpu.fyi.json
--- a/testing/buildbot/chromium.gpu.json
+++ b/testing/buildbot/chromium.gpu.json