Commit 707aac9c authored by Kenneth Russell, committed by Commit Bot

Add more debug logging to GPU trigger script.

Try to get to the bottom of situations where jobs seem to be going to
an oversubscribed configuration.

TBR=jmadill@chromium.org, jbudorick@chromium.org

Bug: 807340
Cq-Include-Trybots: master.tryserver.chromium.android:android_optional_gpu_tests_rel;master.tryserver.chromium.linux:linux_optional_gpu_tests_rel;master.tryserver.chromium.mac:mac_optional_gpu_tests_rel;master.tryserver.chromium.win:win_optional_gpu_tests_rel
Change-Id: I2161d42d980c9b5bc8a9acfbe74e11ad7fa037b0
Reviewed-on: https://chromium-review.googlesource.com/894450
Commit-Queue: Kenneth Russell <kbr@chromium.org>
Reviewed-by: Kenneth Russell <kbr@chromium.org>
Cr-Commit-Position: refs/heads/master@{#533230}
parent 65745cb4
...@@ -2562,7 +2562,9 @@ def add_common_test_properties(test, tester_config): ...@@ -2562,7 +2562,9 @@ def add_common_test_properties(test, tester_config):
'args': [ 'args': [
'--gpu-trigger-configs', '--gpu-trigger-configs',
json.dumps(tester_config['swarming_dimensions'] + json.dumps(tester_config['swarming_dimensions'] +
tester_config.get('alternate_swarming_dimensions', [])) tester_config.get('alternate_swarming_dimensions', [])),
'--gpu-trigger-script-verbose',
'True'
], ],
} }
......
...@@ -150,7 +150,7 @@ class GpuTestTriggerer(object): ...@@ -150,7 +150,7 @@ class GpuTestTriggerer(object):
if not all(isinstance(entry, dict) for entry in self._gpu_configs): if not all(isinstance(entry, dict) for entry in self._gpu_configs):
raise ValueError('GPU configurations must all be dictionaries') raise ValueError('GPU configurations must all be dictionaries')
def query_swarming_for_gpu_configs(self): def query_swarming_for_gpu_configs(self, verbose):
# Query Swarming to figure out which bots are available. # Query Swarming to figure out which bots are available.
for config in self._gpu_configs: for config in self._gpu_configs:
values = [] values = []
...@@ -172,7 +172,8 @@ class GpuTestTriggerer(object): ...@@ -172,7 +172,8 @@ class GpuTestTriggerer(object):
'0', '0',
'--json', '--json',
temp_file, temp_file,
('bots/count?%s' % query_arg)]) ('bots/count?%s' % query_arg)],
verbose)
if ret: if ret:
raise Exception('Error running swarming.py') raise Exception('Error running swarming.py')
with open(temp_file) as fp: with open(temp_file) as fp:
...@@ -182,10 +183,15 @@ class GpuTestTriggerer(object): ...@@ -182,10 +183,15 @@ class GpuTestTriggerer(object):
# Be robust against errors in computation. # Be robust against errors in computation.
available = max(0, count - int(query_result['busy'])) available = max(0, count - int(query_result['busy']))
self._bot_statuses.append({'total': count, 'available': available}) self._bot_statuses.append({'total': count, 'available': available})
if verbose:
idx = len(self._bot_statuses) - 1
print 'GPU config %d: %s' % (idx, str(self._bot_statuses[idx]))
finally: finally:
self.delete_temp_file(temp_file) self.delete_temp_file(temp_file)
# Sum up the total count of all bots. # Sum up the total count of all bots.
self._total_bots = sum(x['total'] for x in self._bot_statuses) self._total_bots = sum(x['total'] for x in self._bot_statuses)
if verbose:
print 'Total bots: %d' % (self._total_bots)
def remove_swarming_dimension(self, args, dimension): def remove_swarming_dimension(self, args, dimension):
for i in xrange(len(args)): for i in xrange(len(args)):
...@@ -196,7 +202,7 @@ class GpuTestTriggerer(object): ...@@ -196,7 +202,7 @@ class GpuTestTriggerer(object):
def choose_random_int(self, max_num): def choose_random_int(self, max_num):
return random.randint(1, max_num) return random.randint(1, max_num)
def pick_gpu_configuration(self): def pick_gpu_configuration(self, verbose):
# These are the rules used: # These are the rules used:
# 1. If any configuration has bots available, pick the configuration with # 1. If any configuration has bots available, pick the configuration with
# the most bots available. # the most bots available.
...@@ -216,6 +222,8 @@ class GpuTestTriggerer(object): ...@@ -216,6 +222,8 @@ class GpuTestTriggerer(object):
max_val = avail max_val = avail
self._bot_statuses[max_index]['available'] -= 1 self._bot_statuses[max_index]['available'] -= 1
assert self._bot_statuses[max_index]['available'] >= 0 assert self._bot_statuses[max_index]['available'] >= 0
if verbose:
print 'Chose GPU config %d because bots were available' % (max_index)
return max_index return max_index
# Case 2. # Case 2.
# We want to choose a bot uniformly at random from all of the bots specified # We want to choose a bot uniformly at random from all of the bots specified
...@@ -225,6 +233,8 @@ class GpuTestTriggerer(object): ...@@ -225,6 +233,8 @@ class GpuTestTriggerer(object):
r = self.choose_random_int(self._total_bots) r = self.choose_random_int(self._total_bots)
for i, status in enumerate(self._bot_statuses): for i, status in enumerate(self._bot_statuses):
if r <= status['total']: if r <= status['total']:
if verbose:
print 'Chose GPU config %d stochastically' % (i)
return i return i
r -= status['total'] r -= status['total']
raise Exception('Should not reach here') raise Exception('Should not reach here')
...@@ -247,7 +257,10 @@ class GpuTestTriggerer(object): ...@@ -247,7 +257,10 @@ class GpuTestTriggerer(object):
with open(output_file, 'w') as f: with open(output_file, 'w') as f:
json.dump(merged_json, f) json.dump(merged_json, f)
def run_swarming(self, args): def run_swarming(self, args, verbose):
if verbose:
print 'Running Swarming with args:'
print str(args)
return subprocess.call([sys.executable, SWARMING_PY] + args) return subprocess.call([sys.executable, SWARMING_PY] + args)
def trigger_tasks(self, args, remaining): def trigger_tasks(self, args, remaining):
...@@ -262,8 +275,9 @@ class GpuTestTriggerer(object): ...@@ -262,8 +275,9 @@ class GpuTestTriggerer(object):
Exit code for the script. Exit code for the script.
""" """
remaining = self.filter_swarming_py_path_arg(remaining) remaining = self.filter_swarming_py_path_arg(remaining)
verbose = args.gpu_trigger_script_verbose
self.parse_gpu_configs(args) self.parse_gpu_configs(args)
self.query_swarming_for_gpu_configs() self.query_swarming_for_gpu_configs(verbose)
# In the remaining arguments, find the Swarming dimensions that are # In the remaining arguments, find the Swarming dimensions that are
# specified by the GPU configs and remove them, because for each shard, # specified by the GPU configs and remove them, because for each shard,
...@@ -282,14 +296,14 @@ class GpuTestTriggerer(object): ...@@ -282,14 +296,14 @@ class GpuTestTriggerer(object):
# 1. Pick which GPU configuration to use. # 1. Pick which GPU configuration to use.
# 2. Insert that GPU configuration's dimensions as command line # 2. Insert that GPU configuration's dimensions as command line
# arguments, and invoke "swarming.py trigger". # arguments, and invoke "swarming.py trigger".
gpu_index = self.pick_gpu_configuration() gpu_index = self.pick_gpu_configuration(verbose)
# Holds the results of the swarming.py trigger call. # Holds the results of the swarming.py trigger call.
try: try:
json_temp = self.make_temp_file(prefix='trigger_gpu_test', json_temp = self.make_temp_file(prefix='trigger_gpu_test',
suffix='.json') suffix='.json')
args_to_pass = self.modify_args(filtered_remaining_args, gpu_index, i, args_to_pass = self.modify_args(filtered_remaining_args, gpu_index, i,
args.shards, json_temp) args.shards, json_temp)
ret = self.run_swarming(args_to_pass) ret = self.run_swarming(args_to_pass, verbose)
if ret: if ret:
sys.stderr.write('Failed to trigger a task, aborting\n') sys.stderr.write('Failed to trigger a task, aborting\n')
return ret return ret
...@@ -318,6 +332,8 @@ def main(): ...@@ -318,6 +332,8 @@ def main():
help='The GPU configurations to trigger tasks on, in the' help='The GPU configurations to trigger tasks on, in the'
' form of a JSON array of dictionaries. At least one' ' form of a JSON array of dictionaries. At least one'
' entry in this dictionary is required.') ' entry in this dictionary is required.')
parser.add_argument('--gpu-trigger-script-verbose', type=bool, default=False,
help='Turn on verbose logging')
parser.add_argument('--dump-json', required=True, parser.add_argument('--dump-json', required=True,
help='(Swarming Trigger Script API) Where to dump the' help='(Swarming Trigger Script API) Where to dump the'
' resulting json which indicates which tasks were' ' resulting json which indicates which tasks were'
......
...@@ -53,11 +53,11 @@ class FakeTriggerer(trigger_gpu_test.GpuTestTriggerer): ...@@ -53,11 +53,11 @@ class FakeTriggerer(trigger_gpu_test.GpuTestTriggerer):
def parse_gpu_configs(self, args): def parse_gpu_configs(self, args):
pass pass
def query_swarming_for_gpu_configs(self): def query_swarming_for_gpu_configs(self, verbose):
# Sum up the total count of all bots. # Sum up the total count of all bots.
self._total_bots = sum(x['total'] for x in self._bot_statuses) self._total_bots = sum(x['total'] for x in self._bot_statuses)
def run_swarming(self, args): def run_swarming(self, args, verbose):
self._swarming_runs.append(args) self._swarming_runs.append(args)
...@@ -119,6 +119,7 @@ class UnitTest(unittest.TestCase): ...@@ -119,6 +119,7 @@ class UnitTest(unittest.TestCase):
args = Args() args = Args()
args.shards = 2 args.shards = 2
args.dump_json = 'output.json' args.dump_json = 'output.json'
args.gpu_trigger_script_verbose = False
triggerer.trigger_tasks( triggerer.trigger_tasks(
args, args,
[ [
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment