Commit 91ce1b14 authored by Juan Antonio Navarro Pérez, committed by Commit Bot

[pinboard] Fetch state from cloud storage if not found locally

This makes it easier to transfer the state from one workstation to
another without having to fetch the results of all past pinpoint jobs
again. (A minimal sketch of the local-then-cloud fallback pattern
follows the commit metadata below.)

Bug: 1029039
Change-Id: I83eebe5d65cdda80b2f9255a584dd86ecd40f7bc
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1942047
Reviewed-by: Sami Kyöstilä <skyostil@chromium.org>
Reviewed-by: Ross McIlroy <rmcilroy@chromium.org>
Commit-Queue: Juan Antonio Navarro Pérez <perezju@chromium.org>
Cr-Commit-Position: refs/heads/master@{#719987}
parent 9e24d318
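
For context, the change boils down to a "use the local cached file if present, otherwise pull a copy from cloud storage" fallback. The standalone sketch below is not part of the CL: the bucket path and lower-case names are illustrative, and it assumes the gsutil command is on PATH. It shows the same pattern that DownloadFromCloudStorage and LoadJobsState implement in the diff that follows.

import json
import logging
import os
import posixpath
import subprocess

CLOUD_STORAGE_DIR = 'gs://example-bucket/pinboard'  # hypothetical bucket path


def download_from_cloud_storage(filepath):
  """Try to fetch a copy of |filepath| from cloud storage; report success."""
  src = posixpath.join(CLOUD_STORAGE_DIR, os.path.basename(filepath))
  try:
    subprocess.check_call(['gsutil', 'cp', src, filepath])
    logging.info('Downloaded copy of %s from cloud storage.', filepath)
    return True
  except subprocess.CalledProcessError:
    logging.info('Failed to download copy of %s from cloud storage.', filepath)
    return False


def load_jobs_state(local_path):
  """Load state from disk, falling back to a cloud copy, else start empty."""
  if os.path.exists(local_path) or download_from_cloud_storage(local_path):
    with open(local_path) as f:
      return json.load(f)
  logging.info('No jobs state found. Creating empty state.')
  return []
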
@@ -142,9 +142,10 @@ def CollectPinpointResults(state):
 
 def LoadJobsState():
   """Load the latest recorded state of pinpoint jobs."""
   local_path = CachedFilePath(JOBS_STATE_FILE)
-  if os.path.exists(local_path):
+  if os.path.exists(local_path) or DownloadFromCloudStorage(local_path):
     return LoadJsonFile(local_path)
   else:
     logging.info('No jobs state found. Creating empty state.')
     return []
@@ -164,26 +165,52 @@ def UpdateJobsState(state):
   UploadToCloudStorage(local_path)
 
 
-def AggregateAndUploadResults(state):
-  """Aggregate results collected and upload them to cloud storage."""
-  cached_results = CachedFilePath(DATASET_PKL_FILE)
-  dfs = []
+def GetCachedDataset():
+  """Load the latest dataset with cached data."""
+  local_path = CachedFilePath(DATASET_PKL_FILE)
+  if os.path.exists(local_path) or DownloadFromCloudStorage(local_path):
+    return pd.read_pickle(local_path)
+  else:
+    return None
+
+
+def UpdateCachedDataset(df):
+  """Write back the dataset with cached data."""
+  local_path = CachedFilePath(DATASET_PKL_FILE)
+  df.to_pickle(local_path)
+  UploadToCloudStorage(local_path)
+
 
-  keep_revisions = set(item['revision'] for item in state)
-  if os.path.exists(cached_results):
-    # To speed things up, we take the cache computed from previous results.
-    df = pd.read_pickle(cached_results)
-    # Drop possible old data from revisions no longer in recent state.
-    df = df[df['revision'].isin(keep_revisions)]
-    dfs.append(df)
+def GetItemsToUpdate(state):
+  """Select jobs with new data to download and cached data for existing jobs.
+
+  This also filters out old revisions to keep only recent (6 months) data.
+
+  Returns:
+    new_items: A list of job items from which to get data.
+    cached_df: A DataFrame with existing cached data, may be None.
+  """
+  from_date = str(TimeAgo(months=6).date())
+  new_items = [item for item in state if item['timestamp'] > from_date]
+  df = GetCachedDataset()
+  if df is not None:
+    recent_revisions = set(item['revision'] for item in new_items)
+    df = df[df['revision'].isin(recent_revisions)]
     known_revisions = set(df['revision'])
   else:
     known_revisions = set()
+  new_items = [
+      item for item in new_items if item['revision'] not in known_revisions]
+  return new_items, df
+
+
+def AggregateAndUploadResults(new_items, cached_df=None):
+  """Aggregate results collected and upload them to cloud storage."""
+  dfs = []
+  if cached_df is not None:
+    dfs.append(cached_df)
 
   found_new = False
-  for item in state:
-    if item['revision'] in known_revisions or _SkipProcessing(item):
-      # Revision is already in cache, jobs are not ready, or all have failed.
+  for item in new_items:
+    if _SkipProcessing(item):  # Jobs are not ready, or all have failed.
       continue
     if not found_new:
       logging.info('Processing data from new results:')
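
The incremental update in GetItemsToUpdate above hinges on two filters: drop cached rows for revisions that fell out of the recent window, then drop items whose revisions the cache already covers. A toy example with made-up revisions and a hard-coded cut-off date (standing in for str(TimeAgo(months=6).date())) illustrates the effect:

import pandas as pd

# Toy stand-ins for the jobs state and the cached dataset.
state = [
    {'revision': 'a100', 'timestamp': '2019-03-14'},
    {'revision': 'a200', 'timestamp': '2019-03-16'},
    {'revision': 'a500', 'timestamp': '2019-09-01'},
]
cached_df = pd.DataFrame({'revision': ['a100', 'a200'], 'value': [1.0, 2.0]})

from_date = '2019-03-15'  # pretend this came from TimeAgo(months=6)
new_items = [item for item in state if item['timestamp'] > from_date]

# Keep only cached rows for revisions still inside the recent window.
recent_revisions = set(item['revision'] for item in new_items)
cached_df = cached_df[cached_df['revision'].isin(recent_revisions)]

# Only revisions missing from the cache still need processing.
known_revisions = set(cached_df['revision'])
new_items = [i for i in new_items if i['revision'] not in known_revisions]

print(new_items)  # only 'a500' is both recent and not yet cached
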
@@ -197,7 +224,7 @@ def AggregateAndUploadResults(state):
 
   # Otherwise update our cache and upload.
   df = pd.concat(dfs, ignore_index=True)
-  df.to_pickle(cached_results)
+  UpdateCachedDataset(df)
 
   # Drop revisions with no results and mark the last result for each metric,
   # both with/without patch, as a 'reference'. This allows making score cards
@@ -327,6 +354,18 @@ def UploadToCloudStorage(filepath):
       filepath, posixpath.join(CLOUD_STORAGE_DIR, os.path.basename(filepath)))
 
 
+def DownloadFromCloudStorage(filepath):
+  """Get the given file from cloud storage."""
+  try:
+    gsutil.Copy(
+        posixpath.join(CLOUD_STORAGE_DIR, os.path.basename(filepath)), filepath)
+    logging.info('Downloaded copy of %s from cloud storage.', filepath)
+    return True
+  except subprocess.CalledProcessError:
+    logging.info('Failed to download copy of %s from cloud storage.', filepath)
+    return False
+
+
 def LoadJsonFile(filename):
   with open(filename) as f:
     return json.load(f)
@@ -353,12 +392,6 @@ def SetUpLogging(level):
   logger.addHandler(h2)
 
 
-def SelectRecentRevisions(state):
-  """Filter out old revisions from state to keep only recent (6 months) data."""
-  from_date = str(TimeAgo(months=6).date())
-  return [item for item in state if item['timestamp'] > from_date]
-
-
 def Main():
   SetUpLogging(level=logging.INFO)
   actions = ('start', 'collect', 'upload')
@@ -377,15 +410,19 @@ def Main():
     logging.info('=== auto run for %s ===', args.date)
     args.actions = actions
 
+  cached_results_dir = CachedFilePath('job_results')
+  if not os.path.isdir(cached_results_dir):
+    os.makedirs(cached_results_dir)
+
   state = LoadJobsState()
   try:
     if 'start' in args.actions:
       StartPinpointJobs(state, args.date)
-    recent_state = SelectRecentRevisions(state)
+    new_items, cached_df = GetItemsToUpdate(state)
     if 'collect' in args.actions:
-      CollectPinpointResults(recent_state)
+      CollectPinpointResults(new_items)
   finally:
     UpdateJobsState(state)
 
   if 'upload' in args.actions:
-    AggregateAndUploadResults(recent_state)
+    AggregateAndUploadResults(new_items, cached_df)
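
Putting the pieces together, the updated Main() above separates filtering (GetItemsToUpdate) from aggregation (AggregateAndUploadResults). A simplified sketch of the resulting flow, assuming the pinboard module and the helpers from this CL are importable as cli_tools.pinboard.pinboard; run() and its arguments are illustrative, not part of the CL:

from cli_tools.pinboard import pinboard


def run(actions, date):
  # Loads the local state file, or a copy fetched from cloud storage.
  state = pinboard.LoadJobsState()
  try:
    if 'start' in actions:
      pinboard.StartPinpointJobs(state, date)
    # Recent (6 months) revisions that are not yet in the cached dataset.
    new_items, cached_df = pinboard.GetItemsToUpdate(state)
    if 'collect' in actions:
      pinboard.CollectPinpointResults(new_items)
  finally:
    # Always record the (possibly updated) state, locally and in the cloud.
    pinboard.UpdateJobsState(state)
  if 'upload' in actions:
    # Aggregate only the new items, reusing the cached DataFrame.
    pinboard.AggregateAndUploadResults(new_items, cached_df)
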
@@ -34,6 +34,9 @@ class PinboardToolTests(unittest.TestCase):
         'cli_tools.pinboard.pinboard.subprocess').start()
     self.upload_to_cloud = mock.patch(
         'cli_tools.pinboard.pinboard.UploadToCloudStorage').start()
+    self.download_from_cloud = mock.patch(
+        'cli_tools.pinboard.pinboard.DownloadFromCloudStorage').start()
+    self.download_from_cloud.return_value = False
 
   def tearDown(self):
     mock.patch.stopall()
@@ -109,7 +112,8 @@ class PinboardToolTests(unittest.TestCase):
     self.assertEqual(self.upload_to_cloud.call_count, 2)
 
   @mock.patch('cli_tools.pinboard.pinboard.GetRevisionResults')
-  def testAggregateAndUploadResults(self, get_revision_results):
+  @mock.patch('cli_tools.pinboard.pinboard.TimeAgo')
+  def testAggregateAndUploadResults(self, time_ago, get_revision_results):
     state = [
         StateItem('a100', timestamp='2019-03-15', job1='completed'),
         StateItem('a200', timestamp='2019-03-16', job2='completed'),
@@ -129,26 +133,31 @@ class PinboardToolTests(unittest.TestCase):
       return df
 
     get_revision_results.side_effect = GetFakeResults
+    time_ago.return_value = pd.Timestamp('2018-10-20')
 
     # Only process first few revisions.
-    pinboard.AggregateAndUploadResults(state[:3])
+    new_items, cached_df = pinboard.GetItemsToUpdate(state[:3])
+    pinboard.AggregateAndUploadResults(new_items, cached_df)
     dataset_file = pinboard.CachedFilePath(pinboard.DATASET_CSV_FILE)
     df = pd.read_csv(dataset_file)
    self.assertEqual(set(df['revision']), set(['a100', 'a200']))
     self.assertTrue((df[df['reference']]['revision'] == 'a200').all())
 
     # Incrementally process the rest.
-    pinboard.AggregateAndUploadResults(state)
+    new_items, cached_df = pinboard.GetItemsToUpdate(state)
+    pinboard.AggregateAndUploadResults(new_items, cached_df)
     dataset_file = pinboard.CachedFilePath(pinboard.DATASET_CSV_FILE)
     df = pd.read_csv(dataset_file)
     self.assertEqual(set(df['revision']), set(['a100', 'a200', 'a500']))
     self.assertTrue((df[df['reference']]['revision'] == 'a500').all())
 
     # No new revisions. This should be a no-op.
-    pinboard.AggregateAndUploadResults(state)
+    new_items, cached_df = pinboard.GetItemsToUpdate(state)
+    pinboard.AggregateAndUploadResults(new_items, cached_df)
 
     self.assertEqual(get_revision_results.call_count, 4)
-    self.assertEqual(self.upload_to_cloud.call_count, 2)
+    # Uploads twice (the pkl and csv) on each call to aggregate results.
+    self.assertEqual(self.upload_to_cloud.call_count, 2 * 2)
 
   def testGetRevisionResults_simple(self):
     item = StateItem('2a66ba', timestamp='2019-03-17T23:50:16-07:00')