sandwich: Fixes two sources of KeyError task failures

BUG=623966 Review-Url: https://codereview.chromium.org/2112483002 Cr-Commit-Position: refs/heads/master@{#403916}

sandwich: Fixes two sources of KeyError task failures
BUG=623966 Review-Url: https://codereview.chromium.org/2112483002 Cr-Commit-Position: refs/heads/master@{#403916}
66cf3abb · gabadie · Commit bot · 53450e52 · 66cf3abb
Commit 66cf3abb authored Jul 06, 2016 by gabadie Committed by Commit bot Jul 06, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 42 additions and 5 deletions

tools/android/loading/sandwich_prefetch.py tools/android/loading/sandwich_prefetch.py +42 -5

No files found.
--- a/tools/android/loading/sandwich_prefetch.py
+++ b/tools/android/loading/sandwich_prefetch.py
@@ -21,7 +21,7 @@ import json
 import os
 import re
 import shutil
-from urlparse import urlparse
+import urlparse

 import chrome_cache
 import common_util
@@ -30,6 +30,7 @@ from prefetch_view import PrefetchSimulationView
 from request_dependencies_lens import RequestDependencyLens
 import sandwich_metrics
 import sandwich_runner
+import sandwich_utils
 import task_manager
 import wpr_backend

@@ -69,6 +70,13 @@ SUBRESOURCE_DISCOVERERS = set([
 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$')


+def _NormalizeUrl(url):
+  """Returns the URL with runs of consecutive slashes in its path collapsed."""
+  parsed_url = list(urlparse.urlparse(url))
+  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
+  return urlparse.urlunparse(parsed_url)
+
+
 def _PatchWpr(wpr_archive):
  """Patches a WPR archive to get all resources into the HTTP cache and avoid
  invalidation and revalidations.
@@ -201,7 +209,15 @@ def _PruneOutOriginalNoStoreRequests(original_headers_path, requests):
    original_headers = json.load(file_input)
  pruned_requests = set()
  for request in requests:
-    request_original_headers = original_headers[request.url]
+    url = _NormalizeUrl(request.url)
+    if url not in original_headers:
+      # TODO(gabadie): Investigate why these requests were not in WPR.
+      assert request.failed
+      logging.warning(
+          'could not find original headers for: %s (failure: %s)',
+          url, request.error_text)
+      continue
+    request_original_headers = original_headers[url]
    if ('cache-control' in request_original_headers and
        'no-store' in request_original_headers['cache-control'].lower()):
      pruned_requests.add(request)
@@ -369,7 +385,7 @@ class _RunOutputVerifier(object):
    for request in all_wpr_requests:
      if request.is_wpr_host:
        continue
-      if urlparse(request.url).path.startswith('/web-page-replay'):
+      if urlparse.urlparse(request.url).path.startswith('/web-page-replay'):
        wpr_command_colliding_urls.add(request.url)
      elif request.is_served is False:
        unserved_wpr_urls.add(request.url)
@@ -481,6 +497,7 @@ def _ProcessRunOutputDir(
    served_from_network_bytes = 0
    served_from_cache_bytes = 0
    urls_hitting_network = set()
+    response_sizes = {}
    for request in _FilterOutDataAndIncompleteRequests(
        trace.request_track.GetEvents()):
      # Ignore requests served from the blink's cache.
@@ -488,9 +505,20 @@ def _ProcessRunOutputDir(
        continue
      urls_hitting_network.add(request.url)
      if request.from_disk_cache:
-        served_from_cache_bytes += cached_encoded_data_lengths[request.url]
+        if request.url in cached_encoded_data_lengths:
+          response_size = cached_encoded_data_lengths[request.url]
+        else:
+          # Some large web pages may overflow the memory cache, so some
+          # requests might be served from the disk cache several times per
+          # page load.
+          logging.warning('Looks like could be served from memory cache: %s',
+              request.url)
+          response_size = response_sizes[request.url]
+        served_from_cache_bytes += response_size
      else:
-        served_from_network_bytes += request.GetEncodedDataLength()
+        response_size = request.GetEncodedDataLength()
+        served_from_network_bytes += response_size
+      response_sizes[request.url] = response_size

    # Make sure the served from blink's cache requests have at least one
    # corresponding request that was not served from the blink's cache.
@@ -574,6 +602,15 @@ class PrefetchBenchmarkBuilder(task_manager.Builder):
      # Save up original response headers.
      original_response_headers = {e.url: e.GetResponseHeadersDict() \
          for e in wpr_archive.ListUrlEntries()}
+      logging.info('save up response headers for %d resources',
+                   len(original_response_headers))
+      if not original_response_headers:
+        # TODO(gabadie): How is it possible to not even have the main resource
+        # in the WPR archive? Example URL can be found in:
+        # http://crbug.com/623966#c5
+        raise Exception(
+            'Looks like no resources were recorded in WPR during: {}'.format(
+                self._common_builder.original_wpr_task.name))
      with open(self._original_headers_path, 'w') as file_output:
        json.dump(original_response_headers, file_output)