Commit 553b419e authored by erikchen, committed by Commit bot

Telemetry: Add WPR data for large profiles.

This CL adds a new page set ProfileSafeUrlsPageSet, which is used by the
CookieProfileExtender. This CL also hooks up WPR to the CookieProfileExtender.

BUG=472745

Review URL: https://codereview.chromium.org/1070053004

Cr-Commit-Position: refs/heads/master@{#325324}
parent 46237053
5f21e6c824007764cd75cff772be40dee5b47c4a
\ No newline at end of file
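The collapsed file above appears to be a Cloud Storage pointer: Chromium checks in a small .sha1 file holding the SHA-1 digest of the real WPR archive, and tooling such as download_from_google_storage fetches the blob by that hash. A minimal sketch of how such a digest is produced (the archive filename below is hypothetical):

import hashlib

def sha1_of_file(path):
  # Stream the file in 1 MiB chunks so large .wpr archives fit in memory.
  digest = hashlib.sha1()
  with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
      digest.update(chunk)
  return digest.hexdigest()

print(sha1_of_file('data/profile_safe_urls_000.wpr'))  # hypothetical path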
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from profile_creators import profile_safe_url_list
from telemetry.page import page as page_module
from telemetry.page import page_set as page_set_module


class ProfileSafeUrlPage(page_module.Page):

  def __init__(self, url, page_set):
    super(ProfileSafeUrlPage, self).__init__(
        url=url,
        page_set=page_set,
        credentials_path='data/credentials.json')
    self.credentials = 'google'


class ProfileSafeUrlsPageSet(page_set_module.PageSet):
  """Safe urls used for profile generation."""

  def __init__(self):
    super(ProfileSafeUrlsPageSet, self).__init__(
        archive_data_file='data/profile_safe_urls.json',
        user_agent_type='desktop',
        bucket=page_set_module.PARTNER_BUCKET)

    # Only use the first 500 urls to prevent the .wpr files from getting too
    # big.
    safe_urls = profile_safe_url_list.GetShuffledSafeUrls()[0:500]
    for safe_url in safe_urls:
      self.AddUserStory(ProfileSafeUrlPage(safe_url, self))
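For orientation, a minimal sketch of how this page set can be inspected, assuming only the Telemetry APIs already used in this CL (the counts follow the code above, not new behavior):

import page_sets

page_set = page_sets.ProfileSafeUrlsPageSet()
# Each user story is a ProfileSafeUrlPage wrapping one safe url.
print(len(page_set.user_stories))    # at most 500
print(page_set.user_stories[0].url)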
@@ -5,13 +5,14 @@ import multiprocessing
 import os
 import sqlite3

+import page_sets
 from profile_creators import fast_navigation_profile_extender
 from profile_creators import profile_safe_url_list


 class CookieProfileExtender(
     fast_navigation_profile_extender.FastNavigationProfileExtender):
-  """This extender performs a large number of navigations (up to 500), with the
-  goal of filling out the cookie database.
+  """This extender fills in the cookie database.

   By default, Chrome purges the cookie DB down to 3300 cookies. However, it
   won't purge cookies accessed in the last month. This means the extender needs
@@ -28,9 +29,12 @@ class CookieProfileExtender(
     # A list of urls that have not yet been navigated to. This list will shrink
     # over time. Each navigation will add a diminishing number of new cookies,
-    # since there's a high probability that the cookie is already present. If
-    # the cookie DB isn't full by 500 navigations, just give up.
-    self._navigation_urls = profile_safe_url_list.GetShuffledSafeUrls()[0:500]
+    # since there's a high probability that the cookie is already present.
+    self._page_set = page_sets.ProfileSafeUrlsPageSet()
+    urls = []
+    for user_story in self._page_set.user_stories:
+      urls.append(user_story.url)
+    self._navigation_urls = urls

   def GetUrlIterator(self):
     """Superclass override."""
@@ -40,6 +44,14 @@
     """Superclass override."""
     return self._IsCookieDBFull()
+
+  def WebPageReplayArchivePath(self):
+    return self._page_set.WprFilePathForUserStory(
+        self._page_set.user_stories[0])
+
+  def FetchWebPageReplayArchives(self):
+    """Superclass override."""
+    self._page_set.wpr_archive_info.DownloadArchivesIfNeeded()

   @staticmethod
   def _CookieCountInDB(db_path):
     """The number of cookies in the db at |db_path|."""
@@ -21,6 +21,12 @@ class _HRefParser(HTMLParser.HTMLParser):
         self.hrefs.append(value)


+def _AbsoluteUrlHasSaneScheme(absolute_url):
+  if len(absolute_url) < 4:
+    return False
+  return absolute_url[0:4] == 'http'
+
+
 def GenerateSafeUrls():
   """Prints a list of safe urls.
@@ -31,7 +37,7 @@ def GenerateSafeUrls():
   # A list of websites whose hrefs are unlikely to link to sites that contain
   # malware.
   seed_urls = [
-      "https://www.cnn.com",
+      "http://www.cnn.com",
       "https://www.youtube.com",
       "https://www.facebook.com",
       "https://www.twitter.com",
@@ -76,6 +82,8 @@ def GenerateSafeUrls():
       continue

     absolute_url = urlparse.urljoin(url, relative_url)
+    if not _AbsoluteUrlHasSaneScheme(absolute_url):
+      continue
     safe_urls.add(absolute_url)

   # Sort the urls, to make them easier to view in bulk.
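For reference, urlparse.urljoin resolves each scraped href against the page it came from, so the scheme check above runs on the fully resolved url. A short sketch (Python 2 module name, as in this codebase):

import urlparse

base = 'https://www.youtube.com/watch'
print(urlparse.urljoin(base, '/results?q=telemetry'))
# -> https://www.youtube.com/results?q=telemetry
print(urlparse.urljoin(base, 'javascript:void(0)'))
# -> javascript:void(0), which _AbsoluteUrlHasSaneScheme then rejects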
@@ -7,16 +7,18 @@ import random

 def GetShuffledSafeUrls():
-  """Returns a deterministic shuffling of safe urls. The profile generators
-  access the urls in order, and the urls are grouped by domain. The shuffling
-  reduces the load on external servers."""
+  """Returns a deterministic shuffling of safe urls.
+
+  The profile generators access the urls in order, and the urls are grouped by
+  domain. The shuffling reduces the load on external servers.
+  """
   random.seed(0)
-  url_list_copy = list(_GetSafeUrls())
+  url_list_copy = list(GetSafeUrls())
   random.shuffle(url_list_copy)
   return url_list_copy


-def _GetSafeUrls():
+def GetSafeUrls():
   """Returns a list of safe urls by loading them from a pre-generated file."""
   safe_url_dir = os.path.dirname(os.path.realpath(__file__))
   safe_url_path = os.path.join(safe_url_dir, "profile_safe_url_list.json")
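Seeding the generator before shuffling is what makes the ordering reproducible across runs; a tiny stand-alone demonstration of that idea with placeholder urls:

import random

def shuffled_copy(items):
  # Re-seeding before each shuffle pins the permutation.
  random.seed(0)
  copy = list(items)
  random.shuffle(copy)
  return copy

urls = ['http://a.example', 'http://b.example', 'http://c.example']
assert shuffled_copy(urls) == shuffled_copy(urls)  # deterministic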