Commit 4208a052, authored by lizeb, committed by Commit bot

tools/android/loading: ContentClassificationLens, ads and tracking requests.

Adds a lens leveraging a rules file to tag whether a given request is
related to Ads and/or tracking/analytics. This CL also displays this in
the PNG output of the dependency graph.

Review URL: https://codereview.chromium.org/1626393002

Cr-Commit-Position: refs/heads/master@{#371504}
parent 23ab73d8
......@@ -24,6 +24,7 @@ sys.path.append(os.path.join(_SRC_DIR, 'build', 'android'))
import devil_chromium
from pylib import constants
import content_classification_lens
import device_setup
import loading_model
import loading_trace
......@@ -147,10 +148,14 @@ def _FullFetch(url, json_output, prefetch, local, prefetch_delay_seconds):
# TODO(mattcary): it would be nice to refactor so the --noads flag gets dealt
# with here.
def _ProcessRequests(filename):
def _ProcessRequests(filename, ad_rules_filename='',
tracking_rules_filename=''):
with open(filename) as f:
return loading_model.ResourceGraph(
loading_trace.LoadingTrace.FromJsonDict(json.load(f)))
trace = loading_trace.LoadingTrace.FromJsonDict(json.load(f))
content_lens = (
content_classification_lens.ContentClassificationLens.WithRulesFiles(
trace, ad_rules_filename, tracking_rules_filename))
return loading_model.ResourceGraph(trace, content_lens)
def InvalidCommand(cmd):
......@@ -185,8 +190,11 @@ def DoPng(arg_str):
parser.add_argument('--eog', action='store_true')
parser.add_argument('--highlight')
parser.add_argument('--noads', action='store_true')
parser.add_argument('--ad_rules', default='')
parser.add_argument('--tracking_rules', default='')
args = parser.parse_args(arg_str)
graph = _ProcessRequests(args.request_json)
graph = _ProcessRequests(
args.request_json, args.ad_rules, args.tracking_rules)
if args.noads:
graph.Set(node_filter=graph.FilterAds)
tmp = tempfile.NamedTemporaryFile()
......
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Labels requests according to the type of content they represent."""
import adblockparser # Available on PyPI, through pip.
import collections
import os
import loading_trace
import request_track
class ContentClassificationLens(object):
  """Associates requests and frames with the type of content they represent.

  Every request of the trace is run through the ad and tracking rule matchers
  once, at construction time; the query methods only look up the results.
  """

  def __init__(self, trace, ad_rules, tracking_rules):
    """Initializes an instance of ContentClassificationLens.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads.
      tracking_rules: ([str]) List of Adblock+ compatible rules used to
                      classify tracking and analytics.
    """
    self._trace = trace
    self._requests = trace.request_track.GetEvents()
    # The frame of the first page event is taken to be the main frame.
    self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id']
    self._frame_to_requests = collections.defaultdict(list)
    self._ad_requests = set()
    self._tracking_requests = set()
    # Both matchers are built with no_whitelist=True.
    self._ad_matcher = _RulesMatcher(ad_rules, True)
    self._tracking_matcher = _RulesMatcher(tracking_rules, True)
    self._GroupRequestsByFrameId()
    self._LabelRequests()

  def IsAdRequest(self, request):
    """Returns True iff the request matches one of the ad_rules."""
    return request.request_id in self._ad_requests

  def IsTrackingRequest(self, request):
    """Returns True iff the request matches one of the tracking_rules."""
    return request.request_id in self._tracking_requests

  def IsAdFrame(self, frame_id, ratio):
    """Returns whether a frame is an Ad frame.

    A frame is an Ad frame if more than |ratio| of its requests are
    ad-related, and it is not the main frame.

    Args:
      frame_id: (str) ID of the frame to test.
      ratio: (float) Threshold on the fraction of ad-related requests.

    Returns:
      True iff the frame is an Ad frame. The main frame and frames without any
      recorded request are never Ad frames.
    """
    if frame_id == self._main_frame_id:
      return False
    # Use get() instead of indexing: indexing the defaultdict would insert an
    # empty entry for every unknown frame that gets queried.
    request_ids = self._frame_to_requests.get(frame_id, [])
    if not request_ids:
      # No request to compute a ratio from (and avoids dividing by zero).
      return False
    ad_requests_count = sum(r in self._ad_requests for r in request_ids)
    return (float(ad_requests_count) / len(request_ids)) > ratio

  @classmethod
  def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename):
    """Returns an instance of ContentClassificationLens with the rules read
    from files.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules_filename: (str) Path to the ad rules file. A path that does not
                         exist (such as '') yields an empty list of rules.
      tracking_rules_filename: (str) Path to the tracking rules file, handled
                               like ad_rules_filename.
    """
    ad_rules = cls._ReadRules(ad_rules_filename)
    tracking_rules = cls._ReadRules(tracking_rules_filename)
    return cls(trace, ad_rules, tracking_rules)

  @staticmethod
  def _ReadRules(filename):
    """Returns the lines of |filename|, or [] if the file does not exist."""
    if not os.path.exists(filename):
      return []
    # The context manager closes the file as soon as the lines are read.
    with open(filename, 'r') as rules_file:
      return rules_file.readlines()

  def _GroupRequestsByFrameId(self):
    """Fills the frame ID -> [request ID] mapping from the trace requests."""
    for request in self._requests:
      frame_id = request.frame_id
      self._frame_to_requests[frame_id].append(request.request_id)

  def _LabelRequests(self):
    """Records the IDs of the requests matching the ad and tracking rules.

    The two checks are independent, a request can get both labels.
    """
    for request in self._requests:
      request_id = request.request_id
      if self._ad_matcher.Matches(request):
        self._ad_requests.add(request_id)
      if self._tracking_matcher.Matches(request):
        self._tracking_requests.add(request_id)
class _RulesMatcher(object):
  """Tells whether a request is matched by a set of Adblock+ format rules."""

  # Prefix marking a whitelisting (exception) rule.
  _WHITELIST_PREFIX = '@@'

  # Request resource type -> name of the corresponding rule option.
  _RESOURCE_TYPE_TO_OPTIONS_KEY = {
      'Script': 'script',
      'Stylesheet': 'stylesheet',
      'Image': 'image',
      'XHR': 'xmlhttprequest',
  }

  def __init__(self, rules, no_whitelist):
    """Builds a matcher from a list of rules.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be ignored.
    """
    kept_rules = self._FilterRules(rules, no_whitelist)
    self._rules = kept_rules
    self._matcher = adblockparser.AdblockRules(kept_rules)

  def Matches(self, request):
    """Returns whether a request matches one of the rules."""
    target_url = request.url
    match_options = self._GetOptions(request)
    return self._matcher.should_block(target_url, match_options)

  @classmethod
  def _GetOptions(cls, request):
    """Returns the matching options derived from the request resource type.

    The result is {option: True} when the resource type has a known option
    name, and an empty dict otherwise.
    """
    option_name = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(request.resource_type)
    return {option_name: True} if option_name else {}

  @classmethod
  def _FilterRules(cls, rules, no_whitelist):
    """Returns |rules|, without the whitelisting ones when |no_whitelist|."""
    if no_whitelist:
      return [candidate for candidate in rules
              if not candidate.startswith(cls._WHITELIST_PREFIX)]
    return rules
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import copy
import unittest
from content_classification_lens import (ContentClassificationLens,
_RulesMatcher)
from request_track import (Request, TimingFromDict)
import test_utils
class ContentClassificationLensTestCase(unittest.TestCase):
  """Tests request and frame labelling by ContentClassificationLens."""

  # Request shared by the tests; it belongs to the main frame and must not be
  # modified in place, as it is a class attribute.
  _REQUEST = Request.FromJsonDict({'url': 'http://bla.com',
                                   'request_id': '1234.1',
                                   'frame_id': '123.1',
                                   'initiator': {'type': 'other'},
                                   'timestamp': 2,
                                   'timing': TimingFromDict({})})
  _MAIN_FRAME_ID = '123.1'
  _PAGE_EVENTS = [{'method': 'Page.frameStartedLoading',
                   'frame_id': _MAIN_FRAME_ID},
                  {'method': 'Page.frameAttached',
                   'frame_id': '123.13', 'parent_frame_id': _MAIN_FRAME_ID}]
  # Rule matching the URL of _REQUEST.
  _RULES = ['bla.com']

  def testAdRequest(self):
    """A request matching an ad rule is an ad, not a tracking request."""
    trace = test_utils.LoadingTraceFromEvents(
        [self._REQUEST], self._PAGE_EVENTS)
    lens = ContentClassificationLens(trace, self._RULES, [])
    self.assertTrue(lens.IsAdRequest(self._REQUEST))
    self.assertFalse(lens.IsTrackingRequest(self._REQUEST))

  def testTrackingRequest(self):
    """A request matching a tracking rule is tracking, not an ad."""
    trace = test_utils.LoadingTraceFromEvents(
        [self._REQUEST], self._PAGE_EVENTS)
    lens = ContentClassificationLens(trace, [], self._RULES)
    self.assertFalse(lens.IsAdRequest(self._REQUEST))
    self.assertTrue(lens.IsTrackingRequest(self._REQUEST))

  def testMainFrameIsNotAdFrame(self):
    """The main frame is not an ad frame, even with only ad requests."""
    trace = test_utils.LoadingTraceFromEvents(
        [self._REQUEST] * 10, self._PAGE_EVENTS)
    lens = ContentClassificationLens(trace, self._RULES, [])
    self.assertFalse(lens.IsAdFrame(self._MAIN_FRAME_ID, .5))

  def testAdFrame(self):
    """A non-main frame with a majority of ad requests is an ad frame."""
    # Work on a copy: changing the frame of the shared _REQUEST would both leak
    # into the other tests and move the 5 "main frame" requests below into the
    # ad frame as well.
    request = copy.deepcopy(self._REQUEST)
    request.frame_id = '123.123'
    trace = test_utils.LoadingTraceFromEvents(
        [request] * 10 + [self._REQUEST] * 5, self._PAGE_EVENTS)
    lens = ContentClassificationLens(trace, self._RULES, [])
    self.assertTrue(lens.IsAdFrame(request.frame_id, .5))
class _MatcherTestCase(unittest.TestCase):
  """Tests rule filtering and option-aware matching of _RulesMatcher."""

  # One blocking rule followed by two whitelisting ('@@') rules.
  _RULES_WITH_WHITELIST = ['/thisisanad.', '@@myadvertisingdomain.com/*',
                           '@@||www.mydomain.com/ads/$elemhide']
  # Rule restricted to script resources by its '$script' option.
  _SCRIPT_RULE = 'domainwithscripts.com/*$script'
  _SCRIPT_REQUEST = Request.FromJsonDict(
      {'url': 'http://domainwithscripts.com/bla.js',
       'resource_type': 'Script',
       'request_id': '1234.1',
       'frame_id': '123.1',
       'initiator': {'type': 'other'},
       'timestamp': 2,
       'timing': TimingFromDict({})})

  def testRemovesWhitelistRules(self):
    """Whitelisting rules are dropped only when no_whitelist is True."""
    # assertEqual is used rather than its deprecated alias assertEquals.
    matcher = _RulesMatcher(self._RULES_WITH_WHITELIST, False)
    self.assertEqual(3, len(matcher._rules))
    matcher = _RulesMatcher(self._RULES_WITH_WHITELIST, True)
    self.assertEqual(1, len(matcher._rules))

  def testScriptRule(self):
    """A '$script' rule matches a Script request but not a Stylesheet one."""
    matcher = _RulesMatcher([self._SCRIPT_RULE], False)
    # Copy so that the shared class-level request keeps its resource type.
    request = copy.deepcopy(self._SCRIPT_REQUEST)
    request.resource_type = 'Stylesheet'
    self.assertFalse(matcher.Matches(request))
    self.assertTrue(matcher.Matches(self._SCRIPT_REQUEST))
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
  unittest.main()
......@@ -30,14 +30,17 @@ class ResourceGraph(object):
Set parameters:
cache_all: if true, assume zero loading time for all resources.
"""
def __init__(self, trace):
def __init__(self, trace, content_lens=None):
"""Create from a LoadingTrace (or json of a trace).
Args:
trace: (LoadingTrace/JSON) Loading trace or JSON of a trace.
content_lens: (ContentClassificationLens) Lens used to annotate the
nodes, or None.
"""
if type(trace) == dict:
trace = loading_trace.LoadingTrace.FromJsonDict(trace)
self._content_lens = content_lens
self._BuildDag(trace)
self._global_start = min([n.StartTime() for n in self._node_info])
# Sort before splitting children so that we can correctly detect if a
......@@ -339,6 +342,8 @@ class ResourceGraph(object):
request: The request associated with this node.
"""
self._request = request
self._is_ad = False
self._is_tracking = False
self._node = node
self._edge_costs = {}
self._edge_annotations = {}
......@@ -357,6 +362,21 @@ class ResourceGraph(object):
def Index(self):
  """Returns the index reported by the node wrapped by this object."""
  wrapped_node = self._node
  return wrapped_node.Index()
def SetRequestContent(self, is_ad, is_tracking):
  """Records the kind of content the request relates to.

  Args:
    is_ad: (bool) Whether the request is an Ad.
    is_tracking: (bool) Whether the request is related to tracking.
  """
  self._is_ad = is_ad
  self._is_tracking = is_tracking
def IsAd(self):
  """Returns the stored flag telling whether the request is an Ad."""
  return self._is_ad
def IsTracking(self):
  """Returns the stored flag telling whether the request is tracking-related."""
  return self._is_tracking
def Request(self):
  """Returns the request associated with this node."""
  return self._request
......@@ -481,6 +501,9 @@ class ResourceGraph(object):
index_by_request[request] = next_index
node = dag.Node(next_index)
node_info = self._NodeInfo(node, request)
if self._content_lens:
node.SetRequestContent(self._content_lens.IsAdRequest(request),
self._content_lens.IsTrackingRequest(request))
self._nodes.append(node)
self._node_info.append(node_info)
......@@ -606,6 +629,8 @@ class ResourceGraph(object):
if fragment in node_info.Url():
styles.append('dotted')
break
if node_info.IsAd() or node_info.IsTracking():
styles += ['bold', 'diagonals']
return ('%d [label = "%s\\n%.2f->%.2f (%.2f)"; style = "%s"; '
'fillcolor = %s; shape = %s];\n'
% (index, node_info.ShortName(),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment