tools/android/loading: ContentClassificationLens, ads and tracking requests.

Adds a lens leveraging a rules file to tag whether a given request is related to Ads and/or tracking/analytics. This CL also displays this in the PNG output of the dependency graph. Review URL: https://codereview.chromium.org/1626393002 Cr-Commit-Position: refs/heads/master@{#371504}

tools/android/loading: ContentClassificationLens, ads and tracking requests.
Adds a lens leveraging a rules file to tag whether a given request is related to Ads and/or tracking/analytics. This CL also displays this in the PNG output of the dependency graph. Review URL: https://codereview.chromium.org/1626393002 Cr-Commit-Position: refs/heads/master@{#371504}
4208a052 · lizeb · Commit bot · 23ab73d8 · 4208a052 · 4208a052
Commit 4208a052 authored Jan 26, 2016 by lizeb Committed by Commit bot Jan 26, 2016
4 changed files
--- a/tools/android/loading/analyze.py
+++ b/tools/android/loading/analyze.py
@@ -24,6 +24,7 @@ sys.path.append(os.path.join(_SRC_DIR, 'build', 'android'))
 import devil_chromium
 from pylib import constants

+import content_classification_lens
 import device_setup
 import loading_model
 import loading_trace
@@ -147,10 +148,14 @@ def _FullFetch(url, json_output, prefetch, local, prefetch_delay_seconds):

 # TODO(mattcary): it would be nice to refactor so the --noads flag gets dealt
 # with here.
-def _ProcessRequests(filename):
+def _ProcessRequests(filename, ad_rules_filename='',
+                     tracking_rules_filename=''):
+  """Builds a ResourceGraph from a JSON trace file.
+
+  Args:
+    filename: (str) Path to the JSON dump of a loading trace.
+    ad_rules_filename: (str) Path to the rules file passed to
+                       ContentClassificationLens.WithRulesFiles() for ads.
+    tracking_rules_filename: (str) Same, for tracking/analytics rules.
+
+  Returns:
+    A loading_model.ResourceGraph built from the trace and the content lens.
+  """
  with open(filename) as f:
-    return loading_model.ResourceGraph(
-        loading_trace.LoadingTrace.FromJsonDict(json.load(f)))
+    trace = loading_trace.LoadingTrace.FromJsonDict(json.load(f))
+    content_lens = (
+        content_classification_lens.ContentClassificationLens.WithRulesFiles(
+            trace, ad_rules_filename, tracking_rules_filename))
+    return loading_model.ResourceGraph(trace, content_lens)


 def InvalidCommand(cmd):
@@ -185,8 +190,11 @@ def DoPng(arg_str):
  parser.add_argument('--eog', action='store_true')
  parser.add_argument('--highlight')
  parser.add_argument('--noads', action='store_true')
+  parser.add_argument('--ad_rules', default='')
+  parser.add_argument('--tracking_rules', default='')
  args = parser.parse_args(arg_str)
-  graph = _ProcessRequests(args.request_json)
+  graph = _ProcessRequests(
+      args.request_json, args.ad_rules, args.tracking_rules)
  if args.noads:
    graph.Set(node_filter=graph.FilterAds)
  tmp = tempfile.NamedTemporaryFile()

--- a/tools/android/loading/content_classification_lens.py
+++ b/tools/android/loading/content_classification_lens.py
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Labels requests according to the type of content they represent."""
+
+import adblockparser # Available on PyPI, through pip.
+import collections
+import os
+
+import loading_trace
+import request_track
+
+
+class ContentClassificationLens(object):
+  """Associates requests and frames with the type of content they represent."""
+  def __init__(self, trace, ad_rules, tracking_rules):
+    """Initializes an instance of ContentClassificationLens.
+
+    Args:
+      trace: (LoadingTrace) loading trace.
+      ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads.
+      tracking_rules: ([str]) List of Adblock+ compatible rules used to
+                      classify tracking and analytics.
+    """
+    self._trace = trace
+    self._requests = trace.request_track.GetEvents()
+    # The frame of the first page event is taken to be the main frame.
+    self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id']
+    self._frame_to_requests = collections.defaultdict(list)
+    self._ad_requests = set()
+    self._tracking_requests = set()
+    # Whitelisting rules are ignored by both matchers (no_whitelist=True).
+    self._ad_matcher = _RulesMatcher(ad_rules, True)
+    self._tracking_matcher = _RulesMatcher(tracking_rules, True)
+    self._GroupRequestsByFrameId()
+    self._LabelRequests()
+
+  def IsAdRequest(self, request):
+    """Returns True iff the request matches one of the ad_rules."""
+    return request.request_id in self._ad_requests
+
+  def IsTrackingRequest(self, request):
+    """Returns True iff the request matches one of the tracking_rules."""
+    return request.request_id in self._tracking_requests
+
+  def IsAdFrame(self, frame_id, ratio):
+    """Returns True iff more than |ratio| of the frame's requests are ads.
+
+    The main frame is never an Ad frame, and neither is a frame for which no
+    request has been recorded.
+
+    Args:
+      frame_id: (str) ID of the frame to classify.
+      ratio: (float) Threshold on the fraction of ad-related requests.
+    """
+    if frame_id == self._main_frame_id:
+      return False
+    # Use .get() instead of indexing: indexing the defaultdict would insert an
+    # empty entry for every unknown frame that is queried.
+    frame_requests = self._frame_to_requests.get(frame_id, [])
+    if not frame_requests:
+      # No request at all: avoid dividing by zero below.
+      return False
+    ad_requests_count = sum(r in self._ad_requests for r in frame_requests)
+    return (float(ad_requests_count) / len(frame_requests)) > ratio
+
+  @classmethod
+  def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename):
+    """Returns an instance of ContentClassificationLens with the rules read
+    from files.
+
+    A filename that does not exist (including the empty string) results in an
+    empty list of rules for the corresponding category.
+
+    Args:
+      trace: (LoadingTrace) loading trace.
+      ad_rules_filename: (str) Path to the ad rules file.
+      tracking_rules_filename: (str) Path to the tracking rules file.
+    """
+    ad_rules = []
+    tracking_rules = []
+    # Files are read within "with" blocks so that they get closed right away.
+    if os.path.exists(ad_rules_filename):
+      with open(ad_rules_filename, 'r') as rules_file:
+        ad_rules = rules_file.readlines()
+    if os.path.exists(tracking_rules_filename):
+      with open(tracking_rules_filename, 'r') as rules_file:
+        tracking_rules = rules_file.readlines()
+    return cls(trace, ad_rules, tracking_rules)
+
+  def _GroupRequestsByFrameId(self):
+    """Fills the frame ID -> [request ID] mapping from the requests."""
+    for request in self._requests:
+      frame_id = request.frame_id
+      self._frame_to_requests[frame_id].append(request.request_id)
+
+  def _LabelRequests(self):
+    """Records the IDs of the requests matching the ad and tracking rules.
+
+    A request can be labeled as both ad and tracking.
+    """
+    for request in self._requests:
+      request_id = request.request_id
+      if self._ad_matcher.Matches(request):
+        self._ad_requests.add(request_id)
+      if self._tracking_matcher.Matches(request):
+        self._tracking_requests.add(request_id)
+
+
+class _RulesMatcher(object):
+  """Checks requests against a list of rules in Adblock+ format."""
+  _WHITELIST_PREFIX = '@@'
+  _RESOURCE_TYPE_TO_OPTIONS_KEY = {
+      'Script': 'script', 'Stylesheet': 'stylesheet', 'Image': 'image',
+      'XHR': 'xmlhttprequest'}
+  def __init__(self, rules, no_whitelist):
+    """Builds a matcher from a list of rules.
+
+    Args:
+      rules: ([str]) list of rules.
+      no_whitelist: (bool) Whether the whitelisting rules should be ignored.
+    """
+    self._rules = self._FilterRules(rules, no_whitelist)
+    self._matcher = adblockparser.AdblockRules(self._rules)
+
+  def Matches(self, request):
+    """Returns whether a request matches one of the rules."""
+    return self._matcher.should_block(request.url, self._GetOptions(request))
+
+  @classmethod
+  def _GetOptions(cls, request):
+    """Returns the matcher options corresponding to the request's type.
+
+    The result is empty when the resource type has no known option key.
+    """
+    key = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(request.resource_type)
+    if not key:
+      return {}
+    return {key: True}
+
+  @classmethod
+  def _FilterRules(cls, rules, no_whitelist):
+    """Returns |rules|, without the whitelisting ones if |no_whitelist|."""
+    if no_whitelist:
+      return [r for r in rules if not r.startswith(cls._WHITELIST_PREFIX)]
+    return rules
--- a/tools/android/loading/content_classification_lens_unittest.py
+++ b/tools/android/loading/content_classification_lens_unittest.py
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import copy
+import unittest
+
+from content_classification_lens import (ContentClassificationLens,
+                                         _RulesMatcher)
+from request_track import (Request, TimingFromDict)
+import test_utils
+
+
+class ContentClassificationLensTestCase(unittest.TestCase):
+  """Tests for ContentClassificationLens."""
+  _REQUEST = Request.FromJsonDict({'url': 'http://bla.com',
+                                   'request_id': '1234.1',
+                                   'frame_id': '123.1',
+                                   'initiator': {'type': 'other'},
+                                   'timestamp': 2,
+                                   'timing': TimingFromDict({})})
+  _MAIN_FRAME_ID = '123.1'
+  _PAGE_EVENTS = [{'method': 'Page.frameStartedLoading',
+                   'frame_id': _MAIN_FRAME_ID},
+                  {'method': 'Page.frameAttached',
+                   'frame_id': '123.13', 'parent_frame_id': _MAIN_FRAME_ID}]
+  _RULES = ['bla.com']
+
+  def testAdRequest(self):
+    trace = test_utils.LoadingTraceFromEvents(
+        [self._REQUEST], self._PAGE_EVENTS)
+    lens = ContentClassificationLens(trace, self._RULES, [])
+    self.assertTrue(lens.IsAdRequest(self._REQUEST))
+    self.assertFalse(lens.IsTrackingRequest(self._REQUEST))
+
+  def testTrackingRequest(self):
+    trace = test_utils.LoadingTraceFromEvents(
+        [self._REQUEST], self._PAGE_EVENTS)
+    lens = ContentClassificationLens(trace, [], self._RULES)
+    self.assertFalse(lens.IsAdRequest(self._REQUEST))
+    self.assertTrue(lens.IsTrackingRequest(self._REQUEST))
+
+  def testMainFrameIsNotAdFrame(self):
+    trace = test_utils.LoadingTraceFromEvents(
+        [self._REQUEST] * 10, self._PAGE_EVENTS)
+    lens = ContentClassificationLens(trace, self._RULES, [])
+    self.assertFalse(lens.IsAdFrame(self._MAIN_FRAME_ID, .5))
+
+  def testAdFrame(self):
+    # Work on a copy: mutating the shared _REQUEST would move all the requests
+    # of this test to the child frame, and leak into the other tests.
+    request = copy.deepcopy(self._REQUEST)
+    request.frame_id = '123.123'
+    trace = test_utils.LoadingTraceFromEvents(
+        [request] * 10 + [self._REQUEST] * 5, self._PAGE_EVENTS)
+    lens = ContentClassificationLens(trace, self._RULES, [])
+    self.assertTrue(lens.IsAdFrame(request.frame_id, .5))
+
+
+class _MatcherTestCase(unittest.TestCase):
+  """Tests for _RulesMatcher."""
+  _RULES_WITH_WHITELIST = ['/thisisanad.', '@@myadvertisingdomain.com/*',
+                           '@@||www.mydomain.com/ads/$elemhide']
+  _SCRIPT_RULE = 'domainwithscripts.com/*$script'
+  _SCRIPT_REQUEST = Request.FromJsonDict(
+      {'url': 'http://domainwithscripts.com/bla.js',
+       'resource_type': 'Script',
+       'request_id': '1234.1',
+       'frame_id': '123.1',
+       'initiator': {'type': 'other'},
+       'timestamp': 2,
+       'timing': TimingFromDict({})})
+
+  def testRemovesWhitelistRules(self):
+    # assertEqual rather than the deprecated assertEquals alias.
+    matcher = _RulesMatcher(self._RULES_WITH_WHITELIST, False)
+    self.assertEqual(3, len(matcher._rules))
+    matcher = _RulesMatcher(self._RULES_WITH_WHITELIST, True)
+    self.assertEqual(1, len(matcher._rules))
+
+  def testScriptRule(self):
+    matcher = _RulesMatcher([self._SCRIPT_RULE], False)
+    # The rule only applies to scripts, a stylesheet must not match.
+    request = copy.deepcopy(self._SCRIPT_REQUEST)
+    request.resource_type = 'Stylesheet'
+    self.assertFalse(matcher.Matches(request))
+    self.assertTrue(matcher.Matches(self._SCRIPT_REQUEST))
+
+
+if __name__ == '__main__':
+  unittest.main()
--- a/tools/android/loading/loading_model.py
+++ b/tools/android/loading/loading_model.py
@@ -30,14 +30,17 @@ class ResourceGraph(object):
  Set parameters:
    cache_all: if true, assume zero loading time for all resources.
  """
-  def __init__(self, trace):
+  def __init__(self, trace, content_lens=None):
    """Create from a LoadingTrace (or json of a trace).

    Args:
      trace: (LoadingTrace/JSON) Loading trace or JSON of a trace.
+      content_lens: (ContentClassificationLens) Lens used to annotate the
+                    nodes, or None.
    """
    if type(trace) == dict:
      trace = loading_trace.LoadingTrace.FromJsonDict(trace)
+    self._content_lens = content_lens
    self._BuildDag(trace)
    self._global_start = min([n.StartTime() for n in self._node_info])
    # Sort before splitting children so that we can correctly dectect if a
@@ -339,6 +342,8 @@ class ResourceGraph(object):
        request: The request associated with this node.
      """
      self._request = request
+      self._is_ad = False
+      self._is_tracking = False
      self._node = node
      self._edge_costs = {}
      self._edge_annotations = {}
@@ -357,6 +362,21 @@ class ResourceGraph(object):
    def Index(self):
      return self._node.Index()

+    def SetRequestContent(self, is_ad, is_tracking):
+      """Records the kind of content this node's request relates to.
+
+      Args:
+        is_ad: (bool) Whether the request is an Ad.
+        is_tracking: (bool) Whether the request is related to tracking.
+      """
+      self._is_ad = is_ad
+      self._is_tracking = is_tracking
+
+    def IsAd(self):
+      """Returns whether the request was flagged as an Ad."""
+      return self._is_ad
+
+    def IsTracking(self):
+      """Returns whether the request was flagged as tracking-related."""
+      return self._is_tracking
+
    def Request(self):
      return self._request

@@ -481,6 +501,9 @@ class ResourceGraph(object):
      index_by_request[request] = next_index
      node = dag.Node(next_index)
      node_info = self._NodeInfo(node, request)
+      if self._content_lens:
+        # The content flags live on the node info, not on the DAG node.
+        node_info.SetRequestContent(
+            self._content_lens.IsAdRequest(request),
+            self._content_lens.IsTrackingRequest(request))
      self._nodes.append(node)
      self._node_info.append(node_info)

@@ -606,6 +629,8 @@ class ResourceGraph(object):
        if fragment in node_info.Url():
          styles.append('dotted')
          break
+    if node_info.IsAd() or node_info.IsTracking():
+      styles += ['bold', 'diagonals']
    return ('%d [label = "%s\\n%.2f->%.2f (%.2f)"; style = "%s"; '
            'fillcolor = %s; shape = %s];\n'
            % (index, node_info.ShortName(),