Commit 8497c472 authored by mattcary's avatar mattcary Committed by Commit bot

Practical fixes for ad filtering.

Only import adblocker if we actually define any rules. If we define
rules, and fail to import adblockerparser, give detailed installation
instructions.

Fix typo that prevented the filtering from working in the model, and have the filtering actually use the new adblocker matcher rather than the old heuristic.

Review URL: https://codereview.chromium.org/1645953002

Cr-Commit-Position: refs/heads/master@{#372079}
parent 6e9d8f9c
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
"""Labels requests according to the type of content they represent.""" """Labels requests according to the type of content they represent."""
import adblockparser # Available on PyPI, through pip.
import collections import collections
import logging
import os import os
import loading_trace import loading_trace
...@@ -93,10 +93,23 @@ class _RulesMatcher(object): ...@@ -93,10 +93,23 @@ class _RulesMatcher(object):
no_whitelist: (bool) Whether the whitelisting rules should be ignored. no_whitelist: (bool) Whether the whitelisting rules should be ignored.
""" """
self._rules = self._FilterRules(rules, no_whitelist) self._rules = self._FilterRules(rules, no_whitelist)
self._matcher = adblockparser.AdblockRules(self._rules) if self._rules:
try:
import adblockparser
self._matcher = adblockparser.AdblockRules(self._rules)
except ImportError:
logging.critical('Likely you need to install adblockparser. Try:\n'
' pip install --user adblockparser\n'
'For 10-100x better performance, also try:\n'
" pip install --user 're2 >= 0.2.21'")
raise
else:
self._matcher = None
def Matches(self, request): def Matches(self, request):
"""Returns whether a request matches one of the rules.""" """Returns whether a request matches one of the rules."""
if self._matcher is None:
return False
url = request.url url = request.url
return self._matcher.should_block(url, self._GetOptions(request)) return self._matcher.should_block(url, self._GetOptions(request))
......
...@@ -25,6 +25,13 @@ class ContentClassificationLensTestCase(unittest.TestCase): ...@@ -25,6 +25,13 @@ class ContentClassificationLensTestCase(unittest.TestCase):
'frame_id': '123.13', 'parent_frame_id': _MAIN_FRAME_ID}] 'frame_id': '123.13', 'parent_frame_id': _MAIN_FRAME_ID}]
_RULES = ['bla.com'] _RULES = ['bla.com']
def testNoRules(self):
trace = test_utils.LoadingTraceFromEvents(
[self._REQUEST], self._PAGE_EVENTS)
lens = ContentClassificationLens(trace, [], [])
self.assertFalse(lens.IsAdRequest(self._REQUEST))
self.assertFalse(lens.IsTrackingRequest(self._REQUEST))
def testAdRequest(self): def testAdRequest(self):
trace = test_utils.LoadingTraceFromEvents( trace = test_utils.LoadingTraceFromEvents(
[self._REQUEST], self._PAGE_EVENTS) [self._REQUEST], self._PAGE_EVENTS)
......
...@@ -202,7 +202,8 @@ class ResourceGraph(object): ...@@ -202,7 +202,8 @@ class ResourceGraph(object):
Returns: Returns:
True if the node is not ad-related. True if the node is not ad-related.
""" """
return not self._IsAdUrl(self._node_info[node.Index()].Url()) node_info = self._node_info[node.Index()]
return not (node_info.IsAd() or node_info.IsTracking())
def MakeGraphviz(self, output, highlight=None): def MakeGraphviz(self, output, highlight=None):
"""Output a graphviz representation of our DAG. """Output a graphviz representation of our DAG.
...@@ -504,8 +505,9 @@ class ResourceGraph(object): ...@@ -504,8 +505,9 @@ class ResourceGraph(object):
node = dag.Node(next_index) node = dag.Node(next_index)
node_info = self._NodeInfo(node, request) node_info = self._NodeInfo(node, request)
if self._content_lens: if self._content_lens:
node.SetRequestContent(self._content_lens.IsAdRequest(request), node_info.SetRequestContent(
self._content_lens.IsTrackingRequest(request)) self._content_lens.IsAdRequest(request),
self._content_lens.IsTrackingRequest(request))
self._nodes.append(node) self._nodes.append(node)
self._node_info.append(node_info) self._node_info.append(node_info)
...@@ -641,82 +643,6 @@ class ResourceGraph(object): ...@@ -641,82 +643,6 @@ class ResourceGraph(object):
node_info.EndTime() - node_info.StartTime(), node_info.EndTime() - node_info.StartTime(),
','.join(styles), color, shape)) ','.join(styles), color, shape))
@classmethod
def _IsAdUrl(cls, url):
"""Return true if the url is an ad.
We group content that doesn't seem to be specific to the website along with
ads, eg staticxx.facebook.com, as well as analytics like googletagmanager (?
is this correct?).
Args:
url: The full string url to examine.
Returns:
True iff the url appears to be an ad.
"""
# See below for how these patterns are defined.
AD_PATTERNS = ['2mdn.net',
'admarvel.com',
'adnxs.com',
'adobedtm.com',
'adsrvr.org',
'adsafeprotected.com',
'adsymptotic.com',
'adtech.de',
'adtechus.com',
'advertising.com',
'atwola.com', # brand protection from cscglobal.com?
'bounceexchange.com',
'betrad.com',
'casalemedia.com',
'cloudfront.net//test.png',
'cloudfront.net//atrk.js',
'contextweb.com',
'crwdcntrl.net',
'doubleclick.net',
'dynamicyield.com',
'krxd.net',
'facebook.com//ping',
'fastclick.net',
'google.com//-ads.js',
'cse.google.com', # Custom search engine.
'googleadservices.com',
'googlesyndication.com',
'googletagmanager.com',
'lightboxcdn.com',
'mediaplex.com',
'meltdsp.com',
'mobile.nytimes.com//ads-success',
'mookie1.com',
'newrelic.com',
'nr-data.net', # Apparently part of newrelic.
'optnmnstr.com',
'pubmatic.com',
'quantcast.com',
'quantserve.com',
'rubiconproject.com',
'scorecardresearch.com',
'sekindo.com',
'serving-sys.com',
'sharethrough.com',
'staticxx.facebook.com', # ?
'syndication.twimg.com',
'tapad.com',
'yieldmo.com',
]
parts = urlparse.urlparse(url)
for pattern in AD_PATTERNS:
if '//' in pattern:
domain, path = pattern.split('//')
else:
domain, path = (pattern, None)
if parts.netloc.endswith(domain):
if not path or path in parts.path:
return True
return False
def _ExtractImages(self): def _ExtractImages(self):
"""Return interesting image resources. """Return interesting image resources.
......
...@@ -192,18 +192,6 @@ class LoadingModelTestCase(unittest.TestCase): ...@@ -192,18 +192,6 @@ class LoadingModelTestCase(unittest.TestCase):
self.assertEqual(self.SuccessorIndicies(graph._nodes[5]), []) self.assertEqual(self.SuccessorIndicies(graph._nodes[5]), [])
self.assertEqual(self.SortedIndicies(graph), [0, 1, 2, 3, 4, 5]) self.assertEqual(self.SortedIndicies(graph), [0, 1, 2, 3, 4, 5])
def test_AdUrl(self):
self.assertTrue(loading_model.ResourceGraph._IsAdUrl(
'http://afae61024b33032ef.profile.sfo20.cloudfront.net/test.png'))
self.assertFalse(loading_model.ResourceGraph._IsAdUrl(
'http://afae61024b33032ef.profile.sfo20.cloudfront.net/tst.png'))
self.assertTrue(loading_model.ResourceGraph._IsAdUrl(
'http://ums.adtechus.com/mapuser?providerid=1003;'
'userid=RUmecco4z3o===='))
self.assertTrue(loading_model.ResourceGraph._IsAdUrl(
'http://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment