diff --git a/src/js/heuristicblocking.js b/src/js/heuristicblocking.js index 91c0cebb76..de6faeb944 100644 --- a/src/js/heuristicblocking.js +++ b/src/js/heuristicblocking.js @@ -101,9 +101,8 @@ HeuristicBlocker.prototype = { * Use updateTrackerPrevalence for non-webRequest initiated bookkeeping. * * @param {Object} details request/response details - * @param {Boolean} check_for_cookie_share whether to check for cookie sharing */ - heuristicBlockingAccounting: function (details, check_for_cookie_share) { + heuristicBlockingAccounting: function (details) { // ignore requests that are outside a tabbed window if (details.tabId < 0 || !badger.isLearningEnabled(details.tabId)) { return {}; @@ -144,119 +143,6 @@ HeuristicBlocker.prototype = { self._recordPrevalence(request_host, request_origin, tab_origin); return {}; } - - // check for cookie sharing iff this is an image in the top-level frame, and the request URL has parameters - if (check_for_cookie_share && details.type == 'image' && details.frameId === 0 && details.url.indexOf('?') > -1) { - // get all non-HttpOnly cookies for the top-level frame - // and pass those to the cookie-share accounting function - let tab_url = self.tabUrls[details.tabId]; - - let config = { - url: tab_url - }; - if (badger.firstPartyDomainPotentiallyRequired) { - config.firstPartyDomain = null; - } - - chrome.cookies.getAll(config, function (cookies) { - cookies = cookies.filter(cookie => !cookie.httpOnly); - if (cookies.length >= 1) { - self.pixelCookieShareAccounting(tab_url, tab_origin, details.url, request_host, request_origin, cookies); - } - }); - } - }, - - /** - * Checks for cookie sharing: requests to third-party domains that include - * high entropy data from first-party cookies (associated with the top-level - * frame). Only catches plain-text verbatim sharing (b64 encoding + the like - * defeat it). Assumes any long string that doesn't contain URL fragments or - * stopwords is an identifier. Doesn't catch cookie syncing (3rd party -> 3rd - * party), but most of those tracking cookies should be blocked anyway. - * - * @param details are those from onBeforeSendHeaders - * @param cookies are the result of chrome.cookies.getAll() - * @returns {*} - */ - pixelCookieShareAccounting: function (tab_url, tab_origin, request_url, request_host, request_origin, cookies) { - let params = (new URL(request_url)).searchParams, - TRACKER_ENTROPY_THRESHOLD = 33, - MIN_STR_LEN = 8; - - for (let p of params) { - let key = p[0], - value = p[1]; - - // the argument must be sufficiently long - if (!value || value.length < MIN_STR_LEN) { - continue; - } - - // check if this argument is derived from a high-entropy first-party cookie - for (let cookie of cookies) { - // the cookie value must be sufficiently long - if (!cookie.value || cookie.value.length < MIN_STR_LEN) { - continue; - } - - // find the longest common substring between this arg and the cookies - // associated with the document - let substrings = utils.findCommonSubstrings(cookie.value, value) || []; - for (let s of substrings) { - // ignore the substring if it's part of the first-party URL. sometimes - // content servers take the url of the page they're hosting content - // for as an argument. e.g. - // https://example-cdn.com/content?u=http://example.com/index.html - if (tab_url.indexOf(s) != -1) { - continue; - } - - // elements of the user agent string are also commonly included in - // both cookies and arguments; e.g. "Mozilla/5.0" might be in both. - // This is not a special tracking risk since third parties can see - // this info anyway. - if (navigator.userAgent.indexOf(s) != -1) { - continue; - } - - // Sometimes the entire url and then some is included in the - // substring -- the common string might be "https://example.com/:true" - // In that case, we only care about the information around the URL. - if (s.indexOf(tab_url) != -1) { - s = s.replace(tab_url, ""); - } - - // During testing we found lots of common values like "homepage", - // "referrer", etc. were being flagged as high entropy. This searches - // for a few of those and removes them before we go further. - let lower = s.toLowerCase(); - lowEntropyQueryValues.forEach(function (qv) { - let start = lower.indexOf(qv); - if (start != -1) { - s = s.replace(s.substring(start, start + qv.length), ""); - } - }); - - // at this point, since we might have removed things, make sure the - // string is still long enough to bother with - if (s.length < MIN_STR_LEN) { - continue; - } - - // compute the entropy of this common substring. if it's greater than - // our threshold, record the tracking action and exit the function. - let entropy = utils.estimateMaxEntropy(s); - if (entropy > TRACKER_ENTROPY_THRESHOLD) { - log("Found high-entropy cookie share from", tab_origin, "to", request_host, - ":", entropy, "bits\n cookie:", cookie.name, '=', cookie.value, - "\n arg:", key, "=", value, "\n substring:", s); - this._recordPrevalence(request_host, request_origin, tab_origin); - return; - } - } - } - } }, /** @@ -552,51 +438,6 @@ var lowEntropyCookieValues = { "zu":8 }; -const lowEntropyQueryValues = [ - "https", - "http", - "://", - "%3A%2F%2F", - "www", - "url", - "undefined", - "impression", - "session", - "homepage", - "client", - "version", - "business", - "title", - "get", - "site", - "name", - "category", - "account_id", - "smartadserver", - "front", - "page", - "view", - "first", - "visit", - "platform", - "language", - "automatic", - "disabled", - "landing", - "entertainment", - "amazon", - "official", - "webvisor", - "anonymous", - "across", - "narrative", - "\":null", - "\":false", - "\":\"", - "\",\"", - "\",\"", -]; - /** * Extract cookies from onBeforeSendHeaders * @@ -686,7 +527,7 @@ function startListeners() { extraInfoSpec.push('extraHeaders'); } chrome.webRequest.onBeforeSendHeaders.addListener(function(details) { - return badger.heuristicBlocking.heuristicBlockingAccounting(details, true); + return badger.heuristicBlocking.heuristicBlockingAccounting(details); }, {urls: [""]}, extraInfoSpec); /** @@ -705,7 +546,7 @@ function startListeners() { } } if (hasSetCookie) { - return badger.heuristicBlocking.heuristicBlockingAccounting(details, false); + return badger.heuristicBlocking.heuristicBlockingAccounting(details); } }, {urls: [""]}, extraInfoSpec); diff --git a/src/js/utils.js b/src/js/utils.js index 581cb6a8a3..1072935316 100644 --- a/src/js/utils.js +++ b/src/js/utils.js @@ -200,48 +200,6 @@ function estimateMaxEntropy(str) { return max_bits; } -// Adapted from https://gist.github.com/jaewook77/cd1e3aa9449d7ea4fb4f -// Find all common substrings more than 8 characters long, using DYNAMIC -// PROGRAMMING -function findCommonSubstrings(str1, str2) { - /* - Let D[i,j] be the length of the longest matching string suffix between - str1[1]..str1[i] and a segment of str2 between str2[1]..str2[j]. - If the ith character in str1 doesn’t match the jth character in str2, then - D[i,j] is zero to indicate that there is no matching suffix - */ - - // we only care about strings >= 8 chars - let D = [], LCS = [], LCS_MIN = 8; - - // runs in O(M x N) time! - for (let i = 0; i < str1.length; i++) { - D[i] = []; - for (let j = 0; j < str2.length; j++) { - if (str1[i] == str2[j]) { - if (i == 0 || j == 0) { - D[i][j] = 1; - } else { - D[i][j] = D[i-1][j-1] + 1; - } - - // store all common substrings longer than the minimum length - if (D[i][j] == LCS_MIN) { - LCS.push(str1.substring(i-D[i][j]+1, i+1)); - } else if (D[i][j] > LCS_MIN) { - // remove the shorter substring and add the new, longer one - LCS.pop(); - LCS.push(str1.substring(i-D[i][j]+1, i+1)); - } - } else { - D[i][j] = 0; - } - } - } - - return LCS; -} - function oneSecond() { return 1000; } @@ -468,7 +426,6 @@ let exports = { arrayBufferToBase64, estimateMaxEntropy, explodeSubdomains, - findCommonSubstrings, getHostFromDomainInput, isRestrictedUrl, isThirdPartyDomain, diff --git a/src/tests/tests/utils.js b/src/tests/tests/utils.js index 26c1882542..c884374a7e 100644 --- a/src/tests/tests/utils.js +++ b/src/tests/tests/utils.js @@ -487,30 +487,6 @@ QUnit.test("getHostFromDomainInput", assert => { ); }); -// Tests algorithm used in the pixel tracking heuristic -// It should return a common substring between two given values -QUnit.test("findCommonSubstrings", assert => { - - assert.deepEqual( - utils.findCommonSubstrings('www.foo.bar', 'www.foob.ar'), - [], - "substrings under the length threshold of 8 are ignored" - ); - - assert.equal( - utils.findCommonSubstrings('foobar.com/foo/fizz/buzz/bar', 'foobar.com/foo/bizz/fuzz/bar')[0], - 'foobar.com/foo/', - "returns longest matching value from the pair of URLs" - ); - - assert.deepEqual( - utils.findCommonSubstrings('foobar.com/fizz/buzz/bar/foo', 'foobar.com/fizzbuzz/buzz/bar/foo'), - ['foobar.com/fizz', "zz/buzz/bar/foo"], - "returns multiple substrings if multiple are present in comparison" - ); - -}); - // used in pixel tracking heuristic, given a string the estimateMaxEntropy function // will return the estimated entropy value from it, based on logic parsing the string's length, // and classes of character complication included in the string diff --git a/tests/selenium/cookie_sharing_test.py b/tests/selenium/cookie_sharing_test.py deleted file mode 100644 index 9d99acdd49..0000000000 --- a/tests/selenium/cookie_sharing_test.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -import unittest -import pbtest - -class PixelTrackingTest(pbtest.PBSeleniumTest): - """Tests for the pixel cookie sharing heuristic included in heuristicblocking.js - - loads HTML fixture that sets a first-party cookie on page then creates an img tag - - if `trackMe=true` is present in the query string, img tag makes a src request carrying a substring of that tracking cookie - - tracking domain is caught by pixel tracking heuristic, snitch map entry is updated - """ - - def get_snitch_map(self): - return self.js( - "return chrome.extension.getBackgroundPage()." - "badger.storage.snitch_map.getItem('cloudinary.com');" - ) - - def setUp(self): - # enable local learning - self.load_url(self.options_url) - self.wait_for_script("return window.OPTIONS_INITIALIZED") - self.find_el_by_css('#local-learning-checkbox').click() - - def test_pixel_cookie_sharing(self): - FIXTURE_URL = ( - "https://efforg.github.io/privacybadger-test-fixtures/html/" - "pixel_cookie_sharing.html" - ) - - # clear seed data to prevent any potential false positives - self.load_url(self.options_url) - self.js("chrome.extension.getBackgroundPage().badger.storage.clearTrackerData();") - - # load the test fixture without the URL parameter to to verify there is no tracking on the page by default - self.load_url(FIXTURE_URL) - # check to make sure the domain wasn't logged in snitch map - self.load_url(self.options_url) - self.assertFalse(self.get_snitch_map(), - "Tracking detected but page expected to have no tracking at this point") - - # load the same test fixture, but pass the URL parameter for it to perform pixel cookie sharing - self.load_url(FIXTURE_URL + "?trackMe=true") - # check to make sure this domain is caught and correctly recorded in snitch map - self.load_url(self.options_url) - self.assertEqual( - self.get_snitch_map(), - ["efforg.github.io"], - "Pixel cookie sharing tracking failed to be detected" - ) - - -if __name__ == "__main__": - unittest.main()