From 3287d5fd128a4f595e8ce6dad2b015e5c450d6ef Mon Sep 17 00:00:00 2001
From: Alexei <alexeiatyahoodotcom@gmail.com>
Date: Tue, 2 Feb 2021 13:12:49 -0500
Subject: [PATCH] Remove pixel cookie sharing detection for now

---
 src/js/heuristicblocking.js           | 165 +-------------------------
 src/js/utils.js                       |  43 -------
 src/tests/tests/utils.js              |  24 ----
 tests/selenium/cookie_sharing_test.py |  55 ---------
 4 files changed, 3 insertions(+), 284 deletions(-)
 delete mode 100644 tests/selenium/cookie_sharing_test.py

diff --git a/src/js/heuristicblocking.js b/src/js/heuristicblocking.js
index 91c0cebb76..de6faeb944 100644
--- a/src/js/heuristicblocking.js
+++ b/src/js/heuristicblocking.js
@@ -101,9 +101,8 @@ HeuristicBlocker.prototype = {
    * Use updateTrackerPrevalence for non-webRequest initiated bookkeeping.
    *
    * @param {Object} details request/response details
-   * @param {Boolean} check_for_cookie_share whether to check for cookie sharing
    */
-  heuristicBlockingAccounting: function (details, check_for_cookie_share) {
+  heuristicBlockingAccounting: function (details) {
     // ignore requests that are outside a tabbed window
     if (details.tabId < 0 || !badger.isLearningEnabled(details.tabId)) {
       return {};
@@ -144,119 +143,6 @@ HeuristicBlocker.prototype = {
       self._recordPrevalence(request_host, request_origin, tab_origin);
       return {};
     }
-
-    // check for cookie sharing iff this is an image in the top-level frame, and the request URL has parameters
-    if (check_for_cookie_share && details.type == 'image' && details.frameId === 0 && details.url.indexOf('?') > -1) {
-      // get all non-HttpOnly cookies for the top-level frame
-      // and pass those to the cookie-share accounting function
-      let tab_url = self.tabUrls[details.tabId];
-
-      let config = {
-        url: tab_url
-      };
-      if (badger.firstPartyDomainPotentiallyRequired) {
-        config.firstPartyDomain = null;
-      }
-
-      chrome.cookies.getAll(config, function (cookies) {
-        cookies = cookies.filter(cookie => !cookie.httpOnly);
-        if (cookies.length >= 1) {
-          self.pixelCookieShareAccounting(tab_url, tab_origin, details.url, request_host, request_origin, cookies);
-        }
-      });
-    }
-  },
-
-  /**
-   * Checks for cookie sharing: requests to third-party domains that include
-   * high entropy data from first-party cookies (associated with the top-level
-   * frame). Only catches plain-text verbatim sharing (b64 encoding + the like
-   * defeat it). Assumes any long string that doesn't contain URL fragments or
-   * stopwords is an identifier.  Doesn't catch cookie syncing (3rd party -> 3rd
-   * party), but most of those tracking cookies should be blocked anyway.
-   *
-   * @param details are those from onBeforeSendHeaders
-   * @param cookies are the result of chrome.cookies.getAll()
-   * @returns {*}
-   */
-  pixelCookieShareAccounting: function (tab_url, tab_origin, request_url, request_host, request_origin, cookies) {
-    let params = (new URL(request_url)).searchParams,
-      TRACKER_ENTROPY_THRESHOLD = 33,
-      MIN_STR_LEN = 8;
-
-    for (let p of params) {
-      let key = p[0],
-        value = p[1];
-
-      // the argument must be sufficiently long
-      if (!value || value.length < MIN_STR_LEN) {
-        continue;
-      }
-
-      // check if this argument is derived from a high-entropy first-party cookie
-      for (let cookie of cookies) {
-        // the cookie value must be sufficiently long
-        if (!cookie.value || cookie.value.length < MIN_STR_LEN) {
-          continue;
-        }
-
-        // find the longest common substring between this arg and the cookies
-        // associated with the document
-        let substrings = utils.findCommonSubstrings(cookie.value, value) || [];
-        for (let s of substrings) {
-          // ignore the substring if it's part of the first-party URL. sometimes
-          // content servers take the url of the page they're hosting content
-          // for as an argument. e.g.
-          // https://example-cdn.com/content?u=http://example.com/index.html
-          if (tab_url.indexOf(s) != -1) {
-            continue;
-          }
-
-          // elements of the user agent string are also commonly included in
-          // both cookies and arguments; e.g. "Mozilla/5.0" might be in both.
-          // This is not a special tracking risk since third parties can see
-          // this info anyway.
-          if (navigator.userAgent.indexOf(s) != -1) {
-            continue;
-          }
-
-          // Sometimes the entire url and then some is included in the
-          // substring -- the common string might be "https://example.com/:true"
-          // In that case, we only care about the information around the URL.
-          if (s.indexOf(tab_url) != -1) {
-            s = s.replace(tab_url, "");
-          }
-
-          // During testing we found lots of common values like "homepage",
-          // "referrer", etc. were being flagged as high entropy. This searches
-          // for a few of those and removes them before we go further.
-          let lower = s.toLowerCase();
-          lowEntropyQueryValues.forEach(function (qv) {
-            let start = lower.indexOf(qv);
-            if (start != -1) {
-              s = s.replace(s.substring(start, start + qv.length), "");
-            }
-          });
-
-          // at this point, since we might have removed things, make sure the
-          // string is still long enough to bother with
-          if (s.length < MIN_STR_LEN) {
-            continue;
-          }
-
-          // compute the entropy of this common substring. if it's greater than
-          // our threshold, record the tracking action and exit the function.
-          let entropy = utils.estimateMaxEntropy(s);
-          if (entropy > TRACKER_ENTROPY_THRESHOLD) {
-            log("Found high-entropy cookie share from", tab_origin, "to", request_host,
-              ":", entropy, "bits\n  cookie:", cookie.name, '=', cookie.value,
-              "\n  arg:", key, "=", value, "\n  substring:", s);
-            this._recordPrevalence(request_host, request_origin, tab_origin);
-            return;
-          }
-        }
-      }
-    }
   },
 
   /**
@@ -552,51 +438,6 @@ var lowEntropyCookieValues = {
   "zu":8
 };
 
-const lowEntropyQueryValues = [
-  "https",
-  "http",
-  "://",
-  "%3A%2F%2F",
-  "www",
-  "url",
-  "undefined",
-  "impression",
-  "session",
-  "homepage",
-  "client",
-  "version",
-  "business",
-  "title",
-  "get",
-  "site",
-  "name",
-  "category",
-  "account_id",
-  "smartadserver",
-  "front",
-  "page",
-  "view",
-  "first",
-  "visit",
-  "platform",
-  "language",
-  "automatic",
-  "disabled",
-  "landing",
-  "entertainment",
-  "amazon",
-  "official",
-  "webvisor",
-  "anonymous",
-  "across",
-  "narrative",
-  "\":null",
-  "\":false",
-  "\":\"",
-  "\",\"",
-  "\",\"",
-];
-
 /**
  * Extract cookies from onBeforeSendHeaders
  *
@@ -686,7 +527,7 @@ function startListeners() {
     extraInfoSpec.push('extraHeaders');
   }
   chrome.webRequest.onBeforeSendHeaders.addListener(function(details) {
-    return badger.heuristicBlocking.heuristicBlockingAccounting(details, true);
+    return badger.heuristicBlocking.heuristicBlockingAccounting(details);
   }, {urls: ["<all_urls>"]}, extraInfoSpec);
 
   /**
@@ -705,7 +546,7 @@ function startListeners() {
       }
     }
     if (hasSetCookie) {
-      return badger.heuristicBlocking.heuristicBlockingAccounting(details, false);
+      return badger.heuristicBlocking.heuristicBlockingAccounting(details);
     }
   },
   {urls: ["<all_urls>"]}, extraInfoSpec);
diff --git a/src/js/utils.js b/src/js/utils.js
index 581cb6a8a3..1072935316 100644
--- a/src/js/utils.js
+++ b/src/js/utils.js
@@ -200,48 +200,6 @@ function estimateMaxEntropy(str) {
   return max_bits;
 }
 
-// Adapted from https://gist.github.com/jaewook77/cd1e3aa9449d7ea4fb4f
-// Find all common substrings more than 8 characters long, using DYNAMIC
-// PROGRAMMING
-function findCommonSubstrings(str1, str2) {
-  /*
-   Let D[i,j] be the length of the longest matching string suffix between
-   str1[1]..str1[i] and a segment of str2 between str2[1]..str2[j].
-   If the ith character in str1 doesn’t match the jth character in str2, then
-   D[i,j] is zero to indicate that there is no matching suffix
-   */
-
-  // we only care about strings >= 8 chars
-  let D = [], LCS = [], LCS_MIN = 8;
-
-  // runs in O(M x N) time!
-  for (let i = 0; i < str1.length; i++) {
-    D[i] = [];
-    for (let j = 0; j < str2.length; j++) {
-      if (str1[i] == str2[j]) {
-        if (i == 0 || j == 0) {
-          D[i][j] = 1;
-        } else {
-          D[i][j] = D[i-1][j-1] + 1;
-        }
-
-        // store all common substrings longer than the minimum length
-        if (D[i][j] == LCS_MIN) {
-          LCS.push(str1.substring(i-D[i][j]+1, i+1));
-        } else if (D[i][j] > LCS_MIN) {
-          // remove the shorter substring and add the new, longer one
-          LCS.pop();
-          LCS.push(str1.substring(i-D[i][j]+1, i+1));
-        }
-      } else {
-        D[i][j] = 0;
-      }
-    }
-  }
-
-  return LCS;
-}
-
 function oneSecond() {
   return 1000;
 }
@@ -468,7 +426,6 @@ let exports = {
   arrayBufferToBase64,
   estimateMaxEntropy,
   explodeSubdomains,
-  findCommonSubstrings,
   getHostFromDomainInput,
   isRestrictedUrl,
   isThirdPartyDomain,
diff --git a/src/tests/tests/utils.js b/src/tests/tests/utils.js
index 26c1882542..c884374a7e 100644
--- a/src/tests/tests/utils.js
+++ b/src/tests/tests/utils.js
@@ -487,30 +487,6 @@ QUnit.test("getHostFromDomainInput", assert => {
   );
 });
 
-// Tests algorithm used in the pixel tracking heuristic
-// It should return a common substring between two given values
-QUnit.test("findCommonSubstrings", assert => {
-
-  assert.deepEqual(
-    utils.findCommonSubstrings('www.foo.bar', 'www.foob.ar'),
-    [],
-    "substrings under the length threshold of 8 are ignored"
-  );
-
-  assert.equal(
-    utils.findCommonSubstrings('foobar.com/foo/fizz/buzz/bar', 'foobar.com/foo/bizz/fuzz/bar')[0],
-    'foobar.com/foo/',
-    "returns longest matching value from the pair of URLs"
-  );
-
-  assert.deepEqual(
-    utils.findCommonSubstrings('foobar.com/fizz/buzz/bar/foo', 'foobar.com/fizzbuzz/buzz/bar/foo'),
-    ['foobar.com/fizz', "zz/buzz/bar/foo"],
-    "returns multiple substrings if multiple are present in comparison"
-  );
-
-});
-
 // used in pixel tracking heuristic, given a string the estimateMaxEntropy function
 // will return the estimated entropy value from it, based on logic parsing the string's length,
 // and classes of character complication included in the string
diff --git a/tests/selenium/cookie_sharing_test.py b/tests/selenium/cookie_sharing_test.py
deleted file mode 100644
index 9d99acdd49..0000000000
--- a/tests/selenium/cookie_sharing_test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-
-import unittest
-import pbtest
-
-class PixelTrackingTest(pbtest.PBSeleniumTest):
-    """Tests for the pixel cookie sharing heuristic included in heuristicblocking.js
-        - loads HTML fixture that sets a first-party cookie on page then creates an img tag
-        - if `trackMe=true` is present in the query string, img tag makes a src request carrying a substring of that tracking cookie
-        - tracking domain is caught by pixel tracking heuristic, snitch map entry is updated
-    """
-
-    def get_snitch_map(self):
-        return self.js(
-            "return chrome.extension.getBackgroundPage()."
-            "badger.storage.snitch_map.getItem('cloudinary.com');"
-        )
-
-    def setUp(self):
-        # enable local learning
-        self.load_url(self.options_url)
-        self.wait_for_script("return window.OPTIONS_INITIALIZED")
-        self.find_el_by_css('#local-learning-checkbox').click()
-
-    def test_pixel_cookie_sharing(self):
-        FIXTURE_URL = (
-            "https://efforg.github.io/privacybadger-test-fixtures/html/"
-            "pixel_cookie_sharing.html"
-        )
-
-        # clear seed data to prevent any potential false positives
-        self.load_url(self.options_url)
-        self.js("chrome.extension.getBackgroundPage().badger.storage.clearTrackerData();")
-
-        # load the test fixture without the URL parameter to to verify there is no tracking on the page by default
-        self.load_url(FIXTURE_URL)
-        # check to make sure the domain wasn't logged in snitch map
-        self.load_url(self.options_url)
-        self.assertFalse(self.get_snitch_map(),
-            "Tracking detected but page expected to have no tracking at this point")
-
-        # load the same test fixture, but pass the URL parameter for it to perform pixel cookie sharing
-        self.load_url(FIXTURE_URL + "?trackMe=true")
-        # check to make sure this domain is caught and correctly recorded in snitch map
-        self.load_url(self.options_url)
-        self.assertEqual(
-            self.get_snitch_map(),
-            ["efforg.github.io"],
-            "Pixel cookie sharing tracking failed to be detected"
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()