Skip to content

Commit

Permalink
Fixes #3656 - Fetch and save top domains from Tranco and optionally A…
Browse files Browse the repository at this point in the history
…lexa
  • Loading branch information
ksy36 committed Apr 21, 2022
1 parent c1a035e commit b570c47
Show file tree
Hide file tree
Showing 12 changed files with 702 additions and 316 deletions.
21 changes: 10 additions & 11 deletions tests/unit/test_topsites.py → tests/unit/test_topsites_alexa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,30 @@
from unittest.mock import patch
from xml.dom.minidom import parseString

from tools import topsites

from tools.topsites.alexa import Alexa
from tools.topsites.utils import node_text

TEST_XML = '''<?xml version="1.0"?><aws:TopSitesResponse xmlns:aws="http://alexa.amazonaws.com/doc/2005-10-05/"><aws:Response><aws:OperationRequest><aws:RequestId>9ffc5e13-175e-4c7e-b33b-0efe3501d1f3</aws:RequestId></aws:OperationRequest><aws:TopSitesResult><aws:Alexa><aws:TopSites><aws:List><aws:CountryName>China</aws:CountryName><aws:CountryCode>CN</aws:CountryCode><aws:TotalSites>671496</aws:TotalSites><aws:Sites><aws:Site><aws:DataUrl>baidu.com</aws:DataUrl><aws:Country><aws:Rank>1</aws:Rank><aws:Reach><aws:PerMillion>358000</aws:PerMillion></aws:Reach><aws:PageViews><aws:PerMillion>77410</aws:PerMillion><aws:PerUser>11.5</aws:PerUser></aws:PageViews></aws:Country><aws:Global><aws:Rank>4</aws:Rank></aws:Global></aws:Site></aws:Sites></aws:List></aws:TopSites></aws:Alexa></aws:TopSitesResult><aws:ResponseStatus><aws:StatusCode>Success</aws:StatusCode></aws:ResponseStatus></aws:Response></aws:TopSitesResponse>''' # noqa
TEST_QUERY_STRING = 'Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # noqa
TEST_QUERY_URI = 'https://ats.amazonaws.com/api?Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # noqa
TEST_QUERY_URI = 'https://ats.us-west-1.amazonaws.com/api?Action=TopSites&Count=100&CountryCode=CN&ResponseGroup=Country&Start=1' # noqa
TEST_QUERY_AUTH = 'AWS4-HMAC-SHA256 Credential=1234567890ABCDEFGHIJ/20060101/us-west-1/AlexaTopSites/aws4_request, SignedHeaders=host;x-amz-date, Signature=55b760bcae9a2ae93b0d08a85c3e613ec43c7f39f69ef2345896cf7660234f49' # noqa
TEST_QUERY_TIMESTAMP = '20060101T000000Z'


class TestTopsites(unittest.TestCase):
"""Tests for Top Sites Tools."""
class TestTopsitesAlexa(unittest.TestCase):
"""Tests for Top Sites Alexa class."""

def setUp(self):
"""Set up the tests."""
self.dom = parseString(TEST_XML)
topsites.ats_access_key = '1234567890ABCDEFGHIJ'
topsites.ats_secret_key = 'JIHGFEDCBA0987654321'
self.alexa = Alexa('1234567890ABCDEFGHIJ', 'JIHGFEDCBA0987654321')

def test_build_request(self):
"""Build a request."""
testdt = datetime.datetime(2006, 1, 1, 0, 0, 0, 0)
with patch('datetime.datetime') as dt_mock:
dt_mock.utcnow.return_value = testdt
uri, authorization, timestamp = topsites.build_request('CN', 1)
uri, authorization, timestamp = self.alexa.build_request('CN', 1)
self.assertEqual(uri, TEST_QUERY_URI)
self.assertEqual(authorization, TEST_QUERY_AUTH)
self.assertEqual(timestamp, TEST_QUERY_TIMESTAMP)
Expand All @@ -45,14 +44,14 @@ def test_build_query_string(self):
testdt = datetime.datetime(2006, 1, 1, 0, 0, 0, 0)
with patch('datetime.datetime') as dt_mock:
dt_mock.utcnow.return_value = testdt
self.assertEqual(topsites.build_query_string('CN', 1),
self.assertEqual(self.alexa.build_query_string('CN', 1),
TEST_QUERY_STRING)

def test_node_text(self):
"""Extract text from a node."""
site = self.dom.getElementsByTagName('aws:Site')[0]
self.assertEqual(topsites.node_text(site, 'aws:DataUrl'), 'baidu.com')
self.assertEqual(topsites.node_text(site, 'aws:Rank'), '1')
self.assertEqual(node_text(site, 'aws:DataUrl'), 'baidu.com')
self.assertEqual(node_text(site, 'aws:Rank'), '1')


if __name__ == '__main__':
Expand Down
101 changes: 101 additions & 0 deletions tests/unit/test_topsites_siterank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Tests for Siterank class."""

import unittest
from unittest.mock import patch

from tools.topsites.siterank import SiteRank


class TestSiteRank(unittest.TestCase):
    """Unit tests for the SiteRank class."""

    def setUp(self):
        """Create a fresh SiteRank instance for every test."""
        self.siterank = SiteRank()

    def test_get_priority(self):
        """Priority is derived from the rank and the initial priority."""
        # ((rank, initial priority), expected priority)
        expectations = (
            ((100, 1), 1),
            ((1000, 1), 2),
            ((9999, 1), 3),
            ((100, 2), 2),
            ((999, 2), 3),
        )
        for (rank, initial_priority), expected in expectations:
            actual = self.siterank.get_priority(rank, initial_priority)
            self.assertEqual(actual, expected)

    @patch('tools.topsites.siterank.site_global_session.add')
    def test_save_global_rank(self, session_mock):
        """A saved global rank is tracked in sites_global under its domain."""
        saved_row = self.siterank.save_global_rank('example.com', 555)
        expected_global = {'example.com': saved_row}
        self.assertEqual(self.siterank.sites_global, expected_global)

    @patch('tools.topsites.siterank.site_global_session.add')
    @patch('tools.topsites.siterank.site_regional_session.add')
    def test_save_regional_rank(self, r_mock, g_mock):
        """A saved regional rank is tracked in sites_regional."""
        saved_row = self.siterank.save_regional_rank('mysite.com', 49, 'DE')
        expected_regional = {'mysite.com': saved_row}
        self.assertEqual(self.siterank.sites_regional, expected_regional)

    @patch('tools.topsites.siterank.site_global_session.add')
    @patch('tools.topsites.siterank.site_regional_session.add')
    def test_save_regional_rank_same(self, r_mock, g_mock):
        """Regional rank is skipped when global priority is the same."""
        global_row = self.siterank.save_global_rank('example.com', 555)
        self.siterank.save_regional_rank('example.com', 48, 'GB')

        expected_global = {'example.com': global_row}
        self.assertEqual(self.siterank.sites_global, expected_global)
        self.assertEqual(self.siterank.sites_regional, {})

    @patch('tools.topsites.siterank.site_global_session.add')
    @patch('tools.topsites.siterank.site_regional_session.add')
    def test_save_regional_rank_lower(self, r_mock, g_mock):
        """Regional rank is kept when global priority is lower."""
        global_row = self.siterank.save_global_rank('example.com', 1001)
        regional_row = self.siterank.save_regional_rank(
            'example.com', 48, 'GB'
        )

        expected_global = {'example.com': global_row}
        expected_regional = {'example.com': regional_row}
        self.assertEqual(self.siterank.sites_global, expected_global)
        self.assertEqual(self.siterank.sites_regional, expected_regional)

    @patch('tools.topsites.siterank.site_global_session.add')
    @patch('tools.topsites.siterank.site_regional_session.add')
    def test_save_regional_rank_multiple(self, r_mock, g_mock):
        """With two regional ranks, the higher priority one is kept."""
        self.siterank.save_regional_rank('example.com', 101, 'GB')
        winning_row = self.siterank.save_regional_rank(
            'example.com', 49, 'DE'
        )

        expected_regional = {'example.com': winning_row}
        self.assertEqual(self.siterank.sites_regional, expected_regional)

    @patch('tools.topsites.siterank.site_global_session.add')
    @patch('tools.topsites.siterank.site_regional_session.add')
    def test_save_global_regional_rank_multiple(self, r_mock, g_mock):
        """A high global rank wins over several regional ranks."""
        global_row = self.siterank.save_global_rank('example.com', 12)
        for rank, country_code in ((101, 'GB'), (49, 'DE')):
            self.siterank.save_regional_rank(
                'example.com', rank, country_code
            )

        expected_global = {'example.com': global_row}
        self.assertEqual(self.siterank.sites_global, expected_global)
        self.assertEqual(self.siterank.sites_regional, {})


if __name__ == '__main__':
unittest.main()
79 changes: 67 additions & 12 deletions tests/unit/test_webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import webcompat

from webcompat.db import Site
from webcompat.db import SiteGlobal, SiteRegional
from webcompat.helpers import to_bytes
from webcompat.webhooks import helpers, ml

Expand Down Expand Up @@ -155,6 +155,11 @@ def setUp(self):
**Tested Another Browser**: Yes Edge
""" # noqa

self.issue_body12 = """
**URL**: https://subdomain.example.com/
<!-- @browser: Firefox Mobile (Tablet) 40.0 -->
"""

self.issue_info1 = {
'action': 'foobar',
'state': 'open',
Expand Down Expand Up @@ -350,18 +355,68 @@ def test_extract_extra_labels(self):
actual = helpers.extract_extra_labels(metadata_dict)
self.assertEqual(expected, actual)

def test_extract_priority_label(self):
"""Extract priority label."""
with patch('webcompat.db.site_db.query') as db_mock:
db_mock.return_value.filter_by.return_value = [
Site('google.com', 1, '', 1)]
priority_label = helpers.extract_priority_label(self.issue_body3)
self.assertEqual(priority_label, 'priority-critical')
priority_label_none = helpers.extract_priority_label(self.issue_body)
self.assertEqual(priority_label_none, None)

def test_get_issue_labels(self):
@patch('webcompat.db.regional_site_db.query')
@patch('webcompat.db.global_site_db.query')
def test_priority_label_regional_only(self, global_mock, regional_mock):
    """A regional db hit yields the label without querying the global db."""
    regional_hit = SiteRegional('example.com', 1, 'US', 1)
    regional_filter = regional_mock.return_value.filter_by
    regional_filter.return_value = [regional_hit]

    label = helpers.extract_priority_label(self.issue_body3)

    global_mock.assert_not_called()
    self.assertEqual(label, 'priority-critical')

@patch('webcompat.db.regional_site_db.query')
@patch('webcompat.db.global_site_db.query')
def test_priority_label_global_only(self, global_mock, regional_mock):
    """A site absent from the regional db is labelled from the global db."""
    # The regional lookup comes back empty; only the global one has a row.
    regional_mock.return_value.filter_by.return_value = []
    global_hit = SiteGlobal('example.com', 1, 3)
    global_mock.return_value.filter_by.return_value = [global_hit]

    label = helpers.extract_priority_label(self.issue_body3)

    global_mock.assert_called()
    self.assertEqual(label, 'priority-normal')

@patch('webcompat.db.regional_site_db.query')
@patch('webcompat.db.global_site_db.query')
def test_priority_label_sub(self, global_mock, regional_mock):
    """Extract priority label for an issue whose URL is a subdomain."""
    # The first regional lookup finds nothing; the second one returns the
    # example.com row.
    regional_lookups = [
        [],
        [SiteRegional('example.com', 1, 'US', 1)],
    ]
    regional_mock.return_value.filter_by.side_effect = regional_lookups
    global_mock.return_value.filter_by.return_value = []

    label = helpers.extract_priority_label(self.issue_body12)

    self.assertEqual(label, 'priority-critical')

@patch('webcompat.db.regional_site_db.query')
@patch('webcompat.db.global_site_db.query')
def test_priority_label_not_found(self, global_mock, regional_mock):
    """No priority label is returned when neither db knows the domain."""
    # Both dbs answer two consecutive lookups with an empty result.
    for db_mock in (regional_mock, global_mock):
        db_mock.return_value.filter_by.side_effect = [[], []]

    label = helpers.extract_priority_label(self.issue_body)

    self.assertEqual(label, None)

@patch('webcompat.db.regional_site_db.query')
@patch('webcompat.db.global_site_db.query')
def test_get_issue_labels(self, global_mock, regional_mock):
"""Extract list of labels from an issue body."""
regional_mock.return_value.filter_by.return_value = []
global_mock.return_value.filter_by.return_value = []

labels_tests = [
(self.issue_body, ['browser-firefox', 'type-media',
'engine-gecko']),
Expand Down
85 changes: 85 additions & 0 deletions tools/fetch_topsites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Script for assigning priority to sites.

Global ranking comes from the Tranco list; regional ranking is optionally
retrieved from Alexa. The rules are as follows:
Critical: top 100 worldwide
Important: top 101-1000 worldwide, or top 100 in a tier 1 country/region
Normal: top 1001-10000 worldwide, or top 101-1000 in a tier 1 country/region
"""

import argparse

from tools.topsites.tranco import Tranco
from tools.topsites.alexa import Alexa
from tools.topsites.siterank import SiteRank
from tools.topsites.siterank import DATA_PATH
from tools.topsites.utils import parse_site_dom_element

REGIONS = ['US', 'FR', 'IN', 'DE', 'TW', 'ID', 'HK', 'SG', 'PL',
'GB', 'RU']


def main() -> None:
    """Fetch top-site rankings and save them.

    Global ranks are always taken from the Tranco list, using each domain's
    1-based position in the list as its rank. When ``--retrieve-regional``
    is passed, per-country ranks are additionally queried from Alexa for
    every country code in ``REGIONS``; this requires both
    ``--ats_access_key`` and ``--ats_secret_key``.

    Raises:
        SystemExit: with status 2 (raised by ``parser.error``) when
            regional retrieval is requested without both Alexa keys.
    """
    description = "Script to fetch and update top site ranking from Tranco and optionally Alexa."
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        "--retrieve-regional",
        action="store_true",
        help="Whether to retrieve regional ranking from Alexa (access and secret keys will be required).",
    )

    parser.add_argument(
        "--ats_access_key",
        help="AWS access key.",
        type=str,
    )
    parser.add_argument(
        "--ats_secret_key",
        help="AWS secret key.",
        type=str,
    )

    args = parser.parse_args()

    if args.retrieve_regional:
        print('Warning: Alexa APIs will be deprecated on December 15, 2022.')

        if not (args.ats_secret_key and args.ats_access_key):
            # Report through argparse so the message goes to stderr with
            # the usage text and the process exits non-zero, instead of
            # returning as if the run had succeeded.
            parser.error('Please specify secret and access key for Alexa API.')

    tranco = Tranco(DATA_PATH)
    tranco_list = tranco.get_list()
    siterank = SiteRank()

    # Get global ranking
    for (i, domain) in enumerate(tranco_list, start=1):
        siterank.save_global_rank(domain, i)

    # Get ranking per country
    if args.retrieve_regional:
        alexa = Alexa(args.ats_access_key, args.ats_secret_key)
        for country_code in REGIONS:
            sites = alexa.query_topsites(country_code=country_code)
            for site in sites:
                url, rank = parse_site_dom_element(site)
                siterank.save_regional_rank(url, rank, country_code)

        siterank.commit_regional()

    siterank.commit_global()

    siterank.cleanup_db(args.retrieve_regional)
    tranco.cleanup()


if __name__ == "__main__":
main()


Loading

0 comments on commit b570c47

Please sign in to comment.