Fixes #3656 - Fetch and save Tranco top 10k domains

webcompat · Mar 29, 2022 · e014c24 · e014c24
1 parent c1a035e
commit e014c24
Showing 1 changed file with 144 additions and 0 deletions.
diff --git a/tools/topsites_tranco.py b/tools/topsites_tranco.py
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+"""Fetch the Tranco top sites list and store the rankings in a SQLite DB."""
+
+import requests
+import os
+import time
+from datetime import datetime, timedelta
+
+from sqlalchemy import Column
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Integer
+from sqlalchemy.orm import sessionmaker
+from sqlalchemy import String
+
+# Everything this script reads or writes lives in ./data, resolved against
+# the directory the script is launched from (not the script's own location).
+cwd = os.getcwd()
+DATA_PATH = os.path.join(cwd, 'data')
+
+# exist_ok avoids the check-then-create race of exists() followed by mkdir().
+os.makedirs(DATA_PATH, exist_ok=True)
+
+# The rankings are written to a scratch "-new" database; the __main__ block
+# renames it to 'topsites-tranco.db' after the rows have been committed.
+engine = create_engine('sqlite:///' + os.path.join(DATA_PATH, 'topsites-tranco-new.db'))
+Base = declarative_base()
+Session = sessionmaker(bind=engine)
+session = Session()
+
+# Endpoint prefix; the list's creation date (YYYY-MM-DD) is appended to it.
+TRANCO_API = 'https://tranco-list.eu/api/lists/date/'
+# Prefix of the cached CSV file names stored in DATA_PATH.
+CSV_FILE_PREFIX = 'tranco_'
+
+
+class Site(Base):
+    """SQLAlchemy model for one ranked site from the Tranco list.
+
+    Each row pairs a site with its ranking and is stored in the
+    ``topsites-tranco`` table.
+    """
+
+    __tablename__ = "topsites-tranco"
+
+    # The site itself; it is the primary key of the table.
+    url = Column(String, primary_key=True)
+    # Position of the site in the list.
+    ranking = Column(Integer)
+
+    def __init__(self, url: str, ranking: int) -> None:
+        """Initialize a row with the site's url and its ranking."""
+        self.url = url
+        self.ranking = ranking
+
+
+# Create the tables of the models declared on Base (the Site model above)
+# in the database behind `engine`.
+Base.metadata.create_all(engine)
+
+
+class List:
+    """A ranked list of domains plus the metadata that identifies it."""
+
+    def __init__(self, date: str, list_id: str, size: int, tranco_list: list) -> None:
+        """Store the list's date, identifier, size and ranked entries."""
+        self.list = tranco_list
+        self.size = size
+        self.list_id = list_id
+        self.date = date
+
+    def get_top(self, num=100) -> list:
+        """Return the first ``num`` entries of the list (100 by default)."""
+        leading = self.list[:num]
+        return leading
+
+    def get_rank(self, domain) -> int:
+        """Return the 1-based position of ``domain``, or -1 if it is absent.
+
+        A message is printed when the domain is not in the list.
+        """
+        try:
+            position = self.list.index(domain)
+        except ValueError:
+            print('domain is not in the list')
+            return -1
+        else:
+            # list.index() is 0-based; ranks start at 1.
+            return position + 1
+
+
+class Tranco:
+    """Fetch the daily Tranco list, caching the downloaded CSV on disk."""
+
+    # Seconds to wait on the Tranco server before giving up; without a
+    # timeout `requests` can block forever on a stalled connection.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self) -> None:
+        """Keep the cached CSV files in the module-level DATA_PATH."""
+        self.data_path = DATA_PATH
+
+    def get_file_path(self, list_id: str) -> str:
+        """Return the path of the cached CSV file for ``list_id``."""
+        return os.path.join(self.data_path, list_id + '.csv')
+
+    def get_creation_date(self) -> str:
+        """Return yesterday's UTC date formatted as YYYY-MM-DD."""
+        day_before = datetime.utcnow() - timedelta(days=1)
+        return day_before.strftime('%Y-%m-%d')
+
+    def get_latest_list_data(self, creation_date: str) -> dict:
+        """Return the decoded JSON the API serves for ``creation_date``.
+
+        Raises:
+            requests.HTTPError: if the API answers with an error status.
+        """
+        headers = {'user-agent': 'webcompat-topsites-service'}
+        # The second positional argument of requests.get() is `params`, so
+        # the headers must be passed by keyword to be sent as headers.
+        response = requests.get(TRANCO_API + creation_date,
+                                headers=headers,
+                                timeout=self.REQUEST_TIMEOUT)
+        response.raise_for_status()
+        return response.json()
+
+    def download_csv(self, list_id: str, size: int, creation_date: str) -> str:
+        """Download the list, cache it as ``<list_id>.csv`` and return its text.
+
+        Raises:
+            ValueError: if the API response has no ``download`` entry.
+            requests.HTTPError: if either request answers with an error
+                status.
+        """
+        data = self.get_latest_list_data(creation_date)
+
+        if 'download' not in data:
+            # Fail here with a clear message instead of returning None and
+            # letting the caller crash on it.
+            raise ValueError(
+                'Tranco API returned no download URL for the list created on {}'
+                .format(creation_date))
+
+        # NOTE(review): assumes data['download'] is a prefix the size can be
+        # appended to directly - confirm against the Tranco API.
+        download_url = data['download'] + str(size)
+        dr = requests.get(download_url, timeout=self.REQUEST_TIMEOUT)
+        dr.raise_for_status()
+
+        file_bytes = dr.content
+        # Decode before writing so an undecodable payload is never cached.
+        csv_text = file_bytes.decode('utf-8')
+        with open(self.get_file_path(list_id), 'wb') as f:
+            f.write(file_bytes)
+        return csv_text
+
+    @staticmethod
+    def _parse_domains(csv_text: str) -> list:
+        """Return the text after the first comma of each line, in order.
+
+        Lines without a comma (blank or malformed) are skipped.
+        """
+        domains = []
+        for line in csv_text.splitlines():
+            _rank, separator, domain = line.partition(',')
+            if separator:
+                domains.append(domain)
+        return domains
+
+    def get_list(self, size: int = 10000) -> List:
+        """Return yesterday's list, from the cached CSV when one exists.
+
+        If no cached file is found the list is downloaded with ``size``
+        entries requested; a cached file is used as-is.
+        """
+        creation_date = self.get_creation_date()
+        list_id = CSV_FILE_PREFIX + creation_date
+        file_path = self.get_file_path(list_id)
+        if os.path.exists(file_path):
+            # The cache is written as UTF-8 bytes, so read it back as UTF-8
+            # rather than with the platform's default encoding.
+            with open(file_path, encoding='utf-8') as f:
+                csv_text = f.read()
+        else:
+            csv_text = self.download_csv(list_id, size, creation_date)
+
+        return List(creation_date, list_id, size, self._parse_domains(csv_text))
+
+
+class TopSitesGlobal:
+    """Queue ranked sites on the module-level session."""
+
+    def __init__(self) -> None:
+        """Start with an empty ``topsites`` mapping."""
+        # Meant as a cache of parsed sites (change priority if raised);
+        # nothing in this class reads or fills it.
+        self.topsites = dict()
+
+    def update_priority(self, url: str, rank: int) -> None:
+        """Add a Site row for ``url`` with ``rank`` to the session.
+
+        The row is only added here; committing is left to the caller.
+        """
+        session.add(Site(url, rank))
+
+
+if __name__ == "__main__":
+    # Fill the scratch "-new" database with the ranked list, then swap it in
+    # as the live database, archiving the previous live file first.
+    tranco = Tranco()
+    tranco_list = tranco.get_list()
+    top_global = TopSitesGlobal()
+
+    try:
+        for rank, domain in enumerate(tranco_list.list, start=1):
+            top_global.update_priority(domain, rank)
+        session.commit()
+    finally:
+        # Close the session even when the import fails part-way.
+        session.close()
+        # Drop pooled connections so the database file is no longer held
+        # open when it gets renamed below.
+        engine.dispose()
+
+    archive_date = time.strftime("%Y%m%d", time.localtime())
+    live_db = os.path.join(DATA_PATH, 'topsites-tranco.db')
+    new_db = os.path.join(DATA_PATH, 'topsites-tranco-new.db')
+    archive_db = os.path.join(
+        DATA_PATH, 'topsites-tranco-archive-{}.db'.format(archive_date))
+
+    # os.replace() overwrites an existing destination on every platform;
+    # os.rename() raises on Windows when the target already exists, e.g. on
+    # a second run during the same day.
+    if os.path.isfile(live_db):
+        os.replace(live_db, archive_db)
+    os.replace(new_db, live_db)
+
+