Skip to content

Commit

Permalink
Fixes #3656 - Fetch and save Tranco top 10k domains
Browse files Browse the repository at this point in the history
  • Loading branch information
ksy36 committed Mar 29, 2022
1 parent c1a035e commit e014c24
Showing 1 changed file with 144 additions and 0 deletions.
144 changes: 144 additions & 0 deletions tools/topsites_tranco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Fetch the Tranco top sites list and store the domain rankings in SQLite."""

import requests
import os
import time
from datetime import datetime, timedelta

from sqlalchemy import Column
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Integer
from sqlalchemy.orm import sessionmaker
from sqlalchemy import String

# Directory (under the current working directory) that holds the cached
# Tranco CSV files and the SQLite databases written by this script.
cwd = os.getcwd()
DATA_PATH = os.path.join(cwd, 'data')

# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the directory between the check and mkdir().
os.makedirs(DATA_PATH, exist_ok=True)

# Rankings are written to a "-new" SQLite file inside DATA_PATH; the script's
# __main__ section renames it to the live database name once the commit is done.
engine = create_engine('sqlite:///' + os.path.join(DATA_PATH, 'topsites-tranco-new.db'))
Base = declarative_base()
# A single module-level session bound to the engine above is shared by the
# code in this file.
Session = sessionmaker(bind=engine)
session = Session()

# Base URL of the Tranco "list by date" endpoint; a YYYY-MM-DD date string is
# appended to it when the list metadata is requested.
TRANCO_API = 'https://tranco-list.eu/api/lists/date/'
# Prefix of the list id; the list id also names the cached CSV file.
CSV_FILE_PREFIX = 'tranco_'


class Site(Base):
    """SQLAlchemy model for one entry of the Tranco top sites list.

    Each row maps a domain to its position in the list.
    """

    __tablename__ = "topsites-tranco"

    # Domain as it appears in the Tranco list; primary key of the table.
    url = Column(String, primary_key=True)
    # Position of the domain in the list (the script numbers them from 1).
    ranking = Column(Integer)

    def __init__(self, url, ranking):
        """Create a row for ``url`` with the given ``ranking``."""
        self.url = url
        self.ranking = ranking

    def __repr__(self):
        """Return a debugging representation showing both columns."""
        return 'Site(url={!r}, ranking={!r})'.format(self.url, self.ranking)


# Emit the schema for every model declared on Base (here: Site) into the
# database that `engine` points at.
Base.metadata.create_all(engine)


class List:
    """In-memory view of one Tranco list.

    Attributes:
        date: Date string identifying the list.
        list_id: Identifier of the list (also used as the CSV file name).
        size: Number of domains that was requested for the list.
        list: Domains ordered by rank, best-ranked first.
    """

    def __init__(self, date: str, list_id: str, size: int, tranco_list: list) -> None:
        """Store the list metadata and the ordered domains."""
        self.date = date
        self.list_id = list_id
        self.size = size
        self.list = tranco_list

    def get_top(self, num=100) -> list:
        """Return the first ``num`` domains of the list as a new list."""
        return self.list[:num]

    def get_rank(self, domain) -> int:
        """Return the 1-based rank of ``domain``.

        When the domain is not part of the list a message is printed and
        -1 is returned instead of raising.
        """
        try:
            position = self.list.index(domain)
        except ValueError:
            print('domain is not in the list')
            return -1
        # list.index() is 0-based while ranks start at 1.
        return position + 1


class Tranco:
    """Fetch the daily Tranco list, caching the downloaded CSV on disk."""

    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the script indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        """Use the module-level data directory for cached CSV files."""
        self.data_path = DATA_PATH

    def get_file_path(self, list_id: str) -> str:
        """Return the path of the cached CSV file for ``list_id``."""
        return os.path.join(self.data_path, list_id + '.csv')

    def get_creation_date(self) -> str:
        """Return yesterday's UTC date formatted as ``YYYY-MM-DD``."""
        # Local import: the file-level import only brings in datetime/timedelta.
        from datetime import timezone

        # Timezone-aware replacement for the deprecated datetime.utcnow().
        day_before = datetime.now(timezone.utc) - timedelta(days=1)
        return day_before.strftime('%Y-%m-%d')

    def get_latest_list_data(self, creation_date: str) -> dict:
        """Return the JSON metadata of the list created on ``creation_date``.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        headers = {'user-agent': 'webcompat-topsites-service'}
        # headers must be passed by keyword: the second positional argument
        # of requests.get() is `params`, which would turn the dict into a
        # query string instead of sending it as request headers.
        response = requests.get(
            TRANCO_API + creation_date,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def download_csv(self, list_id: str, size: int, creation_date: str) -> str:
        """Download the top ``size`` entries, cache them and return the CSV text.

        The raw bytes are written to the cache file for ``list_id`` and the
        same content is returned decoded as UTF-8.

        Raises:
            ValueError: if the list metadata contains no ``download`` URL.
            requests.HTTPError: if either request answers with an error status.
        """
        data = self.get_latest_list_data(creation_date)
        if 'download' not in data:
            # Fail here with a clear message instead of returning None and
            # crashing later when the caller tries to split the text.
            raise ValueError(
                'Tranco list data for {} has no download URL'.format(creation_date)
            )

        # The requested list size is appended to the download URL.
        download_url = data['download'] + str(size)
        download_response = requests.get(download_url, timeout=self.REQUEST_TIMEOUT)
        download_response.raise_for_status()

        file_bytes = download_response.content
        with open(self.get_file_path(list_id), 'wb') as f:
            f.write(file_bytes)
        return file_bytes.decode('utf-8')

    @staticmethod
    def _parse_domains(csv_text: str) -> list:
        """Return the domain column of ``rank,domain`` CSV text, in order."""
        domains = []
        for line in csv_text.splitlines():
            # Everything after the first comma is the domain; blank lines
            # and lines without a comma are skipped instead of raising.
            _rank, separator, domain = line.partition(',')
            if separator and domain:
                domains.append(domain)
        return domains

    # The return annotation is a string so it is not evaluated when the
    # method is defined.
    def get_list(self, size: int = 10000) -> 'List':
        """Return yesterday's list, downloading it unless a cached CSV exists.

        An existing cache file is used as-is, whatever ``size`` it was
        originally downloaded with.
        """
        creation_date = self.get_creation_date()
        list_id = CSV_FILE_PREFIX + creation_date
        file_path = self.get_file_path(list_id)
        if os.path.exists(file_path):
            # The cache was written as UTF-8 bytes, so read it back as UTF-8
            # rather than with the platform default encoding.
            with open(file_path, encoding='utf-8') as f:
                csv_text = f.read()
        else:
            csv_text = self.download_csv(list_id, size, creation_date)

        return List(creation_date, list_id, size, self._parse_domains(csv_text))


class TopSitesGlobal:
    """Stage global top-site rankings on the module-level database session."""

    def __init__(self) -> None:
        """Start with an empty per-instance site cache."""
        # Cache parsed sites, change priority if raised
        self.topsites = dict()

    def update_priority(self, url: str, rank: int) -> None:
        """Add a ``Site`` row for ``url`` with ``rank`` to the session.

        Nothing is committed here; the caller commits the session.
        """
        session.add(Site(url, rank))


if __name__ == "__main__":
    # Fetch yesterday's Tranco list and stage one row per domain.
    tranco = Tranco()
    tranco_list = tranco.get_list()
    top_global = TopSitesGlobal()

    try:
        # Ranks are 1-based, following the order of the downloaded list.
        for rank, domain in enumerate(tranco_list.list, start=1):
            top_global.update_priority(domain, rank)
        session.commit()
    finally:
        # Release the session even when staging or committing fails.
        session.close()

    archive_date = time.strftime("%Y%m%d", time.localtime())
    live_db = os.path.join(DATA_PATH, 'topsites-tranco.db')
    new_db = os.path.join(DATA_PATH, 'topsites-tranco-new.db')
    archive_db = os.path.join(
        DATA_PATH, 'topsites-tranco-archive-{}.db'.format(archive_date))

    # Archive the current database, then promote the freshly written one.
    # os.replace overwrites an existing target on every platform, so a second
    # run on the same day does not fail on an already existing archive file.
    if os.path.isfile(live_db):
        os.replace(live_db, archive_db)
    os.replace(new_db, live_db)


0 comments on commit e014c24

Please sign in to comment.