-
Notifications
You must be signed in to change notification settings - Fork 192
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fixes #3656 - Fetch and save Tranco top 10k domains
- Loading branch information
Showing 1 changed file with 144 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
#!/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
# This Source Code Form is subject to the terms of the Mozilla Public | ||
# License, v. 2.0. If a copy of the MPL was not distributed with this | ||
# file, You can obtain one at http://mozilla.org/MPL/2.0/. | ||
"""GitHub Webhook module for assigning priority to sites.""" | ||
|
||
import requests | ||
import os | ||
import time | ||
from datetime import datetime, timedelta | ||
|
||
from sqlalchemy import Column | ||
from sqlalchemy import create_engine | ||
from sqlalchemy.ext.declarative import declarative_base | ||
from sqlalchemy import Integer | ||
from sqlalchemy.orm import sessionmaker | ||
from sqlalchemy import String | ||
|
||
# Directory, inside the current working directory, that holds the cached
# Tranco CSV files and the SQLite databases.
cwd = os.getcwd()
DATA_PATH = os.path.join(cwd, 'data')

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate os.path.exists() check and os.mkdir().
os.makedirs(DATA_PATH, exist_ok=True)
|
||
# Rows are written to a separate "-new" SQLite file inside DATA_PATH.
_NEW_DB_FILE = os.path.join(DATA_PATH, 'topsites-tranco-new.db')

engine = create_engine('sqlite:///' + _NEW_DB_FILE)
Base = declarative_base()
Session = sessionmaker(bind=engine)
session = Session()

# Base URL of the Tranco "list by date" endpoint; the date is appended to it.
TRANCO_API = 'https://tranco-list.eu/api/lists/date/'
# File-name prefix of the CSV files cached in DATA_PATH.
CSV_FILE_PREFIX = 'tranco_'
|
||
|
||
class Site(Base):
    """SQLAlchemy model for one ranked entry of the Tranco top-sites list."""

    __tablename__ = "topsites-tranco"

    # The site URL/domain is the primary key, so each site is stored once.
    url = Column(String, primary_key=True)
    ranking = Column(Integer)

    def __init__(self, url, ranking):
        """Create a row for *url* holding its position *ranking*."""
        self.ranking = ranking
        self.url = url
|
||
|
||
# Create the tables declared on Base (the "topsites-tranco" table) in the
# database bound to ``engine``.
Base.metadata.create_all(engine)
|
||
|
||
class List:
    """An ordered list of domains together with the metadata of that list."""

    def __init__(self, date: str, list_id: str, size: int, tranco_list: list) -> None:
        """Store the list metadata and the rank-ordered entries."""
        self.list = tranco_list
        self.size = size
        self.list_id = list_id
        self.date = date

    def get_top(self, num=100) -> list:
        """Return the first *num* entries of the list (100 by default)."""
        top_entries = self.list[:num]
        return top_entries

    def get_rank(self, domain) -> int:
        """Return the 1-based position of *domain*, or -1 when it is absent.

        A short notice is printed to stdout when the domain is not found.
        """
        try:
            position = self.list.index(domain)
        except ValueError:
            print('domain is not in the list')
            return -1
        return position + 1
|
||
|
||
class Tranco:
    """Fetch the Tranco list for the previous day and cache it as a CSV file."""

    # Seconds to wait for the Tranco servers. Without a timeout, requests
    # can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        """Use the module-level ``DATA_PATH`` as the cache directory."""
        self.data_path = DATA_PATH

    def get_file_path(self, list_id: str) -> str:
        """Return the path of the cached CSV file for *list_id*."""
        return os.path.join(self.data_path, list_id + '.csv')

    def get_creation_date(self) -> str:
        """Return yesterday's date in UTC, formatted as ``YYYY-MM-DD``."""
        # Imported here because the file-level import only brings in
        # ``datetime`` and ``timedelta``; ``datetime.utcnow()`` is deprecated.
        from datetime import timezone

        day_before = datetime.now(timezone.utc) - timedelta(days=1)
        return day_before.strftime('%Y-%m-%d')

    def get_latest_list_data(self, creation_date: str) -> dict:
        """Return the decoded JSON the API serves for *creation_date*.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        headers = {'user-agent': 'webcompat-topsites-service'}
        # ``headers`` must be passed by keyword: the second positional
        # argument of requests.get() is ``params``, which would send the
        # user-agent as a query string instead of an HTTP header.
        response = requests.get(
            TRANCO_API + creation_date,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def download_csv(self, list_id: str, size: int, creation_date: str) -> str:
        """Download the list, save it under *list_id* and return its text.

        The download URL is the ``download`` value of the API response with
        *size* appended.

        Raises:
            ValueError: if the API response carries no ``download`` entry.
            requests.HTTPError: if either request returns an error status.
        """
        data = self.get_latest_list_data(creation_date)

        if 'download' not in data:
            # Fail here with a clear message instead of handing a missing
            # result to the caller.
            raise ValueError(
                'Tranco API response for {} has no download URL'.format(
                    creation_date))

        download_url = data['download'] + str(size)
        dr = requests.get(
            download_url, stream=True, timeout=self.REQUEST_TIMEOUT)
        dr.raise_for_status()

        file_bytes = dr.content
        with open(self.get_file_path(list_id), 'wb') as f:
            f.write(file_bytes)
        return file_bytes.decode('utf-8')

    @staticmethod
    def _parse_domains(csv_text: str) -> list:
        """Return the text after the first comma of every non-blank line.

        Blank lines are skipped.

        Raises:
            ValueError: if a non-blank line contains no comma.
        """
        domains = []
        for line in csv_text.splitlines():
            if not line.strip():
                continue
            _rank, separator, domain = line.partition(',')
            if not separator:
                raise ValueError(
                    'malformed Tranco CSV line: {!r}'.format(line))
            domains.append(domain)
        return domains

    def get_list(self, size: int = 10000) -> "List":
        """Return yesterday's list, downloading it when it is not cached.

        When a cached CSV file for that date already exists it is used as
        is and *size* is only recorded on the returned object.
        """
        creation_date = self.get_creation_date()
        list_id = CSV_FILE_PREFIX + creation_date
        file_path = self.get_file_path(list_id)
        if os.path.exists(file_path):
            # download_csv() writes the file as UTF-8 bytes, so read it back
            # as UTF-8 instead of the platform's default encoding.
            with open(file_path, encoding='utf-8') as f:
                csv_text = f.read()
        else:
            csv_text = self.download_csv(list_id, size, creation_date)

        return List(creation_date, list_id, size,
                    self._parse_domains(csv_text))
|
||
|
||
class TopSitesGlobal:
    """Stage ranked sites on the module-level SQLAlchemy session."""

    def __init__(self) -> None:
        """Start with an empty ``topsites`` cache."""
        # Cache parsed sites, change priority if raised
        self.topsites = dict()

    def update_priority(self, url: str, rank: int) -> None:
        """Add a ``Site`` row for *url* with ranking *rank* to the session.

        The row is only added to the session; nothing is committed here.
        """
        session.add(Site(url, rank))
|
||
|
||
if __name__ == "__main__":
    # Load the latest list and stage one row per domain; the ranking is the
    # 1-based position of the domain in the list.
    fetcher = Tranco()
    ranking = fetcher.get_list()
    top_global = TopSitesGlobal()

    for position, domain in enumerate(ranking.list, start=1):
        top_global.update_priority(domain, position)

    session.commit()
    session.close()

    # Rotate the database files: move the current database, if there is one,
    # to an archive named after today's local date, then rename the freshly
    # written "-new" database to take its place.
    live_db = os.path.join(DATA_PATH, 'topsites-tranco.db')
    new_db = os.path.join(DATA_PATH, 'topsites-tranco-new.db')
    archive_date = time.strftime("%Y%m%d", time.localtime())
    archive_db = os.path.join(
        DATA_PATH, 'topsites-tranco-archive-{}.db'.format(archive_date))

    if os.path.isfile(live_db):
        os.rename(live_db, archive_db)
    os.rename(new_db, live_db)
|
||
|