# archive_manager.py
# The archive manager scrapes arXiv at most once a day and stores the
# results as JSON files named by date (one file per day).
import scraper
import tagger
import json
import os
from datetime import datetime, timedelta
# set up constants
ARCHIVE_DIR = "data/archive"
ARCHIVE_INTERVAL = timedelta(days=1)
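
# The resulting on-disk layout is one JSON file per day, e.g. (dates are
# illustrative only):
#   data/archive/2024-01-30.json
#   data/archive/2024-01-31.json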
def manage(now=None):
    # Scrape arXiv and archive the results; return True if a new archive
    # was written, False if today's archive already exists.
    # Note: a default like `now=datetime.now()` would be evaluated once at
    # import time, so the default is resolved here at call time instead.
    if now is None:
        now = datetime.now()
    # Create the archive directory if it doesn't exist.
    os.makedirs(ARCHIVE_DIR, exist_ok=True)
    # The file is named by the day it was written, so this runs at most
    # once per day.
    archive_filename = os.path.join(ARCHIVE_DIR, datetime.today().strftime("%Y-%m-%d") + ".json")
    if os.path.exists(archive_filename):
        return False
    # Scrape arXiv back to one archive interval before `now`.
    results = scraper.scrape_arxiv_until("ai", now - ARCHIVE_INTERVAL)
    for result in results:
        result["tags"] = tagger.tag_abstract(result["abstract"], thresh=0.23)
    # Serialize submission dates, then save the results to file;
    # default=str catches any remaining non-serializable values.
    with open(archive_filename, "w") as f:
        for result in results:
            result["submitted"] = result["submitted"].strftime("%Y-%m-%d")
        json.dump(results, f, default=str)
    print("Archive saved to", archive_filename)
    return True
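
# For reference, each archived record is a dict using at least the keys
# seen above; the full schema comes from scraper.scrape_arxiv_until, so
# anything beyond these keys is an assumption:
# {
#     "abstract": "...",          # fed to tagger.tag_abstract
#     "submitted": "2024-01-31",  # serialized via strftime in manage()
#     "tags": [...],              # added in manage()
# }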
# # Clean up old archives (currently disabled):
# for filename in os.listdir(ARCHIVE_DIR):
#     filepath = os.path.join(ARCHIVE_DIR, filename)
#     if os.path.isfile(filepath):
#         filetime = datetime.strptime(filename[:-5], "%Y-%m-%d")
#         if now - filetime > ARCHIVE_INTERVAL:
#             os.remove(filepath)
#             print("Removed old archive", filepath)
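
# The disabled block above would delete every archive older than one day,
# which is exactly the history fetch() depends on. Below is a sketch of a
# safer variant pruning against a longer, explicit retention window;
# RETENTION and prune_archives are hypothetical additions, not part of the
# original module.
RETENTION = timedelta(days=30)

def prune_archives(now=None):
    # Remove archive files older than RETENTION (sketch; never called here).
    if now is None:
        now = datetime.now()
    for filename in os.listdir(ARCHIVE_DIR):
        filepath = os.path.join(ARCHIVE_DIR, filename)
        if os.path.isfile(filepath) and filename.endswith(".json"):
            # Filenames are "%Y-%m-%d.json"; strip the extension to parse.
            filetime = datetime.strptime(filename[:-5], "%Y-%m-%d")
            if now - filetime > RETENTION:
                os.remove(filepath)
                print("Removed old archive", filepath)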
def fetch(start=None, stop=None):
    # Fetch archived papers from `start` back to `stop` (inclusive).
    # Returns a list of paper dicts; `start` and `stop` are datetimes and
    # both default to today (resolved at call time, since a
    # `datetime.today()` default would be frozen at import time).
    if start is None:
        start = datetime.today()
    if stop is None:
        stop = datetime.today()
    # Push `start` to the end of its day and floor `stop` to the start of
    # its day so papers on the boundary days are included.
    start = start.replace(hour=23, minute=59, second=59)
    stop = stop.replace(hour=0, minute=0, second=0, microsecond=0)
    archives = []
    file_date = start
    while file_date >= stop:
        archive_filename = os.path.join(ARCHIVE_DIR, file_date.strftime("%Y-%m-%d") + ".json")
        if os.path.exists(archive_filename):
            with open(archive_filename, "r") as f:
                for p in json.load(f):
                    p_date = datetime.strptime(p["submitted"], "%Y-%m-%d")
                    # Keep papers whose submission date falls between the
                    # stop date and the start date.
                    if stop <= p_date <= start:
                        archives.append(p)
        file_date -= ARCHIVE_INTERVAL
    return archives
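
# Example usage (hypothetical window): load everything archived in the
# last week, assuming manage() has been run over those days.
#
#     last_week = datetime.today() - timedelta(days=7)
#     papers = fetch(stop=last_week)
#     print(len(papers), "papers found")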
if __name__ == "__main__":
    import utils

    days = int(input("How many days to look back? "))
    find_date = datetime.today() - timedelta(days=days)
    manage(find_date)
    # Print each fetched paper, pausing between entries.
    for paper in fetch(stop=find_date):
        utils.dict_print(paper, pause=True)