-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch.py
36 lines (33 loc) · 1.15 KB
/
fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from requests_html import HTMLSession
import argparse, time, os
def html_path(url):
    """Return the local file name under which the page for *url* is stored."""
    return url + ".html"


def metadata_lines(url):
    """Return the report lines describing the saved copy of *url*.

    The saved file is ``html_path(url)``.  When it does not exist the
    result is the single line ``"Never fetch <url>"``.  Otherwise the
    result holds four lines: the site name, the number of ``"<a "``
    substrings, the number of ``"<img "`` substrings, and the file's
    modification time formatted as local time.
    """
    path = html_path(url)
    # Open first and handle the missing file, instead of checking
    # os.path.exists() beforehand: that avoids a race between the check and
    # the read.
    try:
        # errors="replace" keeps the report working for files that were not
        # written as UTF-8; only the ASCII markers below are counted.
        with open(path, encoding="utf-8", errors="replace") as f:
            content = f.read()
        modified = os.stat(path).st_mtime
    except FileNotFoundError:
        return ["Never fetch " + url]

    fetched_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(modified))
    return [
        "site: " + url,
        "num_links: " + str(content.count("<a ")),
        "images: " + str(content.count("<img ")),
        "last_fetch: " + fetched_at,
    ]


def fetch_page(session, url):
    """Download ``https://<url>`` with *session* and save its HTML to disk.

    The page source is written to ``html_path(url)`` as UTF-8, so the
    result does not depend on the platform's default encoding.
    """
    response = session.get("https://" + url)
    with open(html_path(url), "w", encoding="utf-8") as f:
        f.write(response.html.html)


def main(argv=None):
    """Run the command-line tool and return its exit status.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when at least one URL could not be fetched.
    """
    parser = argparse.ArgumentParser(description='Fetch html.')
    parser.add_argument('urls', type=str, nargs='+')
    parser.add_argument('--metadata', action='store_true')
    args = parser.parse_args(argv)
    print(args.urls)

    if args.metadata:
        for url in args.urls:
            print("\n".join(metadata_lines(url)))
        return 0

    failed = False
    # One session is shared by all requests and closed when the loop ends.
    with HTMLSession() as session:
        for url in args.urls:
            try:
                fetch_page(session, url)
            except OSError as exc:
                # requests' RequestException derives from IOError (OSError),
                # so this covers network failures as well as file-system
                # errors while saving, without a bare ``except``.
                print(f"Failed to fetch {url}: {exc}")
                failed = True
    return 1 if failed else 0


if __name__ == "__main__":
    raise SystemExit(main())