# collect_links.py
import time
import pprint
from datetime import date
from random import randint

import requests
from bs4 import BeautifulSoup

from text_file_generator import save_to_file

pp = pprint.PrettyPrinter(indent=4)
# This script collects the link to every work in an AO3 fandom tag, in this case the Star Wars Prequel Trilogy fandom. Because I love the Prequel Trilogy.
fandom = {
    'name': 'Star Wars Prequel Trilogy',
    'url': "https://archiveofourown.org/tags/Star%20Wars%20Prequel%20Trilogy/works?page=1&view_adult=true",
    'stories': []
}
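# To collect a different fandom, swap in that tag's works URL. A hypothetical example
# (the fandom name below is made up; only the URL pattern matches the one above):
# fandom = {
#     'name': 'Some Other Fandom',
#     'url': 'https://archiveofourown.org/tags/Some%20Other%20Fandom/works?page=1&view_adult=true',
#     'stories': []
# }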
url = fandom['url']
while url:
    try:
        req = requests.get(url)
        print(f'Starting on {url}')
        time.sleep(randint(1, 5))  # Wait a few seconds between requests to go easy on the server.
        soup = BeautifulSoup(req.text, "html.parser")
        works = soup.select('div.header.module')
        # Get the link of each story on the page and save it with the save_to_file function.
        for work in works:
            try:
                time.sleep(randint(1, 5))
                # The href already starts with '/', so don't add another slash.
                story_url = f'https://archiveofourown.org{work.a.get("href")}?view_full_work=true'
                save_to_file(story_url)
                print()
            except Exception as error:
                print(error)
                print('STORY WAS SKIPPED')
                continue
        # Create a dict with story information and append it to the list of stories in the
        # fandom dict. I may refactor this to save the stories in a CSV file first.
        # fandom_tags = work.select('h5 a')
        # try:
        #     author_name = work.find('a', {'rel': 'author'}).text
        # # If the author is anonymous or has no link this won't work
        # except AttributeError:
        #     # This code finds the author name if it's not in an <a> tag
        #     by_line = work.h4.text.strip()
        #     author_name = by_line[by_line.find('by'):].replace('by', '').strip()
        #
        # fandom['stories'].append(
        #     {
        #         'title': work.a.text.strip(),
        #         'url': work.a.get('href'),
        #         'fandoms': [tag.text.strip() for tag in fandom_tags],
        #         'author': author_name
        #     }
        # )
        # Get the link to the next page if it exists. When there is no 'next' link,
        # find() returns None and the AttributeError below ends the loop.
        try:
            next_page = soup.find('li', {'class': 'next'}).a.get('href')
            url = f'https://archiveofourown.org{next_page}'
        except AttributeError:
            url = None
        print(f'CURRENT PAGE: {url}\n')
    # If there is an error loading or parsing the page, report it and carry on.
    # Note that url is unchanged here, so the same page is retried on the next pass.
    except Exception as error:
        print(error)
        print(f'PAGE SKIPPED\n{url}')
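# For reference, the li.next lookup above assumes AO3's pagination markup looks
# roughly like this (simplified from memory, not an exact copy of the page):
# <ol class="pagination">
#     ...
#     <li class="next"><a href="/tags/Star%20Wars%20Prequel%20Trilogy/works?page=2&view_adult=true">Next</a></li>
# </ol>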
# Save the stories collected in fandom['stories'].
# for story in fandom['stories']:
#     try:
#         save_to_file(f'https://archiveofourown.org{story["url"]}?view_full_work=true&view_adult=true')
#     except Exception as e:
#         print(f'ERROR! {story["url"]} was not saved.')
#         print(e)
#         continue
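# save_to_file lives in text_file_generator, which isn't shown here. A minimal sketch
# of what it might look like, assuming it downloads the full work page and writes the
# text to a dated .txt file (the date import above hints at that). The real
# implementation may differ:
#
# def save_to_file(story_url):
#     req = requests.get(story_url)
#     soup = BeautifulSoup(req.text, 'html.parser')
#     title = soup.find('h2', {'class': 'title'}).text.strip()
#     with open(f'{title} - {date.today()}.txt', 'w', encoding='utf-8') as file:
#         file.write(soup.get_text())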