-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathbuildBackgrounds-aon.py
210 lines (168 loc) · 11.6 KB
/
buildBackgrounds-aon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import json
import datetime
import codecs
import time
import re
###TODO: Some backgrounds, like "Undercover Lotus Guard", get conditional bonuses to skills without giving you training in them. Right now those skills get put in 'skillsTrained', which is incorrect. Find some way to consistently ignore those extra skills.
###TODO: Right now we don't keep italic or bold text markers. Should we? If not, we should let the "This tag wasn't captured" section know to ignore <i> tags.
# Root object of the JSON output. The per-category background lists are added
# to it by the scraping loop at the bottom of the file.
backgroundHolder = {
    'name': 'Pathfinder 2.0 background list',
    'date': datetime.date.today().strftime("%B %d, %Y"),
}

# Bold labels that get_detail_entry accepts as ability boosts.
validAbilities = [
    'Strength',
    'Dexterity',
    'Constitution',
    'Intelligence',
    'Wisdom',
    'Charisma',
]

# Page path (appended to https://2e.aonprd.com/) -> category label used as the
# key for that page's backgrounds in backgroundHolder.
backgroundLinks = {
    "Backgrounds.aspx": "General",
    "Backgrounds.aspx?Legacy=true": "Legacy",
    "Backgrounds.aspx?Regional=true": "Regional",
    "Backgrounds.aspx?Group=1": "The Fall of Plaguestone",
    "Backgrounds.aspx?Group=2": "Age of Ashes",
    "Backgrounds.aspx?Group=3": "Extinction Curse",
    "Backgrounds.aspx?Group=4": "Pathfinder Society",
    "Backgrounds.aspx?Group=5": "Agents of Edgewatch",
}
# Typographic characters (keyed by Unicode code point) and the plain-ASCII
# text each one is replaced with.
_ENCODER_REPLACEMENTS = {
    0x2019: "'",    # right single quotation mark
    0x2014: "-",    # em dash
    0x2011: " ",    # non-breaking hyphen (deliberately kept as a space)
    0x201C: "'",    # left double quotation mark
    0x201D: "'",    # right double quotation mark
    0x2026: "...",  # horizontal ellipsis
}


def encoder(words):
    """Return *words* with typographic punctuation replaced by ASCII text.

    Args:
        words: The text to clean up (a ``str`` or a ``str`` subclass).

    Returns:
        A new string in which curly quotes become ``'``, the em dash becomes
        ``-``, the non-breaking hyphen becomes a space and the ellipsis
        character becomes ``...``. All other characters are left unchanged.
    """
    # One translate() pass replaces the previous chain of six replace() calls;
    # none of the replacement texts contains a character that is itself
    # replaced, so the result is the same.
    return words.translate(_ENCODER_REPLACEMENTS)
def get_detail_entry(backEntry):
    """Parse one background heading and the nodes that follow it into a dict.

    Starting at the heading, the function reads the source link and the PFS
    marker, then walks the heading's following siblings until the next
    ``h2``/``h3`` (or the last sibling), sorting what it finds into ability
    boosts, trained skills, granted feats, prerequisites, an optional
    "Activate" special block and the free-text description.

    Args:
        backEntry: The ``<h2>`` element of one background, as produced by
            ``get_detail_list`` (a Beautiful Soup ``Tag``).

    Returns:
        A dict with the keys ``name``, ``source``, ``PFS``, ``prerequisites``,
        ``description``, ``abilityBoosts``, ``skillsTrained``, ``featsGained``
        and ``descSpecial``; ``link`` is present only when the relevant href
        starts with "Backgrounds".

    NOTE(review): the indentation of this function was reconstructed from a
    copy that had lost all leading whitespace. Places where the nesting could
    not be determined with certainty are marked with NOTE(review) below.
    """
    detail = {}  # Package of a given background
    description = []
    abilityBoosts = []
    skillsTrained = []
    featsGained = []
    descSpecial = {}
    prerequisites = []
    if str(backEntry.get_text()).endswith("Background"):  # Only grabs the entries with "Background" at the end, hopefully avoiding things we don't care about
        entryName = (str(backEntry.get_text()))[:-10]  # Removes "Background" (10 characters) from the end of the text we found
        detail['name'] = entryName
        detail['source'] = backEntry.find_next_sibling("a").get_text()
        # Link and PFS handling
        if (backEntry.find('a').get('href')) == 'PFS.aspx':  # Some entries don't have PFS markers at all, making us navigate links differently
            # The PFS status is the alt text of the icon inside the PFS link.
            detail['PFS'] = str(backEntry.find('a').span.img.get('alt'))
            if (backEntry.find('a').find_next_sibling('a').get('href').startswith("Backgrounds")):  # If there's a PFS link, the thing after it is the background page link
                detail['link'] = str("https://2e.aonprd.com/" + backEntry.find('a').find_next_sibling('a').get('href'))
        elif (isinstance(backEntry.previous_sibling, Tag)) and (backEntry.previous_sibling.find_next('a', attrs={"href": "PFS.aspx"})):  # Some of the initial broken entries need special treatment to get the PFS data
            detail['PFS'] = str(backEntry.previous_sibling.find_next('a', attrs={"href": "PFS.aspx"}).span.img.get('alt'))
            if (backEntry.find('a').get('href').startswith("Backgrounds")):  # If there's no PFS link inside the heading, this link should be the background link.
                detail['link'] = str("https://2e.aonprd.com/" + backEntry.find('a').get('href'))
        else:
            detail['PFS'] = "No PFS Data"
            if (backEntry.find('a').get('href').startswith("Backgrounds")):  # If there's no PFS link, this link should be the background link.
                detail['link'] = str("https://2e.aonprd.com/" + backEntry.find('a').get('href'))
        # Set our pointer, skipping over whitespace text and sup tags
        descPoint = backEntry.find_next_sibling("a").next_sibling
        if (descPoint.name is None) and (descPoint.strip() == ''):  # Skip over whitespace (text nodes have no tag name)
            descPoint = descPoint.next_sibling
        while descPoint.name == 'sup':  # Some things have sup tags, some don't. We need to skip over said tags.
            descPoint = descPoint.next_sibling
        # Walk the siblings until the next heading. Because the condition also
        # requires a following sibling, the very last sibling is not processed
        # by the loop body.
        while descPoint != None and descPoint.name != "h2" and descPoint.name != "h3" and descPoint.next_sibling != None:
            if isinstance(descPoint, Tag):  # Note that our `from bs4 import Tag` is necessary to check for the Tag type
                # Bold block
                if (descPoint.name == 'b'):  # Ability boosts, Activates, and Prerequisites (including prereq Regions) are all in bold <b> tags
                    if (str(descPoint.text) == "Activate"):  # Activate, like in the "Cursed" background
                        descSpecial['specType'] = str(encoder(descPoint.text))
                        # The action cost is the alt text of the next sibling <img>.
                        descSpecial['specAction'] = str(encoder(descPoint.find_next_sibling('img').get('alt')))
                        descSpecial['specDescription'] = []
                        if descPoint.find_next_sibling('a').get('href').startswith("Traits"):
                            descSpecial['specTrait'] = descPoint.find_next_sibling('a').text
                        # Consume everything up to the next heading (or the end
                        # of the siblings), concatenating each node's .string
                        # into a single specDescription element.
                        while descPoint != None and descPoint.name != "h2" and descPoint.name != "h3":
                            if descPoint.string != None:
                                if descSpecial['specDescription']:
                                    descSpecial['specDescription'][-1] = str(descSpecial['specDescription'][-1] + descPoint.string)
                                else:
                                    descSpecial['specDescription'].append(str(descPoint.string))
                            descPoint = descPoint.next_sibling
                        if descSpecial['specDescription']:
                            descSpecial['specDescription'][-1] = encoder(descSpecial['specDescription'][-1].rstrip())  # rstrip gets rid of right-trailing whitespace
                    elif (str(descPoint.text) in ("Prerequisites", "Region")):  # Prerequisites, like in the "Hookclaw Digger" background, including Regional prerequisites
                        prerequisites.append(str(descPoint.text))
                        while (descPoint.next_sibling != None) and not ((descPoint.name == None) and (descPoint.next_sibling.name == None)):  # We're checking if both the current point and the following point are tagless text. If they are, then we've found a linebreak, and we want to break out of the loop.
                            descPoint = descPoint.next_sibling
                            if (descPoint.name == "a"):
                                prerequisites.append(str(descPoint.text))
                            else:
                                prerequisites.append(str(descPoint))
                        # NOTE(review): placed after the loop on the assumption
                        # that it steps past the last prerequisite node; confirm
                        # against the original file.
                        descPoint = descPoint.next_sibling
                    elif (str(descPoint.text) in validAbilities):  # Valid ability boosts
                        abilityBoosts.append(str(descPoint.text))
                    else:
                        print ("We didn't handle this bold text: " + str(descPoint.text))
                # Underlined block
                elif descPoint.name == 'u':  # Most skill and feat links are <u>nderlined
                    if descPoint.find('a').get('href').startswith("Skills"):
                        skillsTrained.append(str(descPoint.text))
                    elif descPoint.find('a').get('href').startswith("Feats"):
                        featsGained.append(str(descPoint.text))
                elif descPoint.name == 'a':  # ...Some are underlined with an <a> tag's style attribute
                    if descPoint.get('href').startswith("Skills"):
                        skillsTrained.append(str(descPoint.text))
                    elif descPoint.get('href').startswith("Feats"):
                        featsGained.append(str(descPoint.text))
                elif descPoint.name == 'h1':  # There are some regional categories that we can ignore, stored in h1 tags
                    pass
                else:
                    print("This tag wasn't captured:" + str(descPoint))
                # Description block. descPoint may have been moved by the
                # branches above (possibly onto a text node or an h2), which is
                # why both the Tag and the non-Tag case are handled and h2 is
                # excluded.
                if description:  # If description already has elements
                    if (isinstance(descPoint, Tag)) and (descPoint.name != "h2"):
                        description[-1] = str(description[-1] + encoder(descPoint.text))  # We need to fuse the tag with both halves of the text block it's inside of
                    elif (descPoint.name != "h2"):
                        description[-1] = str(description[-1] + encoder(descPoint))
                else:  # If description is empty, there's nothing on that side to fuse with.
                    if (isinstance(descPoint, Tag)) and (descPoint.name != "h2"):
                        description.append(str(encoder(descPoint.text)))
                    elif (descPoint.name != "h2"):
                        description.append(str(encoder(descPoint)))
                # NOTE(review): assumed to apply whether or not description was
                # empty above; confirm nesting against the original file.
                if not isinstance(descPoint.next_sibling, Tag):  # This fuses the latter half of the text
                    description[-1] = str(description[-1] + encoder(descPoint.next_sibling))
                    descPoint = descPoint.next_sibling  # Skip over the text we already handled
            else:  # If it's not a tag, we just append it to the description
                description.append(str(encoder(descPoint)))
            # Advance to the next sibling unless we are already on a heading or
            # at the end (nextSibling is Beautiful Soup's older spelling of
            # next_sibling).
            if descPoint.nextSibling != None and descPoint.name != "h2" and descPoint.name != "h3":
                descPoint = descPoint.next_sibling
        # Setting detail, so that we can return it all in one nice background package
        # NOTE(review): this packaging and the return are assumed to sit inside
        # the "ends with Background" guard, since description[-1] needs at
        # least one collected element; with that nesting any other heading
        # makes the function return None. Confirm against the original file.
        description[-1] = description[-1].rstrip()  # Gets rid of all the extra whitespace at the end of the description
        detail['prerequisites'] = ''.join(prerequisites)
        detail['description'] = ' '.join(description)
        detail['abilityBoosts'] = abilityBoosts
        detail['skillsTrained'] = skillsTrained
        detail['featsGained'] = featsGained
        detail['descSpecial'] = descSpecial
        #print (detail)
        return detail
def get_detail_list(eachEntry):
    """Download one background listing page and parse every background on it.

    Args:
        eachEntry: Page path relative to ``https://2e.aonprd.com/``. It must
            also be a key of ``backgroundLinks``, because the page's category
            label decides whether the broken-markup workaround is applied.

    Returns:
        A list holding the ``get_detail_entry`` result for each ``<h2>``
        section of the page, excluding the first one.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the whole scrape.
    res = requests.get("https://2e.aonprd.com/" + eachEntry, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')

    # Drop every <br>: get_detail_entry treats two adjacent text nodes as a
    # line break, which only happens once the <br> between them is gone.
    for linebreak in soup.find_all('br'):
        linebreak.extract()

    main = soup.find("div", {'id': 'main'})
    backEntries = main.find_all("h2")  # Pulls out each h2 section into items in a list

    ### This next block is for handling the source's broken HTML tags. If the source is ever fixed, this should be removed.
    if backgroundLinks[eachEntry] in ['General', 'Legacy', 'Regional', 'Pathfinder Society', 'Agents of Edgewatch']:
        # These pages have their first background entry broken, so the node
        # after the second heading's parent is wrapped in a fresh <h2> and
        # given the "Background" suffix that get_detail_entry looks for.
        wrapper = soup.new_tag('h2')
        backEntries[1] = backEntries[1].parent.next_sibling.wrap(wrapper)
        backEntries[1].string.replace_with(str(backEntries[1].string + "Background"))
        #print ("Wrapped: " + str(backEntries[1]))
    ###

    del backEntries[0]  # We don't need the first entry, because it's not a background entry.
    return [get_detail_entry(backEntry) for backEntry in backEntries]
# Scrape every listing page and store its backgrounds under the page's
# category label ("General", "Regional", etc.).
for eachEntry in backgroundLinks:
    backgroundHolder[backgroundLinks[eachEntry]] = get_detail_list(eachEntry)

json_data = json.dumps(backgroundHolder, indent=4)
# print(json_data)
filename = "backgrounds_aon-pf2.json"
# The context manager guarantees the file is flushed and closed; a bare
# `f.close` (without parentheses) only names the method and never calls it.
with open(filename, "w") as f:
    f.write(json_data)