-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
56 lines (40 loc) · 1.19 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from bs4 import BeautifulSoup
from csv import DictWriter
from requests import get
from time import sleep
from urllib.parse import urljoin
# Scrape the country drop-down on the USDA APHIS pet-export page, work out
# each country's "banner color", and write the result to a CSV file.
base_url = "https://www.aphis.usda.gov/aphis/pet-travel/take-pet-to-foreign-country/export-pets"

# Seconds to wait on any single HTTP request before giving up, so a stalled
# connection cannot hang the script indefinitely.
request_timeout = 30

# Banner colors searched for, first in the option URL and then in the page body.
# Defined once here because the list never changes between countries.
colors = ["Green", "Orange", "Purple", "Red", "Yellow"]

response = get(base_url, timeout=request_timeout)
# Fail loudly if the index page is unavailable instead of silently writing an
# empty CSV from an error page.
response.raise_for_status()

# Name the parser explicitly so results do not depend on which optional
# parsers happen to be installed (and to avoid bs4's GuessedAtParserWarning).
soup = BeautifulSoup(response.text, "html.parser")
options = soup.select("#country option")
print(options)

rows = []
for opt in options:
    # Skip options without a usable link: a missing value attribute, or the
    # "#" placeholder value.
    url_path = opt.attrs.get("value")
    if url_path is None or url_path == "#":
        continue

    full_url = urljoin(base_url, url_path)

    # First try to read the color straight from the URL path. The loop has no
    # break, so if several colors match, the last one in `colors` wins.
    banner_color = "unknown"
    lowered_path = url_path.lower()
    for color in colors:
        if color.lower() in lowered_path:
            banner_color = color.lower()

    # Fall back to fetching the country page and looking for the banner
    # phrase in its text. Compare with ==, not `is`: string identity is an
    # implementation detail and must not be used for equality.
    if banner_color == "unknown":
        page_text = get(full_url, timeout=request_timeout).text
        for color in colors:
            if f'"{color} Banner" Country' in page_text:
                banner_color = color.lower()

    row = {
        "name": opt.text,
        "url": full_url,
        "color": banner_color,
    }
    rows.append(row)
    print(row)

    # Pause between countries to stay polite to the server.
    # NOTE(review): the original indentation was lost; sleeping once per
    # country is the conservative reading - confirm against the repository.
    sleep(1)

# newline="" is required by the csv module so it controls line endings itself
# (otherwise blank rows appear on Windows); utf-8 keeps non-ASCII country
# names portable across platforms.
with open("./data/countries.csv", "w", newline="", encoding="utf-8") as f:
    writer = DictWriter(f, fieldnames=["name", "color", "url"])
    writer.writeheader()
    writer.writerows(rows)