-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape-wikipedia.py
33 lines (30 loc) · 1.1 KB
/
scrape-wikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from bs4 import BeautifulSoup
import csv
from os import mkdir
from os.path import exists, join
# Output directory for the generated CSV, relative to this script's cwd.
datadir = join('..', 'data')
# EAFP: attempt the mkdir and ignore "already exists" instead of the
# check-then-create exists()/mkdir() pair, which is race-prone.
try:
    mkdir(datadir)
except FileExistsError:
    pass
# Parse the saved Wikipedia "List of S&P 500 companies" page and emit a
# Symbol,Name,Sector CSV into the data directory.
# NOTE(review): assumes the saved page is UTF-8 (Wikipedia's encoding).
with open('List_of_S%26P_500_companies.html', encoding='utf-8') as page:
    source_page = page.read()
soup = BeautifulSoup(source_page, 'html.parser')
table = soup.find("table", { "class" : "wikitable sortable" })
# Fail now if we haven't found the right table
header = table.findAll('th')
if header[0].string != "Ticker symbol" or header[1].string != "Security":
    raise Exception("Can't parse wikipedia's table!")
# Retrieve the values in the table
records = []
rows = table.findAll('tr')
for row in rows:
    fields = row.findAll('td')
    if fields:  # header rows contain only <th> cells and are skipped
        symbol = fields[0].string
        name = fields[1].string
        sector = fields[3].string
        records.append([symbol, name, sector])
header = ['Symbol', 'Name', 'Sector']
# Open in text mode with newline='' — csv.writer on Python 3 writes str
# and raises TypeError on a binary-mode file; the context manager also
# guarantees the file is flushed and closed. The path reuses datadir
# rather than duplicating the '../data' literal.
with open(join(datadir, 'sp500-companies.csv'), 'w', newline='') as out:
    writer = csv.writer(out, lineterminator='\n')
    writer.writerow(header)
    # Sorting ensures easy tracking of modifications
    records.sort(key=lambda s: s[1].lower())
    writer.writerows(records)