-
Notifications
You must be signed in to change notification settings - Fork 4
/
Scraper.py
32 lines (25 loc) · 913 Bytes
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import urllib.request as urllib2
from bs4 import BeautifulSoup
import pandas as pd
import re
from unidecode import unidecode
quote_page = 'http://metrolyrics.com/{}-lyrics-drake.html'
filename = 'drake-songs.csv'
songs = pd.read_csv(filename)
for index, row in songs.iterrows():
page = urllib2.urlopen(quote_page.format(row['song']))
soup = BeautifulSoup(page, 'html.parser')
verses = soup.find_all('p', attrs={'class': 'verse'})
lyrics = ''
for verse in verses:
text = verse.text.strip()
text = re.sub(r"\[.*\]\n", "", unidecode(text))
if lyrics == '':
lyrics = lyrics + text.replace('\n', '|-|')
else:
lyrics = lyrics + '|-|' + text.replace('\n', '|-|')
songs.at[index, 'lyrics'] = lyrics
print('saving {}'.format(row['song']))
songs.head()
print('writing to .csv')
songs.to_csv(filename, sep=',', encoding='utf-8')