-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathlyrics_scraper.py
84 lines (70 loc) · 2.55 KB
/
lyrics_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/python
# -*- coding: utf8 -*-
import argparse
import os
import logging
from genius.api import Genius
logger = logging.getLogger(__name__)

# Command-line interface: verbosity, output directory and per-artist song cap.
parser = argparse.ArgumentParser(
    description="A Genius scraper to obtain lyric from a specified lists of lyricists"
)
parser.add_argument(
    '-v',
    '--verbose',
    help='increase output verbosity',
    action='store_true',
)
parser.add_argument(
    '-l',
    '--lyrics_dir',
    help='lyrics save directory',
    type=str,
    default='lyrics',
)
parser.add_argument(
    '-n',
    '--songs_per_artists',
    help='number of maximum songs to scrap per artist',
    type=int,
    default=200,
)
args = parser.parse_args()

# Always configure logging: every progress and summary message in this script
# is emitted at INFO level, so leaving the root logger at its default WARNING
# threshold would hide all of them. --verbose lowers the threshold to DEBUG.
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
# Artists whose lyrics will be scraped; edit this list to change the targets.
artists = [
    'NF',
    'Eminem',
]
logger.info(f'Lyrics saved in directory {args.lyrics_dir}.')
logger.info(f'Number of artists to scrap {len(artists)}')

# Artists for which processing raised an error.
notDownloaded = []
# Directory containing this script; the lyrics files are written relative to it.
dirname = os.path.dirname(__file__)
lyrics_folder = args.lyrics_dir
# Number of artists processed successfully.
artist_processed_counter = 0
# Scrape each artist in turn. A failure for one artist is recorded in
# notDownloaded and does not prevent the remaining artists from being processed.
for artist in artists:
    # Initialization
    api = Genius()
    logger.info(f'Processing artist {artist}')
    try:
        artistScrap = api.search_artist(artist, max_songs=args.songs_per_artists)
        if artistScrap.num_songs > 0:
            # Every song's lyrics is preceded by a newline, which is the same
            # text the previous incremental concatenation produced, without
            # rebuilding the whole string once per song.
            lyrics = ''.join(
                f"\n{song._body['lyrics']}" for song in artistScrap.songs
            )
            logger.info(f'Nb characters for {artist}: {len(lyrics)}')
            # Create the directory at the exact location the file is written
            # to (relative to the script directory), so the write does not
            # fail when the script is launched from another working directory.
            output_dir = os.path.join(dirname, lyrics_folder)
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, f'{artist}_lyrics.txt')
            # Explicit UTF-8 so non-ASCII lyrics are written identically on
            # every platform, whatever the locale's default encoding is.
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(lyrics)
            artist_processed_counter += 1
        else:
            logger.warning(f'No songs found for artist {artist}')
    except Exception:
        # Catch Exception rather than everything so Ctrl-C and SystemExit still
        # stop the script; logger.exception keeps the traceback for diagnosis.
        logger.exception(f'Could not process artist {artist}')
        notDownloaded.append(artist)
# Final report: failed artists, a hint for merging the output, and run stats.
logger.info('Success. All artists have been processed')
logger.info(f'Artists for whom the scrapping failed: {notDownloaded}')
logger.info('Merge the files with')
logger.info('cat *_lyrics.txt > merged_lyrics.txt')

# Guard the ratio so an empty artist list cannot raise ZeroDivisionError.
success_rate = artist_processed_counter / len(artists) if artists else 0.0
logger.info(
    f'\n Stats :'
    f'\n - Number of artists to process: {len(artists)}'
    f'\n - Success: {artist_processed_counter}'
    f'\n - Failure: {len(notDownloaded)}'
    f'\n - Success Rate: {success_rate}'
)