Skip to content

Commit

Permalink
Updated scraper.py to refine special-character handling.
Browse files Browse the repository at this point in the history
  • Loading branch information
jimmynotjames committed Apr 26, 2024
1 parent e8e6042 commit 1fcddd4
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
LYRIC_JSON_PATH = 'lyrics.json'
SONG_LIST_PATH = 'song_titles.txt'

access_token = # put your genius.com API access token here as a string

def main():
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -239,7 +240,7 @@ def albums_to_songs_csv(songs_by_album, existing_df=None):
for song in songs_by_album[album]:
if song.title not in IGNORE_SONGS and song.title not in songs_titles:
record = {
'Title': song.title.strip('\u200b'),
'Title': song.title.replace('\u200b', ''),
'Album':
album if 'Lover (Target' not in album else 'Lover',
'Lyrics': song.lyrics,
Expand All @@ -250,7 +251,7 @@ def albums_to_songs_csv(songs_by_album, existing_df=None):
for song in songs_by_album[album]:
if song in OTHER_SONGS and song.title not in songs_titles:
record = {
'Title': song.title,
'Title': song.title.replace('\u200b', ''),
'Album': album,
'Lyrics': song.lyrics,
}
Expand Down Expand Up @@ -376,8 +377,12 @@ def clean_lyrics(lyrics: str) -> str:
lyrics = re.sub(r'\u201C|\u201D', '"', lyrics)
# Replace special unicode spaces with standard space
lyrics = re.sub(
r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u200b​\u202f\u205f​\u3000]',
r'[\u00A0\u1680\u180e\u2000-\u2009\u200a\u202f\u205f\u3000]',
" ", lyrics)
# Replace zero-width space with empty string
lyrics = lyrics.replace('\u200b', '')
# Replace the Cyrillic letter 'е' (U+0435) with the look-alike Latin 'e'.
lyrics = re.sub(r'\u0435', "e", lyrics)
# Replace en dashes (U+2013) and em dashes (U+2014) with a space-padded hyphen (" - ")
lyrics = re.sub(r'\u2013|\u2014', " - ", lyrics)
# Replace hyperlink text
Expand Down

0 comments on commit 1fcddd4

Please sign in to comment.