From e8e60427646cf8e88e67fe4f6af6af1d59c35488 Mon Sep 17 00:00:00 2001 From: Jimmy Ho Date: Thu, 25 Apr 2024 22:03:13 -0400 Subject: [PATCH 1/3] Updated .gitignore and requirements.txt. --- .gitignore | 3 ++- requirements.txt | 29 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 7b7e21a..11910e1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ local.py -.vscode/* \ No newline at end of file +.vscode/* +venv/* diff --git a/requirements.txt b/requirements.txt index 570b153..3e1f244 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,24 @@ -certifi==2020.12.5 -chardet==4.0.0 -idna==2.10 -requests==2.25.1 -urllib3==1.26.2 +beautifulsoup4==4.12.3 +certifi==2024.2.2 +charset-normalizer==3.3.2 +idna==3.7 +ioutil==1.0.3 +local==1.0.0 +lyricsgenius==3.0.1 +numpy==1.26.4 +pandas==2.2.2 +pyarrow==16.0.0 +pyperclip==1.8.2 +python-dateutil==2.9.0.post0 +pytz==2024.1 +requests==2.31.0 +Send2Trash==1.8.3 +six==1.16.0 +soupsieve==2.5 +speedtest-cli==2.1.3 +srutil==1.0.3 +tabulate==0.9.0 +toml==0.10.2 +tzdata==2024.1 +urllib3==2.2.1 +wikipedia==1.4.0 From 1fcddd42ab388e6537bb685a7961184bb8259095 Mon Sep 17 00:00:00 2001 From: Jimmy Ho Date: Thu, 25 Apr 2024 22:44:34 -0400 Subject: [PATCH 2/3] Updated scraper.py to refine special characters handling. --- scraper.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scraper.py b/scraper.py index 08ac1ad..3cc6f63 100644 --- a/scraper.py +++ b/scraper.py @@ -119,6 +119,7 @@ LYRIC_JSON_PATH = 'lyrics.json' SONG_LIST_PATH = 'song_titles.txt' +access_token = # put your genius.com API access token here as a string def main(): parser = argparse.ArgumentParser() @@ -239,7 +240,7 @@ def albums_to_songs_csv(songs_by_album, existing_df=None): for song in songs_by_album[album]: if song.title not in IGNORE_SONGS and song.title not in songs_titles: record = { - 'Title': song.title.strip('\u200b'), + 'Title': song.title.replace('\u200b', ''), 'Album': album if 'Lover (Target' not in album else 'Lover', 'Lyrics': song.lyrics, @@ -250,7 +251,7 @@ def albums_to_songs_csv(songs_by_album, existing_df=None): for song in songs_by_album[album]: if song in OTHER_SONGS and song.title not in songs_titles: record = { - 'Title': song.title, + 'Title': song.title.replace('\u200b', ''), 'Album': album, 'Lyrics': song.lyrics, } @@ -376,8 +377,12 @@ def clean_lyrics(lyrics: str) -> str: lyrics = re.sub(r'\u201C|\u201D', '"', lyrics) # Replace special unicode spaces with standard space lyrics = re.sub( - r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u200b​\u202f\u205f​\u3000]', + r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u202f\u205f​\u3000]', " ", lyrics) + # Replace zero-width space with empty string + lyrics = lyrics.replace('\u200b', '') + # Replace Cyrillic 'e' letters with English 'e'. + lyrics = re.sub(r'\u0435', "e", lyrics) # Replace dashes with space and single hyphen lyrics = re.sub(r'\u2013|\u2014', " - ", lyrics) # Replace hyperlink text From b9ea6a859c1d304c33d80f4230266594927849f1 Mon Sep 17 00:00:00 2001 From: Jimmy Ho Date: Fri, 26 Apr 2024 18:02:36 -0400 Subject: [PATCH 3/3] Updates to address PR comments. --- .gitignore | 3 +-- scraper.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 11910e1..7b7e21a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ local.py -.vscode/* -venv/* +.vscode/* \ No newline at end of file diff --git a/scraper.py b/scraper.py index 3cc6f63..1e7f79d 100644 --- a/scraper.py +++ b/scraper.py @@ -119,7 +119,6 @@ LYRIC_JSON_PATH = 'lyrics.json' SONG_LIST_PATH = 'song_titles.txt' -access_token = # put your genius.com API access token here as a string def main(): parser = argparse.ArgumentParser()