Skip to content

Commit

Permalink
Merge pull request #3 from jimmynotjames/jimmyho_updates_for_requirements_and_special_chars
Browse files: browse the repository at this point in the history

Updates for requirements.txt and special characters
  • Loading branch information
shaynak authored May 19, 2024
2 parents 0a3dcee + b9ea6a8 commit fdcef8e
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 8 deletions.
29 changes: 24 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
certifi==2020.12.5
chardet==4.0.0
idna==2.10
requests==2.25.1
urllib3==1.26.2
beautifulsoup4==4.12.3
certifi==2024.2.2
charset-normalizer==3.3.2
idna==3.7
ioutil==1.0.3
local==1.0.0
lyricsgenius==3.0.1
numpy==1.26.4
pandas==2.2.2
pyarrow==16.0.0
pyperclip==1.8.2
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.31.0
Send2Trash==1.8.3
six==1.16.0
soupsieve==2.5
speedtest-cli==2.1.3
srutil==1.0.3
tabulate==0.9.0
toml==0.10.2
tzdata==2024.1
urllib3==2.2.1
wikipedia==1.4.0
10 changes: 7 additions & 3 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def albums_to_songs_csv(songs_by_album, existing_df=None):
for song in songs_by_album[album]:
if song.title not in IGNORE_SONGS and song.title not in songs_titles:
record = {
'Title': song.title.strip('\u200b'),
'Title': song.title.replace('\u200b', ''),
'Album':
album if 'Lover (Target' not in album else 'Lover',
'Lyrics': song.lyrics,
Expand All @@ -250,7 +250,7 @@ def albums_to_songs_csv(songs_by_album, existing_df=None):
for song in songs_by_album[album]:
if song in OTHER_SONGS and song.title not in songs_titles:
record = {
'Title': song.title,
'Title': song.title.replace('\u200b', ''),
'Album': album,
'Lyrics': song.lyrics,
}
Expand Down Expand Up @@ -376,8 +376,12 @@ def clean_lyrics(lyrics: str) -> str:
lyrics = re.sub(r'\u201C|\u201D', '"', lyrics)
# Replace special unicode spaces with standard space
lyrics = re.sub(
r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u200b​\u202f\u205f​\u3000]',
r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u202f\u205f​\u3000]',
" ", lyrics)
# Replace zero-width space with empty string
lyrics = lyrics.replace('\u200b', '')
# Replace Cyrillic 'e' letters with English 'e'.
lyrics = re.sub(r'\u0435', "e", lyrics)
# Replace dashes with space and single hyphen
lyrics = re.sub(r'\u2013|\u2014', " - ", lyrics)
# Replace hyperlink text
Expand Down

0 comments on commit fdcef8e

Please sign in to comment.