Skip to content

Commit

Permalink
Merge changes to cleaning lyrics
Browse files Browse the repository at this point in the history
  • Loading branch information
shaynak committed May 19, 2024
2 parents b640f4a + fdcef8e commit 1723c56
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 7 deletions.
29 changes: 24 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
certifi==2020.12.5
chardet==4.0.0
idna==2.10
requests==2.25.1
urllib3==1.26.2
beautifulsoup4==4.12.3
certifi==2024.2.2
charset-normalizer==3.3.2
idna==3.7
ioutil==1.0.3
local==1.0.0
lyricsgenius==3.0.1
numpy==1.26.4
pandas==2.2.2
pyarrow==16.0.0
pyperclip==1.8.2
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.31.0
Send2Trash==1.8.3
six==1.16.0
soupsieve==2.5
speedtest-cli==2.1.3
srutil==1.0.3
tabulate==0.9.0
toml==0.10.2
tzdata==2024.1
urllib3==2.2.1
wikipedia==1.4.0
11 changes: 9 additions & 2 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,11 @@ def clean_title(title: str) -> str:
title = re.sub(r'\u201C|\u201D', '"', title)
# Replace special unicode spaces with standard space
title = re.sub(
r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u200b​\u202f\u205f​\u3000]',
r'[\u00A0\u1680\u180e\u2000-\u2009\u200a\u202f\u205f\u3000]',
" ", title)
title = title.replace('\u200b', '')
title = re.sub(r'\u0435', "e", title)
title = re.sub(r'\u2013|\u2014', " - ", title)
title = title.strip(' ')
return title

Expand All @@ -373,8 +376,12 @@ def clean_lyrics(lyrics: str) -> str:
lyrics = re.sub(r'\u201C|\u201D', '"', lyrics)
# Replace special unicode spaces with standard space
lyrics = re.sub(
r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u200b​\u202f\u205f​\u3000]',
r'[\u00A0\u1680\u180e\u2000-\u2009\u200a\u202f\u205f\u3000]',
" ", lyrics)
# Replace zero-width space with empty string
lyrics = lyrics.replace('\u200b', '')
# Replace Cyrillic 'e' letters with English 'e'.
lyrics = re.sub(r'\u0435', "e", lyrics)
# Replace dashes with space and single hyphen
lyrics = re.sub(r'\u2013|\u2014', " - ", lyrics)
# Replace hyperlink text
Expand Down

0 comments on commit 1723c56

Please sign in to comment.