-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathEasonLyrics.py
254 lines (225 loc) · 9.91 KB
/
EasonLyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#==================================================================================================================
# Author: Eajack
# date:2017/8/23 - 2017/8/26
#==================================================================================================================
# Function:
# 1- Crawl the lyrics of all of Eason's songs on NetEase Cloud Music and save each one as a txt file
#==================================================================================================================
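# keyPoints:
# 1- NetEase Cloud Music pages nest a frame inside a frame; Selenium must switch into the frame before elements can be found (BeautifulSoup alone cannot)
# driver.switch_to.frame(driver.find_element_by_xpath("//iframe"))
# 2- The classic utf-8 (unicode) => gbk encoding problem; there are two issues
# 1)- On Windows, file names cannot contain punctuation such as "/" and "\"
# Fix: fileName = fileName.replace("\xa0", " ")
# fileName = fileName.replace("/", " ")
# fileName = fileName.replace(" ", ""). Remember the "fileName = " part: str.replace returns a new string
# 2)- On Windows, the default encoding for txt file content is gbk
# Fix: txtFile = open(fileName,'w',encoding='utf-8'), i.e. switch the encoding to utf-8
# Optionally also lyric = lyric.replace(u"\xa0",u" ") to avoid encoding problems when the file is read later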
#==================================================================================================================
# Console:
# Songs without lyrics:
# ['L.I.F.E.Overture.txt', '那一夜没有雪.txt', '孤儿仔.txt', '想哭.txt', 'MusicOnly.txt', '黎喇.txt',
# 'EasonsAngel.txt', '27Seconds.txt']
# Songs that failed to download: []
# Total running time: about 30 minutes
#==================================================================================================================
# Attention:
# The Medley file needs to be deleted
#==================================================================================================================
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import json
import os
import re
import time
# selenium库
from selenium import webdriver
# Module-level state shared by the functions below
## Request headers sent with the NetEase Cloud Music lyric API requests
## NOTE(review): the Cookie value is a hard-coded, captured browser session;
## it has presumably expired and probably should not live in source control —
## confirm whether the lyric API still needs it.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': '_ntes_nnid=e76639646686a8ddf4dbd9f865dbfe4e,1503039315399; _ntes_nuid=e76639646686a8ddf4dbd9f865dbfe4e; MUSIC_A=2ef1d1bf10e0970aa8ef2a400e11078ecfd091512723c7fb5f9d567371dac57a16f827d623128710018b9570d66b88bfb23ea72ab5261c23fb97b0129e36bdf9f546586f7bb9e6ae8328c980cd21a668; os=uwp; osver=10.0.14393.447; appver=1.3.3; deviceId=0d56743a3d5158a11f5a1fbf3a7dc746; FromPlatform=uwp; playerid=74378889; __csrf=87d059fd2c8ddc7e9cf4110cb944c364; JSESSIONID-WYYY=eXN26albKtQloWhh8EfQV77Qw519A%5Cl7vzcIjnvdMC%5CSufYdMklqQW%5C4vvSVi%5Cr0HWuK8hlb1g35D4IMcEC0h%5C6hl%5C14iF4XK5Wr%5CWflTCSH0p%2FbekQt4CuC%5CvXbN43GwErri0Brp%5CFipMllOCmfBZxiDq9qCJgkQOlsNGAkc%5CqRVGJu%3A1503483000489; _iuqxldmzr_=32',
'DNT': '1',
'Host': 'music.163.com',
'Pragma': 'no-cache',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
## Album IDs collected by getAlbumIDList (no duplicates)
albumIDList = []
## key: song name, value: song ID (filled by getSongIDandsongNameList)
songName2ID_dict = {}
## Names of songs whose lyric file could not be written (filled by getLyric)
downloadFailedSong = []
## File names (song name + ".txt") of songs that have no lyrics (filled by getLyric)
noLyricsSong = []
# Get all of Eason's album IDs
def getAlbumIDList(singerID, limit):
    '''
    Function:
    1- Collect the IDs of all albums listed on the artist's album pages.

    Each result page renders its content inside an iframe, so the page is
    loaded with Selenium, the driver is switched into that iframe, and the
    resulting HTML is parsed with BeautifulSoup. New IDs are appended to the
    module-level ``albumIDList`` in first-seen order, without duplicates.

    :param singerID: artist ID placed in the ``id`` query parameter, e.g. 2116
    :param limit: page size placed in the ``limit`` query parameter, e.g. 12
    :return: albumIDList => list (the module-level list, also updated in place)
    '''
    global albumIDList
    subAlbumLink = r'https://music.163.com/#/artist/album'
    # Raw string: "\/" and "\=" in a normal string literal are invalid escapes.
    albumHref = re.compile(r"^/album\?id=.*")

    # Offsets 0, 12, ..., 96: the page range is hard-coded for this artist.
    for offset in range(0, 108, 12):
        # albumLink eg: https://music.163.com/#/artist/album?id=2116&limit=12&offset=0
        albumLink = (subAlbumLink + r"?id=" + str(singerID)
                     + r"&limit=" + str(limit) + r"&offset=" + str(offset))
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(10)  # implicit wait for element lookups
            driver.get(albumLink)
            time.sleep(5)  # give the page time to finish rendering
            # Key step: the album list lives inside the iframe.
            # find_element("xpath", ...) works on Selenium 3 and 4, whereas
            # find_element_by_xpath was removed in Selenium 4.3.
            driver.switch_to.frame(driver.find_element("xpath", "//iframe"))
            html = BeautifulSoup(driver.page_source, "lxml")
        finally:
            # Always close the browser, even if loading or parsing failed.
            driver.quit()

        # Extract every album ID linked from the current page.
        for albumIDtag in html.find_all("a", {"href": albumHref}):
            albumID = albumIDtag["href"].replace("/album?id=", "")
            if albumID not in albumIDList:
                albumIDList.append(albumID)
    return albumIDList
# Get the song IDs and song names from all of Eason's albums, filtering out songs with the same name
def getSongIDandsongNameList():
    '''
    Function:
    1- Collect the song ID and song name of every track on every album in
       the module-level ``albumIDList``.
    2- Filter out repeated songs: a song is skipped when its cleaned name is
       already a key of ``songName2ID_dict`` (live and other versions share
       the same lyrics) or when its ID is already stored.

    Every newly stored pair is printed as "<songID> <songName>".

    :return: songName2ID_dict[songName] = songID (the module-level dict,
             also updated in place)
    '''
    global albumIDList, songName2ID_dict
    subInnerAlbumLink = r'https://music.163.com/#/album'
    # Raw string: "\/" and "\=" in a normal string literal are invalid escapes.
    songHref = re.compile(r"^/song\?id=.*")
    # Set of stored IDs so the duplicate-ID check is not a linear scan.
    knownIDs = set(songName2ID_dict.values())

    for albumID in albumIDList:
        # InnerAlbumLink eg: https://music.163.com/#/album?id=35835294
        albumLink = subInnerAlbumLink + r"?id=" + str(albumID)
        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(10)  # implicit wait for element lookups
            driver.get(albumLink)
            time.sleep(5)  # give the page time to finish rendering
            # Key step: the track list lives inside the iframe.
            # find_element("xpath", ...) works on Selenium 3 and 4, whereas
            # find_element_by_xpath was removed in Selenium 4.3.
            driver.switch_to.frame(driver.find_element("xpath", "//iframe"))
            html = BeautifulSoup(driver.page_source, "lxml")
        finally:
            # Always close the browser, even if loading or parsing failed.
            driver.quit()

        # Extract every song linked from the current album page.
        for songIDtag in html.find_all("a", {"href": songHref}):
            titleTag = songIDtag.b
            rawTitle = titleTag.get("title") if titleTag is not None else None
            if rawTitle is None:
                # A song link without <b title="..."> carries no name; skip it
                # instead of crashing on the missing tag/attribute.
                continue
            songID = songIDtag["href"].replace("/song?id=", "")
            songName = _cleanSongName(str(rawTitle))

            if songName in songName2ID_dict or songID in knownIDs:
                continue
            songName2ID_dict[songName] = songID
            knownIDs.add(songID)
            # debug use
            print(songID, songName)
    return songName2ID_dict


def _cleanSongName(title):
    # Reduce a raw title to its base name: cut at the first "-", then at the
    # first "(" (or, when the title itself starts with "(", at the second
    # "("), and strip surrounding whitespace.
    dashAt = title.find("-")
    parenAt = title.find("(")
    name = title
    if dashAt != -1:
        name = name[:dashAt]
    if parenAt > 0:
        name = name[:parenAt]
    elif parenAt == 0:
        nextParen = name.find("(", 1)
        # Only cut when a second "(" exists; slicing with -1 would silently
        # drop the last character of the name.
        if nextParen != -1:
            name = name[:nextParen]
    return name.strip()
# Fetch the lyrics and save them as txt files
def getLyric(lyricPath=r"D:\STUDYING\MyProjects\pycharm\music163_EasonLyrics\Lyrics"):
    '''
    Function:
    1- Fetch the lyrics of every song in the module-level ``songName2ID_dict``
       (songName2ID_dict[songName] = songID) from the lyric API.
    2- Save each lyric as "<songName>.txt" (UTF-8) inside ``lyricPath``.

    Songs whose file already exists are skipped, so an interrupted run can be
    resumed. Songs without lyrics have their file name appended to
    ``noLyricsSong``; songs whose request, JSON decoding or file write fails
    have their name appended to ``downloadFailedSong``.

    Side effect: the process working directory is changed to ``lyricPath``.

    :param lyricPath: directory that receives the txt files; it must exist
    :return: None (txt lyric files are written to disk)
    '''
    global headers, songName2ID_dict, downloadFailedSong, noLyricsSong
    if not songName2ID_dict:
        return

    # Change directory once instead of once per song.
    os.chdir(lyricPath)
    # Matches one bracketed tag such as a "[mm:ss.xx]" timestamp. Non-greedy
    # by construction, so text between two tags on one line is preserved.
    bracketTag = re.compile(r"\[[^\]\n]*\]")

    for songName, songID in songName2ID_dict.items():
        # 0- Skip songs that were already downloaded
        fileName = _lyricFileName(songName)
        if os.path.exists(fileName):
            continue

        # 1- Fetch the lyric JSON; the song ID is the only varying part
        lyricUrl = r'http://music.163.com/api/song/lyric?' + 'id=' + str(songID) + r'&lv=1&kv=1&tv=-1'
        try:
            # The timeout keeps one stalled connection from hanging the run.
            response = requests.get(lyricUrl, headers=headers, timeout=30)
            response.encoding = 'utf-8'
            lyric_dict = json.loads(response.text)
        except (requests.RequestException, ValueError):
            # Network failure or a non-JSON body: record it and keep going.
            downloadFailedSong.append(songName)
            continue

        try:
            lyric = lyric_dict["lrc"]["lyric"]
        except (KeyError, TypeError):
            lyric = None
        if not isinstance(lyric, str):
            # Missing or null "lrc"/"lyric": the song has no lyrics.
            noLyricsSong.append(fileName)
            continue

        lyric = bracketTag.sub("", lyric).strip()
        # Replace non-breaking spaces to avoid encoding trouble on later reads.
        lyric = lyric.replace(u"\xa0", u" ")

        # 2- Save as a txt file
        try:
            with open(fileName, 'w', encoding='utf-8') as txtFile:
                txtFile.write(lyric)
        except (UnicodeEncodeError, OSError):
            downloadFailedSong.append(songName)
            continue
        print(songName + "\t已经下载完歌词啦")


def _lyricFileName(songName):
    # Build "<songName>.txt" with spaces, non-breaking spaces and the
    # characters Windows forbids in file names removed.
    removed = str.maketrans("", "", '\xa0 /\\:*?"<>|')
    return (songName + ".txt").translate(removed)
if __name__ == '__main__':
    start = time.time()

    # Stage 1: collect all of Eason's album IDs
    singerID, limit = 2116, 12
    getAlbumIDList(singerID, limit)

    # Stage 2: walk every album to collect song IDs and song names
    getSongIDandsongNameList()

    # Stage 3: walk every song ID to fetch and store its lyrics
    getLyric()

    # Report songs without lyrics and songs that failed to download
    report = (
        ("没有歌词的歌曲:", noLyricsSong),
        ("下载失败的歌曲:", downloadFailedSong),
    )
    for heading, songs in report:
        print(heading)
        print(songs)

    # Elapsed wall-clock time in whole minutes
    end = time.time()
    minutes = (end - start) // 60
    print("总共大约用时%d分钟" % minutes)