深圳网站备/网站流量统计系统
1.首先,了解虾米的一些反爬措施(虾米的反爬已经是很不错的了,但是嘿嘿),了解re,requests,beautifulsoup, selenium自动化脚本等等一系列该有的知识。开始干活
更新(2019-03-22):目前虾米在歌曲 id 的获取上采取了加密措施,因此下面的代码已无法直接使用。不过通过抓包发现,虾米只是对 id 进行了加密,其他部分并未改变,所以只需解密歌曲 id,依然可以下载歌曲。另外请不要下载我上传的虾米音乐爬取 exe,它已经无法下载音乐。
2.源码
Configure.py(下文源码通过 import Configure 导入该文件)
伪装成浏览器,否则的话会获得虾米音乐赠与的400 bad request
# DB
# DB_HOST = '192.168.153.131'
# DB_PORT = 3306
# DB_DBNAME = 'spider'
# DB_USER = 'root'
# DB_PASSWORD = '123123'
# DB_CHARSET = 'utf8mb4'

# User-Agents
# Pool of browser User-Agent strings; the spider picks one at random so its
# requests look like they come from a regular browser.
# Two entries were hard-wrapped in the middle of their string literal (a syntax
# error) and have been rejoined. String contents, including the trailing
# spaces some entries carry, are otherwise unchanged.
FakeUserAgents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12",
    "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
    "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
    "Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
]
3.核心源码:爬取歌曲id,获取location并解码,然后下载
# 1.搜索歌曲,获取歌曲id
# 'https://www.xiami.com/search?key='+ keyword+"&pos=1"
# 2.xml页面,可查看网页代码,下载xml文件,并将文件拖入到指定的窗口获得文件路径
# 'https://www.xiami.com/widget/xml-single/uid/0/sid/'+id
# 3.获取location
# 4.下载音乐
# 5. java读取文件夹

# 网页播放页面
# 'https://www.xiami.com/play?ids=/song/playlist/id/'+id+'/object_name/default/object_id/0#loaded'
# 歌曲详细信息页面
# https://www.xiami.com/song/1796063337

# 实现xml文件下载
import math
import os
import re
import sys
import urllib.parse
from random import choice

import chardet  # needed to detect the encoding of the XML response
import requests
from bs4 import BeautifulSoup

import Configure

# purpose : University competition
# author : Comiii
# date : 2018/12/28


class Mp3Spider:
    """Scrape a Xiami billboard page and list or download its songs.

    Constructing an instance runs the whole job: the billboard page is
    fetched, the first ``MAX_SONGS`` song links are extracted, and for each
    song the widget XML is requested and its ``<location>`` value decoded.
    With ``flag == 1`` the song name is printed; with ``flag == 2`` the song
    is saved under ``Download/``. Any other flag only fills ``SongUrls``.
    """

    # Picked once when the class body runs, so every request of this process
    # sends the same spoofed browser User-Agent.
    header = {'user-agent': choice(Configure.FakeUserAgents)}
    # Path of this file minus its last 7 characters; not used by the class.
    localUrl = os.path.realpath(__file__)[:-7]
    songs_name = []
    location = []
    # Class-level default kept for backward compatibility; __init__ gives each
    # instance its own dict so instances do not share state.
    SongUrls = {}
    # Always overwritten by __init__ with the caller's integer flag.
    flag = "1"

    # Number of billboard entries processed per run.
    MAX_SONGS = 12
    # Seconds to wait on each HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    _LOCATION_RE = re.compile(r'<location><!\[CDATA\[(.*?)\]\]></location>', re.S)
    _SONG_NAME_RE = re.compile(r'<song_name><!\[CDATA\[(.*?)\]\]></song_name>', re.S)

    def __init__(self, flag, url):
        """Fetch the billboard at *url* and process its first songs.

        Args:
            flag: 1 to print song names, 2 to download the songs.
            url: Billboard page, e.g. https://www.xiami.com/billboard/103.
        """
        self.flag = flag
        self.SongUrls = {}
        r = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(r.content, "html.parser")
        # Each entry looks like:
        # <div class="song-name em"><a href="/song/1802902669">Youngblood</a></div>
        handled = 0
        for div in soup.find_all('div', {'class': 'song-name em'}):
            for a in div.find_all('a'):
                if handled >= self.MAX_SONGS:
                    return
                href = a.get('href')
                if not href or not href.startswith('/song/'):
                    continue
                # Despite the attribute name, this holds the song id taken
                # from the "/song/<id>" link.
                self.SingerId = href[len('/song/'):]
                self.GetLocation()
                handled += 1

    def GetLocation(self):
        """Fetch the widget XML for ``self.SingerId`` and act on each song.

        Stores the song name and decoded URL in ``self.SongUrls`` and then
        prints or downloads according to ``self.flag``. A song whose name
        equals the one currently stored is skipped.
        """
        url = "https://www.xiami.com/widget/xml-single/uid/0/sid/" + self.SingerId
        r = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        raw = r.content
        # chardet reports encoding=None when it cannot guess; fall back to UTF-8.
        encoding = chardet.detect(raw).get('encoding') or 'utf-8'
        xml = raw.decode(encoding, errors='replace')
        names = self._SONG_NAME_RE.findall(xml)
        locations = self._LOCATION_RE.findall(xml)
        # zip() stops at the shorter list, so a missing <location> cannot
        # cause an IndexError.
        for name, encoded in zip(names, locations):
            if name == self.SongUrls.get("song_name"):
                continue
            try:
                song_url = self.Decode(encoded)
            except ValueError as err:
                print("Skip {0:s}: {1}".format(name, err))
                continue
            self.SongUrls['song_name'] = name
            self.SongUrls['Url'] = song_url
            if self.flag == 1:
                print(self.SongUrls.get("song_name"))
            elif self.flag == 2:
                self.DownloadSong()

    def Decode(self, location):
        """Decode a Xiami ``<location>`` value into a download URL.

        NOTE(review): the original author removed this method's body. This is
        a reconstruction of the publicly described scheme and has not been
        checked against a live response - confirm before relying on it.

        Scheme: the first character is the row count; the rest is a grid
        stored row by row, where the first ``len % rows`` rows are one
        character longer. Reading the grid column by column gives a
        percent-encoded URL in which ``^`` stands for ``0``.

        Args:
            location: Raw text of the ``<location>`` CDATA section.

        Returns:
            The decoded URL string.

        Raises:
            ValueError: If *location* is empty or does not start with 1-9.
        """
        location = location.strip() if location else ''
        if not location or location[0] not in '123456789':
            raise ValueError("malformed location: {0!r}".format(location))
        rows = int(location[0])
        cipher = location[1:]
        base, extra = divmod(len(cipher), rows)
        pieces = []
        start = 0
        for row in range(rows):
            width = base + 1 if row < extra else base
            pieces.append(cipher[start:start + width])
            start += width
        chars = []
        for col in range(base + 1):
            for piece in pieces:
                if col < len(piece):
                    chars.append(piece[col])
        return urllib.parse.unquote(''.join(chars)).replace('^', '0')

    def DownloadSong(self):
        """Download the song currently held in ``self.SongUrls``.

        The file is written to ``Download/<song name>.mp3``. Nothing is
        written when no song is stored or the server does not answer 200.
        """
        song_name = self.SongUrls.get("song_name")
        song_url = self.SongUrls.get("Url")
        if not song_name or not song_url:
            return
        os.makedirs("Download", exist_ok=True)
        # The original looped over the dict keys here, which fetched and wrote
        # the same song once per key; one request is enough.
        r = requests.get(song_url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        if r.status_code != 200:
            print("Download {0:s} failed (HTTP {1:d}).".format(song_name, r.status_code))
            return
        # Song titles may contain characters that are illegal in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', song_name)
        filename = "{0:s}.mp3".format(safe_name)
        with open(os.path.join("Download", filename), 'wb') as file:
            file.write(r.content)
        print("Download {0:s} Successfully.".format(song_name))


# Chart index accepted on the command line -> billboard page.
BILLBOARD_URLS = {
    0: "https://www.xiami.com/billboard/102",  # new songs chart
    1: "https://www.xiami.com/billboard/103",  # hot songs chart
    2: "https://www.xiami.com/billboard/325",  # electronic music chart
    3: "https://www.xiami.com/billboard/306",  # playlist inclusion chart
    4: "https://www.xiami.com/billboard/332",  # Douyin hot songs chart
    5: "https://www.xiami.com/billboard/324",  # film/TV soundtrack chart
    6: "https://www.xiami.com/billboard/307",  # Xiami share chart
}


def main(argv=None):
    """Run the spider from command-line arguments ``<flag> <chart>``.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Exits with a usage message when arguments are missing, not integers, or
    the chart index is unknown.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    usage = "usage: <script> <flag: 1=list, 2=download> <chart: 0-6>"
    if len(args) < 2:
        sys.exit(usage)
    try:
        flag = int(args[0])
        music_url = int(args[1])
    except ValueError:
        sys.exit(usage)
    url = BILLBOARD_URLS.get(music_url)
    if url is None:
        sys.exit("unknown chart index {0:d}; {1:s}".format(music_url, usage))
    Mp3Spider(flag, url)


if __name__ == '__main__':
    main()
5.有什么疑问可以在下方留言!
虾米音乐单曲或多曲下载-->exe 下载:https://download.csdn.net/download/qq_25233621/10949831