# ximalaya_m4a_download-thread.py.txt
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 5 11:58:25 2020
@author: 54861
"""
import requests
import math
import os, sys, threading
import re
import time
# HTTP headers sent with every request made by this script.
headers = {
    # 'Referer': start_url,  # note: add a Referer here if it is needed
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/55.0.2883.87 Safari/537.36"
    ),
}
"""
Get the total number of tracks in an album (plus its cover URL and title).
Param: albumID -- album ID
"""
def get_album_number(albumID):
    """Fetch the cover URL, title and total track count of an album.

    Args:
        albumID: Album identifier; it is converted with ``str()`` and
            appended to the album endpoint URL.

    Returns:
        A ``(cover, albumTitle, count)`` tuple read from the JSON reply.
        When the reply's ``ret`` field is not 200 the tuple is
        ``('', '', 0)``.
    """
    endpoint = "https://www.ximalaya.com/revision/album?albumId={!s}".format(albumID)
    payload = requests.get(endpoint, headers=headers).json()

    # Any non-200 ``ret`` code is reported as an empty album.
    if payload['ret'] != 200:
        return '', '', 0

    main_info = payload['data']['mainInfo']
    cover_url = main_info['cover']
    album_title = main_info['albumTitle']
    track_total = payload['data']['tracksInfo']['trackTotalCount']
    return cover_url, album_title, track_total
"""
Get one page of track information.
Param: albumID -- album ID
Param: pageNum -- page number (!! pages start at 1 !!)
"""
def get_track_list(albumID, pageNum):
    """Fetch one page of an album's track listing.

    Args:
        albumID: Album identifier, inserted into the request URL.
        pageNum: Page number to request (the caller in this file starts
            at 1).

    Returns:
        A list of ``{'trackId': ..., 'title': ...}`` dicts, one per track
        in the reply; an empty list when the reply's ``ret`` is not 200.
    """
    endpoint = (
        "https://www.ximalaya.com/revision/album/v1/getTracksList"
        "?albumId={!s}&pageNum={!s}"
    ).format(albumID, pageNum)
    payload = requests.get(endpoint, headers=headers).json()

    if payload['ret'] != 200:
        return []

    # Keep only the two fields the rest of the script uses.
    return [
        {'trackId': entry['trackId'], 'title': entry['title']}
        for entry in payload['data']['tracks']
    ]
"""
Get the download URL of a track.
Param: trackID -- track ID
"""
def get_track_url(trackID):
    """Look up the audio source URL of a single track.

    Args:
        trackID: Track identifier, inserted into the request URL.

    Returns:
        The ``src`` URL string from the reply, or ``None`` when the reply's
        ``ret`` is not 200 or when ``src`` is missing, ``None`` or empty.
    """
    url = "https://www.ximalaya.com/revision/play/v1/audio?id="+str(trackID)+"&ptype=1"
    resp = requests.get(url, headers=headers)
    result = resp.json()
    if result['ret'] != 200:
        return None

    # ``src`` can come back as JSON null; calling len() on it would raise
    # TypeError, so test truthiness instead.
    # NOTE(review): presumably null means the track is not freely playable
    # -- confirm against the API.
    src = result['data'].get('src')
    if src:
        return src
    return None
"""
Download a track.
Param: url -- download URL of the track
Param: file -- local file path
"""
def download_track(url,file):
    """Download ``url`` to the local path ``file`` unless it is already there.

    The response is streamed in 1 KiB chunks. An existing file counts as
    already downloaded when its size is at least 99.99% of the advertised
    ``Content-Length``; otherwise the file is (re)written from scratch.

    Args:
        url: Address of the resource to download.
        file: Destination path on the local disk.

    Returns:
        None. Progress is reported with ``print``.
    """
    resp = requests.get(url, headers = headers, stream = True)
    try:
        # Content-Length may be absent; without it the size of an existing
        # file cannot be verified, so the file is downloaded again.
        raw_length = resp.headers.get('content-length')
        length = float(raw_length) if raw_length is not None else None

        already_complete = (
            length is not None
            and os.path.exists(file)
            and os.path.getsize(file) >= length * 0.9999
        )
        if already_complete:
            print ("!exist ",file)
        else:
            with open(file, 'wb') as f:
                for data in resp.iter_content(chunk_size = 1024):
                    if data:
                        f.write(data)
            print(" downloaded :",url)
            print(file)
    finally:
        # With stream=True the connection stays checked out until the body
        # is consumed or the response is closed, so always release it.
        resp.close()

    thread_num = len(threading.enumerate())
    # The "-1-6" offset follows the original author's note: about 6 baseline
    # threads (main thread, daemon threads, ...) that are not downloads.
    # NOTE(review): this baseline depends on the environment -- confirm.
    print ("still ",(thread_num-1-6)," download threads is running... ",(thread_num-1))
    return
"""
Turn a string into a file name that satisfies Windows naming rules.
"""
def good_win_filename(title):
    """Return ``title`` with each of ``/ \\ : * ? " < > |`` replaced by ``_``.

    Args:
        title: Text to be used as a file or directory name.

    Returns:
        A copy of ``title`` in which every listed character has become an
        underscore; all other characters are left untouched.
    """
    forbidden = '/\\:*?"<>|'
    # One-to-one table mapping every forbidden character to an underscore.
    table = str.maketrans(forbidden, "_" * len(forbidden))
    return title.translate(table)
def _run_batch(threads):
    """Start every thread in ``threads`` and block until all have finished."""
    for th in threads:
        th.start()
    for th in threads:
        th.join()


def main():
    """Download the cover and every available track of each listed album.

    For each album ID the album directory is created under
    ``d:/ximalaya_m4a/``, then the track pages are walked and one download
    thread is queued per track that has a source URL. Queued threads are
    run in batches once more than 10 have accumulated.

    NOTE(review): the original file had lost its indentation, so the loop
    nesting here (batch check once per track, final flush after the last
    album) is a reconstruction -- confirm against the intended behaviour.
    """
    # Album IDs on Ximalaya to download.
    albumIds = [11593977,11593878,11593831,11593806,11593731,11575451,11577794,11577760,11577741,11577731,11577579,11576825,11577341,11577178,11577114,11577102,11577081,11576746,9896654,11576704,11576677,11576611,11576594,11576461,11576552,11576517,11576476,11576373,11576330,11576284,11576207,11576073,11575170,11575391,11574989,11575255,11575020,11574940,11574924,11574832,11574900,11574872,11574856,11574829,11574813,11574681,11561976,11574659,11561931,11387367,11285335,10779720,11168188,11168097,11167987,11167952,11122854,10829548,10780322,10780141,10779666]
    found_total = 0   # tracks with a usable URL, across all albums
    threadpool = []   # download threads created but not started yet
    album_dir = ''
    for j, albumId in enumerate(albumIds, start=1):
        print ("\n start searching:"+str(j)+" of "+str(len(albumIds))+" AVBV:" + str(albumId)+" of"+str(albumIds))
        coverUrl,albumTitle,total = get_album_number(albumId)

        # Create the base folder and the album folder in one call.
        album_dir = 'd:/ximalaya_m4a/'+str(albumId)+"_"+good_win_filename(albumTitle)+'/'
        os.makedirs(album_dir, exist_ok=True)

        # get_album_number returns an empty cover URL when the lookup fails;
        # rindex('.') on it would raise ValueError and abort the whole run.
        if coverUrl and '.' in coverUrl:
            cover_path = album_dir+"cover"+coverUrl[coverUrl.rindex('.'):]
            threadpool.append(threading.Thread(target=download_track, args=("http:"+coverUrl, cover_path)))

        pageCount = math.ceil(total/30)  # the page loop assumes 30 tracks per page
        i = 1                            # running track number within this album
        for pageNum in range(1, pageCount+1):
            # Throttle: short audio files download so fast that the server
            # may drop the connection (the original note recommends 1 second).
            time.sleep(0.3)
            for track in get_track_list(albumId, pageNum):
                trackUrl = get_track_url(track['trackId'])
                time.sleep(0.3)  # same throttling as above
                if trackUrl:
                    ext = trackUrl[trackUrl.rindex('.'):]
                    file_path = album_dir + good_win_filename(track['title']) + ext
                    print ('find file:'+str(i)+" of "+str(total)+', page '+str(pageNum)+" of "+str(pageCount)+"\n"+ trackUrl + "\n"+track['title'])
                    threadpool.append(threading.Thread(target=download_track, args=(trackUrl, file_path)))
                    found_total += 1
                i += 1
                # Run the queued downloads once more than 10 are waiting so
                # the number of concurrent threads stays bounded.
                if len(threadpool) > 10:
                    _run_batch(threadpool)
                    threadpool = []

    # Run whatever is still queued after the last album.
    _run_batch(threadpool)
    print("\n\n"+str(found_total)+" files downed in:"+album_dir)


if __name__ == '__main__':
    main()