喜马拉雅下载 多线程-完善版

增加了网络错误重试;每批同时运行 100 个下载线程,一批全部下载完成后再启动下一批。

# -*- coding: utf-8 -*-
"""
Created on Sun Apr  5 11:58:25 2020

@author: 54861
"""

import requests
import math
import os, sys, threading
import re
import time

headers = {
        #'Referer': start_url,  # 注意加上referer
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }

def get_album_number(albumID):
    """
    Fetch the cover URL, title and total track count of an album.

    The album endpoint is queried up to three times. An attempt counts as
    failed when the request raises, the HTTP status is not OK, the body is
    not JSON, or the payload does not carry ``ret == 200`` together with
    the expected fields.

    Args:
        albumID: Ximalaya album ID (anything ``str()`` can render).

    Returns:
        A ``(cover, albumTitle, count)`` tuple taken from the response.
        When every attempt fails ``('', '', 0)`` is returned, so the result
        can always be unpacked into three values.
    """
    url = "https://www.ximalaya.com/revision/album?albumId=" + str(albumID)
    for _ in range(3):
        try:
            # Same (connect, read) timeout as download_track, so a stalled
            # connection cannot block the program forever.
            resp = requests.get(url, headers=headers, timeout=(15.05, 47))
            if resp.status_code != requests.codes.ok:
                continue
            result = resp.json()
            if not isinstance(result, dict) or result.get('ret') != 200:
                continue
            data = result['data']
            main_info = data['mainInfo']
            return (main_info['cover'],
                    main_info['albumTitle'],
                    data['tracksInfo']['trackTotalCount'])
        except (requests.exceptions.RequestException,
                ValueError, KeyError, TypeError):
            # Network failure, non-JSON body or unexpected payload shape:
            # treat it as a failed attempt and try again.
            continue
    # All attempts failed: hand back empty placeholders instead of falling
    # off the end of the function.
    return '', '', 0
def get_track_list(albumID, pageNum):
    """
    Fetch one page of track entries for an album.

    The track-list endpoint is queried up to three times. An attempt counts
    as failed when the request raises, the HTTP status is not OK, the body
    is not JSON, or the payload does not carry ``ret == 200`` with the
    expected fields.

    Args:
        albumID: Ximalaya album ID.
        pageNum: Page number, starting at 1 (not 0).

    Returns:
        A list of ``{'trackId': ..., 'title': ...}`` dicts, one per track on
        the page. An empty list is returned when every attempt fails.
    """
    url = ("https://www.ximalaya.com/revision/album/v1/getTracksList?albumId="
           + str(albumID) + "&pageNum=" + str(pageNum))
    for _ in range(3):
        try:
            # Same (connect, read) timeout as download_track, so a stalled
            # connection cannot block the program forever.
            resp = requests.get(url, headers=headers, timeout=(15.05, 47))
            if resp.status_code != requests.codes.ok:
                continue
            result = resp.json()
            if not isinstance(result, dict) or result.get('ret') != 200:
                continue
            return [{'trackId': track['trackId'], 'title': track['title']}
                    for track in result['data']['tracks']]
        except (requests.exceptions.RequestException,
                ValueError, KeyError, TypeError):
            # Network failure, non-JSON body or unexpected payload shape:
            # treat it as a failed attempt and try again.
            continue
    # All attempts failed: report "no tracks" rather than re-parsing the
    # last (known bad) response.
    return []
def get_track_url(trackID):
    """
    Look up the download address of a single track.

    The audio endpoint is queried up to three times. An attempt counts as
    failed when the request raises, the HTTP status is not OK, the body is
    not JSON, the payload does not carry ``ret == 200``, or the ``src``
    field is missing, null or empty.

    Args:
        trackID: Ximalaya track ID.

    Returns:
        The non-empty ``src`` value from the response, or ``None`` when no
        attempt produced one.
    """
    url = ("https://www.ximalaya.com/revision/play/v1/audio?id="
           + str(trackID) + "&ptype=1")
    for _ in range(3):
        try:
            # Same (connect, read) timeout as download_track, so a stalled
            # connection cannot block the program forever.
            resp = requests.get(url, headers=headers, timeout=(15.05, 47))
            if resp.status_code != requests.codes.ok:
                continue
            result = resp.json()
            if not isinstance(result, dict) or result.get('ret') != 200:
                continue
            src = result['data']['src']
            # A truthiness test also covers a null src, on which len()
            # would raise TypeError.
            if src:
                return src
        except (requests.exceptions.RequestException,
                ValueError, KeyError, TypeError):
            # Network failure, non-JSON body or unexpected payload shape:
            # treat it as a failed attempt and try again.
            continue
    return None
   
def download_track(url, file):
    """
    Download one file to disk, retrying up to five times on network errors.

    If ``file`` already exists and its size is at least 99.99% of the
    ``content-length`` announced by the server, the download is skipped.
    When the server announces no usable length, the file is always
    downloaded again because completeness cannot be verified.

    Args:
        url: Address of the file to download.
        file: Local path the content is written to.

    Returns:
        None. Progress, retries and the final failure are reported on
        stdout only.
    """
    for attempt in range(5):
        try:
            resp = requests.get(url, headers=headers, stream=True,
                                timeout=(15.05, 47))
            try:
                # Do not save an HTTP error page as if it were audio data;
                # HTTPError is a RequestException, so it triggers a retry.
                resp.raise_for_status()
                try:
                    length = float(resp.headers.get('content-length', 0))
                except ValueError:
                    length = 0.0
                if (length > 0 and os.path.exists(file)
                        and os.path.getsize(file) >= length * 0.9999):
                    print ("!exist  ",file)
                else:
                    with open(file, 'wb') as f:
                        for data in resp.iter_content(chunk_size=64 * 1024):
                            if data:
                                f.write(data)
                    print(" downloaded :",url)
                    print(file)
            finally:
                # stream=True keeps the connection checked out until the
                # response is closed, including on the "already exists" path.
                resp.close()

            thread_num = len(threading.enumerate())
            # The "-1-6" offset is the original author's estimate of the
            # non-download threads alive in their environment (main thread
            # plus helper threads) -- TODO confirm; it is informational only.
            print ("still ",(thread_num-1-6)," download threads is running...        ",(thread_num-1))
            return
        except requests.exceptions.RequestException:
            print ("!!!! retry ",str(attempt+1)," ",file)
    # Every attempt failed: say so instead of returning silently.
    print ("!!!! failed ",file)
    
    
# Matches each character that may not appear in a Windows file name:
# / \ : * ? " < > |
_WIN_FORBIDDEN_CHARS = re.compile(r"[\/\\\:\*\?\"\<\>\|]")


def good_win_filename(title):
    """Return *title* with every Windows-forbidden character replaced by "_"."""
    return _WIN_FORBIDDEN_CHARS.sub("_", title)
    
def _url_suffix(url):
    """Return the part of *url* from its last '.' onwards, or '' if there is none."""
    dot = url.rfind('.')
    return url[dot:] if dot != -1 else ''


def _run_threads(threads):
    """Start every thread in *threads*, then block until all have finished."""
    for th in threads:
        th.start()
    for th in threads:
        th.join()


if __name__ == '__main__':
    # Script entry point: for every album, collect one download thread per
    # cover/track, and run the collected threads in batches of `batchsize`.
    batchsize = 100  # maximum number of download threads running at once
    page_size = 30   # number of tracks per page assumed when counting pages
    root_dir = 'd:/ximalaya_m4a/'
    # Ximalaya album IDs to download.
    albumIds = [24188889,24189573,24304888,24305879,24306243,24309252,24311858,24354367,24358805,24518861,24824153,24825039,24851969,24984143,29061925,29063110,29063422,29063736,29086752,29086807,29087188,29087967,29094095,29094141,29094462,29094720,29094780,29118401,29125163,29125256,29125736,29173459,29212173,29213272,29213784,29306595,29306886,29310539,29310668,29311525,29311638,29311691,29312131,29312300,29312506,29312552,29359960,29374900,29376353,29467583,29493226,29557339,29557547,29557681,29561023,29561324,29591557,29593290,29905326,29906992,29909294,30052498,30056745,30059246,30059694,30060409,30061141,30069472,30069840,30773177,31463410,31478915,31502687,31503140,31503234,31506820,31507683,31508309,31555555,31667590,31667900,31733378,31773435,31939905,31940538,32082858,32083585,32084906,32085745,32085826,32085908,32086241,32086303,32086360,32086635,32086956,32087690,32087737,32087792,32087872,32088232,32095450,32095559,32096205,32118029,32118525,32118948,32119352,32120018,32121291,32136623,32168822,32169557,32170582,32190806,32190913,32192082,32192151,32192333,32194813,32197855,32198974,32364394,32525163,33132748,33271935,34132943,34139540,34763958,34764287,34764753,34765359,34765865,34766267,34766887,34767313,34778882,34785060,34843276,34843763,34843965,34844690,34863290,34863973,34917397,34917464,34928925,34929142,34929532,34929810,34930021,34930636,34931472,34932101,34932213,34932458,34963950,35114735,35511179]
    threadpool = []       # download threads created but not started yet
    found = 0             # number of track downloads queued so far
    album_dir = root_dir  # directory of the album processed most recently

    for j, albumId in enumerate(albumIds, start=1):
        print ("\n start searching:"+str(j)+" of "+str(len(albumIds))+" AVBV:" + str(albumId)+" of"+str(albumIds))
        info = get_album_number(albumId)
        # Skip the album instead of crashing when its metadata could not be
        # fetched (nothing returned, or an empty title).
        if not info or not info[1]:
            print ("!!!! skip album ",albumId," : album info unavailable")
            continue
        coverUrl, albumTitle, total = info

        # One folder per album: <root>/<albumId>_<sanitised title>/
        album_dir = root_dir + str(albumId) + "_" + good_win_filename(albumTitle) + '/'
        os.makedirs(album_dir, exist_ok=True)

        # Queue the cover download; the cover URL is used with an "http:"
        # prefix, exactly as the original script did.
        if coverUrl:
            cover_file = album_dir + "cover" + good_win_filename(_url_suffix(coverUrl))
            threadpool.append(threading.Thread(target=download_track,
                                               args=("http:"+coverUrl, cover_file)))

        pageCount = math.ceil(total / page_size)
        index = 1  # running position of the track within the album
        for pageNum in range(1, pageCount + 1):
            # Short pause between requests so the server does not drop the
            # connection (the original author recommends 1 second).
            time.sleep(0.003)
            for track in get_track_list(albumId, pageNum):
                trackUrl = get_track_url(track['trackId'])
                time.sleep(0.003)
                if trackUrl:
                    # Sanitise title and suffix together so that a suffix
                    # containing forbidden characters cannot produce an
                    # invalid Windows path.
                    file_path = album_dir + good_win_filename(track['title'] + _url_suffix(trackUrl))
                    print ('find file:'+str(index)+" of "+str(total)+', page '+str(pageNum)+" of "+str(pageCount)+"\n"+ trackUrl + "\n"+track['title'])
                    threadpool.append(threading.Thread(target=download_track,
                                                       args=(trackUrl, file_path)))
                    found += 1
                index += 1

            # Download as soon as a page pushes the queue past one batch:
            # fetching the lists of very long albums is itself error-prone,
            # so downloads are not postponed until every list is complete.
            while len(threadpool) > batchsize:
                batch = threadpool[:batchsize]
                del threadpool[:batchsize]
                _run_threads(batch)
                time.sleep(1)

    # Run whatever is still queued, never more than `batchsize` at a time.
    while threadpool:
        batch = threadpool[:batchsize]
        del threadpool[:batchsize]
        _run_threads(batch)
    print("\n\n"+str(found)+" files downed in:"+album_dir)
        

ximalaya_m4a_download-thread.py (2).txt

发表新评论