最近需要3GPP的5G协议,手动逐个下载比较麻烦,于是抽空用Python 3写了个下载程序,可以从3GPP的FTP服务器批量下载协议文件。

3GPP协议下载地址:https://www.3gpp.org/ftp/Specs/archive/38_series/

代码使用了线程池,可以多线程并发地从3GPP服务器下载协议文件。

代码流程:

1、首先从下载地址获取要下载的所有文件的 文件名、url、大小、日期、本地存储地址,写入ini文件

2、读取ini文件中的信息,使用线程池进行文件下载

#coding:utf-8
import urllib
import urllib.request as urllib2
from bs4 import BeautifulSoup 
import socket
import os, sys
import time
import configparser
import csv
import codecs
import threading
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor  # 线程池,进程池
import signal
from configparser import ConfigParser


# Cooperative shutdown flag: the crawl and download loops below poll it and
# stop scheduling further work once it is True.
stop = False


class Series38DownloadClass:
    """Batch downloader for the 3GPP 38-series specification archive.

    Workflow:
      1. ``main_logic(0)`` crawls the archive index, collects the name, URL,
         size, date and local path of every file and stores them in an ini
         file (one section per file name).
      2. ``main_logic(1)`` reads that ini file back and downloads every entry
         through a thread pool.

    File records are 5-item lists:
    ``[fileName, fileUrl, fileSize, fileDate, filePath]``.
    """

    def __init__(self, downloadMode=0):
        """Create the locks, the thread pool and the bookkeeping state.

        Args:
            downloadMode: Kept for backward compatibility; it is not used
                here. The mode is passed to ``main_logic`` instead.
        """
        # Guards the counters that worker threads update concurrently.
        self.threadLock = threading.Lock()
        # Serialises read-modify-write cycles on the ini file.
        self.ini_lock = threading.Lock()
        self.maxThreadNum = 100
        self.thread_pool = ThreadPoolExecutor(self.maxThreadNum)
        # Futures of download tasks that have not been seen finished yet.
        self.task_handler_list = []
        # Queue of file records waiting to be downloaded.
        self.downloadFileList = []
        # Number of files that are present locally with the expected size.
        self.downloadSuccessNum = 0
        # Number of failed download attempts.
        self.downloadFailedNum = 0
        # Number of file records loaded from the ini file.
        self.downloadFileNum = 0
        # Number of download tasks started.
        self.downloadCount = 0
        # Pause (seconds) after each download attempt, to go easy on the server.
        self.downloadDelay = 1
        # A failed file is put back on the queue at most this many times, so a
        # permanently broken URL cannot keep the queue alive forever.
        self.maxRetryNum = 3
        # Failed attempts per local file path.
        self.retryCounts = {}
        # Ini file holding the crawled file records.
        self.seriesFile_ini = "seriesFile.ini"

    def main_logic(self, downloadMode):
        """Run one phase of the workflow.

        Args:
            downloadMode: ``0`` crawls the web site and writes the ini file;
                ``1`` reads the ini file and schedules the downloads. Any
                other value does nothing.
        """
        if downloadMode == 0:
            self.get_series38()

        if downloadMode == 1:
            self.readSeriesFile_ini()
            while self.downloadFileList and not stop:
                fileInfo = self.downloadFileList.pop(0)
                self.call_downloadFile(fileInfo)

    def call_updateFileInfo(self, subPath, subSeriesUrl):
        """Submit ``get_sub_series`` to the thread pool.

        Returns:
            The ``Future`` of the submitted task, so callers can wait for it.
        """
        return self.thread_pool.submit(self.get_sub_series, subPath, subSeriesUrl)

    def call_downloadFile(self, fileInfo):
        """Submit one download to the thread pool, throttling the backlog.

        Blocks while more than ``2 * maxThreadNum`` tasks are unfinished so
        the pending queue cannot grow without bound.

        Returns:
            ``True`` once the backlog is small enough to accept more work,
            ``False`` if the wait was abandoned because ``stop`` was set.
        """
        task_handler = self.thread_pool.submit(self.downloadFile, fileInfo)
        self.task_handler_list.append(task_handler)
        while not stop:
            # Rebuild instead of calling remove() during iteration, which
            # would skip the element following every removed one.
            self.task_handler_list[:] = [
                task for task in self.task_handler_list if not task.done()
            ]
            if len(self.task_handler_list) <= self.maxThreadNum * 2:
                return True
            time.sleep(2)
        return False

    @staticmethod
    def _formatSizeKB(byteCount):
        """Format a byte count like the size column stored in the ini file.

        The value is truncated (not rounded) to one decimal place, and the
        decimal part is dropped entirely when its first digit is ``0``.
        """
        size = str(1.0 * byteCount / 1024)
        dot = size.index('.')
        if size[dot + 1] == '0':
            return "%s KB" % (size[:dot])
        return "%s KB" % (size[:dot + 2])

    def _recordFailure(self, fileInfo, isRedownload, err):
        """Count a failed attempt, requeue the file if allowed, and report it."""
        fileName = fileInfo[0]
        filePath = fileInfo[4]
        with self.threadLock:
            self.downloadFailedNum += 1
            failedNum = self.downloadFailedNum
            attempts = self.retryCounts.get(filePath, 0) + 1
            self.retryCounts[filePath] = attempts
        if attempts <= self.maxRetryNum:
            self.downloadFileList.append(fileInfo)
        label = "重新下载失败" if isRedownload else "首次下载失败"
        print("%s:%s [失败次数 %s], 原因 %s" % (label, fileName, failedNum, err))

    def downloadFile(self, fileInfo):
        """Download one file unless an up-to-date local copy already exists.

        A local copy counts as up to date when its formatted size equals the
        size recorded for the file. A copy with a different size is deleted
        and downloaded again. Failures are counted and the file is requeued
        (up to ``maxRetryNum`` times) instead of raising.

        Args:
            fileInfo: ``[fileName, fileUrl, fileSize, fileDate, filePath]``.
        """
        fileName = fileInfo[0]
        fileUrl = fileInfo[1]
        fileSize = fileInfo[2]
        filePath = fileInfo[4]

        with self.threadLock:
            self.downloadCount += 1

        localFileSize = None
        if os.path.exists(filePath):
            localFileSize = self._formatSizeKB(os.path.getsize(filePath))
            if localFileSize == fileSize:
                with self.threadLock:
                    self.downloadSuccessNum += 1
                return

        isRedownload = localFileSize is not None
        try:
            parentDir = os.path.dirname(filePath)
            if parentDir:
                # exist_ok avoids a race between workers creating the same
                # directory; makedirs also copes with nested/absolute paths.
                os.makedirs(parentDir, exist_ok=True)
            if isRedownload:
                os.remove(filePath)
            urllib2.urlretrieve(fileUrl, filePath)
        except Exception as err:
            # This runs as a pool task: anything that escapes would be lost
            # silently inside the Future, so record it here instead.
            self._recordFailure(fileInfo, isRedownload, err)
        else:
            with self.threadLock:
                self.downloadSuccessNum += 1
                successNum = self.downloadSuccessNum
            if isRedownload:
                print("重新下载成功:%s [%d/%d], 原大小 %s, 协议大小 %s" %
                      (fileName, successNum, self.downloadFileNum, localFileSize, fileSize))
            else:
                print("首次下载成功:%s [%d/%d]" % (fileName, successNum, self.downloadFileNum))
        time.sleep(self.downloadDelay)

    def _fetchSoup(self, url):
        """Fetch *url* and return its HTML parsed with the lxml parser."""
        with urllib2.urlopen(urllib2.Request(url)) as response:
            html = response.read()
        return BeautifulSoup(html.decode('utf-8'), "lxml")

    def get_series38(self):
        """Crawl the 38-series index and record the files of every sub-series.

        Each sub-series page is processed once, by a pool task; the method
        returns only after all of those tasks have finished, so the ini file
        is complete when it returns.
        """
        series38_url = 'https://www.3gpp.org/ftp/Specs/archive/38_series/'
        soup = self._fetchSoup(series38_url)

        pending = []
        for link in soup.find_all('a'):
            if stop:
                break
            subUrl = link.get('href')
            subPath = link.string
            if not subUrl or not subPath or not subUrl.startswith(series38_url):
                continue
            pending.append((subPath, self.call_updateFileInfo(str(subPath), subUrl)))
            # Pause between requests so the server does not drop the connection.
            time.sleep(2)

        for subPath, future in pending:
            try:
                future.result()
            except Exception as err:
                # One broken sub-series page must not abort the whole crawl.
                print("获取协议列表失败:%s, 原因 %s" % (subPath, err))

    @staticmethod
    def _parseFileRow(file_tr, subPath):
        """Turn one table row into a file record, or ``None`` if it is not one.

        The 2nd, 3rd and 4th cells of the row are read as link/name, date and
        size respectively.
        """
        try:
            nameCell = file_tr.td.find_next_sibling()
            dateCell = nameCell.find_next_sibling()
            sizeCell = dateCell.find_next_sibling()
            fileUrl = nameCell.a.get('href')
            fileName = nameCell.a.string.strip()
            fileDate = dateCell.string.strip()
            fileSize = sizeCell.string.strip().replace(',', '.')
        except AttributeError:
            # A cell, link or text node is missing: not a file row.
            return None
        if not fileUrl or not fileName:
            return None
        return [fileName, fileUrl, fileSize, fileDate, subPath + "/" + fileName]

    def _saveFileInfo(self, fileInfoList):
        """Merge the given file records into the ini file."""
        if not fileInfoList:
            return
        with self.ini_lock:
            # interpolation=None: URLs may contain '%', which the default
            # interpolation would reject.
            conf = ConfigParser(interpolation=None)
            conf.read(self.seriesFile_ini, encoding='utf-8')
            for fileInfo in fileInfoList:
                fileName, fileUrl, fileSize, fileDate, filePath = fileInfo
                print(fileInfo)
                if not conf.has_section(fileName):
                    conf.add_section(fileName)
                conf.set(fileName, "fileName", fileName)
                conf.set(fileName, "fileUrl", fileUrl)
                conf.set(fileName, "fileSize", fileSize)
                conf.set(fileName, "fileDate", fileDate)
                conf.set(fileName, "filePath", filePath)
            # Write with the same encoding the file is read with.
            with open(self.seriesFile_ini, "w", encoding='utf-8') as iniFile:
                conf.write(iniFile)

    def get_sub_series(self, subPath='', subUrl=''):
        """Collect the file records listed on one sub-series page.

        Every record is appended to ``downloadFileList`` and written to the
        ini file.

        Args:
            subPath: Local directory name used as prefix of each file path.
            subUrl: URL of the sub-series listing page.
        """
        soup = self._fetchSoup(subUrl)
        tbody = soup.find('tbody')
        if tbody is None:
            print("未找到文件列表:%s" % (subUrl))
            return

        fileInfoList = []
        file_tr = tbody.tr
        while file_tr is not None and not stop:
            fileInfo = self._parseFileRow(file_tr, subPath)
            if fileInfo is not None:
                fileInfoList.append(fileInfo)
                self.downloadFileList.append(list(fileInfo))
            file_tr = file_tr.find_next_sibling()

        # All records of this page are collected; persist them.
        self._saveFileInfo(fileInfoList)

    def readSeriesFile_ini(self):
        """Load the download queue from the ini file.

        The queue and ``downloadFileNum`` are rebuilt from the file rather
        than extended, so records already queued by a preceding crawl are not
        queued (and counted) a second time. Does nothing if the ini file does
        not exist.
        """
        if not os.path.exists(self.seriesFile_ini):
            return
        conf = ConfigParser(interpolation=None)
        conf.read(self.seriesFile_ini, encoding='utf-8')
        self.downloadFileList.clear()
        self.downloadFileNum = 0
        for fileName in conf.sections():
            if stop:
                break
            self.downloadFileNum += 1
            fileUrl = conf.get(fileName, "fileUrl")
            fileSize = conf.get(fileName, "fileSize")
            fileDate = conf.get(fileName, "fileDate")
            filePath = conf.get(fileName, "filePath")
            self.downloadFileList.append([fileName, fileUrl, fileSize, fileDate, filePath])
        print("ini文件中的文件总数 %d:" % (self.downloadFileNum))
# Custom signal handler
def quit(signum, frame):
    """Request shutdown and terminate the process.

    Sets the module-level ``stop`` flag so loops that poll it stop
    scheduling work, prints a banner, then raises ``SystemExit``.

    Args:
        signum: Number of the received signal (unused).
        frame: Stack frame that was interrupted (unused).
    """
    # Without this declaration the assignment below would only create a
    # local variable and the flag would never change.
    global stop
    stop = True
    print("-----------------------------------------------------------------------------")
    print("-------------------------------进程被终止-------------------------------------")
    print("-----------------------------------------------------------------------------")
    sys.exit()


if __name__ == "__main__":
    # Ctrl-C and SIGTERM both go through the custom handler.
    signal.signal(signal.SIGINT, quit)
    signal.signal(signal.SIGTERM, quit)
    # Socket timeout (seconds) for every network operation.
    socket.setdefaulttimeout(60)

    obj = Series38DownloadClass()
    # Phase 1: crawl the archive and write the file details to the ini file.
    obj.main_logic(0)

    # Phase 2: read the ini file and download the specifications.
    obj.main_logic(1)

    # Wait for the outstanding download tasks. The list is rebuilt rather
    # than mutated while being iterated (which skips entries), and the loop
    # sleeps between polls instead of spinning a CPU core.
    while True:
        obj.task_handler_list[:] = [
            task for task in obj.task_handler_list if not task.done()
        ]
        if not obj.task_handler_list:
            break
        time.sleep(0.5)
    # Let any remaining pool work finish and release the worker threads.
    obj.thread_pool.shutdown(wait=True)
    print("文件下载完毕,需要下载总数 %d, 已经下载总数 %d"%(obj.downloadFileNum, obj.downloadSuccessNum))
    print("退出主线程")

 

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐