5G协议批量下载 python3
最近需要3GPP的5G协议,手动一个一个下载比较麻烦,抽空用python3写了个下载程序,可以从3gpp的ftp服务器批量下载协议文件。3GPP协议下载地址:https://www.3gpp.org/ftp/Specs/archive/38_series/。代码使用了线程池,可以多线程从3gpp下载协议。代码流程:1、首先从下载地址获取要下载的所有文件的文件名、url、大小、日期、本地存储地址,写入ini文件;2、读取ini文件中的信息,使用线程池进行文件下载。
最近需要3GPP的5G协议,手动一个一个下载比较麻烦,抽空用python3写了个下载程序,可以从3gpp的ftp服务器批量下载协议文件。
3GPP协议下载地址:https://www.3gpp.org/ftp/Specs/archive/38_series/
代码使用了线程池,可以多线程从3gpp下载协议。
代码流程:
1、首先从下载地址获取要下载的所有文件的 文件名、url、大小、日期、本地存储地址,写入ini文件
2、读取ini文件中的信息,使用线程池进行文件下载
#coding:utf-8
import urllib
import urllib.request as urllib2
from bs4 import BeautifulSoup
import socket
import os, sys
import time
import configparser
import csv
import codecs
import threading
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor # 线程池,进程池
import signal
from configparser import ConfigParser
# Global stop flag: set to True by the signal handler so that all
# scraping/download loops can terminate early.
stop = False
class Series38DownloadClass():
    """Batch downloader for 3GPP 38-series specification files.

    Workflow:
      1. Scrape the archive index pages and record every file's
         name/url/size/date/local path into an ini file (mode 0).
      2. Read the ini file back and download each file through a
         thread pool (mode 1).
    """

    def __init__(self, downloadMode=0):
        # Lock guarding the shared download counters (updated by workers).
        self.threadLock = threading.Lock()
        # Lock serializing read/modify/write cycles on the ini file.
        self.ini_lock = threading.Lock()
        self.maxThreadNum = 100
        # Worker pool shared by page scraping and file downloads.
        self.thread_pool = ThreadPoolExecutor(self.maxThreadNum)
        # Futures of tasks currently submitted to the pool.
        self.task_handler_list = []
        # Pending [fileName, fileUrl, fileSize, fileDate, filePath] entries.
        self.downloadFileList = []
        # Number of files downloaded successfully.
        self.downloadSuccessNum = 0
        # Number of failed download attempts.
        self.downloadFailedNum = 0
        # Total number of files to download (counted from the ini file).
        self.downloadFileNum = 0
        # Total number of download attempts started.
        self.downloadCount = 0
        # Ini file holding the scraped 3GPP file metadata.
        self.seriesFile_ini = "seriesFile.ini"

    def main_logic(self, downloadMode):
        """Entry point: mode 0 scrapes the site into the ini file,
        mode 1 loads the ini file and downloads every listed file."""
        if downloadMode == 0:
            # Collect the download urls from the web site.
            self.get_series38()
        if downloadMode == 1:
            # Load the download urls from the ini file.
            self.readSeriesFile_ini()
        while self.downloadFileList and not stop:
            fileInfo = self.downloadFileList.pop(0)
            self.call_downloadFile(fileInfo)

    def call_updateFileInfo(self, subPath, subSeriesUrl):
        """Submit a sub-series page scrape to the thread pool."""
        self.thread_pool.submit(self.get_sub_series, subPath, subSeriesUrl)

    def call_downloadFile(self, fileInfo):
        """Submit one download task, throttling submission when too many
        tasks are still pending (prevents unbounded memory growth)."""
        task_handler = self.thread_pool.submit(self.downloadFile, fileInfo)
        self.task_handler_list.append(task_handler)
        while True:
            if stop:
                break
            # Drop finished futures. Bug fix: rebuild the list instead of
            # calling remove() while iterating it, which skips elements.
            self.task_handler_list = [
                t for t in self.task_handler_list if not t.done()
            ]
            # Back off while more than twice the worker count is pending.
            if len(self.task_handler_list) > self.maxThreadNum * 2:
                time.sleep(2)
            else:
                return True

    def downloadFile(self, fileInfo):
        """Download one file; re-download when the local size does not match
        the listed size, and requeue the entry on failure."""
        with self.threadLock:
            self.downloadCount += 1
        fileName = fileInfo[0]
        fileUrl = fileInfo[1]
        fileSize = fileInfo[2]
        fileDate = fileInfo[3]
        filePath = fileInfo[4]
        tmpDir = filePath.split('/')[0]
        # exist_ok avoids a race when several workers create the same dir.
        os.makedirs(tmpDir, exist_ok=True)
        if os.path.exists(filePath):
            # Format the local size the same way the site lists it
            # ("x.y KB") so the two strings compare directly.
            localFileSize = os.path.getsize(filePath)
            size = str(1.0 * localFileSize / 1024)
            if size[size.index('.') + 1] == '0':
                size = size[:size.index('.')]
                localFileSize = "%s KB" % (size)
            else:
                localFileSize = "%s KB" % (size[:size.index('.') + 1 + 1])
            if localFileSize != fileSize:
                # Size mismatch: previous download was incomplete; redo it.
                os.remove(filePath)
                try:
                    urllib2.urlretrieve(fileUrl, filePath)
                    with self.threadLock:
                        self.downloadSuccessNum += 1
                    print("重新下载成功:%s [%d/%d], 原大小 %s, 协议大小 %s" %
                          (fileName, self.downloadSuccessNum, self.downloadFileNum, localFileSize, fileSize))
                except Exception:
                    # Bug fix: the original incremented the misspelled
                    # attribute 'downloadFaildNum' (AttributeError).
                    with self.threadLock:
                        self.downloadFailedNum += 1
                    # Requeue so the file is retried later.
                    self.downloadFileList.append(fileInfo)
                    print("重新下载失败:%s [失败次数 %s]" % (fileName, self.downloadFailedNum))
                    time.sleep(1)
            else:
                # Already downloaded with the right size: count as success.
                with self.threadLock:
                    self.downloadSuccessNum += 1
        else:
            try:
                urllib2.urlretrieve(fileUrl, filePath)
                with self.threadLock:
                    self.downloadSuccessNum += 1
                print("首次下载成功:%s [%d/%d]" % (fileName, self.downloadSuccessNum, self.downloadFileNum))
            except Exception:
                with self.threadLock:
                    self.downloadFailedNum += 1
                self.downloadFileList.append(fileInfo)
                print("重新下载失败:%s [失败次数 %s]" % (fileName, self.downloadFailedNum))
                time.sleep(1)

    def get_series38(self):
        """Scrape https://www.3gpp.org/ftp/Specs/archive/38_series/ and
        collect every sub-series page's file information."""
        series38_url = 'https://www.3gpp.org/ftp/Specs/archive/38_series/'
        request = urllib2.Request(series38_url)
        # Send the request and read the index page.
        response = urllib2.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html.decode('utf-8'), "lxml")
        subSeriesList = soup.findAll('a')
        for line in subSeriesList:
            subUrl = line.get('href')
            if subUrl.startswith(series38_url):
                subPath = line.string
                # Bug fix: the original scraped each sub-page twice (once
                # synchronously and once via the thread pool), duplicating
                # every file entry. Scrape each sub-page exactly once.
                self.get_sub_series(subPath, subUrl)
                # Sleep so the server does not drop the connection.
                time.sleep(2)

    def get_sub_series(self, subPath='', subUrl=''):
        """Scrape one sub-series page, append its files to the download
        queue and persist their metadata into the ini file."""
        fileInfoList = []
        request = urllib2.Request(subUrl)
        response = urllib2.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html.decode('utf-8'), "lxml")
        tbody = soup.find('tbody')
        file_tr = tbody.tr
        while file_tr and not stop:
            file_td = file_tr.td
            file_td = file_td.find_next_sibling()
            fileUrl = file_td.a.get('href')
            fileName = file_td.a.string.strip()
            file_td = file_td.find_next_sibling()
            fileDate = file_td.string.strip()
            file_td = file_td.find_next_sibling()
            # The site lists sizes with ',' as decimal separator;
            # normalize to '.' so it matches the locally-formatted size.
            fileSize = file_td.string.strip().replace(',', '.')
            filePath = subPath + "/" + fileName
            fileInfo = [fileName, fileUrl, fileSize, fileDate, filePath]
            fileInfoList.append(fileInfo)
            self.downloadFileList.append(fileInfo)
            file_tr = file_tr.find_next_sibling()
        # Persist the scraped metadata; the lock serializes concurrent
        # read/modify/write cycles from multiple scraper tasks.
        with self.ini_lock:
            conf = ConfigParser()
            conf.read(self.seriesFile_ini, encoding='utf-8')
            for fileInfo in fileInfoList:
                fileName = fileInfo[0]
                fileUrl = fileInfo[1]
                fileSize = fileInfo[2]
                fileDate = fileInfo[3]
                filePath = fileInfo[4]
                print(fileInfo)
                if not conf.has_section(fileName):
                    conf.add_section(fileName)
                conf.set(fileName, "fileName", fileName)
                conf.set(fileName, "fileUrl", fileUrl)
                conf.set(fileName, "fileSize", fileSize)
                conf.set(fileName, "fileDate", fileDate)
                conf.set(fileName, "filePath", filePath)
            # Bug fix: close the handle (the original leaked an open file)
            # and write with the same encoding the reader expects.
            with open(self.seriesFile_ini, "w", encoding='utf-8') as iniFile:
                conf.write(iniFile)

    def readSeriesFile_ini(self):
        """Load pending downloads from the ini file into the work queue."""
        if os.path.exists(self.seriesFile_ini):
            conf = ConfigParser()
            conf.read(self.seriesFile_ini, encoding='utf-8')
            for fileName in conf.sections():
                if stop:
                    break
                self.downloadFileNum += 1
                fileUrl = conf.get(fileName, "fileUrl")
                fileSize = conf.get(fileName, "fileSize")
                fileDate = conf.get(fileName, "fileDate")
                filePath = conf.get(fileName, "filePath")
                self.downloadFileList.append([fileName, fileUrl, fileSize, fileDate, filePath])
        print("ini文件中的文件总数 %d:" % (self.downloadFileNum))
# Custom signal handler: flag every loop to stop, then exit the process.
def quit(signum, frame):
    # Bug fix: declare the module-level flag. The original assigned a
    # function-local 'stop', so the worker loops never saw it become True.
    global stop
    stop = True
    print("-----------------------------------------------------------------------------")
    print("-------------------------------进程被终止-------------------------------------")
    print("-----------------------------------------------------------------------------")
    sys.exit()
if __name__ == "__main__":
    # Stop cleanly on Ctrl-C / kill.
    signal.signal(signal.SIGINT, quit)
    signal.signal(signal.SIGTERM, quit)
    # Global socket timeout so a stalled download eventually fails.
    socket.setdefaulttimeout(60)
    obj = Series38DownloadClass()
    # Pass 1: scrape the archive and write the metadata ini file.
    obj.main_logic(0)
    # Pass 2: read the ini file back and download the files.
    obj.main_logic(1)
    # Wait for every submitted download task to finish.
    while True:
        # Bug fix: rebuild the list instead of remove()-while-iterating.
        obj.task_handler_list = [t for t in obj.task_handler_list if not t.done()]
        if len(obj.task_handler_list) == 0:
            break
        # Bug fix: sleep instead of busy-spinning a CPU core while waiting.
        time.sleep(1)
    print("文件下载完毕,需要下载总数 %d, 已经下载总数 %d" % (obj.downloadFileNum, obj.downloadSuccessNum))
    print("退出主线程")
更多推荐
所有评论(0)