Scraping all video links from 电影天堂 (www.dy2018.com) with Python
Target site: www.dy2018.com

Libraries used: Requests, bs4, re, pyMySQL.

The goal is to store the links in a database, which is split into three tables (a minimal schema sketch follows the list):

category: stores the categories
video: stores per-video information
link: stores the download links
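The post doesn't include the table definitions, so here is a minimal sketch of DDL that would satisfy the INSERT and SELECT statements in the code below. The table and column names come from the code; the column types, sizes, and keys are assumptions.

import pymysql

# Hypothetical DDL: names are taken from the crawler's SQL strings,
# types/sizes/keys are guesses.
SCHEMA = (
    'create table if not exists category ('
    ' id int primary key auto_increment,'
    ' name varchar(255))',
    'create table if not exists video ('
    ' id int primary key auto_increment,'
    ' name varchar(255),'
    ' title varchar(255),'
    ' cid int)',   # references category.id
    'create table if not exists link ('
    ' id int primary key auto_increment,'
    ' link text,'
    ' vid int)',   # references video.id
)

db = pymysql.connect(user='root', password='199508', database='myresource')
with db.cursor() as cursor:
    for ddl in SCHEMA:
        cursor.execute(ddl)
db.commit()
db.close()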
Because the site's pages are poorly standardized, a lot of conditionals and regular expressions were needed to extract the correct links; some links are probably still missed, but so be it. Apart from that, the page structure is fairly simple, which makes the site a good practice target.
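As a concrete example of that filtering: download links on the detail pages sit in <td> cells whose inline style contains word-wrap, which is what getLink() in the code below keys on. A standalone sketch of the check (the sample HTML here is made up):

import re
from bs4 import BeautifulSoup

# Made-up fragment of a detail page; real pages are messier.
html = '''<table><tr>
<td style="WORD-WRAP: break-word">ftp://example.com/movie.mkv</td>
<td>not a link cell</td>
</tr></table>'''

soup = BeautifulSoup(html, 'lxml')
for td in soup.find_all('td'):
    style = td.get('style')
    # The crawler relies on catching AttributeError when style is None;
    # here the style-less cell is skipped explicitly instead.
    if style and re.search('word-wrap', style.lower()):
        print(td.text.strip())  # -> ftp://example.com/movie.mkv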
In the end the crawl collected 28,000+ videos and 140,000+ links, and ran for a few dozen hours (there are sleeps in between). The full code follows.

import time
import requests
from bs4 import BeautifulSoup
import re
import pymysql
def printTime(timeFloat, hasFinish=''):
    # Format the elapsed time as s / m s / h m s and append a progress string.
    timeInt = round(timeFloat)
    timeHour = timeInt // (60 * 60)
    timeMinute = (timeInt - timeHour * 60 * 60) // 60
    timeSecond = timeInt - timeMinute * 60 - timeHour * 60 * 60
    if timeInt < 60:
        print('Elapsed: ' + str(timeInt) + ' s', end='')
    elif timeHour < 1:
        print('Elapsed: ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    else:
        print('Elapsed: ' + str(timeHour) + ' h ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    print(' progress: ' + hasFinish)
class eachVideo:
    def __init__(self):
        # SQL statements shared by all methods.
        self.insertCategory = 'insert into category (name) values (%s)'
        self.insertVideo = 'insert into video (name,title,cid) values (%s,%s,%s)'
        self.insertLink = 'insert into link (link,vid) values (%s,%s)'
        self.selectVideo = 'select id from video where title=%s'
        self.selectCategory = 'select id from category where name=%s'
    # Get a database connection
    def getDB(self):
        db = pymysql.connect(user='root', password='199508', database='myresource')
        return db
    # Fetch a page and return a BeautifulSoup object
    def getSoup(self, url):
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/63.0.3239.132 Safari/537.36',
                   'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   'accept-encoding': 'gzip, deflate, br',
                   'cookie': '_ga=GA1.2.412668155.1542004912; gr_user_id=b54b3fbb-3005-4021-9191-49961f0925e5; '
                             '_gid=GA1.2.497245283.1542793697; Hm_lvt_a68dc87e09b2a989eec1a0669bfd59eb=1542437077,'
                             '1542793697,1542863682,1542960832; pescdfeedbackbid=2; '
                             'pescdcheckfeedbackkey=1543013870%2C51ac4fa341dda1cbfc464c9eb8b7270a'
                             '%2C7673df03288dcb33602cccfb14489466; XLA_CI=35a00c84ce21862d2edb13445a8675c8; '
                             'pescdlastsearchtime=1543041447; '
                             'gr_session_id_bce67daadd1e4d71=212d307a-bba4-44a4-88ba-da5684fa84e5; '
                             'gr_session_id_bce67daadd1e4d71_212d307a-bba4-44a4-88ba-da5684fa84e5=true; '
                             'Hm_lpvt_a68dc87e09b2a989eec1a0669bfd59eb=1543130349'}
        response = requests.get(url, headers=headers, timeout=2)
        # Pages are GB-encoded; fall back to the gbk superset when gb2312 fails.
        try:
            html = response.content.decode('gb2312')
        except UnicodeDecodeError:
            html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        return soup
    # Get the page title
    def getTitle(self, soup):
        title = soup.find('h1').text
        return title

    # Extract the movie name from the title
    def getName(self, title):
        # Titles usually wrap the name in 《》; fall back to the full title.
        try:
            name = re.search('《(.*?)》', title).group(1)
        except AttributeError:
            name = title
        return name
    # Download the poster image
    def getPic(self, soup, name, vid):
        imageLink = soup.find('img').get('src')
        if imageLink is not None:
            try:
                image = requests.get(imageLink, timeout=1)
                path = 'E:\\图片\\dy2018\\' + str(vid) + '.jpg'
                with open(path, 'wb') as pic:
                    pic.write(image.content)
            except Exception:
                print(' ' + name + '---poster download failed')
        else:
            print(' ' + name + '---this video has no poster')
    # Extract the download links
    def getLink(self, soup, vid):
        link = soup.find_all('td')
        links = []
        for l in link:
            # Link cells carry an inline style containing word-wrap;
            # cells without a style attribute raise AttributeError and are skipped.
            try:
                if re.search('word-wrap.*?', l.get('style').lower()) is not None:
                    links.append([l.text.strip(), vid])
            except AttributeError:
                continue
        return links
    # Insert one video and its links
    def execute(self, url, cid):
        db = self.getDB()
        cursor = db.cursor()
        soup = self.getSoup(url)
        title = self.getTitle(soup)
        name = self.getName(title)
        cursor.execute(self.selectVideo, title)
        titleDB = cursor.fetchone()
        if titleDB is None:
            cursor.execute(self.insertVideo, (name.strip(), title.strip(), cid))
            vid = cursor.lastrowid
            links = self.getLink(soup, vid)
            if len(links) > 0:
                cursor.executemany(self.insertLink, links)
                self.getPic(soup, name, vid)
            else:
                # No links found: undo the video insert.
                print(' ' + name + '---no links could be extracted')
                db.rollback()
            print('--' + name + '--done')
        else:
            print('!!!!!!' + name + ' already exists!!!!!!')
        db.commit()
        db.close()
    # Get the detail-page links from one page of a category
    def getEachVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl)
        urls = soup.find_all(attrs={'class': 'ulink'})
        trueUrls = []
        for url in urls:
            trueUrl = url.get('href')
            if re.match('.*?\.html', trueUrl) is not None:
                trueUrls.append(trueUrl)
        return trueUrls
    # Get the detail-page links from every page of a category
    def getEveryVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl).text
        # The page count appears as "页次 x/total" on the first page.
        pageCount = re.search('页次.*?\d*?/(\d*)', soup).group(1)
        pageNums = ['']
        for i in range(2, int(pageCount) + 1):
            pageNums.append('_' + str(i))
        everyTrueUrls = []
        for num in pageNums:
            url = cateUrl + '/index' + num + '.html'
            # Retry once before giving up on a page.
            try:
                everyTrueUrls += self.getEachVideoLinks(url)
                print(url + ' page links fetched')
            except Exception:
                try:
                    everyTrueUrls += self.getEachVideoLinks(url)
                    print(url + ' page links fetched')
                except Exception:
                    print('+++++++++++++++IMPORTANT ' + url + ' failed+++++++++++++++')
                    continue
        return everyTrueUrls
    # Build the list of category URL paths
    def getCategory(self):
        categorys = []
        for i in range(8, 21):
            categorys.append(str(i))
        categorys.append('html/tv/hytv')
        categorys.append('html/tv/hepai')
        categorys.append('html/tv/gangtai')
        categorys.append('html/tv/oumeitv')
        categorys.append('html/tv/rihantv')
        categorys.append('html/zongyi2013')
        categorys.append('html/2009zongyi')
        categorys.append('html/dongman')
        categorys.append('html/game')
        categorys.append('html/3gp')
        return categorys
    # Crawl every page of every category and store all links
    def getAllVideoLink(self, categorys):
        timeBegin = time.time()
        for i in range(0, len(categorys)):
            # Get the category name
            url = 'https://www.dy2018.com/' + categorys[i]
            try:
                soup = self.getSoup(url)
            except Exception:
                try:
                    soup = self.getSoup(url)
                except Exception:
                    print('+++++++++++++++++IMPORTANT ' + url + ' failed+++++++++++++++++++++')
                    continue
            # if i < 20:
            #     titleAll = soup.find('h1').text
            #     categoryTitle = re.search('>(.*?)>', titleAll).group(1).strip()
            # elif i < 25:
            #     titleAll = soup.find('h1').find_all('a')
            #     categoryTitle = titleAll[2].text
            # else:
            titleAll = soup.find('h1').find_all('a')
            categoryTitle = titleAll[1].text
            db = self.getDB()
            cursor = db.cursor()
            cursor.execute(self.selectCategory, categoryTitle.strip())
            ca = cursor.fetchone()
            if ca is None:
                cursor.execute(self.insertCategory, categoryTitle.strip())
                cid = cursor.lastrowid
            else:
                print(categoryTitle + ' already exists')
                # Reuse the existing category id; lastrowid is not valid after a SELECT.
                cid = ca[0]
            db.commit()
            db.close()
            try:
                everyUrls = self.getEveryVideoLinks(url)
            except Exception:
                try:
                    everyUrls = self.getEveryVideoLinks(url)
                except Exception:
                    print('++++++++++++++++IMPORTANT ' + url + ' failed++++++++++++++++')
                    continue
            timeGetUrls = time.time()
            printTime(timeGetUrls - timeBegin)
            for everyUrl in everyUrls:
                videoUrl = 'https://www.dy2018.com/' + everyUrl
                try:
                    self.execute(videoUrl, cid)
                except Exception:
                    try:
                        self.execute(videoUrl, cid)
                    except Exception as e:
                        print(e)
                        continue
                timeFinishOne = time.time()
                hasFinish = str(everyUrls.index(everyUrl) + 1) + ' / ' + str(len(everyUrls))
                printTime(timeFinishOne - timeBegin, hasFinish)
                # time.sleep(0.7)
            print('-------------------------' + categoryTitle + ' done----------------------------')
if __name__ == '__main__':
    video = eachVideo()
    categorys = video.getCategory()
    video.getAllVideoLink(categorys)
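After a full run, a quick way to check the totals quoted above (28,000+ videos, 140,000+ links) is to count the rows in each table. A minimal sketch, assuming the same connection settings as getDB():

import pymysql

db = pymysql.connect(user='root', password='199508', database='myresource')
with db.cursor() as cursor:
    for table in ('category', 'video', 'link'):
        cursor.execute('select count(*) from ' + table)
        print(table, cursor.fetchone()[0])
db.close()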