Crawling www.dy2018.com

The libraries used are requests, bs4 (BeautifulSoup), re, and pymysql.
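To reproduce the environment, the dependencies can be installed with pip (the exact install step is my assumption; lxml is included because the code below asks BeautifulSoup for the lxml parser):

pip install requests beautifulsoup4 lxml pymysql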

The goal is to store the links in a database split across three tables (a possible schema is sketched right after this list):

category: stores the categories

video: stores video metadata

link: stores the download links
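The post itself doesn't include the table definitions, so here is a minimal schema sketch inferred from the insert/select statements in the crawler below; the column types and sizes are my assumptions, not the original schema:

import pymysql

# Minimal schema sketch matching the queries used by the crawler.
# Types and sizes are guesses; the original DDL is not in the post.
ddl = [
    'create table if not exists category ('
    ' id int primary key auto_increment,'
    ' name varchar(255) not null)',
    'create table if not exists video ('
    ' id int primary key auto_increment,'
    ' name varchar(255),'
    ' title varchar(255),'
    ' cid int)',  # references category.id
    'create table if not exists link ('
    ' id int primary key auto_increment,'
    ' link text,'  # ftp/magnet links can be long
    ' vid int)',   # references video.id
]

db = pymysql.connect(user='root', password='199508', database='myresource')
cursor = db.cursor()
for statement in ddl:
    cursor.execute(statement)
db.commit()
db.close()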

Because the site's HTML is not well-formed, a lot of conditionals and regular expressions are needed to pick out the correct links. Some links are probably still missed, and that's fine. Overall the page structure is fairly simple, which makes the site a good practice target.
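To make the link extraction concrete: on the detail pages, download links sit in <td> cells whose inline style contains word-wrap, and that is what the getLink method below filters on. A stripped-down sketch of the heuristic, run against a hypothetical fragment shaped like a dy2018 link table, looks like this:

import re
from bs4 import BeautifulSoup

# hypothetical fragment shaped like a dy2018 link table
html = ('<table>'
        '<tr><td style="WORD-WRAP: break-word">ftp://example.com/movie.mkv</td></tr>'
        '<tr><td>2018-11-25</td></tr>'
        '</table>')

soup = BeautifulSoup(html, 'lxml')
for td in soup.find_all('td'):
    style = td.get('style')
    # cells without a style attribute yield None; skip them, like the
    # AttributeError branch in getLink below
    if style is not None and re.search('word-wrap', style.lower()):
        print(td.text.strip())  # -> ftp://example.com/movie.mkv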

In the end the crawler collected 28,000+ videos and 140,000+ links; the run took a few dozen hours (I sleep in between). The complete code follows.

import time
import requests
from bs4 import BeautifulSoup
import re
import pymysql


# pretty-print elapsed time (and optionally progress) for long runs
def printTime(timeFloat, hasFinish=''):
    timeInt = round(timeFloat)
    timeHour = timeInt // (60 * 60)
    timeMinute = (timeInt - timeHour * 60 * 60) // 60
    timeSecond = timeInt - timeMinute * 60 - timeHour * 60 * 60
    if timeInt < 60:
        print('elapsed: ' + str(timeInt) + ' s', end='')
    elif timeHour < 1:
        print('elapsed: ' + str(timeMinute) + ' m ' + str(timeSecond) + ' s', end='')
    else:
        print('elapsed: ' + str(timeHour) + ' h ' + str(timeMinute) + ' m '
              + str(timeSecond) + ' s', end='')
    print(' progress: ' + hasFinish)


class eachVideo:
    def __init__(self):
        self.insertCategory = 'insert into category (name) values (%s)'
        self.insertVideo = 'insert into video (name,title,cid) values (%s,%s,%s)'
        self.insertLink = 'insert into link (link,vid) values (%s,%s)'
        self.selectVideo = 'select id from video where title=%s'
        self.selectCategory = 'select id from category where name=%s'

    # get a database connection
    def getDB(self):
        db = pymysql.connect(user='root', password='199508', database='myresource')
        return db

    # fetch a page and return a BeautifulSoup object
    def getSoup(self, url):
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/63.0.3239.132 Safari/537.36',
                   'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   'accept-encoding': 'gzip, deflate, br',
                   'cookie': '_ga=GA1.2.412668155.1542004912; gr_user_id=b54b3fbb-3005-4021-9191-49961f0925e5; '
                             '_gid=GA1.2.497245283.1542793697; Hm_lvt_a68dc87e09b2a989eec1a0669bfd59eb=1542437077,'
                             '1542793697,1542863682,1542960832; pescdfeedbackbid=2; '
                             'pescdcheckfeedbackkey=1543013870%2C51ac4fa341dda1cbfc464c9eb8b7270a'
                             '%2C7673df03288dcb33602cccfb14489466; XLA_CI=35a00c84ce21862d2edb13445a8675c8; '
                             'pescdlastsearchtime=1543041447; '
                             'gr_session_id_bce67daadd1e4d71=212d307a-bba4-44a4-88ba-da5684fa84e5; '
                             'gr_session_id_bce67daadd1e4d71_212d307a-bba4-44a4-88ba-da5684fa84e5=true; '
                             'Hm_lpvt_a68dc87e09b2a989eec1a0669bfd59eb=1543130349'}
        response = requests.get(url, headers=headers, timeout=2)
        # the site serves gb2312/gbk, not utf-8
        try:
            html = response.content.decode('gb2312')
        except UnicodeDecodeError:
            html = response.content.decode('gbk')
        soup = BeautifulSoup(html, 'lxml')
        return soup

    # get the page title
    def getTitle(self, soup):
        title = soup.find('h1').text
        return title

    # extract the video name from a title like 《name》
    def getName(self, title):
        try:
            name = re.search('《(.*?)》', title).group(1)
        except AttributeError:
            name = title
        return name

    # download the poster image
    def getPic(self, soup, name, vid):
        imageLink = soup.find('img').get('src')
        if imageLink is not None:
            try:
                image = requests.get(imageLink, timeout=1)
                path = 'E:\\图片\\dy2018\\' + str(vid) + '.jpg'
                with open(path, 'wb') as pic:
                    pic.write(image.content)
            except Exception:
                print(' ' + name + '---poster download failed')
        else:
            print(' ' + name + '---this video has no poster')

    # extract download links: link cells carry word-wrap in their inline style
    def getLink(self, soup, vid):
        link = soup.find_all('td')
        links = []
        for l in link:
            try:
                if re.search('word-wrap', l.get('style').lower()) is not None:
                    links.append([l.text.strip(), vid])
            except AttributeError:
                # td without a style attribute
                continue
        return links

    # insert one video and its links
    def execute(self, url, cid):
        db = self.getDB()
        cursor = db.cursor()
        soup = self.getSoup(url)
        title = self.getTitle(soup)
        name = self.getName(title)
        cursor.execute(self.selectVideo, title)
        titleDB = cursor.fetchone()
        if titleDB is None:
            cursor.execute(self.insertVideo, (name.strip(), title.strip(), cid))
            vid = cursor.lastrowid
            links = self.getLink(soup, vid)
            if len(links) > 0:
                cursor.executemany(self.insertLink, links)
                self.getPic(soup, name, vid)
            else:
                print(' ' + name + '---could not extract links')
                db.rollback()
            print('--' + name + '--done')
        else:
            print('!!!!!! ' + name + ' already exists !!!!!!')
        db.commit()
        db.close()

    # collect video page links from one category page
    def getEachVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl)
        urls = soup.find_all(attrs={'class': 'ulink'})
        trueUrls = []
        for url in urls:
            trueUrl = url.get('href')
            if re.match(r'.*?\.html', trueUrl) is not None:
                trueUrls.append(trueUrl)
        return trueUrls

    # collect video page links from all pages of a category
    def getEveryVideoLinks(self, cateUrl):
        soup = self.getSoup(cateUrl).text
        # the page counter on the site looks like 页次:1/77
        pageCount = re.search(r'页次.*?\d*?/(\d*)', soup).group(1)
        pageNums = ['']
        for i in range(2, int(pageCount) + 1):
            pageNums.append('_' + str(i))
        everyTrueUrls = []
        for num in pageNums:
            url = cateUrl + '/index' + num + '.html'
            try:
                everyTrueUrls += self.getEachVideoLinks(url)
                print(url + ' page links fetched')
            except Exception:
                # retry once before giving up
                try:
                    everyTrueUrls += self.getEachVideoLinks(url)
                    print(url + ' page links fetched')
                except Exception:
                    print('+++++++++++++++ IMPORTANT: ' + url + ' failed +++++++++++++++')
                    continue
        return everyTrueUrls

    # list all category paths
    def getCategory(self):
        categorys = []
        for i in range(8, 21):
            categorys.append(str(i))
        categorys.append('html/tv/hytv')
        categorys.append('html/tv/hepai')
        categorys.append('html/tv/gangtai')
        categorys.append('html/tv/oumeitv')
        categorys.append('html/tv/rihantv')
        categorys.append('html/zongyi2013')
        categorys.append('html/2009zongyi')
        categorys.append('html/dongman')
        categorys.append('html/game')
        categorys.append('html/3gp')
        return categorys

    # crawl every link of every page of every category
    def getAllVideoLink(self, categorys):
        timeBegin = time.time()
        for i in range(0, len(categorys)):
            # get the category name
            url = 'https://www.dy2018.com/' + categorys[i]
            try:
                soup = self.getSoup(url)
            except Exception:
                # retry once before giving up
                try:
                    soup = self.getSoup(url)
                except Exception:
                    print('+++++++++++++++++ IMPORTANT: ' + url + ' failed +++++++++++++++++')
                    continue
            # if i < 20:
            #     titleAll = soup.find('h1').text
            #     categoryTitle = re.search('>(.*?)>', titleAll).group(1).strip()
            # elif i < 25:
            #     titleAll = soup.find('h1').find_all('a')
            #     categoryTitle = titleAll[2].text
            # else:
            titleAll = soup.find('h1').find_all('a')
            categoryTitle = titleAll[1].text
            db = self.getDB()
            cursor = db.cursor()
            cursor.execute(self.selectCategory, categoryTitle.strip())
            ca = cursor.fetchone()
            if ca is None:
                cursor.execute(self.insertCategory, categoryTitle.strip())
                cid = cursor.lastrowid
            else:
                print(categoryTitle + ' already exists')
                # reuse the existing id; lastrowid is not valid after a select
                cid = ca[0]
            db.commit()
            db.close()
            try:
                everyUrls = self.getEveryVideoLinks(url)
            except Exception:
                try:
                    everyUrls = self.getEveryVideoLinks(url)
                except Exception:
                    print('++++++++++++++++ IMPORTANT: ' + url + ' failed ++++++++++++++++')
                    continue
            timeGetUrls = time.time()
            printTime(timeGetUrls - timeBegin)
            for everyUrl in everyUrls:
                videoUrl = 'https://www.dy2018.com/' + everyUrl
                try:
                    self.execute(videoUrl, cid)
                except Exception:
                    try:
                        self.execute(videoUrl, cid)
                    except Exception as e:
                        print(e)
                        continue
                timeFinishOne = time.time()
                hasFinish = str(everyUrls.index(everyUrl) + 1) + ' / ' + str(len(everyUrls))
                printTime(timeFinishOne - timeBegin, hasFinish)
                # time.sleep(0.7)
            print('-------------------------' + categoryTitle + ' done ----------------------------')


if __name__ == '__main__':
    video = eachVideo()
    categorys = video.getCategory()
    video.getAllVideoLink(categorys)
