Scraping XLWB with Python (title, date, body text, likes, comments, reposts, images, etc.)
Using Python and XPath to scrape XLWB content, including the author, title, body text, paginated crawling, image downloads, and more.
1. Tools
requests, lxml (XPath)
2. Workflow
Enter a keyword in the search bar and scrape the posts on every result page, including the title, publisher, date, body text, like count, comment count, repost count, images, and so on. Multiple pages are crawled, and the results for each keyword are saved to an Excel file. The search URL used for this is sketched below.
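For reference, this is the URL pattern that download_data() in section 4 assembles for each keyword and page; the placeholder values here are only for illustration, and weibo_url is the WB search endpoint you fill in yourself.

# Sketch of the search URL built in download_data() below (illustrative values only)
weibo_url = ""            # the WB search endpoint, same placeholder as in the full code
key, page = "keyword", 1
startTime, endTime = "2022-01-01-0", "2022-01-31-24"
url = f"{weibo_url}?q={key}&typeall=1&haspic=1&timescope=custom:{startTime}:{endTime}&Refer=g&page={page}"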
3. Notes
The cookie has to be copied manually from the browser's dev-tools network panel; otherwise the cookie value hard-coded in the script expires and nothing gets scraped.
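A minimal sketch of how the cookie and User-Agent end up in the request headers, assuming the same search endpoint placeholder as in the full code; one rough way to spot an expired cookie is to check whether the result container (the pl_feedlist_index element that all the XPath expressions rely on) is still present in the response.

import requests
from lxml import etree

weibo_url = ""  # the WB search URL, same placeholder as in the full code
head = {
    "cookie": "paste the cookie copied from the browser dev tools here",
    "User-Agent": "paste your browser's User-Agent here",
}
resp = requests.get(f"{weibo_url}?q=test", headers=head)
resp.encoding = "utf-8"
html = etree.HTML(resp.text)
# if the cookie has expired, the search result container is typically missing
if not html.xpath('//*[@id="pl_feedlist_index"]'):
    print("cookie looks expired, copy a fresh one from the browser")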
4. Code
import os
import threading
import time
import requests, xlwt
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

"""
WB: images, advanced-search mode
"""
def write_excel_xls(path, sheet_name, value):
    index = len(value)  # number of rows to write
    workbook = xlwt.Workbook()  # create a new workbook
    sheet = workbook.add_sheet(sheet_name)  # add a sheet to the workbook
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.write(i, j, value[i][j])  # write the cell at row i, column j
    workbook.save(path)  # save the workbook
    print("xls file written successfully!")

# Called once per keyword; the data from all pages is written to Excel in one go
def download_data(key, page, startTime, endTime, head):
    platform = "WB"
    emotion = "-"
    area = "-"
    type = "图片"
    duty = "-"
    relevance = "-"
    effect = "-"
    weather = "-"
    communication = "-"
    # holds the fields of every scraped post
    data = []
    # several posts may carry the same image; i is appended to the file name to tell them apart
    i = 0
    # worker-thread name/id, used to tell the concurrent tasks apart in the log output
    t_name = threading.current_thread().name
    t_id = threading.get_ident()
    xls_name = f'./xls/{platform}_{key}_{type}_{startTime}_{endTime}.xls'
    # full search
    weibo_url = ""  # fill in the WB search URL here
    url = f"{weibo_url}?q={key}&typeall=1&haspic=1&timescope=custom:{startTime}:{endTime}&Refer=g&page={str(page)}"
    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    html = etree.HTML(resp.text)
    resp.close()
    # total number of result pages
    pages = html.xpath('//*[@id="pl_feedlist_index"]/div[3]/div/span/ul/li/a/text()')
    max_page = len(pages)
    if max_page == 0:
        max_page = 1
    print(max_page)
    # ---------------- pagination: each iteration of the loop handles one result page ----------------
    while page <= max_page:
        # original posts
        url_list = url.split('&')
        url_list[-1] = f'page={str(page)}'
        url = "&".join(url_list)
        resp = requests.get(url, headers=head)
        resp.encoding = "utf-8"
        html = etree.HTML(resp.text)
        resp.close()
        # all post divs returned on this result page
        div_list = html.xpath('//*[@id="pl_feedlist_index"]/div[2]/div')
        # the post title always sits inside an <a></a> tag
        # iterate over every post div on the page
        for div in div_list:
            child_div = div.xpath('./div/div[1]/div[2]')[0]  # div node-type="like"
            # extract the title first, looking only at <a>title</a>; posts with no <a>,
            # or whose <a> text is "展开", are skipped because no title can be extracted from them
            orginAddressNode = child_div.xpath('./p[2]/a[1]')
            # skip if there is no <a>, or the <a> text is "展开"
            if len(orginAddressNode) == 0 or orginAddressNode[0].xpath('./text()')[0].strip() == '展开':
                continue
            title = orginAddressNode[0].xpath('./text()')[0].strip('#')
            if title == '网页链接':
                continue
            print(f"{t_id}: {t_name}: title:{title}")
            # publisher
            author = child_div.xpath('./div[1]/div[2]/a/text()')[0]
            # publish time
            publishTimeList = child_div.xpath('./p[1]/a[1]/text()')[0].strip()
            # if the year is missing, prepend the current year
            if "年" not in publishTimeList:
                publishTimeList = publishTimeList.split(" ")
                publishTimeList.insert(0, "2022年")
                publishTime = "-".join(publishTimeList) \
                    .replace('年', '') \
                    .replace('月', '-') \
                    .replace('日', '')
            else:
                publishTime = publishTimeList \
                    .replace('年', '-') \
                    .replace('月', '-') \
                    .replace('日 ', '-')
            # replace the last '-' (between date and time) with a space
            idx = publishTime.rfind('-')
            temp = list(publishTime)
            temp[idx] = ' '
            publishTime = "".join(temp)
            # source URL of the post: the href behind the publish-time link
            sourceAddress = "https:" + child_div.xpath("./p[1]/a[1]/@href")[0]
            node_type = div.xpath('./div/div[1]/div[2]/p/@node-type')
            print(f'node_type:{node_type}')
            # skip posts that are plain reposts
            reprint_list = child_div.xpath('.//i/text()')
            print(reprint_list)
            if 'O' in reprint_list:
                print('repost, skipped')
                continue
            if 'feed_list_content_full' in node_type:
                # "展开全文" is present: take the text from the full-content node
                content = div.xpath('./div/div[1]/div[2]/p[3]/text()')
            else:
                # no "展开全文"
                content = div.xpath('./div/div[1]/div[2]/p[2]/text()')
            content = list(map(lambda s: s.strip('\n '), content))
            content = list(filter(lambda s: s.strip(), content))
            content = list(map(lambda s: s.strip('\u200b'), content))
            content = list(map(lambda s: s.strip(), content))
            print(content)
            # append a full stop to fragments that do not already end with punctuation
            for n, s in enumerate(content):  # n (not i), so the image file-name counter above is not clobbered
                if s.endswith(',') \
                        or s.endswith('。') \
                        or s.endswith(',') \
                        or s.endswith('.') \
                        or s.endswith('?') \
                        or s.endswith('!') \
                        or s.endswith('?') \
                        or s.endswith('!') \
                        or s.endswith('、') \
                        or s.endswith(';') \
                        or s.endswith(';') \
                        or s.endswith(':') \
                        or s.endswith(':'):
                    continue
                elif s == '\u200b':
                    content.remove(s)
                elif s == '':
                    content.remove(s)
                elif s == ' ':
                    content.remove(s)
                else:
                    content[n] = s + '。'
            # post body
            content = "".join(content)
            # if the text contains an empty 【】 pair, drop everything up to and including it
            if '【' in content and '】' in content:
                idx1 = content.index('【')
                idx2 = content.index('】')
                if idx2 == idx1 + 1:
                    content = content[idx2 + 1:]
            print(f'{t_id}: {t_name}: content:{content}')
            # repost count
            forward = div.xpath('./div/div[2]/ul/li[1]/a/text()')[1].strip()
            if forward == "转发":
                forward = "0"
            print(f'{t_id}: {t_name}: forward:{forward}')
            # comment count
            comment = div.xpath('./div/div[2]/ul/li[2]/a/text()')[0].strip()
            if comment == "评论":
                comment = "0"
            print(f'{t_id}: {t_name}: comment:{comment}')
            # like count
            like = div.xpath('./div/div[2]/ul/li[3]/a/button/span[2]/text()')[0]
            if like == "赞":
                like = "0"
            print(f'{t_id}: {t_name}: like:{like}')
            try:
                image_list = child_div.xpath('./div[2]/div/ul/li/img/@src')
                # image URL
                image_url = image_list[0]
                # switch to the high-resolution variant
                temp_list = image_url.split('/')
                print(temp_list)
                temp_list[3] = 'mw1024'
                image_url = temp_list[0] + "//" + temp_list[2] + '/' + temp_list[3] + '/' + temp_list[4]
                if "sinaimg" not in image_url:
                    print("image is not hosted on WB")
                    continue
                # file extension
                suffix = image_url.split('.')[-1]
            except:
                # skip the download and scraping if the image link cannot be processed
                print("problem with the image URL")
                continue
            print(f'{t_id}: {t_name}: {image_url}')
            # attachment
            file_name = f"images/{''.join(publishTime.split(' ')[0].split('-'))}_{author}_{title}_{platform}.{suffix}"
            if os.path.exists(file_name):
                # a file with this name already exists; append a counter so it is not overwritten
                print('image file already exists')
                i += 1
                file_name = f"images/{''.join(publishTime.split(' ')[0].split('-'))}_{author}_{title}{i}_{platform}.{suffix}"
            resp = requests.get(image_url)
            if resp.status_code != requests.codes.ok:
                print('image request did not return 200')
                continue
            else:
                # download the image
                with open(file_name, mode='wb') as f:
                    f.write(resp.content)
            data.append([title, key, publishTime, author, content, platform, sourceAddress, emotion, area, type,
                         forward, comment, like, duty, relevance, effect, weather, communication, file_name])
        page += 1
        time.sleep(1)
    print("data:", len(data))
    if len(data) > 0:
        write_excel_xls(xls_name, "sheet1", data)

def main():
    # enter the search keywords here
    keys = ['', '', '']
    page = 1
    startTime = "2022-01-01-0"
    endTime = "2022-01-31-24"
    # fill in your own cookie and User-Agent
    head = {
        "cookie": "",
        "User-Agent": ""
    }
    executor = ThreadPoolExecutor(max_workers=15)
    all_task = [executor.submit(download_data, key, page, startTime, endTime, head) for key in keys]
    wait(all_task, return_when=ALL_COMPLETED)

if __name__ == '__main__':
    main()
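One practical note: the script saves the xls files under ./xls/ and the downloaded images under images/, but it never creates either directory, so create them beforehand (or add something like the lines below before calling main()); otherwise workbook.save() and open() will raise FileNotFoundError.

import os
os.makedirs('./xls', exist_ok=True)   # per-keyword xls files go here
os.makedirs('images', exist_ok=True)  # downloaded images go here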
5. Results