Scraping XLWB with Python (title, date, body text, likes, comments, reposts, images, etc.)
Using Python and XPath to scrape XLWB content, including the author, title, body text, paginated crawling, image downloads, and more.
1. Tools
requests, lxml (XPath)
2. Workflow
Enter a keyword in the search bar and scrape the posts on every result page, including the title, publisher, date, body text, like count, comment count, repost count, images, and so on. Multiple pages are crawled, and the results for each keyword are saved to an Excel file. The search URL used for this is sketched below.
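For reference, this is the URL pattern that download_data() in section 4 assembles for each keyword and page; the placeholder values here are only for illustration, and weibo_url is the WB search endpoint you fill in yourself.

# Sketch of the search URL built in download_data() below (illustrative values only)
weibo_url = ""            # the WB search endpoint, same placeholder as in the full code
key, page = "keyword", 1
startTime, endTime = "2022-01-01-0", "2022-01-31-24"
url = f"{weibo_url}?q={key}&typeall=1&haspic=1&timescope=custom:{startTime}:{endTime}&Refer=g&page={page}"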
3. Notes
The cookie has to be copied manually from the browser's dev-tools network panel; otherwise the cookie value hard-coded in the script expires and nothing gets scraped.
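A minimal sketch of how the cookie and User-Agent end up in the request headers, assuming the same search endpoint placeholder as in the full code; one rough way to spot an expired cookie is to check whether the result container (the pl_feedlist_index element that all the XPath expressions rely on) is still present in the response.

import requests
from lxml import etree

weibo_url = ""  # the WB search URL, same placeholder as in the full code
head = {
    "cookie": "paste the cookie copied from the browser dev tools here",
    "User-Agent": "paste your browser's User-Agent here",
}
resp = requests.get(f"{weibo_url}?q=test", headers=head)
resp.encoding = "utf-8"
html = etree.HTML(resp.text)
# if the cookie has expired, the search result container is typically missing
if not html.xpath('//*[@id="pl_feedlist_index"]'):
    print("cookie looks expired, copy a fresh one from the browser")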
4. Code
import os
import threading
import time
import requests, xlwt
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

"""
WB: images, advanced-search mode
"""
def write_excel_xls(path, sheet_name, value):
    index = len(value)  # number of rows to write
    workbook = xlwt.Workbook()  # create a new workbook
    sheet = workbook.add_sheet(sheet_name)  # add a sheet to the workbook
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.write(i, j, value[i][j])  # write the cell at row i, column j
    workbook.save(path)  # save the workbook
    print("xls file written successfully!")

# Called once per keyword; the data from all pages is written to Excel in one go
def download_data(key, page, startTime, endTime, head):
    platform = "WB"
    emotion = "-"
    area = "-"
    type = "图片"
    duty = "-"
    relevance = "-"
    effect = "-"
    weather = "-"
    communication = "-"
    # holds the fields of every scraped post
    data = []
    # several posts may carry the same image; i is appended to the file name to tell them apart
    i = 0
    # worker-thread name/id, used to tell the concurrent tasks apart in the log output
    t_name = threading.current_thread().name
    t_id = threading.get_ident()
    xls_name = f'./xls/{platform}_{key}_{type}_{startTime}_{endTime}.xls'
    # full search
    weibo_url = ""  # fill in the WB search URL here
    url = f"{weibo_url}?q={key}&typeall=1&haspic=1&timescope=custom:{startTime}:{endTime}&Refer=g&page={str(page)}"
    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    html = etree.HTML(resp.text)
    resp.close()
    # total number of result pages
    pages = html.xpath('//*[@id="pl_feedlist_index"]/div[3]/div/span/ul/li/a/text()')
    max_page = len(pages)
    if max_page == 0:
        max_page = 1
    print(max_page)
    # ---------------- pagination: each iteration of the loop handles one result page ----------------
    while page <= max_page:
        # original posts
        url_list = url.split('&')
        url_list[-1] = f'page={str(page)}'
        url = "&".join(url_list)
        resp = requests.get(url, headers=head)
        resp.encoding = "utf-8"
        html = etree.HTML(resp.text)
        resp.close()
        # all post divs returned on this result page
        div_list = html.xpath('//*[@id="pl_feedlist_index"]/div[2]/div')
        # the post title always sits inside an <a></a> tag
        # iterate over every post div on the page
        for div in div_list:
            child_div = div.xpath('./div/div[1]/div[2]')[0]  # div node-type="like"
            # extract the title first, looking only at <a>title</a>; posts with no <a>,
            # or whose <a> text is "展开", are skipped because no title can be extracted from them
            orginAddressNode = child_div.xpath('./p[2]/a[1]')
            # skip if there is no <a>, or the <a> text is "展开"
            if len(orginAddressNode) == 0 or orginAddressNode[0].xpath('./text()')[0].strip() == '展开':
                continue
            title = orginAddressNode[0].xpath('./text()')[0].strip('#')
            if title == '网页链接':
                continue
            print(f"{t_id}: {t_name}: title:{title}")
            # publisher
            author = child_div.xpath('./div[1]/div[2]/a/text()')[0]
            # publish time
            publishTimeList = child_div.xpath('./p[1]/a[1]/text()')[0].strip()
            # if the year is missing, prepend the current year
            if "年" not in publishTimeList:
                publishTimeList = publishTimeList.split(" ")
                publishTimeList.insert(0, "2022年")
                publishTime = "-".join(publishTimeList) \
                    .replace('年', '') \
                    .replace('月', '-') \
                    .replace('日', '')
            else:
                publishTime = publishTimeList \
                    .replace('年', '-') \
                    .replace('月', '-') \
                    .replace('日 ', '-')
            # replace the last '-' (between date and time) with a space
            idx = publishTime.rfind('-')
            temp = list(publishTime)
            temp[idx] = ' '
            publishTime = "".join(temp)
            # source URL of the post: the href behind the publish-time link
            sourceAddress = "https:" + child_div.xpath("./p[1]/a[1]/@href")[0]
            node_type = div.xpath('./div/div[1]/div[2]/p/@node-type')
            print(f'node_type:{node_type}')
            # skip posts that are plain reposts
            reprint_list = child_div.xpath('.//i/text()')
            print(reprint_list)
            if 'O' in reprint_list:
                print('repost, skipped')
                continue
            if 'feed_list_content_full' in node_type:
                # "展开全文" is present: take the text from the full-content node
                content = div.xpath('./div/div[1]/div[2]/p[3]/text()')
            else:
                # no "展开全文"
                content = div.xpath('./div/div[1]/div[2]/p[2]/text()')
            content = list(map(lambda s: s.strip('\n '), content))
            content = list(filter(lambda s: s.strip(), content))
            content = list(map(lambda s: s.strip('\u200b'), content))
            content = list(map(lambda s: s.strip(), content))
            print(content)
            # append a full stop to fragments that do not already end with punctuation
            for n, s in enumerate(content):  # n (not i), so the image file-name counter above is not clobbered
                if s.endswith(',') \
                        or s.endswith('。') \
                        or s.endswith(',') \
                        or s.endswith('.') \
                        or s.endswith('?') \
                        or s.endswith('!') \
                        or s.endswith('?') \
                        or s.endswith('!') \
                        or s.endswith('、') \
                        or s.endswith(';') \
                        or s.endswith(';') \
                        or s.endswith(':') \
                        or s.endswith(':'):
                    continue
                elif s == '\u200b':
                    content.remove(s)
                elif s == '':
                    content.remove(s)
                elif s == ' ':
                    content.remove(s)
                else:
                    content[n] = s + '。'
            # post body
            content = "".join(content)
            # if the text contains an empty 【】 pair, drop everything up to and including it
            if '【' in content and '】' in content:
                idx1 = content.index('【')
                idx2 = content.index('】')
                if idx2 == idx1 + 1:
                    content = content[idx2 + 1:]
            print(f'{t_id}: {t_name}: content:{content}')
            # repost count
            forward = div.xpath('./div/div[2]/ul/li[1]/a/text()')[1].strip()
            if forward == "转发":
                forward = "0"
            print(f'{t_id}: {t_name}: forward:{forward}')
            # comment count
            comment = div.xpath('./div/div[2]/ul/li[2]/a/text()')[0].strip()
            if comment == "评论":
                comment = "0"
            print(f'{t_id}: {t_name}: comment:{comment}')
            # like count
            like = div.xpath('./div/div[2]/ul/li[3]/a/button/span[2]/text()')[0]
            if like == "赞":
                like = "0"
            print(f'{t_id}: {t_name}: like:{like}')
            try:
                image_list = child_div.xpath('./div[2]/div/ul/li/img/@src')
                # image URL
                image_url = image_list[0]
                # switch to the high-resolution variant
                temp_list = image_url.split('/')
                print(temp_list)
                temp_list[3] = 'mw1024'
                image_url = temp_list[0] + "//" + temp_list[2] + '/' + temp_list[3] + '/' + temp_list[4]
                if "sinaimg" not in image_url:
                    print("image is not hosted on WB")
                    continue
                # file extension
                suffix = image_url.split('.')[-1]
            except:
                # skip the download and scraping if the image link cannot be processed
                print("problem with the image URL")
                continue
            print(f'{t_id}: {t_name}: {image_url}')
            # attachment
            file_name = f"images/{''.join(publishTime.split(' ')[0].split('-'))}_{author}_{title}_{platform}.{suffix}"
            if os.path.exists(file_name):
                # a file with this name already exists; append a counter so it is not overwritten
                print('image file already exists')
                i += 1
                file_name = f"images/{''.join(publishTime.split(' ')[0].split('-'))}_{author}_{title}{i}_{platform}.{suffix}"
            resp = requests.get(image_url)
            if resp.status_code != requests.codes.ok:
                print('image request did not return 200')
                continue
            else:
                # download the image
                with open(file_name, mode='wb') as f:
                    f.write(resp.content)
            data.append([title, key, publishTime, author, content, platform, sourceAddress, emotion, area, type,
                         forward, comment, like, duty, relevance, effect, weather, communication, file_name])
        page += 1
        time.sleep(1)
    print("data:", len(data))
    if len(data) > 0:
        write_excel_xls(xls_name, "sheet1", data)

def main():
    # enter the search keywords here
    keys = ['', '', '']
    page = 1
    startTime = "2022-01-01-0"
    endTime = "2022-01-31-24"
    # fill in your own cookie and User-Agent
    head = {
        "cookie": "",
        "User-Agent": ""
    }
    executor = ThreadPoolExecutor(max_workers=15)
    all_task = [executor.submit(download_data, key, page, startTime, endTime, head) for key in keys]
    wait(all_task, return_when=ALL_COMPLETED)

if __name__ == '__main__':
    main()
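One practical note: the script saves the xls files under ./xls/ and the downloaded images under images/, but it never creates either directory, so create them beforehand (or add something like the lines below before calling main()); otherwise workbook.save() and open() will raise FileNotFoundError.

import os
os.makedirs('./xls', exist_ok=True)   # per-keyword xls files go here
os.makedirs('images', exist_ok=True)  # downloaded images go here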
5. Results