1. Tools

requests, xpath (via lxml), xlwt

2. Workflow

Enter a keyword into the search bar and scrape the WB posts on each result page, including title, publisher, date, body text, like count, comment count, repost count, images, and so on. Multiple pages are crawled, and the results for each keyword are saved as an Excel file.
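The search URL carries the keyword, an original/has-picture filter, a custom time range, and the page number as query parameters. A minimal sketch of assembling one page's URL (the base search URL is deliberately left blank here, as in the full code below):

key, page = "example", 1
startTime, endTime = "2022-01-01-0", "2022-01-31-24"
weibo_url = ""  # fill in the WB search URL
url = f"{weibo_url}?q={key}&typeall=1&haspic=1&timescope=custom:{startTime}:{endTime}&Refer=g&page={page}"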

3. Notes

The cookie must be copied manually from the browser's devtools (network capture); otherwise the cookie value in the code expires and the scrape returns no results.
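Because an expired cookie tends to produce an empty result list or a redirect to a login page, it is worth checking the cookie before starting a long crawl. A minimal sketch (the login-redirect heuristic is an assumption; the search URL is left blank, as in the code below):

import requests

head = {
    "cookie": "",       # paste the cookie copied from the browser's devtools here
    "User-Agent": "",   # paste your browser's User-Agent here
}
# request the search page without following redirects (fill in the WB search URL)
resp = requests.get("", headers=head, allow_redirects=False)
if resp.is_redirect or "login" in resp.headers.get("Location", ""):
    print("cookie looks expired: redirected to a login page")
else:
    print("cookie accepted, status", resp.status_code)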

4. Code

import os
import threading
import time
import requests, xlwt
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

"""
WB:图片,高级搜索模式
"""
def write_excel_xls(path, sheet_name, value):
    index = len(value)  # number of rows to write
    workbook = xlwt.Workbook()  # create a new workbook
    sheet = workbook.add_sheet(sheet_name)  # add a sheet to the workbook
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.write(i, j, value[i][j])  # write the cell at row i, column j
    workbook.save(path)  # save the workbook
    print("xls file written successfully!")
    
# Called once per keyword; the data from all pages is written to Excel in one go
def download_data(key, page, startTime, endTime, head):
    # id and name of the current worker thread, used to tag log output
    t_id = threading.current_thread().ident
    t_name = threading.current_thread().name

    platform = "WB"
    emotion = "-"
    area = "-"
    type = "图片"
    duty = "-"
    relevance = "-"
    effect = "-"
    weather = "-"
    communication = "-"

    # holds the extracted fields of every WB post
    data = []
    # several WB posts can share the same image; the counter disambiguates file names
    i = 0

    xls_name = f'./xls/{platform}_{key}_{type}_{startTime}_{endTime}.xls'

    # all results: original posts with pictures, custom time range
    weibo_url = ""  # fill in the WB search URL here
    url = f"{weibo_url}?q={key}&typeall=1&haspic=1&timescope=custom:{startTime}:{endTime}&Refer=g&page={str(page)}"
  

    resp = requests.get(url, headers=head)
    resp.encoding = "utf-8"
    html = etree.HTML(resp.text)
    resp.close()

    # get the total number of result pages
    pages = html.xpath('//*[@id="pl_feedlist_index"]/div[3]/div/span/ul/li/a/text()')
    max_page = len(pages)
    if max_page == 0:
        max_page = 1
    print(max_page)

    # ---------- pagination: each loop iteration scrapes one result page ----------

    while page <= max_page:

        # rebuild the URL, replacing the page parameter with the current page
        url_list = url.split('&')
        url_list[-1] = f'page={str(page)}'
        url = "&".join(url_list)

        resp = requests.get(url,headers=head)
        resp.encoding = "utf-8"
        html = etree.HTML(resp.text)
        resp.close()
        # divs of all WB posts returned by the search
        div_list = html.xpath('//*[@id="pl_feedlist_index"]/div[2]/div')  


        # post title: always the text inside an <a></a> tag
        # iterate over every WB post div on this page

        for div in div_list:
                child_div = div.xpath('./div/div[1]/div[2]')[0]  # div node-type="like"
                # extract the title from <a>title</a>; skip posts with no <a>,
                # or whose <a> text is "展开" ("expand"), since no title can be extracted from them
                originAddressNode = child_div.xpath('./p[2]/a[1]')
                # skip if there is no <a>, or if the <a> text is the "expand" link
                if len(originAddressNode) == 0 or originAddressNode[0].xpath('./text()')[0].strip() == '展开':
                    continue

                title = originAddressNode[0].xpath('./text()')[0].strip('#')
                if title == '网页链接':
                    continue

                print(f"{t_id}: {t_name}: title:{title}")
                # check whether the post has a "show full text" link

                # publisher / author
                author = child_div.xpath('./div[1]/div[2]/a/text()')[0]

                # publish time
                publishTimeList = child_div.xpath('./p[1]/a[1]/text()')[0].strip()

                # check whether the timestamp contains an explicit year
                if "年" not in publishTimeList:
                    publishTimeList = publishTimeList.split(" ")
                    # prepend the current year (the year is only omitted for current-year posts)
                    publishTimeList.insert(0, time.strftime("%Y年"))
                    publishTime = "-".join(publishTimeList) \
                        .replace('年', '') \
                        .replace('月', '-') \
                        .replace('日', '')
                else:
                    publishTime = publishTimeList \
                        .replace('年', '-') \
                        .replace('月', '-') \
                        .replace('日 ', '-')

                # replace the last '-' (between date and time) with a space
                idx = publishTime.rfind('-')
                temp = list(publishTime)
                temp[idx] = ' '
                publishTime = "".join(temp)
                # source URL of the post: the href attribute of the publish-time link
                sourceAddress = "https:" + child_div.xpath("./p[1]/a[1]/@href")[0]

                # node-type of the content <p>: tells whether a full-text node exists
                node_type = div.xpath('./div/div[1]/div[2]/p/@node-type')
                print(f'node_type:{node_type}')

                # skip posts that are direct reposts (marked with an 'O' icon)
                reprint_list = child_div.xpath('.//i/text()')
                print(reprint_list)
                if 'O' in reprint_list:
                    print('reposted WB, skipping')
                    continue


                if 'feed_list_content_full' in node_type:
                    # a "show full text" node exists: take the content from the full text
                    content = div.xpath('./div/div[1]/div[2]/p[3]/text()')
                else:
                    # no "show full text" node
                    content = div.xpath('./div/div[1]/div[2]/p[2]/text()')

                # normalize the text fragments: strip whitespace and zero-width characters,
                # drop empty fragments, and append a full stop to any fragment that
                # does not already end with punctuation
                punctuation = (',', '。', ',', '.', '?', '!', '?', '!', '、', ';', ';', ':', ':')
                cleaned = []
                for s in content:
                    s = s.strip('\n ').strip('\u200b').strip()
                    if not s:
                        continue
                    if not s.endswith(punctuation):
                        s += '。'
                    cleaned.append(s)
                content = cleaned
                print(content)

                # post body
                content = "".join(content)
                # drop a leading empty 【】 bracket pair
                if '【' in content and '】' in content:
                    idx1 = content.index('【')
                    idx2 = content.index('】')

                    if idx2 == idx1+1:
                        content = content[idx2+1:]

                print(f'{t_id}: {t_name}: content:{content}')


                # repost count
                forward = div.xpath('./div/div[2]/ul/li[1]/a/text()')[1].strip()
                if forward == "转发":
                    forward = "0"
                print(f'{t_id}: {t_name}: forward:{forward}')

                # comment count
                comment = div.xpath('./div/div[2]/ul/li[2]/a/text()')[0].strip()
                if comment == "评论":
                    comment = "0"
                print(f'{t_id}: {t_name}: comment:{comment}')

                # like count
                like = div.xpath('./div/div[2]/ul/li[3]/a/button/span[2]/text()')[0]
                if like == "赞":
                    like = "0"

                print(f'{t_id}: {t_name}: like:{like}')
         
                
                try:
                    image_list = child_div.xpath('./div[2]/div/ul/li/img/@src')

                    # image link (first image of the post)
                    image_url = image_list[0]

                    # rewrite the thumbnail URL into a high-resolution one
                    temp_list = image_url.split('/')
                    print(temp_list)
                    temp_list[3] = 'mw1024'
                    image_url = temp_list[0] + "//" + temp_list[2] + '/' + temp_list[3] + '/' + temp_list[4]
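                    # e.g. https://wx1.sinaimg.cn/orj360/abc.jpg -> https://wx1.sinaimg.cn/mw1024/abc.jpg
                    # (the size segment of the path is swapped for mw1024; the exact URL shape is an assumption)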
                    if "sinaimg" not in image_url:
                        print("非WB来源图片")
                        continue
                    # 提取后缀名
                    suffix = image_url.split('.')[-1]

                except:
                    # 若有不是weibo上的视频,则跳过下载与抓取
                    print("该图片链接有问题")
                    continue

                print(f'{t_id}: {t_name}: {image_url}')

                # attachment file name: yyyymmdd_author_title_platform.suffix
                file_name = f"images/{''.join(publishTime.split(' ')[0].split('-'))}_{author}_{title}_{platform}.{suffix}"

                if os.path.exists(file_name):
                    # the name is already taken (several posts can share one image):
                    # disambiguate with the counter instead of overwriting
                    print('image already exists, renaming')
                    i += 1
                    file_name = f"images/{''.join(publishTime.split(' ')[0].split('-'))}_{author}_{title}{i}_{platform}.{suffix}"

                resp = requests.get(image_url)
                if resp.status_code != requests.codes.ok:
                    print('image request did not return 200')
                    continue

                else:
                    # download the image to disk
                    with open(file_name, mode='wb') as f:
                        f.write(resp.content)

                data.append([title,key,publishTime,author,content,platform,sourceAddress,emotion,area,type,forward,comment,like,duty,relevance,effect,weather,communication,file_name])

              

  
        page += 1
        time.sleep(1)
    print("data:",len(data))
    if len(data) > 0:
        write_excel_xls(xls_name, "sheet1", data)

def main():


    # enter the search keywords here
    keys = ['','','']
    page = 1
    startTime = "2022-01-01-0"
    endTime = "2022-01-31-24"
    # fill in your own cookie and User-Agent
    head = {
        "cookie": "",
        "User-Agent": ""
    }

    
    executor = ThreadPoolExecutor(max_workers=15)
    all_task = [executor.submit(download_data, key,page,startTime,endTime,head) for key in keys]
    wait(all_task, return_when=ALL_COMPLETED)

if __name__ == '__main__':
    main()
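
To spot-check an output file, the generated .xls can be read back; a minimal sketch using xlrd (which still reads the legacy .xls files that xlwt writes; the file name below is a made-up example):

import xlrd

# open one generated workbook and print every row (file name is a placeholder)
workbook = xlrd.open_workbook('./xls/WB_keyword_图片_2022-01-01-0_2022-01-31-24.xls')
sheet = workbook.sheet_by_name('sheet1')
print(sheet.nrows, "rows")
for r in range(sheet.nrows):
    print(sheet.row_values(r))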

5. Results

[Figure: screenshot of the crawl results]
