python-每日一练-抽取某本书的前50条短评内容并计算评分的平均值
python-每日一练-抽取某本书的前50条短评内容并计算评分的平均值:'''抽取某本书的前50条短评内容并计算评分的平均值。提示:有的评论中并不包含评分。''' import requests; import re; from bs4 import BeautifulSoup; import time; from functools import reduce; headers = {'User-Agent': 'Mo…
·
python-每日一练-抽取某本书的前50条短评内容并计算评分的平均值
'''
抽取某本书的前50条短评内容并计算评分的平均值。提示:有的评论中并不包含评分。
'''
import requests
import re
from bs4 import BeautifulSoup
import time
from functools import reduce
# HTTP request headers passed as `headers=` to the requests.get call in
# get_shorts.
# NOTE(review): the Cookie value is a hard-coded string copied from a browser
# session; it appears to contain session/login identifiers. Confirm whether it
# is still valid and consider loading it from the environment instead of
# keeping it in source.
headers = {
# Browser-style User-Agent string (Chrome 96 on Windows 10, per the literal).
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
# Raw Cookie header value, sent verbatim.
'Cookie': 'll="118159"; bid=Tlj9LZXK6qY; __utmc=30149280; __utmc=81379588; gr_user_id=4cc23846-8110-487a-9875-d2c22a01ffc5; _vwo_uuid_v2=DDF9B30D066068B64AEED05150ED2CC21|59ca475fd37ba868d8b940b30cb8051c; __gads=ID=30286cce13d6257e-222fedbd9fd00006:T=1644908408:RT=1644908408:S=ALNI_Mbxvt-k0lb2MC6tAcI60A5qPobCCw; __yadk_uid=OU6iPv3WVtSb7pup23vqweBrH4Gj0jdG; _ga=GA1.1.1475140875.1644908471; refer_url=https://read.douban.com/category/105; viewed="35630000_35620000_35610000_35690000_10790000_10800000_10900000_11000000_11000044_11000544"; dbcl2="191997283:aBzcj8HkRFg"; ck=7X40; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=9240ed32-5eb3-42cf-941a-95cdfa3e0efa; gr_cs1_9240ed32-5eb3-42cf-941a-95cdfa3e0efa=user_id%3A1; __utma=30149280.1907998096.1644908381.1644913515.1644932425.3; __utmz=30149280.1644932425.3.2.utmcsr=read.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt_douban=1; __utmz=81379588.1644932425.3.2.utmcsr=read.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=81379588.181811108.1644908384.1644913517.1644932425.3; __utmt=1; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1644932425%2C%22https%3A%2F%2Fread.douban.com%2F%22%5D; _pk_ses.100001.3ac3=*; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_9240ed32-5eb3-42cf-941a-95cdfa3e0efa=true; push_doumail_num=0; push_noty_num=0; _ga_RXNMP372GL=GS1.1.1644931471.4.1.1644932479.60; __utmt=1; __utmv=30149280.19199; __utmb=30149280.9.10.1644932425; __utmb=81379588.5.10.1644932425; _pk_id.100001.3ac3=5c3c19525f813f77.1644908384.3.1644932627.1644914955.'
}
# Matches the rating marker inside a serialized comment item and captures the
# digits that follow "allstar". Restricting the capture to digits keeps int()
# from failing on stray text that a greedy ".*" could swallow.
_RATING_RE = re.compile(r'"user-stars allstar(\d+) rating"')


# Collect the ratings found on one page of short comments
def get_shorts(arr, num, start):
    """Fetch one page of short comments and append their ratings to ``arr``.

    Comments that carry no rating marker are skipped.

    Args:
        arr: List of integer ratings collected so far; extended in place.
        num: Maximum total number of ratings ``arr`` may hold.
        start: Value substituted into the ``start=`` query parameter of the
            comments URL (the page itself is requested with ``limit=20``).

    Returns:
        The tuple ``(arr, start)``: the same list object that was passed in
        and the unchanged ``start`` value.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            10-second timeout.
    """
    # Pause before every request so repeated calls are throttled.
    time.sleep(1)
    url = 'https://book.douban.com/subject/26873486/comments/?start={}&limit=20&status=P&sort=new_score'.format(start)
    # A timeout prevents the script from hanging forever on a stalled socket.
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code != 200:
        print('获取失败', r.status_code)
        return arr, start

    soup = BeautifulSoup(r.text, 'lxml')
    for item in soup.find_all('li', 'comment-item'):
        if len(arr) >= num:
            # Enough ratings collected; no need to scan the rest of the page.
            break
        match = _RATING_RE.search(str(item))
        if match:
            arr.append(int(match.group(1)))
    return arr, start
# Ratings collected so far (integers parsed from the "allstarNN" marker;
# presumably NN is the star count times ten -- confirm against the site).
arr_score = []
# Number of ratings to collect
score_num = 50
# Offset of the first comment on the page being requested
start = 0
# get_shorts requests pages with limit=20, so the offset must advance by a
# full page; stepping by 1 would re-read 19 of the same comments each time
# and count their ratings repeatedly.
page_size = 20
# Give up after this many consecutive pages that add no rating, so a failing
# request or a book with fewer than score_num rated comments cannot loop
# forever.
max_empty_pages = 3
empty_pages = 0
# Collect ratings page by page until enough have been gathered
while len(arr_score) < score_num and empty_pages < max_empty_pages:
    collected_before = len(arr_score)
    arr, start = get_shorts(arr_score, score_num, start)
    start += page_size
    if len(arr_score) > collected_before:
        empty_pages = 0
    else:
        empty_pages += 1
# Average rating; true division keeps the fractional part, and the empty case
# is reported instead of raising.
if arr_score:
    avg_score = sum(arr_score) / len(arr_score)
    # Print the result
    print('短评评分平均分为:', avg_score)
else:
    avg_score = None
    print('未获取到任何评分')
更多推荐
已为社区贡献2条内容
所有评论(0)