如何评价python语言_Python如何爬取京东的评价信息

import re; import time; import csv; import requests; from bs4 import BeautifulSoup; def write_a_row_in_csv(data, csv_doc): "save good information into a row in csv document" with open(csv_doc, 'a', newline='') as ...

weixin_39886024

95人浏览 · 2020-11-24 09:56:43

weixin_39886024 · 2020-11-24 09:56:43 发布

import re

import time

import csv

import requests

from bs4 import BeautifulSoup

def write_a_row_in_csv(data, csv_doc):
    """Append one row to a CSV document.

    Args:
        data: Iterable of field values that make up the row.
        csv_doc: Path of the CSV file. It is opened in append mode, so
            existing rows are kept and the file is created if missing.
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_doc, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(data)

# Download the JD search result page with a browser-like User-Agent and
# print the HTTP status code of the response.

url = 'https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BAp20&enc=utf-8&suggest=1.def.0.V13&wq=%E5%8D%8E%E4%B8%BA&pvid=f47b5d05bba84d9dbfabf983575a6875'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
}

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)

print(response.status_code)

# Save the downloaded search page to html.html (UTF-8) for inspection.

with open('html.html', 'w', encoding='utf8') as f:
    f.write(response.text)

# Create (or truncate) phone.csv and write the header row; the per-item
# rows are appended to the same file later.

with open('phone.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    fields = ('id', '名称', '价格', '评价人数', '好评率')
    writer.writerow(fields)

# Parse the search page, then for each of the first three result items
# print its name, link, price and comment URL, fetch its comment summary
# and append one row to phone.csv.

soup_all = BeautifulSoup(response.content, 'lxml')

sp_all_items = soup_all.find_all('li', attrs={'class': 'gl-item'})

for soup in sp_all_items[:3]:
    print('-' * 50)

    name_box = soup.find('div', attrs={'class': 'p-name p-name-type-2'})
    price_boxes = soup.find_all('div', attrs={'class': 'p-price'})
    comment_boxes = soup.find_all('div', attrs={'class': 'p-commit'})
    if name_box is None or not price_boxes or not comment_boxes:
        # Skip entries that lack the expected containers instead of
        # crashing on a None / empty lookup.
        print('skip: item does not have the expected layout')
        continue

    name = name_box.find('em').text
    print('name: ', name)

    item = name_box.find('a')
    # The first run of digits in the link is used as the product id.
    id_match = re.search(r'(\d+)', item['href'])
    if id_match is None:
        print('skip: no numeric id in', item['href'])
        continue
    item_id = id_match.group()
    print('item: ', item['href'], item_id)

    price_text = price_boxes[0].i.string
    print('price:', price_text)

    print('comment url:', comment_boxes[0].find('a').attrs['href'])

    # Short pause between items.
    time.sleep(0.2)

    # A referer pointing at the item page is added to the headers for
    # the comment request.
    comment_url = f'https://sclub.jd.com/comment/productPageComments.action?productId={item_id}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
    comment_headers = {
        "referer": f"https://item.jd.com/{item_id}.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    comment_response = requests.get(comment_url, headers=comment_headers, timeout=10)

    # Keep the raw body of the latest comment response for inspection
    # (the file is overwritten on every iteration).
    with open('html.json', 'w', encoding='utf8') as f:
        f.write(comment_response.text)

    try:
        summary = comment_response.json()['productCommentSummary']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON or did not contain the summary object.
        print('skip: comment summary unavailable for', item_id)
        continue

    comment_count = summary['commentCount']
    print('评价人数：', comment_count)

    good_rate = summary['goodRate']
    print('好评率：', good_rate)

    # record data into CSV sheet
    write_a_row_in_csv(('id' + item_id, name, price_text, comment_count, good_rate), 'phone.csv')

腾讯云开发者社区

腾讯云面向开发者汇聚海量精品云计算使用和开发经验，营造开放的云计算技术生态圈。

更多推荐

计算机网络微课堂笔记

腾讯云开发者社区

Rabbitmq在java中的使用

腾讯云开发者社区

java try catch 之后定位不到具体报错行_JAVA入门（三）上

点击蓝字｜关注我们。一、异常与异常处理。异常简介：代码中阻止当前方法或作用域继续实现的，称之为异常。Java 中的所有异常类都继承 Throwable 类，Exception 的父类是 Throwable。编码、环境、用户操作输入出现问题；由 Java 虚拟机自动抛出和自动捕获；需要手动添加抛出和捕获语句；文件找不到。Throwable、Error、Exception；虚拟机错误 VirtualMachineError...