爬取豆瓣Top250电影名称(中英文)、评分、评价人数、导演和子网页各评分段人数、‘剧情简介’以及‘获奖情况’信息并存入数据库
注意:若要把爬取结果插入数据库,需要先在数据库中建好对应的表。
·
import requests
from bs4 import BeautifulSoup
import csv
import re
# Module-level state shared by the scraping loop and the export steps.
# Each entry of `result` is one movie row:
# [title, nick, ratings, score, director, summary, awards].
result = []
# Declared by the script but never appended to anywhere in the visible code.
director = []
score = []

# HTTP headers sent with every request to Douban. Both values are placeholders
# that must be filled in before running the script.
headers = {
    "User-Agent": "",  # put your own browser User-Agent string here
    "Cookie": "",  # put your own Douban cookie here
}
# Seconds to wait for any single Douban response. Without a timeout,
# requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Captures the director credit from a list entry's info text. The capture stops
# before the cast label ("主演") or at the end of the credit line, whether or
# not that line carries trailing whitespace.
# NOTE(review): the "主..." alternative assumes Douban truncates long credit
# lines that way — confirm against the live markup.
DIRECTOR_PATTERN = re.compile(r'导演: (.*?)(?=\s+主演|\s+主\.{3}|\s*\n|\s*$)')


def _fetch_soup(url):
    """Download *url* with the shared headers and return the parsed HTML.

    Raises requests.HTTPError on a non-2xx answer so that a blocked or
    rate-limited request fails loudly instead of yielding empty fields.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _node_text(node):
    """Return the stripped text of a BeautifulSoup node, or '' if it is None."""
    return node.get_text(strip=True) if node is not None else ''


def _parse_director(info):
    """Extract the director credit from *info*; return "未知" if none is found."""
    match = DIRECTOR_PATTERN.search(info)
    name = match.group(1).strip() if match else ''
    return name or "未知"


def _scrape_detail(url):
    """Fetch a movie detail page and return (ratings, summary, awards).

    ratings is the flattened text of the rating-distribution box, summary the
    plot synopsis, and awards the award lists joined with '; ' (the items of
    one list are joined with ', '). Missing sections yield ''.
    """
    page = _fetch_soup(url)
    ratings = _node_text(page.find('div', 'ratings-on-weight'))
    summary = _node_text(page.find('span', property="v:summary"))
    award_groups = [
        ', '.join(li.get_text(strip=True) for li in group.find_all('li'))
        for group in page.find_all('ul', class_='award')
    ]
    awards = '; '.join(award_groups).strip('; ')
    return ratings, summary, awards


# Walk the ten list pages (25 movies each) and append one row per movie to
# the module-level `result` list.
for start_num in range(0, 250, 25):
    list_page = _fetch_soup(f"https://movie.douban.com/top250?start={start_num}")
    for item in list_page.find_all('div', 'info'):
        titles = item.find_all('span', 'title')
        if not titles:
            # Not a regular movie entry; nothing to record.
            continue
        title = _node_text(titles[0])

        # nick = second title (when present) followed by the alias list, with
        # the non-breaking spaces Douban uses as separators removed in every
        # case.
        second_title = _node_text(titles[1]) if len(titles) > 1 else ''
        other = _node_text(item.find('span', 'other'))
        nick = (second_title + other).replace('\xa0', '')

        # The first link of the entry points at the movie's detail page, which
        # holds the rating distribution, the synopsis and the awards.
        detail_url = item.find('a').get('href')
        ratings, jian, awards_text = _scrape_detail(detail_url)

        fenshu = _node_text(item.find('span', class_='rating_num'))

        # The first paragraph of the 'bd' box holds the credit line.
        body = item.find('div', 'bd')
        has_credits = body is not None and body.p is not None
        info = body.p.get_text().strip() if has_credits else ''

        result.append([
            title,
            nick,
            ratings,
            fenshu,
            _parse_director(info),
            jian,
            awards_text,
        ])
# #
# Export every scraped row to movie.csv, header line first.
header_row = ['Title', 'Nick', 'Ratings', 'Score', 'Director', 'Summary', 'Awards']
with open('movie.csv', mode='w', encoding='utf-8', newline='') as out_file:
    writer = csv.writer(out_file)
    # Header and data rows go out in one call; the written lines are the same
    # as writing the header first and the data afterwards.
    writer.writerows([header_row, *result])
# Bare expression kept as in the original: it displays the rows in an
# interactive session (e.g. a notebook cell) and does nothing in a script.
result
将数据插入数据库:运行前需要先在数据库中建好 movie_information 表(字段:title、nick、ratings、score、director、summary、awards)。
import pymysql.cursors
import pymysql
# Store every scraped row in the MySQL table `movie_information`.
# The table must already exist with the seven columns named in the INSERT.
connect = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',  # replace with your own MySQL user
    password='***',  # replace with your own password
    database='movie',  # name of the database to store into
    # The rows contain Chinese text, so the connection charset is set
    # explicitly instead of relying on the driver default.
    charset='utf8mb4',
)

# Parameterised INSERT: the driver escapes each value, so scraped text is
# never spliced into the SQL string.
insert_query = """
INSERT INTO movie_information(title, nick, ratings, score, director, summary, awards)VALUES (%s, %s, %s, %s, %s, %s, %s)
"""

try:
    # The cursor is closed when the with-block exits; executemany sends all
    # rows of `result` through the same statement.
    with connect.cursor() as cursor:
        cursor.executemany(insert_query, result)
    # Commit only after every row was accepted; if anything above raises,
    # nothing is committed.
    connect.commit()
finally:
    # Always release the connection, on success or on failure.
    connect.close()
print("Data has been inserted successfully.")
更多推荐
已为社区贡献1条内容
所有评论(0)