import requests
from bs4 import BeautifulSoup
import csv
import re
# Module-level state shared by the rest of the script.
# `result` collects one row (a list of fields) per scraped movie.
# `director` and `score` start out as empty lists as well.
result, director, score = [], [], []

# HTTP headers sent with every request; both values are placeholders
# that must be filled in before running the script.
headers = {
    "User-Agent": "",  # put your own browser User-Agent here
    "Cookie": "",  # put your own cookie here
}
# Scrape the Douban Top 250 list. The list is paged 25 entries at a time via
# the `start` query parameter; for every entry the movie's detail page is
# fetched as well. One row per movie is appended to `result`:
#   [title, nick, ratings, score, director, summary, awards]

# Seconds to wait for a response before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# Compiled once, outside the loops: captures the text after "导演: " up to
# the whitespace that precedes "主演" or a line break.
director_pattern = re.compile(r'导演: (.*?)(?=\s+主演|\s+\n)')

for start_num in range(0, 250, 25):
    response = requests.get(
        f"https://movie.douban.com/top250?start={start_num}",
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of parsing the error page as if
    # it were a normal list page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    for item in soup.find_all('div', 'info'):
        # --- title and alias ("nick") ---
        titles = item.find_all('span', 'title')
        title = titles[0].string.strip()
        other = item.find('span', 'other').string.strip()
        # When a second title exists, the alias is that title followed by
        # the "other" names; otherwise it is the "other" names alone.
        if len(titles) > 1 and titles[1].string:
            nick = titles[1].string.strip() + other
        else:
            nick = other
        # Drop non-breaking spaces in both cases (the fallback branch used
        # to assign the cleaned value to a misspelled variable and lose it).
        nick = nick.replace('\xa0', '')

        # --- detail page ---
        detail_url = item.find('a').get('href')
        detail_response = requests.get(
            detail_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        detail_response.raise_for_status()
        soup_add = BeautifulSoup(detail_response.text, 'html.parser')

        # Star-rating distribution, as the flattened text of the block.
        ratings = soup_add.find('div', 'ratings-on-weight').get_text(strip=True)
        # Plot summary.
        jian = soup_add.find('span', property="v:summary").get_text(strip=True)
        # Awards: items of one <ul class="award"> are joined with ", ",
        # the lists themselves with "; ". Empty when there are no awards.
        awards_text = '; '.join(
            ', '.join(
                award.get_text(strip=True)
                for award in award_list.find_all('li')
            )
            for award_list in soup_add.find_all('ul', class_='award')
        ).strip('; ')

        # --- fields taken from the list entry itself ---
        # Numeric score shown on the list page.
        fenshu = item.find('span', class_='rating_num').get_text().strip()
        # Director, extracted from the free-text credits paragraph;
        # "未知" (unknown) when the pattern does not match.
        info = item.find('div', 'bd').p.get_text().strip()
        director_match = director_pattern.search(info)
        director = director_match.group(1).strip() if director_match else "未知"

        result.append(
            [title, nick, ratings, fenshu, director, jian, awards_text]
        )





# #
# Write the scraped rows to movie.csv in the current working directory.
# newline='' is required by the csv module so it controls line endings
# itself; the file is overwritten on every run.
with open('movie.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # Header row; the column order matches the order of fields in each
    # row of `result`.
    csv_writer.writerow(['Title', 'Nick', 'Ratings', 'Score', 'Director', 'Summary', 'Awards'])
    # One line per movie.
    csv_writer.writerows(result)


# 插入到数据库中：首先要在数据库中建好表。

import pymysql.cursors
import pymysql
# Store every row of `result` in the MySQL table `movie_information`.
# The table must already exist with the seven columns named in the INSERT.
connect = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',  # replace with your own user name
    password='***',  # replace with your own password
    db='movie',  # name of the database to store into
    # charset='utf8mb4'  # uncomment if emoji or other special characters must be supported
)

# Parameterized INSERT; the driver fills in the %s placeholders, so the
# scraped text is never spliced into the SQL string.
insert_query = """
    INSERT INTO movie_information(title, nick, ratings, score, director, summary, awards)VALUES (%s, %s, %s, %s, %s, %s, %s)
"""

try:
    # The cursor is closed automatically when the `with` block exits.
    with connect.cursor() as cursor:
        # Insert all rows in one call instead of one execute() per row.
        cursor.executemany(insert_query, result)
    # Make the inserts permanent.
    connect.commit()
except Exception:
    # Undo any partial work so a failed run leaves the table unchanged,
    # then let the original error propagate.
    connect.rollback()
    raise
else:
    print("Data has been inserted successfully.")
finally:
    # Always release the connection, on success and on failure alike.
    connect.close()



Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐