
Python 爬虫获取豆瓣 Top 250 数据——多线程
【代码】Python 爬虫获取豆瓣 Top 250 数据——多线程。
·
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from lxml import etree
class DBSendRequest:
    """Minimal HTTP client that sends GET requests with a browser User-Agent.

    The target URL is stored on the instance: assigning to ``sendRequest``
    sets it, and reading ``sendRequest`` performs the GET and returns the
    response.  Because the URL is shared instance state, a single instance
    must not be driven through this property from several threads at once.
    """

    # Seconds to wait for the server before giving up.  Without a timeout,
    # requests.get can block forever on a stalled connection.
    timeout = 10

    def __init__(self):
        self.url = ''
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
        }

    @property  # exposes the method as an attribute; reading it sends the request
    def sendRequest(self):
        """Send a GET request to ``self.url`` and return the response."""
        return requests.get(self.url, headers=self.headers, timeout=self.timeout)

    @sendRequest.setter
    def sendRequest(self, url):
        """Store *url* as the target of the next request."""
        self.url = url
class DealData(DBSendRequest):
    """Scrape the title and rating of every movie in the Douban Top 250.

    ``indexDealdata`` collects the detail-page URLs from the paginated list,
    ``pa_qu`` fans those URLs out to a thread pool, and ``get_data`` parses a
    single detail page and prints what it found.
    """

    # Seconds to wait for a detail page before giving up on it.
    REQUEST_TIMEOUT = 10

    # Serializes writes to the shared '1.html' debug dump so concurrent
    # workers cannot interleave their output in the same file.
    _dump_lock = threading.Lock()

    def indexDealdata(self):
        """Return the detail-page URLs found on all Top 250 list pages.

        The list shows 25 movies per page, so ``start`` steps through
        0, 25, ..., 225.  This method runs sequentially, so using the
        inherited ``sendRequest`` property (which stores the URL on the
        instance) is safe here.
        """
        urls_list = []
        for i in range(0, 250, 25):
            self.sendRequest = f"https://movie.douban.com/top250?start={i}&filter="
            response = self.sendRequest
            text_data = response.content.decode()
            tree = etree.HTML(text_data)
            urls = tree.xpath('//*[@id="content"]/div/div[1]/ol/li[*]/div/div[1]/a/@href')
            urls_list.extend(urls)
        return urls_list

    def pa_qu(self):
        """Scrape every detail page concurrently using a pool of 10 threads.

        Exceptions raised inside a worker are stored on its future and would
        otherwise be lost, so each failure is reported on stderr while the
        remaining pages continue to be processed.
        """
        urls = self.indexDealdata()
        with ThreadPoolExecutor(max_workers=10) as pool:
            futures = {pool.submit(self.get_data, url): url for url in urls}
            for future in as_completed(futures):
                error = future.exception()
                if error is not None:
                    print(f"failed to scrape {futures[future]}: {error!r}", file=sys.stderr)

    def get_data(self, url):
        """Fetch one movie detail page, print and return its title and rating.

        This runs on pool worker threads, so it must not go through the
        inherited ``sendRequest`` property: that stores the URL on the shared
        instance, and another worker could overwrite it between the
        assignment and the request, fetching the wrong page.

        Returns the dict that was printed; it is empty when the page did not
        contain the expected title/rating elements.
        """
        result = {}
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        text_html = response.content.decode()
        # Debug dump of the most recently fetched page.
        with self._dump_lock:
            with open('1.html', 'w', encoding='utf-8') as f:
                f.write(text_html)
        tree = etree.HTML(text_html)
        title = tree.xpath('//*[@id="content"]/h1/span[1]/text()')
        douban_score = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
        for t, d in zip(title, douban_score):
            result = {'电影名': t, '豆瓣评分': d}
        # Printed even when empty so pages that failed to parse stay visible.
        print(result)
        return result
if __name__ == '__main__':
    # Script entry point: scrape the whole Top 250 list.
    DealData().pa_qu()
更多推荐
所有评论(0)