# Python 爬取七猫(qimao)小说网站的免费章节
# 爬取七猫小说网站的免费内容;全部爬取完成后,界面会显示“爬取完成”。下载由后台线程执行,界面不会卡住。
import requests
import tkinter as tk
import time,random
from selenium import webdriver
from selenium.webdriver.common.by import By
from tkinter import messagebox
from bs4 import BeautifulSoup
from random import uniform
import queue
import os.path
import time
from requests.adapters import HTTPAdapter
from requests import packages
from fake_useragent import UserAgent
import threading
from threading import Thread, Event, Lock
class DownloadThread(Thread):
    """Daemon thread that downloads a list of chapters one after another.

    Each target is passed to the ``download_chapter`` callback together with
    the cancel flag, a lock, a shared remaining-chapter counter and the
    progress queue. When the thread stops, for whatever reason, it puts
    ``None`` on the queue as the end-of-download marker.

    Args:
        targets: Sized sequence of chapter descriptors handed to the callback
            one at a time.
        queue_data: Queue that receives progress values from the callback and
            the final ``None`` marker from this thread.
        download_chapter: Callable invoked as
            ``download_chapter(tar, cancel_flag, length_lock, length,
            queue_data)``. Returning the string ``'YES'`` stops the thread.
    """

    def __init__(self, targets, queue_data, download_chapter):
        super().__init__(daemon=True)
        self.targets = targets
        self.download_chapter = download_chapter
        self.cancel_flag = Event()
        # One-element list so the callback can decrement the counter in place.
        self.length = [len(targets)]
        self.length_lock = Lock()
        self.queue_data = queue_data
        print('Download thread initialized')

    def run(self):
        """Download every target until finished, cancelled or told to stop."""
        last_index = len(self.targets) - 1
        try:
            for index, tar in enumerate(self.targets):
                result = self.download_chapter(
                    tar,
                    self.cancel_flag,
                    self.length_lock,
                    self.length,
                    self.queue_data,
                )
                if result == 'YES' or self.cancel_flag.is_set():
                    break
                # Random 1-3 second pause between chapters. Waiting on the
                # event (instead of sleeping) lets cancel() interrupt the
                # pause, and no pause is needed after the last chapter.
                if index < last_index and self.cancel_flag.wait(uniform(1, 3)):
                    break
        finally:
            # Always signal completion, even if the callback raised, so the
            # consumer of the queue is never left waiting.
            self.queue_data.put(None)

    def cancel(self):
        """Ask the thread to stop before the next chapter."""
        self.cancel_flag.set()
class Reptile(tk.Tk):
    """Small Tkinter window that downloads a novel's chapters from qimao.com.

    The user types a book name and presses "begin". The first search hit is
    opened in headless Chrome to read its chapter list, and a
    ``DownloadThread`` then fetches the chapters one by one, saving each as a
    text file under ``D:/novel/``. Progress reaches the GUI through
    ``queue_data``, which ``check_queue`` polls once per second.
    """

    # Maps characters that are not allowed in Windows file names to "_".
    _FILENAME_TABLE = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    def __init__(self):
        super().__init__()
        self.download_thread = None  # no download running yet
        # Search URL prefix; the keyword is appended per search in runs().
        self.ur1 = 'https://www.qimao.com/search/index/?keyword='
        self.cancel_flag = threading.Event()  # window-level cancel flag
        self.queue_data = queue.Queue()
        # Route the window-close button through on_closing.
        self.protocol("WM_DELETE_WINDOW", self.on_closing)
        self.length = -1
        self.title('Reptile')
        # Fixed window size; resizing is disabled.
        self.geometry('220x109')
        self.resizable(False, False)
        self.times = 0
        # weight=0 keeps every row/column from stretching with the window.
        max_row = 2
        max_column = 2
        for i in range(max_row + 1):
            self.rowconfigure(i, weight=0)
        for i in range(max_column + 1):
            self.columnconfigure(i, weight=0)
        # Caption next to the book-name entry.
        self.lable1 = tk.Label(self, text="你爬取的小说:")
        self.lable1.grid(row=0, column=0)
        # Status label: remaining-chapter count or the completion message.
        self.lable = tk.Label(self, text="?")
        self.lable.grid(row=1, column=1)
        # Caption next to the status label.
        self.lable2 = tk.Label(self, text="当前剩余章节数")
        self.lable2.grid(row=1, column=0)
        # Entry for the book name; get_name refreshes self.getname on each key.
        self.entry = tk.Entry(self, width=10)
        self.entry.bind("<KeyRelease>", self.get_name)
        self.entry.grid(row=0, column=1)
        # Button that starts the crawl.
        begin_reptile = tk.Button(self, text='begin', command=self.begin_reptile)
        begin_reptile.grid(row=3, column=1)
        # Start the single polling loop; it reschedules itself every second.
        self.check_queue()

    def on_closing(self):
        """Handle the window-close request.

        If a download is running the user is asked to confirm. Declining
        keeps the window open; accepting cancels the worker and waits up to
        five seconds for it before the window is destroyed.
        """
        worker = self.download_thread
        if worker is not None and worker.is_alive():
            if not messagebox.askokcancel(
                    "Quit",
                    "Do you want to quit? This will cancel all ongoing downloads."):
                return
            worker.cancel()
            # Bounded wait: the worker is a daemon thread, so a slow request
            # cannot keep the process alive after the window is gone.
            worker.join(timeout=5)
            if worker.is_alive():
                print("Download thread did not terminate in time.")
        self.destroy()

    def begin_reptile(self):
        """Button callback: start a crawl for the entered book name."""
        self.runs()

    def check_queue(self):
        """Apply all pending progress messages to the status label.

        Integers on the queue are remaining-chapter counts; ``None`` marks the
        end of a download. The method reschedules itself every second so one
        loop serves every download started during the window's lifetime.
        """
        try:
            while True:
                remaining = self.queue_data.get_nowait()
                if remaining is None:
                    self.lable.config(text='爬取完成')
                else:
                    self.lable.config(text=f"还有{remaining}章需要爬取")
        except queue.Empty:
            pass
        self.after(1000, self.check_queue)

    def get_name(self, event=None):
        """Store the entry's raw text in ``self.getname`` and return it stripped.

        Args:
            event: Tk event supplied when used as a key binding; unused.
        """
        self.getname = self.entry.get()
        return self.getname.strip()

    def runs(self):
        """Look up the entered book and start downloading its chapters.

        Does nothing beyond a message box when the name is empty, when a
        download is already running, or when the chapter list cannot be
        fetched.
        """
        # Local import: the file's import block does not provide quote().
        from urllib.parse import quote

        name = self.get_name()
        if not name:
            messagebox.showwarning("Reptile", "Please enter a book name first.")
            return
        if self.download_thread is not None and self.download_thread.is_alive():
            messagebox.showinfo("Reptile", "A download is already running.")
            return
        # Build the URL locally so repeated searches never accumulate
        # keywords on self.ur1; quote() keeps special characters intact.
        search_url = self.ur1 + quote(name, safe='')
        print(search_url)
        try:
            # NOTE: this runs on the GUI thread, so the window is
            # unresponsive while the chapter list is being fetched.
            target = self.getur1(search_url)
        except Exception as e:  # GUI boundary: report instead of crashing
            print(f"获取章节目录失败: {e}")
            messagebox.showerror("Reptile", f"Could not fetch the chapter list: {e}")
            return
        self.download_thread = DownloadThread(
            target, self.queue_data, self.download_chapter)
        self.download_thread.start()

    def getur1(self, ur1):
        """Return the chapter list of the first search result.

        Args:
            ur1: Full search URL including the keyword.

        Returns:
            A list of ``[chapter_url, chapter_title]`` pairs in page order.

        Raises:
            LookupError: If the search page has no result link or the book
                page has no chapter list container.
        """
        header = {'User-Agent': UserAgent().random}
        response = requests.get(url=ur1, headers=header, timeout=30)
        response.encoding = "utf-8"
        bes = BeautifulSoup(response.text, "html.parser")
        # Walk down to the link of the first search hit, tolerating any
        # missing element along the way.
        main_content = bes.find('div', class_="col-a")
        first_item = main_content.find('li') if main_content else None
        s_tit = first_item.find('span', class_='s-tit') if first_item else None
        link = s_tit.a if s_tit else None
        href1 = link.get('href') if link else None
        if not href1:
            raise LookupError('no search result found')

        # Headless Chrome: the chapter list is only present after clicking
        # the catalogue tab on the book page.
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        still = random.randint(1, 5)
        driver = webdriver.Chrome(options=option)
        try:
            driver.get('https://www.qimao.com' + href1)
            # Give the dynamic page a few seconds to render.
            time.sleep(still)
            driver.find_element(
                By.XPATH,
                value='//div[contains(@class, "tab-inner") and .//span[text()="作品目录"]]'
            ).click()
            page_html = driver.page_source
        finally:
            # Always shut the browser down so no Chrome process is leaked.
            driver.quit()

        bes = BeautifulSoup(page_html, "html.parser")
        chapters = bes.find('div', class_='l-col')
        if chapters is None:
            raise LookupError('chapter list not found')
        kuangs = []
        for li in chapters.find_all('li'):
            a_tag = li.find('a')
            href2 = a_tag.get('href') if a_tag else None
            if not href2:
                continue  # list item without a usable link
            kuangs.append(
                ['https://www.qimao.com' + href2, a_tag.get_text(strip=True)])
        return kuangs

    def download_chapter(self, tar, cancel_flag, length_lock, length, queue_data):
        """Download one chapter and save it as ``D:/novel/<title>.txt``.

        Called from the download thread, so it must not touch any widget.

        Args:
            tar: ``[chapter_url, chapter_title]`` pair.
            cancel_flag: Event; when set the chapter is skipped.
            length_lock: Lock guarding ``length``.
            length: One-element list holding the remaining-chapter count.
            queue_data: Queue that receives the updated remaining count.

        Returns:
            ``'YES'`` when the page has no ``div.article`` element, which the
            caller treats as the signal to stop; otherwise ``None``.
        """
        if cancel_flag.is_set():
            return None
        header = {'User-Agent': UserAgent().random}
        with length_lock:
            length[0] -= 1
            self.length = length[0]
            queue_data.put(self.length)
        # NOTE(review): certificate verification is disabled below
        # (verify=False) and the resulting warnings are silenced - confirm
        # that this is really required for the site.
        requests.packages.urllib3.disable_warnings()
        try:
            with requests.Session() as s:
                s.mount('http://', HTTPAdapter(max_retries=19))
                s.mount('https://', HTTPAdapter(max_retries=19))
                req = s.get(url=tar[0], headers=header, timeout=30, verify=False)
                req.encoding = "utf-8"
                html = req.text
            bes = BeautifulSoup(html, "html.parser")
            texts = bes.find("div", class_="article")
            if texts is None:
                return 'YES'
            print(tar[1])
            # Paragraphs are separated by four non-breaking spaces.
            texts_list = texts.text.split('\xa0' * 4)
            os.makedirs("D:/novel/", exist_ok=True)
            # Replace characters Windows does not allow in file names.
            file_name = tar[1].translate(self._FILENAME_TABLE)
            path = "D:/novel/" + file_name + ".txt"
            if not os.path.exists(path):
                # errors='replace' keeps one character that GBK cannot encode
                # from aborting the write and leaving a truncated file that
                # later runs would skip.
                with open(path, "w", encoding='gbk', errors='replace') as file:
                    file.writelines(line + "\n" for line in texts_list)
        except Exception as e:  # best effort: one bad chapter must not kill the thread
            print(f"下载章节失败: {e}")
        return None
# if __name__=="__main__":
# run()
# end_time = time.time()
#进行时间打出
# all = end_time - start_time
# print(all)
if __name__ == "__main__":
    # Build the window and hand control to the Tk event loop.
    app = Reptile()
    app.mainloop()
#鄙人穷,不想充钱,所以此代码虽然可以爬取任意图书,但仅限免费章节
#若是触犯法律,与我无关,学习交流使用,仅供参考
更多推荐
所有评论(0)