import requests
import tkinter as tk
import time,random
from selenium import webdriver
from selenium.webdriver.common.by import By
from tkinter import messagebox
from bs4 import BeautifulSoup
from random import uniform
import queue
import os.path
import time
from requests.adapters import HTTPAdapter
from requests import packages
from fake_useragent import UserAgent
import threading
from threading import Thread, Event, Lock

class DownloadThread(Thread):
    """Daemon thread that downloads a list of chapters one after another.

    Each target is handed to ``download_chapter`` together with the shared
    cancel flag, the remaining-count lock, the remaining-count holder and the
    progress queue. When the run ends, for whatever reason, ``None`` is put
    on the queue as the completion marker.

    Args:
        targets: Sized sequence of chapter descriptors, passed through to
            ``download_chapter`` unchanged.
        queue_data: Queue that receives progress values and the final
            ``None`` marker.
        download_chapter: Callable invoked as ``download_chapter(target,
            cancel_flag, length_lock, length, queue_data)``. Returning
            ``'YES'`` stops the run.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            inserted between two consecutive chapters.
    """

    def __init__(self, targets, queue_data, download_chapter, delay_range=(1, 3)):
        super().__init__(daemon=True)
        self.targets = targets
        self.download_chapter = download_chapter
        self.cancel_flag = Event()
        # One-element list so the callback can decrement the remaining count
        # in place while holding ``length_lock``.
        self.length = [len(targets)]
        print(self.length)
        self.length_lock = Lock()
        self.queue_data = queue_data
        self.delay_range = delay_range
        print('Download thread initialized')

    def run(self):
        """Download every target until finished, stopped or cancelled."""
        last_index = len(self.targets) - 1
        try:
            for index, tar in enumerate(self.targets):
                if self.cancel_flag.is_set():
                    break
                result = self.download_chapter(
                    tar, self.cancel_flag, self.length_lock, self.length, self.queue_data
                )
                if result == 'YES' or self.cancel_flag.is_set():
                    break
                # Random pause between chapters. Event.wait() acts as an
                # interruptible sleep: it returns True as soon as cancel()
                # is called, so cancellation is not delayed by the pause.
                if index < last_index and self.cancel_flag.wait(uniform(*self.delay_range)):
                    break
        finally:
            # Always signal completion, even if the callback raised, so the
            # consumer of the queue is never left waiting.
            self.queue_data.put(None)

    def cancel(self):
        """Ask the thread to stop at the next cancellation check."""
        self.cancel_flag.set()
class Reptile(tk.Tk):
    def __init__(self):
        """Create the main window, its state attributes and all widgets."""
        super().__init__()

        # --- state ---------------------------------------------------------
        # No download is running yet.
        self.download_thread = None
        # Search endpoint; the keyword is appended to it when a search starts.
        self.ur1 = 'https://www.qimao.com/search/index/?keyword='
        # Window-level cancel flag.
        self.cancel_flag = threading.Event()
        # Progress values posted by the download thread are read from here.
        self.queue_data = queue.Queue()

        # --- window --------------------------------------------------------
        # Route the window-manager close button through on_closing().
        self.protocol("WM_DELETE_WINDOW", self.on_closing)
        self.length = -1
        self.title('Reptile')
        # Fixed-size window.
        self.geometry('220x109')
        self.resizable(False, False)
        self.times = 0

        # --- grid ----------------------------------------------------------
        # weight=0 keeps rows and columns from stretching with the window.
        max_row = 2
        max_column = 2
        weight = 0
        for row_index in range(max_row + 1):
            self.rowconfigure(row_index, weight=weight)
        for column_index in range(max_column + 1):
            self.columnconfigure(column_index, weight=weight)

        # --- widgets -------------------------------------------------------
        # Caption next to the novel-name entry.
        self.lable1 = tk.Label(self, text="你爬取的小说:")
        self.lable1.grid(row=0, column=0)

        # Status label; check_queue() rewrites its text.
        self.lable = tk.Label(self, text="?")
        self.lable.grid(row=1, column=1)

        # Caption for the status label.
        self.lable2 = tk.Label(self, text="当前剩余章节数")
        self.lable2.grid(row=1, column=0)

        # Entry for the novel name; every key release refreshes the cached name.
        self.entry = tk.Entry(self, width=10)
        self.entry.bind("<KeyRelease>", self.get_name)
        self.entry.grid(row=0, column=1)

        # Start polling the progress queue and echo the (initial) name.
        self.check_queue()
        print(self.get_name())

        # Button that starts the crawl.
        start_button = tk.Button(self, text='begin', command=self.begin_reptile)
        start_button.grid(row=3, column=1)

    def on_closing(self):
        if self.download_thread and self.download_thread.is_alive():
            if messagebox.askokcancel("Quit", "Do you want to quit? This will cancel all ongoing downloads."):
                self.download_thread.cancel()
                # 等待线程结束,可以设置一个合理的超时时间
                self.download_thread.join(timeout=5)
                if self.download_thread.is_alive():
                    print("Download thread did not terminate in time.")
        self.destroy()
    def begin_reptile(self):
        """Command of the 'begin' button; delegates to ``runs()``."""
        self.runs()
    def check_queue(self):
        if not self.queue_data.empty():
            length = self.queue_data.get()
            if length is None:
                self.lable.config(text='爬取完成')
                return
            else:
                self.lable.config(text=f"还有{length}章需要爬取")
        self.after(1000, self.check_queue)
    def get_name(self,event=None):
        self.getname=self.entry.get()
        return self.getname.strip()
    def runs(self):
        if self.get_name()=='':
            time.sleep(1)
        print('runs')
        pp=self.get_name()
        print(pp)
        self.ur1=self.ur1+self.get_name()
        print(self.ur1)
        target = self.getur1(self.ur1)
        self.download_thread = DownloadThread(target,self.queue_data,self.download_chapter)
        self.download_thread.start()
        self.after(1000,self.check_queue)

    def getur1(self, ur1):
        """Resolve a search URL to the chapter list of the first result.

        The search page is fetched with ``requests``; the first ``li`` inside
        ``div.col-a`` supplies the book link. The book page is then opened in
        headless Chrome, the catalogue tab is clicked, and the ``li`` items of
        ``div.l-col`` are collected.

        Args:
            ur1: Search-page URL that already contains the keyword.

        Returns:
            A list of ``[chapter_url, chapter_title]`` pairs in page order.

        Raises:
            ValueError: If the search page has no usable result or the book
                page has no chapter catalogue.
        """
        from urllib.parse import urljoin

        site_root = 'https://www.qimao.com'

        header = {'User-Agent': UserAgent().random}
        # A timeout keeps a dead connection from hanging the caller forever.
        response = requests.get(url=ur1, headers=header, timeout=30)
        response.encoding = "utf-8"
        search_page = BeautifulSoup(response.text, "html.parser")

        # Locate the link of the first search result.
        main_content = search_page.find('div', class_="col-a")
        first_item = main_content.find('li') if main_content is not None else None
        title_span = first_item.find('span', class_='s-tit') if first_item is not None else None
        link = title_span.a if title_span is not None else None
        href1 = link.get('href') if link is not None else None
        if not href1:
            raise ValueError(f"no search result found at {ur1}")
        book_url = urljoin(site_root, href1)

        # Headless Chrome renders the book page.
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(book_url)
            # Random 1-5 s pause before clicking so the dynamic page can update.
            time.sleep(random.randint(1, 5))
            driver.find_element(
                By.XPATH,
                value='//div[contains(@class, "tab-inner") and .//span[text()="作品目录"]]',
            ).click()
            catalog_html = driver.page_source
        finally:
            # Always shut the browser down; otherwise every call leaks a
            # Chrome process.
            driver.quit()

        chapters = BeautifulSoup(catalog_html, "html.parser").find('div', class_='l-col')
        if chapters is None:
            raise ValueError(f"no chapter catalogue found at {book_url}")

        kuangs = []
        for li in chapters.find_all('li'):
            a_tag = li.find('a')
            href2 = a_tag.get('href') if a_tag is not None else None
            if not href2:
                # List items without a link are not chapters.
                continue
            kuangs.append([urljoin(site_root, href2), a_tag.get_text(strip=True)])
        return kuangs
    def download_chapter(self,tar, cancel_flag,length_lock,length,queue_data):
        if self.get_name()== '':
            time.sleep(1)
        tablet_name=self.get_name()
        print(tablet_name)
        if cancel_flag.is_set():
            return
        header = {'User-Agent': UserAgent().random}
        with length_lock:
            length[0] = length[0] - 1
            self.length = length[0]
            # print(length[0])
            queue_data.put(self.length)
        requests.packages.urllib3.disable_warnings()
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=19))
        s.mount('https://', HTTPAdapter(max_retries=19))
        try:
            req = s.get(url=tar[0], headers=header, timeout=30, stream=True, verify=False)
            req.encoding = "utf-8"
            html = req.text
            print("漂亮")
            bes = BeautifulSoup(html, "html.parser")
            texts = bes.find("div", class_="article")
            if (texts == None):
                return 'YES'
            print(tar[1])
            texts_list = texts.text.split('\xa0' * 4)
            if not os.path.exists("D:/novel/" + tar[1] + ".txt"):
                with open("D:/novel/" + tar[1] + ".txt", "w", encoding='gbk') as file:
                    for line in texts_list:
                        file.write(line + "\n")

            print('1')
        except Exception as e:
         print(f"下载章节失败: {e}")
# if __name__=="__main__":
# run()
# end_time = time.time()
#进行时间打出
# all = end_time - start_time
# print(all)
if __name__ == "__main__":
    # Build the window and hand control to the Tk event loop when the file
    # is executed as a script.
    Reptile().mainloop()

#鄙人穷,不想充钱,所以此代码虽然可以爬取任意图书,但仅限免费章节

#若是触犯法律,与我无关,学习交流使用,仅供参考

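# NOTE: the lines below are leftover text from the web page this script was
# copied from. They are not part of the program and are kept only as comments.
# Logo
#
# 腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。
#
# 更多推荐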