import csv
import os
import win32com.client as win32
from bs4 import BeautifulSoup
import time
import PyPDF2
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# Set the Chrome download directory
download_path = r"D:\数据备份\数据爬取国家法律法规数据库\data"
options = webdriver.ChromeOptions()
prefs = {"download.default_directory": download_path}
options.add_experimental_option("prefs", prefs)
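
# Optional (assumption, not part of the original post): two more Chrome prefs that are often
# added to `prefs` above to suppress the download prompt and force PDFs to download instead
# of opening in the built-in viewer:
# prefs = {
#     "download.default_directory": download_path,
#     "download.prompt_for_download": False,
#     "plugins.always_open_pdf_externally": True,
# }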

def request_html():
    num = 0
    wd = webdriver.Chrome(service=Service(r"./chromedriver-win64/chromedriver.exe"), options=options)
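    # Note: with Selenium 4.6+ the explicit chromedriver path is optional, since Selenium
    # Manager can fetch a matching driver, e.g. (sketch): wd = webdriver.Chrome(options=options)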
    while True:
        try:
            # Open the target page
            wd.get('https://flk.npc.gov.cn/xf.html')
            # Wait for the specified element to become clickable, then click it
            element = WebDriverWait(wd, 10).until(
                EC.element_to_be_clickable((By.ID, 'dfxfg'))
            )
            element.click()
            # The element was clicked successfully, so leave the retry loop
            break
        except Exception:
            num += 1
            print(f"Attempt {num} to load the home page failed, retrying...")
            continue
    return wd


def request_html_docx(flag_page_num):
    # Get the home page HTML
    wd = request_html()
    # Jump to the page that needs to be crawled
    try:
        # Wait up to 10 s (adjust as needed) for the page-number input box to become clickable
        input_element = WebDriverWait(wd, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//input[@class="layui-input"][1]'))
        )
        # Clear any existing content in the input box
        input_element.clear()
        # Type the target page number
        input_element.send_keys(str(flag_page_num))
        # Wait up to 10 s (adjust as needed) for the confirm button to become clickable
        confirm_button = WebDriverWait(wd, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'layui-laypage-btn'))
        )
        # Click the confirm button
        confirm_button.click()
        page_source = wd.page_source
        # Parse the page content and get the page number currently displayed
        flag_page_num, parser_content = parser_html(page_source)
        return wd
    except Exception:
        print(f"Page unreachable: note the page currently being crawled: {flag_page_num}; the next run should start from page {flag_page_num+1}")
        return request_html_docx(flag_page_num)


def parser_html(page_source):
    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')
    # Find all table rows (each row describes one regulation)
    rows = soup.find_all('tr', class_='list-b')
    # Locate the current-page indicator by its class attribute
    span_element = soup.find('span', class_='layui-laypage-curr')
    em_elements = span_element.find_all('em')
    page_number = em_elements[1].text
    # Holds the parsed data
    parsed_data = []
    for row in rows:
        # Find the individual cells in this row
        cells = row.find_all('td')
        # Extract the text content of each cell (HTML tags stripped)
        number = cells[0].find('div', class_='l-xh').text
        name = cells[1].find('li', class_='l-wen').text
        organization = cells[2].find('h2', class_='l-wen1').text
        type_ = cells[3].find('h2', class_='l-wen1').text
        status = cells[4].find('h2', class_='l-wen1').text
        date = cells[5].find('h2', class_='l-wen1').text
        # Collect the extracted fields into a dict and append it to the list
        data_dict = {
            '序号': number,
            '法规名称': name,
            '制定机关': organization,
            '法规类型': type_,
            '状态': status,
            '日期': date
        }
        parsed_data.append(data_dict)
    return int(page_number), parsed_data
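
# For reference, each element of parsed_data has this shape (placeholder values only):
# {'序号': '1', '法规名称': '……', '制定机关': '……', '法规类型': '……', '状态': '……', '日期': '……'}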


def close_page_save_init(wd):
    initial_handle = wd.window_handles[0]
    # Close every window except the one that was opened first
    for handle in wd.window_handles:
        if handle != initial_handle:
            wd.switch_to.window(handle)
            wd.close()
    # Switch back to the initial window
    wd.switch_to.window(initial_handle)


def delete_all_docx():
    # Remove every cached download under ./data so stale files are not parsed by mistake
    cache_file_path1 = "./data"
    for root_path, dirs, cache_files in os.walk(cache_file_path1, topdown=True):
        for file in cache_files:
            file_path = os.path.join(root_path, file)
            os.remove(file_path)


def spider_load_docx1(flag_page_num):
    result = []
    def load_docx1(row, wd):
        nonlocal result
        try:  # Locate the clickable li element in the current row
            li_element = row.find_element(By.XPATH, ".//li[@class='l-wen' and @onclick]")
            # Click it to open the detail page in a new window
            li_element.click()
            # Check that the new window has opened and the file can be downloaded
            start_time = time.time()
            # Wait at most 50 s for the detail window to appear
            while len(wd.window_handles) == 1:
                if time.time() - start_time > 50:
                    break
                time.sleep(0.5)
            wd.switch_to.window(wd.window_handles[-1])
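            # A tidier alternative to the window-count polling above (sketch, same 50 s budget):
            # WebDriverWait(wd, 50).until(EC.number_of_windows_to_be(2))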
            # Wait up to 20 s (adjust as needed) for the download button to become clickable
            download_element = WebDriverWait(wd, 20).until(
                EC.element_to_be_clickable((By.ID, "downLoadFile"))
            )
            # Click the download button
            download_element.click()
            start_time = time.time()
            # Wait for the page to finish loading (at most 50 s)
            while len(wd.window_handles) == 1:
                if time.time() - start_time > 50:
                    break
                time.sleep(0.5)
            # After the download finishes, parse the file content
            cache_file_path = "./data"
            # Wait until a finished document shows up in the download directory
            # (an in-progress Chrome download still carries a .crdownload suffix, so it is ignored here)
            start_time = time.time()
            should_break = False
            while True:
                if len(os.listdir(cache_file_path)):
                    for root_path, dirs, cache_files in os.walk(cache_file_path):
                        for index, file in enumerate(cache_files):
                            if file.endswith('.docx') or file.endswith('.doc') or file.endswith('.pdf'):
                                should_break = True
                                break
                        if should_break:
                            break
                if should_break:
                    break
                if time.time() - start_time > 50:
                    break
                time.sleep(0.5)
            if not len(os.listdir(cache_file_path)):
                # Nothing was downloaded: close the extra windows and retry this row
                close_page_save_init(wd)
                load_docx1(row, wd)
                return
            else:
                for root_path, dirs, cache_files in os.walk(cache_file_path):
                    for index, file in enumerate(cache_files):
                        if not file.endswith('.docx') and not file.endswith('.doc') and not file.endswith('.pdf'):
                            # An unexpected file type was downloaded: clean up and retry this row
                            close_page_save_init(wd)
                            load_docx1(row, wd)
                            return
            for root_path, dirs, cache_files in os.walk(cache_file_path):
                for index, file in enumerate(cache_files):
                    if file.endswith('.docx'):
                        # .docx: read paragraphs and tables with python-docx
                        file_path = os.path.join(root_path, file)
                        doc = Document(file_path)
                        full_text = ""
                        for para in doc.paragraphs:
                            para_text = para.text.replace('\u3000', ' ')
                            full_text += para_text + "\n"
                        for table in doc.tables:
                            # use a distinct name so the outer `row` argument is not shadowed
                            for table_row in table.rows:
                                for cell in table_row.cells:
                                    cell_text = cell.text.replace('\u3000', ' ')
                                    full_text += cell_text + "\n"
                        data_content = {"政策内容": full_text}
                        print(data_content)
                        result.append(data_content)
                        os.remove(file_path)
                        break
                    if file.endswith('.doc'):
                        # .doc: read via the Word COM interface (requires an absolute path)
                        file_path = os.path.join(os.getcwd(), "data", file)
                        word = win32.Dispatch("Word.Application")
                        doc = word.Documents.Open(file_path)
                        full_text = doc.Content.Text
                        data_content = {"政策内容": full_text}
                        print(data_content)
                        result.append(data_content)
                        doc.Close()
                        word.Quit()
                        os.remove(file_path)
                        break
                    if file.endswith('.pdf'):
                        # .pdf: extract text page by page with PyPDF2
                        file_path = os.path.join(os.getcwd(), "data", file)
                        text = ""
                        with open(file_path, 'rb') as pdf_file:
                            reader = PyPDF2.PdfReader(pdf_file)
                            for page_num in range(len(reader.pages)):
                                page = reader.pages[page_num]
                                text += page.extract_text()
                        data_content = {"政策内容": text}
                        result.append(data_content)
                        os.remove(file_path)
                        print(data_content)
                        break
            # Close the detail window (if still open) and return to the listing window
            if len(wd.window_handles) != 1:
                wd.close()
            wd.switch_to.window(wd.window_handles[0])
            return
        except Exception:
            print("The file could not be downloaded, retrying this row!")
            # Something went wrong with the page, so visit it again
            if len(wd.window_handles) != 1:
                close_page_save_init(wd)
            delete_all_docx()
            load_docx1(row, wd)
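            # Sketch (assumption, not in the original): this retry recurses without a limit; an
            # attempt counter would bound it, e.g.
            #   def load_docx1(row, wd, attempts=0):
            #       if attempts >= 5:
            #           return
            #       ...
            #       load_docx1(row, wd, attempts + 1)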

    # Request the listing page
    wd = request_html_docx(flag_page_num)
    if not wd:
        spider_load_docx1(flag_page_num)
        return
    print(f"第{flag_page_num}页面请求成功!",wd)
    page_source = wd.page_source
    try:  # Get all row elements from the table body
        delete_all_docx()
        tbody = wd.find_element(By.ID, "flData")
        rows = tbody.find_elements(By.TAG_NAME, "tr")
        # Visit each row in turn and click its element to open the detail page
        for index, row in enumerate(rows):
            print(f"Crawling the file content of row {index+1} on page {flag_page_num}...")
            start_time = time.time()
            # Download the file
            load_docx1(row, wd)
            end_time = time.time()
            if end_time - start_time > 1000:
                # The row took too long, so request the whole page again
                wd.quit()
                spider_load_docx1(flag_page_num)
                return
            print(f"Successfully crawled the file content of row {index+1} on page {flag_page_num}!")

        if len(result) != 10:
            # Not all 10 rows were captured: restart this page from scratch
            wd.quit()
            spider_load_docx1(flag_page_num)
            return
        # Write the results to the CSV file
        if len(result) == 10:
            print("Writing to the CSV file...")
            final_contents = []
            flag_page_num, parser_content = parser_html(page_source)
            for i, content in enumerate(parser_content):
                content['政策内容'] = result[i]['政策内容']
                final_contents.append(content)
            csv_file_name = 'spider_content.csv'
            # Write the header only when the file does not exist yet or is still empty
            write_header = not os.path.exists(csv_file_name) or os.path.getsize(csv_file_name) == 0
            with open(csv_file_name, 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=final_contents[0].keys())
                if write_header:
                    writer.writeheader()
                # Write the rows one by one
                for data_content in final_contents:
                    writer.writerow(data_content)
            print(f"The data has been written to {csv_file_name}.")
        return
    except Exception:
        print(f"A row on page {flag_page_num} could not be opened, so its policy file could not be downloaded! Requesting the whole page again and re-downloading all of its files...")
        delete_all_docx()
        if wd:
            wd.quit()
        spider_load_docx1(flag_page_num)


# Page to start crawling from (raise this to resume an interrupted run)
flag_page_num = 247
while True:
    print(f"Crawling the content of page {flag_page_num}...")
    spider_load_docx1(flag_page_num)
    print(f"Successfully crawled the content of page {flag_page_num}!")
    flag_page_num += 1
    if flag_page_num > 2290:
        print("All pages have been crawled!")
        break
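
# Optional (assumption, not part of the original script): read the start page from the command
# line so an interrupted run can resume without editing the source, e.g. `python spider.py 247`
# import sys
# flag_page_num = int(sys.argv[1]) if len(sys.argv) > 1 else 247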
