python爬取国家法律法规数据库
思路概述:导航栏有不同分类,用 F12 查看网络接口,发现每个分类对应一个请求参数。从列表进入内容页时,每条内容文件对应一个 id(由列表页传来);用该 id 拼出内容接口的 url,请求接口得到文件路径 path,再下载并解析 word 文件,得到带标签和不带标签的正文。id 非自增,用雪花算法生成;入库时按标题去重。文件下载到本地文件夹,其余数据存入数据库。
1.页面
2.分析
发现导航栏有不同分类,f12查看网络接口,发现每个分类对应一个参数
从列表进入内容,发现每条内容文件对应一个id,这个id是由列表页传过来的,从列表获取拼出内容的url,然后进行接口爬取内容的文件路径path,再下载文件,解析文件
3.准备工作
爬取的文件数据下载到文件夹中,其他数据存储到数据库中
4. 完整代码
import uuid
import requests
import sys
import pymysql as mysql
import datetime
from docx import Document
import time
# Crawl the site's JSON API directly (endpoints discovered via browser F12)
# Suppress urllib3 InsecureRequestWarning (requests below use verify=False)
requests.packages.urllib3.disable_warnings()
# Translation table mapping characters outside the Basic Multilingual Plane
# to U+FFFD, so text can be printed/stored by narrow encoders
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
con = mysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="guojia_spider", charset="utf8")
source = "国家法律法规数据库"
cookies = {
'wzws_sessionid': 'gWU4ZGIxYYJmYWI4NWaAMTM5LjIxNC4zMi4yMjGgZtAiVw==',
'Hm_lvt_54434aa6770b6d9fef104d146430b53b': '1722493539,1724209491,1724916312',
'HMACCOUNT': '4FF444F068B3087E',
'Hm_lpvt_54434aa6770b6d9fef104d146430b53b': '1724979519',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Connection': 'close'
}
# Query parameters for the list endpoint ('type' selects the category,
# 'page'/'size' control pagination)
params = {
'type': 'dfxfg',
'searchType': 'title;vague',
'sortTr': 'f_bbrq_s;desc',
'gbrqStart': '',
'gbrqEnd': '',
'sxrqStart': '',
'sxrqEnd': '',
'sort': 'true',
'page': '1',
'size': '10',
'_': '1724980047619',
}
# POST payload for the detail endpoint; 'id' is overwritten per list item
data = {
'id': 'ZmY4MDgxODE4ZDczNmFjMTAxOGRjZmFlOTU2MTJlYWU%3D',
}
# Insert one record into the DB
def inputdb(id, title, source_href, ddate, date2, content_label, content_nolabel, attachment, province):
    """Insert one crawled law record, skipping rows whose title already exists.

    Parameters:
        id              -- snowflake-generated primary key
        title           -- law title (used for de-duplication)
        source_href     -- detail-page URL
        ddate           -- publish date string (may be None/empty)
        date2           -- effective/expiry date string (may be None/empty)
        content_label   -- body text wrapped in <p> tags
        content_nolabel -- plain body text
        attachment      -- relative path of the downloaded file ('' if none)
        province        -- province name ('' for non-local regulations)

    Side effects: executes SELECT/INSERT on the module-level connection `con`
    and commits. Empty/None optional fields are simply omitted from the INSERT.
    """
    global con, source
    # Normalize None to '' so the presence checks below are uniform.
    public_time = ddate if ddate else ''
    date2 = date2 if date2 else ''
    create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # De-duplicate by title. Parameterized query: the old string-formatted SQL
    # was injectable and broke on titles containing quotes.
    cursor1 = con.cursor()
    cursor1.execute("select * from 表名 where title = %s", (title,))
    results = cursor1.fetchall()
    cursor1.close()
    if len(results) > 0:
        print('The data already exists---')
        return
    # Build the column/value lists dynamically instead of eight hand-written
    # branches. This also fixes the original bug where the
    # (public_time set, date2 empty, province empty) branch inserted the
    # publish date into the expiry_time column.
    columns = ["id", "title", "source", "source_href"]
    values = [id, title, source, source_href]
    if public_time != '':
        columns.append("public_time")
        values.append(public_time)
    if date2 != '':
        columns.append("expiry_time")
        values.append(date2)
    columns += ["content", "content_text", "create_time"]
    values += [content_label, content_nolabel, create_time]
    if province != '':
        columns.append("province")
        values.append(province)
    columns.append("attachment")
    values.append(attachment)
    placeholders = ",".join(["%s"] * len(columns))
    sql2 = "insert into 表名(%s) values(%s)" % (",".join(columns), placeholders)
    cursor2 = con.cursor()
    cursor2.execute(sql2, values)
    con.commit()
    cursor2.close()
# Snowflake-style unique id generator (ids are not auto-increment in the DB)
class Snowflake:
    """Generate 64-bit unique ids with the Twitter snowflake layout:
    41 bits of millisecond timestamp | 10 bits machine id | 12 bits sequence.
    """

    def __init__(self, machine_id):
        # Identifier baked into bits 12..21 of every generated id.
        self.machine_id = machine_id
        self.sequence = 0
        self.last_timestamp = -1

    def generate_id(self):
        """Return the next unique id.

        Raises Exception if the system clock moved backwards since the
        previous call.
        """
        now = int(time.time() * 1000)
        if now < self.last_timestamp:
            raise Exception("Clock moved backwards")
        if now == self.last_timestamp:
            # Same millisecond: bump the 12-bit sequence; on overflow,
            # spin until the clock ticks over to the next millisecond.
            self.sequence = (self.sequence + 1) & 4095
            if self.sequence == 0:
                now = self.wait_next_millis(self.last_timestamp)
        else:
            self.sequence = 0
        self.last_timestamp = now
        # 1288834974657 is the customary snowflake epoch (2010-11-04, ms).
        return ((now - 1288834974657) << 22) | (self.machine_id << 12) | self.sequence

    def wait_next_millis(self, last_timestamp):
        """Busy-wait until the wall clock is strictly past last_timestamp."""
        while True:
            now = int(time.time() * 1000)
            if now > last_timestamp:
                return now
if __name__ == '__main__':
    # The category is selected by the 'type' query parameter (found via F12):
    #   dfxfg = local regulations (pages 1-2229)
    #   sfjs  = judicial interpretations
    #   fl    = national laws
    #   xzfg  = administrative regulations
    params['type'] = 'dfxfg'
    # params['type'] = 'sfjs'
    # params['type'] = 'fl'
    # params['type'] = 'xzfg'
    # Iterate over list pages for the chosen category.
    for i in range(1, 2230):
        num = 1
        params['page'] = i
        response = requests.get(
            url='https://flk.npc.gov.cn/api/',
            headers=headers,
            params=params,
            cookies=cookies,
            verify=False,
            timeout=30,  # avoid hanging forever on a stalled connection
        )
        if response.status_code == 200:
            response = response.json()
            snowflake = Snowflake(1)
            for j in response['result']['data']:
                print('---start running---')
                data['id'] = j['id']
                title = j['title']
                date = j['publish']  # publish date
                date2 = j['expiry']  # effective/expiry date
                if j['type'] == '地方性法规':
                    # office looks like "XX省人民代表大会..." — the text before
                    # '人' is the province name.
                    province = str(j['office']).split('人')[0]
                else:
                    province = ''
                uurl = 'https://flk.npc.gov.cn/api/detail'
                new_data = requests.post(url=uurl, data=data, headers=headers,
                                         cookies=cookies, verify=False, timeout=30)
                if new_data.status_code == 504:
                    print("Error 504: Gateway Timeout")
                # Initialize before the status check so a failed detail request
                # never raises NameError or silently reuses the previous
                # item's content in inputdb (original bug).
                text_p, text, attachment_url = "", "", ""
                if new_data.status_code == 200:
                    new_data = new_data.json()
                    body = new_data['result']['body']
                    download_url = 'https://wb.flk.npc.gov.cn' + body[0]['path']
                    # Prefer the .docx variant: some entries list the pdf first.
                    # Guard the index — the original crashed when only a pdf
                    # was available.
                    if download_url.endswith('pdf') and len(body) > 1:
                        download_url = 'https://wb.flk.npc.gov.cn' + body[1]['path']
                    if download_url.endswith('docx'):
                        name = str(uuid.uuid4().hex)
                        ext = download_url.split('.')[-1]
                        attachment_url = '/spiderFiles/' + name + '.' + ext
                        local_path = 'D:\\spiderFiles\\' + name + '.' + ext
                        # Download the word file.
                        content = requests.get(url=download_url, headers=headers,
                                               verify=False, timeout=60).content
                        with open(local_path, mode='wb') as f:
                            f.write(content)
                        # Parse the word file into plain and <p>-tagged text.
                        doc = Document(local_path)
                        for para in doc.paragraphs:
                            text += para.text
                            if para.text:
                                text_p += "<p>" + para.text + "</p>"
                    else:
                        print('---check the file---')
                href = 'https://flk.npc.gov.cn/detail2.html?' + data['id']
                # Primary keys are not auto-increment; generate via snowflake.
                id = snowflake.generate_id()
                inputdb(id, title, href, date, date2, text_p, text, attachment_url, province)
                time.sleep(1)
                print('The', i, 'page-the', num, 'data has been downloaded!!!')
                num += 1
    print('The data has been downloaded and is up-to-date---')
    con.close()
更多推荐
已为社区贡献1条内容
所有评论(0)