1.页面

2.分析

发现导航栏有不同分类,f12查看网络接口,发现每个分类对应一个参数 

从列表进入内容,发现每条内容文件对应一个id,这个id是由列表页传过来的,从列表获取拼出内容的url,然后进行接口爬取内容的文件路径path,再下载文件,解析文件

3.准备工作

爬取的文件数据下载到文件夹中,其他数据存储到数据库中

4.完整代码

import uuid
import requests
import sys
import pymysql as mysql
import datetime
from docx import Document
import time
# Scraper for the National Laws & Regulations Database API (flk.npc.gov.cn).
# Suppress urllib3 InsecureRequestWarning — list requests use verify=False.
requests.packages.urllib3.disable_warnings()
# Translation table mapping every character outside the Basic Multilingual
# Plane to U+FFFD, for text that downstream storage cannot encode.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

# NOTE(review): hard-coded credentials — consider moving to env/config.
con = mysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="guojia_spider", charset="utf8")

# Human-readable name of the data source, stored with every record.
source = "国家法律法规数据库"
# Session cookies captured from a logged-in browser session; they expire,
# so refresh them when requests start failing.
cookies = {
    'wzws_sessionid': 'gWU4ZGIxYYJmYWI4NWaAMTM5LjIxNC4zMi4yMjGgZtAiVw==',
    'Hm_lvt_54434aa6770b6d9fef104d146430b53b': '1722493539,1724209491,1724916312',
    'HMACCOUNT': '4FF444F068B3087E',
    'Hm_lpvt_54434aa6770b6d9fef104d146430b53b': '1724979519',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Connection': 'close'
}
# Query-string parameters for the list endpoint.
# 'type' selects the category; 'page' is overwritten per request in __main__.
params = {
    'type': 'dfxfg',
    'searchType': 'title;vague',
    'sortTr': 'f_bbrq_s;desc',
    'gbrqStart': '',
    'gbrqEnd': '',
    'sxrqStart': '',
    'sxrqEnd': '',
    'sort': 'true',
    'page': '1',
    'size': '10',
    '_': '1724980047619',
}
# POST body for the detail endpoint; 'id' is overwritten for each list item.
data = {
    'id': 'ZmY4MDgxODE4ZDczNmFjMTAxOGRjZmFlOTU2MTJlYWU%3D',
}

# Persist one record to the database.
def inputdb(id, title, source_href, ddate, date2, content_label, content_nolabel, attachment, province):
    """Insert one scraped law/regulation into the database.

    Skips insertion when a row with the same ``title`` already exists.
    Optional fields (``public_time``, ``expiry_time``, ``province``) are
    only included in the INSERT when they are non-empty, matching the
    original branch-per-combination behavior.

    All queries are parameterized — the original built SQL with ``%``
    string formatting, which was vulnerable to SQL injection (titles and
    document text are untrusted remote data). This also fixes a bug where
    the ``public_time`` value was written into the ``expiry_time`` column
    in the (no province, has public_time, no date2) branch.
    """
    global con, source
    public_time = ddate
    create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Dedupe on title to avoid re-inserting the same document.
    cursor1 = con.cursor()
    try:
        cursor1.execute("select * from 表名 where title = %s", (title,))
        if cursor1.fetchall():
            print('The data already exists---')
            return
    finally:
        cursor1.close()
    # Build the column/value lists dynamically instead of eight
    # hand-written INSERT variants.
    columns = ["id", "title", "source", "source_href", "content", "content_text", "create_time", "attachment"]
    values = [id, title, source, source_href, content_label, content_nolabel, create_time, attachment]
    if public_time:  # treats None and '' the same, as the original did
        columns.append("public_time")
        values.append(public_time)
    if date2:
        columns.append("expiry_time")
        values.append(date2)
    if province:
        columns.append("province")
        values.append(province)
    sql = "insert into 表名(%s) values(%s)" % (
        ",".join(columns), ",".join(["%s"] * len(values)))
    cursor2 = con.cursor()
    try:
        cursor2.execute(sql, values)
        con.commit()
    finally:
        cursor2.close()

# Snowflake-style unique id generator (used because the table's id column
# is not auto-increment).
class Snowflake:
    """Produce 64-bit unique ids.

    Layout: 41-bit millisecond timestamp (offset from the Twitter epoch,
    1288834974657), 10-bit machine id, 12-bit per-millisecond sequence.
    """

    def __init__(self, machine_id):
        self.machine_id = machine_id
        self.sequence = 0
        self.last_timestamp = -1

    def generate_id(self):
        """Return the next id; raise if the system clock ran backwards."""
        now = int(time.time() * 1000)
        if now < self.last_timestamp:
            raise Exception("Clock moved backwards")
        if now == self.last_timestamp:
            # Same millisecond as the previous id: advance the 12-bit
            # sequence; when it wraps, spin until the next millisecond.
            self.sequence = (self.sequence + 1) & 4095
            if self.sequence == 0:
                now = self.wait_next_millis(self.last_timestamp)
        else:
            self.sequence = 0
        self.last_timestamp = now
        return ((now - 1288834974657) << 22) | (self.machine_id << 12) | self.sequence

    def wait_next_millis(self, last_timestamp):
        """Busy-wait until the clock moves past *last_timestamp* (ms)."""
        now = int(time.time() * 1000)
        while now <= last_timestamp:
            now = int(time.time() * 1000)
        return now


if __name__ == '__main__':
    # Category switch — each value selects a different list category:
    #   'dfxfg' = local regulations (pages 1-2229)
    #   'sfjs'  = judicial interpretations
    #   'fl'    = laws
    #   'xzfg'  = administrative regulations
    params['type'] = 'dfxfg'
    # One generator for the whole run (the original rebuilt it per page).
    snowflake = Snowflake(1)
    # Page range depends on the chosen category.
    for i in range(1, 2230):
        num = 1
        params['page'] = i
        response = requests.get(
            url='https://flk.npc.gov.cn/api/',
            headers=headers,
            params=params,
            cookies=cookies,
            verify=False
        )
        if response.status_code != 200:
            continue
        listing = response.json()
        for j in listing['result']['data']:
            print('---start running---')
            data['id'] = j['id']
            title = j['title']
            publish_date = j['publish']  # publication date
            expiry_date = j['expiry']    # effective date
            if j['type'] == '地方性法规':
                # Office names look like "<province>人民代表大会…";
                # keep the text before the first '人' as the province.
                province = str(j['office']).split('人')[0]
            else:
                province = ''
            uurl = 'https://flk.npc.gov.cn/api/detail'
            new_data = requests.post(url=uurl, data=data, headers=headers, cookies=cookies)
            # Bug fix: the original printed on 504 but fell through;
            # now skip the item so no half-processed record is stored.
            if new_data.status_code == 504:
                print("Error 504: Gateway Timeout")
                continue
            if new_data.status_code != 200:
                continue
            detail = new_data.json()
            body = detail['result']['body']
            download_url = 'https://wb.flk.npc.gov.cn' + body[0]['path']
            text_p, text, attachment_url = "", "", ""
            # Some entries list a PDF first and the DOCX second; prefer
            # the DOCX. Bug fix: guard the index — the original raised
            # IndexError when only one (PDF) body entry existed.
            if download_url.endswith('pdf') and len(body) > 1:
                download_url = 'https://wb.flk.npc.gov.cn' + body[1]['path']
            if download_url.endswith('docx'):
                name = uuid.uuid4().hex
                ext = download_url.split('.')[-1]
                attachment_url = '/spiderFiles/' + name + '.' + ext
                local_path = 'D:\\spiderFiles\\' + name + '.' + ext
                # Download the document file.
                content = requests.get(url=download_url, headers=headers).content
                with open(local_path, mode='wb') as f:
                    f.write(content)
                # Parse the DOCX: plain text plus an HTML (<p>) version.
                doc = Document(local_path)
                for para in doc.paragraphs:
                    text += para.text
                    if para.text:
                        text_p += "<p>" + para.text + "</p>"
            else:
                print('---check the file---')
            href = 'https://flk.npc.gov.cn/detail2.html?' + data['id']
            # The table id is not auto-increment — generate a snowflake id.
            # (Renamed from `id`, which shadowed the builtin.)
            record_id = snowflake.generate_id()
            inputdb(record_id, title, href, publish_date, expiry_date, text_p, text, attachment_url, province)
            time.sleep(1)  # be polite to the server
            print('The', i, 'page-the', num, 'data has been downloaded!!!')
            num += 1
    print('The data has been downloaded and is up-to-date---')
    con.close()
Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐