python爬取国家法律法规数据库
思路概述:导航栏有不同分类,用 F12 查看网络接口,发现每个分类对应一个请求参数。从列表进入内容页时,每条内容文件对应一个 id(由列表页传来);用该 id 拼出内容接口的 url,请求接口得到文件路径 path,再下载并解析 word 文件,得到带标签和不带标签的正文。id 非自增,用雪花算法生成;入库时按标题去重。文件下载到本地文件夹,其余数据存入数据库。
1.页面
2.分析
发现导航栏有不同分类,f12查看网络接口,发现每个分类对应一个参数
从列表进入内容,发现每条内容文件对应一个id,这个id是由列表页传过来的,从列表获取拼出内容的url,然后进行接口爬取内容的文件路径path,再下载文件,解析文件
3.准备工作
爬取的文件数据下载到文件夹中,其他数据存储到数据库中
4. 完整代码
import uuid
import requests
import sys
import pymysql as mysql
import datetime
from docx import Document
import time
# Crawl the site's JSON API directly (endpoints discovered via browser F12)
# Suppress urllib3 InsecureRequestWarning (requests below use verify=False)
requests.packages.urllib3.disable_warnings()
# Translation table mapping characters outside the Basic Multilingual Plane
# to U+FFFD, so text can be printed/stored by narrow encoders
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
con = mysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="guojia_spider", charset="utf8")
source = "国家法律法规数据库"
cookies = {
'wzws_sessionid': 'gWU4ZGIxYYJmYWI4NWaAMTM5LjIxNC4zMi4yMjGgZtAiVw==',
'Hm_lvt_54434aa6770b6d9fef104d146430b53b': '1722493539,1724209491,1724916312',
'HMACCOUNT': '4FF444F068B3087E',
'Hm_lpvt_54434aa6770b6d9fef104d146430b53b': '1724979519',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Connection': 'close'
}
# Query parameters for the list endpoint ('type' selects the category,
# 'page'/'size' control pagination)
params = {
'type': 'dfxfg',
'searchType': 'title;vague',
'sortTr': 'f_bbrq_s;desc',
'gbrqStart': '',
'gbrqEnd': '',
'sxrqStart': '',
'sxrqEnd': '',
'sort': 'true',
'page': '1',
'size': '10',
'_': '1724980047619',
}
# POST payload for the detail endpoint; 'id' is overwritten per list item
data = {
'id': 'ZmY4MDgxODE4ZDczNmFjMTAxOGRjZmFlOTU2MTJlYWU%3D',
}
# Insert one record into the DB
def inputdb(id, title, source_href, ddate, date2, content_label, content_nolabel, attachment, province):
    """Insert one crawled law record, skipping rows whose title already exists.

    Parameters:
        id              -- snowflake-generated primary key
        title           -- law title (used for de-duplication)
        source_href     -- detail-page URL
        ddate           -- publish date string (may be None/empty)
        date2           -- effective/expiry date string (may be None/empty)
        content_label   -- body text wrapped in <p> tags
        content_nolabel -- plain body text
        attachment      -- relative path of the downloaded file ('' if none)
        province        -- province name ('' for non-local regulations)

    Side effects: executes SELECT/INSERT on the module-level connection `con`
    and commits. Empty/None optional fields are simply omitted from the INSERT.
    """
    global con, source
    # Normalize None to '' so the presence checks below are uniform.
    public_time = ddate if ddate else ''
    date2 = date2 if date2 else ''
    create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # De-duplicate by title. Parameterized query: the old string-formatted SQL
    # was injectable and broke on titles containing quotes.
    cursor1 = con.cursor()
    cursor1.execute("select * from 表名 where title = %s", (title,))
    results = cursor1.fetchall()
    cursor1.close()
    if len(results) > 0:
        print('The data already exists---')
        return
    # Build the column/value lists dynamically instead of eight hand-written
    # branches. This also fixes the original bug where the
    # (public_time set, date2 empty, province empty) branch inserted the
    # publish date into the expiry_time column.
    columns = ["id", "title", "source", "source_href"]
    values = [id, title, source, source_href]
    if public_time != '':
        columns.append("public_time")
        values.append(public_time)
    if date2 != '':
        columns.append("expiry_time")
        values.append(date2)
    columns += ["content", "content_text", "create_time"]
    values += [content_label, content_nolabel, create_time]
    if province != '':
        columns.append("province")
        values.append(province)
    columns.append("attachment")
    values.append(attachment)
    placeholders = ",".join(["%s"] * len(columns))
    sql2 = "insert into 表名(%s) values(%s)" % (",".join(columns), placeholders)
    cursor2 = con.cursor()
    cursor2.execute(sql2, values)
    con.commit()
    cursor2.close()
# Snowflake-style unique id generator (ids are not auto-increment in the DB)
class Snowflake:
    """Generate 64-bit unique ids with the Twitter snowflake layout:
    41 bits of millisecond timestamp | 10 bits machine id | 12 bits sequence.
    """

    def __init__(self, machine_id):
        # Identifier baked into bits 12..21 of every generated id.
        self.machine_id = machine_id
        self.sequence = 0
        self.last_timestamp = -1

    def generate_id(self):
        """Return the next unique id.

        Raises Exception if the system clock moved backwards since the
        previous call.
        """
        now = int(time.time() * 1000)
        if now < self.last_timestamp:
            raise Exception("Clock moved backwards")
        if now == self.last_timestamp:
            # Same millisecond: bump the 12-bit sequence; on overflow,
            # spin until the clock ticks over to the next millisecond.
            self.sequence = (self.sequence + 1) & 4095
            if self.sequence == 0:
                now = self.wait_next_millis(self.last_timestamp)
        else:
            self.sequence = 0
        self.last_timestamp = now
        # 1288834974657 is the customary snowflake epoch (2010-11-04, ms).
        return ((now - 1288834974657) << 22) | (self.machine_id << 12) | self.sequence

    def wait_next_millis(self, last_timestamp):
        """Busy-wait until the wall clock is strictly past last_timestamp."""
        while True:
            now = int(time.time() * 1000)
            if now > last_timestamp:
                return now
if __name__ == '__main__':
    # The category is selected by the 'type' query parameter (found via F12):
    #   dfxfg = local regulations (pages 1-2229)
    #   sfjs  = judicial interpretations
    #   fl    = national laws
    #   xzfg  = administrative regulations
    params['type'] = 'dfxfg'
    # params['type'] = 'sfjs'
    # params['type'] = 'fl'
    # params['type'] = 'xzfg'
    # Iterate over list pages for the chosen category.
    for i in range(1, 2230):
        num = 1
        params['page'] = i
        response = requests.get(
            url='https://flk.npc.gov.cn/api/',
            headers=headers,
            params=params,
            cookies=cookies,
            verify=False,
            timeout=30,  # avoid hanging forever on a stalled connection
        )
        if response.status_code == 200:
            response = response.json()
            snowflake = Snowflake(1)
            for j in response['result']['data']:
                print('---start running---')
                data['id'] = j['id']
                title = j['title']
                date = j['publish']  # publish date
                date2 = j['expiry']  # effective/expiry date
                if j['type'] == '地方性法规':
                    # office looks like "XX省人民代表大会..." — the text before
                    # '人' is the province name.
                    province = str(j['office']).split('人')[0]
                else:
                    province = ''
                uurl = 'https://flk.npc.gov.cn/api/detail'
                new_data = requests.post(url=uurl, data=data, headers=headers,
                                         cookies=cookies, verify=False, timeout=30)
                if new_data.status_code == 504:
                    print("Error 504: Gateway Timeout")
                # Initialize before the status check so a failed detail request
                # never raises NameError or silently reuses the previous
                # item's content in inputdb (original bug).
                text_p, text, attachment_url = "", "", ""
                if new_data.status_code == 200:
                    new_data = new_data.json()
                    body = new_data['result']['body']
                    download_url = 'https://wb.flk.npc.gov.cn' + body[0]['path']
                    # Prefer the .docx variant: some entries list the pdf first.
                    # Guard the index — the original crashed when only a pdf
                    # was available.
                    if download_url.endswith('pdf') and len(body) > 1:
                        download_url = 'https://wb.flk.npc.gov.cn' + body[1]['path']
                    if download_url.endswith('docx'):
                        name = str(uuid.uuid4().hex)
                        ext = download_url.split('.')[-1]
                        attachment_url = '/spiderFiles/' + name + '.' + ext
                        local_path = 'D:\\spiderFiles\\' + name + '.' + ext
                        # Download the word file.
                        content = requests.get(url=download_url, headers=headers,
                                               verify=False, timeout=60).content
                        with open(local_path, mode='wb') as f:
                            f.write(content)
                        # Parse the word file into plain and <p>-tagged text.
                        doc = Document(local_path)
                        for para in doc.paragraphs:
                            text += para.text
                            if para.text:
                                text_p += "<p>" + para.text + "</p>"
                    else:
                        print('---check the file---')
                href = 'https://flk.npc.gov.cn/detail2.html?' + data['id']
                # Primary keys are not auto-increment; generate via snowflake.
                id = snowflake.generate_id()
                inputdb(id, title, href, date, date2, text_p, text, attachment_url, province)
                time.sleep(1)
                print('The', i, 'page-the', num, 'data has been downloaded!!!')
                num += 1
    print('The data has been downloaded and is up-to-date---')
    con.close()
更多推荐
已为社区贡献1条内容
所有评论(0)