Scraping Lagou job listings with a Python crawler
Without further ado, straight to the code!
import math
import random
import requests
from pyquery import PyQuery as pq
from urllib.parse import quote
import pandas as pd

PROXY_POOL_URL = 'http://localhost:5555/random'  # uses the https://github.com/Python3WebSpider/ProxyPool proxy pool
PROXY_POOL_SIZE = 5
proxies = None
job_info_file = "C:/Users/86135/Desktop/job_info.csv"
company_info_file = "C:/Users/86135/Desktop/company_info.csv"
comment_info_file = "C:/Users/86135/Desktop/comment_info.csv"
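
# Note (assumption): comment_info_file is appended to with header=False further
# down, so it helps to create it once with a header row before the first run, e.g.:
#   pd.DataFrame(columns=['job_id', 'content']).to_csv(comment_info_file, index=False)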
'''
Builds, from the given filter conditions, the page_url of the search page and
the ajax_url that returns the position-list json.
'''
def combine_page_and_ajax_url(job_type: str, **condition):
    page_url = 'https://www.lagou.com/jobs/list_' + job_type
    ajax_url = r'https://www.lagou.com/jobs/positionAjax.json'
    # work city (defaults to 全国, i.e. nationwide)
    city = '全国'
    if 'city' in condition:
        city = condition['city']
    page_url = page_url + '?city=' + city
    ajax_url = ajax_url + '?city=' + city
    # sort order
    px = 'default'
    if 'px' in condition:
        px = condition['px']
    page_url = page_url + '&px=' + px
    ajax_url = ajax_url + '&px=' + px
    # company size: 1 = fewer than 15 people, 2 = 15-50 people, and so on;
    # multiple choices are joined with underscores, e.g. picking 1 and 2 gives _1_2
    if 'gm' in condition:
        page_url = page_url + '&gm=' + condition['gm']
        ajax_url = ajax_url + '&gm=' + condition['gm']
    # funding stage
    if 'jd' in condition:
        page_url = page_url + '&jd=' + condition['jd']
        ajax_url = ajax_url + '&jd=' + condition['jd']
    # job nature: full-time, part-time, internship
    if 'gx' in condition:
        page_url = page_url + '&gx=' + condition['gx']
        ajax_url = ajax_url + '&gx=' + condition['gx']
    # work experience; isSchoolJob 1 = fresh graduates
    if 'gj' in condition:
        page_url = page_url + '&gj=' + condition['gj']
        ajax_url = ajax_url + '&gj=' + condition['gj']
    # education level
    if 'xl' in condition:
        page_url = page_url + '&xl=' + condition['xl']
        ajax_url = ajax_url + '&xl=' + condition['xl']
    # industry
    if 'hy' in condition:
        page_url = page_url + '&hy=' + condition['hy']
        ajax_url = ajax_url + '&hy=' + condition['hy']
    page_url = page_url + '#filterBox'
    ajax_url = ajax_url + '&needAddtionalResult=false&isSchoolJob=1'
    # percent-encode the Chinese characters so the URLs are not garbled
    page_url = quote(page_url, safe=':/?&=#')
    ajax_url = quote(ajax_url, safe=':/?&=#')
    print(page_url)
    print(ajax_url)
    return page_url, ajax_url
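
# Illustrative output of combine_page_and_ajax_url (the Chinese values are
# percent-encoded by quote()):
#   >>> combine_page_and_ajax_url(job_type='java', city='杭州')
#   https://www.lagou.com/jobs/list_java?city=%E6%9D%AD%E5%B7%9E&px=default#filterBox
#   https://www.lagou.com/jobs/positionAjax.json?city=%E6%9D%AD%E5%B7%9E&px=default&needAddtionalResult=false&isSchoolJob=1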
'''
Pages through the ajax results and collects the basic job and company data
matching the filter conditions.
'''
def get_job_and_company_urls(page_url, ajax_url, job_type):
    # job and company fields we want to collect
    jobs_id = []
    companies_id = []
    jobs_name = []
    jobs_advantage = []
    jobs_salary = []
    jobs_publish_time = []
    companies_name = []
    companies_labels = []
    companies_size = []
    show_id = None
    JOBS_COUNT_ONE_PAGE = 15
    remain_page_count = -1
    page_number = 1
    first = 'true'
    user_agent = get_user_agent()
    session = requests.session()
    # visit the search page first so the session carries the cookies
    # the ajax endpoint expects
    r, session = request_page_result(page_url, session, user_agent)
    while remain_page_count != 0:
        # form data of the ajax request
        data = {
            'first': first,
            'pn': page_number,
            'kd': job_type
        }
        ajax_result = request_ajax_result(ajax_url, page_url, session, user_agent, data)
        result_json = ajax_result.json()
        position_result = result_json['content']['positionResult']
        # on the first pass through the loop, read the total number of jobs (totalCount) from the json
        if remain_page_count == -1:
            show_id = result_json['content']['showId']
            print('showId ', show_id)
            print("type of result", type(position_result))
            total_count = position_result['totalCount']
            # no job matches the conditions: return right away
            if total_count == 0:
                return
            remain_page_count = math.ceil(total_count / JOBS_COUNT_ONE_PAGE)
        result = position_result['result']
        for item in result:
            position_id = item['positionId']
            job_name = item['positionName']
            job_advantage = item['positionAdvantage']
            job_salary = item['salary']
            publish_time = item['createTime']
            company_id = item['companyId']
            company_name = item['companyFullName']
            company_labels = item['companyLabelList']
            company_size = item['companySize']
            jobs_id.append(position_id)
            jobs_name.append(job_name)
            jobs_advantage.append(job_advantage)
            jobs_salary.append(job_salary)
            jobs_publish_time.append(publish_time)
            companies_name.append(company_name)
            companies_id.append(company_id)
            companies_labels.append(company_labels)
            companies_size.append(company_size)
        remain_page_count = remain_page_count - 1
        page_number = page_number + 1
        first = 'false'
    # store the basic job and company information into csv files
    job_df = pd.DataFrame(
        {'job_id': jobs_id, 'job_name': jobs_name, 'job_advantage': jobs_advantage, 'salary': jobs_salary,
         'publish_time': jobs_publish_time, 'company_id': companies_id})
    company_df = pd.DataFrame({'company_id': companies_id, 'company_name': companies_name, 'labels': companies_labels,
                               'size': companies_size})
    job_df.to_csv(job_info_file, mode='w', header=True, index=False)
    company_df.to_csv(company_info_file, mode='w', header=True, index=False)
    return show_id
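
# Paging sketch: with totalCount = 95 and 15 jobs per page, the loop above
# requests ceil(95 / 15) = 7 pages, bumping 'pn' from 1 to 7 and flipping
# 'first' to 'false' after the first page.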
'''
Visits each job page using the stored job_id values and writes the job
descriptions and interview reviews to csv files.
'''
def get_and_store_job_info(show_id: str):
    jobs_detail = []
    PAGE_SIZE = 500
    # read the job_id column from the csv file; each id is combined with
    # show_id to form a job page url
    df = pd.read_csv(job_info_file)
    jobs_id = df['job_id']
    for job_id in jobs_id:
        # per-job review buffers (reset each iteration so reviews are not duplicated)
        comments_content = []
        comments_time = []
        users_id = []
        company_scores = []
        interviewer_scores = []
        describe_scores = []
        comprehensive_scores = []
        useful_counts = []
        tags = []
        user_agent = get_user_agent()
        session = requests.session()
        job_page_url = 'https://www.lagou.com/jobs/' + str(job_id) + '.html?show=' + show_id
        # visit the job page to get the job description
        r, session = request_page_result(url=job_page_url, session=session, user_agent=user_agent)
        doc = pq(r.text)
        job_detail = doc('#job_detail > dd.job_bt > div').text()
        print("job_detail", job_detail)
        jobs_detail.append(job_detail)
        # fetch the interview reviews
        review_ajax_url = 'https://www.lagou.com/interview/experience/byPosition.json'
        data = {
            'positionId': job_id,
            'pageSize': PAGE_SIZE,
        }
        response = request_ajax_result(review_ajax_url, job_page_url, session, user_agent, data)
        response_json = response.json()
        print("response json", response_json)
        if response_json['content']['data']['data']['totalCount'] != 0:
            result = response_json['content']['data']['data']['result']
            for item in result:
                comment_content = item['content']
                comment_time = item['createTime']
                user_id = item['userId']
                company_score = item['companyScore']
                interviewer_score = item['interviewerScore']
                describe_score = item['describeScore']
                comprehensive_score = item['comprehensiveScore']
                useful_count = item['usefulCount']
                tag = item['tags']
                print("content", comment_content)
                comments_content.append(comment_content)
                comments_time.append(comment_time)
                users_id.append(user_id)
                company_scores.append(company_score)
                interviewer_scores.append(interviewer_score)
                describe_scores.append(describe_score)
                comprehensive_scores.append(comprehensive_score)
                useful_counts.append(useful_count)
                tags.append(tag)
        # append this job's reviews to the comment csv
        j_ids = [job_id] * len(comments_content)
        comment_df = pd.DataFrame({'job_id': j_ids, 'content': comments_content})
        comment_df.to_csv(comment_info_file, mode='a', header=False, index=False)
    # store the scraped job descriptions back into the job csv file
    df['job_detail'] = jobs_detail
    df.to_csv(job_info_file, index=False)
def request_page_result(url, session, user_agent):
    try:
        r = session.get(url, headers={
            'User-Agent': user_agent
        })
    except requests.exceptions.ConnectionError as e:
        # requests wraps urllib3's MaxRetryError in ConnectionError, so this
        # is the exception that actually surfaces; retry through a proxy
        print(e)
        user_agent = get_user_agent()
        r = session.get(url, headers={
            'User-Agent': user_agent
        }, proxies=get_proxy())
    return r, session

def request_ajax_result(ajax_url, page_url, session, user_agent, data):
    try:
        result = session.post(ajax_url, headers=get_ajax_header(page_url, user_agent), data=data,
                              allow_redirects=False)
    except requests.exceptions.ConnectionError as e:
        # same retry-through-proxy fallback as request_page_result
        print(e)
        user_agent = get_user_agent()
        result = session.post(ajax_url, headers=get_ajax_header(page_url, user_agent),
                              proxies=get_proxy(),
                              data=data, allow_redirects=False)
    return result
'''
Header used when requesting Lagou pages.
'''
def get_page_header():
    page_header = {
        'User-Agent': get_user_agent()
    }
    return page_header
'''
Header used when requesting Lagou's ajax json endpoints.
'''
def get_ajax_header(url, user_agent):
    # Content-Length is left out on purpose: requests computes it from the
    # form data, and a hardcoded value would be wrong for most requests
    ajax_header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.lagou.com',
        'Origin': 'https://www.lagou.com',
        'Referer': url,
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': user_agent,
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': 'None',
        'X-Requested-With': 'XMLHttpRequest'
    }
    return ajax_header
def get_user_agent():
    user_agent = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 "
        "Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6"
    ]
    return random.choice(user_agent)
def get_proxy():
    response = requests.get(PROXY_POOL_URL)
    if response.status_code == 200:
        print(response.text)
        # requests expects a dict mapping scheme to proxy url, while the
        # pool returns a bare "host:port" string
        proxy = response.text.strip()
        return {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    return None
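
# Illustrative (assumes the pool's /random endpoint returns a bare "host:port"
# string such as "127.0.0.1:8888"):
#   >>> get_proxy()
#   {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}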
if __name__ == '__main__':
    page_url, ajax_url = combine_page_and_ajax_url(job_type='java', city='杭州', gx='全职', xl='本科', hy='移动互联网')
    show_id = get_job_and_company_urls(page_url, ajax_url, 'java')
    # get_job_and_company_urls returns None when nothing matched
    if show_id:
        get_and_store_job_info(show_id)
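
Once the crawl finishes, the three csv files can be inspected with pandas. A minimal sketch (the column names match the DataFrames built above; company_id is written into both files, so it serves as the merge key):

import pandas as pd

job_df = pd.read_csv("C:/Users/86135/Desktop/job_info.csv")
company_df = pd.read_csv("C:/Users/86135/Desktop/company_info.csv")
# join each job with its company and peek at the combined table
merged = job_df.merge(company_df, on='company_id')
print(merged[['job_name', 'salary', 'company_name', 'size']].head())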
Run result: (screenshot from the original post not preserved)