自己捣鼓了几天写的代码,基本上把51job的岗位相关的数据都爬下来了,可以视要求自行增减,代码虽然有些简陋,不过我爬取的时候没报什么错。代码适合初学者学习使用,废话不多说,代码如下:

from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from time import sleep
import pymysql
import re

class Crawler:
    """Selenium crawler that copies 51job search-result rows into MySQL.

    A Chrome session is opened on construction. ``getUrl`` walks every
    (city, keyword) pair, ``scrapingData`` pages through one result list,
    ``getData`` reads the rows of the page currently shown and ``saveData``
    inserts them into the ``ods_51job_job`` table.
    """

    # Fixed pieces of the search-result URL; city code, keyword and page
    # number are spliced in between them by ``_search_url``.
    _URL_PREFIX = 'https://search.51job.com/list/'
    _URL_FILTERS = ',000000,0000,00,9,99,'
    _URL_SUFFIX = '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

    def __init__(self, db_host="localhost", db_user="root", db_password="123456", db_name="51job"):
        """Start Chrome and remember the MySQL connection settings.

        Args:
            db_host: MySQL host name.
            db_user: MySQL user name.
            db_password: MySQL password.
            db_name: Database that contains the ``ods_51job_job`` table.

        The defaults are the values that used to be hard-coded, so
        ``Crawler()`` keeps working unchanged.
        """
        self.wd = webdriver.Chrome()
        # Every find_element call polls for up to 20 seconds before it
        # raises NoSuchElementException.
        self.wd.implicitly_wait(20)
        self.DBHOST = db_host
        self.DBUSER = db_user
        self.DBPASS = db_password
        self.DBNAME = db_name

    @staticmethod
    def _split_summary(text):
        """Split the '|'-separated summary field of one job row.

        Returns:
            A ``(city, experience, education, recruits_Number)`` tuple.
            Four parts map one-to-one. With three parts the education is
            taken to be the missing one, with two parts both experience and
            education are. Any other number of parts yields four ``None``.
            The parts are returned exactly as split (not stripped).
        """
        parts = text.split('|')
        if len(parts) == 4:
            return parts[0], parts[1], parts[2], parts[3]
        if len(parts) == 3:
            return parts[0], parts[1], None, parts[2]
        if len(parts) == 2:
            return parts[0], None, None, parts[1]
        return None, None, None, None

    @staticmethod
    def _search_url(city, keyword, page):
        """Build the search-result URL for a city code, keyword and page."""
        return (Crawler._URL_PREFIX + str(city) + Crawler._URL_FILTERS
                + keyword + ',2,' + str(page) + Crawler._URL_SUFFIX)

    def _find(self, css):
        """Return the first element matching the CSS selector ``css``."""
        return self.wd.find_element(By.CSS_SELECTOR, css)

    def getData(self, len_Css):
        """Read the job rows of the page currently shown in the browser.

        Args:
            len_Css: One more than the number of rows to read; rows
                ``1 .. len_Css - 1`` are addressed through ``:nth-child``.

        Returns:
            A list with one entry per row, each a list
            ``[job_name, company_name, city, experience, education,
            recruits_Number, release_Date, salary, company_type,
            job_ex_url, company_url]``.
        """
        rows = []
        for i in range(1, len_Css):
            # Common selector prefix of the i-th row of the result list.
            row_css = 'div.j_joblist > div:nth-child({0}) '.format(i)

            job_name = self._find(row_css + 'span.jname.at').text
            # The company link carries both the name and the company URL,
            # so it is looked up once and reused.
            company = self._find(row_css + 'a.cname.at')
            company_name = company.text
            # City | experience | education | head-count; some postings
            # leave out the education and/or the experience part.
            city, experience, education, recruits_Number = self._split_summary(
                self._find(row_css + 'span.d.at').text)
            release_Date = self._find(row_css + 'span.time').text
            # Company benefits are deliberately not collected: rows without
            # a benefits element make the lookup wait for the whole implicit
            # timeout. To re-enable:
            # if self.NoExists(row_css + 'p.tags'):
            #     welfare = self._find(row_css + 'p.tags').get_attribute("title")
            # else:
            #     welfare = None
            # The salary element may exist with empty text; store None then.
            salary = self._find(row_css + 'span.sal').text or None
            company_type = self._find(row_css + 'p.int.at').text
            job_ex_url = self._find(row_css + 'a.el[target=_blank]').get_attribute("href")
            company_url = company.get_attribute("href")

            rows.append([job_name, company_name, city, experience, education, recruits_Number,
                         release_Date, salary, company_type, job_ex_url, company_url])
        return rows

    def saveData(self, rows):
        """Insert the scraped rows into the ``ods_51job_job`` table.

        Args:
            rows: List of 11-item rows as produced by ``getData``.

        A row rejected by the database (data or integrity error) is reported
        and skipped so that the remaining rows of the page are still stored.
        Any other pymysql error is printed and ends the batch. All accepted
        rows are committed together at the end.
        """
        if not rows:
            # Nothing to store, so do not open a connection at all.
            return
        sql = "INSERT INTO ods_51job_job(job_name, company_name, job_city, job_experience, job_education, recruits_Number, release_Date, salary, company_type, job_ex_url, company_url) " \
              "VALUE (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        db = pymysql.connect(host=self.DBHOST, user=self.DBUSER, password=self.DBPASS, database=self.DBNAME)
        try:
            cur = db.cursor()
            try:
                for row in rows:
                    try:
                        cur.execute(sql, row)
                    except (pymysql.DataError, pymysql.IntegrityError) as e:
                        # Row-specific problem: report it, keep going.
                        print(e)
                db.commit()
            finally:
                cur.close()
        except pymysql.Error as e:
            print(e)
        finally:
            db.close()

    def scrapingData(self, City, keyWord, start_Page):
        """Scrape and store every result page from ``start_Page`` onwards.

        The browser must already show page ``start_Page`` of the result
        list. One page is scraped and stored per iteration; then the "next"
        link is clicked until the last page has been handled.

        Args:
            City: City code, used only in the progress output.
            keyWord: Search keyword, used only in the progress output.
            start_Page: Number of the page currently shown.

        Raises:
            IndexError: If the pager text contains no number.
        """
        wait = WebDriverWait(self.wd, 20, 0.5)

        # The total page count is the first number in the pager's text.
        pager_text = self._find(
            'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_page > div > div > div > span:nth-child(1)').text
        condition = int(re.findall(r'\d+', pager_text)[0])

        sleep(2)
        print('城市编号:%s  关键词:%s  总页数:%d' % (City, keyWord, condition))

        while start_Page <= condition:
            # All job rows of the current page (usually 50).
            pubCss = wait.until(EC.presence_of_all_elements_located((
                By.CSS_SELECTOR,
                'body > div:nth-child(4) > div.j_result > div > div.leftbox > div:nth-child(4) > div.j_joblist > div.e')))
            self.saveData(self.getData(len(pubCss) + 1))
            print('\t已爬取第%d页;' % start_Page)

            if start_Page >= condition:
                print('已爬取完当前城市关键词!')
                break

            # Not the last page yet: move on to the next one.
            self._find('li.next a[style="cursor: pointer;"]').click()
            self.wd.refresh()
            start_Page += 1
            sleep(2)

    def NoExists(self, Css):
        """Return True if an element matching ``Css`` EXISTS, else False.

        NOTE: the result is the opposite of what the method name suggests;
        the name is kept for compatibility. When nothing matches, the call
        only returns after the implicit wait set in ``__init__`` has
        elapsed.
        """
        try:
            self.wd.find_element(By.CSS_SELECTOR, Css)
            return True
        except NoSuchElementException:
            return False

    def getUrl(self, workCity, startPage, keywords):
        """Crawl every keyword for every city.

        Args:
            workCity: City codes to crawl.
            startPage: Page to start from for the first (city, keyword)
                pair; later pairs start from page 1.
            keywords: Search keywords (strings).

        To resume an interrupted crawl, pass only the remaining cities and
        keywords together with the page to continue from.
        """
        for city in workCity:
            for word in keywords:
                self.wd.get(self._search_url(city, word, startPage))
                self.scrapingData(city, word, startPage)
                # A resume offset applies to the first pair only.
                if startPage > 1:
                    startPage = 1
# Codes of popular cities (city name: code to put into the `cities` list below)
# 北京 Beijing: 010000, 上海 Shanghai: 020000, 广州 Guangzhou: 030200, 深圳 Shenzhen: 040000, 武汉 Wuhan: 180200,
# 西安 Xi'an: 200200, 杭州 Hangzhou: 080200, 南京 Nanjing: 070200, 成都 Chengdu: 090200, 重庆 Chongqing: 060000,
# 东莞 Dongguan: 030800, 大连 Dalian: 230300, 沈阳 Shenyang: 230200, 苏州 Suzhou: 070300, 昆明 Kunming: 250200,
# 长沙 Changsha: 190200, 合肥 Hefei: 150200, 宁波 Ningbo: 080300, 郑州 Zhengzhou: 170200, 天津 Tianjin: 050000,
# 青岛 Qingdao: 120300, 哈尔滨 Harbin: 220200, 长春 Changchun: 240200, 福州 Fuzhou: 110200, 珠三角 Pearl River Delta: 01

if __name__ == '__main__':
    # City codes and keywords to crawl; start_page is the result page the
    # first (city, keyword) pair starts from.
    cities = ['040000', '080200', '070200', '190200', '090200', '180200']
    # Each keyword is listed once: a repeated keyword would be crawled and
    # inserted into the database a second time.
    keyword = ['大数据', 'python', '爬虫', 'Hadoop', '数据分析师']
    start_page = 1

    a = Crawler()
    try:
        a.getUrl(cities, start_page, keyword)
    finally:
        # Always shut down Chrome and its driver process, including when
        # the crawl stops with an error (e.g. a timeout).
        a.wd.quit()

         上面的代码里，公司福利的数据被我注释掉了：基本每页都有几条没有公司福利的岗位，而代码里设置了20秒的隐式等待（implicitly_wait(20)），每次定位不到福利元素都要等满20秒才会抛出异常，爬取大量数据的时候太煎熬了，干脆不要了。另外，css路径我都是直接复制的，好多都还可以再删减优化，不过我比较懒；也可以换成xpath路径，会更精简。最后，数据库需要自己建表，连接的时候注意改下代码里的连接参数，还有sql里的表名和字段名称就行，还是比较简单的。

        我自己运行代码的时候出错一般都是爬了很久后报timeout错误,可以把等待时间稍微加长点,不过估计爬多了也还会报错,毕竟51job虽然很随便但爬多了也会反爬,只是不像boss直聘爬了几千条数据就封ip两天那么狠(表示被封过好几次😤),最后就是出错了需要手动重新更改参数继续爬,有些麻烦,还能再改进,不过我懒得改了,反正估计也没多少人看,自己能用就行啦。

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐