python爬虫爬取ONCOKB数据库

python爬虫爬取ONCOKB数据库。目录：简介、代码。简介：爬取这个网站比较麻烦，它网速不快、又不能直接爬，所以我们用模拟浏览器先将网页保存到本地，然后再从本地爬取信息。（没想出好的方法，大家如果有什么高明的方法，欢迎指教！）步骤：1、你要自己准备好要爬取的gene列表（也就是代码中的 oncokb_gene_list_wait 文件）。例：也就是这里有的gene名。2、通过链接进去，下载...

毒鸡蛋

2267人浏览 · 2020-04-17 14:20:20

毒鸡蛋 · 2020-04-17 14:20:20 发布

python爬虫爬取ONCOKB数据库

简介
代码

简介

爬取这个网站比较麻烦：它网速不快，又不能直接爬，所以我们用模拟浏览器先将网页保存到本地，然后再从本地爬取信息。
（没想出好的方法，大家如果有什么高明的方法，欢迎指教！）

步骤：
1、你要自己准备好要爬取的gene列表（也就是代码中的 oncokb_gene_list_wait 文件）。
例：
在这里插入图片描述
也就是这里有的gene名

2、通过链接进去，下载网页，并从本地保存的网页爬取下图的文本信息。

代码

运行前需要自行安装并配置 selenium 和 Firefox 浏览器（以及对应的 geckodriver 驱动）。

import sys
import time,random

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from queue import Queue
from threading import Thread
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary


#-----------------------------------
# Firefox options shared by every browser instance that get_html_out starts.
# The '--headless' argument is passed so Firefox runs without a visible window.
options = Options()
options.add_argument('--headless')

# Encoding used both to save the downloaded page and to read it back, so the
# round trip does not depend on the platform's default text encoding.
_PAGE_ENCODING = "gbk"

# XPath of the element that must contain the text "Alteration" before the
# page is treated as fully loaded.
_READY_XPATH = (
    '//*[@id="panel-1"]/div/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/div[1]/span'
)


def _download_gene_page(gene_name, html_path):
    """Load the OncoKB page for *gene_name* in Firefox and save it to *html_path*.

    The page source is saved even when loading or waiting fails, so that the
    caller can still try to parse whatever was rendered (best effort).

    Raises:
        selenium.common.exceptions.WebDriverException: if the browser cannot
            be started or its page source cannot be read.
    """
    # selenium is already a dependency of this file; imported locally so the
    # helper is self-contained.
    from selenium.common.exceptions import WebDriverException

    url = 'https://www.oncokb.org/gene/{gene_name}'.format(gene_name=gene_name)
    browser = webdriver.Firefox(options=options)
    try:
        try:
            browser.get(url)
            # Wait up to 100 s, polling once per second, for the marker text.
            WebDriverWait(browser, 100, poll_frequency=1).until(
                EC.text_to_be_present_in_element(
                    (By.XPATH, _READY_XPATH), u'Alteration'))
            status = '写入成功\t'
        except WebDriverException:
            # Includes the wait timing out; keep going and save what we have.
            status = "Loading took too much time!\t"
        page_source = browser.page_source
    finally:
        # Always shut the browser down, even if reading the page failed.
        browser.quit()

    with open(html_path, 'wb') as html_file:
        # "ignore" drops characters that cannot be represented in the encoding.
        html_file.write(page_source.encode(_PAGE_ENCODING, "ignore"))
    print(status, url)


def _group_rows(cells, levels):
    """Chunk the flat list of cell texts into table rows.

    When *levels* is non-empty the table is treated as having 5 columns and
    the 4th column of each row is replaced by the matching entry of *levels*;
    otherwise the table is treated as having 4 columns.

    Raises:
        ValueError: if there are fewer level entries than rows.
        IndexError: if a 5-column row has fewer than four cells.
    """
    width = 5 if levels else 4
    rows = [cells[start:start + width] for start in range(0, len(cells), width)]
    if levels:
        if len(levels) < len(rows):
            raise ValueError(
                'found {0} level icons for {1} table rows'.format(
                    len(levels), len(rows)))
        # Extra trailing level entries are ignored, as before.
        for row, level in zip(rows, levels):
            row[3] = level
    return rows


def _write_gene_table(gene_name, html_path, txt_path):
    """Parse the saved page at *html_path* and write its rows to *txt_path*.

    Each output line is the gene name followed by the row's cells, separated
    by tabs. Each row is also echoed to stdout.
    """
    with open(html_path, encoding=_PAGE_ENCODING) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    cells = [cell.text for cell in soup.find_all(name="div", attrs="rt-td")]
    # NOTE(review): assumes the third CSS class of each "level-icon" element
    # is the one naming the level -- confirm against the live page markup.
    levels = [icon.attrs['class'][2]
              for icon in soup.find_all(name="i", attrs="level-icon")]

    # Build every row before opening the output so that a parsing error does
    # not leave a half-written file behind.
    rows = _group_rows(cells, levels)

    with open(txt_path, 'w') as out_file:
        for row in rows:
            print(gene_name, '\t', row)
            out_file.write(gene_name + '\t' + '\t'.join(row) + '\n')
    print('\n')


def get_html_out(in_q):
    """Worker: download and parse one OncoKB gene page per queued gene name.

    For every gene name taken from *in_q* the rendered page is saved as
    ``1/<gene>.html`` and the extracted table as ``1/<gene>.txt``. A failure
    for one gene is reported on stdout and does not stop the worker.

    Args:
        in_q: a ``queue.Queue`` of gene-name strings. ``task_done()`` is
            called exactly once for every item taken, so ``in_q.join()`` in
            the caller always returns.
    """
    # Imported locally: at module level this file imports only Queue.
    from queue import Empty
    import os

    # The output directory is not created anywhere else in this file.
    os.makedirs("1", exist_ok=True)

    # Random start-up delay (presumably to stagger the workers' first request).
    time.sleep(random.randint(5, 10))

    while True:
        # get_nowait() instead of empty()+get(): with several workers, another
        # thread can take the last item between the two calls, which would
        # leave this worker blocked forever.
        try:
            gene_name = in_q.get_nowait()
        except Empty:
            return

        try:
            html_path = "1/{gene_name}.html".format(gene_name=gene_name)
            txt_path = "1/{gene_name}.txt".format(gene_name=gene_name)
            _download_gene_page(gene_name, html_path)
            _write_gene_table(gene_name, html_path, txt_path)
        except Exception as exc:
            # Worker boundary: report and move on to the next gene.
            print('error\t', gene_name, '\t', exc)
        finally:
            in_q.task_done()
    
#-----------------------------------
# Main script: queue every gene name from the input file, then start worker
# threads that drain the queue.
queue = Queue()
result_queue = Queue()  # NOTE(review): not used anywhere in the visible code

with open('oncokb_gene_list_wait') as f_in:
    # One gene name per line. Blank lines (including the empty string that a
    # trailing newline would otherwise produce) are skipped so that no request
    # is made for an empty gene name.
    gene_list = [line.strip() for line in f_in if line.strip()]

for gene_name in gene_list:
    # Despite its name, `url` holds the bare gene name; the worker builds the
    # real URL itself.
    url = '{gene_name}'.format(gene_name=gene_name)
    queue.put(url)
print('queue 开始大小 %d' % queue.qsize())

for index in range(5):
    thread = Thread(target=get_html_out, args=(queue,))
    thread.start()
queue.join()  # blocks until task_done() has been called for every queued item

# Echo the last queued gene; guarded because both names are unbound when the
# gene list is empty.
if gene_list:
    print(gene_name, '\t', url)