python爬取动漫网站的动漫

最近学了一下爬虫，就写段代码来试一下成果如何。（目的是爬取某动漫网站上的一部动漫）版本是 Python 3.7。`import requests; import re; from selenium import webdriver; import os; headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/53......`

amüsiert

1658人浏览 · 2020-03-05 02:04:29

amüsiert · 2020-03-05 02:04:29 发布

最近学了一下爬虫，就写段代码来试一下成果如何。（目的是爬取某动漫网站上的一部动漫《Darling in the Franxx》）
版本是 Python 3.7。

    
    import requests  
    import re
    from selenium import webdriver
    import os
    # Script: download every episode of "Darling in the Franxx" from imomoe.in.
    #
    # Flow:
    #   1. Fetch the series page and collect the per-episode player page paths.
    #   2. For each episode, open the player page in Chrome (via Selenium),
    #      switch into the "play2" iframe and read its source to find the
    #      direct .mp4 URL.
    #   3. Stream the video to DarlingInTheFranxx/DITF<n>.mp4, skipping
    #      episodes that are already on disk.

    # Browser-like User-Agent sent with the series page request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36   (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    url = "http://www.imomoe.in/view/7288.html"

    # The User-Agent must go in `headers=`; passing it as `params=` would put
    # it in the URL query string and send no custom header at all.
    response = requests.get(url, headers=headers)
    html = response.text

    # Paths of the individual player pages; the dot before "html" is escaped
    # so it only matches a literal ".".
    step1_name = re.findall(r"/player/7288-0-.*?\.html", html)

    save_dir = "DarlingInTheFranxx"
    # Create the folder if needed, but always continue into the download loop
    # so that a later run can resume where an earlier one stopped.
    os.makedirs(save_dir, exist_ok=True)

    # The series has 24 episodes; slicing (instead of indexing up to 24)
    # avoids an IndexError when the page yields fewer links than expected.
    for i, player_path in enumerate(step1_name[:24]):
        file_name = "DITF" + str(i + 1)
        target_path = save_dir + '/' + file_name + '.mp4'

        if os.path.exists(target_path):
            print("第" + str(i + 1) + '集已经存在')
            continue

        episode_url = 'http://imomoe.in' + player_path

        # Couldn't find a better way to get the iframe's source, so a real
        # browser is driven through Selenium to read it.
        browser = webdriver.Chrome()
        try:
            browser.get(episode_url)
            browser.switch_to.frame("play2")
            temp = browser.page_source
        finally:
            # Always close the browser, even when the page or frame fails to
            # load, so Chrome processes do not pile up.
            browser.quit()

        video_url = re.findall(r"https://.*?\.mp4", temp)
        if not video_url:
            print("第" + str(i + 1) + "集未找到视频地址，已跳过")
            continue

        print("正在下载第" + str(i + 1) + "集")
        # Download into a temporary ".part" file and rename it only after the
        # transfer finished; an interrupted download is then never mistaken
        # for a complete episode on the next run.
        partial_path = target_path + '.part'
        with requests.get(video_url[0], stream=True) as video_response:
            if not video_response.ok:
                print("第" + str(i + 1) + "集下载失败，已跳过")
                continue
            with open(partial_path, 'wb') as video_file:
                for chunk in video_response.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        video_file.write(chunk)
        os.replace(partial_path, target_path)
        print("已下载第" + str(i + 1) + "集")