Parsing HTML: Basic Usage of BeautifulSoup and XPath
Basic usage of BeautifulSoup and XPath
Basic usage of BeautifulSoup
- Import:
  from bs4 import BeautifulSoup
- Parse:
  soup1 = BeautifulSoup(html, 'html.parser')
  where html is a string, usually the text attribute of a response.
find and find_all
- find: for example
  tag = soup.find(name='div', attrs={"class":"tags"})
  returns a single Tag object: the first div tag whose class is tags.
- find_all: for example
  tag = soup.find_all(name='div', attrs={"class":"tags"})
  returns all the div tags whose class is tags, as a list.
- name= can be omitted. To match on the id attribute, write id='some-id'; for class, write class_='some-class' (the trailing underscore is needed because class is a Python keyword).
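A minimal runnable sketch of both calls; the HTML snippet below is invented for illustration:

from bs4 import BeautifulSoup

html = '''
<div class="tags">first</div>
<div class="tags">second</div>
<div class="other">ignored</div>
'''
soup = BeautifulSoup(html, 'html.parser')

first = soup.find(name='div', attrs={"class": "tags"})         # a single Tag: the first match
print(first.get_text())                                        # first

all_divs = soup.find_all(name='div', attrs={"class": "tags"})  # a list of Tag objects
print(len(all_divs))                                           # 2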
Getting content
- Attribute values: for example
  url = a.get('href')
  where a is a tag.
- Text content: for example
  content = a.get_text()
  where a is again a tag.
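A short sketch of both extraction calls, again on an invented snippet:

from bs4 import BeautifulSoup

html = '<a href="https://movie.douban.com/">douban movies</a>'
soup = BeautifulSoup(html, 'html.parser')

a = soup.find('a')
url = a.get('href')       # attribute value: 'https://movie.douban.com/'
content = a.get_text()    # text content: 'douban movies'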
The select method: locates elements the way a CSS selector does, which is a bit more convenient than find_all above.
- Find all a tags:
  all_a = soup.select('a')
- Find all div tags whose class is tags:
  tag = soup.select('div.tags')
- Find the div whose id is tag1:
  tag = soup.select('div#tag1')
  These all return lists as well.
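A sketch of the three selectors together, on an invented snippet:

from bs4 import BeautifulSoup

html = '''
<div class="tags"><a href="/a">one</a></div>
<div id="tag1"><a href="/b">two</a></div>
'''
soup = BeautifulSoup(html, 'html.parser')

all_a = soup.select('a')         # every a tag, as a list
tags = soup.select('div.tags')   # the divs whose class is tags
tag1 = soup.select('div#tag1')   # the div whose id is tag1, still a list
print(tag1[0].get_text())        # two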
children: gets all the child nodes of a tag:
childs = a.children
The result is an iterable of children. Note that not every item in it is a Tag; some are NavigableString nodes (stray text and whitespace), which usually need to be filtered out. The usual check is:
if isinstance(p, bs4.element.Tag):
where p is one item of the iteration.
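Putting the isinstance filter together in a runnable sketch (invented snippet):

import bs4
from bs4 import BeautifulSoup

html = '<div><p>one</p> stray text <p>two</p></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
for p in div.children:
    # skip the NavigableString nodes, keep only real tags
    if isinstance(p, bs4.element.Tag):
        print(p.name, p.get_text())   # prints "p one" then "p two"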
Basic usage of XPath and lxml
- Import:
  from lxml import etree
  On Windows you also need to install one extra thing: pip install pypiwin32
Basic XPath syntax
- //div[@class='tag']
  finds all the div tags whose class is tag.
- //div[@class='tag']/text()
  gets the text inside those tags.
- //link[@href]
  selects the link tags that have an href attribute; to get the attribute value itself, use //link/@href.
(A runnable sketch of these expressions follows after the next list.)
Using XPath in Python
- Parse:
  dom = etree.HTML(text)
  where text is a string.
- Query:
  content = dom.xpath("//span[@class='short']/text()")
  returns a list.
  date = dom.xpath("//span[@class='info']/span[@class='time']/text()")
  looks up a child node under a given parent node.
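A minimal runnable sketch of the syntax above and of these lookups; the HTML snippet and class names are invented for illustration:

from lxml import etree

text = '''
<html><head><link href="/style.css"/></head><body>
<div class="tag">hello</div>
<span class="info"><span class="time">2019-07-01</span></span>
</body></html>
'''
dom = etree.HTML(text)

print(dom.xpath("//div[@class='tag']/text()"))   # ['hello'], always a list
print(dom.xpath("//link/@href"))                 # ['/style.css'], the attribute value itself
print(dom.xpath("//span[@class='info']/span[@class='time']/text()"))   # ['2019-07-01'], child under parent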
Example: scraping the Douban short reviews of In the Heat of the Sun (阳光灿烂的日子)
Scraping with BeautifulSoup
from bs4 import BeautifulSoup
import requests
import csv
import time
'''
Scrape all the short reviews on Douban for the film
In the Heat of the Sun (阳光灿烂的日子), using BeautifulSoup.
'''
def getAllUrls(url_list):
    url = 'https://movie.douban.com/subject/1291875/comments?start='
    for page in range(0, 201, 20):  # 20 reviews per page
        url_list.append(url + str(page))

def getComment(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    # review text
    contentSpan = soup.select('span.short')
    content = []
    for c in contentSpan:
        content.append(c.get_text())
    # user names: the a tag inside each comment-info span
    userSpan = soup.select('span.comment-info')
    user = []
    for u in userSpan:
        user.append(u.find('a').get_text())
    # review dates
    dateSpan = soup.select('span.comment-time')
    date = []
    for d in dateSpan:
        date.append(d.get_text())
    infos = []
    for i in range(20):
        info = {}
        info['username'] = user[i]
        info['content'] = content[i]
        info['date'] = date[i]
        infos.append(info)
    return infos

def write2csv():
    url_list = []
    getAllUrls(url_list)
    with open('comments_by_bs4.csv', 'w', encoding='gbk', newline='', errors='ignore') as file:
        csvWriter = csv.writer(file)
        csvWriter.writerow(['用户名', '评论', '评论时间'])  # username, comment, comment time
        for url in url_list:
            infos = getComment(url)
            for info in infos:
                csvWriter.writerow([info['username'], info['content'], info['date'].strip()])

if __name__ == '__main__':
    start_time = time.time()
    write2csv()
    end_time = time.time()
    print(round(end_time - start_time, 2), '秒')  # elapsed seconds
Scraping with XPath and lxml
from lxml import etree
import requests
import csv
import time
'''
Scrape all the short reviews on Douban for the film
In the Heat of the Sun (阳光灿烂的日子), using lxml and XPath.
'''
def getAllUrls(url_list):
    url = 'https://movie.douban.com/subject/1291875/comments?start='
    for page in range(0, 201, 20):  # 20 reviews per page
        url_list.append(url + str(page))

def getComment(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    dom = etree.HTML(res.text)
    content = dom.xpath("//span[@class='short']/text()")
    user = dom.xpath("//span[@class='comment-info']/a/text()")
    # the trailing space in 'comment-time ' matches the page's class attribute
    # verbatim, since @class='...' is an exact string comparison
    date = dom.xpath("//span[@class='comment-info']/span[@class='comment-time ']/text()")
    infos = []
    for i in range(20):
        info = {}
        info['username'] = user[i]
        info['content'] = content[i]
        info['date'] = date[i]
        infos.append(info)
    return infos

def write2csv():
    url_list = []
    getAllUrls(url_list)
    with open('comments_by_xPath.csv', 'w', encoding='gbk', newline='', errors='ignore') as file:
        csvWriter = csv.writer(file)
        csvWriter.writerow(['用户名', '评论', '评论时间'])  # username, comment, comment time
        for url in url_list:
            infos = getComment(url)
            for info in infos:
                csvWriter.writerow([info['username'], info['content'], info['date'].strip()])

if __name__ == '__main__':
    start_time = time.time()
    write2csv()
    end_time = time.time()
    print(round(end_time - start_time, 2), '秒')  # elapsed seconds
I timed the two versions: the XPath one is clearly faster, roughly 1.5 times the speed of the BeautifulSoup one, and XPath also feels a bit smoother to use.