逻辑思路是什么?

1. 获取页面

2. 处理页面,提取信息

3. 格式输出

先走面向过程编程:

1. 要定义3个函数,对应以上三个过程

2. 在 `if __name__ == '__main__':` 入口代码块中传入参数,并依次执行以上三个过程

#!/usr/bin/python3

import bs4

import requests

from bs4 import BeautifulSoup

def getHTMLText(url):
    """Download the page at ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures are treated as "no page"; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""

def fillUnivList(ulist, html):
    """Extract table rows from ``html`` and append them to ``ulist``.

    Each appended entry is a three-item list holding the ``.string`` of the
    first, second and fourth ``<td>`` cells of a row inside the first
    ``<tbody>`` element.

    Args:
        ulist: List that receives the extracted rows; it is mutated in place.
        html: HTML source of the page. When it contains no ``<tbody>``
            (for example the empty string returned after a failed
            download), ``ulist`` is left unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Nothing to parse; avoid AttributeError on None.children.
        return
    for tr in tbody.children:
        # .children also yields text nodes between rows; keep only tags.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            # A row with fewer than four cells cannot supply index 3.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])

def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; each row is indexable with at least three
            items (rank, school name, total score).
        num: Maximum number of rows to print. If ``ulist`` holds fewer rows,
            all of them are printed; zero or a negative value prints only
            the header.
    """
    # Field 3 supplies the fill character for the centred name column.
    # chr(12288) is U+3000 (ideographic space), a full-width space, used so
    # padding matches the width of the CJK characters in the names.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名", "学校名称", "总分", fill))
    # Slice instead of indexing by range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing off the tail.
    for u in ulist[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], fill))

if __name__ == '__main__':
    # Script entry point: download the ranking page, parse its table and
    # print the first 20 universities.
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    rows = []
    page = getHTMLText(ranking_url)
    fillUnivList(rows, page)
    printUnivList(rows, 20)

如何走向面向对象?

1. 输入: url + 想要获得的信息条数

2. 输出: 格式化信息

3. 获取页面和处理页面应定义为私有方法,不应该对外暴露

#!/usr/bin/python3

import requests

import bs4

from bs4 import BeautifulSoup

class SchoolMessage(object):
    """Scrape a university ranking page and print its first rows.

    Args:
        url: Address of the ranking page.
        number: Maximum number of rows to print.
    """

    def __init__(self, url, number):
        self.url = url
        self.number = number

    def __get_html(self):
        """Download ``self.url``.

        Returns:
            The decoded page text, or None when the request fails
            (connection problem, timeout, or an HTTP error status).
        """
        try:
            r = requests.get(self.url, timeout=30)
            r.raise_for_status()
            # Decode with the encoding guessed from the body content.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            return None

    def __get_message(self):
        """Parse the downloaded page into rows.

        Returns:
            A list of ``[rank, name, score]`` rows, or None when the page
            could not be downloaded or contains no ``<tbody>``.
        """
        html = self.__get_html()
        if html is None:
            return None
        soup = BeautifulSoup(html, 'html.parser')
        tbody = soup.find('tbody')
        if tbody is None:
            # No table body to read; report it as a failed scrape.
            return None
        info = []
        for row in tbody.children:
            # .children also yields text nodes between rows; keep only tags.
            if not isinstance(row, bs4.element.Tag):
                continue
            tds = row('td')
            if len(tds) < 4:
                continue
            # NOTE(review): this previously read tds[2] although the column
            # is printed under the "总分" header; index 3 is the cell the
            # procedural version of this scraper reads for the same URL.
            # Confirm against the live page layout.
            info.append([tds[0].string, tds[1].string, tds[3].string])
        return info

    def get_message(self):
        """Print a header plus at most ``self.number`` rows.

        Prints '爬取失败' instead when no data could be obtained.
        """
        info = self.__get_message()
        if info is None:
            print('爬取失败')
            return
        # Field 3 supplies the fill character for the centred name column.
        # chr(12288) is U+3000 (ideographic space), a full-width space.
        temp = "{0:^10}\t{1:{3}^10}\t{2:^10}"
        fill = chr(12288)
        print(temp.format("排名", "学校名称", "总分", fill))
        # Bounded slice: a page with fewer rows than requested must not
        # raise IndexError, and a negative count prints no rows.
        for u in info[:max(self.number, 0)]:
            print(temp.format(u[0], u[1], u[2], fill))

if __name__ == '__main__':
    # Script entry point: print the first 10 entries of the ranking page.
    SchoolMessage(
        'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html',
        10,
    ).get_message()

所需要的环境:

python 3.5

requests 库

beautifulsoup4 库(导入名为 bs4)

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐