
A Few Web-Scraping Examples of My Own

2019-10-15 19:26

Original content. If you repost it, please credit the source: https://www.cnblogs.com/sogeisetsu/

Scraping the Chinese University Rankings

# Only requests and bs4 are used here
# Scrape the university rankings
import requests
from bs4 import BeautifulSoup as bs

def gethtml(url):
    # Fetch the page and let requests guess the right encoding
    demo = requests.get(url)
    demo.encoding = demo.apparent_encoding
    return demo.text

def listhtml(ulist, html):
    # Every <tr> inside <tbody> is one university: rank, name, province
    soup = bs(html, "html.parser")
    tbody = soup.tbody
    for tr in tbody("tr"):              # tag("tr") is shorthand for tag.find_all("tr")
        tds = tr("td")
        ulist.append([tds[0].string, tds[1].string, tds[2].string])

def pmhtml(ulist, num):
    # chr(12288) is the full-width space, so the CJK name column stays aligned
    print('{0:^10}\t{1:{3}^10}\t{2:^10}'.format("Rank", "University", "Province", chr(12288)))
    for i in ulist[0:num]:
        print("{0:^10}\t{1:{3}^10}\t{2:^10}".format(i[0], i[1], i[2], chr(12288)))

if __name__ == "__main__":
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    html = gethtml(url)
    uinfo = []
    listhtml(uinfo, html)
    num = int(input("How many rows to print? "))
    pmhtml(uinfo, num)
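
The parsing above leans on a BeautifulSoup shorthand: calling a tag, as in tr("td"), is equivalent to tr.find_all("td"). A minimal, self-contained sketch of the same table-parsing pattern, run against an inline HTML snippet instead of the live ranking page:

from bs4 import BeautifulSoup

html = """
<table><tbody>
<tr><td>1</td><td>清华大学</td><td>北京</td></tr>
<tr><td>2</td><td>北京大学</td><td>北京</td></tr>
</tbody></table>
"""
soup = BeautifulSoup(html, "html.parser")
rows = []
for tr in soup.tbody("tr"):                 # tag("tr") == tag.find_all("tr")
    rows.append([td.string for td in tr("td")])
print(rows)   # [['1', '清华大学', '北京'], ['2', '北京大学', '北京']]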

Scraping Toutiao's Hot News Feed

# This one scrapes a dynamic page (Toutiao)
# The page source itself contains no content, only JavaScript;
# the actual data comes from a JSON API found via the F12 Network tab
import requests
import json

def gethtml(url):
    try:
        head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Cookie': '',  # use your own cookie here
        }
        r = requests.get(url, headers=head)
        return r.text  # the response body is JSON
    except Exception:
        print("oneerror")

def getulist(html, ulist):
    try:
        soup = json.loads(html)
        for one in soup['data']:
            a = 'https://www.toutiao.com/a' + one['group_id']
            ulist.append([one['title'], one['abstract'], a])
    except Exception:
        print("twoerror")

def printulist(ulist):
    for i in ulist:
        print("Title:\n{}".format(i[0]))
        print("Abstract:\n{}".format(i[1]))
        print("Link: {}".format(i[2]))

if __name__ == "__main__":
    url = "https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1D5FDAAB5194AE&cp=5DA599948ACE7E1&_signature=TBeQ-wAAEbOkzbKGAd3hQUwXkO"
    html = gethtml(url)
    relist = []
    getulist(html, relist)
    printulist(relist)
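
Because this endpoint returns JSON rather than HTML, bs4 is not needed at all; requests can also decode the body directly via r.json(), which gives the same result as json.loads(r.text). A minimal sketch of just the fetch-and-decode step (the User-Agent here is a placeholder, and the as/cp/_signature query parameters in the URL above expire, so fresh values from the F12 Network tab are assumed):

import requests

def fetch_feed(url):
    head = {'User-Agent': 'Mozilla/5.0'}  # placeholder; add your own Cookie if the API rejects the request
    r = requests.get(url, headers=head)
    r.raise_for_status()                  # fail loudly instead of silently returning None
    return r.json()                       # equivalent to json.loads(r.text)

# Each item in fetch_feed(url)['data'] should then carry the
# 'title', 'abstract', and 'group_id' fields used above.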

Scraping Zhihu's Hot List

# Scrape Zhihu's hot list
import requests
from bs4 import BeautifulSoup as bs

def gethtml(url):
    headers = {
        'user-agent': 'Mozilla/5.0',
        'Cookie': '',  # use your own cookie here
    }
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def getlist(ulist, html):
    soup = bs(html, "html.parser")
    hotlist = soup("div", "HotList-list")        # find_all by tag name and class
    # iterate over the direct child tags only (skips bare whitespace strings),
    # one child per hot-list entry
    for sect in hotlist[0].find_all(recursive=False):
        if sect.a.p:
            pp = sect.a.p.string
        else:
            pp = "No summary available for this question"
        ulist.append([sect.a.h2.string, pp, sect.a.attrs['href']])

def printlist(ulist, num):
    top = 1
    for i in ulist[0:num]:
        print("top{}".format(top))
        top += 1
        print("Question: {}".format(i[0]))
        print("Top-answer summary:\n{}".format(i[1]))
        print("Link:\n{}".format(i[2]))

if __name__ == "__main__":
    url = "https://www.zhihu.com/hot"
    ulist = []
    print("How many top entries of the Zhihu hot list do you want to see?")
    num = int(input())
    html = gethtml(url)
    getlist(ulist, html)
    printlist(ulist, num)
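
The lookup soup("div", "HotList-list") matches by class name and will break whenever Zhihu renames its classes. The same extraction can also be written with BeautifulSoup's CSS-selector API, select(); a sketch of that alternative, assuming the same HotList-list / a / h2 / p structure as above:

from bs4 import BeautifulSoup

def getlist_css(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    # select() takes a CSS selector; entries without an <h2> title are skipped
    for a in soup.select("div.HotList-list a"):
        h2 = a.find("h2")
        if h2 is None:
            continue
        p = a.find("p")
        summary = p.string if p else "No summary available for this question"
        ulist.append([h2.string, summary, a.get("href")])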