
A small Python web-scraper example
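The script below is a Python 2 example that crawls the hospital listings on eztcn.com, follows each hospital through to its doctor pages, and appends every doctor's profile to a local text file as one JSON object per line. It uses urllib2 for HTTP and BeautifulSoup 3 for HTML parsing.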

# -*- coding: utf-8 -*-
import urllib2
import re
import BeautifulSoup  # BeautifulSoup 3: the parser class lives in the top-level module
import json

def getPage(url):  # fetch a page, retrying until the request succeeds
    while True:  # the loop guards against transient network failures
        try:
            print url
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            return response.read()
        except Exception as e:
            if hasattr(e, "reason"):
                print u"connection failed, reason:", e.reason

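Because getPage retries forever, a dead link or a site ban will hang the crawler. A bounded-retry variant with a growing delay is one way around that. This is a sketch, and the retry count and delay are arbitrary choices, not anything from the original post:

import time

def getPageBounded(url, retries=3, delay=2):  # hypothetical alternative to getPage
    for attempt in range(retries):
        try:
            return urllib2.urlopen(urllib2.Request(url)).read()
        except Exception as e:
            if hasattr(e, "reason"):
                print u"connection failed, reason:", e.reason
            time.sleep(delay * (attempt + 1))  # back off a little more on each failure
    return None  # the caller must handle a permanent failure
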
def getMaxPageNum(page):  # read the largest page number from the hospital/department/doctor pager
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('span', attrs={'class': 'contcss'})
    # keep the part of the pager text before "/" and pull out its first number
    return int(re.findall(r"\d+", res[0].text.split("/")[0])[0])

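For instance, if the pager span read something like 共15页/225条 ("15 pages / 225 records"; the exact wording is a guess, since the markup is not shown in the post), split("/")[0] would keep 共15页 and the regex would pull out 15.
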
def dumpDocInfo(url):  # parse one doctor page and append the record to a local file
    page = getPage(url)
    soup = BeautifulSoup.BeautifulSoup(page)
    if len(soup.findAll(attrs={'class': 'map'})) == 0:  # no breadcrumb block, so no profile to parse
        return
    xingming = soup.findAll(attrs={'class': 'map'})[0].findAll('a')[2].text             # name
    zhicheng = soup.findAll(attrs={'class': 'regdoc_name'})[0].findAll('span')[0].text  # professional title
    yiyuan = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[0][3:]  # hospital; [3:] drops the 3-character label
    keshi = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[1]       # department
    jianjie = soup.findAll(attrs={'class': 'regdoc_msg'})[0].text[3:]    # biography
    shanchang = soup.findAll(attrs={'class': 'regdoc_msg'})[1].text[3:]  # specialties

    info = {u'姓名':xingming, u'职称':zhicheng, u'医院':yiyuan, u'科室':keshi, u'简介':jianjie, u'擅长':shanchang}
    outfile = open("C:\\Users\\Administrator\\Desktop\\doctors.txt", "ab")  # opened after the guard so an early return does not leak the handle
    outfile.write(json.dumps(info, ensure_ascii=False).encode('utf-8') + '\n')
    outfile.close()

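Each call appends one UTF-8 JSON line to doctors.txt; with ensure_ascii=False the Chinese keys are written literally, so a record looks roughly like this (values invented for illustration, and the key order may vary):

{"姓名": "张三", "职称": "主任医师", "医院": "某某医院", "科室": "内科", "简介": "...", "擅长": "..."}
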
def dumpHisInfo(url):  # identical to dumpDocInfo except for the output file (a merged version is sketched below)
    page = getPage(url)
    soup = BeautifulSoup.BeautifulSoup(page)
    if len(soup.findAll(attrs={'class': 'map'})) == 0:
        return
    xingming = soup.findAll(attrs={'class': 'map'})[0].findAll('a')[2].text
    zhicheng = soup.findAll(attrs={'class': 'regdoc_name'})[0].findAll('span')[0].text
    yiyuan = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[0][3:]
    keshi = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[1]
    jianjie = soup.findAll(attrs={'class': 'regdoc_msg'})[0].text[3:]
    shanchang = soup.findAll(attrs={'class': 'regdoc_msg'})[1].text[3:]

    info = {u'姓名':xingming, u'职称':zhicheng, u'医院':yiyuan, u'科室':keshi, u'简介':jianjie, u'擅长':shanchang}
    outfile = open("C:\\Users\\Administrator\\Desktop\\hospital.txt", "ab")
    outfile.write(json.dumps(info, ensure_ascii=False).encode('utf-8') + '\n')
    outfile.close()

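dumpHisInfo is a line-for-line copy of dumpDocInfo except for the output path, so the two could collapse into a single helper. A sketch of that refactor (dumpInfo is a new name, not in the original):

def dumpInfo(url, path):  # shared implementation behind dumpDocInfo/dumpHisInfo
    page = getPage(url)
    soup = BeautifulSoup.BeautifulSoup(page)
    if len(soup.findAll(attrs={'class': 'map'})) == 0:
        return
    regdoc = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')
    msgs = soup.findAll(attrs={'class': 'regdoc_msg'})
    info = {
        u'姓名': soup.findAll(attrs={'class': 'map'})[0].findAll('a')[2].text,
        u'职称': soup.findAll(attrs={'class': 'regdoc_name'})[0].findAll('span')[0].text,
        u'医院': regdoc[0][3:],
        u'科室': regdoc[1],
        u'简介': msgs[0].text[3:],
        u'擅长': msgs[1].text[3:],
    }
    outfile = open(path, "ab")
    outfile.write(json.dumps(info, ensure_ascii=False).encode('utf-8') + '\n')
    outfile.close()

def dumpDocInfo2(url):  # thin wrappers over the shared helper
    dumpInfo(url, "C:\\Users\\Administrator\\Desktop\\doctors.txt")

def dumpHisInfo2(url):
    dumpInfo(url, "C:\\Users\\Administrator\\Desktop\\hospital.txt")
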
def findDocHref(page):  # follow every doctor-profile link on a listing page
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('div', attrs={'class': 'yy_doctor_head'})
    for v in res:
        # the profile URL is embedded in the div's onclick handler; [15:-1] strips the JavaScript around it
        dumpDocInfo('http://www.eztcn.com' + v['onclick'].encode("utf-8")[15:-1])

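The [15:-1] slice suggests the onclick value is something like location.href='/Home/...' (exactly 15 characters before the path, plus a closing quote), but that is an inference from the slice indices, not something shown in the post. Pulling the quoted path out with a regex is less brittle if the handler ever changes; a hypothetical variant:

def findDocHrefRobust(page):  # sketch; findDocHrefRobust is not in the original
    soup = BeautifulSoup.BeautifulSoup(page)
    for v in soup.findAll('div', attrs={'class': 'yy_doctor_head'}):
        m = re.search(r"'(/[^']*)'", v['onclick'])  # grab the quoted path regardless of the JavaScript around it
        if m:
            dumpDocInfo('http://www.eztcn.com' + m.group(1).encode('utf-8'))
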
def findHisHref(page):  # collect the hospital links on one listing page
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('a')
    arr = []
    for v in res[9::3]:  # skip the leading navigation links, then every third anchor is a hospital link
        if '#' in str(v):  # a '#' link marks the end of this page's hospital list
            break
        arr.append("http://www.eztcn.com" + v['href'].encode("utf-8"))
    return arr

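Slicing res[9::3] bakes the page's exact link layout into the code (presumably nine navigation links, then every third anchor). Filtering on the href itself survives layout changes; the '/Home/' prefix below is a guess based on the listing URL, so check it against the real markup:

def findHisHrefRobust(page):  # sketch; findHisHrefRobust is not in the original
    soup = BeautifulSoup.BeautifulSoup(page)
    arr = []
    for a in soup.findAll('a'):
        href = a.get('href', '')
        if href.startswith('/Home/') and '#' not in href:  # assumed hospital-detail prefix
            arr.append('http://www.eztcn.com' + href.encode('utf-8'))
    return arr
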
# the main program starts here
hisMaxPageNum = getMaxPageNum(getPage('http://www.eztcn.com/Home/Find/findHos/p/1.html#selAnchor'))
print "hisMaxPageNum:" + str(hisMaxPageNum)
# the hospital list ran to 15 pages at the time of writing
for i in range(1, hisMaxPageNum + 1):
    url = 'http://www.eztcn.com/Home/Find/findHos/p/' + str(i) + '.html#selAnchor'
    page = getPage(url)
    hisHref = findHisHref(page)
    for hisUrl in hisHref:
        page = getPage(hisUrl)
        docMaxPageNum = getMaxPageNum(page)
        dumpHisInfo(hisUrl)
        print "docMaxPageNum:" + str(docMaxPageNum)
        for j in range(1, docMaxPageNum + 1):  # j, not i: the original reused i here and shadowed the outer loop counter
            url2 = hisUrl[:-5] + '/cd/2016-04-29/p/' + str(j) + '.html#headAnchor'
            page2 = getPage(url2)
            findDocHref(page2)
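
One last note: the crawler fires requests back-to-back. Adding a short pause between fetches, for example time.sleep(1) at the top of getPage, keeps the load on the site reasonable and makes a ban less likely.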