python爬虫的一个小例子
2016-05-04 13:52
537 查看
# -*- coding: utf-8 -*- import urllib2 import re import BeautifulSoup import json def getPage(url):#获取页面信息 flag = True while flag: #该while循环为了防止意外断网 try: print url request = urllib2.Request(url) response = urllib2.urlopen(request) page = response.read() flag = False return page except Exception, e: if hasattr(e,"reason"): print u"连接失败,错误原因",e.reason flag = True def getMaxPageNum(page):#获取 医院/科室/医生 页面数最大值 soup = BeautifulSoup.BeautifulSoup(page) #print page res = soup.findAll('span',attrs={'class':'contcss'}) #print res return int(re.findall(r"\d+",res[0].text.split("/")[0])[0].encode("utf-8")) def dumpDocInfo(url):#将医生数据写到本地 page = getPage(url) outfile = open("C:\\Users\\Administrator\\Desktop\\doctors.txt","ab") soup = BeautifulSoup.BeautifulSoup(page) if len(soup.findAll(attrs={'class':'map'})) == 0: return xingming = soup.findAll(attrs={'class':'map'})[0].findAll('a')[2].text zhicheng = soup.findAll(attrs={'class':'regdoc_name'})[0].findAll('span')[0].text yiyuan = soup.findAll(attrs={'class':'regdoc_commom'})[1].text.split(u'科室:')[0][3:] keshi = soup.findAll(attrs={'class':'regdoc_commom'})[1].text.split(u'科室:')[1] jianjie = soup.findAll(attrs={'class':'regdoc_msg'})[0].text[3:] shanchang = soup.findAll(attrs={'class':'regdoc_msg'})[1].text[3:] info = {u'姓名':xingming,u'职称':zhicheng,u'医院':yiyuan,u'科室':keshi,u'简介':jianjie,u'擅长':shanchang} info = json.dumps(info, ensure_ascii=False) outfile.write(info.encode('utf-8')+'\n') def dumpHisInfo(url):#将医生数据写到本地 page = getPage(url) outfile = open("C:\\Users\\Administrator\\Desktop\\hospital.txt","ab") soup = BeautifulSoup.BeautifulSoup(page) if len(soup.findAll(attrs={'class':'map'})) == 0: return xingming = soup.findAll(attrs={'class':'map'})[0].findAll('a')[2].text zhicheng = soup.findAll(attrs={'class':'regdoc_name'})[0].findAll('span')[0].text yiyuan = soup.findAll(attrs={'class':'regdoc_commom'})[1].text.split(u'科室:')[0][3:] keshi = soup.findAll(attrs={'class':'regdoc_commom'})[1].text.split(u'科室:')[1] jianjie = 
soup.findAll(attrs={'class':'regdoc_msg'})[0].text[3:] shanchang = soup.findAll(attrs={'class':'regdoc_msg'})[1].text[3:] info = {u'姓名':xingming,u'职称':zhicheng,u'医院':yiyuan,u'科室':keshi,u'简介':jianjie,u'擅长':shanchang} info = json.dumps(info, ensure_ascii=False) outfile.write(info.encode('utf-8')+'\n') def findDocHref(page):#获取所有医生信息的超链接 soup = BeautifulSoup.BeautifulSoup(page) res = soup.findAll('div',attrs={'class':'yy_doctor_head'}) for v in res: dumpDocInfo('http://www.eztcn.com'+v['onclick'].encode("utf-8")[15:-1]) def findHisHref(page):#查找所有医院的超链接 soup = BeautifulSoup.BeautifulSoup(page) res = soup.findAll('a') arr = [] for v in res[9::3]: if '#' in str(v): #判断一页是否结束 break arr.append("http://www.eztcn.com"+(v['href'].encode("utf-8"))) return arr #主程序正式开始 hisMaxPageNum = getMaxPageNum(getPage('http://www.eztcn.com/Home/Find/findHos/p/1.html#selAnchor')) print "hisMaxPageNum:"+str(hisMaxPageNum) #医院页面数为15页 for i in range(1,hisMaxPageNum+1): url = 'http://www.eztcn.com/Home/Find/findHos/p/'+str(i)+'.html#selAnchor' page = getPage(url) hisHref = findHisHref(page) for hisUrl in hisHref: page = getPage(hisUrl) docMaxPageNum = getMaxPageNum(page) dumpHisInfo(hisUrl) print "docMaxPageNum:"+str(docMaxPageNum) for i in range(1,docMaxPageNum+1): url2 = hisUrl[:-5]+'/cd/2016-04-29/p/'+str(i)+'.html#headAnchor' page2 = getPage(url2) findDocHref(page2)
相关文章推荐
- Python虚拟环境Virtualenv
- [python]type和isinstance的比较
- python爬虫抓取目标网页链接
- 利用python进行识别相似图片(二)
- 利用python进行识别相似图片(一)
- Python中time模块详解
- Python 使用super从父类得到帮助
- 找到 Python 的 site-packages 目录(Python site-packages directory)
- Windows 下 Python easy_install 的安装
- python生成二维码
- Python——time模块&datetime模块
- python dict 实现 switch 功能
- python 递归实现 汉诺塔
- python 迭代器(iterator)和生成器(generator)
- Python 对 验证码的使用
- (好文转发)关于Python脚本开头两行的:#!/usr/bin/python和# -*- coding: utf-8 -*-的作用 – 指定文件编码类型
- python的IDE:PyCharm
- python中sort sorted() reverse() reversed() 的区别
- python 类修饰器
- 【Python】批量创建线程