您的位置:首页 > 理论基础 > 计算机网络

网络爬虫-python-爬取天涯求职贴

2016-04-11 15:23 369 查看
使用urllib请求页面,使用BeautifulSoup解析页面,使用xlwt3写入Excel

import urllib.request
from bs4 import BeautifulSoup
import time
import xlwt3
from xlrd import open_workbook
wExcel=xlwt3.Workbook()
sheet1=wExcel.add_sheet('my',cell_overwrite_ok=True)
num=0
fo=open(r'contents.txt','a',encoding='utf-8')
def getconten(url):
opener = urllib.request.build_opener()
try:
content = opener.open(url).read()
content2=content.decode('utf-8')
except:
try:
content = opener.open(url).read()
content2=content.decode('gbk')
except:
print('decode fail!')
return None
return None
return content2
def getdetail(url):
opener = urllib.request.build_opener()
con=getconten(url)
##    print(url)
if con:
soup=BeautifulSoup(con)
job=soup.find('div','bbs-content clearfix')
if job:
jobdetail=job.get_text()
return jobdetail
else:
return None

def getonepage(url):
global num
opener = urllib.request.build_opener()
content=getconten(url)
if content:
soup=BeautifulSoup(content)
for tr in soup.find_all('tr','bg'):
oneitem=[]
j=0
detailurl=tr.td.a['href']
detailurl='http://bbs.tianya.cn'+detailurl
##        print(detailurl)
detailcon=getdetail(detailurl)
##        print(detailcon)
for item in tr.strings:
item=item.strip()
if item:
oneitem.append(item)
sheet1.write(num,j,item)
j=j+1
##            print(item.strip())
sheet1.write(num,j,detailcon)
num=num+1
##        print('one is ok')

if __name__=='__main__':
mainpage='http://bbs.tianya.cn/list.jsp?item=763&sub=2'
getonepage(mainpage)
wExcel.save('res0.xls')
i=0
soup=BeautifulSoup(getconten(mainpage))
currentpage=soup.find('div','links').a.find_next_sibling('a')
currentpage='http://bbs.tianya.cn'+currentpage['href']
nextpage=currentpage
while i<30:
print(nextpage)
getonepage(nextpage)
print('one page finished!')
con=getconten(nextpage)
if con:
soup=BeautifulSoup(con)
currentpage=soup.find('div','links').a.find_next_sibling('a').find_next_sibling('a')
nextpage='http://bbs.tianya.cn'+currentpage['href']
i=i+1
else:
break
wExcel.save('res.xls')
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: