您的位置:首页 > 编程语言 > Python开发

糗事百科爬虫改进

2015-11-07 15:12 543 查看
无事,抓糗事!

看到一个哥们的代码,无事拿来改改,抓糗事百科文字内容

#!/usr/bin/env python
'''
for qiushibaike.com
'''

import  urllib2
# import  urllib
import  re
import  thread
import  time

class Spider_Model():
    """Interactive reader for the text posts on qiushibaike.com.

    A background thread (``LoadPage``) downloads listing pages and keeps up
    to two of them buffered in ``self.pages``; the foreground loop
    (``start``) pops buffered pages and prints them one post at a time.

    Attributes:
        page: number of the next listing page to download (starts at 1).
        pages: buffer of downloaded pages, each a list of post texts.
        enable: run flag; both loops stop once it becomes False.
    """

    # Seconds before a stalled HTTP request is abandoned.
    _FETCH_TIMEOUT = 10
    # Seconds to pause after a failed download before trying again.
    _RETRY_DELAY = 1
    # Seconds the display loop sleeps while the buffer is empty.
    _POLL_DELAY = 0.1

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def GetPage(self, page):
        """Download one listing page and return its posts.

        Args:
            page: page number to fetch; a string or anything ``str()`` can
                convert (for example an int).

        Returns:
            A list of unicode strings, one per matched post. Newlines in
            the raw HTML are removed and each ``<br/>`` becomes a newline.

        Raises:
            Whatever ``urllib2.urlopen`` raises on network failure, and
            ``UnicodeDecodeError`` if the response is not valid UTF-8.
        """
        myurl = r'http://www.qiushibaike.com/textnew/page/' + str(page)
        user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'
        headers = {'User-Agent': user_agent}

        req = urllib2.Request(myurl, headers=headers)
        myres = urllib2.urlopen(req, timeout=self._FETCH_TIMEOUT)
        try:
            mypage = myres.read()
        finally:
            # Release the connection even when read() fails.
            myres.close()

        unicodepage = mypage.decode('utf-8')
        myItems = re.findall(
            '<div.*?class="content">(.*?)<!--.*?-->.*?</div>',
            unicodepage,
            re.S,
        )
        # Strip the markup's own line breaks first, then turn the explicit
        # <br/> tags into real newlines.
        return [
            item.replace('\n', '').replace(r'<br/>', '\n')
            for item in myItems
        ]

    def LoadPage(self):
        """Keep ``self.pages`` topped up while ``self.enable`` is True.

        Intended to run in the background thread started by ``start``.
        Downloads the next page whenever fewer than two pages are buffered;
        otherwise sleeps for a second before checking again.
        """
        while self.enable:
            if len(self.pages) >= 2:
                time.sleep(1)
                continue

            try:
                mypage = self.GetPage(str(self.page))
            except Exception:
                # Only ordinary errors are caught, so KeyboardInterrupt and
                # SystemExit still propagate.
                print('can not connected to the url.')
                # Back off instead of retrying in a tight loop.
                time.sleep(self._RETRY_DELAY)
                continue

            self.page += 1
            self.pages.append(mypage)

    def ShowPage(self, nowPage, page):
        """Print one page of posts, pausing for input after each post.

        Args:
            nowPage: list of post texts to display.
            page: page number shown in the banner.

        Typing ``quit`` at a pause clears ``self.enable`` and stops the
        display of the current page.
        """
        print('\n\n############################ Page %d #################################\n\n' % page)

        for item in nowPage:
            print(item)

            myinput = raw_input()
            if myinput == 'quit':
                self.enable = False
                break

    def start(self):
        """Start the background loader and show pages until the user quits."""
        page = self.page
        self.enable = True

        print(u'waiting..............')

        thread.start_new_thread(self.LoadPage, ())

        while self.enable:
            if not self.pages:
                # Nothing buffered yet: sleep briefly rather than spinning.
                time.sleep(self._POLL_DELAY)
                continue

            nowpage = self.pages.pop(0)
            self.ShowPage(nowpage, page)
            page += 1

if __name__ == '__main__':
    # Script entry point: show the banner, wait for the user to confirm,
    # then hand control to the spider's interactive loop.
    banner = u'''
-------------------------------------------------
xxxx
x
xxx
xxx
-------------------------------------------------
'''
    print(banner)

    print('Press any key,to continue......')
    raw_input()

    Spider_Model().start()


一切从简。不解释不说明,随便拍!

详细内容请参考:http://blog.csdn.net/pleasecallmewhy/article/details/8932310
内容来自用户分享和网络整理,不保证内容的准确性。如有侵权内容,可联系管理员处理(点击这里给我发消息)。
标签:  python 正则表达式