您的位置:首页 > 编程语言 > Python开发

python爬虫(豆瓣新片榜)

2015-03-23 16:22 190 查看
#!/usr/bin/env python
# coding: utf-8

import re
import urllib2

class doubanTop10(object):
    """Scraper for the Douban new-movie chart page.

    Downloads the chart HTML, pulls movie titles out of anchors carrying
    ``class="nbg"`` and accumulates them in ``datas`` as
    ``"Top<N> <title>"`` strings, where ``N`` is a running counter.
    """

    def __init__(self):
        # Default page to crawl.
        self.url = 'http://movie.douban.com/chart'
        # Collected "Top<N> <title>" entries, in page order.
        self.datas = []
        # Rank assigned to the next accepted title; keeps counting across
        # repeated find_title() calls.
        self._top_num = 1
        print("正在爬取豆瓣新片榜...\n")

    def get_data(self, url=None):
        """Fetch *url* and return its body decoded as UTF-8.

        Args:
            url: Address to download. Falls back to ``self.url`` when
                omitted or empty.

        Returns:
            The decoded page text, or ``None`` when the request fails with
            ``urllib2.URLError`` (a diagnostic is printed in that case).
        """
        # Honour the caller-supplied URL instead of silently replacing it.
        target = url or self.url
        try:
            response = urllib2.urlopen(target)
            try:
                raw = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError as e:
            # HTTPError (a URLError subclass) carries ``code``; plain
            # URLError only carries ``reason``.
            if hasattr(e, 'code'):
                print("The server couldn't fulfill the request.")
                print("Error code: %s" % e.code)
            elif hasattr(e, 'reason'):
                print("We failed to reach a server. Please check your url and read the Reason.")
                print("Reason: %s" % e.reason)
            # Explicit failure value; previously this path hit an unbound
            # local and raised UnboundLocalError.
            return None
        return raw.decode('utf-8')

    def find_title(self, page_data):
        """Extract movie titles from *page_data* and append them to ``datas``.

        Args:
            page_data: HTML text of the chart page. ``None`` or an empty
                string is accepted and leaves ``datas`` untouched.
        """
        if not page_data:
            return
        movie_items = re.findall(r'<a.*?class="nbg".*?title="(.*?)">', page_data, re.S)
        for item in movie_items:
            # Titles containing a space are skipped.
            # NOTE(review): the reason for this filter is not evident from
            # this file -- confirm before changing it.
            if " " in item:
                continue
            self.datas.append("Top" + str(self._top_num) + " " + item)
            self._top_num += 1

    def start_spider(self):
        """Download ``self.url`` and collect its titles into ``datas``."""
        my_page = self.get_data(self.url)
        # Skip parsing when the download failed.
        if my_page is not None:
            self.find_title(my_page)

def main():
    """Run the spider once, print every collected title, then a completion notice."""
    crawler = doubanTop10()
    crawler.start_spider()
    titles = crawler.datas
    for title in titles:
        # A single parenthesised argument prints the same under Python 2's
        # print statement.
        print(title)
    print("\n爬取完成!")

# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
内容来自用户分享和网络整理,不保证内容的准确性。如有侵权内容,可联系管理员处理(点击这里给我发消息)。
标签: