您的位置:首页 > 编程语言 > Python开发

谁能有我简单之Python小爬虫

2016-08-27 18:10 337 查看
跟着教程写的一个小爬虫(Python 2),用来抓取豆瓣电影首页上当前热门电影的标题、评分、导演和主演信息。

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import os
from HTMLParser import HTMLParser

# HTML parser that extracts movie entries from the page markup.
class MovieParser(HTMLParser):
    """Collect movie metadata from ``<li>`` tags that carry ``data-*`` attributes.

    Every ``<li>`` start tag with a non-empty ``data-title`` attribute is
    turned into a dict with the keys ``title``, ``rate``, ``director`` and
    ``actors`` (read from the matching ``data-*`` attributes; ``None`` when
    an attribute is absent).  The dict is appended to ``self.movies`` and
    also printed as one ``title|rate|director|actors`` line.
    """

    def __init__(self):
        # Explicit base-class call, as in the original script.
        HTMLParser.__init__(self)
        # Movie dicts, in document order.
        self.movies = []

    @staticmethod
    def _attr(attrlist, attrname):
        """Return the value of the first attribute named *attrname*, else None.

        *attrlist* is the list of ``(name, value)`` pairs that HTMLParser
        passes to ``handle_starttag``.
        """
        for name, value in attrlist:
            if name == attrname:
                return value
        return None

    def handle_starttag(self, tag, attrs):
        """Record and print a movie when *tag* is an ``<li>`` with a title."""
        # Only list items with a truthy data-title describe a movie.
        if tag != 'li' or not self._attr(attrs, 'data-title'):
            return

        # Store the movie info in a dict, remember it, then print it.
        movie = {
            'title': self._attr(attrs, 'data-title'),
            'rate': self._attr(attrs, 'data-rate'),
            'director': self._attr(attrs, 'data-director'),
            'actors': self._attr(attrs, 'data-actors'),
        }
        self.movies.append(movie)
        print('%(title)s|%(rate)s|%(director)s|%(actors)s' % movie)

def nowplaying_movies(url):
    """Download *url* and return the movies found in its HTML.

    The page is requested with a browser-like ``User-Agent`` header, its
    body is fed to a fresh ``MovieParser``, and the parser's ``movies``
    list is returned.  The parser prints each movie as it is found.

    Exceptions raised by ``urllib2.urlopen``, by reading the response, or
    by the parser propagate to the caller.
    """
    # Imitate a browser when requesting the page.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # perform the request
    parser = MovieParser()
    try:
        parser.feed(response.read())
    finally:
        # Close the response even if reading or parsing fails.
        response.close()
    return parser.movies

# Script entry point: the Douban movie page to scrape.
if __name__ == '__main__':
    url = 'https://movie.douban.com/'
    # Each matching movie is printed by the parser while the page is parsed;
    # the returned list is kept for any further processing.
    movies = nowplaying_movies(url)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: