您的位置:首页 > 编程语言 > Python开发

Python 简单爬虫 豆瓣热门影评

2016-09-09 18:40 417 查看
第一次写 Python,仅作备忘之用;写得不完善,让大家见笑了。

# -*- coding:utf-8 -*-

import urllib
import urllib2
import re
import xlwt

# Scrape Douban's "best reviews" listing pages and store one spreadsheet row
# per review: title, reviewer, movie, star rating, review time and body.

# Listing URL prefix; the numeric ``start`` offset is appended per page.
BASE_URL = 'https://movie.douban.com/review/best/?start='
PAGE_COUNT = 3          # number of listing pages to visit
PAGE_STEP = 20          # ``start`` offset increment between listing pages
REVIEWS_PER_PAGE = 10   # at most this many reviews are taken from each page
# Raw string: same value as the original 'd:\ test.xls' (backslash + space),
# but without relying on an invalid escape sequence.
# NOTE(review): the space was probably inserted to dodge the '\t' escape;
# confirm whether r'd:\test.xls' is the intended target.
OUTPUT_PATH = r'd:\ test.xls'

# Header cells, in column order (runtime strings, kept verbatim).
HEADERS = ('标题', '影评人', '电影', '星级', '时间', '内容')

# Compiled once at import time instead of once per review.
PATTERN_URL = re.compile(r'<h3 class="title">.*?<a href="(.*?)/"', re.S)
# One pattern per column, in the same order as HEADERS.
FIELD_PATTERNS = (
    re.compile(r'<span property="v:summary">(.*?)</span>', re.S),
    re.compile(r'<span property="v:reviewer">(.*?)</span>', re.S),
    re.compile(r'<a href="https://movie.douban.com/subject/.*?/">(.*?)</a>', re.S),
    re.compile(r'<span property="v:rating" class="main-title-hide">(.*?)</span>', re.S),
    re.compile(r'<p property="v:dtreviewed".*?">(.*?)</p>', re.S),
    re.compile(r'<div property="v:description" class="clearfix">(.*?)</div>', re.S),
)


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text*, or '' if none."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def extract_review_links(listing_html, limit=REVIEWS_PER_PAGE):
    """Return up to *limit* review URLs found in a listing page.

    The captured URL excludes the trailing slash, exactly as the pattern
    dictates. Fewer than *limit* links (or none) is not an error.
    """
    return PATTERN_URL.findall(listing_html)[:limit]


def extract_review_fields(review_html):
    """Return the six column values of a review page as a tuple.

    Order matches HEADERS. A field whose pattern does not match (for
    example a review without a star rating) yields an empty string
    instead of raising IndexError.
    """
    return tuple(_first_match(pattern, review_html)
                 for pattern in FIELD_PATTERNS)


def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8."""
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()


def main():
    """Scrape the configured listing pages and write OUTPUT_PATH."""
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('movie_review', cell_overwrite_ok=True)
    for column, header in enumerate(HEADERS):
        sheet.write(0, column, header)

    try:
        for page in range(PAGE_COUNT):
            listing_html = fetch_html(BASE_URL + str(page * PAGE_STEP))
            links = extract_review_links(listing_html)
            for offset, link in enumerate(links):
                # Row 0 holds the headers; each page owns a fixed band of
                # REVIEWS_PER_PAGE rows, as in the original layout.
                row = page * REVIEWS_PER_PAGE + offset + 1
                fields = extract_review_fields(fetch_html(link))
                for column, value in enumerate(fields):
                    sheet.write(row, column, value)
    finally:
        # Persist whatever was collected, even if a request failed midway.
        book.save(OUTPUT_PATH)


if __name__ == '__main__':
    main()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python 爬虫