您的位置：首页 > 编程语言 > Python开发

Python2.7 爬虫实践：豆瓣电影影评分析

2017-08-29 23:20 691 查看

reference from ：hang

segmentfault.com/a/1190000010473819

本人先看到了以上这篇文章，觉得挺好玩，所以就跟着原作者的思路再撸一遍代码

后来发现了几个问题

本人用的是 Python 2.7，而原作者用的是 Python 3

所以其中的一些 function/属性在 Python 2.7 下并不支持

所以做了一下修改

安装必要的lib 之后会有文章进行介绍

好了就先上代码吧

__author__ = 'Helen Huang'

# encoding=utf8

import sys

import warnings

# Suppress every warning for the whole run of the script.
warnings.filterwarnings("ignore")

# Python 2 only: reloading sys makes sys.setdefaultencoding callable again,
# which is then used to switch the implicit str <-> unicode codec to UTF-8.
# The str(...) conversion of scraped comment text below relies on this.
reload(sys)

sys.setdefaultencoding('utf8')

# Python 2 location of urlopen; used to download the Douban pages.
from urllib2 import urlopen

# HTML parsing of the downloaded pages.
import bs4

import time

import re

# Chinese word segmentation.
import jieba

import numpy

import codecs

import matplotlib.pyplot as plt

import matplotlib

from wordcloud import WordCloud

import pandas as pd

# Default figure size applied to every matplotlib figure created later.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

def getMovieId(title):
    """Return the Douban subject id of a now-playing movie, looked up by title.

    Downloads the Hangzhou "now playing" page, walks every
    ``<li class="list-item">`` inside ``<div id="nowplaying">`` and compares
    its ``data-title`` attribute (surrounding whitespace removed) with
    ``title``.

    Args:
        title: Movie title to look for (the caller passes a unicode string).

    Returns:
        The ``data-subject`` attribute of the first matching entry, or
        ``None`` when the page has no ``nowplaying`` section or no entry
        carries that title.
    """
    resp = urlopen('https://movie.douban.com/nowplaying/hangzhou/')
    try:
        html_data = resp.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        resp.close()

    soup = bs4.BeautifulSoup(html_data, "html.parser")
    content = soup.find_all('div', id='nowplaying')
    if not content:
        # Page layout changed or an error page was served: nothing to search.
        return None

    for item in content[0].find_all('li', class_='list-item'):
        name = item.get('data-title')
        if name is None:
            continue
        # strip() with no argument removes whitespace; the previous
        # strip('') removed nothing at all.
        if name.strip() == title:
            return item.get('data-subject')

    return None

def getCommentsById(movieId, pageNum):
    """Download the short comments of a movie and return them as one string.

    Requests ``pageNum`` consecutive comment pages (20 comments per page,
    starting at offset 0) from
    ``https://movie.douban.com/subject/<movieId>/comments`` and concatenates
    the text of the first ``<p>`` of every ``<div class="comment">``.
    Each requested URL and each collected comment is printed as it is
    processed.

    Args:
        movieId: Douban subject id as a string (it is concatenated into the
            URL as-is).
        pageNum: Number of pages to fetch.

    Returns:
        ``False`` when ``pageNum`` is not positive; otherwise a single
        string holding the whitespace-stripped comments of all fetched
        pages, in page order. Each comment is converted with ``str()`` so
        the result is a plain ``str`` rather than unicode.
    """
    if pageNum <= 0:
        return False

    pieces = []
    for page in range(pageNum):
        start = page * 20
        reqUrl = ('https://movie.douban.com/subject/' + movieId + '/comments'
                  + '?' + 'start=' + str(start) + '&limit=20')
        print(reqUrl)

        resp = urlopen(reqUrl)
        try:
            html = resp.read().decode('utf-8')
        finally:
            # Release the connection even when reading or decoding fails.
            resp.close()

        soup = bs4.BeautifulSoup(html, "html.parser")
        comContent = soup.find_all('div', id='comments')
        if not comContent:
            # No comment container on this page; skip it instead of crashing.
            continue

        for comment in comContent[0].find_all('div', class_='comment'):
            paragraphs = comment.find_all('p')
            if not paragraphs:
                continue
            c = paragraphs[0].string
            # .string is None when the <p> holds more than one child node.
            if c is None:
                continue
            pieces.append(str(c).strip())
            print(c)

    # Join once at the end so comments from every page are kept.
    return ''.join(pieces)

if __name__ == '__main__':
    # Script entry point: fetch the short comments of one now-playing movie,
    # keep only the Chinese characters, segment them with jieba, drop stop
    # words, count word frequencies and show the 100 most frequent words as
    # a word cloud.

    print('start ....')

    title = u'杀破狼·贪狼'
    movieId = getMovieId(title)
    print('movie id is:')
    print(movieId)
    if movieId is None:
        # Without an id the comment URL cannot be built.
        sys.exit('movie not found in the now-playing list')

    comments = getCommentsById(movieId, 10)
    if not comments:
        # False (invalid page count) or an empty string: nothing to analyse.
        sys.exit('no comments fetched')

    comments = comments.replace(' ', '')
    print(comments)

    # Remove punctuation by keeping only runs of CJK characters. The pattern
    # must be a unicode pattern applied to unicode text, so byte strings are
    # decoded first; undecodable bytes are dropped because everything that
    # is not a CJK character is discarded anyway.
    if isinstance(comments, bytes):
        comments = comments.decode('utf-8', 'ignore')
    pattern = re.compile(u'[\u4e00-\u9fa5]+')
    cleaned_comments = u''.join(pattern.findall(comments))

    # Chinese word segmentation with jieba.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Drop stop words (a movie-specific stop word list would work better).
    # The file is read as a single 'stopword' column; the quoting option
    # used by the original Python 3 version is left out here.
    stopwords = pd.read_csv(r'D:\python\stopwords_copy.txt',
                            names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count word frequencies and sort by count, most frequent first.
    # NOTE(review): dict-renaming in agg() and sort_index(by=...) are kept
    # for the old pandas this script targets; newer pandas releases need
    # sort_values(by=...) and a different aggregation spelling - confirm
    # against the installed version.
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_index(by=["计数"], ascending=False)
    print(words_stat.head())

    # Render the 100 most frequent words as a word cloud.
    word_frequence = {x[0]: x[1] for x in words_stat.head(100).values}
    wordcloud = WordCloud(font_path=r"D:\python\sourcehansansi.ttf",
                          background_color='white',
                          max_font_size=80).generate_from_frequencies(word_frequence)

    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

    print('end ...')

#我发现了的问题

#表情词没有过滤

#如果一句话里面，用户重复了多遍的话，词频就有影响了

#stopwords 应该更加领域化

#wordcloud还可以设置多种样式

#以下是我的结果图

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航