
Python (web scraping): Scraping the Douban reviews of Peace Hotel (《和平饭店》)

2018-02-02 17:13

A web-scraping example

Result: a word cloud of the most frequent words in the reviews.

Code:

#! /usr/local/bin/python3
# -*- coding: utf-8 -*-

'''
Author: elson
Desc: Douban reviews of the TV drama Peace Hotel (《和平饭店》)
'''
import os
import re

import jieba
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import requests
from lxml import etree
from wordcloud import WordCloud

def get_comment_detail(url):
    # Fetch a single review page and return the full review text.
    response = requests.get(url)
    resHtml = response.text

    # print(resHtml)
    # with open('./html/peace_hotel_detail_comment.html', 'w') as f:
    #     f.write(response.text)

    html = etree.HTML(resHtml)
    detail = html.xpath('.//div[@class="review-content clearfix"]')[0].xpath('string(.)')
    return detail

def request_page(url):
    # Fetch one page of the review list; return (review texts, href of the next page).
    comment_list = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    # Pass the headers as a keyword argument; the original positional call
    # would send them as query parameters rather than as request headers.
    response = requests.get(url, headers=headers)
    resHtml = response.text
    # print(resHtml)

    # with open('./html/peace_hotel.html', 'w') as f:
    #     f.write(response.text)

    html = etree.HTML(resHtml)

    # Each review title on the list page links to its full text
    result = html.xpath('//div[@class="main-bd"]/h2/a')
    for site in result:
        detail_url = site.attrib['href']
        print(detail_url)
        detail = get_comment_detail(detail_url)
        comment_list.append(detail)

    print('request_page....')
    # Grab the "next page" link, if any (e.g. "?start=20")
    page = html.xpath('//span[@class="next"]')
    if page and page[0].xpath('./a'):
        next_start = page[0].xpath('./a')[0].attrib['href']
    else:
        next_start = ''
    return (comment_list, next_start)

def get_comment_lists():
    # Collect reviews from every listing page until there is no "next" link.
    comment_list = []

    url = "https://movie.douban.com/subject/26828285/reviews"
    result = request_page(url)
    comment_list.extend(result[0])
    while result[1]:
        result = request_page(url + result[1])
        comment_list.extend(result[0])

    return comment_list

def main():
    # 1. Data acquisition
    comment_list = get_comment_lists()
    # print(comment_list)

    # 2. Data cleaning
    # Join the list of reviews into one string
    comments = ''
    for k in range(len(comment_list)):
        comments = comments + (str(comment_list[k])).strip()

    # Keep only Chinese characters; this drops punctuation, digits and Latin text
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # 3. Data analysis
    # Segment the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Switch the working directory to the config folder,
    # where stop_words.txt and the font file live
    print(os.getcwd())
    os.chdir('config')
    print(os.getcwd())

    # Remove stop words (quoting=3 is csv.QUOTE_NONE: quote characters are read as plain text)
    stopwords = pd.read_csv("stop_words.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Count word frequencies (dict-style renaming in .agg() is no longer supported by pandas)
    words_stat = words_df.groupby(by=['segment'])['segment'].agg(numpy.size).to_frame(name='计数')
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    print(words_stat.head())

    # Render a word cloud from the 300 most frequent words
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(300).values}
    print(word_frequence)

    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    main()
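
The detail pages are fetched back to back, with no delay and without the request headers used for the listing pages. Below is a minimal sketch of a more defensive variant of get_comment_detail; the function name, the one-second pause, and the timeout are my own additions and not part of the original article.

import time

import requests
from lxml import etree

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

def get_comment_detail_safe(url):
    # Hypothetical, more defensive variant of the article's get_comment_detail.
    time.sleep(1)                                          # short pause between requests
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    nodes = html.xpath('.//div[@class="review-content clearfix"]')
    return nodes[0].xpath('string(.)') if nodes else ''    # empty string when no review body is found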


Sample output of print(words_stat.head()) (top word frequencies):

    segment    计数
    饭店       306
    和平       270
               248
    一个       212
               207
Resource download link: stop words (stop_words.txt)
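
Since main() calls os.chdir('config') before loading stop_words.txt and the simhei.ttf font, both files are expected inside a config/ folder next to the script. A small pre-flight check, assuming that layout (the check itself is my own sketch, not from the article):

import os

# File names are taken from the code above; the config/ layout is inferred
# from the os.chdir('config') call in main().
for name in ('stop_words.txt', 'simhei.ttf'):
    path = os.path.join('config', name)
    print(path, '-> OK' if os.path.exists(path) else '-> MISSING')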