
Python crawler: Douban movie comments and ratings

2017-09-21 19:16
Much of this is adapted from hang's blog post: https://segmentfault.com/a/1190000010473819

Ratings:

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 16:19:02 2017

@author: su
"""
from urllib import request
from bs4 import BeautifulSoup as bs

# Fetch the "now playing" page and collect the id/title/name of each movie.
resp = request.urlopen("https://movie.douban.com/nowplaying/hangzhou/")
html_data = resp.read().decode("UTF-8")
soup = bs(html_data, "html.parser")
nowplaying_movie = soup.find_all("div", id="nowplaying")
nowplaying_movie_list = nowplaying_movie[0].find_all("li", class_="list-item")

nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict["id"] = item["data-subject"]
    nowplaying_dict["title"] = item["data-title"]
    for tag_img_item in item.find_all("img"):
        nowplaying_dict["name"] = tag_img_item["alt"]
        nowplaying_list.append(nowplaying_dict)

# Walk the first three pages of short comments (20 per page) for the first
# movie and record the star rating of each comment.
for start in range(0, 60, 20):
    requrl = ("https://movie.douban.com/subject/" + nowplaying_list[0]["id"]
              + "/comments?start=" + str(start) + "&limit=20")
    resp = request.urlopen(requrl)
    html_data = resp.read().decode("UTF-8")
    soup = bs(html_data, "html.parser")
    comment_list = []
    comment_div_list = soup.find_all("div", class_="comment")
    for item in comment_div_list:
        comment_dict = {}
        # The rating is the fifth <span> inside the comment header <h3>:
        # its class encodes the score (e.g. "allstar40") and its title
        # holds the verbal rating (e.g. "推荐").
        item_score = item.find_all("h3")[0]
        item_score = item_score.find_all("span")[4]
        comment_dict["得分"] = item_score["class"]
        comment_dict["评级"] = item_score["title"]
        comment_list.append(comment_dict)
    print(comment_list)
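
The rating span's class attribute comes back from BeautifulSoup as a list such as ['allstar40', 'rating'], where the digits encode the star score. As a small follow-up (my addition, assuming Douban keeps the allstarXX class naming), the collected dictionaries can be turned into numeric scores like this:

import re

def star_score(class_list):
    # Extract a numeric score from a class list such as ['allstar40', 'rating'].
    # Assumes the score is encoded as 'allstar<NN>' (allstar40 -> 4.0 stars);
    # returns None when no such class is present.
    for cls in class_list:
        m = re.match(r"allstar(\d+)", cls)
        if m:
            return int(m.group(1)) / 10.0
    return None

scores = [star_score(c["得分"]) for c in comment_list]
print(scores)   # e.g. [4.0, 5.0, 3.0, ...]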

Comments:

# -*- coding: utf-8 -*-
"""
@author: su
"""
from urllib import request
import re
import jieba    # Chinese word-segmentation package
import pandas as pd
import numpy
from bs4 import BeautifulSoup as bs

# Fetch the "now playing" page and collect the id of each movie.
resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)

# Fetch the first page of short comments for the fourth movie in the list.
requrl = ('https://movie.douban.com/subject/' + nowplaying_list[3]['id']
          + '/comments?start=0&limit=20')
resp = request.urlopen(requrl)
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_list = soup.find_all('div', class_='comment')

# Collect the text of each comment.
eachCommentList = []
for item in comment_div_list:
    if item.find_all('p')[0].string is not None:
        eachCommentList.append(item.find_all('p')[0].string)

# Join the comments, keep only Chinese characters, and write the result once.
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + str(eachCommentList[k]).strip()
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)
with open("s.txt", "w") as f:
    f.write(cleaned_comments)
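
jieba, pandas and numpy are imported above but not used in this script; in hang's original post they feed the word-frequency step that follows. A minimal sketch of that step (my addition, working from the s.txt file written above; stop-word filtering is omitted):

import jieba
import pandas as pd

# Read the cleaned comment text back (written above with the platform default encoding).
with open("s.txt") as f:
    cleaned_comments = f.read()

# Segment the text into words and count word frequencies.
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({"segment": segment})
words_df = words_df[words_df["segment"].str.len() > 1]   # drop single characters
words_stat = (words_df.groupby("segment")
                      .size()
                      .reset_index(name="count")
                      .sort_values("count", ascending=False))
print(words_stat.head(10))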

              