
Python crawler: Douban movie comments and ratings

2017-09-21 19:16
Much of this is adapted from hang's blog post: https://segmentfault.com/a/1190000010473819

Ratings:

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 16:19:02 2017

@author: su
"""
from urllib import request
from bs4 import BeautifulSoup as bs

# Fetch the "now playing" page and collect the id/title/name of each movie.
resp = request.urlopen("https://movie.douban.com/nowplaying/hangzhou/")
html_data = resp.read().decode("UTF-8")
soup = bs(html_data, "html.parser")
nowplaying_movie = soup.find_all("div", id="nowplaying")
nowplaying_movie_list = nowplaying_movie[0].find_all("li", class_="list-item")

nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict["id"] = item["data-subject"]
    nowplaying_dict["title"] = item["data-title"]
    for tag_img_item in item.find_all("img"):
        nowplaying_dict["name"] = tag_img_item["alt"]
        nowplaying_list.append(nowplaying_dict)

# Walk the first three pages of short comments (20 per page) for the first
# movie and record the star rating of each comment.
for start in range(0, 60, 20):
    requrl = ("https://movie.douban.com/subject/" + nowplaying_list[0]["id"]
              + "/comments?start=" + str(start) + "&limit=20")
    resp = request.urlopen(requrl)
    html_data = resp.read().decode("UTF-8")
    soup = bs(html_data, "html.parser")
    comment_list = []
    comment_div_list = soup.find_all("div", class_="comment")
    for item in comment_div_list:
        comment_dict = {}
        # The rating is the fifth <span> inside the comment header <h3>:
        # its class encodes the score (e.g. "allstar40") and its title
        # holds the verbal rating (e.g. "推荐").
        item_score = item.find_all("h3")[0]
        item_score = item_score.find_all("span")[4]
        comment_dict["得分"] = item_score["class"]
        comment_dict["评级"] = item_score["title"]
        comment_list.append(comment_dict)
    print(comment_list)
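
The rating span's class attribute comes back from BeautifulSoup as a list such as ['allstar40', 'rating'], where the digits encode the star score. As a small follow-up (my addition, assuming Douban keeps the allstarXX class naming), the collected dictionaries can be turned into numeric scores like this:

import re

def star_score(class_list):
    # Extract a numeric score from a class list such as ['allstar40', 'rating'].
    # Assumes the score is encoded as 'allstar<NN>' (allstar40 -> 4.0 stars);
    # returns None when no such class is present.
    for cls in class_list:
        m = re.match(r"allstar(\d+)", cls)
        if m:
            return int(m.group(1)) / 10.0
    return None

scores = [star_score(c["得分"]) for c in comment_list]
print(scores)   # e.g. [4.0, 5.0, 3.0, ...]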

Comments:

# -*- coding: utf-8 -*-
"""
@author: su
"""
from urllib import request
import re
import jieba    # Chinese word-segmentation package
import pandas as pd
import numpy
from bs4 import BeautifulSoup as bs

# Fetch the "now playing" page and collect the id of each movie.
resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')

nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)

# Fetch the first page of short comments for the fourth movie in the list.
requrl = ('https://movie.douban.com/subject/' + nowplaying_list[3]['id']
          + '/comments?start=0&limit=20')
resp = request.urlopen(requrl)
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_list = soup.find_all('div', class_='comment')

# Collect the text of each comment.
eachCommentList = []
for item in comment_div_list:
    if item.find_all('p')[0].string is not None:
        eachCommentList.append(item.find_all('p')[0].string)

# Join the comments, keep only Chinese characters, and write the result once.
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + str(eachCommentList[k]).strip()
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)
with open("s.txt", "w") as f:
    f.write(cleaned_comments)
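
jieba, pandas and numpy are imported above but not used in this script; in hang's original post they feed the word-frequency step that follows. A minimal sketch of that step (my addition, working from the s.txt file written above; stop-word filtering is omitted):

import jieba
import pandas as pd

# Read the cleaned comment text back (written above with the platform default encoding).
with open("s.txt") as f:
    cleaned_comments = f.read()

# Segment the text into words and count word frequencies.
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({"segment": segment})
words_df = words_df[words_df["segment"].str.len() > 1]   # drop single characters
words_stat = (words_df.groupby("segment")
                      .size()
                      .reset_index(name="count")
                      .sort_values("count", ascending=False))
print(words_stat.head(10))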

              