Python crawler: Douban comments and ratings
2017-09-21 19:16
This borrows heavily from hang's blog: https://segmentfault.com/a/1190000010473819
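Both scripts below fetch pages with a bare request.urlopen call, which worked at the time of writing. If Douban starts rejecting such requests, one option is to attach a browser-style User-Agent via urllib.request.Request. This is a sketch of my own; the header value is only an example, not part of the original scripts:

from urllib import request

url = "https://movie.douban.com/nowplaying/hangzhou/"
# Attach an example User-Agent so the request looks like it comes from a browser.
req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
html_data = request.urlopen(req).read().decode("utf-8")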
Ratings:
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 16:19:02 2017
@author: su
"""
from urllib import request
from bs4 import BeautifulSoup as bs

# Scrape the "now playing" page to get the id and title of each movie currently showing.
resp = request.urlopen("https://movie.douban.com/nowplaying/hangzhou/")
html_data = resp.read().decode("UTF-8")
soup = bs(html_data, "html.parser")
nowplaying_movie = soup.find_all("div", id="nowplaying")
nowplaying_movie_list = nowplaying_movie[0].find_all("li", class_="list-item")
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict["id"] = item["data-subject"]
    nowplaying_dict["title"] = item["data-title"]
    for tag_img_item in item.find_all("img"):
        nowplaying_dict["name"] = tag_img_item["alt"]
    nowplaying_list.append(nowplaying_dict)

# Walk the first three pages of short comments (20 per page) for the first movie
# and record each comment's star rating.
comment_list = []
for start in range(0, 60, 20):
    requr = ("https://movie.douban.com/subject/" + nowplaying_list[0]["id"]
             + "/comments?start=" + str(start) + "&limit=20")
    resp = request.urlopen(requr)
    html_data = resp.read().decode("UTF-8")
    soup = bs(html_data, "html.parser")
    comment_div_list = soup.find_all("div", class_="comment")
    for item in comment_div_list:
        comment_dict = {}
        item_score = item.find_all("h3")[0]
        spans = item_score.find_all("span")
        if len(spans) < 5:          # skip comments that carry no star rating
            continue
        item_score = spans[4]
        comment_dict["得分"] = item_score["class"]   # score, e.g. ['allstar40', 'rating']
        comment_dict["评级"] = item_score["title"]   # rating label, e.g. '推荐'
        comment_list.append(comment_dict)
print(comment_list)
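The 得分 field stored above is the raw CSS class list of the rating span (e.g. ['allstar40', 'rating']). If a numeric value is more convenient, a small helper can pull the number out of the class name. This is a sketch of my own; the helper name star_to_score and the example class values are assumptions, not part of the original script:

import re

def star_to_score(class_list):
    """Map a rating span's class list such as ['allstar40', 'rating'] to 4.0 stars."""
    for c in class_list:
        m = re.match(r"allstar(\d+)", c)
        if m:
            return int(m.group(1)) / 10.0
    return None   # the comment carried no star rating

# e.g. star_to_score(['allstar40', 'rating']) -> 4.0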
Comments:
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 16:19:02 2017
@author: su
"""
from urllib import request
import re
from bs4 import BeautifulSoup as bs
import jieba      # word-segmentation package; used in the follow-up analysis, not in this script
import pandas as pd
import numpy

# Scrape the "now playing" page to get the id of each movie currently showing.
resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict = {}
    nowplaying_dict['id'] = item['data-subject']
    for tag_img_item in item.find_all('img'):
        nowplaying_dict['name'] = tag_img_item['alt']
    nowplaying_list.append(nowplaying_dict)

# Fetch the first page of short comments (20 per page) for the fourth movie in the list.
requrl = ('https://movie.douban.com/subject/' + nowplaying_list[3]['id']
          + '/comments?start=0&limit=20')
resp = request.urlopen(requrl)
html_data = resp.read().decode('utf-8')
soup = bs(html_data, 'html.parser')
comment_div_list = soup.find_all('div', class_='comment')
eachCommentList = []
for item in comment_div_list:
    if item.find_all('p')[0].string is not None:
        eachCommentList.append(item.find_all('p')[0].string)

# Join the comments, keep only the Chinese characters, and save the result to a file.
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + str(eachCommentList[k]).strip()
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)
with open("s.txt", "w") as f:
    f.write(cleaned_comments)
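The jieba, pandas, and numpy imports are only needed for the follow-up step in hang's original post, where the cleaned text is segmented and word frequencies are counted. A minimal sketch of that step, assuming cleaned_comments from the script above is still in memory (the single-character filter and variable names are my own, and stopword removal is omitted):

import jieba
import pandas as pd

# Segment the cleaned comment text into words.
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Drop single-character tokens, then count word frequencies in descending order.
words_df = words_df[words_df['segment'].str.len() > 1]
word_freq = words_df['segment'].value_counts()
print(word_freq.head(10))   # the ten most frequent words in the comments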