python爬虫 登录豆瓣 爬豆瓣电影短评
2016-07-06 12:07
471 查看
这个爬虫的目的是爬取豆瓣电影短评和评分(从1星到5星),这些数据可以用于情感分类。由于不登录的情况下只能看到电影短评的前几页,所以要先实现登录豆瓣。
登录豆瓣的部分参考了网上别人的代码,出处已不可考。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
from time import sleep
import random
import urllib
import requests
# Douban login endpoint and the POST form payload.
# NOTE(review): form_email / form_password are redacted placeholders — fill
# in real credentials before running.
url = "https://douban.com/accounts/login"
formData = {
    "redir": "https://www.douban.com",
    "form_email": "**************",
    "form_password": "**************",
    "login": u'登录',
    'source': 'None',
}
# Browser-like headers so the login POST looks like a normal browser request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36",
           "Referer": "https://douban.com/accounts/login",
           "Host": "accounts.douban.com",
           "Connection": "Keep-Alive",
           "Content-Type": "application/x-www-form-urlencoded"
           }
# Session object keeps the login cookies for every later request.
s = requests.session()
# Captcha flow (disabled). After repeated logins Douban starts demanding a
# captcha; uncomment this block to download the captcha image to captcha.jpg,
# type the characters at the prompt, and attach them to the form data.
# r_ = s.post(url, data=formData, headers=headers)
# a = r_.text
# soup_ = BeautifulSoup(a, "html.parser")
# captchaAddr = soup_.find('img', id='captcha_image')['src']
# reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
# captchaID = re.findall(reCaptchaID, a)
# urllib.urlretrieve(captchaAddr, "captcha.jpg")
#
# captcha = raw_input('please input the captcha:')
# formData['captcha-solution'] = captcha
# formData['captcha-id'] = captchaID
# Perform the actual login; afterwards `s` carries the authenticated cookies.
r_ = s.post(url, data=formData, headers=headers)
page_ = r_.text
# print page_
# co = r_.cookies
"""---------------------------------------------------------------------------------"""
number = 0  # running count of comments processed, shared across all pages


def process_h3(soup, fp):
    """Extract the star rating and text of every comment on one page.

    Each Douban short comment is headed by an <h3>; for rated comments this
    writes one line "<stars> <comment text>" to fp. Comments without a star
    rating are skipped on purpose.

    NOTE(review): the sibling navigation below depends on the exact parse
    tree (including whitespace text nodes) of Douban's 2016 comment markup
    and on Python 2 generator `.next()` — confirm before reusing.
    """
    global number
    h3s = soup.findAll("h3")
    for i in h3s:
        # Navigate from the <span> inside the <h3> to the node holding the
        # rating spans. NOTE(review): the chained .next() calls mix the
        # Python 2 generator protocol with BeautifulSoup navigation —
        # verify against the bs4 version actually in use.
        aa = i.span.next_siblings
        bb = aa.next().next()
        number += 1
        if number % 100 == 0:
            print number  # progress marker every 100 comments
        # Only rated comments have 4 child nodes here; unrated ones are
        # deliberately ignored (see the blog note accompanying this code).
        if len(bb) == 4:
            # class value is presumably like "allstar40"; [-2:-1] keeps the
            # single digit "4" = number of stars. TODO confirm markup.
            fp.write(bb[2].attrs["class"][0][-2:-1])
            fp.write(" ")
            cc = i.next_siblings
            cc.next()  # skip the text node directly after the <h3>
            dd = cc.next().get_text().strip()
            ee = dd.replace('\n', " ")  # keep each comment on a single line
            fp.write(ee.encode("utf8"))
            # print ee.encode("utf8")
            fp.write('\n')
def find_next(soup):
    """Return the absolute URL of the "next page" link on a comments page.

    Returns None when there is no next-page link (last page reached).
    Relies on the module-level `target` (the comments base URL), because the
    pagination href is relative (e.g. "?start=20&...").
    """
    # Bug fix: the original passed the SET {"class", "next"} as the attrs
    # argument; BeautifulSoup's attrs filter must be a dict mapping
    # attribute name -> value.
    links = soup.findAll("a", {"class": "next"})
    if not links:
        return None
    # Prepend the base URL since the href is relative to the comments page.
    return target + links[0].attrs["href"]
"""---------------------------------------------------------------------------------"""
target = "https://movie.douban.com/subject/25944714/comments"
"""------------------------------------------------------------------------ params"""
movie = s.get(target) # , cookies=co)
page_movie = movie.text
# print page_movie
# print movie.status_code
soupMovie = BeautifulSoup(page_movie)
numb_ = soupMovie.findAll("ul", {"class": "fleft"})
print "total:", re.findall('(\d+)', numb_[0].text)[0]
movieName = soupMovie.find("title").get_text()[:-3]
print movieName
with open(movieName + ".txt", 'w') as fp:
process_h3(soupMovie, fp)
while True:
inter = random.gauss(9, 2)
time = inter if inter > 2.1 else 2.1
sleep(time)
next_ = find_next(soupMovie)
if next_ is None:
break
try:
soupMovie = BeautifulSoup(s.get(next_, timeout=10).text)
process_h3(soupMovie, fp)
except:
sleep(100)
try:
soupMovie = BeautifulSoup(s.get(next_, timeout=10).text)
process_h3(soupMovie, fp)
except:
break
登录豆瓣的时候刚开始不用验证码,登录次数多了再登就需要验证码了。当需要验证码时,把前面被注释掉的验证码相关代码的注释去掉,程序会将验证码图片下载下来,再提示你输入验证码,照图片上的验证码输入就可以了。
从指定的第一页开始,程序每爬一页都会去找下一页的链接,接着去爬。其实不一定要从第一页开始,从任意页开始都行。每爬完一页,将等待一段时间,等待的时间长短服从均值为9,标准差为2的高斯分布,这是为了使爬虫行为更加没有规律。
很多电影短评到最后几页无法访问,所以程序在连续两次无法访问之后就会停止。
爬到的数据如下图所示:
转载请注明出处,如有问题,请评论。
登陆豆瓣的部分是在网上看的别人的代码,忘了从哪看的了。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
from time import sleep
import random
import urllib
import requests
# Douban login endpoint and the POST form payload.
# NOTE(review): form_email / form_password are redacted placeholders — fill
# in real credentials before running.
url = "https://douban.com/accounts/login"
formData = {
    "redir": "https://www.douban.com",
    "form_email": "**************",
    "form_password": "**************",
    "login": u'登录',
    'source': 'None',
}
# Browser-like headers so the login POST looks like a normal browser request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36",
           "Referer": "https://douban.com/accounts/login",
           "Host": "accounts.douban.com",
           "Connection": "Keep-Alive",
           "Content-Type": "application/x-www-form-urlencoded"
           }
# Session object keeps the login cookies for every later request.
s = requests.session()
# Captcha flow (disabled). After repeated logins Douban starts demanding a
# captcha; uncomment this block to download the captcha image to captcha.jpg,
# type the characters at the prompt, and attach them to the form data.
# r_ = s.post(url, data=formData, headers=headers)
# a = r_.text
# soup_ = BeautifulSoup(a, "html.parser")
# captchaAddr = soup_.find('img', id='captcha_image')['src']
# reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
# captchaID = re.findall(reCaptchaID, a)
# urllib.urlretrieve(captchaAddr, "captcha.jpg")
#
# captcha = raw_input('please input the captcha:')
# formData['captcha-solution'] = captcha
# formData['captcha-id'] = captchaID
# Perform the actual login; afterwards `s` carries the authenticated cookies.
r_ = s.post(url, data=formData, headers=headers)
page_ = r_.text
# print page_
# co = r_.cookies
"""---------------------------------------------------------------------------------"""
number = 0  # running count of comments processed, shared across all pages


def process_h3(soup, fp):
    """Extract the star rating and text of every comment on one page.

    Each Douban short comment is headed by an <h3>; for rated comments this
    writes one line "<stars> <comment text>" to fp. Comments without a star
    rating are skipped on purpose.

    NOTE(review): the sibling navigation below depends on the exact parse
    tree (including whitespace text nodes) of Douban's 2016 comment markup
    and on Python 2 generator `.next()` — confirm before reusing.
    """
    global number
    h3s = soup.findAll("h3")
    for i in h3s:
        # Navigate from the <span> inside the <h3> to the node holding the
        # rating spans. NOTE(review): the chained .next() calls mix the
        # Python 2 generator protocol with BeautifulSoup navigation —
        # verify against the bs4 version actually in use.
        aa = i.span.next_siblings
        bb = aa.next().next()
        number += 1
        if number % 100 == 0:
            print number  # progress marker every 100 comments
        # Only rated comments have 4 child nodes here; unrated ones are
        # deliberately ignored (see the blog note accompanying this code).
        if len(bb) == 4:
            # class value is presumably like "allstar40"; [-2:-1] keeps the
            # single digit "4" = number of stars. TODO confirm markup.
            fp.write(bb[2].attrs["class"][0][-2:-1])
            fp.write(" ")
            cc = i.next_siblings
            cc.next()  # skip the text node directly after the <h3>
            dd = cc.next().get_text().strip()
            ee = dd.replace('\n', " ")  # keep each comment on a single line
            fp.write(ee.encode("utf8"))
            # print ee.encode("utf8")
            fp.write('\n')
def find_next(soup):
    """Return the absolute URL of the "next page" link on a comments page.

    Returns None when there is no next-page link (last page reached).
    Relies on the module-level `target` (the comments base URL), because the
    pagination href is relative (e.g. "?start=20&...").
    """
    # Bug fix: the original passed the SET {"class", "next"} as the attrs
    # argument; BeautifulSoup's attrs filter must be a dict mapping
    # attribute name -> value.
    links = soup.findAll("a", {"class": "next"})
    if not links:
        return None
    # Prepend the base URL since the href is relative to the comments page.
    return target + links[0].attrs["href"]
"""---------------------------------------------------------------------------------"""
target = "https://movie.douban.com/subject/25944714/comments"
"""------------------------------------------------------------------------ params"""
movie = s.get(target) # , cookies=co)
page_movie = movie.text
# print page_movie
# print movie.status_code
soupMovie = BeautifulSoup(page_movie)
numb_ = soupMovie.findAll("ul", {"class": "fleft"})
print "total:", re.findall('(\d+)', numb_[0].text)[0]
movieName = soupMovie.find("title").get_text()[:-3]
print movieName
with open(movieName + ".txt", 'w') as fp:
process_h3(soupMovie, fp)
while True:
inter = random.gauss(9, 2)
time = inter if inter > 2.1 else 2.1
sleep(time)
next_ = find_next(soupMovie)
if next_ is None:
break
try:
soupMovie = BeautifulSoup(s.get(next_, timeout=10).text)
process_h3(soupMovie, fp)
except:
sleep(100)
try:
soupMovie = BeautifulSoup(s.get(next_, timeout=10).text)
process_h3(soupMovie, fp)
except:
break
登录豆瓣的时候刚开始不用验证码,登录次数多了再登就需要验证码了。当需要验证码时,把以下代码的注释去掉,程序会将验证码图片下载下来,再提示你输入验证码,照图片上的验证码输入就可以了。
# r_ = s.post(url, data=formData, headers=headers) # a = r_.text # soup_ = BeautifulSoup(a, "html.parser") # captchaAddr = soup_.find('img', id='captcha_image')['src'] # reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/' # captchaID = re.findall(reCaptchaID, a) # urllib.urlretrieve(captchaAddr, "captcha.jpg") # captcha = raw_input('please input the captcha:') # formData['captcha-solution'] = captcha # formData['captcha-id'] = captchaID这个爬虫每次运行会爬取一部电影的所有影评,需要指定该电影短评的首页网址,如下面的代码所示。
"""---------------------------------------------------------------------------------""" target = "https://movie.douban.com/subject/25944714/comments" """------------------------------------------------------------------------ params"""爬虫开始时会输出短评总数以及电影名称,每经过100条输出一次提示。结果保存在以电影名命名的txt文件中。(注:该爬虫忽略了只有评价内容而没有评分的短评)。
从指定的第一页开始,程序每爬一页都会去找下一页的链接,接着去爬。其实不一定要从第一页开始,从任意页开始都行。每爬完一页,将等待一段时间,等待的时间长短服从均值为9,标准差为2的高斯分布,这是为了使爬虫行为更加没有规律。
很多电影短评到最后几页无法访问,所以程序在连续两次无法访问之后就会停止。
爬到的数据如下图所示:
转载请注明出处,如有问题,请评论。
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- 爬虫笔记
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- python自动化测试,生成测试报告