您的位置:首页 > 编程语言 > Python开发

python抓取百度百科点赞数等动态数据

2015-12-24 17:08 651 查看
利用 Selenium 模拟浏览器打开页面,待页面加载完成后再抓取数据。

#!/usr/bin/env python
# coding=utf-8

import urllib2
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# Python 2 only: make UTF-8 the process-wide default codec so that implicit
# str <-> unicode conversions later in the script do not fail on non-ASCII
# (Chinese) text. reload(sys) is called first because setdefaultencoding is
# not reachable on the already-initialised sys module in Python 2.
# NOTE(review): this is a global side effect; explicit .encode()/.decode()
# at the I/O boundary is the safer alternative.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class BaikeSpider(object):
    """Breadth-first crawler for Baidu Baike lemma pages.

    Every page is opened in a Selenium-driven Chrome so the vote count and
    page-view counter can be read from the loaded page, while the raw HTML
    is fetched separately with ``urllib2`` to discover further lemma links.
    One tab-separated line ``title, viewed, voted, url`` is written to
    ``./baike_keywords.txt`` per distinct title.

    Attributes:
        queue: list of URLs still to visit (FIFO).
        base: site root used to absolutise relative lemma links.
        crawled: set of URLs already taken from the queue.
        crawled_word: set of lemma titles already written to the output.
    """

    # Absolute links to lemma pages (plain and sub-lemma form).
    _ABSOLUTE_LEMMA = re.compile(
        r"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+/\d+\.htm")
    # Site-relative lemma links, with or without the leading slash.
    _RELATIVE_LEMMA = re.compile(r"/?view/\d+")

    _OUTPUT_PATH = './baike_keywords.txt'

    def __init__(self):
        self.queue = ["http://baike.baidu.com/view/8095.htm",
                      "http://baike.baidu.com/view/2227.htm"]
        self.base = "http://baike.baidu.com"
        self.crawled = set()
        self.crawled_word = set()

        #        client = MongoClient("localhost",27017)
        #        self.db = client["baike_db"]["html"]

    def _normalize_link(self, href):
        """Return the absolute lemma URL for *href*, or None if it is not one.

        Empty values, hrefs shorter than 8 characters and ``javascript``
        pseudo-links are rejected. Relative ``view/<id>`` links are joined
        to ``self.base`` with exactly one slash.
        """
        if not href or len(href) < 8 or re.search(r"javascript", href):
            return None
        if self._ABSOLUTE_LEMMA.search(href):
            return href
        if self._RELATIVE_LEMMA.match(href):
            return self.base + '/' + href.lstrip('/')
        return None

    def _enqueue_links(self, page_url):
        """Fetch *page_url* and append its not-yet-crawled lemma links to the queue."""
        response = urllib2.urlopen(page_url)
        try:
            html = response.read()
        finally:
            response.close()

        anchors = BeautifulSoup(html, 'lxml').find_all("a")
        # De-duplicate by href so the same target is queued once per page.
        hrefs = set(anchor.get('href') for anchor in anchors)
        for href in hrefs:
            target = self._normalize_link(href)
            if target is not None and target not in self.crawled:
                self.queue.append(target)

    def _read_record(self, browser, page_url):
        """Read the currently loaded page and return ``(title, viewed, voted, url)``.

        Returns None when the page title has already been written out.
        Raises whatever Selenium raises if an expected element is missing.
        """
        title = browser.title.split(u"_")[0]
        if title in self.crawled_word:
            print('title %s has crawled' % title)
            return None

        voted = browser.find_element_by_class_name("vote-count").text
        viewed = browser.find_element_by_id("j-lemmaStatistics-pv").text
        return (title, viewed, voted, page_url)

    def crawl(self):
        """Drain the queue, writing one record per new lemma title.

        Per-page failures are printed and skipped so one bad page does not
        stop the crawl. The browser and the output file are always released
        when the method exits.
        """
        browser = webdriver.Chrome()
        try:
            fw = open(self._OUTPUT_PATH, 'wb')
            try:
                cnt = 0
                while self.queue:
                    url = self.queue.pop(0)
                    if url in self.crawled:
                        continue
                    self.crawled.add(url)
                    try:
                        browser.get(url)
                        self._enqueue_links(url)

                        cnt += 1
                        print(cnt)
                        if cnt % 10 == 0:
                            print('queue %d' % len(self.queue))
                            # Persist progress periodically without reopening.
                            fw.flush()

                        record = self._read_record(browser, url)
                        if record is None:
                            continue

                        line = '\t'.join(record) + '\n'
                        # Encode explicitly: the file is opened in binary mode.
                        fw.write(line.encode('utf-8'))
                        self.crawled_word.add(record[0])
                    except Exception as e:
                        # Best-effort crawl: report and move on to the next URL.
                        print(e)
            finally:
                fw.close()
        finally:
            browser.quit()

if __name__ == '__main__':
    # Script entry point: build a spider with its default seed URLs and run it.
    BaikeSpider().crawl()


另外,使用 Chrome 加载会比 Firefox 快,而且报错和异常退出的情况也更少。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: