python--爬虫微博热门数据
2018-03-07 15:41
519 查看
#coding=utf-8
'''
从weibo.com中获取前20条热门话题
'''
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as Wait
import unittest
import time
import sys
reload(sys)
sys.setdefaultencoding('utf8')
class Weibo():
def __init__(self):
self.dr = webdriver.Firefox()
@property
def hot_topic_page_1_url(self):
return 'http://d.weibo.com/100803?cfs=&Pl_Discover_Pt6Rank__5_filter=hothtlist_type%3D1#_0'
@property
def hot_topic_page_2_url(self):
return 'http://d.weibo.com/100803?cfs=920&Pl_Discover_Pt6Rank__5_filter=hothtlist_type&Pl_Discover_Pt6Rank__5_page=2#Pl_Discover_Pt6Rank__5'
@property
def host_url(self):
return 'http://d.weibo.com/102803?from=unlogin_home&mod=pindao&type=hotweibo'
#注意所获取url的不一致
def get_top_20_hot_topics(self):
first_part = self.get_first_15_hots()
last_part = self.get_last_5_hots()
for item in last_part:
first_part.append(item)
print first_part
return first_part
def get_top_10_hots(self):
hots = []
self.dr.get(self.host_url)
time.sleep(2)
self.wait_js_complete('.WB_cardwrap')
wrap_div = self.dr.find_element_by_css_selector('.WB_feed')
cards = self.dr.find_element_by_css_selector('.WB_cards')
for card in cards[:-2]:
item = []
item['author'] = card.find_element_by_class_name('W_f14').text
item['content'] = card.find_element_by_css_selector('.WB_text').text
item['forward'] = card.find_element_by_css_selector('span[node-type="forward_btn_text"]').text
item['comment'] = card.find_element_by_css_selector('span[node-type="comment_btn_text"]').text
item['up'] = card.find_elements_by_css_selector('.WB_feed_handle li')[-1].text
return hots
def wait_js_complete(self,target_elm_css):
def element_present(dr,css_selector):
try:
elm = self.dr.find_elements_by_css_selector(css_selector)
if elm.is_displayed():
return True
else:
return False
except:
return False
Wait(self.dr,5).until(lambda dr:element_present(dr, target_elm_css))
def get_first_15_hots(self):
hots =[]
self.dr.get(self.hot_topic_page_1_url)
time.sleep(2)
self.wait_js_complete('.DS_topicon')
hot_divs =self.dr.find_elements_by_css_selector('#Pl_Discover_Pt6Rank__5 .info_box')
for div in hot_divs:
item = {}
item['order'] = div.find_element_by_css_selector('.W_autocut>:first-child').text
if 'TOP' in item['order']:
item['order'] =item['order'].rapace('TOP','')
item['title'] = div.find_element_by_css_selector('.S_txt1').text
item['tag'] = div.find_element_by_css_selector('.W_btn_tag').text
item['subtitle'] = div.find_element_by_css_selector('.subtitle').text
item['page_view'] = div.find_element_by_css_selector('number').text
hots.append(item)
return hots
def get_last_5_hots(self):
hots = []
self.dr.get(self.hot_topic_page_2_url)
time.sleep(2)
self.wait_js_complete('.DSC_topicon')
hot_divs = self.r.find_element_by_css_selector('#Pl_Discover_Pt6Rank__5 .info_box')
for div in hot_divs:
item = []
item['order'] = div.find_element_by_css_selector('.W_autocut>:first-child').text
if item['order'] == '21':
break
item['title'] = div.find_element_by_css_selector('.S_txt1').text
item['tag'] = div.find_element_by_css_selector('.W_btn_tag').text
item['subtitle'] = div.find_element_by_css_selector('.subtitle').text
item['page_view'] = div.find_element_by_css_selector('number').text
hots.append(item)
return hots
def quit(self):
self.dr.quit()
if __name__ == '__main__':
# hots = Weibo().get_top_20_hot_topics()
hots = Weibo().get_top_10_hots()
print hots
<br />
'''
从weibo.com中获取前20条热门话题
'''
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as Wait
import unittest
import time
import sys
reload(sys)
sys.setdefaultencoding('utf8')
class Weibo():
def __init__(self):
self.dr = webdriver.Firefox()
@property
def hot_topic_page_1_url(self):
return 'http://d.weibo.com/100803?cfs=&Pl_Discover_Pt6Rank__5_filter=hothtlist_type%3D1#_0'
@property
def hot_topic_page_2_url(self):
return 'http://d.weibo.com/100803?cfs=920&Pl_Discover_Pt6Rank__5_filter=hothtlist_type&Pl_Discover_Pt6Rank__5_page=2#Pl_Discover_Pt6Rank__5'
@property
def host_url(self):
return 'http://d.weibo.com/102803?from=unlogin_home&mod=pindao&type=hotweibo'
#注意所获取url的不一致
def get_top_20_hot_topics(self):
first_part = self.get_first_15_hots()
last_part = self.get_last_5_hots()
for item in last_part:
first_part.append(item)
print first_part
return first_part
def get_top_10_hots(self):
hots = []
self.dr.get(self.host_url)
time.sleep(2)
self.wait_js_complete('.WB_cardwrap')
wrap_div = self.dr.find_element_by_css_selector('.WB_feed')
cards = self.dr.find_element_by_css_selector('.WB_cards')
for card in cards[:-2]:
item = []
item['author'] = card.find_element_by_class_name('W_f14').text
item['content'] = card.find_element_by_css_selector('.WB_text').text
item['forward'] = card.find_element_by_css_selector('span[node-type="forward_btn_text"]').text
item['comment'] = card.find_element_by_css_selector('span[node-type="comment_btn_text"]').text
item['up'] = card.find_elements_by_css_selector('.WB_feed_handle li')[-1].text
return hots
def wait_js_complete(self,target_elm_css):
def element_present(dr,css_selector):
try:
elm = self.dr.find_elements_by_css_selector(css_selector)
if elm.is_displayed():
return True
else:
return False
except:
return False
Wait(self.dr,5).until(lambda dr:element_present(dr, target_elm_css))
def get_first_15_hots(self):
hots =[]
self.dr.get(self.hot_topic_page_1_url)
time.sleep(2)
self.wait_js_complete('.DS_topicon')
hot_divs =self.dr.find_elements_by_css_selector('#Pl_Discover_Pt6Rank__5 .info_box')
for div in hot_divs:
item = {}
item['order'] = div.find_element_by_css_selector('.W_autocut>:first-child').text
if 'TOP' in item['order']:
item['order'] =item['order'].rapace('TOP','')
item['title'] = div.find_element_by_css_selector('.S_txt1').text
item['tag'] = div.find_element_by_css_selector('.W_btn_tag').text
item['subtitle'] = div.find_element_by_css_selector('.subtitle').text
item['page_view'] = div.find_element_by_css_selector('number').text
hots.append(item)
return hots
def get_last_5_hots(self):
hots = []
self.dr.get(self.hot_topic_page_2_url)
time.sleep(2)
self.wait_js_complete('.DSC_topicon')
hot_divs = self.r.find_element_by_css_selector('#Pl_Discover_Pt6Rank__5 .info_box')
for div in hot_divs:
item = []
item['order'] = div.find_element_by_css_selector('.W_autocut>:first-child').text
if item['order'] == '21':
break
item['title'] = div.find_element_by_css_selector('.S_txt1').text
item['tag'] = div.find_element_by_css_selector('.W_btn_tag').text
item['subtitle'] = div.find_element_by_css_selector('.subtitle').text
item['page_view'] = div.find_element_by_css_selector('number').text
hots.append(item)
return hots
def quit(self):
self.dr.quit()
if __name__ == '__main__':
# hots = Weibo().get_top_20_hot_topics()
hots = Weibo().get_top_10_hots()
print hots
<br />
相关文章推荐
- python利用新浪API实现数据的抓取 / python微博数据爬虫
- python--爬虫知乎热门数据
- 如何科学地蹭热点:用python爬虫获取热门微博评论并进行情感分析
- python利用新浪API实现数据的抓取 / python微博数据爬虫
- Python微博地点签到大数据实战(三)大数据利器:爬虫
- python爬虫入门 实战(四)---爬“榜姐”话题微博及热门评论
- 【python网络编程】新浪爬虫:关键词搜索爬取微博数据
- 采用python的pyquery引擎做网页爬虫,进行数据分析
- 基于微博数据用 Python 打造一颗“心”
- python爬虫案例——东方财富股票数据采集
- python爬虫之爬取CQU毕业设计网批量获取数据
- python爬虫+R数据可视化 实例
- Python爬虫:抓取手机APP的传输数据
- python --网页爬虫,文本处理,科学计算,机器学习,数据挖掘资料+附带工具包下载
- Python爬取微博数据生成词云图片
- 实践项目十:爬取百度百科Python词条相关1000个页面数据(慕课简单爬虫实战)
- Python爬虫实现数据可视化,为你做一个城市旅游数据分析
- 【干货】Python爬虫/文本处理/科学计算/机器学习/数据挖掘兵器谱
- python通过伪装头部数据抵抗反爬虫的实例
- python&php数据抓取、爬虫分析与中介,有网址案例