您的位置:首页 > 编程语言 > Python开发

12星座都是什么性格?(python爬虫+jieba分词+词云)

2017-12-25 14:36 260 查看
12星座都是什么性格,大数据告诉你!

下面是利用python爬取12星座性格相关的微博,产生的12星座性格特征词云!白羊座为例,其他的在最后。





上代码(以白羊座为例):

1.微博数据爬取(需要selenium,Firefox的驱动geckodriver)

# coding=utf-8

import time
import datetime
import re
import os
import sys
import codecs
import shutil
import urllib
import random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.action_chains import ActionChains
import xlwt

# Launch Firefox first; the functions below all use this module-level driver.
driver = webdriver.Firefox()

# Step 1: log in through login.sina.com.cn
def LoginWeibo(username, password):
    """Log in to Sina with the module-level Firefox ``driver``.

    Opens the sign-in page, fills in the credentials and clicks the submit
    control. It then waits 10 seconds (time to type a captcha by hand) and
    clicks submit a second time.

    Args:
        username: Account name typed into the ``username`` field.
        password: Password typed into the ``password`` field.

    Returns:
        None. Progress and any error are printed; errors are not re-raised.
    """
    # Imported locally because the file's import header does not bring it in.
    from selenium.common.exceptions import WebDriverException

    try:
        # Enter user name / password and submit.
        print(u'准备登陆Weibo.cn网站...')
        driver.get("https://login.sina.com.cn/signup/signin.php")
        elem_user = driver.find_element_by_name("username")
        elem_user.send_keys(username)
        elem_pwd = driver.find_element_by_name("password")
        elem_pwd.send_keys(password)
        # The submit control has no name attribute, so it is located by an
        # absolute XPath.
        elem_sub = driver.find_element_by_xpath("/html/body/div[1]/div/div[2]/div[2]/form/div[2]/div/ul/li[7]/div[1]")
        elem_sub.click()

        try:
            # Captcha case: leave time to type it, then submit again.
            time.sleep(10)
            elem_sub.click()
        except WebDriverException:
            # No captcha was required. Presumably the first click already
            # navigated away and the old button can no longer be clicked.
            # Only WebDriver errors are ignored, so Ctrl-C during the wait
            # still interrupts the script.
            pass

        print('Crawl in ', driver.current_url)
        print(u'输出Cookie键值对信息:')
        # Cookie dump, left disabled:
        # for cookie in driver.get_cookies():
        #     print(cookie)
        #     for key in cookie:
        #         print(key, cookie[key])
        print(u'登陆成功...')
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print("Error: ", e)
    finally:
        print(u'End LoginWeibo!\n')

# Step 2: open http://s.weibo.com/ and walk through the search results
def GetSearchContent(key):
    """Search Weibo for ``key`` and crawl the results one day at a time.

    Types ``key`` into the search box on s.weibo.com and keeps the part of
    the resulting URL before the first ``&`` as the base URL. For each
    one-day window between the hard-coded start and end dates it loads the
    base URL with a custom time-scope query and calls ``handlePage()``.

    Module globals written: ``start_stamp`` and ``page`` (both read by
    ``getContent``), plus ``outfile`` and ``sheet``.

    Args:
        key: Search keyword.
    """
    global start_stamp
    global page
    global outfile
    global sheet

    driver.get("http://s.weibo.com/")
    print('搜索热点主题:', key)

    # Type the keyword and press Enter to search.
    item_inp = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/input")
    item_inp.send_keys(key)
    item_inp.send_keys(Keys.RETURN)
    time.sleep(3)
    # Keep only the part before the first '&'; the filters are appended below.
    current_url = driver.current_url.split('&')[0]

    # First and last date to crawl.
    start_date = datetime.datetime(2017, 12, 20, 0)
    end_date = datetime.datetime(2017, 12, 24, 0)
    delta_date = datetime.timedelta(days=1)

    # Crawl one day of data per iteration.
    start_stamp = start_date
    end_stamp = start_date + delta_date

    # NOTE(review): a worksheet is added per day, but nothing visible here
    # writes to it or saves the workbook; kept because the names are globals.
    outfile = xlwt.Workbook(encoding='utf-8')

    while end_stamp <= end_date:
        page = 1
        window_start = start_stamp.strftime("%Y-%m-%d-%H")
        window_end = end_stamp.strftime("%Y-%m-%d-%H")
        sheet = outfile.add_sheet(window_start)
        # The parameter must be spelled "&timescope"; the garbled "×cope"
        # (an HTML-entity artefact of "&times") dropped the date filter.
        url = (current_url + '&typeall=1&suball=1&timescope=custom:'
               + window_start + ':' + window_end + '&Refer=g')
        print(url)
        driver.get(url)

        handlePage()

        start_stamp = end_stamp
        end_stamp = end_stamp + delta_date
        # Random pause of 1.1 to 11 seconds between days.
        time.sleep(1 + float(random.randint(1, 100)) / 10)

# Process the loaded result page, then follow "next page" until the end
def handlePage():
    """Scrape the current result page and every following page.

    Each round sleeps for a random 1.05 to 6 seconds, then:
      * stops if ``checkContent()`` reports that the page has no results;
      * otherwise calls ``getContent()`` to save the page's posts;
      * stops if ``checkNext()`` finds no next-page link, else clicks it.
    """
    while True:
        time.sleep(1 + float(random.randint(1, 100)) / 20)

        # Check for content first.
        if not checkContent():
            print("no Content")
            break

        print("getContent")
        getContent()

        # Then check for a next-page button.
        if not checkNext():
            print("no Next")
            break

        next_page_btn = driver.find_element_by_xpath("//a[@class='page next S_txt1 S_line1']")
        next_page_btn.click()

# Tell whether the loaded page has any results
def checkContent():
    """Return True unless the page contains the "no result" placeholder.

    Looks up ``//div[@class='pl_noresult']`` through the module-level
    ``driver``. Finding it means the search returned nothing, so the
    function returns False; not finding it returns True.
    """
    # Imported locally because the file's import header does not bring it in.
    from selenium.common.exceptions import NoSuchElementException

    try:
        driver.find_element_by_xpath("//div[@class='pl_noresult']")
    except NoSuchElementException:
        # Only "element absent" means "has content"; any other WebDriver
        # failure now propagates instead of being read as a positive result.
        return True
    return False

# Tell whether the page has a "next page" button
def checkNext():
    """Return True when the current page contains a next-page link.

    Looks up ``//a[@class='page next S_txt1 S_line1']`` through the
    module-level ``driver``.
    """
    # Imported locally because the file's import header does not bring it in.
    from selenium.common.exceptions import NoSuchElementException

    try:
        driver.find_element_by_xpath("//a[@class='page next S_txt1 S_line1']")
    except NoSuchElementException:
        # Only a missing link means "last page"; other WebDriver failures
        # propagate rather than silently ending the crawl.
        return False
    return True

# Given that the page has content, extract and save it
def getContent():
    """Append the text of every post on the current page to ``BY.txt``.

    Collects the post cards on the page. While none are found it prompts
    on the console (the site is showing a captcha), reloads the page and
    looks again. Characters outside the Basic Multilingual Plane are
    replaced with U+FFFD. A post whose text cannot be read is written as
    an empty line.

    Reads the module globals ``start_stamp`` and ``page`` (set by
    ``GetSearchContent``) for progress output and increments ``page``.
    """
    # Imported locally because the file's import header does not bring it in.
    from selenium.common.exceptions import WebDriverException

    global page

    card_xpath = "//div[@class='WB_cardwrap S_bg2 clearfix']"
    nodes = driver.find_elements_by_xpath(card_xpath)
    print(nodes)
    # Anti-crawling: no cards means a captcha must be solved by hand.
    # Looping (rather than recursing) keeps repeated captchas from growing
    # the call stack.
    while not nodes:
        input("请在微博页面输入验证码!")
        driver.get(driver.current_url)
        nodes = driver.find_elements_by_xpath(card_xpath)
        print(nodes)

    print(str(start_stamp.strftime("%Y-%m-%d-%H")))
    print(u'页数:', page)
    page = page + 1
    print(u'微博数量', len(nodes))

    # Build the non-BMP translation table once per page; it has about a
    # million entries and used to be rebuilt for every post.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    texts = []
    for node in nodes:
        try:
            text = node.find_element_by_xpath(".//div[@class='feed_content wbcon']/p[@class='comment_txt']").text
        except WebDriverException:
            # Card without a readable text paragraph: keep an empty entry.
            text = ''
        texts.append(text.translate(non_bmp_map))

    # Save the text, one post per line.
    with open("BY.txt", 'a', encoding='utf-8') as f:
        f.writelines(text + '\n' for text in texts)
    # Random pause of 1.05 to 6 seconds.
    time.sleep(1 + float(random.randint(1, 100)) / 20)

if __name__ == '__main__':
    # Script entry point: log in, then crawl the results for one keyword.

    username = ''             # put your Weibo user name here
    password = ''             # put your Weibo password here

    LoginWeibo(username, password)       # log in to Weibo

    key = '白羊性格'
    GetSearchContent(key)
2.jieba分词,生成词云

import pickle
from os import path
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def make_worldcloud(file_path):
    """Segment a text file with jieba and render it as a word cloud.

    Reads ``file_path`` as UTF-8, joins the jieba tokens with spaces, and
    builds a word cloud masked and coloured by ``BY.jpg``. The cloud is
    shown with matplotlib and then saved as ``BY_cloud.jpg`` in this
    module's directory.

    Args:
        file_path: Path of the UTF-8 text file to visualise.
    """
    # Read through a context manager so the file handle is closed.
    with open(file_path, 'r', encoding='UTF-8') as text_file:
        text_from_file_with_apath = text_file.read()

    wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
    wl_space_split = " ".join(wordlist_after_jieba)
    print(wl_space_split)

    backgroud_Image = plt.imread('BY.jpg')
    print('加载图片成功!')

    # Word cloud style: start from the library's stop words and add ours.
    stopwords = STOPWORDS.copy()
    print(type(stopwords))
    stopwords.update((
        "哈哈",
        "金牛", "金牛座",
        "双子", "双子座",
        "巨蟹", "巨蟹座",
        "狮子", "狮子座",
        "处女", "处女座",
        "天秤", "天秤座",
        "天蝎", "天蝎座",
        "射手", "射手座",
        "摩羯", "摩羯座",
        "魔蝎座",
        "水瓶", "水瓶座",
        "双鱼", "双鱼座",
    ))

    wc = WordCloud(
        width=1024,
        height=768,
        background_color='white',      # background colour
        mask=backgroud_Image,          # image used as the mask
        font_path='font/simsun.ttc',   # CJK font; without it Chinese renders as empty boxes
        max_words=300,                 # maximum number of words shown
        stopwords=stopwords,           # words to exclude
        max_font_size=400,             # largest font size
        random_state=50,               # fixed seed so the layout is reproducible
    )
    wc.generate_from_text(wl_space_split)
    # Recolour the words with the colours of the background image.
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)

    plt.imshow(wc)
    plt.axis('off')   # hide the axes
    plt.show()

    # Save next to this module.
    d = path.dirname(__file__)
    wc.to_file(path.join(d, "BY_cloud.jpg"))
    print('生成词云成功!')

# Build the word cloud from BY.txt, the file the crawler script appends posts to.
make_worldcloud('BY.txt')
突发奇想做的,数据采集和处理都很粗糙。想要真正通过大数据汇总微博上关于各星座的性格描述,还需要做很多进一步的工作,不过用来娱乐还是挺有意思的。

本来还想把各个星座出现频率最高的词汇总起来,形成星座共性,嗯...懒...我是射手座哈哈。如果有兴趣可以联系我,目前初步的12星座性格词云已经在朋友圈发过了哈哈,有什么问题随时沟通。

把别的也上传一下吧~






















参考资料:
http://blog.csdn.net/destinyuan/article/details/51297528
https://zhuanlan.zhihu.com/p/30107203
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息