您的位置:首页 > 其它

新浪微博数据挖掘食谱之十: 元素篇 (提取转发微博的元素)

2015-01-06 07:40 344 查看
#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
Created on 2015-1-6
@author: beyondzhou
@name: extract_repost_attributions.py
'''

# Extract repost attributions
def extract_repost_attributions():
    """Search Weibo for 'iphone' and print the attributions of one repost.

    Steps, in order:
      1. obtain an API client via weibo_login();
      2. run weibo_search(topic='iphone') and decode it with weibo_entities();
      3. pick the first result whose repost count is greater than zero;
      4. fetch that weibo's repost timeline and extract the attributions of
         the first repost in it;
      5. print the reposted weibo as JSON, then each attribution.

    Returns None; all results are written to stdout. If no search result has
    any repost, a message is printed and the function returns early.
    """
    # Project-local helpers are imported inside the function, as the original
    # script did, so importing this module has no side effects.
    from search import weibo_search
    from entities import weibo_entities
    from login import weibo_login
    from statuses import fetch_repost_timeline, fetch_weibo_status, get_rt_attributions
    import json

    # Access to sina api
    weibo_api = weibo_login()

    # Do the search
    subject = weibo_search(topic='iphone')

    # Decode entities
    (mids, names, texts, dates, reposts, comments, likes) = weibo_entities(subject)

    # Find the first weibo id whose repost number is above 0
    weibo_id_reposted = None
    for mid, repost_count in zip(mids, reposts):
        if repost_count > 0:
            weibo_id_reposted = mid
            print('reposts number: %s, weibo_id_reposted: %s' % (repost_count, weibo_id_reposted))
            break

    # Without this guard the lookups below would hit an unbound name.
    if weibo_id_reposted is None:
        print('No reposted weibo found!')
        return

    # Find repost timeline
    repost_timeline = fetch_repost_timeline(weibo_api, count=200, page=1, weibo_id=weibo_id_reposted)

    # Extract repost attribution (use the first record of repost_timeline to
    # do the example); an empty timeline simply yields no attributions.
    if repost_timeline:
        repost_attributions = get_rt_attributions(repost_timeline[0])
    else:
        repost_attributions = []

    # Output repost weibo
    repost_weibo = fetch_weibo_status(weibo_api, weibo_id=weibo_id_reposted)
    print(json.dumps(repost_weibo, indent=1))
    print('Output repost weibo done!\n')

    # Output repost attribution
    for repost_att in repost_attributions:
        print(repost_att)
    print('Output repost attribution done!\n')

# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    extract_repost_attributions()
# Get the repost timeline of a weibo
def fetch_repost_timeline(weibo_api, count = 200, page = 1, weibo_id = 1):
    """Return the 'reposts' entry of a weibo's repost timeline.

    Calls weibo_api.statuses.repost_timeline.get(count=..., page=..., id=...)
    and returns the value stored under the 'reposts' key of the response.

    Args:
        weibo_api: API client exposing statuses.repost_timeline.get(...).
        count: forwarded as the ``count`` request argument.
        page: forwarded as the ``page`` request argument.
        weibo_id: forwarded as the ``id`` request argument.

    Raises:
        KeyError: if the response has no 'reposts' key.
    """
    repost_timeline = weibo_api.statuses.repost_timeline.get(count=count, page=page, id=weibo_id)
    return repost_timeline['reposts']


# Get weibo status
def fetch_weibo_status(weibo_api, weibo_id = 1):
    """Return the response of weibo_api.statuses.show.get(id=weibo_id).

    Args:
        weibo_api: API client exposing statuses.show.get(...).
        weibo_id: forwarded as the ``id`` request argument.

    Returns:
        Whatever the client returns for the request, unmodified.
    """
    return weibo_api.statuses.show.get(id=weibo_id)


# get repost attributions
def get_rt_attributions(repost):
    """Return the lower-cased screen names a repost is attributed to.

    Two sources are combined and de-duplicated:
      * the author of repost['retweeted_status'], when that key is present;
      * the @handles that directly follow the first "RT", "via" or Chinese
        "repost" marker (U+8F6C U+53D1) found in repost['text'].

    Args:
        repost: mapping with a 'text' entry and, optionally, a
            'retweeted_status' entry holding ['user']['screen_name'].

    Returns:
        A list of unique names, lower-cased and without a leading "@".
        The order of the list is unspecified. It is empty when neither
        source yields a name.

    Example:
        get_rt_attributions({'text': 'RT @SocialWebMining'})
        returns ['socialwebmining'].
    """
    import re

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J).
    # Written as an escaped unicode literal because the ur"" prefix exists
    # only in Python 2; re.UNICODE lets \w match non-ASCII (e.g. Chinese)
    # screen names on Python 2 as well.
    rt_patterns = re.compile(
        u"(RT|via|\u8f6c\u53d1)((?:\\b\\W*@\\w+)+)",
        re.IGNORECASE | re.UNICODE
    )
    # Pulls the individual handles out of the matched mention run, so that
    # punctuation-separated handles ("@a:@b") are not glued together.
    handle_pattern = re.compile(u"@(\\w+)", re.UNICODE)
    rt_attributions = []

    # Inspect the status to see if it carries the original it reposts
    # (the Twitter equivalent is produced with /statuses/retweet/:id,
    # see https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid).
    if 'retweeted_status' in repost:
        rt_attributions.append(repost['retweeted_status']['user']['screen_name'])

    # Also, inspect the text for the presence of "legacy" retweet patterns
    # such as "RT" and "via", which are still widely used for various reasons
    # and potentially very useful. See https://dev.twitter.com/discussions/2847
    # and https://dev.twitter.com/discussions/1748 for some details on how/why.
    # Only the first marker in the text is considered.
    first_match = rt_patterns.search(repost['text'])
    if first_match is not None:
        rt_attributions.extend(handle_pattern.findall(first_match.group(2)))

    # Normalise and filter out any duplicates
    return list(set(name.strip("@").lower() for name in rt_attributions))


Result:

callback_url: https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//apps.weibo.com/guaguastd&response_type=code&client_id=2925245021
return_redirect_uri: http://weibo.com/login.php?url=http%3A%2F%2Fapps.weibo.com%2Fguaguastd%3Fcode%3D9d0a0ecb4df4db1d8d1a6ef5460c5e82
code: ['9d0a0ecb4df4db1d8d1a6ef5460c5e82']
now_handle: ce2b7c50-9531-11e4-b8c2-7bd88716b5dd
http://passport.weibo.com/
all_handles: [u'ce2b7c50-9531-11e4-b8c2-7bd88716b5dd', u'd3ba1000-9531-11e4-b8c2-7bd88716b5dd']
search done!
mids entities done!
names entities done!
texts entities done!
dates entities done!
reposts entities done!
comments entities done!
likes entities done!
reposts number: 6964, weibo_id_reposted: 3795801400243898
{
"reposts_count": 6975,
"truncated": false,
"text": "1 toy 1 day\uff0c\u7b2c178\u671f\uff1a\u7f8e\u56fdBluelounge\uff0diPhone 5/5s\u6700\u4f73\u89c2\u770b\u89d2\u5ea6\u5145\u7535\u57fa\u5ea7\u3002\u624b\u673a\u653e\u5728\u684c\u4e0a\u5145\u7535\uff0c\u60f3\u770b\u4e00\u4e9b\u4e1c\u897f\uff0c\u611f\u89c9\u603b\u662f\u4e0d\u8212\u670d\u3002\u6709\u4e86\u5b83\uff0c\u4e0d\u4ec5\u5916\u89c2\u9ad8\u5927\u4e0a\uff0c\u8fd8\u8ba9\u4f60\u6709\u4e2a\u66f4\u597d\u7684\u89c2\u770b\u89d2\u5ea6\uff0c\u5145\u7535\u65f6\u7528\u8d77\u6765\u4e5f\u662f\u90a3\u4e48\u987a\u7545\u81ea\u5982\uff08\u8fd9\u662f\u6211\u9001\u51fa\u7684\u7b2c2232\u4ef6\u793c\u7269\uff0c\u5173\u8f6c\uff0c1\u67086\u65e5\u62bd\uff0c\u4e0d\u52301\u5929\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff0c\u56e0\u4e3a\u6709\u4f60\uff01\uff09",
"visible": {
"type": 0,
"list_id": 0
},
"in_reply_to_status_id": "",
"bmiddle_pic": "http://ww1.sinaimg.cn/bmiddle/005wRYdajw1enz2uspb4xj313y0pgmza.jpg",
"id": 3795801400243898,
"thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg",
"mid": "3795801400243898",
"source": "<a href=\"http://weibo.com/\" rel=\"nofollow\">\u5fae\u535a weibo.com</a>",
"attitudes_count": 187,
"in_reply_to_screen_name": "",
"pic_urls": [
{
"thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg"
},
{
"thumbnail_pic": "http://ww2.sinaimg.cn/thumbnail/005wRYdajw1enz2p7nulhj30go0egmxu.jpg"
},
{
"thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2pa8b0fj30i20i20uj.jpg"
},
{
"thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2pd1z82j312w12w40j.jpg"
},
{
"thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2us8x7vj31420mytaf.jpg"
},
{
"thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2utiqwqj30r00n2dgk.jpg"
},
{
"thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2uu0mcuj30us0ps76b.jpg"
},
{
"thumbnail_pic": "http://ww2.sinaimg.cn/thumbnail/005wRYdajw1enz2vbtjkxj30lo0c7q53.jpg"
},
{
"thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2vmki74j30sg0ilab2.jpg"
}
],
"in_reply_to_user_id": "",
"darwin_tags": [],
"favorited": false,
"original_pic": "http://ww1.sinaimg.cn/large/005wRYdajw1enz2uspb4xj313y0pgmza.jpg",
"idstr": "3795801400243898",
"source_type": 1,
"user": {
"cover_image": "http://ww4.sinaimg.cn/crop.0.0.920.300/005wRYdajw1emok192jcyj30pk08cgoi.jpg",
"bi_followers_count": 3,
"domain": "",
"avatar_large": "http://tp1.sinaimg.cn/5066369752/180/5712388302/1",
"verified_source": "",
"ptype": 0,
"cover_image_phone": "http://ww2.sinaimg.cn/crop.0.0.0.0/005wRYdajw1emovpmsh52j30hs0hrwhh.jpg",
"statuses_count": 12132,
"id": 5066369752,
"verified_reason_url": "",
"city": "1000",
"verified": true,
"friends_count": 4,
"verified_reason_modified": "",
"credit_score": 80,
"block_app": 1,
"follow_me": false,
"verified_reason": "\u5317\u4eac\u7231\u8d34\u8fbe\u4eba\u7f51\u7edc\u6280\u672f\u6709\u9650\u516c\u53f8",
"followers_count": 634775,
"location": "\u5317\u4eac",
"verified_state": 0,
"verified_trade": "",
"mbtype": 12,
"verified_source_url": "",
"profile_url": "u/5066369752",
"block_word": 0,
"avatar_hd": "http://ww1.sinaimg.cn/crop.0.0.943.943.1024/005wRYdajw1emu8osezk8j30q90q9jsp.jpg",
"star": 0,
"description": "\u6211\u9001\u7684\u4e0d\u662f\u793c\u7269\uff0c\u662f\u4efd\u5e0c\u671b\uff0c\u6bcf\u592910\u4e2a\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff01",
"verified_contact_email": "lucky@itie520.com",
"online_status": 0,
"mbrank": 4,
"verified_level": 3,
"profile_image_url": "http://tp1.sinaimg.cn/5066369752/50/5712388302/1",
"idstr": "5066369752",
"verified_contact_mobile": "",
"allow_all_act_msg": false,
"allow_all_comment": true,
"geo_enabled": true,
"class": 1,
"screen_name": "\u7231\u8d34",
"lang": "zh-cn",
"weihao": "",
"remark": "",
"favourites_count": 21,
"name": "\u7231\u8d34",
"url": "",
"province": "11",
"created_at": "Tue Mar 11 20:16:07 +0800 2014",
"verified_contact_name": "Jason",
"verified_type": 2,
"gender": "m",
"following": false,
"pagefriends_count": 0,
"urank": 15
},
"geo": null,
"created_at": "Mon Jan 05 23:50:39 +0800 2015",
"mlevel": 0,
"comments_count": 745
}
Output repost weibo done!

爱贴
Output repost attribution done!
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: