您的位置:首页 > 编程语言 > Python开发

分析python处理基本数据<四>

2016-07-21 21:03 746 查看
# coding=utf-8
import linecache
import time

# Wall-clock start; the end of the script subtracts this to report run time.
now = time.time()

# Column names of one record, in the order the fields appear on each line.
data_keys = ('bid', 'uid', 'username', 'v_class', 'content', 'img', 'created_at', 'source', 'rt_num', 'cm_num',
             'rt_uid', 'rt_username', 'rt_v_class', 'rt_content', 'rt_img', 'src_rt_num', 'src_cm_num', 'gender',
             'rt_bid', 'location', 'rt_mid', 'mid', 'lat', 'lon', 'lbs_type', 'lbs_title', 'poiid', 'links', 'hashtags',
             'ats', 'rt_links', 'rt_hashtags', 'rt_ats', 'v_url', 'rt_v_url')
# Column name -> positional index inside a parsed record.
keys = {name: index for index, name in enumerate(data_keys)}

# Raw text lines of the data file.
F = linecache.getlines('twitter.txt')
# Drop the first character and the last two characters of every line, then
# split on '","'.  NOTE(review): this assumes each record has the form
# "a","b",...,"z" followed by a newline -- confirm against the data file.
lines = [x[1:-2].split('","') for x in F]
# Distinct user names found in the data.
users = {line[keys['username']] for line in lines}

# Total number of distinct users.
users_total = len(users)
assert isinstance(users_total, int)

# The distinct user names as a list.
users = list(users)
assert isinstance(users, list)

# Records whose created_at starts with '2012-11', and how many there are.
# Built as a real list so len() works whether or not filter() is lazy.
tweets_from_2012_11 = [
    line for line in lines
    if line[keys['created_at']].startswith('2012-11')
]
tweets_2012_11_count = len(tweets_from_2012_11)
assert isinstance(tweets_2012_11_count, int)

# Distinct values of the part of created_at before the first space, sorted
# ascending -- i.e. which days the data covers.
dates = sorted({line[keys['created_at']].split(' ')[0] for line in lines})
assert isinstance(dates, list)

# Characters 11-12 of created_at as an integer for every record.
# NOTE(review): presumably the hour of a 'YYYY-MM-DD HH:MM:SS' timestamp --
# confirm against the data file.
time_list = [int(line[keys['created_at']][11:13]) for line in lines]
# (hour, record count) pairs ordered by count, largest first.  The sort is
# stable, so hours with equal counts stay in ascending order.
time_count = sorted(
    ((h, time_list.count(h)) for h in range(24)),
    key=lambda pair: pair[1],
    reverse=True
)
# The hour with the most records.
time_most_tweet = time_count[0][0]
assert isinstance(time_most_tweet, int)

# For every day: first tally records per user name, then keep only the user
# with the highest tally.
date_user_num = {day: {} for day in dates}
date_most_user = {day: '' for day in dates}
for line in lines:
    date_line = line[keys['created_at']].split(' ')[0]
    user_line = line[keys['username']]
    day_counts = date_user_num[date_line]
    day_counts[user_line] = day_counts.get(user_line, 0) + 1

for day in dates:
    day_counts = date_user_num[day]
    # max() replaces the original "sort everything, take element 0".
    top_user = max(day_counts, key=day_counts.get)
    # Per day: the most active user together with that user's record count.
    date_user_num[day] = {top_user: day_counts[top_user]}
    # Per day: just the most active user.
    date_most_user[day] = top_user
assert isinstance(date_user_num, dict)

# Records whose created_at starts with '2012-11-03'.
tweets_from_2012_11_03 = [
    line for line in lines
    if line[keys['created_at']].startswith('2012-11-03')
]
# Characters 11-12 of created_at as an integer for each of those records.
date_time_list = [int(line[keys['created_at']][11:13]) for line in tweets_from_2012_11_03]
# (hour as string, record count) for hours 0..23 in chronological order.
date_time_count = [(str(i), date_time_list.count(i)) for i in range(24)]
assert isinstance(date_time_count, list)

# How often each value of the 'source' column occurs.
tweets_source_dict = {}
for line in lines:
    source = line[keys['source']]
    tweets_source_dict[source] = tweets_source_dict.get(source, 0) + 1

# (source, count) pairs, most frequent first.  sorted() is used because
# dict.items() is not guaranteed to return a sortable list.
tweets_source = sorted(tweets_source_dict.items(), key=lambda item: item[1], reverse=True)
assert isinstance(tweets_source, list)

# Records whose rt_v_url starts with "https://twitter.com/umiushi_no_uta",
# and how many there are.  Built as a real list so len() always works.
tweet_transmit_url = [
    line for line in lines
    if line[keys['rt_v_url']].startswith('https://twitter.com/umiushi_no_uta')
]
tweet_transmit_url_count = len(tweet_transmit_url)
assert isinstance(tweet_transmit_url_count, int)

# Number of records posted by the user whose uid is 573638104.
tweet_user_count = sum(1 for line in lines if line[keys['uid']] == '573638104')
assert isinstance(tweet_user_count, int)

def most_tweet_count(*temp):
    """Return the uid, among the given ones, that has the most records.

    Any number of uids may be passed.  Records are read from the
    module-level ``lines`` list and matched on the ``uid`` column located
    through the module-level ``keys`` mapping.

    Returns the string ``'null'`` when no uid is given or when none of the
    given uids occurs in the data; otherwise the uid with the highest
    record count.
    """
    if not temp:
        return 'null'
    uid_count = dict.fromkeys(temp, 0)
    uid_column = keys['uid']
    for line in lines:
        uid = line[uid_column]
        # Dict membership is a hash lookup, unlike scanning the args tuple.
        if uid in uid_count:
            uid_count[uid] += 1
    # Only the maximum is needed, so no full sort.
    top_uid = max(uid_count, key=uid_count.get)
    return top_uid if uid_count[top_uid] > 0 else 'null'

# Sanity checks for most_tweet_count against the loaded data:
# (arguments, expected result) pairs, evaluated in order.
_most_tweet_cases = (
    ((), 'null'),
    (('ab', 'cds'), 'null'),
    (('ab', 'cds', '123b'), 'null'),
    (('12342', 'cd'), 'null'),
    (('28803555', 28803555), '28803555'),
    (('28803555', 28803555, '96165754'), '28803555'),
)
for _args, _expected in _most_tweet_cases:
    assert most_tweet_count(*_args) == _expected

# 12. Whose record has the longest content?  (Result: the uid, as a string.)
# max_uid stays '' when there are no records or every content is empty; on
# equal lengths the earliest record wins because the comparison is strict.
# NOTE(review): len() counts bytes rather than characters when the content
# is a byte string -- confirm which one is intended.
max_len = 0
max_uid = ''
for line in lines:
    content_len = len(line[keys['content']])
    if content_len > max_len:
        max_uid = line[keys['uid']]
        max_len = content_len
assert isinstance(max_uid, str)

# 13. Who retweeted the most URLs?  (Result: the uid, as a string.)
# Collect (uid, rt_num as int) for every record with a non-empty rt_num,
# order the pairs by rt_num descending and take the uid of the first one.
# NOTE(review): this picks the single record with the largest rt_num; it
# does not add up rt_num per user -- confirm that this is what is wanted.
trans_uid_count = []
for line in lines:
    rt_num = line[keys['rt_num']]
    if rt_num != '':
        trans_uid_count.append((line[keys['uid']], int(rt_num)))
trans_uid_count = sorted(trans_uid_count, key=lambda pair: pair[1], reverse=True)
most_trans_url = trans_uid_count[0][0]
assert type(most_trans_url) is str

# 14. Who has the most records at 11 o'clock?  (Result: the uid, as a string.)
time_eleven = {}
# Records whose created_at has '11' at character positions 11-12.
lines_eleven = [
    line for line in lines
    if line[keys['created_at']].startswith('11', 11, 13)
]
for line in lines_eleven:
    uid = line[keys['uid']]
    time_eleven[uid] = time_eleven.get(uid, 0) + 1
# (uid, count) pairs, highest count first.
time_eleven_list = sorted(time_eleven.items(), key=lambda item: item[1], reverse=True)
time_eleven_most = time_eleven_list[0][0]
assert isinstance(time_eleven_most, str)

# 15. Which user has the most records with a source-tweet URL?
# (Result: the uid, as a string.)

# Every uid starts at zero so users without any v_url are still listed.
url_user = {line[keys['uid']]: 0 for line in lines}
for line in lines:
    if line[keys['v_url']] != '':
        url_user[line[keys['uid']]] += 1
# (uid, count) pairs, highest count first.
url_user_list = sorted(url_user.items(), key=lambda item: item[1], reverse=True)
url_trans_most = url_user_list[0][0]
assert isinstance(url_trans_most, str)
# Total run time: seconds elapsed since `now` was recorded.
d = time.time() - now
print(d)
print('运算时间%s' % d)
print('运算时间%d' % d)
print(type(d))
# %s formats the value as a string, while %d formats it as an integer, so a
# sub-second duration is printed as 0.


运行结果:

0.523000001907
运算时间0.523000001907
运算时间0
<type 'float'>




                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: