分析python处理基本数据<四>
2016-07-21 21:03
746 查看
# coding=utf-8
"""Compute a series of statistics over the tweet records stored in ``twitter.txt``.

Every result is kept in a module-level variable and followed by a small
sanity-check ``assert``.  The script is written so that it runs unchanged on
both Python 2.7 and Python 3.
"""
import linecache
import time
from collections import Counter
from operator import itemgetter

# Start timestamp; the elapsed run time is derived from it after the analysis.
now = time.time()

# Column names of one record, in file order.
data_keys = (
    'bid', 'uid', 'username', 'v_class', 'content', 'img', 'created_at',
    'source', 'rt_num', 'cm_num', 'rt_uid', 'rt_username', 'rt_v_class',
    'rt_content', 'rt_img', 'src_rt_num', 'src_cm_num', 'gender', 'rt_bid',
    'location', 'rt_mid', 'mid', 'lat', 'lon', 'lbs_type', 'lbs_title',
    'poiid', 'links', 'hashtags', 'ats', 'rt_links', 'rt_hashtags', 'rt_ats',
    'v_url', 'rt_v_url',
)
# Column name -> position of that column inside a parsed row.
keys = {name: index for index, name in enumerate(data_keys)}

F = linecache.getlines('twitter.txt')
# Drop the first character and the last two characters of every raw line,
# then split on the `","` separator.
# NOTE(review): assumes each record is `"f1","f2",...,"fN"` followed by a
# newline -- confirm against the data file.
lines = [x[1:-2].split('","') for x in F]

# Total number of distinct users.
users = set(line[keys['username']] for line in lines)
users_total = len(users)
assert isinstance(users_total, int)

# List of all distinct user names.
users = list(users)
assert isinstance(users, list)

# Number of tweets created in 2012-11.
tweets_from_2012_11 = [
    line for line in lines
    if line[keys['created_at']].startswith('2012-11')
]
tweets_2012_11_count = len(tweets_from_2012_11)
assert isinstance(tweets_2012_11_count, int)

# Sorted list of the days that appear in the data.
dates = sorted(set(line[keys['created_at']].split(' ')[0] for line in lines))
assert isinstance(dates, list)

# Hour of day with the most tweets.  Characters 11-12 of `created_at` are
# parsed as the hour.  The stable sort keeps the lowest hour first on ties.
time_list = [int(line[keys['created_at']][11:13]) for line in lines]
hour_counter = Counter(time_list)
time_count = sorted(
    ((hour, hour_counter[hour]) for hour in range(24)),
    key=itemgetter(1),
    reverse=True,
)
time_most_tweet = time_count[0][0]
assert isinstance(time_most_tweet, int)

# For every day: the user with the most tweets (`date_most_user`) and that
# user together with the tweet count (`date_user_num`).
per_date_users = {day: Counter() for day in dates}
for line in lines:
    day = line[keys['created_at']].split(' ')[0]
    per_date_users[day][line[keys['username']]] += 1

date_user_num = {}
date_most_user = {}
for day, user_counter in per_date_users.items():
    top_user, top_count = user_counter.most_common(1)[0]
    date_user_num[day] = {top_user: top_count}
    date_most_user[day] = top_user
assert isinstance(date_user_num, dict)

# Tweets per hour on 2012-11-03, listed in chronological order.
tweets_from_2012_11_03 = [
    line for line in lines
    if line[keys['created_at']].startswith('2012-11-03')
]
date_time_list = [
    int(line[keys['created_at']][11:13]) for line in tweets_from_2012_11_03
]
date_hour_counter = Counter(date_time_list)
date_time_count = [(str(hour), date_hour_counter[hour]) for hour in range(24)]
assert isinstance(date_time_count, list)

# Every tweet source with its number of occurrences, most frequent first.
tweets_source_dict = dict(Counter(line[keys['source']] for line in lines))
tweets_source = sorted(
    tweets_source_dict.items(), key=itemgetter(1), reverse=True
)
assert isinstance(tweets_source, list)

# Number of retweet URLs starting with "https://twitter.com/umiushi_no_uta".
tweet_transmit_url = [
    line for line in lines
    if line[keys['rt_v_url']].startswith('https://twitter.com/umiushi_no_uta')
]
tweet_transmit_url_count = len(tweet_transmit_url)
assert isinstance(tweet_transmit_url_count, int)

# Number of tweets posted by the user whose uid is 573638104.
tweet_user_count = sum(1 for line in lines if line[keys['uid']] == '573638104')
assert isinstance(tweet_user_count, int)


def most_tweet_count(*temp):
    """Return the uid, among the given ones, that posted the most tweets.

    Args:
        *temp: any number of candidate uids.  Uids read from the file are
            strings, so a non-string candidate never matches a record.

    Returns:
        The candidate with the highest tweet count in the module-level
        ``lines``.  On a tie the candidate given first wins on Python 3.7+;
        the winner is arbitrary on interpreters without ordered dicts.
        Returns the string ``'null'`` when no candidates are given or when
        none of them has any tweet.
    """
    if not temp:
        return 'null'
    uid_count = {uid: 0 for uid in temp}
    for line in lines:
        uid = line[keys['uid']]
        if uid in uid_count:
            uid_count[uid] += 1
    best_uid, best_count = max(uid_count.items(), key=itemgetter(1))
    return best_uid if best_count > 0 else 'null'


# Self-checks; the last two depend on the contents of the data file.
assert most_tweet_count() == 'null'
assert most_tweet_count('ab', 'cds') == 'null'
assert most_tweet_count('ab', 'cds', '123b') == 'null'
assert most_tweet_count('12342', 'cd') == 'null'
assert most_tweet_count('28803555', 28803555) == '28803555'
assert most_tweet_count('28803555', 28803555, '96165754') == '28803555'

# 12. Who posted the longest tweet content (result: uid as a string).
# NOTE(review): on Python 2 the fields are byte strings, so the length is
# measured in bytes rather than characters -- confirm this is intended.
max_len = 0
max_uid = ''
for line in lines:
    content_len = len(line[keys['content']])
    if content_len > max_len:
        max_uid = line[keys['uid']]
        max_len = content_len
assert isinstance(max_uid, str)

# 13. Who has the most retweeted URLs (result: uid as a string).
# NOTE(review): this picks the author of the single record with the highest
# `rt_num`; it does not sum `rt_num` per user -- confirm this is intended.
trans_uid_count = [
    (line[keys['uid']], int(line[keys['rt_num']]))
    for line in lines
    if line[keys['rt_num']] != ''
]
trans_uid_count.sort(key=itemgetter(1), reverse=True)
most_trans_url = trans_uid_count[0][0]
assert isinstance(most_trans_url, str)

# 14. Who posted the most tweets during hour 11 (result: uid as a string).
lines_eleven = [
    line for line in lines
    if line[keys['created_at']].startswith('11', 11, 13)
]
time_eleven = dict(Counter(line[keys['uid']] for line in lines_eleven))
time_eleven_list = sorted(time_eleven.items(), key=itemgetter(1), reverse=True)
time_eleven_most = time_eleven_list[0][0]
assert isinstance(time_eleven_most, str)

# 15. Which user has the most non-empty `v_url` values (result: uid as a
# string).  Every uid starts at 0 so users without any URL are still listed.
url_user = {line[keys['uid']]: 0 for line in lines}
for line in lines:
    if line[keys['v_url']] != '':
        url_user[line[keys['uid']]] += 1
url_user_list = sorted(url_user.items(), key=itemgetter(1), reverse=True)
url_trans_most = url_user_list[0][0]
assert isinstance(url_trans_most, str)
总时间:
<pre name="code" class="python">
d = time.time() - now
print d
print '运算时间%s' % d
print '运算时间%d' % d
print type(d)
# 注意:%s 按字符串格式化,会完整输出浮点数;%d 按整型格式化,不足 1 秒的耗时会被截断成 0
</pre>
输出结果:
<pre name="code" class="python">
0.523000001907
运算时间0.523000001907
运算时间0
<type 'float'>
</pre>
相关文章推荐
- python开源IP代理池--IPProxys
- The Python Tutorial - Input and Output
- python--模块
- python-django
- python核心编程学习笔记-2016-07-21-01-decimal模块
- 将sublime打造成python的IDE开发工具
- python 数据初学者 小记
- Python学习-反射相关函数
- windows7下Python环境搭建
- 转载:深入解读Python解析XML的几种方式
- Python 练习册,每天一个小程序(0000)
- Learning Python 011 高级特性 2
- Learning Python 011 高级特性 2
- Python篇----Requests获取网页源码(爬虫基础)
- 新手疑惑:谈谈python 中__name__ = '__main__' 的作用
- 不可不知的Python模块: collections
- "Python"学习笔记----简单文件处理
- dbus-python 指南
- 运维python开发有感
- 利用Python实现选择排序