您的位置:首页 > 编程语言 > Python开发

Python脚本进行主播招募相关数据统计的案例

2017-10-20 16:52 281 查看
脚本中主要关注点:
1、从不同源中取数据(两个mysql源及Hive源)

2、两大List根据关键字进行左连接


其实也可以先把match_apply表中的数据取出并插入目标表,再根据关键字uid把Hive计算好的指标数据更新到目标表中。本脚本采用的是先在内存中完成左连接、再一次性完成插入的方式(每条记录单独执行一次insert),没有考虑mysql多条记录批量插入的效率;如有需要,可以参考以往脚本进行完善。
/Users/nisj/PycharmProjects/BiDataProc/Demand/hadoopStat/anchorRecruit.py
# -*- coding=utf-8 -*-
import datetime
import time
import os
import warnings
import sys
import re
# Python 2 only: force the process-wide default encoding to UTF-8 so that
# implicit str/unicode conversions of the CLI output do not fail.
# `reload` is not a builtin on Python 3 (where the default is already UTF-8),
# so the hack is skipped there instead of raising NameError on import.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')

# Silence every warning category for the whole batch run.
warnings.filterwarnings("ignore")

# Characters removed from every CLI output line before it is re-embedded in a
# double-quoted shell command / single-quoted SQL literal.  The quote
# characters break the quoting; `$` and `\` would additionally allow shell
# expansion or escape the closing SQL quote.
_UNSAFE_CHARS = ('\n', '`', "'", '"', '$', '\\')

# Both the mysql client (batch mode) and the hive CLI print SQL NULL as this text.
_NULL_TEXT = 'NULL'


def _run_and_read_lines(command):
    """Run *command* through ``os.popen`` and return its stdout lines.

    The pipe is always closed, even if reading fails.
    """
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _parse_rows(lines, min_fields):
    """Turn tab-separated CLI output lines into lists of sanitised fields.

    Every character in ``_UNSAFE_CHARS`` is dropped from each line.  Lines
    that yield fewer than *min_fields* fields (blank lines, stray log output)
    are skipped so that callers can index the row safely.
    """
    rows = []
    for line in lines:
        for char in _UNSAFE_CHARS:
            line = line.replace(char, '')
        fields = line.split('\t')
        if len(fields) >= min_fields:
            rows.append(fields)
    return rows


def anchorRecruitProc():
    """Fill ``match_apply_stat_result`` for every stat range without results.

    For each row of ``match_apply_stat_range`` that has no matching row in
    ``match_apply_stat_result`` the function:

    1. looks up the ``game_id`` of the match in ``event_apply_template``;
    2. reads the applicants of the match from ``match_apply``;
    3. runs a Hive query producing per-room live duration, live days and
       popularity ranks for the range's date window;
    4. deletes any existing result rows of the range (so a re-run is clean);
    5. left-joins applicants to the Hive rows on ``uid`` and inserts one
       result row per match, or a single row with NULL metrics when the
       applicant has no Hive row.

    All database access goes through the ``mysql`` / ``hive`` command-line
    clients.  Returns ``None``.  Ranges whose ``game_id`` cannot be resolved
    are reported on stderr and skipped instead of aborting the whole batch.
    """
    pending_ranges = _parse_rows(_run_and_read_lines("""source /etc/profile; \
/usr/bin/mysql  -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e " \
select a1.match_id,a1.start_time,a1.end_time,a1.id range_id \
from jellyfish_hadoop_stat.match_apply_stat_range a1 \
left join jellyfish_hadoop_stat.match_apply_stat_result a2 on a1.id=a2.range_id \
where a2.range_id is null; \
" """), 4)

    for range_row in pending_ranges:
        match_id, start_time, end_time, range_id = range_row[:4]

        game_rows = _parse_rows(_run_and_read_lines("""source /etc/profile; \
/usr/bin/mysql  -hMysqlHost -P50512 -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e " \
select game_id from jellyfish_event.event_apply_template where id={match_id}; \
" """.format(match_id=match_id)), 1)
        if not game_rows or not game_rows[0][0]:
            # Without a game_id the Hive query below cannot be built.
            sys.stderr.write(
                'anchorRecruitProc: no game_id for match_id %s; skipping range %s\n'
                % (match_id, range_id))
            continue
        gameId = game_rows[0][0]

        # NOTE: the backtick escapes are written as `\\` + backtick so the
        # runtime string is unchanged without relying on an invalid escape.
        apply_rows = _parse_rows(_run_and_read_lines("""source /etc/profile; \
/usr/bin/mysql  -hMysqlHost -P50512 -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e " \
select uid,\\`name\\`,phone_num,qq,area,is_leader from jellyfish_event.match_apply where match_id={match_id}; \
" """.format(match_id=match_id)), 6)

        yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        index_rows = _parse_rows(_run_and_read_lines("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e "with tab_live_long as( \
select pt_day,room_id,sum(live_minute) anthor_live_time \
from (select room_id,pt_day,hour(time_interval)*60+minute(time_interval)+second(time_interval)/60 live_minute from ( \
select room_id,cast(updated_time as timestamp)-cast(switch_time as timestamp) time_interval,pt_day \
from honeycomb_all_live_history_status \
where pt_day between '{start_time}' and '{end_time}' and game_id={gameId}) x) xx \
group by pt_day,room_id \
), \
tab_room_composite_popularity as( \
select roomid room_id,sum(view_time) composite_popularity,row_number()over(order by sum(view_time)) composite_popularity_rank \
from recommend_data_view \
where pt_day between '{start_time}' and '{end_time}' and gameid={gameId} \
group by roomid \
), \
tab_room_highest_popularity as( \
select room_id,max(online_count) highest_popularity,row_number()over(order by max(online_count)) highest_popularity_rank \
from oss_room_online_stat \
where pt_day between '{start_time}' and '{end_time}' \
group by room_id \
), \
tab_user_info as ( \
select a2.nickname,a1.id room_id,a2.uid \
from oss_room_v2 a1 \
left join oss_chushou_user_profile a2 on a1.creator_uid=a2.uid \
where a1.pt_day='{yesterday}' and a2.pt_day='{yesterday}') \
select a1.room_id,a4.uid,a4.nickname,a2.composite_popularity_rank composite_popularity,a3.highest_popularity_rank highest_popularity,sum(a1.anthor_live_time) live_duration,count(distinct a1.pt_day) live_days \
from tab_live_long a1 \
left join tab_room_composite_popularity a2 on a1.room_id=a2.room_id \
left join tab_room_highest_popularity a3 on a1.room_id=a3.room_id \
left join tab_user_info a4 on a1.room_id=a4.room_id
group by a1.room_id,a4.uid,a4.nickname,a2.composite_popularity_rank,a3.highest_popularity_rank \
; \
" """.format(start_time=start_time, end_time=end_time, gameId=gameId, yesterday=yesterday)), 7)

        # Index the Hive rows by uid once, so the join below is a dict lookup
        # per applicant instead of a full scan.  Rows whose uid is SQL NULL
        # are left out: NULL never equals anything in a join.
        index_rows_by_uid = {}
        for index_row in index_rows:
            if index_row[1] != _NULL_TEXT:
                index_rows_by_uid.setdefault(index_row[1], []).append(index_row)

        # delete data for rollback
        os.system("""source /etc/profile; \
/usr/bin/mysql  -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e " \
delete from jellyfish_hadoop_stat.match_apply_stat_result where range_id='{range_id}'; \
" """.format(range_id=range_id))

        for apply_row in apply_rows:
            uid, name, phone_num, qq, area, is_leader = apply_row[:6]
            matched_rows = index_rows_by_uid.get(uid)

            if matched_rows:
                # One result row per Hive row of this applicant
                # (columns: room_id, uid, nickname, composite rank,
                # highest rank, live_duration, live_days).
                for index_row in matched_rows:
                    now = time.strftime('%Y-%m-%d %X', time.localtime())
                    os.system("""source /etc/profile; \
/usr/bin/mysql  -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e " \
insert into jellyfish_hadoop_stat.match_apply_stat_result(range_id,uid,room_id,\\`name\\`,phone_num,qq,area,is_leader,composite_popularity,highest_popularity,live_duration,live_days,created_time,updated_time) \
values ({range_id},{uid},{room_id},ifnull('{name}',' '),ifnull('{phone_num}',' '),ifnull('{qq}',' '),ifnull('{area}',' '),ifnull({is_leader},0),ifnull({composite_popularity},0),ifnull({highest_popularity},0),ifnull({live_duration},0),ifnull({live_days},0),'{created_time}','{updated_time}') \
; \
" """.format(range_id=range_id, uid=uid, room_id=index_row[0], name=name,
             phone_num=phone_num, qq=qq, area=area, is_leader=is_leader,
             composite_popularity=index_row[3],
             highest_popularity=index_row[4],
             live_duration=index_row[5], live_days=index_row[6],
             created_time=now, updated_time=now))
            else:
                # Left-join miss: keep the applicant, let ifnull() supply the
                # defaults for the missing metrics.
                now = time.strftime('%Y-%m-%d %X', time.localtime())
                os.system("""source /etc/profile; \
/usr/bin/mysql  -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e " \
insert into jellyfish_hadoop_stat.match_apply_stat_result(range_id,uid,room_id,\\`name\\`,phone_num,qq,area,is_leader,composite_popularity,highest_popularity,live_duration,live_days,created_time,updated_time) \
values ({range_id},{uid},ifnull({room_id},-1),ifnull('{name}',' '),ifnull('{phone_num}',' '),ifnull('{qq}',' '),ifnull('{area}',' '),ifnull({is_leader},0),ifnull({composite_popularity},0),ifnull({highest_popularity},0),ifnull({live_duration},0),ifnull({live_days},0),'{created_time}','{updated_time}') \
; \
" """.format(range_id=range_id, uid=uid, room_id='NULL', name=name,
             phone_num=phone_num, qq=qq, area=area, is_leader=is_leader,
             composite_popularity='NULL',
             highest_popularity='NULL',
             live_duration='NULL', live_days='NULL',
             created_time=now, updated_time=now))

# Batch Test
# Run the batch only when the file is executed as a script, so importing the
# module does not trigger the mysql/hive calls as a side effect.
if __name__ == '__main__':
    anchorRecruitProc()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: