您的位置:首页 > 其它

Hive内置函数的应用案例

2015-06-11 11:44 267 查看
1:计算两个时间相差多少天:hive内置函数datediff

例子Demo:datediff('2009-07-28','2009-07-30')=2

2:也可以写自定义函数

package cdel.edu.hive.udf.demo04;

import java.text.ParseException;

import java.text.SimpleDateFormat;

import java.util.Calendar;

import java.util.Date;

import org.apache.hadoop.hive.ql.exec.UDF;

/**

* 计算一个时间到今天的相差多少天

* @author dell

*

*/

public class MyDateDiff extends UDF {

public String evaluate(String str) {

try {

String today = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date().getTime());

int days = daysBetween(str,today);

return String.valueOf(days);

} catch (Exception e) {

return null;

}

}

/**

* 计算两个日期之间相差的天数

*

* @param smdate

* 较小的时间

* @param bdate

* 较大的时间

* @return 相差天数

* @throws ParseException

*/

public static int daysBetween(Date smdate, Date bdate)

throws ParseException {

SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

smdate = sdf.parse(sdf.format(smdate));

bdate = sdf.parse(sdf.format(bdate));

Calendar cal = Calendar.getInstance();

cal.setTime(smdate);

long time1 = cal.getTimeInMillis();

cal.setTime(bdate);

long time2 = cal.getTimeInMillis();

long between_days = (time2 - time1) / (1000 * 3600 * 24);

return Integer.parseInt(String.valueOf(between_days));

}

/**

* 字符串的日期格式的计算

*/

public static int daysBetween(String smdate, String bdate)

throws ParseException {

SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");

Calendar cal = Calendar.getInstance();

cal.setTime(sdf.parse(smdate));

long time1 = cal.getTimeInMillis();

cal.setTime(sdf.parse(bdate));

long time2 = cal.getTimeInMillis();

long between_days = (time2 - time1) / (1000 * 3600 * 24);

return Integer.parseInt(String.valueOf(between_days));

}

public static void main(String[] args) throws Exception {

String str="2015-04-27";

System.out.println(new MyDateDiff().evaluate(str));

}

}

-创建自定义函数

add jar /home/admin/xxg/MyDateDiff.jar;

create temporary function myDateDiff as 'cdel.edu.hive.udf.demo04.MyDateDiff';

--hive 统计

-- 最近30 天内学员的做题量统计

--分网统计

create table qz_count as

SELECT mps.userID,mem.memberid,COUNT(distinct qpq.questionID) as doQuesCount,mem.domain,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime)))) as daytime

from tmp_qz_member_paper_score mps

left join tmp_qz_member_paper_question qpq on qpq.paperScoreID = mps.paperScoreID and mps.domain = qpq.domain

left join order.tmp_member_info_new mem on mem.uid = mps.userID and mem.domain = mps.domain

where cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=30

group by mps.userID,mem.memberid ,mem.domain,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))

having COUNT(distinct qpq.questionID) >0

order by doQuesCount desc ;

SELECT mps.userID,mem.memberid,COUNT(distinct qpq.questionID) as doQuesCount,mem.domain,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime)))) as daytime

from tmp_qz_member_paper_score mps

left join tmp_qz_member_paper_question qpq on qpq.paperScoreID = mps.paperScoreID and mps.domain = qpq.domain

left join order.tmp_member_info_new mem on mem.uid = mps.userID and mem.domain = mps.domain

where mps.userID='48888922' and cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=31

group by mps.userID,mem.memberid ,mem.domain,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))

having COUNT(distinct qpq.questionID) >0

order by doQuesCount desc ;

SELECT qpq.*

from tmp_qz_member_paper_score mps

left join tmp_qz_member_paper_question qpq on qpq.paperScoreID = mps.paperScoreID and mps.domain = qpq.domain

where mps.userID='48888922' limit 100;

经验验证:没有医学网做题题目信息

select distinct logdata from tmp_qz_member_paper_question where domain='med' limit 10

验证数据

select * from qz_count where userid=48888922 order by doQuesCount desc limit 100;

select * from qz_count where doQuesCount>=500 order by doQuesCount desc ;

-- 不分网统计

create table qz_count as

SELECT mps.userID,mem.memberid,COUNT(distinct qpq.questionID) as doQuesCount,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime)))) as daytime

from tmp_qz_member_paper_score mps

left join tmp_qz_member_paper_question qpq on qpq.paperScoreID = mps.paperScoreID and mps.domain = qpq.domain

left join order.tmp_member_info_new mem on mem.uid = mps.userID and mem.domain = mps.domain

where cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=30

group by mps.userID,mem.memberid ,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))

having COUNT(distinct qpq.questionID) >0

order by doQuesCount desc ;

-- 统计学员每天做多少套试卷

-- 分网统计

create table paper_count as

select mps.userID,mem.Memberid,mem.domain, COUNT(distinct mps.paperViewID) AS doPaperCount ,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime)))) as daytime

from tmp_qz_member_paper_score mps

left join order.tmp_member_info_new mem on mem.uID = mps.userID and mem.domain = mps.domain

where cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=31

group by mps.userID,mem.Memberid, mem.domain,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))

having COUNT(distinct mps.paperViewID) >0

order by doPaperCount desc;

-- 不分网统计

select mps.userID,mem.Memberid,COUNT(distinct mps.paperViewID) AS doPaperCount ,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime)))) as daytime

from tmp_qz_member_paper_score mps

left join order.tmp_member_info_new mem on mem.uID = mps.userID and mem.domain = mps.domain

where mps.userID=48888922

group by mps.userID,mem.Memberid,

to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))

having COUNT(distinct mps.paperViewID) >0

order by doPaperCount desc;

测试:

select * from paper_count order by doPaperCount desc limit 100;

select mps.userID, mps.paperViewID from tmp_qz_member_paper_score mps where mps.userID=48888922;

-- 导出本地文件中

insert overwrite local directory '/home/admin/xxg/qz/med/'

row format delimited

fields terminated by '\t'

select * from qz_count;

-- 最近30天内学员的提问数量

-- 针对主题的提问统计

SELECT ft.userID,ft.userName , COUNT(ft.userID) as num FROM FAQ_TOPIC ft

group by FT.userid ,ft.userName

SELECT ft.userID,ft.userName , COUNT(af.USERID) as num

FROM FAQ_TOPIC ft

inner JOIN faq af on ft.topicID = af.topicID and af.istopic = 1

group by ft.userID,ft.userName

create table faq_count as

SELECT ft.userID, mem.memberid, COUNT(distinct ft.topicID) as num, mem.domain,

to_date(concat_ws(' ',substr(ft.createTime,1,10),substr(ft.createTime,11,length(ft.createTime)))) as daytime

FROM tmp_faq_topic ft

left join order.tmp_member_info_new mem on mem.uid = ft.userID and mem.domain = ft.domain

where cast(myDateDiff(concat_ws(' ',substr(ft.createTime,1,10),'')) as int) <=30

group by ft.userID, mem.memberid,mem.domain,

to_date(concat_ws(' ',substr(ft.createTime,1,10),substr(ft.createTime,11,length(ft.createTime))))

having COUNT(distinct ft.topicID) >0

order by num desc ;

--针对主题和继续提问的 --- 就是如果一个主题,两个继续的话 你统计的sum =3

create table faq_count2 as

SELECT ft.userID, mem.memberid, COUNT(distinct af.faqid) as num, mem.domain,

to_date(concat_ws(' ',substr(af.createTime,1,10),substr(af.createTime,11,length(af.createTime)))) as daytime

FROM tmp_faq af

left join tmp_faq_topic ft on ft.topicID = af.topicID and af.domain = ft.domain

left join order.tmp_member_info_new mem on mem.uid = ft.userID and mem.domain = af.domain

where cast(myDateDiff(concat_ws(' ',substr(af.createTime,1,10),'')) as int) <=30

group by ft.userID, mem.memberid, mem.domain,

to_date(concat_ws(' ',substr(af.createTime,1,10),substr(af.createTime,11,length(af.createTime))))

having COUNT(distinct af.faqid) >0

order by num desc limit 10;

-- 导出本地文件中

insert overwrite local directory '/home/admin/xxg/faq/'

row format delimited

fields terminated by '\t'

select * from faq_count;

-- 删除表信息

drop table qz_count;

drop table faq_count;

drop table paper_count;

-- 统计

每天做题 >=3000道以上学员的个数

每天做题 >=2500 and <3000 道以上的学员个数

每天做题 >=2000 and <2500 道以上的学员个数

每天做题 >=1500 and <2000 道以上的学员个数

每天做题 >=1000 and <1500 道以上的学员个数

每天做题 >=500 and <1000 道以上的学员个数

每天做题 >=100 and <500 道以上的学员个数

每天做题 >=50 and <100 道以上的学员个数

每天做题 >0 and <50 道以上的学员个数

-- 根据学员分组统计,学员做题数量统计

select userid,memberid,domain,

SUM(case when doquescount=0 then 1 else 0 end) as n0,

SUM(case when doquescount>0 and doquescount<50 then 1 else 0 end) as n1,

SUM(case when doquescount>=50 and doquescount<100 then 1 else 0 end) as n2,

SUM(case when doquescount>=100 and doquescount<500 then 1 else 0 end) as n3,

SUM(case when doquescount>=500 and doquescount<1000 then 1 else 0 end) as n4,

SUM(case when doquescount>=1000 and doquescount<1500 then 1 else 0 end) as n5,

SUM(case when doquescount>=1500 and doquescount<2000 then 1 else 0 end) as n6,

SUM(case when doquescount>=2000 and doquescount<2500 then 1 else 0 end) as n7,

SUM(case when doquescount>=2500 and doquescount<3000 then 1 else 0 end) as n8,

SUM(case when doquescount>=3000 then 1 else 0 end) as n9

from qz_count

group by userid,memberid, domain limit 100;

-- 根据日期分组统计,学员做题量统计

select daytime,

SUM(case when doquescount=0 then 1 else 0 end) as n0,

SUM(case when doquescount>=1 and doquescount<50 then 1 else 0 end) as n1,

SUM(case when doquescount>=50 and doquescount<100 then 1 else 0 end) as n2,

SUM(case when doquescount>=100 and doquescount<500 then 1 else 0 end) as n3,

SUM(case when doquescount>=500 and doquescount<1000 then 1 else 0 end) as n4,

SUM(case when doquescount>=1000 and doquescount<1500 then 1 else 0 end) as n5,

SUM(case when doquescount>=1500 and doquescount<2000 then 1 else 0 end) as n6,

SUM(case when doquescount>=2000 and doquescount<2500 then 1 else 0 end) as n7,

SUM(case when doquescount>=2500 and doquescount<3000 then 1 else 0 end) as n8,

SUM(case when doquescount>=3000 then 1 else 0 end) as n9

from qz_count

group by daytime;

测试

select count(distinct userid)

from qz_count

where doquescount>=2500 and doquescount<3000;

select *

from qz_count

where doquescount>=2500 and doquescount<3000;

select count(distinct userid)

from qz_count

where doquescount>=3000 ;

--

-- 根据日期统计每天学员的所做试卷量

select daytime,

SUM(case when dopapercount=0 then 1 else 0 end) as n0,

SUM(case when dopapercount>=1 and dopapercount<5 then 1 else 0 end) as n1,

SUM(case when dopapercount>=5 and dopapercount<10 then 1 else 0 end) as n2,

SUM(case when dopapercount>=10 and dopapercount<50 then 1 else 0 end) as n3,

SUM(case when dopapercount>=50 and dopapercount<100 then 1 else 0 end) as n4,

SUM(case when dopapercount>=100 and dopapercount<150 then 1 else 0 end) as n5,

SUM(case when dopapercount>=150 and dopapercount<200 then 1 else 0 end) as n6,

SUM(case when dopapercount>=200 and dopapercount<250 then 1 else 0 end) as n7,

SUM(case when dopapercount>=250 and dopapercount<300 then 1 else 0 end) as n8,

SUM(case when dopapercount>=300 then 1 else 0 end) as n9

from paper_count

group by daytime limit 100;

测试检验:

select * from paper_count order by dopapercount desc limit 100;

select count(*) from paper_count where dopapercount>=10 and dopapercount<50 and daytime='2015-04-26' limit 100;

select count(distinct userid) from paper_count where dopapercount>=10 and dopapercount<50 and daytime='2015-04-26' limit 100;

-- 根据日期统计每天学员在不同区间内答疑数量

每天答疑提问主题 >=60 道以上学员的个数

每天答疑提问主题 >=55 and < 60道以上学员的个数

每天答疑提问主题 >=50 and < 55道以上学员的个数

每天答疑提问主题 >=45 and <50 道以上的学员个数

每天答疑提问主题 >=40 and <45 道以上的学员个数

每天答疑提问主题 >=35 and <40 道以上的学员个数

每天答疑提问主题 >=30 and <35 道以上的学员个数

每天答疑提问主题 >=25 and <30 道以上的学员个数

每天答疑提问主题 >=20 and <25 道以上的学员个数

每天答疑提问主题 >=15 and <20 道以上的学员个数

每天答疑提问主题 >=10 and <15 道以上的学员个数

每天答疑提问主题 >=5 and <10 道以上的学员个数

每天答疑提问主题 >=1 and <5 道以上的学员个数

每天答疑提问主题 =0 道以上的学员个数

select daytime,

SUM(case when num=0 then 1 else 0 end) as n0,

SUM(case when num>=1 and num<5 then 1 else 0 end) as n1,

SUM(case when num>=5 and num<10 then 1 else 0 end) as n2,

SUM(case when num>=10 and num<15 then 1 else 0 end) as n3,

SUM(case when num>=15 and num<20 then 1 else 0 end) as n4,

SUM(case when num>=20 and num<25 then 1 else 0 end) as n5,

SUM(case when num>=25 and num<30 then 1 else 0 end) as n6,

SUM(case when num>=30 and num<35 then 1 else 0 end) as n7,

SUM(case when num>=35 and num<40 then 1 else 0 end) as n8,

SUM(case when num>=40 and num<45 then 1 else 0 end) as n9,

SUM(case when num>=45 and num<50 then 1 else 0 end) as n10,

SUM(case when num>=50 and num<55 then 1 else 0 end) as n11,

SUM(case when num>=55 and num<60 then 1 else 0 end) as n12,

SUM(case when num>=60 then 1 else 0 end) as n13

from faq_count

group by daytime limit 100;
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: