您的位置:首页 > 大数据

利用hive完成阿里天池大数据音乐预测比赛数据处理工作

2016-04-11 19:32 489 查看
hive shell

创建外部表(EXTERNAL TABLE),并指定其在 HDFS 上的数据目录

-- External staging table for the song file (mars_tianchi_songs.csv is loaded
-- into it further down). Rows are parsed as comma-separated text and the data
-- lives under /bs/music/songs/ on HDFS.
CREATE EXTERNAL TABLE IF NOT EXISTS songs2 (
    sid      STRING,  -- join key against useraction.sid
    aid      STRING,  -- used as the GROUP BY key in the final report; presumably artist id -- TODO confirm
    ptime    STRING,  -- presumably a publish time kept as raw text -- TODO confirm format
    sinit    INT,
    language INT,
    gender   INT
)
-- The previous comment ("staging page view table") was copied from an
-- unrelated example and did not describe this table.
COMMENT 'Staging table for song metadata'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/songs/';

导入 HDFS 文件(LOAD DATA INPATH 会把源文件移动到表目录,原路径下的文件不再存在)

-- Load the songs CSV from the HDFS input directory into songs2, replacing
-- whatever the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_songs.csv'
    OVERWRITE
    INTO TABLE songs2;

查看前10条数据

-- Sanity check: show up to 10 rows of songs2.
SELECT *
FROM songs2
LIMIT 10;

创建外部表(EXTERNAL TABLE),并指定其在 HDFS 上的数据目录

-- External staging table for the user-action file
-- (mars_tianchi_user_actions.csv is loaded into it further down). Rows are
-- parsed as comma-separated text and the data lives under
-- /bs/music/useraction/ on HDFS.
CREATE EXTERNAL TABLE IF NOT EXISTS useraction (
    uid   STRING,  -- counted with COUNT(DISTINCT ...) in the final report; presumably user id -- TODO confirm
    sid   STRING,  -- join key against songs2.sid
    btime STRING,
    atype INT,     -- the final report filters on atype = 1; meaning of the codes is not shown here
    ds    STRING
)
-- The previous comment ("staging page view table") was copied from an
-- unrelated example and did not describe this table.
COMMENT 'Staging table for user action records'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/useraction/';

导入 HDFS 文件(LOAD DATA INPATH 会把源文件移动到表目录,原路径下的文件不再存在)

-- Load the user-actions CSV from the HDFS input directory into useraction,
-- replacing whatever the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_user_actions.csv'
    OVERWRITE
    INTO TABLE useraction;

-- Sanity check: show up to 10 rows of useraction.
SELECT *
FROM useraction
LIMIT 10;

表连接:小表写在前面(Hive 会缓存前面的表,流式读取最后一张表)

-- Inner join of every song row with its matching user-action rows on sid.
-- songs2 is deliberately listed first in the join.
SELECT
    t1.*,
    t2.*
FROM songs2 t1
INNER JOIN useraction t2
    ON t1.sid = t2.sid;

Map join连接

-- Same join as a map-side join: the hinted table is the one to hold in memory
-- while useraction is scanned.
-- The hint has to name the alias used in FROM (t1); the original named the
-- underlying table (songs2), which does not match any alias in this query.
-- NOTE(review): recent Hive versions ignore MAPJOIN hints unless
-- hive.ignore.mapjoin.hint=false -- confirm the setting on the target cluster.
SELECT /*+ MAPJOIN(t1) */
    t1.*,
    t2.*
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;

导出查询数据到hdfs

-- Write the joined rows to the HDFS directory /bs/music/data, replacing its
-- current contents.
-- The MAPJOIN hint names the alias t1; the original named the table (songs2),
-- which does not match any alias in this query.
-- NOTE(review): no ROW FORMAT is given here, so the files presumably use
-- Hive's default field delimiter rather than ',', while usersongs (declared on
-- this same location) expects ',' -- confirm before reading these files
-- through that table.
INSERT OVERWRITE DIRECTORY '/bs/music/data'
SELECT /*+ MAPJOIN(t1) */
    t1.*,
    t2.*
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;

查询结果保存到表

-- External table holding the songs2 x useraction join result as
-- comma-separated text under /bs/music/data/.
-- Column order is the songs2 columns followed by the useraction columns;
-- useraction.sid is stored as sid2 because sid is already taken.
CREATE EXTERNAL TABLE IF NOT EXISTS usersongs (
    -- columns taken from songs2
    sid      STRING,
    aid      STRING,
    ptime    STRING,
    sinit    INT,
    language INT,
    gender   INT,
    -- columns taken from useraction
    uid      STRING,
    sid2     STRING,
    btime    STRING,
    atype    INT,
    ds       STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/data/';

-- Rebuild usersongs from the songs2 x useraction join, replacing its contents.
-- Columns are listed explicitly, in the order usersongs declares them, so the
-- positional mapping into the target table is visible and does not silently
-- shift if a source table gains or reorders columns.
-- The MAPJOIN hint names the alias t1; the original named the table (songs2),
-- which does not match any alias in this query.
INSERT OVERWRITE TABLE usersongs
SELECT /*+ MAPJOIN(t1) */
    t1.sid,
    t1.aid,
    t1.ptime,
    t1.sinit,
    t1.language,
    t1.gender,
    t2.uid,
    t2.sid,      -- lands in usersongs.sid2
    t2.btime,
    t2.atype,
    t2.ds
FROM songs2 t1
JOIN useraction t2
    ON t1.sid = t2.sid;

查询结果保存到本地文件(">>" 为追加写入,不会覆盖已有内容)

# Run the query non-interactively and append its stdout to a local file.
# '>>' appends, so repeated runs accumulate rows in the same file.
hive -e 'select * from usersongs limit 10' >> /opt/tools/test.csv

查看HDFS上文件的前5行

# Print the first 5 lines of one output file in the /bs/music/data directory.
hadoop fs -text /bs/music/data/000000_0 | head -n 5

分组去重查询:在 atype=1 的记录中,统计每个 aid 对应的去重 uid 数量和去重 sid 数量

-- For each aid, count the distinct uid values and distinct sid values among
-- the rows whose atype is 1.
SELECT
    COUNT(DISTINCT uid),
    COUNT(DISTINCT sid),
    aid
FROM usersongs
WHERE atype = 1
GROUP BY aid;
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: