利用hive完成阿里天池大数据音乐预测比赛数据处理工作
2016-04-11 19:32
489 查看
hive shell
创建外表,指定目录
-- Staging table for the song file: an external table over comma-delimited
-- text stored under /bs/music/songs/.
-- sid is the key that is later joined against useraction.sid.
-- NOTE(review): the other columns presumably mean artist id, publish time,
-- initial play count, language and gender - confirm against the dataset docs.
CREATE EXTERNAL TABLE IF NOT EXISTS songs2 (
    sid      STRING,
    aid      STRING,
    ptime    STRING,
    sinit    INT,
    language INT,
    gender   INT
)
COMMENT 'Staging table for song data from mars_tianchi_songs.csv'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/songs/';
导入HDFS文件(LOAD DATA INPATH 会把源文件移动到表目录,原路径下的文件不再存在)
-- Load the song CSV from its HDFS input path into songs2.
-- OVERWRITE replaces whatever data the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_songs.csv'
    OVERWRITE
    INTO TABLE songs2;
查看前10条数据
-- Ad-hoc preview: show up to 10 rows of songs2 with every column.
SELECT *
FROM songs2
LIMIT 10;
创建外表,指定目录
-- Staging table for the user action file: an external table over
-- comma-delimited text stored under /bs/music/useraction/.
-- sid is the key that is later joined against songs2.sid.
-- NOTE(review): uid, btime, atype and ds presumably mean user id, action
-- time, action type and date string - confirm against the dataset docs.
CREATE EXTERNAL TABLE IF NOT EXISTS useraction (
    uid   STRING,
    sid   STRING,
    btime STRING,
    atype INT,
    ds    STRING
)
COMMENT 'Staging table for user action data from mars_tianchi_user_actions.csv'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/useraction/';
导入HDFS文件(LOAD DATA INPATH 会把源文件移动到表目录,原路径下的文件不再存在)
-- Load the user action CSV from its HDFS input path into useraction.
-- OVERWRITE replaces whatever data the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_user_actions.csv'
    OVERWRITE
    INTO TABLE useraction;

-- Ad-hoc preview: show up to 10 rows of useraction with every column.
SELECT *
FROM useraction
LIMIT 10;
表连接 小表在前
-- Join every song row to its user action rows on sid and return all
-- columns of both tables, songs2 columns first.
SELECT
    t1.*,
    t2.*
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
Map join连接
-- Same join as the plain version, with a hint asking Hive to hold the songs
-- side in memory and perform a map-side join.
-- The hint names the alias used in FROM (t1). Naming the underlying table
-- (songs2) does not match any join input once the table is aliased.
-- NOTE(review): recent Hive versions ignore MAPJOIN hints unless
-- hive.ignore.mapjoin.hint is set to false - confirm for the target cluster.
SELECT /*+ MAPJOIN(t1) */
    t1.*,
    t2.*
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
导出查询数据到hdfs
-- Write the joined rows to the HDFS directory /bs/music/data, replacing
-- its current contents.
-- Columns are listed explicitly (songs2 columns first, then useraction
-- columns) so the file layout does not silently change if a table changes.
-- The MAPJOIN hint names the alias t1 because the table is aliased in FROM.
-- NOTE(review): no ROW FORMAT is given, so the default field delimiter of
-- the Hive version in use applies to the exported files - confirm that this
-- is what downstream readers of the directory expect.
INSERT OVERWRITE DIRECTORY '/bs/music/data'
SELECT /*+ MAPJOIN(t1) */
    t1.sid,
    t1.aid,
    t1.ptime,
    t1.sinit,
    t1.language,
    t1.gender,
    t2.uid,
    t2.sid AS sid2,
    t2.btime,
    t2.atype,
    t2.ds
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
查询结果保存到表
-- Table holding the join result: the six songs2 columns followed by the
-- five useraction columns. sid2 carries useraction.sid, which equals sid
-- because of the join condition.
-- The data lives as comma-delimited text under /bs/music/data/.
CREATE EXTERNAL TABLE IF NOT EXISTS usersongs (
    sid      STRING,
    aid      STRING,
    ptime    STRING,
    sinit    INT,
    language INT,
    gender   INT,
    uid      STRING,
    sid2     STRING,
    btime    STRING,
    atype    INT,
    ds       STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/data/';

-- Fill usersongs with the joined rows, replacing its current contents.
-- The select list is explicit and in the same order as the table columns,
-- since the insert maps columns by position.
-- The MAPJOIN hint names the alias t1 because the table is aliased in FROM.
INSERT OVERWRITE TABLE usersongs
SELECT /*+ MAPJOIN(t1) */
    t1.sid,
    t1.aid,
    t1.ptime,
    t1.sinit,
    t1.language,
    t1.gender,
    t2.uid,
    t2.sid AS sid2,
    t2.btime,
    t2.atype,
    t2.ds
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
查询结果保存到本地
# Run the query non-interactively and append (>>) its output to a local file.
# Repeated runs therefore accumulate rows in the same file.
# NOTE(review): the Hive CLI normally prints tab-separated columns, so the
# file may not be comma-separated despite its .csv name - confirm.
hive -e "select * from usersongs limit 10" >> /opt/tools/test.csv
查看HDFS上文件的前5行
# Print the first 5 lines of one data file in the /bs/music/data directory on HDFS.
# NOTE(review): 000000_0 is assumed to be the name of a file written by the
# earlier insert job - confirm with hadoop fs -ls /bs/music/data.
hadoop fs -text /bs/music/data/000000_0 |head -n 5
去重分组查询 同一aid的uid去重总量,sid的去重总量
-- For each aid, count the distinct uid values and the distinct sid values
-- among rows where atype = 1.
SELECT
    COUNT(DISTINCT uid),
    COUNT(DISTINCT sid),
    aid
FROM usersongs
WHERE atype = 1
GROUP BY aid;
创建外表,指定目录
-- Staging table for the song file: an external table over comma-delimited
-- text stored under /bs/music/songs/.
-- sid is the key that is later joined against useraction.sid.
-- NOTE(review): the other columns presumably mean artist id, publish time,
-- initial play count, language and gender - confirm against the dataset docs.
CREATE EXTERNAL TABLE IF NOT EXISTS songs2 (
    sid      STRING,
    aid      STRING,
    ptime    STRING,
    sinit    INT,
    language INT,
    gender   INT
)
COMMENT 'Staging table for song data from mars_tianchi_songs.csv'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/songs/';
导入HDFS文件(LOAD DATA INPATH 会把源文件移动到表目录,原路径下的文件不再存在)
-- Load the song CSV from its HDFS input path into songs2.
-- OVERWRITE replaces whatever data the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_songs.csv'
    OVERWRITE
    INTO TABLE songs2;
查看前10条数据
-- Ad-hoc preview: show up to 10 rows of songs2 with every column.
SELECT *
FROM songs2
LIMIT 10;
创建外表,指定目录
-- Staging table for the user action file: an external table over
-- comma-delimited text stored under /bs/music/useraction/.
-- sid is the key that is later joined against songs2.sid.
-- NOTE(review): uid, btime, atype and ds presumably mean user id, action
-- time, action type and date string - confirm against the dataset docs.
CREATE EXTERNAL TABLE IF NOT EXISTS useraction (
    uid   STRING,
    sid   STRING,
    btime STRING,
    atype INT,
    ds    STRING
)
COMMENT 'Staging table for user action data from mars_tianchi_user_actions.csv'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/useraction/';
导入HDFS文件(LOAD DATA INPATH 会把源文件移动到表目录,原路径下的文件不再存在)
-- Load the user action CSV from its HDFS input path into useraction.
-- OVERWRITE replaces whatever data the table currently holds.
LOAD DATA
    INPATH '/bs/music/input/mars_tianchi_user_actions.csv'
    OVERWRITE
    INTO TABLE useraction;

-- Ad-hoc preview: show up to 10 rows of useraction with every column.
SELECT *
FROM useraction
LIMIT 10;
表连接 小表在前
-- Join every song row to its user action rows on sid and return all
-- columns of both tables, songs2 columns first.
SELECT
    t1.*,
    t2.*
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
Map join连接
-- Same join as the plain version, with a hint asking Hive to hold the songs
-- side in memory and perform a map-side join.
-- The hint names the alias used in FROM (t1). Naming the underlying table
-- (songs2) does not match any join input once the table is aliased.
-- NOTE(review): recent Hive versions ignore MAPJOIN hints unless
-- hive.ignore.mapjoin.hint is set to false - confirm for the target cluster.
SELECT /*+ MAPJOIN(t1) */
    t1.*,
    t2.*
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
导出查询数据到hdfs
-- Write the joined rows to the HDFS directory /bs/music/data, replacing
-- its current contents.
-- Columns are listed explicitly (songs2 columns first, then useraction
-- columns) so the file layout does not silently change if a table changes.
-- The MAPJOIN hint names the alias t1 because the table is aliased in FROM.
-- NOTE(review): no ROW FORMAT is given, so the default field delimiter of
-- the Hive version in use applies to the exported files - confirm that this
-- is what downstream readers of the directory expect.
INSERT OVERWRITE DIRECTORY '/bs/music/data'
SELECT /*+ MAPJOIN(t1) */
    t1.sid,
    t1.aid,
    t1.ptime,
    t1.sinit,
    t1.language,
    t1.gender,
    t2.uid,
    t2.sid AS sid2,
    t2.btime,
    t2.atype,
    t2.ds
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
查询结果保存到表
-- Table holding the join result: the six songs2 columns followed by the
-- five useraction columns. sid2 carries useraction.sid, which equals sid
-- because of the join condition.
-- The data lives as comma-delimited text under /bs/music/data/.
CREATE EXTERNAL TABLE IF NOT EXISTS usersongs (
    sid      STRING,
    aid      STRING,
    ptime    STRING,
    sinit    INT,
    language INT,
    gender   INT,
    uid      STRING,
    sid2     STRING,
    btime    STRING,
    atype    INT,
    ds       STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/bs/music/data/';

-- Fill usersongs with the joined rows, replacing its current contents.
-- The select list is explicit and in the same order as the table columns,
-- since the insert maps columns by position.
-- The MAPJOIN hint names the alias t1 because the table is aliased in FROM.
INSERT OVERWRITE TABLE usersongs
SELECT /*+ MAPJOIN(t1) */
    t1.sid,
    t1.aid,
    t1.ptime,
    t1.sinit,
    t1.language,
    t1.gender,
    t2.uid,
    t2.sid AS sid2,
    t2.btime,
    t2.atype,
    t2.ds
FROM songs2 AS t1
INNER JOIN useraction AS t2
    ON t1.sid = t2.sid;
查询结果保存到本地
# Run the query non-interactively and append (>>) its output to a local file.
# Repeated runs therefore accumulate rows in the same file.
# NOTE(review): the Hive CLI normally prints tab-separated columns, so the
# file may not be comma-separated despite its .csv name - confirm.
hive -e "select * from usersongs limit 10" >> /opt/tools/test.csv
查看HDFS上文件的前5行
# Print the first 5 lines of one data file in the /bs/music/data directory on HDFS.
# NOTE(review): 000000_0 is assumed to be the name of a file written by the
# earlier insert job - confirm with hadoop fs -ls /bs/music/data.
hadoop fs -text /bs/music/data/000000_0 |head -n 5
去重分组查询 同一aid的uid去重总量,sid的去重总量
-- For each aid, count the distinct uid values and the distinct sid values
-- among rows where atype = 1.
SELECT
    COUNT(DISTINCT uid),
    COUNT(DISTINCT sid),
    aid
FROM usersongs
WHERE atype = 1
GROUP BY aid;
相关文章推荐
- HDFS异构存储
- Hadoop节点上负载过高的问题分析
- VolleyAir
- lightoj 1076 - Get the Containers 二分答案
- 大数据系列文章汇总 - 更新到15篇
- nrf51822 --- 配对绑定输入密码(pair)
- [INSTALL_FAILED_OLDER_SDK]错误
- iPhone:constrainedToSize获取字符串的宽高
- iPhone:constrainedToSize获取字符串的宽高
- contrail 3.0 vcenter_compute安装出现问题
- HDoj.1789 Doing Homework again【贪心】 2016/04/11
- http://blog.csdn.net/muzizongheng/article/details/46795243
- http://wenku.baidu.com/view/1ee32e3a87c24028915fc362.html?re=view###
- ZOJ 3780 - Paint the Grid Again
- LeetCode OJ 220.Contains Duplicate 3
- 219. Contains Duplicate II
- 【poj 1691】Painting A Board
- 开发错误记录5:Failed to resolve: com
- 调用 webservice 出现:WSDLReader:Loading of the WSDL file failed HRESULT=0×80040154: 没有注册类别 解决方案
- open(\"/dev/graphics/fb0\") failed!