
spark-sql testing summary

2015-12-17 18:01

I've been fiddling with spark-sql lately. My earlier tests all used very small datasets, and since my cluster is only 6 virtual machines with limited resources, the data can't be too big either. So I went looking and found this post:
http://colobu.com/2014/12/11/spark-sql-quick-start/
Spark SQL 初探: 使用大数据分析2000万数据 (A first look at Spark SQL: analyzing 20 million records)

############## Don't ask me where to download the data; search for it yourself. I deleted it as soon as I was done.

1. File checks: wc and awk in the shell help verify the row and column counts.

############ head the file: every file has a header row and is comma-separated. Because the records contain personal name information, only the header is printed here; real records start on line 2.

[hue@snn 2000w]$ head -1 1-200W.csv

Name,CardNo,Descriot,CtfTp,CtfId,Gender,Birthday,Address,Zip,Dirty,District1,District2,District3,District4,District5,District6,FirstNm,LastNm,Duty,Mobile,Tel,Fax,EMail,Nation,Taste,Education,Company,CTel,CAddress,CZip,Family,Version,id

[hue@snn 2000w]$

############ wc to check the line counts

[hadoop@snn 2000w]$ cat 1000W-1200W.csv | wc -l

2000050

[hadoop@snn 2000w]$ cat 1200W-1400W.csv | wc -l

2000205

[hadoop@snn 2000w]$ cat 1-200W.csv | wc -l

2000094

[hadoop@snn 2000w]$

############ awk to check the column count: 33 columns

[hadoop@snn 2000w]$ awk 'BEGIN {FS=","}END{print "Filename:" FILENAME ",Linenumber:" NR ",Columns:" NF}' 1000W-1200W.csv

Filename:1000W-1200W.csv,Linenumber:2000050,Columns:33

####################################

2. Create a directory in HDFS and put the files into it.

[hue@snn ~]$ hadoop fs -mkdir /user/hue/external/2000w

[hue@snn ~]$ hadoop fs -put /opt/2000w/* /user/hue/external/2000w/

[hue@snn ~]$ hadoop fs -ls -R /user/hue/external/2000w/

-rw-r--r-- 3 hue hue 348173735 2015-12-17 14:36 /user/hue/external/2000w/1-200W.csv
-rw-r--r-- 3 hue hue 317365192 2015-12-17 14:36 /user/hue/external/2000w/1000W-1200W.csv
-rw-r--r-- 3 hue hue 307266272 2015-12-17 14:36 /user/hue/external/2000w/1200W-1400W.csv
-rw-r--r-- 3 hue hue 319828719 2015-12-17 14:36 /user/hue/external/2000w/1400W-1600W.csv
-rw-r--r-- 3 hue hue 310125772 2015-12-17 14:37 /user/hue/external/2000w/1600w-1800w.csv
-rw-r--r-- 3 hue hue 298454235 2015-12-17 14:37 /user/hue/external/2000w/1800w-2000w.csv
-rw-r--r-- 3 hue hue 311349431 2015-12-17 14:38 /user/hue/external/2000w/200W-400W.csv
-rw-r--r-- 3 hue hue 311013782 2015-12-17 14:38 /user/hue/external/2000w/400W-600W.csv
-rw-r--r-- 3 hue hue 308703632 2015-12-17 14:38 /user/hue/external/2000w/600W-800W.csv
-rw-r--r-- 3 hue hue 310797175 2015-12-17 14:38 /user/hue/external/2000w/800W-1000W.csv
-rw-r--r-- 3 hue hue 7487744 2015-12-17 14:38 /user/hue/external/2000w/last_5000.csv
[hue@snn ~]$

####################################

3. Create an external table; the files stay where they are and can be queried in place. (Note: the dataset really does misspell the Descriot column, so the DDL keeps it.)

CREATE EXTERNAL TABLE IF NOT EXISTS external_2000w (
  Name STRING, CardNo STRING, Descriot STRING, CtfTp STRING, CtfId STRING,
  Gender STRING, Birthday STRING, Address STRING, Zip STRING, Dirty STRING,
  District1 STRING, District2 STRING, District3 STRING, District4 STRING,
  District5 STRING, District6 STRING, FirstNm STRING, LastNm STRING,
  Duty STRING, Mobile STRING, Tel STRING, Fax STRING, EMail STRING,
  Nation STRING, Taste STRING, Education STRING, Company STRING,
  CTel STRING, CAddress STRING, CZip STRING, Family STRING, Version STRING,
  id INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/hue/external/2000w/';
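Before moving on to the heavier queries, a quick sanity check that the table parses the CSVs as expected (a minimal sketch; LIMIT keeps the scan to a handful of rows, and the Name column is left out for privacy):

select Gender, Birthday, Zip, id from external_2000w limit 3;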

##########################################################################################################

4. Analysis queries

############################ All records; the directory holds 11 files with identical structure

select count(1) as cnt from external_2000w;

spark-sql> select count(1) as cnt from external_2000w;

20051440

Time taken: 27.806 seconds, Fetched 1 row(s)

spark-sql>

############################ 11 files, so 11 header rows need to be excluded.

select count(1) as cnt from external_2000w where name == 'Name';

spark-sql> select count(1) as cnt from external_2000w where name == 'Name';

11

Time taken: 29.432 seconds, Fetched 1 row(s)

spark-sql>

############################ Excluding the headers, there are 20,051,429 records in total

select count(1) as cnt from external_2000w where name != 'Name';

spark-sql> select count(1) as cnt from external_2000w where name != 'Name';

20051429

Time taken: 34.129 seconds, Fetched 1 row(s)

spark-sql>
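An alternative to repeating the name != 'Name' filter in every query: declare the header rows away at the table level. A hedged option, since Hive 0.13+ honors the skip.header.line.count table property but Spark SQL's support for it varies by version, so verify with a count before relying on it (the queries below keep the explicit filter):

ALTER TABLE external_2000w SET TBLPROPERTIES ('skip.header.line.count' = '1');
-- if the property is honored, this returns 20051429 without any filter:
select count(1) as cnt from external_2000w;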

########################### Filter out abnormal data: Gender not in (M, F)

select count(1) as cnt from external_2000w where name != 'Name' and Gender not in('M','F');

spark-sql> select count(1) as cnt from external_2000w where name != 'Name' and Gender not in('M','F');

802043

Time taken: 34.735 seconds, Fetched 1 row(s)

spark-sql>

########################### Counts grouped by gender

select Gender,count(1) as cnt from external_2000w where name != 'Name' and Gender in('M','F') GROUP BY Gender;

spark-sql> select Gender,count(1) as cnt from external_2000w where name != 'Name' and Gender in('M','F') GROUP BY Gender;

F 6478121

M 12771211

Time taken: 41.875 seconds, Fetched 2 row(s)

spark-sql>
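Side note: 12,771,211 + 6,478,121 + 802,043 = 20,051,375, which is 54 rows short of 20,051,429. A NULL Gender satisfies neither in ('M','F') nor not in ('M','F'), so those rows silently drop out of both counts. The split can also be shown as percentages with a window function over the aggregate; a sketch, assuming the engine accepts an empty OVER clause (Spark 1.5+ / Hive 0.14+ should):

select Gender, count(1) as cnt,
       round(count(1) * 100.0 / sum(count(1)) over (), 2) as pct
from external_2000w
where name != 'Name' and Gender in ('M','F')
group by Gender;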

########################### Counts grouped by zodiac sign

select XingZuo, count(1) as cnt from (
select
CASE
-- substring(Birthday,5) yields a string such as '0120'; the >=/<= tests
-- below rely on the implicit cast of that string to a number.
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 219 THEN "水瓶座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 220 and substring(Birthday,5) <= 320 THEN "双鱼座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 321 and substring(Birthday,5) <= 420 THEN "白羊座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 421 and substring(Birthday,5) <= 521 THEN "金牛座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 522 and substring(Birthday,5) <= 621 THEN "双子座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 622 and substring(Birthday,5) <= 722 THEN "巨蟹座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 723 and substring(Birthday,5) <= 823 THEN "狮子座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 824 and substring(Birthday,5) <= 923 THEN "处女座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 924 and substring(Birthday,5) <= 1023 THEN "天秤座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1024 and substring(Birthday,5) <= 1122 THEN "天蝎座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1123 and substring(Birthday,5) <= 1222 THEN "射手座"
-- the earlier WHENs already claimed 0120-1222, so this branch only
-- catches 1223-1231 and 0101-0119: Capricorn.
WHEN (length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 1231)
  or (length(Birthday) == 8 and substring(Birthday,5) >= 101 and substring(Birthday,5) <= 119) THEN "摩羯座"
ELSE "未知"
END AS XingZuo
from external_2000w where name != 'Name'
) as atable
group by XingZuo;

This spewed a pile of garbage and exceptions. Could the Chinese string literals be the cause? Let's try again without the Chinese.

########################

select XingZuo, count(1) as cnt from (
select
CASE
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 219 THEN "A"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 220 and substring(Birthday,5) <= 320 THEN "B"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 321 and substring(Birthday,5) <= 420 THEN "C"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 421 and substring(Birthday,5) <= 521 THEN "D"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 522 and substring(Birthday,5) <= 621 THEN "E"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 622 and substring(Birthday,5) <= 722 THEN "F"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 723 and substring(Birthday,5) <= 823 THEN "G"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 824 and substring(Birthday,5) <= 923 THEN "H"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 924 and substring(Birthday,5) <= 1023 THEN "I"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1024 and substring(Birthday,5) <= 1122 THEN "J"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1123 and substring(Birthday,5) <= 1222 THEN "K"
WHEN (length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 1231)
  or (length(Birthday) == 8 and substring(Birthday,5) >= 101 and substring(Birthday,5) <= 119) THEN "L"
ELSE "M"
END AS XingZuo
from external_2000w
where name != 'Name'
) as atable
group by XingZuo;

A 1636084
B 1510535
C 1410462
D 1406847
E 1406631
F 1498724
G 1614266
H 1666768
I 1897450
J 1820476
K 1615660
L 2406878
M 160648

Time taken: 91.985 seconds, Fetched 13 row(s)
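The "M" (ELSE) bucket holds 160,648 rows; most of those should be birthdays that aren't 8 characters long. A quick way to profile what falls through (a sketch): group the failing rows by Birthday length:

select length(Birthday) as len, count(1) as cnt
from external_2000w
where name != 'Name' and (Birthday is null or length(Birthday) != 8)
group by length(Birthday)
order by cnt desc;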



The counts differ a little from that post's results. Perhaps the filter conditions aren't the same?





############################ Store the result directly into a table.

create table external_2000w_new as
select substring(Birthday,5) as born_day,
CASE
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 219 THEN "A"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 220 and substring(Birthday,5) <= 320 THEN "B"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 321 and substring(Birthday,5) <= 420 THEN "C"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 421 and substring(Birthday,5) <= 521 THEN "D"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 522 and substring(Birthday,5) <= 621 THEN "E"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 622 and substring(Birthday,5) <= 722 THEN "F"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 723 and substring(Birthday,5) <= 823 THEN "G"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 824 and substring(Birthday,5) <= 923 THEN "H"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 924 and substring(Birthday,5) <= 1023 THEN "I"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1024 and substring(Birthday,5) <= 1122 THEN "J"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1123 and substring(Birthday,5) <= 1222 THEN "K"
WHEN (length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 1231)
  or (length(Birthday) == 8 and substring(Birthday,5) >= 101 and substring(Birthday,5) <= 119) THEN "L"
ELSE "M"
END AS XingZuo
from external_2000w
where name != 'Name';

############################ One part file per task.
31 small files; the stage ran as 31 tasks. That matches the input splits: assuming the default 128 MB HDFS block size, each ~300 MB CSV spans three blocks (10 files x 3 splits), plus one split for the small last_5000.csv, giving 31.



[root@snn conf]# hadoop fs -ls -R /user/hive/warehouse/external_2000w_new

drwxrwxrwt - hadoop hive 0 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/.hive-staging_hive_2015-12-17_17-18-32_719_3374007692051174329-1
drwxr-xr-x - hadoop hive 0 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/.hive-staging_hive_2015-12-17_17-18-32_719_3374007692051174329-1/-ext-10000
-rw-r--r-- 3 hadoop hive 0 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/.hive-staging_hive_2015-12-17_17-18-32_719_3374007692051174329-1/-ext-10000/_SUCCESS
-rwxrwxrwt 3 hadoop hive 6307372 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00000
-rwxrwxrwt 3 hadoop hive 4747600 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00001
-rwxrwxrwt 3 hadoop hive 2943508 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00002
-rwxrwxrwt 3 hadoop hive 5949216 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00003
-rwxrwxrwt 3 hadoop hive 5887275 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00004
-rwxrwxrwt 3 hadoop hive 2160089 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00005
-rwxrwxrwt 3 hadoop hive 5950706 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00006
-rwxrwxrwt 3 hadoop hive 6322605 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00007
-rwxrwxrwt 3 hadoop hive 1722862 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00008
-rwxrwxrwt 3 hadoop hive 5927935 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00009
-rwxrwxrwt 3 hadoop hive 5839186 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00010
-rwxrwxrwt 3 hadoop hive 2229685 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00011
-rwxrwxrwt 3 hadoop hive 5907388 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00012
-rwxrwxrwt 3 hadoop hive 6142019 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00013
-rwxrwxrwt 3 hadoop hive 1869211 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00014
-rwxrwxrwt 3 hadoop hive 6119244 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00015
-rwxrwxrwt 3 hadoop hive 6200692 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00016
-rwxrwxrwt 3 hadoop hive 1399629 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00017
-rwxrwxrwt 3 hadoop hive 6045320 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00018
-rwxrwxrwt 3 hadoop hive 6044653 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00019
-rwxrwxrwt 3 hadoop hive 1906355 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00020
-rwxrwxrwt 3 hadoop hive 6024204 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00021
-rwxrwxrwt 3 hadoop hive 6035401 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00022
-rwxrwxrwt 3 hadoop hive 1936859 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00023
-rwxrwxrwt 3 hadoop hive 6101666 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00024
-rwxrwxrwt 3 hadoop hive 6075192 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00025
-rwxrwxrwt 3 hadoop hive 1819634 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00026
-rwxrwxrwt 3 hadoop hive 6058918 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00027
-rwxrwxrwt 3 hadoop hive 6032423 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00028
-rwxrwxrwt 3 hadoop hive 1905099 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00029
-rwxrwxrwt 3 hadoop hive 341632 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00030
[root@snn conf]#

################### Open questions:

1. Why did the Chinese-literal query throw so many exceptions? Even after switching to the A/B/C labels, one exception was still thrown.

2. Writing the external-table result straight into a Hive table produces as many part files as there were tasks; small files are a sore spot for HDFS.

################### Follow-up on the open questions:

1. The first problem was caused by the comparison operators: equality should be = and inequality <>. Spark SQL still executes the ==/!= forms, but Hive threw exceptions outright.



After rewriting the operators, the results displayed normally.



2. As for the small-file problem: Spark SQL (at least this version) offers no merge-on-write support, whereas Hive does.

Parameter: in hive-site.xml, set the hive.merge.mapredfiles property to true:

<property>
  <name>hive.merge.mapredfiles</name>
  <value>true</value>
</property>

Or type it in the CLI: set hive.merge.mapredfiles = true;
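On the Spark side, a possible workaround (a sketch; external_2000w_merged is a hypothetical table name): force a shuffle before the write, so the number of output files follows spark.sql.shuffle.partitions rather than the number of scan tasks:

set spark.sql.shuffle.partitions = 1;
-- DISTRIBUTE BY forces a shuffle, so the write runs as 1 task => 1 file
create table external_2000w_merged as
select * from external_2000w_new distribute by XingZuo;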


