您的位置:首页 > 数据库

YDB与spark SQL在百亿级数据上的性能对比测试

2017-01-15 19:39 549 查看
        按照时间逆序排序可以说是很多日志系统的硬指标。在延云YDB系统中,我们改变了传统的暴力排序方式,通过索引技术,可以超快对数据进行单列排序,不需要全表暴力扫描,这个技术我们称之为blockSort,目前支持tlong,tdouble,tint,tfloat四种数据类型。

        由于blockSort是借助搜索的索引来实现的,所以,采用blockSort的排序,不需要暴力扫描,性能有大幅度的提升。

        blockSort的排序,并非是预计算的方式,可以全表进行排序,也可以基于任意的过滤筛选条件进行过滤排序。

为此,我们针对spark sql与YDB在排序性能上做了一个比较性的测试

机器配置

一共虚拟出来,1台master,4台slave



其中slave机器的配置如下

[b]4台slave是运行在两台24core,128G的物理机上的,我们在物理机上做的虚拟机,配置如下[/b]

[b]注意,硬盘为SSD磁盘,不是普通的磁盘。
[/b]



测试数据([b]100亿条,一共有两列数据)[/b]

tradetime:tlong类型

        待测试的排序字段,高纬值(几乎没重复值),格式为为yyyyMMddHHmmss格式的随机时间,通过new Date(System.currentTimeMillis()-(long)(Math.random()*10000000000000l))来生成。

amtint:int类型

       为0~1000之间的整数,用于验证筛选条件与排序结合的性能,本身不用于排序。

测试结果(时间单位为秒)

amtint
列筛选
筛选后
条数
排序
方式
ydb
blocksort
spark
无筛选
100亿
降序
3.3
1118
升序
3.6
1085
100 TO 900
80亿
降序
1.5
1093
升序
1.3
1070
100 TO 600
50亿
降序
1.53
1104
升序
1.38
867
100 TO 200
10亿
降序
7.00
1115
升序
1.11
1131
100 TO 110
1亿
降序
2.1
1160
升序
3.44
1114
100 TO 101
0.1亿
降序
10.67
1089
升序
7.0
1110
测试过程

一、伪造数据

 hadoop fs -mkdir -p /data/example/demo/blocksort_time/

 hadoop fs -ls /data/example/demo/blocksort_time/

 hadoop fs -rm -r /data/example/demo/blocksort_time/

 hadoop fs -mkdir -p /data/example/demo/blocksort_time/

 hadoop fs -ls /data/example/demo/blocksort_time/
 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_1.txt &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_2.txt  &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_3.txt  &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_4.txt  &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_5.txt  &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_6.txt &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_7.txt &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_8.txt &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_9.txt &

 hadoop jar ./lib/ydb-1.1.5-pg.jar cn.net.ycloud.ydb.server.reader.kafka.KafkaMakeBlockSortDataTime 1000000000 /data/example/demo/blocksort_time/2000_time_10.txt &

二、创建相关数据表

 --###创建文本表####

 drop table blocksort_time_txt;

CREATE external  table blocksort_time_txt(

     tradetime bigint,

     amtint int

)

row format delimited fields terminated by ','

stored as

    INPUTFORMAT 'cn.net.ycloud.ydb.handle.YdbCombineInputFormat'

    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

location '/data/example/demo/blocksort_time'

TBLPROPERTIES (

    'ydb.combine.input.format.raw.format'='org.apache.hadoop.mapred.TextInputFormat'

);

 drop table blocksort_time_commontxt;

CREATE external  table blocksort_time_commontxt(

     tradetime bigint,

     amtint int

)

row format delimited fields terminated by ','

location '/data/example/demo/blocksort_time'

;

--##创建YDB表##

/*ydb.pushdown('->')*/     

    create table blocksort_time_ydb(

     tradetime tlong,

     amtint int

    )

/*('<-')pushdown.ydb*/;

     

三、导入数据
----导入ydb数据

insert into table  ydbpartion

 select 'blocksort_time_ydb', 'ydb_default_partion', '',

    YROW(

    'tradetime',tradetime,

    'amtint',amtint

   )

from blocksort_time_txt;

----数据预览

/*ydb.pushdown('->')*/

select * from blocksort_time_ydb where ydbpartion='ydb_default_partion' limit 20

/*('<-')pushdown.ydb*/



----总数据量-一百亿



四、性能测试

(一)全表

逆序

--使用ydb的blocksort

/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:true' and

    ydbkv='blocksort.limit:30'

     order by tradetime desc limit 30

/*('<-')pushdown.ydb*/;



使用spark文本

select tradetime, amtint from blocksort_time_commontxt  
order by tradetime desc limit 30



升序

/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:false' and

    ydbkv='blocksort.limit:30'

     order by tradetime limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt  
order by tradetime limit 30



(二)检索后匹配80%的数据 (筛选条件为 amtint like '([100 TO 900] )')

看命中条数,不排序

 /*ydb.pushdown('->')*/

select count(*) from blocksort_time_ydb where ydbpartion='ydb_default_partion' andamtint like '([100 TO 900] )'

/*('<-')pushdown.ydb*/

;



降序排序

/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where amtint like '([100 TO 900] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:true' and

    ydbkv='blocksort.limit:30'

     order by tradetime desc limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='900'
order by tradetime desc limit 30



升序排序

/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where  amtint like '([100 TO 900] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:false' and

    ydbkv='blocksort.limit:30'

     order by tradetime limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt
where amtint>='100' and amtint <='900' order by tradetime  limit 30;



(三)检索后匹配50%的数据 (筛选条件为 amtint like '([100 TO 600] )')

 /*ydb.pushdown('->')*/

select count(*) from blocksort_time_ydb where ydbpartion='ydb_default_partion' and amtint like '([100 TO 600] )'

/*('<-')pushdown.ydb*/

;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where amtint like '([100 TO 600] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:true' and

    ydbkv='blocksort.limit:30'

     order by tradetime desc limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='600' order by tradetime desc limit 30;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where  amtint like '([100 TO 600] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:false' and

    ydbkv='blocksort.limit:30'

     order by tradetime limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='600' order by tradetime  limit 30;



(三)检索后匹配10%的数据 (筛选条件为 amtint like '([100 TO 200] )')

 /*ydb.pushdown('->')*/

select count(*) from blocksort_time_ydb where ydbpartion='ydb_default_partion' and amtint like '([100 TO 200] )'

/*('<-')pushdown.ydb*/

;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where amtint like '([100 TO 200] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:true' and

    ydbkv='blocksort.limit:30'

     order by tradetime desc limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='200' order by tradetime desc limit 30;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where  amtint like '([100 TO 200] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:false' and

    ydbkv='blocksort.limit:30'

     order by tradetime limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='200' order by tradetime  limit 30;



(三)检索后匹配1%的数据 (筛选条件为 amtint like '([100 TO 110] )')

 /*ydb.pushdown('->')*/

select count(*) from blocksort_time_ydb where ydbpartion='ydb_default_partion' and amtint like '([100 TO 110] )'

/*('<-')pushdown.ydb*/

;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where amtint like '([100 TO 110] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:true' and

    ydbkv='blocksort.limit:30'

     order by tradetime desc limit 30
/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='110' order by tradetime desc limit 30;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where  amtint like '([100 TO 110] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:false' and

    ydbkv='blocksort.limit:30'

     order by tradetime limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='110' order by tradetime  limit 30;



(三)检索后匹配0.1%的数据 (筛选条件为 amtint like '([100 TO 101] )')

 /*ydb.pushdown('->')*/

select count(*) from blocksort_time_ydb where ydbpartion='ydb_default_partion' and amtint like '([100 TO 101] )'

/*('<-')pushdown.ydb*/

;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where amtint like '([100 TO 101] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:true' and

    ydbkv='blocksort.limit:30'

     order by tradetime desc limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='101' order by tradetime desc limit 30;



/*ydb.pushdown('->')*/

select tradetime, amtint from blocksort_time_ydb where  amtint like '([100 TO 101] )' and

    ydbkv='blocksort.field:tradetime'  and

    ydbkv='blocksort.desc:false' and

    ydbkv='blocksort.limit:30'

     order by tradetime limit 30

/*('<-')pushdown.ydb*/;



select tradetime, amtint from blocksort_time_commontxt 
where amtint>='100' and amtint <='101' order by tradetime  limit 30;

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: