mapreduce任务中数据分布倾斜导致reduce负载不均衡的解决方案
2015-09-10 00:41
591 查看
概述:
从所周知,当需求中数据出现分布不均的情况时,按照hadoop mr任务的默认partition方法,会出现某些机子负载过重的情况,这样会拖慢整个任务进度,有时候甚至会宕机。在这里我介绍一个用随机数解决这个问题的方案,下面是通过代码来模拟map根据partition来分区的情况;原理(追加)
方案就是运用了数学中均匀分布的原理,举个例子:10个小球,有5个盒子,那么每个小球分配到每个盒子的概率都是相同的,即五分之一,那么例子中小球模拟的是mr中的key,盒子模拟的是reducer。这种方案相对于官方提供的用一致性哈希算法有什么优点呢,我们了解到要使用一致性哈希算法达到均匀分布key,哈希算法中的虚拟桶数在足够多,但是hadoop处理的数据都是十亿级数据,key种类也是千万级以上,如果为了更均匀,必然会耗费更多的内存,但是如果采用这种方案,那么几乎是不需要内存来保存key的状态,而且数据量越大,分布就更加均匀测试代码
package com.mxq.balance; import java.util.Vector; public class UNBalanceData { private Vector<keyAndNum> keyVector=new Vector<keyAndNum>(10); //这里用于生成不均匀数据样本的信息; public void intialize(){ keyAndNum kn=new keyAndNum(1,50); this.keyVector.add(kn); kn=new keyAndNum(2,100); this.keyVector.add(kn); kn=new keyAndNum(3,80); this.keyVector.add(kn); //其中keyAndNum(4,1000020)表示key 4 有1000020个记录; kn=new keyAndNum(4,1000020); this.keyVector.add(kn); kn=new keyAndNum(5,10050); this.keyVector.add(kn); kn=new keyAndNum(6,200); this.keyVector.add(kn); kn=new keyAndNum(7,100050); this.keyVector.add(kn); kn=new keyAndNum(8,250); this.keyVector.add(kn); kn=new keyAndNum(9,140); this.keyVector.add(kn); kn=new keyAndNum(10,90); this.keyVector.add(kn); } //获取总样本数是; public int getLoop(){ int loop=0; for(int i=0;i<this.keyVector.size();i++){ loop+=this.keyVector.get(i).getKeyNum(); } return loop; } //根据initialize生成的信息 生成样本时用; public boolean minus(int key){ for(int i=0;i<this.keyVector.size();i++){ if(this.keyVector.get(i).getKey()==key){ if(this.keyVector.get(i).getKeyNum()<1){ //System.out.println("key :" +key+"'s num "+this.keyVector.get(i).getKeyNum()+ " ,please check program"); return false; }else{ this.keyVector.get(i).minus(); return true; } } } return false; } }
//此类是模拟reduce package com.mxq.balance; import java.util.Vector; public class reduceContain { public Vector<keyAndNum> reduceVec=new Vector<keyAndNum>(); //根本key来收集各个key在某个reducer的数量; public void addKey(int key){ boolean find=false; int index=-1; for(int i=0;i<this.reduceVec.size();i++){ if(this.reduceVec.get(i).getKey()==key) { find=true; index=i; 4000 break; } } if(find){ this.reduceVec.get(index).add(); }else { keyAndNum kn=new keyAndNum(key,1); this.reduceVec.add(kn); } } //获取最终key在某个reducer的样本数 public int getResult(int key){ for(int i=0;i<this.reduceVec.size();i++){ if(this.reduceVec.get(i).getKey()==key){ return this.reduceVec.get(i).getKeyNum(); } } System.out.println("can not find key"); return 0; } }
//保存key的状态或者配置信息; package com.mxq.balance; public class keyAndNum { int key; int keyNum; public keyAndNum(int key,int num){ this.key=key; this.keyNum=num; } public int getKey() { return key; } public void setKey(int key) { this.key = key; } public int getKeyNum() { return keyNum; } public void setKeyNum(int keyNum) { this.keyNum = keyNum; } public void add(){ this.keyNum++; } public void minus(){ this.keyNum--; } }
package com.mxq.balance; import java.util.Random; import java.util.Vector; public class testPartition { //配置reducer数量 public int reducerNum=5; //保存生成的样本数据; public Vector<Integer> randomData=new Vector<Integer>(); //保存reducer容器的相关状态; public Vector<keyToReducer> krVec=new Vector<keyToReducer>(reducerNum); public static void main(String[] args) { // TODO Auto-generated method stub Random random=new Random(); int loop=0; UNBalanceData unbd=new UNBalanceData(); unbd.intialize(); loop=unbd.getLoop(); testPartition tp=new testPartition(); tp.initSampleVec(loop); tp.initReducer(); int i=0; while(i<loop){ //System.out.println(random.nextInt(10)); int key=random.nextInt(10)+1; if(unbd.minus(key)) { tp.addData(key); i++; } } for(int h=0;h<tp.randomData.size();h++){ int rid=random.nextInt(tp.reducerNum); int key=tp.randomData.get(h); tp.addKeyToReducer(rid, key); } tp.statistic(); } public void initSampleVec(int loop){ this.randomData=new Vector<Integer>(loop); } //将key添加样本容器当中; public void addData(int key){ this.randomData.add(new Integer(key)); } public void initReducer(){ for(int i=0;i<reducerNum;i++){ keyToReducer kr=new keyToReducer(i); this.krVec.add(kr); } } //根据partitionid 和Key添加到不同的reducer中 public void addKeyToReducer(int rid,int key){ for(int i=0;i<this.krVec.size();i++){ if(this.krVec.get(i).getRid()==rid){ this.krVec.get(i).getRc().addKey(key); break; } } } //统计最终结果; public void statistic(){ int sum=0; for(int i=0;i<10;i++){ sum=0; for(int h=0;h<this.krVec.size();h++){ sum+=this.krVec.get(h).getRc().getResult(i+1); } System.out.println("key: "+(i+1)+" has "+sum+" object"); for(int h=0;h<this.krVec.size();h++){ int num=this.krVec.get(h).getRc().getResult(i+1); System.out.println("in reducer id:"+h+" has key:"+(i+1)+" by "+(float)num/(float)sum); } System.out.println(); System.out.println(); } } //为每个reducer 配一个ID; public class keyToReducer{ private int rid; private reduceContain rc; public keyToReducer(int id){ this.rid=id; this.rc=new reduceContain(); } public int getRid() { return rid; } public void setRid(int rid) { this.rid = rid; } public reduceContain getRc() { return rc; } public void setRc(reduceContain rc) { this.rc = rc; } } }
测试结果;
其中key的种类固定只有10种;reducer的数量可配;
reducer数量:5
//key样本信息 keyAndNum kn=new keyAndNum(1,50); this.keyVector.add(kn); kn=new keyAndNum(2,100); this.keyVector.add(kn); kn=new keyAndNum(3,80); this.keyVector.add(kn); kn=new keyAndNum(4,1000020); this.keyVector.add(kn); kn=new keyAndNum(5,10050); this.keyVector.add(kn); kn=new keyAndNum(6,200); this.keyVector.add(kn); kn=new keyAndNum(7,100050); this.keyVector.add(kn); kn=new keyAndNum(8,250); this.keyVector.add(kn); kn=new keyAndNum(9,140); this.keyVector.add(kn); kn=new keyAndNum(10,90); this.keyVector.add(kn);
//测试结果 key: 1 has 50 object in reducer id:0 has key:1 by 0.18 in reducer id:1 has key:1 by 0.2 in reducer id:2 has key:1 by 0.2 in reducer id:3 has key:1 by 0.24 in reducer id:4 has key:1 by 0.18 key: 2 has 100 object in reducer id:0 has key:2 by 0.19 in reducer id:1 has key:2 by 0.18 in reducer id:2 has key:2 by 0.2 in reducer id:3 has key:2 by 0.25 in reducer id:4 has key:2 by 0.18 key: 3 has 80 object in reducer id:0 has key:3 by 0.2375 in reducer id:1 has key:3 by 0.225 in reducer id:2 has key:3 by 0.2 in reducer id:3 has key:3 by 0.2375 in reducer id:4 has key:3 by 0.1 key: 4 has 1000020 object in reducer id:0 has key:4 by 0.19913201 in reducer id:1 has key:4 by 0.200015 in reducer id:2 has key: f19c 4 by 0.199976 in reducer id:3 has key:4 by 0.19990501 in reducer id:4 has key:4 by 0.20097198 key: 5 has 10050 object in reducer id:0 has key:5 by 0.19641791 in reducer id:1 has key:5 by 0.20199005 in reducer id:2 has key:5 by 0.1999005 in reducer id:3 has key:5 by 0.20109452 in reducer id:4 has key:5 by 0.20059702 key: 6 has 200 object in reducer id:0 has key:6 by 0.17 in reducer id:1 has key:6 by 0.205 in reducer id:2 has key:6 by 0.21 in reducer id:3 has key:6 by 0.19 in reducer id:4 has key:6 by 0.225 key: 7 has 100050 object in reducer id:0 has key:7 by 0.2012194 in reducer id:1 has key:7 by 0.19902049 in reducer id:2 has key:7 by 0.20127936 in reducer id:3 has key:7 by 0.19898051 in reducer id:4 has key:7 by 0.19950025 key: 8 has 250 object in reducer id:0 has key:8 by 0.18 in reducer id:1 has key:8 by 0.208 in reducer id:2 has key:8 by 0.212 in reducer id:3 has key:8 by 0.192 in reducer id:4 has key:8 by 0.208 key: 9 has 140 object in reducer id:0 has key:9 by 0.20714286 in reducer id:1 has key:9 by 0.23571429 in reducer id:2 has key:9 by 0.19285715 in reducer id:3 has key:9 by 0.16428572 in reducer id:4 has key:9 by 0.2 key: 10 has 90 object in reducer id:0 has key:10 by 0.2 in reducer id:1 has key:10 by 0.17777778 in reducer id:2 has key:10 by 0.22222222 in reducer id:3 has key:10 by 0.21111111 in reducer id:4 has key:10 by 0.18888889
reducer数量:10
key: 1 has 50 object in reducer id:0 has key:1 by 0.1 in reducer id:1 has key:1 by 0.1 in reducer id:2 has key:1 by 0.14 in reducer id:3 has key:1 by 0.06 in reducer id:4 has key:1 by 0.16 in reducer id:5 has key:1 by 0.06 in reducer id:6 has key:1 by 0.02 in reducer id:7 has key:1 by 0.12 in reducer id:8 has key:1 by 0.12 in reducer id:9 has key:1 by 0.12 key: 2 has 100 object in reducer id:0 has key:2 by 0.1 in reducer id:1 has key:2 by 0.08 in reducer id:2 has key:2 by 0.11 in reducer id:3 has key:2 by 0.11 in reducer id:4 has key:2 by 0.1 in reducer id:5 has key:2 by 0.12 in reducer id:6 has key:2 by 0.13 in reducer id:7 has key:2 by 0.07 in reducer id:8 has key:2 by 0.1 in reducer id:9 has key:2 by 0.08 key: 3 has 80 object in reducer id:0 has key:3 by 0.0375 in reducer id:1 has key:3 by 0.0875 in reducer id:2 has key:3 by 0.075 in reducer id:3 has key:3 by 0.125 in reducer id:4 has key:3 by 0.0625 in reducer id:5 has key:3 by 0.1625 in reducer id:6 has key:3 by 0.1125 in reducer id:7 has key:3 by 0.1 in reducer id:8 has key:3 by 0.1375 in reducer id:9 has key:3 by 0.1 key: 4 has 1000020 object in reducer id:0 has key:4 by 0.099913 in reducer id:1 has key:4 by 0.100039 in reducer id:2 has key:4 by 0.09953101 in reducer id:3 has key:4 by 0.099645 in reducer id:4 has key:4 by 0.100186 in reducer id:5 has key:4 by 0.099819005 in reducer id:6 has key:4 by 0.100185 in reducer id:7 has key:4 by 0.100311995 in reducer id:8 has key:4 by 0.100187995 in reducer id:9 has key:4 by 0.100182 key: 5 has 10050 object in reducer id:0 has key:5 by 0.100497514 in reducer id:1 has key:5 by 0.10179105 in reducer id:2 has key:5 by 0.10597015 in reducer id:3 has key:5 by 0.09960199 in reducer id:4 has key:5 by 0.094228856 in reducer id:5 has key:5 by 0.10079602 in reducer id:6 has key:5 by 0.10258707 in reducer id:7 has key:5 by 0.09860697 in reducer id:8 has key:5 by 0.09542289 in reducer id:9 has key:5 by 0.100497514 key: 6 has 200 object in reducer id:0 has key:6 by 0.11 in reducer id:1 has key:6 by 0.095 in reducer id:2 has key:6 by 0.08 in reducer id:3 has key:6 by 0.105 in reducer id:4 has key:6 by 0.1 in reducer id:5 has key:6 by 0.09 in reducer id:6 has key:6 by 0.1 in reducer id:7 has key:6 by 0.105 in reducer id:8 has key:6 by 0.125 in reducer id:9 has key:6 by 0.09 key: 7 has 100050 object in reducer id:0 has key:7 by 0.10067966 in reducer id:1 has key:7 by 0.10117941 in reducer id:2 has key:7 by 0.099550225 in reducer id:3 has key:7 by 0.097891055 in reducer id:4 has key:7 by 0.10011994 in reducer id:5 has key:7 by 0.10177911 in reducer id:6 has key:7 by 0.10029985 in reducer id:7 has key:7 by 0.09991004 in reducer id:8 has key:7 by 0.09992004 in reducer id:9 has key:7 by 0.09867066 key: 8 has 250 object in reducer id:0 has key:8 by 0.108 in reducer id:1 has key:8 by 0.06 in reducer id:2 has key:8 by 0.112 in reducer id:3 has key:8 by 0.12 in reducer id:4 has key:8 by 0.108 in reducer id:5 has key:8 by 0.092 in reducer id:6 has key:8 by 0.084 in reducer id:7 has key:8 by 0.132 in reducer id:8 has key:8 by 0.112 in reducer id:9 has key:8 by 0.072 key: 9 has 140 object in reducer id:0 has key:9 by 0.07857143 in reducer id:1 has key:9 by 0.1 in reducer id:2 has key:9 by 0.10714286 in reducer id:3 has key:9 by 0.14285715 in reducer id:4 has key:9 by 0.08571429 in reducer id:5 has key:9 by 0.12142857 in reducer id:6 has key:9 by 0.092857145 in reducer id:7 has key:9 by 0.057142857 in reducer id:8 has key:9 by 0.08571429 in reducer id:9 has key:9 by 0.12857144 key: 10 has 90 object in reducer id:0 has key:10 by 0.1 in reducer id:1 has key:10 by 0.08888889 in reducer id:2 has key:10 by 0.06666667 in reducer id:3 has key:10 by 0.07777778 in reducer id:4 has key:10 by 0.1 in reducer id:5 has key:10 by 0.16666667 in reducer id:6 has key:10 by 0.11111111 in reducer id:7 has key:10 by 0.13333334 in reducer id:8 has key:10 by 0.08888889 in reducer id:9 has key:10 by 0.06666667
总结
根据配置信息,key 4有1000020 个记录,是样本量最多的key,但测试结果显示key 4在各个reducer上是分布均匀的相关文章推荐
- 详解HDFS Short Circuit Local Reads
- Hadoop_2.1.0 MapReduce序列图
- 使用Hadoop搭建现代电信企业架构
- 单机版搭建Hadoop环境图文教程详解
- 负载均衡技术沙龙1期(关于咱的图片)
- 负载均衡沙龙活动第二期现场问答汇集
- 流量引导:网络世界的负载均衡解密
- 流量引导:网络世界的负载均衡解密
- 浅谈sqlserver的负载均衡问题
- hadoop常见错误以及处理方法详解
- PHP开发负载均衡指南
- Win2003利用dfs(分布式文件系统)在负载均衡下的文件同步配置方案
- LVS+Keepalived构建高可用负载均衡(测试篇)
- 简单测试Apache是如何完成负载均衡策略配置
- Apache负载均衡设置方法 mod_proxy使用介绍
- LVS(Linux Virtual Server)Linux 虚拟服务器介绍及配置(负载均衡系统)
- hadoop 单机安装配置教程
- hadoop的hdfs文件操作实现上传文件到hdfs
- hadoop实现grep示例分享