您的位置:首页 > 运维架构

mapreduce任务中数据分布倾斜导致reduce负载不均衡的解决方案

2015-09-10 00:41 591 查看

概述:

从所周知,当需求中数据出现分布不均的情况时,按照hadoop mr任务的默认partition方法,会出现某些机子负载过重的情况,这样会拖慢整个任务进度,有时候甚至会宕机。在这里我介绍一个用随机数解决这个问题的方案,下面是通过代码来模拟map根据partition来分区的情况;

原理(追加)

方案就是运用了数学中均匀分布的原理,举个例子:10个小球,有5个盒子,那么每个小球分配到每个盒子的概率都是相同的,即五分之一,那么例子中小球模拟的是mr中的key,盒子模拟的是reducer。这种方案相对于官方提供的用一致性哈希算法有什么优点呢,我们了解到要使用一致性哈希算法达到均匀分布key,哈希算法中的虚拟桶数在足够多,但是hadoop处理的数据都是十亿级数据,key种类也是千万级以上,如果为了更均匀,必然会耗费更多的内存,但是如果采用这种方案,那么几乎是不需要内存来保存key的状态,而且数据量越大,分布就更加均匀

测试代码

package com.mxq.balance;

import java.util.Vector;

public class UNBalanceData {

private Vector<keyAndNum> keyVector=new Vector<keyAndNum>(10);

//这里用于生成不均匀数据样本的信息;
public void intialize(){

keyAndNum kn=new keyAndNum(1,50);
this.keyVector.add(kn);

kn=new keyAndNum(2,100);
this.keyVector.add(kn);

kn=new keyAndNum(3,80);
this.keyVector.add(kn);
//其中keyAndNum(4,1000020)表示key 4 有1000020个记录;
kn=new keyAndNum(4,1000020);
this.keyVector.add(kn);

kn=new keyAndNum(5,10050);
this.keyVector.add(kn);

kn=new keyAndNum(6,200);
this.keyVector.add(kn);

kn=new keyAndNum(7,100050);
this.keyVector.add(kn);

kn=new keyAndNum(8,250);
this.keyVector.add(kn);

kn=new keyAndNum(9,140);
this.keyVector.add(kn);

kn=new keyAndNum(10,90);
this.keyVector.add(kn);
}
//获取总样本数是;
public int getLoop(){
int loop=0;
for(int i=0;i<this.keyVector.size();i++){
loop+=this.keyVector.get(i).getKeyNum();
}

return loop;
}
//根据initialize生成的信息 生成样本时用;
public boolean minus(int key){

for(int i=0;i<this.keyVector.size();i++){
if(this.keyVector.get(i).getKey()==key){

if(this.keyVector.get(i).getKeyNum()<1){
//System.out.println("key :" +key+"'s num "+this.keyVector.get(i).getKeyNum()+ " ,please check program");
return false;
}else{
this.keyVector.get(i).minus();
return true;
}

}

}

return false;
}

}


//此类是模拟reduce
package com.mxq.balance;

import java.util.Vector;

public class reduceContain {

public Vector<keyAndNum> reduceVec=new Vector<keyAndNum>();

//根本key来收集各个key在某个reducer的数量;
public void addKey(int key){
boolean find=false;
int index=-1;

for(int i=0;i<this.reduceVec.size();i++){

if(this.reduceVec.get(i).getKey()==key)
{
find=true;
index=i;

4000
break;
}

}

if(find){
this.reduceVec.get(index).add();
}else
{
keyAndNum kn=new keyAndNum(key,1);
this.reduceVec.add(kn);
}

}

//获取最终key在某个reducer的样本数
public int getResult(int key){
for(int i=0;i<this.reduceVec.size();i++){
if(this.reduceVec.get(i).getKey()==key){
return this.reduceVec.get(i).getKeyNum();
}
}
System.out.println("can not find key");

return 0;
}

}


//保存key的状态或者配置信息;
package com.mxq.balance;

public class keyAndNum {

int key;
int keyNum;

public keyAndNum(int key,int num){
this.key=key;
this.keyNum=num;
}
public int getKey() {
return key;
}
public void setKey(int key) {
this.key = key;
}
public int getKeyNum() {
return keyNum;
}
public void setKeyNum(int keyNum) {
this.keyNum = keyNum;
}

public void add(){
this.keyNum++;
}

public void minus(){
this.keyNum--;
}

}


package com.mxq.balance;

import java.util.Random;
import java.util.Vector;

public class testPartition {
//配置reducer数量
public int reducerNum=5;
//保存生成的样本数据;
public Vector<Integer> randomData=new Vector<Integer>();
//保存reducer容器的相关状态;
public Vector<keyToReducer> krVec=new Vector<keyToReducer>(reducerNum);

public static void main(String[] args) {
// TODO Auto-generated method stub

Random random=new Random();

int loop=0;

UNBalanceData unbd=new UNBalanceData();
unbd.intialize();
loop=unbd.getLoop();

testPartition tp=new testPartition();
tp.initSampleVec(loop);
tp.initReducer();

int i=0;
while(i<loop){
//System.out.println(random.nextInt(10));
int key=random.nextInt(10)+1;
if(unbd.minus(key))
{
tp.addData(key);
i++;
}

}

for(int h=0;h<tp.randomData.size();h++){

int rid=random.nextInt(tp.reducerNum);
int key=tp.randomData.get(h);
tp.addKeyToReducer(rid, key);

}

tp.statistic();

}

public void initSampleVec(int loop){
this.randomData=new Vector<Integer>(loop);
}
//将key添加样本容器当中;
public void addData(int key){
this.randomData.add(new Integer(key));
}

public void initReducer(){

for(int i=0;i<reducerNum;i++){
keyToReducer kr=new keyToReducer(i);
this.krVec.add(kr);
}

}
//根据partitionid 和Key添加到不同的reducer中
public void addKeyToReducer(int rid,int key){
for(int i=0;i<this.krVec.size();i++){
if(this.krVec.get(i).getRid()==rid){
this.krVec.get(i).getRc().addKey(key);
break;
}
}
}

//统计最终结果;
public void statistic(){
int sum=0;

for(int i=0;i<10;i++){
sum=0;
for(int h=0;h<this.krVec.size();h++){
sum+=this.krVec.get(h).getRc().getResult(i+1);
}
System.out.println("key: "+(i+1)+" has "+sum+" object");
for(int h=0;h<this.krVec.size();h++){
int num=this.krVec.get(h).getRc().getResult(i+1);
System.out.println("in reducer id:"+h+" has key:"+(i+1)+" by "+(float)num/(float)sum);

}

System.out.println();
System.out.println();
}
}

//为每个reducer 配一个ID;
public class keyToReducer{
private int rid;
private reduceContain rc;

public keyToReducer(int id){
this.rid=id;
this.rc=new reduceContain();
}

public int getRid() {
return rid;
}

public void setRid(int rid) {
this.rid = rid;
}

public reduceContain getRc() {
return rc;
}

public void setRc(reduceContain rc) {
this.rc = rc;
}
}

}


测试结果;

其中key的种类固定只有10种;

reducer的数量可配;

reducer数量:5

//key样本信息
keyAndNum kn=new keyAndNum(1,50);
this.keyVector.add(kn);

kn=new keyAndNum(2,100);
this.keyVector.add(kn);

kn=new keyAndNum(3,80);
this.keyVector.add(kn);

kn=new keyAndNum(4,1000020);
this.keyVector.add(kn);

kn=new keyAndNum(5,10050);
this.keyVector.add(kn);

kn=new keyAndNum(6,200);
this.keyVector.add(kn);

kn=new keyAndNum(7,100050);
this.keyVector.add(kn);

kn=new keyAndNum(8,250);
this.keyVector.add(kn);

kn=new keyAndNum(9,140);
this.keyVector.add(kn);

kn=new keyAndNum(10,90);
this.keyVector.add(kn);


//测试结果
key: 1 has 50 object
in reducer id:0 has key:1 by 0.18
in reducer id:1 has key:1 by 0.2
in reducer id:2 has key:1 by 0.2
in reducer id:3 has key:1 by 0.24
in reducer id:4 has key:1 by 0.18

key: 2 has 100 object
in reducer id:0 has key:2 by 0.19
in reducer id:1 has key:2 by 0.18
in reducer id:2 has key:2 by 0.2
in reducer id:3 has key:2 by 0.25
in reducer id:4 has key:2 by 0.18

key: 3 has 80 object
in reducer id:0 has key:3 by 0.2375
in reducer id:1 has key:3 by 0.225
in reducer id:2 has key:3 by 0.2
in reducer id:3 has key:3 by 0.2375
in reducer id:4 has key:3 by 0.1

key: 4 has 1000020 object
in reducer id:0 has key:4 by 0.19913201
in reducer id:1 has key:4 by 0.200015
in reducer id:2 has key:
f19c
4 by 0.199976
in reducer id:3 has key:4 by 0.19990501
in reducer id:4 has key:4 by 0.20097198

key: 5 has 10050 object
in reducer id:0 has key:5 by 0.19641791
in reducer id:1 has key:5 by 0.20199005
in reducer id:2 has key:5 by 0.1999005
in reducer id:3 has key:5 by 0.20109452
in reducer id:4 has key:5 by 0.20059702

key: 6 has 200 object
in reducer id:0 has key:6 by 0.17
in reducer id:1 has key:6 by 0.205
in reducer id:2 has key:6 by 0.21
in reducer id:3 has key:6 by 0.19
in reducer id:4 has key:6 by 0.225

key: 7 has 100050 object
in reducer id:0 has key:7 by 0.2012194
in reducer id:1 has key:7 by 0.19902049
in reducer id:2 has key:7 by 0.20127936
in reducer id:3 has key:7 by 0.19898051
in reducer id:4 has key:7 by 0.19950025

key: 8 has 250 object
in reducer id:0 has key:8 by 0.18
in reducer id:1 has key:8 by 0.208
in reducer id:2 has key:8 by 0.212
in reducer id:3 has key:8 by 0.192
in reducer id:4 has key:8 by 0.208

key: 9 has 140 object
in reducer id:0 has key:9 by 0.20714286
in reducer id:1 has key:9 by 0.23571429
in reducer id:2 has key:9 by 0.19285715
in reducer id:3 has key:9 by 0.16428572
in reducer id:4 has key:9 by 0.2

key: 10 has 90 object
in reducer id:0 has key:10 by 0.2
in reducer id:1 has key:10 by 0.17777778
in reducer id:2 has key:10 by 0.22222222
in reducer id:3 has key:10 by 0.21111111
in reducer id:4 has key:10 by 0.18888889


reducer数量:10

key: 1 has 50 object
in reducer id:0 has key:1 by 0.1
in reducer id:1 has key:1 by 0.1
in reducer id:2 has key:1 by 0.14
in reducer id:3 has key:1 by 0.06
in reducer id:4 has key:1 by 0.16
in reducer id:5 has key:1 by 0.06
in reducer id:6 has key:1 by 0.02
in reducer id:7 has key:1 by 0.12
in reducer id:8 has key:1 by 0.12
in reducer id:9 has key:1 by 0.12

key: 2 has 100 object
in reducer id:0 has key:2 by 0.1
in reducer id:1 has key:2 by 0.08
in reducer id:2 has key:2 by 0.11
in reducer id:3 has key:2 by 0.11
in reducer id:4 has key:2 by 0.1
in reducer id:5 has key:2 by 0.12
in reducer id:6 has key:2 by 0.13
in reducer id:7 has key:2 by 0.07
in reducer id:8 has key:2 by 0.1
in reducer id:9 has key:2 by 0.08

key: 3 has 80 object
in reducer id:0 has key:3 by 0.0375
in reducer id:1 has key:3 by 0.0875
in reducer id:2 has key:3 by 0.075
in reducer id:3 has key:3 by 0.125
in reducer id:4 has key:3 by 0.0625
in reducer id:5 has key:3 by 0.1625
in reducer id:6 has key:3 by 0.1125
in reducer id:7 has key:3 by 0.1
in reducer id:8 has key:3 by 0.1375
in reducer id:9 has key:3 by 0.1

key: 4 has 1000020 object
in reducer id:0 has key:4 by 0.099913
in reducer id:1 has key:4 by 0.100039
in reducer id:2 has key:4 by 0.09953101
in reducer id:3 has key:4 by 0.099645
in reducer id:4 has key:4 by 0.100186
in reducer id:5 has key:4 by 0.099819005
in reducer id:6 has key:4 by 0.100185
in reducer id:7 has key:4 by 0.100311995
in reducer id:8 has key:4 by 0.100187995
in reducer id:9 has key:4 by 0.100182

key: 5 has 10050 object
in reducer id:0 has key:5 by 0.100497514
in reducer id:1 has key:5 by 0.10179105
in reducer id:2 has key:5 by 0.10597015
in reducer id:3 has key:5 by 0.09960199
in reducer id:4 has key:5 by 0.094228856
in reducer id:5 has key:5 by 0.10079602
in reducer id:6 has key:5 by 0.10258707
in reducer id:7 has key:5 by 0.09860697
in reducer id:8 has key:5 by 0.09542289
in reducer id:9 has key:5 by 0.100497514

key: 6 has 200 object
in reducer id:0 has key:6 by 0.11
in reducer id:1 has key:6 by 0.095
in reducer id:2 has key:6 by 0.08
in reducer id:3 has key:6 by 0.105
in reducer id:4 has key:6 by 0.1
in reducer id:5 has key:6 by 0.09
in reducer id:6 has key:6 by 0.1
in reducer id:7 has key:6 by 0.105
in reducer id:8 has key:6 by 0.125
in reducer id:9 has key:6 by 0.09

key: 7 has 100050 object
in reducer id:0 has key:7 by 0.10067966
in reducer id:1 has key:7 by 0.10117941
in reducer id:2 has key:7 by 0.099550225
in reducer id:3 has key:7 by 0.097891055
in reducer id:4 has key:7 by 0.10011994
in reducer id:5 has key:7 by 0.10177911
in reducer id:6 has key:7 by 0.10029985
in reducer id:7 has key:7 by 0.09991004
in reducer id:8 has key:7 by 0.09992004
in reducer id:9 has key:7 by 0.09867066

key: 8 has 250 object
in reducer id:0 has key:8 by 0.108
in reducer id:1 has key:8 by 0.06
in reducer id:2 has key:8 by 0.112
in reducer id:3 has key:8 by 0.12
in reducer id:4 has key:8 by 0.108
in reducer id:5 has key:8 by 0.092
in reducer id:6 has key:8 by 0.084
in reducer id:7 has key:8 by 0.132
in reducer id:8 has key:8 by 0.112
in reducer id:9 has key:8 by 0.072

key: 9 has 140 object
in reducer id:0 has key:9 by 0.07857143
in reducer id:1 has key:9 by 0.1
in reducer id:2 has key:9 by 0.10714286
in reducer id:3 has key:9 by 0.14285715
in reducer id:4 has key:9 by 0.08571429
in reducer id:5 has key:9 by 0.12142857
in reducer id:6 has key:9 by 0.092857145
in reducer id:7 has key:9 by 0.057142857
in reducer id:8 has key:9 by 0.08571429
in reducer id:9 has key:9 by 0.12857144

key: 10 has 90 object
in reducer id:0 has key:10 by 0.1
in reducer id:1 has key:10 by 0.08888889
in reducer id:2 has key:10 by 0.06666667
in reducer id:3 has key:10 by 0.07777778
in reducer id:4 has key:10 by 0.1
in reducer id:5 has key:10 by 0.16666667
in reducer id:6 has key:10 by 0.11111111
in reducer id:7 has key:10 by 0.13333334
in reducer id:8 has key:10 by 0.08888889
in reducer id:9 has key:10 by 0.06666667


总结

根据配置信息,key 4有1000020 个记录,是样本量最多的key,但测试结果显示key 4在各个reducer上是分布均匀的
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息