您的位置:首页 > 运维架构

Hadoop 多表 join:map side join 范例

2014-03-04 10:46 92 查看
xt value, Context context)


076
throws

IOException, InterruptedException {
077
078
kv = value.toString().split(
"\t"
);
079
// map join:在map阶段过滤掉不需要的数据
080
if

(userMap.containsKey(kv[
0
]) && sexMap.containsKey(kv[
1
])) {
081
oKey.set(userMap.get(kv[
0
]) +
"\t"
+ sexMap.get(kv[
1
]));
082
oValue.set(
"1"
);
083
context.write(oKey, oValue);
084
}
085
}
086
087
}
088
089
public

static
class
Reduce
extends
Reducer<Text, Text, Text, Text> {
090
091
private

Text oValue =
new
Text();
092
093
public

void
reduce(Text key, Iterable<Text> values, Context context)
094
throws

IOException, InterruptedException {
095
int

sumCount =
0
;
096
097
for

(Text val :values) {
098
sumCount += Integer.parseInt(val.toString());
099
}
100
oValue.set(String.valueOf(sumCount));
101
context.write(key, oValue);
102
}
103
104
}
105
106
public

int
run(String[] args)
throws

Exception {
107
Job job =
new
Job(getConf(),

"MultiTableJoin"
);
108
109
job.setJobName(
"MultiTableJoin"
);
110
job.setJarByClass(MultiTableJoin.
class
);
111
job.setMapperClass(MapClass.
class
);
112
job.setReducerClass(Reduce.
class
);
113
114
job.setInputFormatClass(TextInputFormat.
class
);
115
job.setOutputFormatClass(TextOutputFormat.
class
);
116
117
job.setOutputKeyClass(Text.
class
);
118
job.setOutputValueClass(Text.
class
);
119
120
String[] otherArgs =
new
GenericOptionsParser(job.getConfiguration(),
121
args).getRemainingArgs();
122
123
// 我们把第1、2个参数的地址作为要缓存的文件路径
124
DistributedCache.addCacheFile(
new

Path(otherArgs[
1
]).toUri(), job
125
.getConfiguration());
126
DistributedCache.addCacheFile(
new

Path(otherArgs[
2
]).toUri(), job
127
.getConfiguration());
128
129
FileInputFormat.addInputPath(job,
new
Path(otherArgs[
3
]));
130
FileOutputFormat.setOutputPath(job,
new
Path(otherArgs[
4
]));
131
132
return

job.waitForCompletion(
true
) ?
0
:
1
;
133
}
134
135
public

static
void
main(String[] args)
throws
Exception {
136
int

res = ToolRunner.run(
new
Configuration(),
new
MultiTableJoin(),
137
args);
138
System.exit(res);
139
}
140
141
}
运行命令:

hadoop jar MultiTableJoin.jar MultiTableJoin /test/decli/sex /test/decli/user /test/decli/login /test/decli/output
4、结果:

运行结果:
root@master 192.168.120.236 02:47:18 ~/test/table >

hadoop fs -cat /test/decli/output/*|column -t

cat:File does not exist:/test/decli/output/_logs

张三 男 4

李四 男 2

王五 女 2

赵六 女 2

root@master 192.168.120.236 02:47:26 ~/test/table >

TIPS: 转自 http://my.oschina.net/leejun2005/blog/111963
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: