您的位置：首页 > 运维架构

Hadoop学习笔记—12.MapReduce中的常见算法

2015-02-25 18:18 267 查看

一、MapReduce中有哪些常见算法

　　（1）经典之王：单词计数

　　　　这个是MapReduce的经典案例，经典得不能再经典了！

package algorithm;

import java.net.URI;

import mapreduce.MyWordCountJob.MyMapper;
import mapreduce.MyWordCountJob.MyReducer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MyMaxNumJob extends Configured implements Tool {

/**
* @author Edison Chou
* @version 1.0
*/
public static class MyMapper extends
Mapper<LongWritable, Text, LongWritable, NullWritable> {
long max = Long.MIN_VALUE;

protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, LongWritable, NullWritable>.Context context)
throws java.io.IOException, InterruptedException {
long temp = Long.parseLong(value.toString().trim());
if (temp > max) {
max = temp;
}
};

protected void cleanup(
org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, LongWritable, NullWritable>.Context context)
throws java.io.IOException, InterruptedException {
context.write(new LongWritable(max), NullWritable.get());
};
}

/**
* @author Edison Chou
* @version 1.0
*/
public static class MyReducer extends
Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
long max = Long.MIN_VALUE;

protected void reduce(
LongWritable key,
java.lang.Iterable<NullWritable> values,
Reducer<LongWritable, NullWritable, LongWritable, NullWritable>.Context context)
throws java.io.IOException, InterruptedException {
long temp = key.get();
if (temp > max) {
max = temp;
}
};

protected void cleanup(
org.apache.hadoop.mapreduce.Reducer<LongWritable, NullWritable, LongWritable, NullWritable>.Context context)
throws java.io.IOException, InterruptedException {
context.write(new LongWritable(max), NullWritable.get());
};

}

// 输入文件路径
public static String INPUT_PATH = "hdfs://hadoop-master:9000/testdir/input/seq100w.txt";
// 输出文件路径
public static String OUTPUT_PATH = "hdfs://hadoop-master:9000/testdir/output/topkapp";

@Override
public int run(String[] args) throws Exception {
// 首先删除输出路径的已有生成文件
FileSystem fs = FileSystem.get(new URI(INPUT_PATH), getConf());
Path outPath = new Path(OUTPUT_PATH);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}

Job job = new Job(getConf(), "MaxNumberJob");
// 设置输入目录
FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
// 设置自定义Mapper
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(NullWritable.class);
// 设置自定义Reducer
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(NullWritable.class);
// 设置输出目录
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

System.exit(job.waitForCompletion(true) ? 0 : 1);
return 0;
}

public static void main(String[] args) {
Configuration conf = new Configuration();
// map端输出启用压缩
conf.setBoolean("mapred.compress.map.output", true);
// reduce端输出启用压缩
conf.setBoolean("mapred.output.compress", true);
// reduce端输出压缩使用的类
conf.setClass("mapred.output.compression.codec", GzipCodec.class,
CompressionCodec.class);

try {
int res = ToolRunner.run(conf, new MyMaxNumJob(), args);
System.exit(res);
} catch (Exception e) {
e.printStackTrace();
}
}
}

View Code

3.3 查看实现效果

　　可以看出，我们的程序已经求出了最大值：32767。虽然例子很简单，业务也很简单，但是我们引入了分布式计算的思想，将MapReduce应用在了最值问题之中，就是一个进步了！

参考资料

（1）吴超，《深入浅出Hadoop》：http://www.superwu.cn/

（2）Suddenly，《Hadoop日记Day18-MapReduce排序和分组》：/content/5045604.html

（3）chenssy，《Java提高篇（27）—TreeMap》：/article/1384909.html

作者：周旭龙

出处：http://edisonchou.cnblogs.com/

本文版权归作者和博客园共有，欢迎转载，但未经作者同意必须保留此段声明，且在文章页面明显位置给出原文链接。

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航