Hadoop学习笔记—12.MapReduce中的常见算法
2015-02-25 18:18
267 查看
一、MapReduce中有哪些常见算法
(1)经典之王:单词计数
这个是MapReduce的经典案例,经典的不能再经典了!

package algorithm;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce job that finds the maximum {@code long} value in a text input
 * where each line holds one number.
 *
 * <p>Each mapper tracks the maximum of its own split and emits that single
 * value from {@link Mapper#cleanup}; the reducer then takes the maximum over
 * those per-split maxima and emits the global result once from its own
 * {@code cleanup}.
 *
 * @author Edison Chou
 * @version 1.0
 */
public class MyMaxNumJob extends Configured implements Tool {

    /**
     * Mapper: keeps a running maximum of the numbers in its split and emits
     * exactly one (max, NullWritable) record in cleanup().
     */
    public static class MyMapper extends
            Mapper<LongWritable, Text, LongWritable, NullWritable> {

        // Running maximum for this mapper's split.
        long max = Long.MIN_VALUE;

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            // Skip blank lines instead of crashing with NumberFormatException.
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return;
            }
            long temp = Long.parseLong(line);
            if (temp > max) {
                max = temp;
            }
        }

        @Override
        protected void cleanup(Context context)
                throws java.io.IOException, InterruptedException {
            // NOTE(review): if a split contains no parsable number this still
            // emits Long.MIN_VALUE; harmless unless ALL input is empty.
            context.write(new LongWritable(max), NullWritable.get());
        }
    }

    /**
     * Reducer: receives the per-mapper maxima as keys, tracks the largest,
     * and emits the global maximum once in cleanup().
     */
    public static class MyReducer extends
            Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

        // Global maximum over all mapper outputs.
        long max = Long.MIN_VALUE;

        @Override
        protected void reduce(LongWritable key, Iterable<NullWritable> values,
                Context context)
                throws java.io.IOException, InterruptedException {
            long temp = key.get();
            if (temp > max) {
                max = temp;
            }
        }

        @Override
        protected void cleanup(Context context)
                throws java.io.IOException, InterruptedException {
            context.write(new LongWritable(max), NullWritable.get());
        }
    }

    // HDFS input file: one number per line.
    public static String INPUT_PATH = "hdfs://hadoop-master:9000/testdir/input/seq100w.txt";
    // HDFS output directory (deleted and recreated on every run).
    public static String OUTPUT_PATH = "hdfs://hadoop-master:9000/testdir/output/topkapp";

    /**
     * Configures and runs the job.
     *
     * @param args unused command-line arguments
     * @return 0 on success, 1 on failure (consumed by ToolRunner/main)
     * @throws Exception on HDFS or job-submission errors
     */
    @Override
    public int run(String[] args) throws Exception {
        // Remove any previous output so the job can be rerun.
        FileSystem fs = FileSystem.get(new URI(INPUT_PATH), getConf());
        Path outPath = new Path(OUTPUT_PATH);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        Job job = new Job(getConf(), "MaxNumberJob");

        // Input
        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));

        // Custom Mapper
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Custom Reducer
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Output
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        // BUG FIX: the original called System.exit() here, which kills the JVM
        // from inside ToolRunner.run() and made the trailing "return 0"
        // unreachable in practice. Return the exit code instead; main() is
        // the one place that calls System.exit().
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Old-style (pre-"mapreduce.*") property names, kept for the Hadoop
        // version this article targets.
        conf.setBoolean("mapred.compress.map.output", true); // compress map output
        conf.setBoolean("mapred.output.compress", true);     // compress job output
        conf.setClass("mapred.output.compression.codec", GzipCodec.class,
                CompressionCodec.class);                     // codec for job output
        try {
            int res = ToolRunner.run(conf, new MyMaxNumJob(), args);
            System.exit(res);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
View Code
3.3 查看实现效果
可以看出,我们的程序已经求出了最大值:32767。虽然例子很简单,业务也很简单,但是我们引入了分布式计算的思想,将MapReduce应用在了最值问题之中,就是一个进步了!
参考资料
(1)吴超,《深入浅出Hadoop》:http://www.superwu.cn/
(2)Suddenly,《Hadoop日记Day18-MapReduce排序和分组》:/content/5045604.html
(3)chenssy,《Java提高篇(27)—TreeMap》:/article/1384909.html
作者:周旭龙
出处:http://edisonchou.cnblogs.com/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文链接。
相关文章推荐
- Hadoop学习笔记—12.MapReduce中的常见算法
- Hadoop学习笔记—12.MapReduce中的常见算法
- Hadoop学习笔记—12.MapReduce中的常见算法
- 数据结构学习笔记 --- 线性表 (一些常见的关于链表的算法和面试题)
- 个人Hadoop实验决策树算法学习笔记
- Hadoop学习笔记二 - kNN算法实现用户风险分类
- 常见算法学习笔记
- Hadoop 学习笔记十 常见问题汇总
- hadoop2.5.2学习及实践笔记(五)—— HDFS shell命令行常见操作
- 数据结构学习笔记 --- 线性表 (一些常见的关于链表的算法和面试题)
- Hadoop之MapReduce中的常见算法(笔记12)
- 链栈实现算法 - Java 学习笔记(26)
- Hadoop学习笔记二 安装部署
- 每天学习一点flash(35) 游戏设计笔记 (5) 跳跃算法笔记
- Hadoop学习笔记一 简要介绍
- [转]Hadoop学习笔记
- [转]Hadoop学习笔记
- 挖掘闭合模式的高性能算法学习笔记——CFIST
- 基础算法学习笔记(一)----回溯法
- 算法学习笔记--概况