Hadoop MapReduce 统计红楼梦出现次数最多的名字
2016-05-07 19:33
465 查看
package org.bigdata.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Counts how often each character name occurs in "Dream of the Red Chamber".
 *
 * <p>Each input line is segmented with IK Analyzer; tokens that appear in the
 * user dictionary ({@link #dic}) are emitted with a count of one and summed
 * per name by the reducer.
 *
 * @author wwhhf
 */
public class RedHouseMapReduce {

    /** Default HDFS input directory used when no argument is supplied. */
    private static final String DEFAULT_INPUT = "/red_in/";

    /** Default HDFS output directory used when no argument is supplied. */
    private static final String DEFAULT_OUTPUT = "/red_out/";

    /** Names to count, one per line of {@code ik_library/userLibrary.dic}. */
    public static Set<String> dic = new HashSet<>();

    static {
        // NOTE(review): the dictionary is resolved relative to the classpath
        // root, which presumably only exists when running from an unpacked
        // class directory (e.g. local mode) - confirm for cluster deployments.
        URL classpathRoot = RedHouseMapReduce.class.getResource("/");
        if (classpathRoot == null) {
            System.err.println("Classpath root not found; name dictionary is empty");
        } else {
            // Build the path from File components instead of hard-coded
            // backslashes so it also resolves on non-Windows hosts.
            File dictionary = new File(
                    new File(classpathRoot.getFile(), ".."),
                    "ik_library" + File.separator + "userLibrary.dic");
            // IK dictionaries are expected to be UTF-8, so do not rely on the
            // platform default charset.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(dictionary), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    line = line.replaceAll("\\s+", "");
                    if (!line.isEmpty()) {
                        dic.add(line);
                    }
                }
            } catch (IOException e) {
                // Covers FileNotFoundException as well; the job then simply
                // matches no names.
                e.printStackTrace();
            }
        }
    }

    /**
     * Segments {@code text} and keeps only the tokens listed in {@link #dic}.
     *
     * @param text the text to segment
     * @return the dictionary names found, in order of appearance and with
     *         duplicates preserved; empty if segmentation fails
     */
    private static List<String> parse(String text) {
        List<String> words = new ArrayList<String>();
        // The constructor argument already enables smart segmentation.
        try (IKAnalyzer analyzer = new IKAnalyzer(true);
                StringReader reader = new StringReader(text);
                TokenStream ts = analyzer.tokenStream("", reader)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            // The TokenStream contract requires reset() before the first
            // incrementToken() and end() after the last one.
            ts.reset();
            while (ts.incrementToken()) {
                String word = term.toString();
                if (dic.contains(word)) {
                    words.add(word);
                }
            }
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    /**
     * Emits {@code (name, 1)} for every dictionary name found in an input line.
     *
     * @author wwhhf
     */
    private static class RedHouseMapper extends
            Mapper<LongWritable, Text, Text, LongWritable> {

        private static final LongWritable ONE = new LongWritable(1);

        // Reused across records; context.write serializes the value at once.
        private final Text outKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String name : parse(value.toString())) {
                outKey.set(name);
                context.write(outKey, ONE);
            }
        }
    }

    /**
     * Sums the counts emitted for one name.
     *
     * @author wwhhf
     */
    private static class RedHouseReducer extends
            Reducer<Text, LongWritable, Text, LongWritable> {

        private final LongWritable total = new LongWritable();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Context context) throws IOException, InterruptedException {
            // Primitive accumulator avoids boxing on every iteration.
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on
     * failure (including failures while setting the job up).
     *
     * @param args optional: {@code args[0]} input path (default
     *             {@code /red_in/}), {@code args[1]} output path (default
     *             {@code /red_out/})
     */
    public static void main(String[] args) {
        String input = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        int exitCode = 1;
        try {
            Configuration cfg = HadoopCfg.getConfiguration();
            Job job = Job.getInstance(cfg);
            job.setJobName("RedHourse");
            job.setJarByClass(RedHouseMapReduce.class);

            // mapper
            job.setMapperClass(RedHouseMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);

            // Summation is associative, so the reducer doubles as a combiner.
            job.setCombinerClass(RedHouseReducer.class);

            // reducer
            job.setReducerClass(RedHouseReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));

            exitCode = job.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException e) {
            // Restore the interrupt flag before reporting the failure.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IllegalStateException | IllegalArgumentException
                | ClassNotFoundException | IOException e) {
            e.printStackTrace();
        }
        System.exit(exitCode);
    }
}
相关文章推荐
- Centos 7.2 配置防火墙
- linux运维学习决心书
- 系统架构师成长之路(一)
- Linux进程与线程的区别
- IO复用之select poll epoll的总结
- 虚拟机+CentOS内核hack7、8、9、17失败记
- linux设备驱动编程环境的搭建小结(结合LDD3使用)
- 转载 - LINUX下查看CPU使用率的命令
- tomcat 性能优化及压力测试
- 用extundelete恢复误删除数据实战
- 【会议】QCon2016会议整理(一)——工程效率与架构
- linux守护进程
- 第2课:通过案例对SparkStreaming 透彻理解三板斧之二:解密SparkStreaming运行机制和架构
- Linux:信号(上)
- 块设备驱动程序(Linux设备驱动程序)
- Linux中执行shell脚本的4种方法总结
- shell自动监控重启Tomcat脚本
- Linux basic knowledge
- shell script 学习
- shell算术运算