MapReduce: Counting Word Occurrences (WordCount)
1. Basic WordCount
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/*
 * Counts word occurrences. There is no real lexical analysis here, so it is
 * not rigorous ヾ(≧O≦)〃~ : the input is simply split on spaces, and text
 * containing punctuation will give badly skewed results.
 */
public class ForWordCount {

    public static class ForMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text oKey = new Text();
        IntWritable oValue = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString(); // the line of text read in
            String[] strs = line.split(" ");
            for (String s : strs) {
                oKey.set(s);
                context.write(oKey, oValue); // emit a key-value pair for the reducer
            }
        }
    }

    public static class ForReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable oValue = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            oValue.set(sum);
            context.write(key, oValue);
        }
    }

    public static void main(String[] args)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();

        // Set the job's mapper class and the mapper's output key-value types
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Set the job's reducer class and the reducer's output key-value types
        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Configure the input path
        FileInputFormat.addInputPath(job, new Path("E://forTestData//forWordCount"));

        // Configure the output path; if it already exists, delete it first
        FileSystem fileSystem = FileSystem.get(new URI("file:///E:/output"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileOutputFormat.setOutputPath(job, path);

        // Submit the job
        job.waitForCompletion(true);
    }
}
```
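As the comment above admits, splitting on a single space mishandles punctuation and repeated spaces. Below is a minimal standalone sketch of a more tolerant tokenizer (the demo class name and sample text are mine, assuming plain ASCII input); the same `split` call could replace `line.split(" ")` inside `ForMapper.map`:

```java
// Hypothetical demo: split on runs of non-letter characters and lowercase,
// so "Hello, world!" yields "hello" and "world" rather than "Hello," and "world!".
public class TokenizeDemo {
    public static void main(String[] args) {
        String line = "Hello, world! hello   MapReduce.";
        for (String s : line.toLowerCase().split("[^a-z]+")) {
            if (!s.isEmpty()) System.out.println(s); // leading delimiters produce an empty first token
        }
        // prints: hello, world, hello, mapreduce
    }
}
```

For large inputs, calling `job.setCombinerClass(ForReducer.class)` would also cut shuffle traffic, since per-word sums can safely be combined on the map side.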
2. Finding the Most Frequent Word

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

/*
 * Building on the word-count output, find the word that occurs most often.
 * If two words occur the same number of times, prefer the longer one.
 */
public class ForSortWordCount {

    public static class ForMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        Map<String, Integer> map = new HashMap<String, Integer>();
        int maxTimes = 0;

        // The mapper's input is the output of the previous job (word + count)
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strs = line.split("\t");
            String word = strs[0];
            int times = Integer.parseInt(strs[1]);
            if (times > maxTimes) {
                map.clear();
                map.put(word, times);
                maxTimes = times;
            } else if (times == maxTimes && !map.isEmpty()
                    && word.length() > map.keySet().iterator().next().length()) {
                // tie on count: keep the longer word, per the requirement above
                map.clear();
                map.put(word, times);
            }
        }

        /*
         * When the mapper's result should be emitted only once, e.g. for
         * top-one / top-N problems, use the cleanup method. cleanup runs once
         * after map finishes and is typically used for output. Likewise,
         * setup runs once before map starts.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            if (!map.isEmpty()) { // guard against an empty input split
                Map.Entry<String, Integer> entry = map.entrySet().iterator().next();
                context.write(new Text(entry.getKey()), new IntWritable(entry.getValue()));
            }
        }
    }

    public static class ForReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        Map<String, Integer> map = new HashMap<String, Integer>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            map.put(key.toString(), values.iterator().next().get());
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            List<Map.Entry<String, Integer>> list =
                    new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
            Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    // compare boxed Integers with equals(), not ==
                    if (o1.getValue().equals(o2.getValue())) {
                        return o2.getKey().length() - o1.getKey().length();
                    } else {
                        return o2.getValue() - o1.getValue();
                    }
                }
            });
            Map.Entry<String, Integer> entry = list.get(0);
            context.write(new Text(entry.getKey()), new IntWritable(entry.getValue()));
        }
    }

    public static void main(String[] args)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();

        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileSystem fileSystem = FileSystem.get(new URI("file:///E:/output"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path("E://forTestData//forWordCount//forSortWordCount"));
        FileOutputFormat.setOutputPath(job, path);

        // For a global top-N, all map output must be handed to a single reducer
        job.setNumReduceTasks(1);

        job.waitForCompletion(true);
    }
}
```
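The comparator in the reducer's cleanup is easy to get subtly wrong: `==` on boxed `Integer` values only works for small cached values (-128 to 127). Here is a minimal standalone sketch (class name and sample data are mine) that sanity-checks the tie-breaking rule outside Hadoop:

```java
import java.util.*;

// Hypothetical check of ForSortWordCount's ordering: sort by count descending;
// on equal counts, prefer the longer word.
public class TopWordComparatorDemo {
    public static void main(String[] args) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        counts.put("hadoop", 300);
        counts.put("mapreduce", 300); // same count as "hadoop" but longer: should win
        counts.put("hi", 299);

        List<Map.Entry<String, Integer>> list =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                // equals(), not ==: 300 lies outside the Integer cache, so == would be false here
                if (o1.getValue().equals(o2.getValue())) {
                    return o2.getKey().length() - o1.getKey().length();
                }
                return o2.getValue() - o1.getValue();
            }
        });
        System.out.println(list.get(0)); // prints: mapreduce=300
    }
}
```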