TF-IDF简单实现
2018-01-30 14:11
78 查看
输入:三个文件分别如下,并放在c文件夹下 xm@master:~/workspace$ hadoop fs -text /c/file1 MapReduce is simple xm@master:~/workspace$ hadoop fs -text /c/file2 MapReduce is powerful is simple xm@master:~/workspace$ hadoop fs -text /c/file3 Hello MapReduce bye MapReduce
输出: Hello | file3 | 0.11928031367991561 MapReduce | file3 | 0.0| file2 | 0.0| file1 | 0.0 bye | file3 | 0.11928031367991561 is | file2 | 0.0704365036222725| file1 | 0.058697086351893746 powerful | file2 | 0.09542425094393249 simple | file2 | 0.03521825181113625| file1 | 0.058697086351893746
实现原理请参照上一个博客
// Implementation code:
package Inverted;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Single-job MapReduce program that computes a TF-IDF score for every
 * (word, file) pair found under {@link #INPUT_PATH}.
 *
 * <p>Data flow:
 * <ol>
 *   <li>{@link Map} emits {@code word -> "file:wordsInLine:1"} for each token.</li>
 *   <li>{@link Combine} sums the trailing count for identical
 *       {@code "file:wordsInLine"} prefixes; it keeps the record format unchanged.</li>
 *   <li>{@link Reduce} sums again, then writes
 *       {@code word -> "| file | tf*idf| file | tf*idf ..."}.</li>
 * </ol>
 *
 * <p>NOTE(review): the "total words" figure is computed per input record (one line
 * with the default text input format), not per file. The scores therefore match
 * the textbook TF-IDF only when each input file consists of a single line, as in
 * the sample data. Supporting multi-line files needs a separate per-file word
 * count - TODO.
 */
public class InvertedIndex {

    static String INPUT_PATH = "hdfs://master:9000/c";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    /** Configuration key under which the driver publishes the number of input files. */
    private static final String FILE_COUNT_KEY = "tfidf.input.file.count";

    /**
     * Sums occurrence counts of records shaped {@code "file:wordsInLine:count"},
     * grouped by their {@code "file:wordsInLine"} prefix.
     *
     * <p>The return type is spelled {@code LinkedHashMap} because the simple name
     * {@code Map} is shadowed by the nested mapper class of this file.
     *
     * @param values records received for one word
     * @return prefix to summed count, in first-seen order
     */
    private static LinkedHashMap<String, Long> sumCountsByFile(Iterable<Text> values) {
        LinkedHashMap<String, Long> counts = new LinkedHashMap<String, Long>();
        for (Text value : values) {
            // Copy to a String straight away rather than holding on to the Text instance.
            String record = value.toString();
            int cut = record.lastIndexOf(':');
            String fileAndTotal = record.substring(0, cut);
            long occurrences = Long.parseLong(record.substring(cut + 1));
            Long previous = counts.get(fileAndTotal);
            counts.put(fileAndTotal, previous == null ? occurrences : previous + occurrences);
        }
        return counts;
    }

    /**
     * Counts the regular files directly under {@code inputPath}, skipping names that
     * start with {@code _} or {@code .}.
     *
     * @param inputPath job input directory (or a single file)
     * @param conf      configuration used to resolve the file system
     * @return number of input files
     * @throws IOException if the listing fails
     */
    private static int countInputFiles(Path inputPath, Configuration conf) throws IOException {
        FileSystem fs = inputPath.getFileSystem(conf);
        int count = 0;
        for (FileStatus status : fs.listStatus(inputPath)) {
            String name = status.getPath().getName();
            if (status.isFile() && !name.startsWith("_") && !name.startsWith(".")) {
                count++;
            }
        }
        return count;
    }

    /**
     * Emits one {@code word -> "file:wordsInLine:1"} record per token of the input value.
     */
    static class Map extends Mapper<Object, Object, Text, Text> {
        private final Text keyInfo = new Text();
        private final Text valueInfo = new Text();
        private String fileName = "";

        /** Remembers the name of the file this task's split belongs to. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            // Use the last path component rather than searching the full path
            // string for the literal "file".
            fileName = split.getPath().getName();
        }

        /**
         * @param key     ignored
         * @param value   the text to tokenize on whitespace
         * @param context sink for the emitted records
         */
        @Override
        protected void map(Object key, Object value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Number of tokens in this record; used downstream as the TF denominator.
            int wordSum = new StringTokenizer(line).countTokens();
            valueInfo.set(fileName + ":" + wordSum + ":1");

            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                keyInfo.set(itr.nextToken());
                context.write(keyInfo, valueInfo);
            }
        }
    }

    /**
     * Combiner: pre-aggregates counts per {@code "file:wordsInLine"} for one word.
     *
     * <p>Input and output share the same key and the same record format, so the
     * result of the job does not depend on how many times (if at all) the framework
     * applies this class.
     */
    static class Combine extends Reducer<Text, Text, Text, Text> {
        private final Text info = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // java.util.Map is written out in full: "Map" alone is the nested mapper class.
            for (java.util.Map.Entry<String, Long> entry : sumCountsByFile(values).entrySet()) {
                info.set(entry.getKey() + ":" + entry.getValue());
                context.write(key, info);
            }
        }
    }

    /**
     * Computes {@code tf * idf} for every file a word occurs in and writes them as a
     * single {@code "| file | score"} list per word.
     */
    static class Reduce extends Reducer<Text, Text, Text, Text> {
        private final Text result = new Text();
        private int fileCount;

        /** Reads the corpus size published by the driver. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            fileCount = context.getConfiguration().getInt(FILE_COUNT_KEY, 0);
            if (fileCount <= 0) {
                throw new IllegalStateException(
                        "Missing or invalid configuration value " + FILE_COUNT_KEY + ": " + fileCount);
            }
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            LinkedHashMap<String, Long> counts = sumCountsByFile(values);

            // Document frequency = number of distinct "file:wordsInLine" groups for this word.
            double idf = Math.log10((double) fileCount / counts.size());

            StringBuilder fileList = new StringBuilder();
            for (java.util.Map.Entry<String, Long> entry : counts.entrySet()) {
                String fileAndTotal = entry.getKey();
                int cut = fileAndTotal.lastIndexOf(':');
                String file = fileAndTotal.substring(0, cut);
                double totalWords = Double.parseDouble(fileAndTotal.substring(cut + 1));
                double tf = entry.getValue() / totalWords;
                fileList.append("| ").append(file).append(" | ").append(tf * idf);
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    /**
     * Job driver: clears the output directory, publishes the input file count and
     * runs the job. The process exit status is 0 when the job succeeds, 1 otherwise.
     *
     * @param args unused
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Must be set before any FileSystem or Job is created from conf.
        // "fs.default.name" is the legacy name of "fs.defaultFS".
        conf.set("fs.default.name", "hdfs://master:9000/");

        Path outputpath = new Path(OUTPUT_PATH);
        FileSystem fs = outputpath.getFileSystem(conf);
        if (fs.exists(outputpath)) {
            fs.delete(outputpath, true);
        }

        int inputFiles = countInputFiles(new Path(INPUT_PATH), conf);
        if (inputFiles == 0) {
            throw new IOException("No input files found under " + INPUT_PATH);
        }
        // Job.getInstance copies conf, so the value has to be in place beforehand.
        conf.setInt(FILE_COUNT_KEY, inputFiles);

        Job job = Job.getInstance(conf);
        job.setJarByClass(InvertedIndex.class);

        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);

        job.setMapperClass(Map.class);
        job.setCombinerClass(Combine.class);
        job.setReducerClass(Reduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
相关文章推荐
- word2Vec--(1) nltk实现简单的切词,情感分析,文本相似度(TF-IDF)
- 用C++实现简单的文件I/O操作(ifstream,ofstream)
- 简单标签实现if和if-else代码
- IfFileExists(文件存在)+Goto实现简单跳转
- NSIS:IfFileExists+Goto实现简单跳转
- 简单的if else 实现权限控制
- 用C++实现简单的文件I/O操作(ifstream,ofstream)
- 简单的TFIDF算法实现Java代码
- 使用vue.js 在移动端简单实现的下拉加载更多 和一些常用的js/jq操作和vueFilter,v-if和v-show运用
- 标简单标签的实现f和 if-else
- 微信小程序 —— 瀑布流简单写法(css3属性加wx:if判断轻松实现)
- 使用if-else实现简单的登录注销(学习笔记)
- python 利用sklearn自带的模块 快速简单实现文章的 tfidf向量空间的表示
- 简单实现根据Td-idf实现语句相似度
- matlab实现简单的if else 的语句
- C语言实现简单的倒排文件索引,TF-IDF
- 最简单的Splash Screen在Android中的实现
- 简单js组件、库实现
- 自己实现简单的天气预报应用(1)
- 用python实现简单的遗传算法