您的位置:首页 > 其它

TF-IDF简单实现

2018-01-30 14:11 78 查看
输入:三个文件分别如下,并放在c文件夹下

xm@master:~/workspace$ hadoop fs -text /c/file1
MapReduce is simple
xm@master:~/workspace$ hadoop fs -text /c/file2
MapReduce is powerful is simple
xm@master:~/workspace$ hadoop fs -text /c/file3
Hello MapReduce bye MapReduce


输出:

Hello   | file3 | 0.11928031367991561
MapReduce   | file3 | 0.0| file2 | 0.0| file1 | 0.0
bye | file3 | 0.11928031367991561
is  | file2 | 0.0704365036222725| file1 | 0.058697086351893746
powerful    | file2 | 0.09542425094393249
simple  | file2 | 0.03521825181113625| file1 | 0.058697086351893746


实现原理请参照上一个博客


实现代码:

package Inverted;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class InvertedIndex {

// HDFS directory passed to FileInputFormat as the job input.
static String INPUT_PATH = "hdfs://master:9000/c";
// HDFS directory the job writes to; main deletes it first if it already exists.
static String OUTPUT_PATH = "hdfs://master:9000/output";
// Incremented once per Map.setup call and read by Reduce as N in log10(N / filesContainingWord).
// NOTE(review): being a static, it is presumably only shared when mappers and the
// reducer run in the same JVM - confirm how the job is launched.
private static double file_num = 0;
// Set by Map.map to the number of tokens in the map input value currently being processed.
private static int word_sum = 0;

//key=单词名:所在文件名:文件中单词总数_ value=1 实现单词计数    //求得总文件数file_num
static class Map extends Mapper<Object,Object,Text,Text>{

private Text keyInfo = new Text();
private Text valueInfo = new Text();
private FileSplit split;

String k = "";

//求得总文件数file_num
protected void setup(Context context) throws IOException, InterruptedException{
FileSplit fs = (FileSplit) context.getInputSplit();
k = fs.getPath().getName();
file_num = file_num+1;
}

//key=单词名:所在文件名:文件中单词总数_ value=1 实现单词计数
protected void map(Object key, Object value, Context context) throws IOException, InterruptedException{

//求文件中单词总数
StringTokenizer itr2 = new StringTokenizer(value.toString());
word_sum = 0;
while(itr2.hasMoreElements()){
itr2.nextToken();
word_sum++;
}

split = (FileSplit)context.getInputSplit();
StringTokenizer itr = new StringTokenizer(value.toString());
while(itr.hasMoreTokens()){

int splitIndex = split.getPath().toString().indexOf("file");
//key=单词名:所在文件名:文件中单词总数_ value=1 实现单词计数
keyInfo.set(itr.nextToken()+":"+spli
4000
t.getPath().toString().substring(splitIndex));
keyInfo.set(keyInfo.toString()+":"+word_sum);
//value--->1
valueInfo.set("1");
context.write(keyInfo, valueInfo);
}
}
}

//写入key=单词名 value=文件名:词频
static class Combine  extends Reducer<Text,Text,Text,Text>{

private Text info = new Text();

protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException{

//求得单词出现总数
double sum=0;
for(Text value:values){
sum+=Integer.parseInt(value.toString());
}

//求得文件中单词数/文件单词总数
String arr[] = key.toString().split(":");
double a = Double.parseDouble(arr[2]);
double b = sum/a;
//          String result = String .format("%.2f",b);
info.set(arr[1]+":"+b);
key.set(arr[0]);

//写入key=单词名 value=文件名:词频
context.write(key, info);
}
}

//两次遍历values,第一遍取得该单词出现的文件数,第二遍求得IF-IDF
static class Reduce  extends Reducer<Text,Text,Text,Text>{

private Text result = new Text();
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException{

//The number of words exist in files
double file_sum = 0;
String fileList = new String();

String []ar = new String[10];
double []dr = new double[10];

int i=0;
for(Text value:values){
file_sum++;
String[] arr2 = value.toString().split(":");
double c = Double.parseDouble(arr2[1]);
ar[i] = arr2[0];
dr[i] = c;
i++;
}

for(int y=0;y<ar.length;y++){
if(ar[y]!=null&&dr[y]!=0){
dr[i] = dr[i]*Math.log10(file_num/file_sum);
fileList = fileList+"| "+ar[y]+" | "+dr[y]*Math.log10(file_num/file_sum);
}
}

result.set(fileList);
context.write(key, result);
}
}

/**
 * Configures and runs the TF-IDF job: clears OUTPUT_PATH, wires Map, Combine
 * and Reduce with Text keys and values, reads INPUT_PATH, and exits with
 * status 0 on success or 1 on failure.
 *
 * @param args command-line arguments (not used)
 * @throws IOException            if the file system or job submission fails
 * @throws ClassNotFoundException if a job class cannot be loaded
 * @throws InterruptedException   if waiting for the job is interrupted
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    Configuration conf = new Configuration();
    //The key previously carried a trailing space ("fs.default.name "), so it
    //never took effect; it is also set before any FileSystem is resolved now.
    //NOTE(review): newer Hadoop releases name this property fs.defaultFS.
    conf.set("fs.default.name", "hdfs://master:9000/");

    //Remove output of a previous run, since the job refuses to overwrite it.
    Path outputpath = new Path(OUTPUT_PATH);
    FileSystem fs = outputpath.getFileSystem(conf);
    if(fs.exists(outputpath)){
        fs.delete(outputpath,true);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(InvertedIndex.class);

    FileInputFormat.setInputPaths(job, INPUT_PATH);
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    //Propagate job failure to the caller through the process exit status.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  TF-IDF简单实现