您的位置:首页 > 产品设计 > UI/UE

实现TOP K(选做):统计sogou500w中,发关键字次数最多的前20名用户UID和发关键字次数。

2015-09-18 19:27 423 查看
package day0917;

import java.io.IOException;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Top-K MapReduce job (optional exercise) over the "sogou500w" query log:
 * counts how often each value of one tab-separated column occurs and emits
 * the 20 values with the highest counts together with those counts.
 *
 * <p>Approach:
 * <ol>
 *   <li>the mapper emits {@code (value, 1)} for every input line;</li>
 *   <li>the reducer sums the ones per value and keeps only the current best
 *       20 in a sorted set;</li>
 *   <li>the reducer's {@code cleanup()} writes that set once all keys have
 *       been seen.</li>
 * </ol>
 *
 * <p>Because the final selection happens inside a reducer instance, the job is
 * pinned to a single reduce task; with several reducers each one would emit
 * its own local top 20.
 */
public class TopK {

    /** Number of highest-count entries written by the reducer. */
    private static final int TOP_K = 20;

    /**
     * Zero-based index of the tab-separated field that is counted.
     * NOTE(review): the task statement asks for user UIDs; confirm that
     * column 2 is the intended column in the input layout.
     */
    private static final int KEY_FIELD_INDEX = 2;

    /**
     * Configures and runs the job, then terminates the JVM with status 0 on
     * success or 1 on failure (2 when the arguments are missing).
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output path (which must not exist yet)
     * @throws IOException            if the job cannot be set up or submitted
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: " + TopK.class.getName() + " <input path> <output path>");
            System.exit(2);
        }

        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(new Configuration(), TopK.class.getName());
        job.setJarByClass(TopK.class);
        job.setMapperClass(TopMapper.class);
        job.setReducerClass(TopReducer.class);
        // A global top-K is only correct when one reducer sees every key.
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits {@code (field, 1)} for the counted column of every input line.
     * Lines with too few tab-separated fields are skipped and tallied in the
     * {@code TopK / MALFORMED_RECORDS} counter instead of failing the task.
     */
    public static class TopMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        /** Shared constant value; avoids one allocation per record. */
        private static final LongWritable ONE = new LongWritable(1);

        /** Reused output key. */
        private final Text outKey = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length <= KEY_FIELD_INDEX) {
                context.getCounter("TopK", "MALFORMED_RECORDS").increment(1);
                return;
            }
            outKey.set(fields[KEY_FIELD_INDEX]);
            context.write(outKey, ONE);
        }
    }

    /**
     * Sums the occurrences of each key and retains the {@code TOP_K} keys with
     * the highest totals; the retained entries are written in
     * {@link #cleanup(Context)} in ascending order of count.
     */
    public static class TopReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        /**
         * Current best entries ordered by (count, key). Ordering on the pair
         * rather than on the count alone keeps keys with equal counts from
         * replacing one another.
         */
        private final TreeSet<KeyCount> top = new TreeSet<KeyCount>();

        @Override
        protected void reduce(Text key, Iterable<LongWritable> value, Context context)
                throws IOException, InterruptedException {
            long sum = 0; // occurrences of this key
            for (LongWritable partial : value) {
                sum += partial.get();
            }

            // Copy the key: Hadoop reuses the Text instance between calls.
            top.add(new KeyCount(key.toString(), sum));

            // Keep only the best TOP_K entries; the first element is the smallest.
            if (top.size() > TOP_K) {
                top.pollFirst();
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            Text outKey = new Text();
            LongWritable outCount = new LongWritable();
            for (KeyCount entry : top) {
                outKey.set(entry.key);
                outCount.set(entry.count);
                context.write(outKey, outCount);
            }
        }

        /** Immutable (key, count) pair ordered by count, then by key. */
        private static final class KeyCount implements Comparable<KeyCount> {
            private final String key;
            private final long count;

            KeyCount(String key, long count) {
                this.key = key;
                this.count = count;
            }

            @Override
            public int compareTo(KeyCount other) {
                if (count != other.count) {
                    return count < other.count ? -1 : 1;
                }
                return key.compareTo(other.key);
            }

            @Override
            public boolean equals(Object obj) {
                if (this == obj) {
                    return true;
                }
                if (!(obj instanceof KeyCount)) {
                    return false;
                }
                KeyCount other = (KeyCount) obj;
                return count == other.count && key.equals(other.key);
            }

            @Override
            public int hashCode() {
                return 31 * key.hashCode() + (int) (count ^ (count >>> 32));
            }
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: