
Hadoop MapReduce: Counting the Most Frequent Names in Dream of the Red Chamber
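
This post walks through a small MapReduce job that counts how often each character's name appears in Dream of the Red Chamber (红楼梦). Each input line is tokenized with the IK Analyzer; a token is kept only if it appears in a custom dictionary of character names (userLibrary.dic). The mapper then emits (name, 1) for every hit and the reducer sums the counts per name.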

package org.bigdata.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Counts the character names that appear most often in Dream of the Red Chamber.
 *
 * @author wwhhf
 */
public class RedHouseMapReduce {

    /** Dictionary of character names, loaded from the IK user library. */
    public static Set<String> dic = new HashSet<>();

    static {
        // Resolve the dictionary file relative to the classpath root.
        String projectPath = RedHouseMapReduce.class.getResource("/").getFile();
        try {
            BufferedReader br = new BufferedReader(new FileReader(new File(
                    projectPath + File.separator + ".." + File.separator
                            + "ik_library" + File.separator + "userLibrary.dic")));
            String line = null;
            while ((line = br.readLine()) != null) {
                // Strip all whitespace so each entry is a bare name.
                line = line.replaceAll("\\s+", "");
                dic.add(line);
            }
            br.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tokenizes a line of text and keeps only the tokens found in the name dictionary.
     *
     * @param text a line from the novel
     * @return the character names found in the line
     */
    private static List<String> parse(String text) {
        List<String> words = new ArrayList<String>();
        // Create the analyzer in smart (coarse-grained) mode.
        IKAnalyzer analyzer = new IKAnalyzer(true);
        StringReader reader = new StringReader(text);
        try {
            // Tokenize the line.
            TokenStream ts = analyzer.tokenStream("", reader);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            ts.reset(); // required before incrementToken() on recent Lucene versions
            // Iterate over the tokens and keep the ones that are known names.
            while (ts.incrementToken()) {
                if (dic.contains(term.toString())) {
                    words.add(term.toString());
                }
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        reader.close();
        return words;
    }

    /**
     * Mapper: emits (name, 1) for every dictionary name found in the input line.
     *
     * @author wwhhf
     */
    private static class RedHouseMapper extends
            Mapper<LongWritable, Text, Text, LongWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            List<String> names = parse(value.toString());
            for (String name : names) {
                context.write(new Text(name), new LongWritable(1));
            }
        }
    }

    /**
     * Reducer: sums the per-name counts emitted by the mapper.
     *
     * @author wwhhf
     */
    private static class RedHouseReducer extends
            Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // HadoopCfg is the author's configuration helper (not shown in this post).
            Configuration cfg = HadoopCfg.getConfiguration();
            Job job = Job.getInstance(cfg);
            job.setJobName("RedHouse");
            job.setJarByClass(RedHouseMapReduce.class);

            // mapper
            job.setMapperClass(RedHouseMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);

            // reducer
            job.setReducerClass(RedHouseReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            // Optionally, the reducer could also serve as a combiner to cut
            // shuffle traffic: job.setCombinerClass(RedHouseReducer.class);

            // Input: the novel's text on HDFS; the output path must not exist yet.
            FileInputFormat.addInputPath(job, new Path("/red_in/"));
            FileOutputFormat.setOutputPath(job, new Path("/red_out/"));

            System.exit(job.waitForCompletion(true) ? 0 : 1);

        } catch (IllegalStateException | IllegalArgumentException
                | ClassNotFoundException | IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }

}
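
The main() method calls HadoopCfg.getConfiguration(), a helper class that the post does not include. A minimal sketch of what such a helper might look like, assuming the cluster's site files are available on the classpath (the resource names here are assumptions, not the author's code):

package org.bigdata.util;

import org.apache.hadoop.conf.Configuration;

// Hypothetical stand-in for the author's HadoopCfg helper.
public class HadoopCfg {

    public static Configuration getConfiguration() {
        Configuration cfg = new Configuration();
        // Pick up cluster settings if the site files are on the classpath
        // (file names are an assumption; new Configuration() already loads
        // core-site.xml by default).
        cfg.addResource("core-site.xml");
        cfg.addResource("hdfs-site.xml");
        return cfg;
    }
}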
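
The job itself only writes per-name counts to /red_out/; to answer the question in the title, a small follow-up step can scan that output and report the name with the highest count. A sketch, not part of the original post, assuming the default tab-separated TextOutputFormat output:

package org.bigdata.util;

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical post-processing step: find the single most frequent name.
public class TopName {

    public static void main(String[] args) throws Exception {
        Configuration cfg = new Configuration();
        FileSystem fs = FileSystem.get(cfg);
        String topName = null;
        long topCount = -1;
        for (FileStatus status : fs.listStatus(new Path("/red_out/"))) {
            // Skip _SUCCESS and other non-data files.
            if (!status.getPath().getName().startsWith("part-")) {
                continue;
            }
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(status.getPath()), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // Each line is "name<TAB>count".
                    String[] fields = line.split("\t");
                    if (fields.length != 2) {
                        continue;
                    }
                    long count = Long.parseLong(fields[1]);
                    if (count > topCount) {
                        topCount = count;
                        topName = fields[0];
                    }
                }
            }
        }
        System.out.println(topName + "\t" + topCount);
    }
}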