MapReduce—案例(四)简单数据去重
2018-03-24 13:50
483 查看
源数据
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 数据去重
* @author potter
*/
public class Practice3 {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// conf.set("fs.defaultFS", "hdfs:potter2:9000");//使用配置文件
// System.setProperty("HADOOP_USER_NAME", "potter");//使用集群
FileSystem fs = FileSystem.get(conf);//默认使用本地文件
Job job = Job.getInstance();
job.setJarByClass(Practice3.class);
job.setMapperClass(Practice3Mapper.class);
job.setReducerClass(Practice3Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// String inputpath = args[0];//0代表的是输入第一个参数
// String outpath = args[1]; //1代表的是输入的第二个参数
Path input = new Path("D:\\practice\\input3\\work3.txt");
Path output = new Path("D:\\practice\\input3\\output1");
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, output);
if (fs.exists(output)) {
fs.delete(output,true);
}
boolean isdone = job.waitForCompletion(true);
System.exit(isdone ? 0 : 1);
}
public static class Practice3Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{
//2012-3-1 a
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
throws IOException, InterruptedException {
String[] split = value.toString().trim().split(" ");
String kk = split[0]+"\t"+split[1];
text.set(kk);
context.write(text, NullWritable.get());
}
}
public static class Practice3Reducer extends Reducer<Text, NullWritable, Text, NullWritable>{
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
}
数据去重源数据：
2012-3-1 a 2012-3-2 b 2012-3-3 c 2012-3-4 d 2012-3-5 a 2012-3-6 b 2012-3-7 c 2012-3-3 c 2012-3-1 b 2012-3-2 a 2012-3-3 b 2012-3-4 d 2012-3-5 a 2012-3-6 c 2012-3-7 d 2012-3-3 c
最终结果：
2012-3-1 a 2012-3-1 b 2012-3-2 a 2012-3-2 b 2012-3-3 b 2012-3-3 c 2012-3-4 d 2012-3-5 a 2012-3-6 b 2012-3-6 c 2012-3-7 c 2012-3-7 d
思路：MapReduce的key可以自动去重，所以在reduce阶段，每次输出一个，就可以达到去重的目的。
package practice1;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* 数据去重
* @author potter
*/
public class Practice3 {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// conf.set("fs.defaultFS", "hdfs:potter2:9000");//使用配置文件
// System.setProperty("HADOOP_USER_NAME", "potter");//使用集群
FileSystem fs = FileSystem.get(conf);//默认使用本地文件
Job job = Job.getInstance();
job.setJarByClass(Practice3.class);
job.setMapperClass(Practice3Mapper.class);
job.setReducerClass(Practice3Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// String inputpath = args[0];//0代表的是输入第一个参数
// String outpath = args[1]; //1代表的是输入的第二个参数
Path input = new Path("D:\\practice\\input3\\work3.txt");
Path output = new Path("D:\\practice\\input3\\output1");
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, output);
if (fs.exists(output)) {
fs.delete(output,true);
}
boolean isdone = job.waitForCompletion(true);
System.exit(isdone ? 0 : 1);
}
public static class Practice3Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{
//2012-3-1 a
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
throws IOException, InterruptedException {
String[] split = value.toString().trim().split(" ");
String kk = split[0]+"\t"+split[1];
text.set(kk);
context.write(text, NullWritable.get());
}
}
public static class Practice3Reducer extends Reducer<Text, NullWritable, Text, NullWritable>{
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
}
完成!!!
相关文章推荐
- MapReduce案例3——求简单数据去重
- MapReduce应用案例--简单的数据去重
- Hadoop第7周练习—MapReduce进行数据查询和实现简单推荐系统(转)
- 大数据迁移(简单案例)(41 )
- MapReduce实例-NASA博客数据频度简单分析
- mapreduce 自定义数据类型的简单的应用
- 将HDFS中的数据通过MapReduce产生HFile,然后将HFile导入到HBase具体案例分析
- MapReduce应用案例--简单排序
- MapReduce:超大机群上的简单数据处理
- python爬虫webdriver.Chrome 数据可视化简单案例matplotlib
- js数据层中间件单页运用简单构思和应用案例
- MapReduce:超大机群上的简单数据处理
- 使用Hadoop分析数据——简单案例java语言编程之MaxTemperature
- Hadoop环境搭建之二配置启动HDFS及本地模式运行MapReduce案例(使用HDFS上数据)
- Mongo入门:数据去重之MapReduce,Aggregation的简单使用(另附相关网络资源)
- MapReduce:超大机群上的简单数据处理
- MapReduce从HBase读写数据简单示例
- hadoop入门--简单的MapReduce案例
- Redis持久化RDB简介及简单数据恢复案例
- 大数据_Shuffle、MapReduce编程案例(数据去重、多表查询、倒排索引、使用单元测试)