简单实现CombineFileInputFormat
2014-09-08 12:19
302 查看
import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.Reducer.Context; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat; import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader; import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class TestCombine extends Configured implements Tool { private static class ProvinceMapper extends Mapper<Object, Text, Text, Text> { @Override protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { System.out.println("value : " + value + " Context " + context); context.write(value, value); } } private static class ProvinceReducer extends Reducer<Text, Text, Text, Text> { @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for (Text va : values) { System.out.println("reduce " + key); context.write(key, key); } } } public static class CombineSequenceFileInputFormat<K, V> extends CombineFileInputFormat<K, 
V> { @SuppressWarnings({ "unchecked", "rawtypes" }) @Override public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { return new CombineFileRecordReader((CombineFileSplit)split, context, CombineLineRecordReader.class); } } public static class CombineLineRecordReader<K, V> extends RecordReader<K, V> { private CombineFileSplit split; private TaskAttemptContext context; private int index; private RecordReader<K, V> rr; @SuppressWarnings("unchecked") public CombineLineRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) throws IOException, InterruptedException { this.index = index; this.split = (CombineFileSplit) split; this.context = context; this.rr = (RecordReader<K, V>) ReflectionUtils.newInstance(LineRecordReader.class, context.getConfiguration()); } @SuppressWarnings("unchecked") @Override public void initialize(InputSplit curSplit, TaskAttemptContext curContext) throws IOException, InterruptedException { this.split = (CombineFileSplit) curSplit; this.context = curContext; if (null == rr) { rr = ReflectionUtils.newInstance(SequenceFileRecordReader.class, context.getConfiguration()); } FileSplit fileSplit = new FileSplit(this.split.getPath(index), this.split.getOffset(index), this.split.getLength(index), this.split.getLocations()); this.rr.initialize(fileSplit, this.context); } @Override public float getProgress() throws IOException, InterruptedException { return rr.getProgress(); } @Override public void close() throws IOException { if (null != rr) { rr.close(); rr = null; } } @Override public K getCurrentKey() throws IOException, InterruptedException { return rr.getCurrentKey(); } @Override public V getCurrentValue() throws IOException, InterruptedException { return rr.getCurrentValue(); } @Override public boolean nextKeyValue() throws IOException, InterruptedException { return rr.nextKeyValue(); } } public int run(String[] args) throws Exception { Configuration conf = new 
Configuration(); Job job = new Job(conf); job.setJobName("TestCombine"); job.setJarByClass(TestCombine.class); job.setMapperClass(ProvinceMapper.class); job.setReducerClass(ProvinceReducer.class); job.setInputFormatClass(CombineSequenceFileInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); String inpath = "/home/hadoop/tmp/combine"; String outpath = "/home/hadoop/tmp/combineout"; Path p = new Path(outpath); FileSystem fs = FileSystem.get(conf); if (fs.exists(p)){ fs.delete(p); } FileInputFormat.addInputPaths(job, inpath); FileOutputFormat.setOutputPath(job, p); return job.waitForCompletion(true) ? 0 : 1; } public static void main(String[] args) throws Exception { int ret = ToolRunner.run(new TestCombine(), args); System.exit(ret); } }
相关文章推荐
- 简单实现CombineFileInputFormat
- MapReduce小文件处理之CombineFileInputFormat实现
- streaming方式的CombineFileInputFormat实现
- MapReduce小文件处理之CombineFileInputFormat实现
- Hadoop使用CombineFileInputFormat处理大量小文件接口实现(Hadoop-1.0.4)
- hadoop CombineFileInputFormat的使用
- 利用CombineFileInputFormat把netflix data set 导入到Hbase里
- 用CombineFileInputFormat处理小文件的mapreduce程序
- 简单获取input file 选中的图片,并在一个div的img里面赋值src实现预览图片
- HFileInputFormat实现
- Hadoop中CombineFileInputFormat详解
- MapReduce应用中CombineFileInputFormat原理与用法
- Hadoop MapReduce处理小的压缩文件:基于CombineFileInputFormat
- Hadoop CombineFileInputFormat原理说明(转)
- hadoop old API CombineFileInputFormat
- Hadoop MapReduce处理海量小文件:基于CombineFileInputFormat(整个小文件读入到map中)
- Hadoop中CombineFileInputFormat详解——处理大量小文件
- Sample MultipleFileWordcount CombineFileInputFormat
- MapReduce应用中CombineFileInputFormat原理与用法
- Hadoop MapReduce处理海量小文件:基于CombineFileInputFormat(每次往map中读入1行)