hadoop mapreduce读取orcfile的java代码示例
2015-06-10 17:03
696 查看
orcfile在hive 0.11版本后提供支持,orcfile相比rcfile具有更高的数据压缩比,在不使用任何压缩算法,仅仅使用orcfile存储格式,数据量大小就能缩小一半以上。
下面以hive 0.13版本为例,列举了mapreduce读取orcfile的java示例代码:
需要引入的包:hive-common-0.13.1.jar、hive-exec-0.13.1.jar
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
main函数关键代码:
/**
 * Entry point: configures and submits a map-only MapReduce job that reads
 * ORC files via {@code OrcNewInputFormat} and runs {@code ExtractorMapper}
 * over each record.
 *
 * @param args command-line arguments (unused; input path comes from the
 *             surrounding class's {@code inputPath} field)
 * @throws IOException            on HDFS/job-submission I/O failure
 * @throws InterruptedException   if the wait for completion is interrupted
 * @throws ClassNotFoundException if job classes cannot be resolved
 */
public static void main(String[] args) throws IOException,
        URISyntaxException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // Job.getInstance(conf) is the supported factory method; the
    // Job(Configuration) constructor is deprecated in Hadoop 2.x.
    Job job = Job.getInstance(conf);
    job.setInputFormatClass(OrcNewInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    job.setMapperClass(ExtractorMapper.class);
    // Extraction needs no reduce phase; without this a pointless identity
    // reduce stage would run with the default single reducer.
    job.setNumReduceTasks(0);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
map实现函数关键代码:
private static class ExtractorMapper extends
Mapper {
private static final String SCHEMA = "struct<column_name1:string,column_name2:string>"
protected void map(
NullWritable key,
Writable value,
Mapper.Context context)
throws IOException, InterruptedException {
OrcStruct struct = (OrcStruct)value;
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(SCHEMA);
StructObjectInspector inspector = (StructObjectInspector)
OrcStruct.createObjectInspector(typeInfo);
StringBuffer outputKey = new StringBuffer();
outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name1")).toString());
outputKey.append(TAB);
outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name2")).toString());
System.out.println(outputKey.toString());
}
下面以hive 0.13版本为例,列举了mapreduce读取orcfile的java示例代码:
需要引入的包:hive-common-0.13.1.jar、hive-exec-0.13.1.jar
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
main函数关键代码:
/**
 * Entry point: configures and submits a map-only MapReduce job that reads
 * ORC files via {@code OrcNewInputFormat} and runs {@code ExtractorMapper}
 * over each record.
 *
 * @param args command-line arguments (unused; input path comes from the
 *             surrounding class's {@code inputPath} field)
 * @throws IOException            on HDFS/job-submission I/O failure
 * @throws InterruptedException   if the wait for completion is interrupted
 * @throws ClassNotFoundException if job classes cannot be resolved
 */
public static void main(String[] args) throws IOException,
        URISyntaxException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // Job.getInstance(conf) is the supported factory method; the
    // Job(Configuration) constructor is deprecated in Hadoop 2.x.
    Job job = Job.getInstance(conf);
    job.setInputFormatClass(OrcNewInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    job.setMapperClass(ExtractorMapper.class);
    // Extraction needs no reduce phase; without this a pointless identity
    // reduce stage would run with the default single reducer.
    job.setNumReduceTasks(0);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
map实现函数关键代码:
private static class ExtractorMapper extends
Mapper {
private static final String SCHEMA = "struct<column_name1:string,column_name2:string>"
protected void map(
NullWritable key,
Writable value,
Mapper.Context context)
throws IOException, InterruptedException {
OrcStruct struct = (OrcStruct)value;
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(SCHEMA);
StructObjectInspector inspector = (StructObjectInspector)
OrcStruct.createObjectInspector(typeInfo);
StringBuffer outputKey = new StringBuffer();
outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name1")).toString());
outputKey.append(TAB);
outputKey.append(inspector.getStructFieldData(struct, inspector.getStructFieldRef("column_name2")).toString());
System.out.println(outputKey.toString());
}
相关文章推荐
- Java 命名规范
- 修改hadoop FileUtil.java,解决权限检查的问题
- Spring handler method
- java获取图片像素点的RGB值
- JAVA中获得一个月最大天数的方法(备忘)
- Java中static关键字与final关键字的用法与区别
- JAVA 环境变量
- gwt中 java与js 的相互调用
- java---当天早八点
- Struts2平凡之路(三)Struts2架构和运行流程
- java基础
- Spring事务不起作用 问题汇总
- Java多线程系列--“JUC原子类”05之 AtomicLongFieldUpdater原子类
- (转)java的几种排序方法
- Eclipse搜索快捷键
- swing组件JTabel和JTree使用总结
- struts2 页面标签或ognl表达式取值--未完待续
- Java装饰者模式
- jdk问题
- 用link方式搭建eclipse + PyDev环境