hadoop 分布式缓存
2016-04-06 17:39
155 查看
Hadoop 分布式缓存的目的是让所有 MapReduce 任务共享同一份配置文件:首先将缓存文件放置在 HDFS 中,程序在执行过程中可以通过设定将该文件下载到各任务节点的本地磁盘。具体设定如下:
/**
 * Job driver: configures the MapReduce job and registers an IMEI list file
 * stored in HDFS as a distributed-cache file, so that every mapper can load
 * it locally in setup().
 */
public static void main(String[] arge) throws IOException, ClassNotFoundException, InterruptedException{
    Configuration conf=new Configuration();
    conf.set("fs.default.name", "hdfs://192.168.1.45:9000");
    FileSystem fs=FileSystem.get(conf);
    // Remove the job's output directory up front: FileOutputFormat fails the job
    // if the output path already exists.
    // NOTE(review): the original deleted "CASICJNJP/gongda/Test_gd20140104", which
    // is not the output path set below — presumably a leftover from an earlier run;
    // confirm no other job depends on that path being deleted here.
    fs.delete(new Path("CASICJNJP/gongda/SelectedData"), true);
    conf.set("mapred.job.tracker", "192.168.1.45:9001");
    conf.set("mapred.jar", "/home/hadoop/workspace/jar/OBDDataSelectWithImeiTxt.jar");
    Job job=new Job(conf,"myTaxiAnalyze");
    // Create symlinks to the cached files in each task's working directory.
    DistributedCache.createSymlink(job.getConfiguration());
    try {
        // Register the HDFS file as a distributed-cache entry; tasks read a local copy.
        DistributedCache.addCacheFile(new URI("/user/hadoop/CASICJNJP/DistributeFiles/imei.txt"), job.getConfiguration());
    } catch (URISyntaxException e1) {
        // A malformed cache URI would make every mapper run without the IMEI list
        // and silently produce empty output — fail fast instead of just logging.
        throw new IllegalArgumentException("Invalid distributed-cache URI", e1);
    }
    job.setMapperClass(OBDDataSelectMaper.class);
    job.setReducerClass(OBDDataSelectReducer.class);
    //job.setNumReduceTasks(10);
    //job.setCombinerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path("/user/hadoop/CASICJNJP/SortedData/20140104"));
    FileOutputFormat.setOutputPath(job, new Path("CASICJNJP/gongda/SelectedData"));
    System.exit(job.waitForCompletion(true)?0:1);
}
上述代码通过 DistributedCache.addCacheFile 将 HDFS 中的 /user/hadoop/CASICJNJP/DistributeFiles/imei.txt 注册为分布式缓存文件。
/**
 * Mapper that keeps only the records whose IMEI appears in the distributed-cache
 * file. Input records are tab-separated; the first field has the form
 * "imei_time". For each matching record it emits (first field, whole record).
 */
public class OBDDataSelectMaper extends Mapper<Object, Text, Text, Text> {
    // HashSet instead of the original ArrayList: contains() is O(1) per record
    // rather than O(n) over the whole IMEI list.
    private final java.util.Set<Integer> imeiSet = new java.util.HashSet<Integer>();
    // Reused output key to avoid allocating a new Text per record.
    private final Text outKey = new Text();

    /**
     * Loads the IMEI whitelist from the first local distributed-cache file.
     * Propagates IOException so a missing/unreadable cache file fails the task
     * instead of silently filtering out every record (the original swallowed it).
     */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context
                .getConfiguration());
        if (cacheFiles != null && cacheFiles.length > 0) {
            BufferedReader br = new BufferedReader(new FileReader(
                    cacheFiles[0].toString()));
            try {
                // BUG FIX: the original read and discarded one line before the
                // loop, silently dropping the first IMEI in the cache file.
                String line;
                while ((line = br.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        imeiSet.add(Integer.parseInt(line));
                    }
                }
            } finally {
                br.close();
            }
        }
    }

    /**
     * Emits (first field, record) when the record's IMEI is in the whitelist.
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        try {
            String[] fields = value.toString().split("\t");
            String imei = fields[0].split("_")[0];
            if (imeiSet.contains(Integer.parseInt(imei))) {
                outKey.set(fields[0]);
                context.write(outKey, value);
            }
        } catch (Exception ignored) {
            // Deliberately best-effort: malformed records (missing fields or a
            // non-numeric IMEI) are skipped rather than failing the task.
        }
    }
}
上述代码在 Mapper 的 setup 函数中加载分布式缓存文件。
/**
 * Job driver: configures the MapReduce job and registers an IMEI list file
 * stored in HDFS as a distributed-cache file, so that every mapper can load
 * it locally in setup().
 */
public static void main(String[] arge) throws IOException, ClassNotFoundException, InterruptedException{
    Configuration conf=new Configuration();
    conf.set("fs.default.name", "hdfs://192.168.1.45:9000");
    FileSystem fs=FileSystem.get(conf);
    // Remove the job's output directory up front: FileOutputFormat fails the job
    // if the output path already exists.
    // NOTE(review): the original deleted "CASICJNJP/gongda/Test_gd20140104", which
    // is not the output path set below — presumably a leftover from an earlier run;
    // confirm no other job depends on that path being deleted here.
    fs.delete(new Path("CASICJNJP/gongda/SelectedData"), true);
    conf.set("mapred.job.tracker", "192.168.1.45:9001");
    conf.set("mapred.jar", "/home/hadoop/workspace/jar/OBDDataSelectWithImeiTxt.jar");
    Job job=new Job(conf,"myTaxiAnalyze");
    // Create symlinks to the cached files in each task's working directory.
    DistributedCache.createSymlink(job.getConfiguration());
    try {
        // Register the HDFS file as a distributed-cache entry; tasks read a local copy.
        DistributedCache.addCacheFile(new URI("/user/hadoop/CASICJNJP/DistributeFiles/imei.txt"), job.getConfiguration());
    } catch (URISyntaxException e1) {
        // A malformed cache URI would make every mapper run without the IMEI list
        // and silently produce empty output — fail fast instead of just logging.
        throw new IllegalArgumentException("Invalid distributed-cache URI", e1);
    }
    job.setMapperClass(OBDDataSelectMaper.class);
    job.setReducerClass(OBDDataSelectReducer.class);
    //job.setNumReduceTasks(10);
    //job.setCombinerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path("/user/hadoop/CASICJNJP/SortedData/20140104"));
    FileOutputFormat.setOutputPath(job, new Path("CASICJNJP/gongda/SelectedData"));
    System.exit(job.waitForCompletion(true)?0:1);
}
上述代码通过 DistributedCache.addCacheFile 将 HDFS 中的 /user/hadoop/CASICJNJP/DistributeFiles/imei.txt 注册为分布式缓存文件。
/**
 * Mapper that keeps only the records whose IMEI appears in the distributed-cache
 * file. Input records are tab-separated; the first field has the form
 * "imei_time". For each matching record it emits (first field, whole record).
 */
public class OBDDataSelectMaper extends Mapper<Object, Text, Text, Text> {
    // HashSet instead of the original ArrayList: contains() is O(1) per record
    // rather than O(n) over the whole IMEI list.
    private final java.util.Set<Integer> imeiSet = new java.util.HashSet<Integer>();
    // Reused output key to avoid allocating a new Text per record.
    private final Text outKey = new Text();

    /**
     * Loads the IMEI whitelist from the first local distributed-cache file.
     * Propagates IOException so a missing/unreadable cache file fails the task
     * instead of silently filtering out every record (the original swallowed it).
     */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context
                .getConfiguration());
        if (cacheFiles != null && cacheFiles.length > 0) {
            BufferedReader br = new BufferedReader(new FileReader(
                    cacheFiles[0].toString()));
            try {
                // BUG FIX: the original read and discarded one line before the
                // loop, silently dropping the first IMEI in the cache file.
                String line;
                while ((line = br.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        imeiSet.add(Integer.parseInt(line));
                    }
                }
            } finally {
                br.close();
            }
        }
    }

    /**
     * Emits (first field, record) when the record's IMEI is in the whitelist.
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        try {
            String[] fields = value.toString().split("\t");
            String imei = fields[0].split("_")[0];
            if (imeiSet.contains(Integer.parseInt(imei))) {
                outKey.set(fields[0]);
                context.write(outKey, value);
            }
        } catch (Exception ignored) {
            // Deliberately best-effort: malformed records (missing fields or a
            // non-numeric IMEI) are skipped rather than failing the task.
        }
    }
}
上述代码在 Mapper 的 setup 函数中加载分布式缓存文件。
相关文章推荐
- (总结)Nginx 502 Bad Gateway错误触发条件与解决方法
- VNCViewerWindows下远程连接Linux桌面
- 企业运维监控平台架构设计与实现(ganglia篇)
- Sqoop安装与学习
- 13.linux中断处理程序
- 《Linux内核分析》第七周学习笔记
- openfire + spark 搭建聊天系统
- Nginx/Tengine服务启动管理脚本(未使用系统funtions函数)
- centos6.7 安装 mysql 5.5.48
- linux 进程调度
- Linux VSFTP服务器详细配置
- Linux批量替换文本,文件夹内所有文本内容
- Linux的进程/线程间通信方式总结 01
- 我所理解的OOP——UML六种关系
- PUTTY 反向代理注意事项
- Linux C线程同步
- hadoop学习从0开始-mapreduce
- centos安装gitlab
- Android bat批处理自动执行adb shell命令
- openssl 安装