数据开发面试准备
2018-01-10 23:44
260 查看
1、datanode、namenode、resourceManager,NodeManger的区别和流程
http://www.aboutyun.com/thread-7778-1-1.html
2、hadoop MapReduce的流程,combiner的作用
http://blog.csdn.net/lisonglisonglisong/article/details/47125319
3、基本的hadoop wordCount的编写
http://blog.jobbole.com/82607/
http://blog.csdn.net/Jerome_s/article/details/26441151
4、面试题
http://blog.csdn.net/qq_26442553/article/details/78718796
http://www.aboutyun.com/thread-7778-1-1.html
2、hadoop MapReduce的流程,combiner的作用
http://blog.csdn.net/lisonglisonglisong/article/details/47125319
3、基本的hadoop wordCount的编写
http://blog.jobbole.com/82607/
http://blog.csdn.net/Jerome_s/article/details/26441151
// ============================================================
// Example 1: classic Hadoop word count
// ============================================================
package com.test.mapreduce.web;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Classic MapReduce word count: counts case-insensitive occurrences of every
 * whitespace-separated token across the input files.
 *
 * Usage: wordcount &lt;in&gt; &lt;out&gt;
 */
public class WordCount {

    /** Emits (lower-cased token, 1) for each token of each input line. */
    public static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {

        // Writable instances are reused across map() calls to avoid
        // per-record allocations (standard Hadoop idiom).
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken().toLowerCase());
                context.write(word, one);
            }
        }
    }

    /** Sums the counts for one token. Also installed as the combiner below. */
    public static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            // FIX: the original set `result` and then wrote a fresh
            // `new IntWritable(sum)`, leaving result.set(sum) as a dead store.
            // Write the reusable field instead.
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // NOTE(review): new Job(conf, name) is deprecated in Hadoop 2.x;
        // prefer Job.getInstance(conf, name) when targeting 2.x+.
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordMapper.class);
        // Summation is associative and commutative, so the reducer
        // doubles as the combiner to cut shuffle traffic.
        job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

// ============================================================
// Example 2: base-station dwell-time aggregation
// ============================================================
package cn.dataguru.hadoop;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Position records ("POS*" files):   IMSI|IMEI|UPDATETYPE|CGI|TIME
// Web-access records ("NET*" files): IMSI|IMEI|CGI|TIME|CGI|URL

/**
 * Base-station data aggregation: computes how long each user (IMSI) stayed at
 * each base station (CGI) within each configured time window.
 *
 * Arguments: &lt;input path&gt; &lt;output path&gt; &lt;date&gt; &lt;timepoint&gt;
 * Example: "/base /output 2012-09-12 09-17-24" reads "/base", writes "/output",
 * processes the data of 2012-09-12 split into windows 00-07, 07-17 and 17-24.
 *
 * Output format: "IMSI|CGI|TIMEFLAG|STAY_TIME".
 *
 * NOTE(review): TableLine and LineException are defined elsewhere in the
 * original project (not shown in this excerpt).
 */
public class BaseStationDataPreprocess extends Configured implements Tool {

    /** Counters for the various kinds of discarded records. */
    enum Counter {
        TIMESKIP,       // malformed timestamp
        OUTOFTIMESKIP,  // timestamp outside the requested time windows
        LINESKIP,       // malformed source line
        USERSKIP        // a whole user/window combination was dropped
    }

    /**
     * Mapper: parses one record and emits it keyed by "IMSI|time window",
     * with "CGI|time" as the value.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        String date;
        String[] timepoint;
        // true when the split comes from a "POS*" file, false for "NET*".
        boolean dataSource;

        /** Reads job parameters and determines the record layout from the file name. */
        @Override
        public void setup(Context context) throws IOException {
            this.date = context.getConfiguration().get("date");
            this.timepoint = context.getConfiguration().get("timepoint").split("-");

            // The input file name tells us which of the two record layouts to parse.
            FileSplit fs = (FileSplit) context.getInputSplit();
            String fileName = fs.getPath().getName();
            if (fileName.startsWith("POS")) {
                dataSource = true;
            } else if (fileName.startsWith("NET")) {
                dataSource = false;
            } else {
                // FIX: original message read "File Name should starts with ..."
                throw new IOException("File name should start with POS or NET");
            }
        }

        /**
         * Parses the line, bucketing it into its time window; bad records are
         * counted and skipped rather than failing the task.
         */
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            TableLine tableLine = new TableLine();

            try {
                tableLine.set(line, this.dataSource, this.date, this.timepoint);
            } catch (LineException e) {
                // Flag -1 means the timestamp fell outside the requested windows;
                // anything else is a malformed timestamp.
                if (e.getFlag() == -1) {
                    context.getCounter(Counter.OUTOFTIMESKIP).increment(1);
                } else {
                    context.getCounter(Counter.TIMESKIP).increment(1);
                }
                return;
            } catch (Exception e) {
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }

            context.write(tableLine.outKey(), tableLine.outValue());
        }
    }

    /**
     * Reducer: for one IMSI and one time window, orders the sightings by time
     * and sums the dwell time per base station.
     */
    public static class Reduce extends Reducer<Text, Text, NullWritable, Text> {

        private String date;
        // SimpleDateFormat is not thread-safe, but each Reducer instance is
        // used by a single task thread, so a per-instance field is fine here.
        private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        @Override
        public void setup(Context context) {
            this.date = context.getConfiguration().get("date");
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Key layout: "IMSI|TIMEFLAG".
            String[] keyParts = key.toString().split("\\|");
            String imsi = keyParts[0];
            String timeFlag = keyParts[1];

            // TreeMap orders sightings by timestamp (seconds) -> CGI.
            TreeMap<Long, String> uploads = new TreeMap<Long, String>();
            for (Text value : values) {
                // Value layout: "CGI|time". Split once and reuse the parts.
                String[] parts = value.toString().split("\\|");
                try {
                    uploads.put(Long.valueOf(parts[1]), parts[0]);
                } catch (NumberFormatException e) {
                    context.getCounter(Counter.TIMESKIP).increment(1);
                    continue;
                }
            }

            try {
                // Append a synthetic "OFF" sighting at the window's end so the
                // last real sighting gets a dwell time too.
                Date tmp = this.formatter.parse(
                        this.date + " " + timeFlag.split("-")[1] + ":00:00");
                uploads.put((tmp.getTime() / 1000L), "OFF");

                HashMap<String, Float> locs = getStayTime(uploads);

                // Emit one "IMSI|CGI|TIMEFLAG|STAY_TIME" line per base station.
                for (Entry<String, Float> entry : locs.entrySet()) {
                    StringBuilder builder = new StringBuilder();
                    builder.append(imsi).append("|");
                    builder.append(entry.getKey()).append("|");
                    builder.append(timeFlag).append("|");
                    builder.append(entry.getValue());
                    context.write(NullWritable.get(), new Text(builder.toString()));
                }
            } catch (Exception e) {
                // Any failure drops this whole user/window combination.
                context.getCounter(Counter.USERSKIP).increment(1);
                return;
            }
        }

        /**
         * Walks consecutive sightings and accumulates the minutes spent at
         * each CGI. Gaps longer than 60 minutes are treated as the phone
         * being off and are not counted.
         */
        private HashMap<String, Float> getStayTime(TreeMap<Long, String> uploads) {
            Entry<Long, String> upload, nextUpload;
            HashMap<String, Float> locs = new HashMap<String, Float>();

            Iterator<Entry<Long, String>> it = uploads.entrySet().iterator();
            upload = it.next();

            while (it.hasNext()) {
                nextUpload = it.next();
                // Timestamps are in seconds; convert the gap to minutes.
                float diff = (float) (nextUpload.getKey() - upload.getKey()) / 60.0f;
                if (diff <= 60.0) {
                    if (locs.containsKey(upload.getValue())) {
                        locs.put(upload.getValue(), locs.get(upload.getValue()) + diff);
                    } else {
                        locs.put(upload.getValue(), diff);
                    }
                }
                upload = nextUpload;
            }
            return locs;
        }
    }

    /** Configures and submits the job. Returns 0 on success, 1 on failure. */
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        conf.set("date", args[2]);
        conf.set("timepoint", args[3]);

        // NOTE(review): new Job(conf, name) is deprecated in Hadoop 2.x;
        // prefer Job.getInstance(conf, name) when targeting 2.x+.
        Job job = new Job(conf, "BaseStationDataPreprocess");
        job.setJarByClass(BaseStationDataPreprocess.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 4) {
            System.err.println("");
            System.err.println("Usage: BaseStationDataPreprocess < input path > < output path > < date > < timepoint >");
            System.err.println("Example: BaseStationDataPreprocess /user/james/Base /user/james/Output 2012-09-12 07-09-17-24");
            // FIX: original message read "Timepoints should be begined with a 0+ two digit number ..."
            System.err.println("Warning: Timepoints should begin with a zero-padded two-digit number and the last timepoint should be 24");
            System.err.println("Counter:");
            System.err.println("\t" + "TIMESKIP" + "\t" + "Lines which contain wrong date format");
            System.err.println("\t" + "OUTOFTIMESKIP" + "\t" + "Lines which contain times that out of range");
            System.err.println("\t" + "LINESKIP" + "\t" + "Lines which are invalid");
            System.err.println("\t" + "USERSKIP" + "\t" + "Users in some time are invalid");
            System.exit(-1);
        }

        int res = ToolRunner.run(new Configuration(), new BaseStationDataPreprocess(), args);
        System.exit(res);
    }
}
4、面试题
http://blog.csdn.net/qq_26442553/article/details/78718796
相关文章推荐
- 移动端开发面试准备
- 数据仓库开发之路之一--准备工作
- 话说有做Android游戏开发的朋友么?准备毕业了求个面试方法
- Python开发Kettle做大数据ETL(前期准备)
- 面试准备之数据结构
- Leetcode Database using MySQL from the EASY ones 准备数据分析面试SQL试题
- java测试httprequest(为手游开发后台数据传输做准备)
- 嵌入式软件开发面试准备 —— 凡事预则立,不预则废
- osgEarth开发数据准备(一)——DEM与纹理影像(遥感)下载与处理 (转)
- 大数据工程师(开发)面试系列(7)
- 《大数据工程师(开发)面试系列(2)》
- 开发与面试涉及的基础数据结构和算法-Algorithm
- 大数据工程师(开发)面试系列(4)
- java web轻量级开发面试教程摘录,java web面试技巧汇总,如何准备Spring MVC方面的面试
- Java开发岗位面试题归类---怎么好好的准备面试,也算是发展学习方向
- java开发工程师笔试面试题目准备
- 【Qt开发】V4L2 API详解 <三> Buffer的准备和数据读取
- 准备大数据的面试
- 后台开发面试准备1:Linux命令
- 大数据开发、架构工程师面试(一)