
Data Engineering Interview Preparation

2018-01-10 23:44
1. DataNode, NameNode, ResourceManager, and NodeManager: differences and workflow

http://www.aboutyun.com/thread-7778-1-1.html
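In brief: in HDFS the NameNode keeps the file-system metadata while the DataNodes store the actual blocks; in YARN the ResourceManager schedules resources cluster-wide while a NodeManager launches and monitors containers on its own node. As a rough illustration of the NameNode/DataNode split (a hedged sketch, not from the linked article; it assumes a reachable cluster and the standard Hadoop client API):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;

public class ListDataNodes {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml / hdfs-site.xml
        FileSystem fs = FileSystem.get(conf);
        if (fs instanceof DistributedFileSystem) {
            // The DataNode list comes from the NameNode, which tracks cluster metadata;
            // the blocks themselves live on the DataNodes.
            for (DatanodeInfo node : ((DistributedFileSystem) fs).getDataNodeStats()) {
                System.out.println(node.getHostName()
                        + "  capacity=" + node.getCapacity()
                        + "  remaining=" + node.getRemaining());
            }
        }
    }
}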

2. The Hadoop MapReduce workflow and the role of the combiner

http://blog.csdn.net/lisonglisonglisong/article/details/47125319
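In brief: the flow is input split → map → partition/sort/spill → shuffle → merge/sort → reduce → output. A combiner is an optional "mini reducer" applied to each mapper's local output before the shuffle; for word counting, three local outputs (the, 1), (the, 1), (the, 1) collapse into a single (the, 3), cutting the data moved across the network. This is only safe when the reduce logic is associative and commutative, which is why the WordCount below can reuse its reducer as the combiner via job.setCombinerClass.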

3. Writing a basic Hadoop WordCount

http://blog.jobbole.com/82607/

http://blog.csdn.net/Jerome_s/article/details/26441151

package com.test.mapreduce.web;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    public static class WordMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Tokenize the line and emit (word, 1) for each token.
            String line = value.toString();
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken().toLowerCase());
                context.write(word, one);
            }
        }
    }

    public static class WordReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            // Sum the counts for this word and emit (word, total).
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");  // new Job(conf, ...) is deprecated
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordMapper.class);
        job.setCombinerClass(WordReducer.class);        // safe: word counting is associative and commutative
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
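To run it (the jar name and paths are illustrative): package the class into a jar and submit it with hadoop jar wordcount.jar com.test.mapreduce.web.WordCount /input /output; the counts typically land in /output/part-r-00000.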

---------------------------------------------------------------------------------
package cn.dataguru.hadoop;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//Position-update records:
//IMSI|IMEI|UPDATETYPE|CGI|TIME
//Web-access records:
//IMSI|IMEI|CGI|TIME|CGI|URL

/**
 * Aggregates the base-station data table:
 * computes how long each user stayed at each base station in each time period.
 * Arguments: < input path > < output path > < date > < timepoint >
 * Example: "/base /output 2012-09-12 09-17-24"
 * meaning: read from "/base", write to "/output", process the data for 2012-09-12,
 * and split the day into the periods 00-09, 09-17, and 17-24.
 * Output format: "IMSI|CGI|TIMEFLAG|STAY_TIME"
 */
public class BaseStationDataPreprocess extends Configured implements Tool
{
    /**
     * Counters
     * for the various kinds of bad records.
     */
    enum Counter
    {
        TIMESKIP,       // malformed timestamp
        OUTOFTIMESKIP,  // timestamp outside the requested time periods
        LINESKIP,       // malformed source line
        USERSKIP        // a user's whole time period was discarded
    }

    /**
     * Reads one record per line
     * and emits it with "IMSI|time period" as the key.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, Text>
    {
        String date;
        String[] timepoint;
        boolean dataSource;

        /**
         * Initialization.
         */
        @Override
        public void setup(Context context) throws IOException
        {
            this.date = context.getConfiguration().get("date");                         // the date to process
            this.timepoint = context.getConfiguration().get("timepoint").split("-");    // time period boundaries

            // Determine the record type from the input file name.
            FileSplit fs = (FileSplit) context.getInputSplit();
            String fileName = fs.getPath().getName();
            if (fileName.startsWith("POS"))
                dataSource = true;
            else if (fileName.startsWith("NET"))
                dataSource = false;
            else
                throw new IOException("File name should start with POS or NET");
        }

        /**
         * Map task:
         * parses a base-station record,
         * finds the time period it falls into,
         * and emits (IMSI + time period) as the key
         * with (CGI + timestamp) as the value.
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            String line = value.toString();
            TableLine tableLine = new TableLine();

            // Parse the line; count and skip bad records.
            try
            {
                tableLine.set(line, this.dataSource, this.date, this.timepoint);
            }
            catch (LineException e)
            {
                if (e.getFlag() == -1)
                    context.getCounter(Counter.OUTOFTIMESKIP).increment(1);
                else
                    context.getCounter(Counter.TIMESKIP).increment(1);
                return;
            }
            catch (Exception e)
            {
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }

            context.write(tableLine.outKey(), tableLine.outValue());
        }
    }

    /**
     * For one IMSI in one time period,
     * totals the time spent at each CGI.
     */
    public static class Reduce extends Reducer<Text, Text, NullWritable, Text>
    {
        private String date;
        private SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        /**
         * Initialization.
         */
        @Override
        public void setup(Context context)
        {
            this.date = context.getConfiguration().get("date");                         // the date to process
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            String imsi = key.toString().split("\\|")[0];
            String timeFlag = key.toString().split("\\|")[1];

            // Collect the updates in a TreeMap so they are sorted by timestamp.
            TreeMap<Long, String> uploads = new TreeMap<Long, String>();
            String valueString;

            for (Text value : values)
            {
                valueString = value.toString();
                try
                {
                    uploads.put(Long.valueOf(valueString.split("\\|")[1]), valueString.split("\\|")[0]);
                }
                catch (NumberFormatException e)
                {
                    context.getCounter(Counter.TIMESKIP).increment(1);
                    continue;
                }
            }

            try
            {
                // Append a sentinel "OFF" entry at the end of the time period.
                Date tmp = this.formatter.parse(this.date + " " + timeFlag.split("-")[1] + ":00:00");
                uploads.put((tmp.getTime() / 1000L), "OFF");

                // Total the stay time per CGI.
                HashMap<String, Float> locs = getStayTime(uploads);

                // Emit one line per (IMSI, CGI, time period).
                for (Entry<String, Float> entry : locs.entrySet())
                {
                    StringBuilder builder = new StringBuilder();
                    builder.append(imsi).append("|");
                    builder.append(entry.getKey()).append("|");
                    builder.append(timeFlag).append("|");
                    builder.append(entry.getValue());

                    context.write(NullWritable.get(), new Text(builder.toString()));
                }
            }
            catch (Exception e)
            {
                context.getCounter(Counter.USERSKIP).increment(1);
                return;
            }
        }

        /**
         * Computes the stay time (in minutes) at each location.
         */
        private HashMap<String, Float> getStayTime(TreeMap<Long, String> uploads)
        {
            Entry<Long, String> upload, nextUpload;
            HashMap<String, Float> locs = new HashMap<String, Float>();
            // Initialization: start from the earliest update.
            Iterator<Entry<Long, String>> it = uploads.entrySet().iterator();
            upload = it.next();
            // Attribute the gap between consecutive updates to the earlier location.
            while (it.hasNext())
            {
                nextUpload = it.next();
                float diff = (float) (nextUpload.getKey() - upload.getKey()) / 60.0f;
                if (diff <= 60.0)                                   // a gap over an hour is treated as the phone being off
                {
                    if (locs.containsKey(upload.getValue()))
                        locs.put(upload.getValue(), locs.get(upload.getValue()) + diff);
                    else
                        locs.put(upload.getValue(), diff);
                }
                upload = nextUpload;
            }
            return locs;
        }
    }

    public int run(String[] args) throws Exception
    {
        Configuration conf = getConf();

        conf.set("date", args[2]);
        conf.set("timepoint", args[3]);

        Job job = Job.getInstance(conf, "BaseStationDataPreprocess");   // new Job(conf, ...) is deprecated
        job.setJarByClass(BaseStationDataPreprocess.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));           // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));         // output path

        job.setMapperClass(Map.class);                                  // the Map class above
        job.setReducerClass(Reduce.class);                              // the Reduce class above
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);                           // the map emits (Text, Text)
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);                      // the reducer's output key is NullWritable
        job.setOutputValueClass(Text.class);

        job.waitForCompletion(true);

        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception
    {
        if (args.length != 4)
        {
            System.err.println("");
            System.err.println("Usage: BaseStationDataPreprocess < input path > < output path > < date > < timepoint >");
            System.err.println("Example: BaseStationDataPreprocess /user/james/Base /user/james/Output 2012-09-12 07-09-17-24");
            System.err.println("Warning: each timepoint should be a zero-padded two-digit hour, and the last timepoint should be 24");
            System.err.println("Counters:");
            System.err.println("\tTIMESKIP\tlines with a malformed timestamp");
            System.err.println("\tOUTOFTIMESKIP\tlines whose timestamp is outside the requested periods");
            System.err.println("\tLINESKIP\tlines that are otherwise invalid");
            System.err.println("\tUSERSKIP\tusers whose whole time period was discarded");
            System.exit(-1);
        }

        // Run the job through ToolRunner so generic options are handled.
        int res = ToolRunner.run(new Configuration(), new BaseStationDataPreprocess(), args);

        System.exit(res);
    }

}
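Note: the TableLine and LineException helper classes are referenced above but were not included in the post. A minimal sketch of what they might look like, inferred from how they are used (field positions follow the record layouts in the header comment; these would sit in the same file so the imports above apply — treat this as an illustration, not the original author's code):

/** Thrown when a line parses but cannot be used; flag -1 means "outside the requested time periods". */
class LineException extends Exception
{
    private final int flag;
    public LineException(int flag) { this.flag = flag; }
    public int getFlag() { return flag; }
}

/** Parses one source line and exposes the key/value to emit. */
class TableLine
{
    private String imsi, cgi, timeFlag;
    private long time;   // epoch seconds

    public void set(String line, boolean isPos, String date, String[] timepoint) throws LineException
    {
        String[] f = line.split("\\|");
        // POS: IMSI|IMEI|UPDATETYPE|CGI|TIME    NET: IMSI|IMEI|CGI|TIME|CGI|URL
        imsi = f[0];
        cgi = isPos ? f[3] : f[2];
        String timeStr = isPos ? f[4] : f[3];
        if (!timeStr.startsWith(date))
            throw new LineException(0);                      // wrong day -> TIMESKIP

        try
        {
            java.text.SimpleDateFormat fmt = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            time = fmt.parse(timeStr).getTime() / 1000L;
        }
        catch (java.text.ParseException e)
        {
            throw new LineException(0);                      // malformed timestamp -> TIMESKIP
        }

        // Find the period this hour falls into, e.g. timepoint {"09","17","24"} -> 00-09, 09-17, 17-24.
        int hour = Integer.parseInt(timeStr.substring(11, 13));
        String previous = "00";
        for (String point : timepoint)
        {
            if (hour < Integer.parseInt(point))
            {
                timeFlag = previous + "-" + point;
                return;
            }
            previous = point;
        }
        throw new LineException(-1);                         // outside all periods -> OUTOFTIMESKIP
    }

    public Text outKey()   { return new Text(imsi + "|" + timeFlag); }
    public Text outValue() { return new Text(cgi + "|" + time); }
}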


4. Interview questions

http://blog.csdn.net/qq_26442553/article/details/78718796