mapreduce实现"浏览该商品的人大多数还浏览了"经典应用
2014-06-06 17:17
183 查看
转自:/article/10741299.html
看着思路不错。是协同过滤的一种实现。
注意:\001是分隔符,八进制转义(ASCII 码为 1),叫SOH(start of heading)。用vi编辑器Ctrl+v然后Ctrl+a输入。其他常用的分隔符有tab键(ASCII 为9)、空格(ASCII 为32)
输入:
日期 ...cookie id. ...商品id..
xx xx xx
输出:
商品id 商品id列表(按优先级排序,用逗号分隔)
xx xx
比如:
id1 id3,id0,id4,id2
id2 id0,id5
整个计算过程分为4步
1、提取原始日志中的(日期,cookie id,商品id)信息,按天处理,最后输出数据格式
商品id-0 商品id-1
xx x x
这一步做了次优化,商品id-0一定比商品id-1小,为了减少存储,在最后汇总数据转置下即可
reduce做局部排序及排重
2、基于上次的结果做汇总,按天计算
商品id-0 商品id-1 关联值(关联值即同时访问这两个商品的用户数)
xx x x xx
3、汇总最近三个月数据,同时考虑时间衰减,时间越久关联值的贡献越低,最后输出两两商品的关联值(包括转置后)
4、行列转换,生成最后要的推荐结果数据,按关联值排序生成
第一个MR
[java]
view plaincopyprint?
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/*
* 输入:原始数据,会有重复
*日期 cookie 楼盘id
*
* 输出:
* 日期 楼盘id1 楼盘id2 //楼盘id1一定小于楼盘id2 ,按日期 cookie进行分组
*
*/
public class HouseMergeAndSplit {
public staticclass Partitioner1
extends Partitioner<TextPair, Text> {
@Override
public int getPartition(TextPair key, Text value,int numParititon) {
return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() *127) % numParititon;
}
}
public staticclass Comp1
extends WritableComparator {
public Comp1() {
super(TextPair.class,true);
}
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
TextPair t1 = (TextPair) a;
TextPair t2 = (TextPair) b;
int comp= t1.getFirst().compareTo(t2.getFirst());
if (comp!=0)
return comp;
return t1.getSecond().compareTo(t2.getSecond());
}
}
public staticclass TokenizerMapper
extends Mapper<LongWritable, Text, TextPair, Text>{
Text val=new Text("test");
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String s[]=value.toString().split("\001");
TextPair tp=new TextPair(s[0],s[1],s[4]+s[3]);//thedate
cookie city+houseid
context.write(tp, val);
}
}
public staticclass IntSumReducer
extends Reducer<TextPair,Text,Text,Text> {
private static String comparedColumn[] =new String[3];
ArrayList<String> houselist= new ArrayList<String>();
private static Text keyv =new Text();
private static Text valuev =new Text();
static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());
public void reduce(TextPair key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
houselist.clear();
String thedate=key.getFirst().toString();
String cookie=key.getSecond().toString();
for (int i=0;i<3;i++)
comparedColumn[i]="";
//first+second为分组键,每次不同重新调用reduce函数
for (Text val:values)
{
if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])&& !key.getThree().toString().equals(comparedColumn[2]))
{
// context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" first"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));
houselist.add(key.getThree().toString());
comparedColumn[0]=key.getFirst().toString();
comparedColumn[1]=key.getSecond().toString();
comparedColumn[2]=key.getThree().toString();
}
if (!thedate.equals(comparedColumn[0])||!cookie.equals(comparedColumn[1]))
{
// context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" second"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));
houselist.add(key.getThree().toString());
comparedColumn[0]=key.getFirst().toString();
comparedColumn[1]=key.getSecond().toString();
comparedColumn[2]=key.getThree().toString();
}
}
keyv.set(comparedColumn[0]);
//日期
//valuev.set(houselist.toString());
//logger.info(houselist.toString());
//context.write(keyv,valuev);
for (int i=0;i<houselist.size()-1;i++)
{
for (int j=i+1;j<houselist.size();j++)
{ valuev.set(houselist.get(i)+" "+houselist.get(j));//关联的楼盘
context.write(keyv,valuev);
}
}
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=
2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,
"HouseMergeAndSplit");
job.setNumReduceTasks(4);
job.setJarByClass(HouseMergeAndSplit.class);
job.setMapperClass(TokenizerMapper.class);
job.setMapOutputKeyClass(TextPair.class);
job.setMapOutputValueClass(Text.class);
// 设置partition
job.setPartitionerClass(Partitioner1.class);
// 在分区之后按照指定的条件分组
job.setGroupingComparatorClass(Comp1.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
TextPair
[java]
view plaincopyprint?
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class TextPairimplements WritableComparable<TextPair> {
private Text first;
private Text second;
private Text three;
public TextPair() {
set(new Text(),
new Text(),new Text());
}
public TextPair(String first, String second,String three) {
set(new Text(first), new Text(second),new Text(three));
}
public TextPair(Text first, Text second,Text Three) {
set(first, second,three);
}
public void set(Text first, Text second,Text three) {
this.first = first;
this.second = second;
this.three=three;
}
public Text getFirst() {
return first;
}
public Text getSecond() {
return second;
}
public Text getThree() {
return three;
}
public void write(DataOutput out)throws IOException {
first.write(out);
second.write(out);
three.write(out);
}
public void readFields(DataInput in)throws IOException {
first.readFields(in);
second.readFields(in);
three.readFields(in);
}
public int compareTo(TextPair tp) {
int cmp = first.compareTo(tp.first);
if (cmp !=
0) {
return cmp;
}
cmp= second.compareTo(tp.second);
if (cmp !=
0) {
return cmp;
}
return three.compareTo(tp.three);
}
}
TextPairSecond
[java]
view plaincopyprint?
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class TextPairSecondimplements WritableComparable<TextPairSecond> {
private Text first;
private FloatWritable second;
public TextPairSecond() {
set(new Text(),
new FloatWritable());
}
public TextPairSecond(String first,float second) {
set(new Text(first), new FloatWritable(second));
}
public TextPairSecond(Text first, FloatWritable second) {
set(first, second);
}
public void set(Text first, FloatWritable second) {
this.first = first;
this.second = second;
}
public Text getFirst() {
return first;
}
public FloatWritable getSecond() {
return second;
}
public void write(DataOutput out)throws IOException {
first.write(out);
second.write(out);
}
public void readFields(DataInput in)throws IOException {
first.readFields(in);
second.readFields(in);
}
public int compareTo(TextPairSecond tp) {
int cmp = first.compareTo(tp.first);
if (cmp !=
0) {
return cmp;
}
return second.compareTo(tp.second);
}
}
第二个MR
[java]
view plaincopyprint?
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/*
* 统计楼盘之间共同出现的次数
* 输入:
* 日期 楼盘1 楼盘2
*
* 输出:
* 日期 楼盘1 楼盘2 共同出现的次数
*
*/
public class HouseCount {
public staticclass TokenizerMapper
extends Mapper<LongWritable, Text, Text, IntWritable>{
IntWritable iw=new IntWritable(1);
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
context.write(value, iw);
}
}
public staticclass IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
IntWritable result=new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum=0;
for (IntWritable iw:values)
{
sum+=iw.get();
}
result.set(sum);
context.write(key, result) ;
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=
2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,
"HouseCount");
job.setNumReduceTasks(2);
job.setJarByClass(HouseCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
第三个MR
[java]
view plaincopyprint?
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/*
* 汇总近三个月统计楼盘之间共同出现的次数,考虑衰减系数, 并最后a b 转成 b a输出一次
* 输入:
* 日期 楼盘1 楼盘2 共同出现的次数
*
* 输出
* 楼盘1 楼盘2 共同出现的次数(考虑了衰减系数,每天的衰减系数不一样)
*
*/
public class HouseCountHz {
public staticclass HouseCountHzMapper
extends Mapper<LongWritable, Text, Text, FloatWritable>{
Text keyv=new Text();
FloatWritable valuev=new FloatWritable();
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String[] s=value.toString().split("\t");
keyv.set(s[1]+" "+s[2]);//楼盘1,楼盘2
Calendar date1=Calendar.getInstance();
Calendar d2=Calendar.getInstance();
Date b = null;
SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");
try {
b=sdf.parse(s[0]);
} catch (ParseException e) {
e.printStackTrace();
}
d2.setTime(b);
long n=date1.getTimeInMillis();
long birth=d2.getTimeInMillis();
long sss=n-birth;
int day=(int)((sss)/(3600*24*1000));//该条记录的日期与当前日期的日期差
float factor=1/(1+(float)(day-1)/10);//衰减系数
valuev.set(Float.parseFloat(s[3])*factor);
context.write(keyv, valuev);
}
}
public staticclass HouseCountHzReducer
extends Reducer<Text,FloatWritable,Text,FloatWritable> {
FloatWritable result=new FloatWritable();
Text keyreverse=new Text();
public void reduce(Text key, Iterable<FloatWritable> values,
Context context
) throws IOException, InterruptedException {
float sum=0;
for (FloatWritable iw:values)
{
sum+=iw.get();
}
result.set(sum);
String[] keys=key.toString().split("\t");
keyreverse.set(keys[1]+" "+keys[0]);
context.write(key, result) ;
context.write(keyreverse, result) ;
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,"HouseCountHz");
job.setNumReduceTasks(2);
job.setJarByClass(HouseCountHz.class);
job.setMapperClass(HouseCountHzMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FloatWritable.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(HouseCountHzReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
第四个MR
[java]
view plaincopyprint?
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
* 输入数据:
* 楼盘1 楼盘2 共同出现的次数
*
* 输出数据
* 楼盘1 楼盘2,楼盘3,楼盘4 (按次数排序)
*/
public class HouseRowToCol {
public staticclass Partitioner1
extends Partitioner<TextPairSecond, Text> {
@Override
//分区
public int getPartition(TextPairSecond key, Text value,int numParititon) {
return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() *127) % numParititon;
}
}
//分组
public staticclass Comp1
extends WritableComparator {
public Comp1() {
super(TextPairSecond.class,true);
}
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
TextPairSecond t1 = (TextPairSecond) a;
TextPairSecond t2 = (TextPairSecond) b;
return t1.getFirst().compareTo(t2.getFirst());
}
}
//排序
public staticclass KeyComp
extends WritableComparator {
public KeyComp() {
super(TextPairSecond.class,true);
}
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
TextPairSecond t1 = (TextPairSecond) a;
TextPairSecond t2 = (TextPairSecond) b;
int comp= t1.getFirst().compareTo(t2.getFirst());
if (comp!=0)
return comp;
return -t1.getSecond().compareTo(t2.getSecond());
}
}
public staticclass HouseRowToColMapper
extends Mapper<LongWritable, Text, TextPairSecond, Text>{
Text houseid1=new Text();
Text houseid2=new Text();
FloatWritable weight=new FloatWritable();
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String s[]=value.toString().split("\t");
weight.set(Float.parseFloat(s[2]));
houseid1.set(s[0]);
houseid2.set(s[1]);
TextPairSecond tp=new TextPairSecond(houseid1,weight);
context.write(tp, houseid2);
}
}
public staticclass HouseRowToColReducer
extends Reducer<TextPairSecond,Text,Text,Text> {
Text valuev=new Text();
public void reduce(TextPairSecond key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
Text keyv=key.getFirst();
Iterator<Text> it=values.iterator();
StringBuilder sb=new StringBuilder(it.next().toString());
while(it.hasNext())
{
sb.append(","+it.next().toString());
}
valuev.set(sb.toString());
context.write(keyv, valuev);
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=
2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,
"HouseRowToCol");
job.setNumReduceTasks(4);
job.setJarByClass(HouseRowToCol.class);
job.setMapperClass(HouseRowToColMapper.class);
job.setMapOutputKeyClass(TextPairSecond.class);
job.setMapOutputValueClass(Text.class);
// 设置partition
job.setPartitionerClass(Partitioner1.class);
// 在分区之后按照指定的条件分组
job.setGroupingComparatorClass(Comp1.class);
job.setSortComparatorClass(KeyComp.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(HouseRowToColReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
看着思路不错。是协同过滤的一种实现。
注意:\001是分隔符,八进制转义(ASCII 码为 1),叫SOH(start of heading)。用vi编辑器Ctrl+v然后Ctrl+a输入。其他常用的分隔符有tab键(ASCII 为9)、空格(ASCII 为32)
输入:
日期 ...cookie id. ...商品id..
xx xx xx
输出:
商品id 商品id列表(按优先级排序,用逗号分隔)
xx xx
比如:
id1 id3,id0,id4,id2
id2 id0,id5
整个计算过程分为4步
1、提取原始日志中的(日期,cookie id,商品id)信息,按天处理,最后输出数据格式
商品id-0 商品id-1
xx x x
这一步做了次优化,商品id-0一定比商品id-1小,为了减少存储,在最后汇总数据转置下即可
reduce做局部排序及排重
2、基于上次的结果做汇总,按天计算
商品id-0 商品id-1 关联值(关联值即同时访问这两个商品的用户数)
xx x x xx
3、汇总最近三个月数据,同时考虑时间衰减,时间越久关联值的贡献越低,最后输出两两商品的关联值(包括转置后)
4、行列转换,生成最后要的推荐结果数据,按关联值排序生成
第一个MR
[java]
view plaincopyprint?
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/*
* 输入:原始数据,会有重复
*日期 cookie 楼盘id
*
* 输出:
* 日期 楼盘id1 楼盘id2 //楼盘id1一定小于楼盘id2 ,按日期 cookie进行分组
*
*/
public class HouseMergeAndSplit {
public staticclass Partitioner1
extends Partitioner<TextPair, Text> {
@Override
public int getPartition(TextPair key, Text value,int numParititon) {
return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() *127) % numParititon;
}
}
public staticclass Comp1
extends WritableComparator {
public Comp1() {
super(TextPair.class,true);
}
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
TextPair t1 = (TextPair) a;
TextPair t2 = (TextPair) b;
int comp= t1.getFirst().compareTo(t2.getFirst());
if (comp!=0)
return comp;
return t1.getSecond().compareTo(t2.getSecond());
}
}
public staticclass TokenizerMapper
extends Mapper<LongWritable, Text, TextPair, Text>{
Text val=new Text("test");
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String s[]=value.toString().split("\001");
TextPair tp=new TextPair(s[0],s[1],s[4]+s[3]);//thedate
cookie city+houseid
context.write(tp, val);
}
}
public staticclass IntSumReducer
extends Reducer<TextPair,Text,Text,Text> {
private static String comparedColumn[] =new String[3];
ArrayList<String> houselist= new ArrayList<String>();
private static Text keyv =new Text();
private static Text valuev =new Text();
static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName());
public void reduce(TextPair key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
houselist.clear();
String thedate=key.getFirst().toString();
String cookie=key.getSecond().toString();
for (int i=0;i<3;i++)
comparedColumn[i]="";
//first+second为分组键,每次不同重新调用reduce函数
for (Text val:values)
{
if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])&& !key.getThree().toString().equals(comparedColumn[2]))
{
// context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" first"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));
houselist.add(key.getThree().toString());
comparedColumn[0]=key.getFirst().toString();
comparedColumn[1]=key.getSecond().toString();
comparedColumn[2]=key.getThree().toString();
}
if (!thedate.equals(comparedColumn[0])||!cookie.equals(comparedColumn[1]))
{
// context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" second"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2]));
houselist.add(key.getThree().toString());
comparedColumn[0]=key.getFirst().toString();
comparedColumn[1]=key.getSecond().toString();
comparedColumn[2]=key.getThree().toString();
}
}
keyv.set(comparedColumn[0]);
//日期
//valuev.set(houselist.toString());
//logger.info(houselist.toString());
//context.write(keyv,valuev);
for (int i=0;i<houselist.size()-1;i++)
{
for (int j=i+1;j<houselist.size();j++)
{ valuev.set(houselist.get(i)+" "+houselist.get(j));//关联的楼盘
context.write(keyv,valuev);
}
}
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=
2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,
"HouseMergeAndSplit");
job.setNumReduceTasks(4);
job.setJarByClass(HouseMergeAndSplit.class);
job.setMapperClass(TokenizerMapper.class);
job.setMapOutputKeyClass(TextPair.class);
job.setMapOutputValueClass(Text.class);
// 设置partition
job.setPartitionerClass(Partitioner1.class);
// 在分区之后按照指定的条件分组
job.setGroupingComparatorClass(Comp1.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
import java.io.IOException; import java.util.ArrayList; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.log4j.Logger; /* * 输入:原始数据,会有重复 *日期 cookie 楼盘id * * 输出: * 日期 楼盘id1 楼盘id2 //楼盘id1一定小于楼盘id2 ,按日期 cookie进行分组 * */ public class HouseMergeAndSplit { public static class Partitioner1 extends Partitioner<TextPair, Text> { @Override public int getPartition(TextPair key, Text value, int numParititon) { return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() * 127) % numParititon; } } public static class Comp1 extends WritableComparator { public Comp1() { super(TextPair.class, true); } @SuppressWarnings("unchecked") public int compare(WritableComparable a, WritableComparable b) { TextPair t1 = (TextPair) a; TextPair t2 = (TextPair) b; int comp= t1.getFirst().compareTo(t2.getFirst()); if (comp!=0) return comp; return t1.getSecond().compareTo(t2.getSecond()); } } public static class TokenizerMapper extends Mapper<LongWritable, Text, TextPair, Text>{ Text val=new Text("test"); public void map(LongWritable key, Text value, Context context ) throws IOException, InterruptedException { String s[]=value.toString().split("\001"); TextPair tp=new TextPair(s[0],s[1],s[4]+s[3]); //thedate cookie city+houseid context.write(tp, val); } } public static class IntSumReducer extends Reducer<TextPair,Text,Text,Text> { private static String 
comparedColumn[] = new String[3]; ArrayList<String> houselist= new ArrayList<String>(); private static Text keyv = new Text(); private static Text valuev = new Text(); static Logger logger = Logger.getLogger(HouseMergeAndSplit.class.getName()); public void reduce(TextPair key, Iterable<Text> values, Context context ) throws IOException, InterruptedException { houselist.clear(); String thedate=key.getFirst().toString(); String cookie=key.getSecond().toString(); for (int i=0;i<3;i++) comparedColumn[i]=""; //first+second为分组键,每次不同重新调用reduce函数 for (Text val:values) { if (thedate.equals(comparedColumn[0]) && cookie.equals(comparedColumn[1])&& !key.getThree().toString().equals(comparedColumn[2])) { // context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" first"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2])); houselist.add(key.getThree().toString()); comparedColumn[0]=key.getFirst().toString(); comparedColumn[1]=key.getSecond().toString(); comparedColumn[2]=key.getThree().toString(); } if (!thedate.equals(comparedColumn[0])||!cookie.equals(comparedColumn[1])) { // context.write(new Text(key.getFirst()+" "+key.getSecond().toString()), new Text(key.getThree().toString()+" second"+ " "+comparedColumn[0]+" "+comparedColumn[1]+" "+comparedColumn[2])); houselist.add(key.getThree().toString()); comparedColumn[0]=key.getFirst().toString(); comparedColumn[1]=key.getSecond().toString(); comparedColumn[2]=key.getThree().toString(); } } keyv.set(comparedColumn[0]); //日期 //valuev.set(houselist.toString()); //logger.info(houselist.toString()); //context.write(keyv,valuev); for (int i=0;i<houselist.size()-1;i++) { for (int j=i+1;j<houselist.size();j++) { valuev.set(houselist.get(i)+" "+houselist.get(j)); //关联的楼盘 context.write(keyv,valuev); } } } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, 
args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2); } FileSystem fstm = FileSystem.get(conf); Path outDir = new Path(otherArgs[1]); fstm.delete(outDir, true); conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符 Job job = new Job(conf, "HouseMergeAndSplit"); job.setNumReduceTasks(4); job.setJarByClass(HouseMergeAndSplit.class); job.setMapperClass(TokenizerMapper.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); // 设置partition job.setPartitionerClass(Partitioner1.class); // 在分区之后按照指定的条件分组 job.setGroupingComparatorClass(Comp1.class); // 设置reduce // 设置reduce的输出 job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); //job.setNumReduceTasks(18); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
TextPair
[java]
view plaincopyprint?
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class TextPairimplements WritableComparable<TextPair> {
private Text first;
private Text second;
private Text three;
public TextPair() {
set(new Text(),
new Text(),new Text());
}
public TextPair(String first, String second,String three) {
set(new Text(first), new Text(second),new Text(three));
}
public TextPair(Text first, Text second,Text Three) {
set(first, second,three);
}
public void set(Text first, Text second,Text three) {
this.first = first;
this.second = second;
this.three=three;
}
public Text getFirst() {
return first;
}
public Text getSecond() {
return second;
}
public Text getThree() {
return three;
}
public void write(DataOutput out)throws IOException {
first.write(out);
second.write(out);
three.write(out);
}
public void readFields(DataInput in)throws IOException {
first.readFields(in);
second.readFields(in);
three.readFields(in);
}
public int compareTo(TextPair tp) {
int cmp = first.compareTo(tp.first);
if (cmp !=
0) {
return cmp;
}
cmp= second.compareTo(tp.second);
if (cmp !=
0) {
return cmp;
}
return three.compareTo(tp.three);
}
}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * Composite Writable key of three Text components ("plain copy" rendering,
 * reformatted from a single collapsed line), compared field by field.
 */
public class TextPair implements WritableComparable<TextPair> {
    private Text first;
    private Text second;
    private Text three;

    public TextPair() {
        set(new Text(), new Text(), new Text());
    }

    public TextPair(String first, String second, String three) {
        set(new Text(first), new Text(second), new Text(three));
    }

    public TextPair(Text first, Text second, Text three) {
        // BUG FIX: the original named this parameter "Three" and passed the
        // still-null FIELD "three" to set(), leaving the third component null.
        set(first, second, three);
    }

    public void set(Text first, Text second, Text three) {
        this.first = first;
        this.second = second;
        this.three = three;
    }

    public Text getFirst() {
        return first;
    }

    public Text getSecond() {
        return second;
    }

    public Text getThree() {
        return three;
    }

    public void write(DataOutput out) throws IOException {
        first.write(out);
        second.write(out);
        three.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        first.readFields(in);
        second.readFields(in);
        three.readFields(in);
    }

    public int compareTo(TextPair tp) {
        int cmp = first.compareTo(tp.first);
        if (cmp != 0) {
            return cmp;
        }
        cmp = second.compareTo(tp.second);
        if (cmp != 0) {
            return cmp;
        }
        return three.compareTo(tp.three);
    }
}
TextPairSecond
[java]
view plaincopyprint?
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class TextPairSecondimplements WritableComparable<TextPairSecond> {
private Text first;
private FloatWritable second;
public TextPairSecond() {
set(new Text(),
new FloatWritable());
}
public TextPairSecond(String first,float second) {
set(new Text(first), new FloatWritable(second));
}
public TextPairSecond(Text first, FloatWritable second) {
set(first, second);
}
public void set(Text first, FloatWritable second) {
this.first = first;
this.second = second;
}
public Text getFirst() {
return first;
}
public FloatWritable getSecond() {
return second;
}
public void write(DataOutput out)throws IOException {
first.write(out);
second.write(out);
}
public void readFields(DataInput in)throws IOException {
first.readFields(in);
second.readFields(in);
}
public int compareTo(TextPairSecond tp) {
int cmp = first.compareTo(tp.first);
if (cmp !=
0) {
return cmp;
}
return second.compareTo(tp.second);
}
}
import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; public class TextPairSecond implements WritableComparable<TextPairSecond> { private Text first; private FloatWritable second; public TextPairSecond() { set(new Text(), new FloatWritable()); } public TextPairSecond(String first, float second) { set(new Text(first), new FloatWritable(second)); } public TextPairSecond(Text first, FloatWritable second) { set(first, second); } public void set(Text first, FloatWritable second) { this.first = first; this.second = second; } public Text getFirst() { return first; } public FloatWritable getSecond() { return second; } public void write(DataOutput out) throws IOException { first.write(out); second.write(out); } public void readFields(DataInput in) throws IOException { first.readFields(in); second.readFields(in); } public int compareTo(TextPairSecond tp) { int cmp = first.compareTo(tp.first); if (cmp != 0) { return cmp; } return second.compareTo(tp.second); } }
第二个MR
[java]
view plain | copy | print
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/*
* 统计楼盘之间共同出现的次数
* 输入:
* 日期 楼盘1 楼盘2
*
* 输出:
* 日期 楼盘1 楼盘2 共同出现的次数
*
*/
public class HouseCount {
public staticclass TokenizerMapper
extends Mapper<LongWritable, Text, Text, IntWritable>{
IntWritable iw=new IntWritable(1);
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
context.write(value, iw);
}
}
public staticclass IntSumReducer
extends Reducer<Text,IntWritable,Text,IntWritable> {
IntWritable result=new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum=0;
for (IntWritable iw:values)
{
sum+=iw.get();
}
result.set(sum);
context.write(key, result) ;
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=
2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,
"HouseCount");
job.setNumReduceTasks(2);
job.setJarByClass(HouseCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.Mapper.Context; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.log4j.Logger; /* * 统计楼盘之间共同出现的次数 * 输入: * 日期 楼盘1 楼盘2 * * 输出: * 日期 楼盘1 楼盘2 共同出现的次数 * */ public class HouseCount { public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable>{ IntWritable iw=new IntWritable(1); public void map(LongWritable key, Text value, Context context ) throws IOException, InterruptedException { context.write(value, iw); } } public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> { IntWritable result=new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context ) throws IOException, InterruptedException { int sum=0; for (IntWritable iw:values) { sum+=iw.get(); } result.set(sum); context.write(key, result) ; } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2); } FileSystem fstm = 
FileSystem.get(conf); Path outDir = new Path(otherArgs[1]); fstm.delete(outDir, true); conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符 Job job = new Job(conf, "HouseCount"); job.setNumReduceTasks(2); job.setJarByClass(HouseCount.class); job.setMapperClass(TokenizerMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); // 设置reduce // 设置reduce的输出 job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); //job.setNumReduceTasks(18); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
第三个MR
[java]
view plain | copy | print
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
/*
* 汇总近三个月统计楼盘之间共同出现的次数,考虑衰减系数, 并最后a b 转成 b a输出一次
* 输入:
* 日期 楼盘1 楼盘2 共同出现的次数
*
* 输出
* 楼盘1 楼盘2 共同出现的次数(考虑了衰减系数,每天的衰减系数不一样)
*
*/
public class HouseCountHz {
public staticclass HouseCountHzMapper
extends Mapper<LongWritable, Text, Text, FloatWritable>{
Text keyv=new Text();
FloatWritable valuev=new FloatWritable();
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String[] s=value.toString().split("\t");
keyv.set(s[1]+" "+s[2]);//楼盘1,楼盘2
Calendar date1=Calendar.getInstance();
Calendar d2=Calendar.getInstance();
Date b = null;
SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");
try {
b=sdf.parse(s[0]);
} catch (ParseException e) {
e.printStackTrace();
}
d2.setTime(b);
long n=date1.getTimeInMillis();
long birth=d2.getTimeInMillis();
long sss=n-birth;
int day=(int)((sss)/(3600*24*1000));//该条记录的日期与当前日期的日期差
float factor=1/(1+(float)(day-1)/10);//衰减系数
valuev.set(Float.parseFloat(s[3])*factor);
context.write(keyv, valuev);
}
}
public staticclass HouseCountHzReducer
extends Reducer<Text,FloatWritable,Text,FloatWritable> {
FloatWritable result=new FloatWritable();
Text keyreverse=new Text();
public void reduce(Text key, Iterable<FloatWritable> values,
Context context
) throws IOException, InterruptedException {
float sum=0;
for (FloatWritable iw:values)
{
sum+=iw.get();
}
result.set(sum);
String[] keys=key.toString().split("\t");
keyreverse.set(keys[1]+" "+keys[0]);
context.write(key, result) ;
context.write(keyreverse, result) ;
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,"HouseCountHz");
job.setNumReduceTasks(2);
job.setJarByClass(HouseCountHz.class);
job.setMapperClass(HouseCountHzMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FloatWritable.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(HouseCountHzReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.Mapper.Context; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.log4j.Logger; /* * 汇总近三个月统计楼盘之间共同出现的次数,考虑衰减系数, 并最后a b 转成 b a输出一次 * 输入: * 日期 楼盘1 楼盘2 共同出现的次数 * * 输出 * 楼盘1 楼盘2 共同出现的次数(考虑了衰减系数,每天的衰减系数不一样) * */ public class HouseCountHz { public static class HouseCountHzMapper extends Mapper<LongWritable, Text, Text, FloatWritable>{ Text keyv=new Text(); FloatWritable valuev=new FloatWritable(); public void map(LongWritable key, Text value, Context context ) throws IOException, InterruptedException { String[] s=value.toString().split("\t"); keyv.set(s[1]+" "+s[2]);//楼盘1,楼盘2 Calendar date1=Calendar.getInstance(); Calendar d2=Calendar.getInstance(); Date b = null; SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd"); try { b=sdf.parse(s[0]); } catch (ParseException e) { e.printStackTrace(); } d2.setTime(b); long n=date1.getTimeInMillis(); long birth=d2.getTimeInMillis(); long sss=n-birth; int day=(int)((sss)/(3600*24*1000)); //该条记录的日期与当前日期的日期差 float factor=1/(1+(float)(day-1)/10); //衰减系数 
valuev.set(Float.parseFloat(s[3])*factor); context.write(keyv, valuev); } } public static class HouseCountHzReducer extends Reducer<Text,FloatWritable,Text,FloatWritable> { FloatWritable result=new FloatWritable(); Text keyreverse=new Text(); public void reduce(Text key, Iterable<FloatWritable> values, Context context ) throws IOException, InterruptedException { float sum=0; for (FloatWritable iw:values) { sum+=iw.get(); } result.set(sum); String[] keys=key.toString().split("\t"); keyreverse.set(keys[1]+" "+keys[0]); context.write(key, result) ; context.write(keyreverse, result) ; } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2); } FileSystem fstm = FileSystem.get(conf); Path outDir = new Path(otherArgs[1]); fstm.delete(outDir, true); conf.set("mapred.textoutputformat.separator", "\t"); //reduce输出时key value中间的分隔符 Job job = new Job(conf, "HouseCountHz"); job.setNumReduceTasks(2); job.setJarByClass(HouseCountHz.class); job.setMapperClass(HouseCountHzMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(FloatWritable.class); // 设置reduce // 设置reduce的输出 job.setReducerClass(HouseCountHzReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(FloatWritable.class); //job.setNumReduceTasks(18); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } }
第四个MR
[java]
view plain | copy | print
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
* 输入数据:
* 楼盘1 楼盘2 共同出现的次数
*
* 输出数据
* 楼盘1 楼盘2,楼盘3,楼盘4 (按次数排序)
*/
public class HouseRowToCol {
public staticclass Partitioner1
extends Partitioner<TextPairSecond, Text> {
@Override
//分区
public int getPartition(TextPairSecond key, Text value,int numParititon) {
return Math.abs((new Text(key.getFirst().toString()+key.getSecond().toString())).hashCode() *127) % numParititon;
}
}
//分组
public staticclass Comp1
extends WritableComparator {
public Comp1() {
super(TextPairSecond.class,true);
}
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
TextPairSecond t1 = (TextPairSecond) a;
TextPairSecond t2 = (TextPairSecond) b;
return t1.getFirst().compareTo(t2.getFirst());
}
}
//排序
public staticclass KeyComp
extends WritableComparator {
public KeyComp() {
super(TextPairSecond.class,true);
}
@SuppressWarnings("unchecked")
public int compare(WritableComparable a, WritableComparable b) {
TextPairSecond t1 = (TextPairSecond) a;
TextPairSecond t2 = (TextPairSecond) b;
int comp= t1.getFirst().compareTo(t2.getFirst());
if (comp!=0)
return comp;
return -t1.getSecond().compareTo(t2.getSecond());
}
}
public staticclass HouseRowToColMapper
extends Mapper<LongWritable, Text, TextPairSecond, Text>{
Text houseid1=new Text();
Text houseid2=new Text();
FloatWritable weight=new FloatWritable();
public void map(LongWritable key, Text value, Context context
) throws IOException, InterruptedException {
String s[]=value.toString().split("\t");
weight.set(Float.parseFloat(s[2]));
houseid1.set(s[0]);
houseid2.set(s[1]);
TextPairSecond tp=new TextPairSecond(houseid1,weight);
context.write(tp, houseid2);
}
}
public staticclass HouseRowToColReducer
extends Reducer<TextPairSecond,Text,Text,Text> {
Text valuev=new Text();
public void reduce(TextPairSecond key, Iterable<Text> values,
Context context
) throws IOException, InterruptedException {
Text keyv=key.getFirst();
Iterator<Text> it=values.iterator();
StringBuilder sb=new StringBuilder(it.next().toString());
while(it.hasNext())
{
sb.append(","+it.next().toString());
}
valuev.set(sb.toString());
context.write(keyv, valuev);
}
}
public staticvoid main(String[] args)
throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length !=
2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
FileSystem fstm = FileSystem.get(conf);
Path outDir = new Path(otherArgs[1]);
fstm.delete(outDir, true);
conf.set("mapred.textoutputformat.separator","\t");
//reduce输出时key value中间的分隔符
Job job = new Job(conf,
"HouseRowToCol");
job.setNumReduceTasks(4);
job.setJarByClass(HouseRowToCol.class);
job.setMapperClass(HouseRowToColMapper.class);
job.setMapOutputKeyClass(TextPairSecond.class);
job.setMapOutputValueClass(Text.class);
// 设置partition
job.setPartitionerClass(Partitioner1.class);
// 在分区之后按照指定的条件分组
job.setGroupingComparatorClass(Comp1.class);
job.setSortComparatorClass(KeyComp.class);
// 设置reduce
// 设置reduce的输出
job.setReducerClass(HouseRowToColReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//job.setNumReduceTasks(18);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ?0 :
1);
}
}
相关文章推荐
- mapreduce实现"浏览该商品的人大多数还浏览了"经典应用
- mapreduce实现"浏览该商品的人大多数还浏览了"经典应用
- mapreduce实现"浏览该商品的人大多数还浏览了"经典应用
- mapreduce实现"浏览该商品的人大多数还浏览了"经典应用
- cookie应用 显示上次浏览商品的实现过程
- 利用TreeView控件实现多级商品信息浏览
- 一步一步教你使用AgileEAS.NET基础类库进行应用开发-WinForm应用篇-复杂业务的实现(商品入库)-附案例操作视频
- 使用TreeView控件实现多级商品信息浏览
- 转: 利用SQL SERVER2005的XML字段类型实现类似"商品扩展属性"
- Ajax实现二级联动下拉框(经典AJAX应用)
- JQuery.cookie.js实现最近浏览过的商品
- 数据结构之应用 "栈(Stack)" 实现: 解析算术表达式及计算求值 (C#/Java)
- 使用" 参数化基类" 和" 成员函数指针" 模拟实现虚函数--在实际中的应用
- Cookies (php实现类似淘宝最近浏览商品的功能模型代码) 转
- Mapreduce中的RCFile输入RCFileInputFormat实现及其应用
- 一步一步教你使用AgileEAS.NET基础类库进行应用开发-WinForm应用篇-演示ORM对象与DataGridView的绑定技术-商品字典的另一个实现
- 用php实现类似淘宝,拍拍,易趣的最近浏览商品的功能模型代码
- 使用" 参数化基类" 和" 成员函数指针" 模拟实现虚函数--在实际中的应用
- 商城项目中cookie 实现历史浏览商品并排序