MapReduce之一——上网流量数据统计
2015-06-07 12:12
375 查看
1.原始日志数据:
日志格式:
2.DataBean
3.DataCount处理实现
4.执行结果
命令:
(注意:第三个参数是 reducer 数量,应大于等于 4,以便分区号 0~3 各自对应一个 reducer)
结果:
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200 1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200 1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200 1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200 1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200 1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200 1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200 1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200 1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200 1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200 1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200 1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200 1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200 1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200 1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200 1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200 1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200 1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200 1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200 1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200 1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 
24681 200 1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
日志格式:
2.DataBean
public class DataBean implements Writable{ private String telNO; private long upPlayLoad; private long downPlayLoad; private long totalPayLoad; public DataBean(){ } public DataBean(String telNO, long upPlayLoad, long downPlayLoad) { this.telNO = telNO; this.upPlayLoad = upPlayLoad; this.downPlayLoad = downPlayLoad; this.totalPayLoad=this.upPlayLoad+this.downPlayLoad; } //一定要注意下面两个方法的顺序、类型匹配 //反序列化 public void readFields(DataInput arg0) throws IOException { this.telNO = arg0.readUTF(); this.upPlayLoad = arg0.readLong(); this.downPlayLoad = arg0.readLong(); this.totalPayLoad = arg0.readLong(); } // 序列化 public void write(DataOutput arg0) throws IOException { arg0.writeUTF(telNO); arg0.writeLong(upPlayLoad); arg0.writeLong(downPlayLoad); arg0.writeLong(totalPayLoad); } public String getTelNO() { return telNO; } public void setTelNO(String telNO) { this.telNO = telNO; } public long getUpPlayLoad() { return upPlayLoad; } public void setUpPlayLoad(long upPlayLoad) { this.upPlayLoad = upPlayLoad; } public long getDownPlayLoad() { return downPlayLoad; } public void setDownPlayLoad(long downPlayLoad) { this.downPlayLoad = downPlayLoad; } public long getTotalPayLoad() { return totalPayLoad; } public void setTotalPayLoad(long totalPayLoad) { this.totalPayLoad = totalPayLoad; } //mapreduce自动调用 @Override public String toString() { return this.upPlayLoad+"\t"+this.downPlayLoad+"\t"+this.totalPayLoad; }
3.DataCount处理实现
public class DataCount { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf=new Configuration(); // 对conf配置,否则会读取全局的mapred-site.xml的配置 Job job=Job.getInstance(conf); job.setJarByClass(DataCount.class); job.setMapperClass(DCMapper.class); //当k2,v2和k3,v3类型一一对应时下面可以省略 //job.setOutputKeyClass(DataBean.class); //job.setOutputValueClass(DataBean.class); //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(DataBean.class); FileInputFormat.setInputPaths(job, new Path(args[0])); job.setReducerClass(DCReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DataBean.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); // 设置分区实现类 job.setPartitionerClass(ProviderPartitioner.class); //设置ruducer数量,即分区数,默认为1 job.setNumReduceTasks(Integer.parseInt(args[2])); job.waitForCompletion(true); } //在map执行完成、reducer执行前执行<k2, v2> public static class ProviderPartitioner extends Partitioner<Text, DataBean>{ private static Map<String, Integer> map= new HashMap<String, Integer>(); //运行命令时输入的分区参数只可以大于等于4 static{ map.put("150", 2); map.put("159", 2); map.put("134", 1); map.put("135", 1); map.put("136", 1); map.put("137", 1); map.put("138", 1); map.put("139", 1); map.put("183", 3); map.put("182", 3); } //参数分别是k2,V2,分区数 //返回值:分区号 @Override public int getPartition(Text key, DataBean value, int numPartitions) { String account=key.toString(); String sunAcc=account.substring(0, 3); Integer p=map.get(sunAcc); if (p==null) { p=0; } return p; } } public static class DCMapper extends Mapper<LongWritable, Text, Text, DataBean> { @Override protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, DataBean>.Context context) throws IOException, InterruptedException { //取数据、按制表副分割 String line=value.toString(); String[]fileds=line.split("\t"); String telNO=fileds[1]; Long up=Long.parseLong(fileds[8]); Long down=Long.parseLong(fileds[9]); //输出 context.write(new 
Text(telNO), new DataBean(telNO, up, down)); } } public static class DCReducer extends Reducer<Text, DataBean, Text, DataBean>{ @Override protected void reduce(Text key, Iterable<DataBean> v2s, Reducer<Text, DataBean, Text, DataBean>.Context context) throws IOException, InterruptedException { long upSum=0; long downSum=0; long totalSum=0; for (DataBean dataBean : v2s) { upSum+=dataBean.getUpPlayLoad(); downSum+=dataBean.getDownPlayLoad(); } context.write(key, new DataBean(key.toString(), upSum, downSum)); } } }
4.执行结果
命令:
(注意:第三个参数是 reducer 数量,应大于等于 4,以便分区号 0~3 各自对应一个 reducer)
hadoop jar rsort.jar cn.zx.hadoop.mr.dc.DataCount /test/HTTP_20130313143750.dat /test/tinfo 4
结果:
hdfs dfs -cat /test/tinfo/part-r-00002
相关文章推荐
- [最小生成树]清扫
- ubuntu 查看apt-get有哪些软件
- java大整数的乘方问题处理
- 文章之间的基本总结Activity生命周期
- 百度地图API 密钥
- 第13周项目2-形状类族的中的纯虚函数
- OJ硬币组合数量的分析
- JPA 不在 persistence.xml 文件中配置每个Entity实体类的2种解决办法
- GRMustache的使用(HTML模板渲染工具)For iOS
- 03.共享程序集和强命名程序集
- C中的继承和多态
- linux 操作系统的安装,本地登录及远程登录,vnc连接操作详细步骤
- ContentProvider ContentResolver ContentObserver 内容:提供、访问、监听
- [Network] HTML、XML和JSON学习汇总
- jms入门学习总结
- 115 tomcat架构分析 (Session管理)
- Struts2中的链接标签 <s:url>和<s:a>---在action中获取jsp表单提交的参数(转)
- erlang vm 内存分布整理
- [高精度][BZOJ1002][FJOI2007]轮状病毒
- 第13周项目1-动物这样叫(3)