大数据之Hive之扩展项目Youtube案例<二>
2017-12-27 20:42
387 查看
该项目的pom.xml文件:
<!-- Maven build descriptor for the YouTube ETL MapReduce job (artifact com.z:youtube, packaged as a jar). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.z</groupId>
  <artifactId>youtube</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>youtube</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <repositories>
    <repository>
      <id>centor</id>
      <!-- The former http://central.maven.org/maven2/ host is no longer served and Maven Central
           only accepts HTTPS, so the canonical HTTPS endpoint is used instead. -->
      <url>https://repo.maven.apache.org/maven2/</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <!-- Hadoop client and YARN resource-manager artifacts, both pinned to 2.7.2. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
      <version>2.7.2</version>
    </dependency>
  </dependencies>
</project>
3.6.1、ETL之ETLUtils
package com.z.youtube.util;public class ETLUtils { /** * 1、过滤不合法数据 * 2、去掉&符号左右两边的空格 * 3、\t换成&符号 * @param ori * @return */ public static String getETLString(String ori){ String[] splits = ori.split("\t"); //1、过滤不合法数据 if(splits.length < 9) return null; //2、去掉&符号左右两边的空格 splits[3] = splits[3].replaceAll(" ", ""); StringBuilder sb = new StringBuilder(); //3、\t换成&符号 for(int i = 0; i < splits.length; i++){ sb.append(splits[i]); if(i < 9){ if(i != splits.length - 1){ sb.append("\t"); } }else{ if(i != splits.length - 1){ sb.append("&"); } } } return sb.toString(); }} |
3.6.2、ETL之Mapper
package com.z.youtube.mr.etl;import java.io.IOException;import org.apache.commons.lang.StringUtils;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import com.z.youtube.util.ETLUtil;public class VideoETLMapper extends Mapper<Object, Text, NullWritable, Text>{ Text text = new Text(); @Override protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { String etlString = ETLUtil.oriString2ETLString(value.toString()); if(StringUtils.isBlank(etlString)) return; text.set(etlString); context.write(NullWritable.get(), text); }} |
3.6.3、ETL之Runner
package com.z.youtube.mr.etl;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;public class VideoETLRunner implements Tool { private Configuration conf = null; @Override public void setConf(Configuration conf) { this.conf = conf; } @Override public Configuration getConf() { return this.conf; } @Override public int run(String[] args) throws Exception { conf = this.getConf(); conf.set("inpath", args[0]); conf.set("outpath", args[1]); Job job = Job.getInstance(conf, "youtube-video-etl"); job.setJarByClass(VideoETLRunner.class); job.setMapperClass(VideoETLMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setNumReduceTasks(0); this.initJobInputPath(job); this.initJobOutputPath(job); return job.waitForCompletion(true) ? 
0 : 1; } private void initJobOutputPath(Job job) throws IOException { Configuration conf = job.getConfiguration(); String outPathString = conf.get("outpath"); FileSystem fs = FileSystem.get(conf); Path outPath = new Path(outPathString); if(fs.exists(outPath)){ fs.delete(outPath, true); } FileOutputFormat.setOutputPath(job, outPath); } private void initJobInputPath(Job job) throws IOException { Configuration conf = job.getConfiguration(); String inPathString = conf.get("inpath"); FileSystem fs = FileSystem.get(conf); Path inPath = new Path(inPathString); if(fs.exists(inPath)){ FileInputFormat.addInputPath(job, inPath); }else{ throw new RuntimeException("HDFS中该文件目录不存在:" + inPathString); } } public static void main(String[] args) { try { int resultCode = ToolRunner.run(new VideoETLRunner(), args); if(resultCode == 0){ System.out.println("Success!"); }else{ System.out.println("Fail!"); } System.exit(resultCode); } catch (Exception e) { e.printStackTrace(); System.exit(1); } }} |
3.6.4、执行ETL
赠送编译打包命令提示:mvn -P local clean package
$ bin/yarn jar ~/softwares/jars/youtube-0.0.1-SNAPSHOT.jar \
com.z.youtube.mr.etl.VideoETLRunner \
/youtube/video/2008/0222 \
/youtube/output/video/2008/0222
相关文章推荐
- 大数据之Hive之扩展项目Youtube案例<一>
- 大数据之Hive之扩展项目Youtube案例<三>
- 第四周(项目四扩展三)——数组做数据成员<数据存入文件>
- 第四周(项目四扩展2)——数组做数据成员<读取文件>
- 大数据之Hive<二>
- 一起学Maven(eclipse项目构建)<二>
- NDK<二> 基本数据类型调用
- <Netty>(二十三)(项目篇)Netty准备之数据分库分表的策略
- <java EE 项目:Musicstore>项目结构分析: 项目的3层结构之间的关系 :(表示层,业务层,数据层)
- 大数据之Sqoop <二>
- SharePoint 2013 列表关于大数据的测试<二>
- 使用SimpleXml解析xml数据<二>
- c#List泛型数据扩展,把List<>型数据格式化成List<SelectListItem>,用来作dropdownlist的数据
- 大数据学习笔记<二>
- 通过项目逐步深入了解Mybatis<二>
- solr5.5.0连接oracle数据库导入数据<二>
- Java后台开发<二>:Spirng+SpringMVC+Maven+Mybatis+MySQL项目搭建
- WCF扩展:行为扩展Behavior Extension<二>
- YII做项目总结<二>(后续不定时更新)
- 什么是人工智能、机器学习、深度学习、数据挖掘以及数据分析?<二>