您的位置:首页 > 大数据

大数据之Hive之扩展项目Youtube案例<二>

2017-12-27 20:42 387 查看
该项目的pom.xml文件:
<!-- Maven build descriptor for the youtube ETL job (packaged as a plain jar). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
       <modelVersion>4.0.0</modelVersion>

       <groupId>com.z</groupId>
       <artifactId>youtube</artifactId>
       <version>0.0.1-SNAPSHOT</version>
       <packaging>jar</packaging>

       <name>youtube</name>
       <url>http://maven.apache.org</url>

       <properties>
              <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
       </properties>

       <repositories>
              <!-- Maven Central is only reachable over HTTPS; the old
                   http://central.maven.org host has been retired. -->
              <repository>
                     <id>central</id>
                     <url>https://repo.maven.apache.org/maven2/</url>
              </repository>
       </repositories>

       <dependencies>
              <dependency>
                     <groupId>junit</groupId>
                     <artifactId>junit</artifactId>
                     <version>3.8.1</version>
                     <scope>test</scope>
              </dependency>
              <!-- Hadoop client APIs (MapReduce, HDFS) used by the ETL mapper and runner. -->
              <dependency>
                     <groupId>org.apache.hadoop</groupId>
                     <artifactId>hadoop-client</artifactId>
                     <version>2.7.2</version>
              </dependency>
              <dependency>
                     <groupId>org.apache.hadoop</groupId>
                     <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
                     <version>2.7.2</version>
              </dependency>
       </dependencies>
</project>

3.6.1、ETL之ETLUtils

package com.z.youtube.util;public class ETLUtils {       /**        * 1、过滤不合法数据        * 2、去掉&符号左右两边的空格        * 3、\t换成&符号        * @param ori        * @return        */       public static String getETLString(String ori){              String[] splits = ori.split("\t");              //1、过滤不合法数据              if(splits.length < 9) return null;              //2、去掉&符号左右两边的空格              splits[3] = splits[3].replaceAll(" ", "");              StringBuilder sb = new StringBuilder();              //3、\t换成&符号              for(int i = 0; i < splits.length; i++){                     sb.append(splits[i]);                     if(i < 9){                            if(i != splits.length - 1){                                   sb.append("\t");                            }                     }else{                            if(i != splits.length - 1){                                   sb.append("&");                            }                     }              }              return sb.toString();       }}

3.6.2、ETL之Mapper

package com.z.youtube.mr.etl;import java.io.IOException;import org.apache.commons.lang.StringUtils;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import com.z.youtube.util.ETLUtil;public class VideoETLMapper extends Mapper<Object, Text, NullWritable, Text>{       Text text = new Text();       @Override       protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {              String etlString = ETLUtil.oriString2ETLString(value.toString());              if(StringUtils.isBlank(etlString)) return;                           text.set(etlString);              context.write(NullWritable.get(), text);       }}

3.6.3、ETL之Runner

package com.z.youtube.mr.etl;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.NullWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;public class VideoETLRunner implements Tool {       private Configuration conf = null;       @Override       public void setConf(Configuration conf) {              this.conf = conf;       }       @Override       public Configuration getConf() {              return this.conf;       }       @Override       public int run(String[] args) throws Exception {              conf = this.getConf();              conf.set("inpath", args[0]);              conf.set("outpath", args[1]);              Job job = Job.getInstance(conf, "youtube-video-etl");                       job.setJarByClass(VideoETLRunner.class);                           job.setMapperClass(VideoETLMapper.class);              job.setMapOutputKeyClass(NullWritable.class);              job.setMapOutputValueClass(Text.class);              job.setNumReduceTasks(0);                           this.initJobInputPath(job);              this.initJobOutputPath(job);                         return job.waitForCompletion(true) ? 
0 : 1;       }        private void initJobOutputPath(Job job) throws IOException {              Configuration conf = job.getConfiguration();              String outPathString = conf.get("outpath");                         FileSystem fs = FileSystem.get(conf);                           Path outPath = new Path(outPathString);              if(fs.exists(outPath)){                     fs.delete(outPath, true);              }                          FileOutputFormat.setOutputPath(job, outPath);                 }       private void initJobInputPath(Job job) throws IOException {              Configuration conf = job.getConfiguration();              String inPathString = conf.get("inpath");                       FileSystem fs = FileSystem.get(conf);                          Path inPath = new Path(inPathString);              if(fs.exists(inPath)){                     FileInputFormat.addInputPath(job, inPath);              }else{                     throw new RuntimeException("HDFS中该文件目录不存在:" + inPathString);              }       }       public static void main(String[] args) {              try {                     int resultCode = ToolRunner.run(new VideoETLRunner(), args);                     if(resultCode == 0){                            System.out.println("Success!");                     }else{                            System.out.println("Fail!");                     }                     System.exit(resultCode);              } catch (Exception e) {                     e.printStackTrace();                     System.exit(1);              }       }}

3.6.4、执行ETL

赠送编译打包命令提示:-P local clean package
$ bin/yarn jar ~/softwares/jars/youtube-0.0.1-SNAPSHOT.jar \
com.z.youtube.mr.etl.VideoETLRunner \
/youtube/video/2008/0222 \
/youtube/output/video/2008/0222
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: