
Reading a GB2312-Encoded HDFS File with Spark: A WordCount Example

2015-02-09 11:52
The author's blog has moved to cnblogs: http://www.cnblogs.com/xiaodf/

Spark's built-in textFile() always decodes input as UTF-8 (each record is converted with Text.toString()), so reading a GB2312-encoded file with it produces mojibake. The example below instead reads the raw Text records via newAPIHadoopFile and decodes their bytes explicitly before running the usual WordCount steps.

import scala.Tuple2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.regex.Pattern;

/**
 * Reads an HDFS file encoded in GB2312 with Spark and runs WordCount on it.
 * Spark's textFile() cannot be used here because it always decodes records
 * as UTF-8, so the raw Text records are read with newAPIHadoopFile and
 * decoded explicitly.
 * <p/>
 * Submit command: spark-submit --class SparkJavaWordCount --master
 * yarn-cluster /tmp/sparkTest.jar hdfs://192.168.8.101/test/words
 * hdfs://192.168.8.101/test/spark/out
 *
 * @author xdf
 */
public class SparkJavaWordCount {
    // The input fields are comma-separated.
    private static final Pattern COMMA = Pattern.compile(",");

    @SuppressWarnings("serial")
    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.err.println("Usage: SparkJavaWordCount <input hdfs file> <output hdfs file>");
            System.exit(1);
        }
        String inputSparkFile = args[0];
        String outputSparkFile = args[1];

        SparkConf sparkConf = new SparkConf().setAppName("SparkWordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        Configuration conf = new Configuration();

        // Read the input file as raw (offset, line) records without
        // decoding the line bytes.
        JavaPairRDD<LongWritable, Text> contents = ctx.newAPIHadoopFile(
                inputSparkFile, TextInputFormat.class, LongWritable.class,
                Text.class, conf);

        // Decode each record as GB2312 to get an RDD of Strings. Note that
        // Text.getBytes() returns the backing array, which may be longer
        // than the record itself, so the getLength() argument is essential.
        JavaRDD<String> lines = contents
                .map(new Function<Tuple2<LongWritable, Text>, String>() {
                    @Override
                    public String call(Tuple2<LongWritable, Text> x) {
                        try {
                            return new String(x._2().getBytes(), 0,
                                    x._2().getLength(), "GB2312");
                        } catch (UnsupportedEncodingException e) {
                            // Returning null here would make the job fail
                            // later with an NPE; fail fast instead.
                            throw new RuntimeException(e);
                        }
                    }
                });

        // Split each line on the separator to get the individual words.
        JavaRDD<String> words = lines
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String s) {
                        return Arrays.asList(COMMA.split(s));
                    }
                });

        // Turn each word into a (word, 1) pair.
        JavaPairRDD<String, Integer> ones = words
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                });

        // Sum the counts of pairs that share the same key.
        JavaPairRDD<String, Integer> counts = ones
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });

        // Format the results and write them back to HDFS.
        counts.map(new Function<Tuple2<String, Integer>, String>() {
            @Override
            public String call(Tuple2<String, Integer> arg0) throws Exception {
                return arg0._1().toUpperCase() + ": " + arg0._2();
            }
        }).saveAsTextFile(outputSparkFile);

        ctx.stop();
    }
}
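A quick sanity check of what the job produces (the file contents here are hypothetical, chosen purely for illustration). If hdfs://192.168.8.101/test/words holds the single GB2312-encoded line

spark,hadoop,spark

the line is split on commas, the counts are reduced per word, and the output directory receives, in no guaranteed order:

SPARK: 2
HADOOP: 1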


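Incidentally, the checked UnsupportedEncodingException can be avoided altogether: since Java 6, String has a constructor that accepts a java.nio.charset.Charset. Below is a minimal sketch of that variant; the helper class name Gb2312Decoder is mine, not part of the original code.

import java.nio.charset.Charset;

import org.apache.hadoop.io.Text;

public class Gb2312Decoder {
    // Charset.forName throws the unchecked UnsupportedCharsetException if
    // the JRE lacks GB2312 support, so no checked handling is required.
    private static final Charset GB2312 = Charset.forName("GB2312");

    /** Decodes only the valid bytes of a Hadoop Text record as GB2312. */
    public static String decode(Text text) {
        return new String(text.getBytes(), 0, text.getLength(), GB2312);
    }
}

With this helper, the body of the map function above shrinks to return Gb2312Decoder.decode(x._2()); and the try/catch disappears.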
Tags: Spark, hdfs, encoding conversion