Spark UDAF
2016-02-11 20:11
246 查看
package cn.spark.study.udf;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.MutableAggregationBuffer;
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
// NOTE(review): partial excerpt — only dataType() is shown at this point in the
// article; the complete StringCount implementation (inputSchema, bufferSchema,
// initialize, update, merge, evaluate) appears further down.
public class StringCount extends UserDefinedAggregateFunction {
/**
* dataType: the return type of this aggregate function (an integer count).
*/
@Override
public DataType dataType() {
return DataTypes.IntegerType;
}
}
使用:
package cn.spark.study.udf;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
@SuppressWarnings(value={“unused”})
public class UdafSql {
sqlContext.udf.register(“myUDF”, (arg1: Int, arg2: String) => arg2 + arg1)
The following example registers a UDF in Java:
}, DataTypes.StringType);
Or, to use Java 8 lambda syntax:
sqlContext.udf().register(“myUDF”,
(Integer arg1, String arg2) -> arg2 + arg1,
DataTypes.StringType);
* @param args
*/
public static void main(String[] args) {
firstUdf();
}
}
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.MutableAggregationBuffer;
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
public class StringCount extends UserDefinedAggregateFunction {
/** * */ private static final long serialVersionUID = 1L; /** * inputSchema 指的是,输入数据的类型 */ @Override public StructType inputSchema() { StructField[] fields= {DataTypes.createStructField("str", DataTypes.StringType, true)}; StructType schema = DataTypes.createStructType(fields); return schema; } /** * bufferSchema 指的是,中间聚合时,所处理的数据的类型 */ @Override public StructType bufferSchema() { StructField[] fields= {DataTypes.createStructField("count", DataTypes.IntegerType, true)}; StructType schema = DataTypes.createStructType(fields); return schema; }
/**
* dataType ,指的是,函数返回值的类型
*/
@Override
public DataType dataType() {
return DataTypes.IntegerType;
}
@Override public boolean deterministic() { return true; } /** * 为每个分组的数据执行初始化操作 */ @Override public void initialize(MutableAggregationBuffer buffer) { buffer.update(0, 0); } /** * 指的是,每个分组,有新的值进来的时候,如何进行分组对应的聚合值的计算 */ @Override public void update(MutableAggregationBuffer buffer, Row row) { Integer bf = buffer.<Integer>getAs(0); buffer.update(0, bf+1); } @Override public void merge(MutableAggregationBuffer buffer1, Row buffer2) { Integer bf1 = buffer1.<Integer>getAs(0); Integer bf2 = buffer2.<Integer>getAs(0); buffer1.update(0, bf1+bf2); } /** * 最后,指的是,一个分组的聚合值,,如何通过中间的缓存聚合值,最后返回一个最终的聚合值 */ @Override public Object evaluate(Row buffer) { return buffer.<Integer>getAs(0); }
}
使用:
package cn.spark.study.udf;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
@SuppressWarnings(value={“unused”})
public class UdafSql {
/** * he following example registers a Scala closure as UDF:
sqlContext.udf.register(“myUDF”, (arg1: Int, arg2: String) => arg2 + arg1)
The following example registers a UDF in Java:
sqlContext.udf().register("myUDF", new UDF2<Integer, String, String>() { @Override public String call(Integer arg1, String arg2) { return arg2 + arg1; }
}, DataTypes.StringType);
Or, to use Java 8 lambda syntax:
sqlContext.udf().register(“myUDF”,
(Integer arg1, String arg2) -> arg2 + arg1,
DataTypes.StringType);
* @param args
*/
public static void main(String[] args) {
firstUdf();
}
private static void firstUdf(){ SparkConf conf = new SparkConf().setAppName("UdfSql").setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlct = new SQLContext(jsc); String[] str= {"Hpf99","Leo","Marray","Jack","Tom","Tom","Tom","Leo","Leo","Marray","Marray","Jack"}; List<String> lis = Arrays.asList(str); JavaRDD<String> strRdd = jsc.parallelize(lis); JavaRDD<Row> rowRdd = strRdd.mapPartitions(new FlatMapFunction<Iterator<String>, Row>() { /** * */ private static final long serialVersionUID = 1L; @Override public Iterable<Row> call(Iterator<String> t) throws Exception { 7d4c List<Row> lis = new ArrayList<Row>(); while(t.hasNext()){ String next = t.next(); Row create = RowFactory.create(next); lis.add(create); } return lis; } }); StructField[] fields= {DataTypes.createStructField("name", DataTypes.StringType, true)}; StructType schema = DataTypes.createStructType(fields); DataFrame rowDF = sqlct.createDataFrame(rowRdd, schema); rowDF.registerTempTable("names"); sqlct.udf().register("strCount", new StringCount()); DataFrame sql = sqlct.sql("SELECT name, strCount(name) as mycount,count(name) FROM names group by name"); sql.show(); jsc.close(); }
}
相关文章推荐
- 多线程的应用场景
- 主DNS配置
- android通讯录实例(一)
- Linux使用命令查找定位文件
- HDOJ 2044 一只小蜜蜂...
- 20个非常有用的Java程序片段
- 图的深度优先遍历
- iOS 获取当前时间以及计算年龄(时间差)
- 【bzoj1663】[Usaco2006 Open]赶集 dp
- UVA 12504 Updating a Dictionary
- macbook cents virtual box 虚拟机
- remote_recv.c
- Educational Codeforces Round 7 E. Ants in Leaves(贪心)
- 蓝桥杯 - 新建Microsoft Word文档
- Educational Codeforces Round 7 C. Not Equal on a Segment
- 蓝桥杯 算法训练 图形显示
- swift算法手记-10
- [Spring实战系列](16)面向切面编程(AOP)概述
- 详说块级格式化上下文
- python select、poll