数据处理中Java与scala实现二次排序
2016-08-17 18:01
288 查看
JAVA 实现:
自定义的Sort.java类:
package cn.hzjs.spark.secondsort;
import java.io.Serializable;
import scala.math.Ordered;
/**
 * Composite key for a Spark secondary sort: orders records by {@code first},
 * breaking ties with {@code second}.
 *
 * Implements scala.math.Ordered so sortByKey can compare keys, and
 * Serializable so Spark can ship key instances across the cluster.
 *
 * @author 睡着的水
 */
public class Sort implements Ordered<Sort>, Serializable {
    // Explicit serialVersionUID so serialized form stays stable across edits.
    private static final long serialVersionUID = 1L;

    // The two components of the secondary-sort key.
    private int first;
    private int second;

    /**
     * @param first  primary sort component
     * @param second tie-breaking sort component
     */
    public Sort(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // All four relational operators delegate to compareTo so the ordering
    // logic lives in exactly one place and cannot drift out of sync.
    @Override
    public boolean $greater(Sort other) {
        return compareTo(other) > 0;
    }

    @Override
    public boolean $greater$eq(Sort other) {
        return compareTo(other) >= 0;
    }

    @Override
    public boolean $less(Sort other) {
        return compareTo(other) < 0;
    }

    @Override
    public boolean $less$eq(Sort other) {
        return compareTo(other) <= 0;
    }

    @Override
    public int compare(Sort other) {
        return compareTo(other);
    }

    /**
     * Orders by {@code first}, then {@code second}.
     *
     * Uses Integer.compare instead of subtraction: {@code a - b} overflows
     * for extreme values (e.g. Integer.MIN_VALUE - 1 wraps positive),
     * which would silently produce a wrong ordering.
     */
    @Override
    public int compareTo(Sort other) {
        int byFirst = Integer.compare(this.first, other.getFirst());
        return byFirst != 0 ? byFirst : Integer.compare(this.second, other.getSecond());
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    // hashCode/equals must agree: two keys with equal components are the
    // same key for shuffle grouping purposes.
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        Sort other = (Sort) obj;
        if (first != other.first)
            return false;
        if (second != other.second)
            return false;
        return true;
    }
}
排序功能类:
package cn.hzjs.spark.secondsort;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
* 二次排序,具体的实现步骤:
* 1, 按照Ordered和Serrialiable借口实现自定义排序的Key
* 2, 将要进行二次排序的文件加载进来生成<key,value>类型的RDD
* 3, 使用sortByKey 基于自定义的key 进行二次排序
* 4, 去掉排序的key,只保留排序的结果
* @author 睡着的水 2016-8-17
*
*/
public clas
af8c
s SecondSortApp {
public static void main(String[] args) {
//初始化配置文件
SparkConf conf = new SparkConf().setAppName("SecondSortApp").setMaster("local");
//实例化SparkContext,作为程序执行的入口
JavaSparkContext sc = new JavaSparkContext(conf);
//读入源数据文件,生成第一个RDD
JavaRDD<String> lines = sc.textFile("F://sort.txt");
//以sort类作为key,数据作为value么?
JavaPairRDD<Sort, String> pairs =lines.mapToPair(new PairFunction<String, Sort, String>() {
private static final long serialVersionID = 1L;
@Override
public Tuple2<Sort, String> call(String line) throws Exception {
//基于每一行进行操作
String[] splited = line.split(" ");
//生成Sort类型的key
Sort key = new Sort(Integer.valueOf(splited[0]),Integer.valueOf(splited[1]));
//返回Tuple2类型,<> 为泛型 ,value是一行内容
return new Tuple2<Sort, String>(key,line);
}
});
JavaPairRDD<Sort, String> sorted = pairs.sortByKey();//完成二次排序
//执行第四步
JavaRDD<String> sortedLines = sorted.map(new Function<Tuple2<Sort,String>, String>() {
private static final long serialVersionID = 1L;
@Override
public String call(Tuple2<Sort, String> t) throws Exception {
return t._2;
}
});
sortedLines.foreach(new VoidFunction<String>() {
private static final long serialVersionID = 1L;
@Override
public void call(String line) throws Exception {
// TODO Auto-generated method stub
System.out.println(line);
}
});
}
}
用Scala实现二次排序:
自定义的SortKey.scala类:
package cn.hzjs.spark.secondsort
/**
 * Custom composite key for a Spark secondary sort — far more concise than
 * the equivalent Java implementation.
 *
 * Orders by `first`, breaking ties with `second`. Extends Ordered so
 * sortByKey can compare keys, and Serializable so Spark can ship them.
 *
 * 睡着的水-hzjs-2016.08.17
 */
class SortKey(val first: Int, val second: Int) extends Ordered[SortKey] with Serializable {
  /**
   * Two-level comparison. Integer.compare is used instead of subtraction:
   * `a - b` overflows for extreme values (Int.MinValue - 1 wraps positive)
   * and would silently yield a wrong ordering.
   */
  def compare(other: SortKey): Int = {
    if (this.first != other.first) {
      Integer.compare(this.first, other.first)
    } else {
      Integer.compare(this.second, other.second)
    }
  }
}
排序功能类:
package cn.hzjs.spark.secondsort
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Secondary sort in descending order:
 *  1. Define a custom key implementing Ordered and Serializable (SortKey.scala).
 *  2. Load the input file into a (key, value) pair RDD keyed by SortKey.
 *  3. Sort with sortByKey using the custom key's two-level comparison.
 *  4. Drop the key, keeping only the sorted lines.
 *
 * 睡着的水-hzjs-2016.08.17
 */
object SecondSortApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSortApp").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("F://sort.txt")
    // Build a (SortKey, original-line) pair RDD. Split each line once and
    // reuse the result. BUG FIX: the original passed field 0 for BOTH key
    // components, so the secondary field was never used in the sort.
    val pairWithSortKey = lines.map { line =>
      val fields = line.split(" ")
      (new SortKey(fields(0).toInt, fields(1).toInt), line)
    }
    // false => descending order.
    val sorted = pairWithSortKey.sortByKey(false)
    // Discard the synthetic key, keep only the original line.
    val sortedResult = sorted.map(_._2)
    // Collect to the driver and print.
    sortedResult.collect().foreach(println)
    // Release Spark resources instead of leaking the context.
    sc.stop()
  }
}
自定义的Sort.java类:
package cn.hzjs.spark.secondsort;
import java.io.Serializable;
import scala.math.Ordered;
/**
 * Composite key for a Spark secondary sort: orders records by {@code first},
 * breaking ties with {@code second}.
 *
 * Implements scala.math.Ordered so sortByKey can compare keys, and
 * Serializable so Spark can ship key instances across the cluster.
 *
 * @author 睡着的水
 */
public class Sort implements Ordered<Sort>, Serializable {
    // Explicit serialVersionUID so serialized form stays stable across edits.
    private static final long serialVersionUID = 1L;

    // The two components of the secondary-sort key.
    private int first;
    private int second;

    /**
     * @param first  primary sort component
     * @param second tie-breaking sort component
     */
    public Sort(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // All four relational operators delegate to compareTo so the ordering
    // logic lives in exactly one place and cannot drift out of sync.
    @Override
    public boolean $greater(Sort other) {
        return compareTo(other) > 0;
    }

    @Override
    public boolean $greater$eq(Sort other) {
        return compareTo(other) >= 0;
    }

    @Override
    public boolean $less(Sort other) {
        return compareTo(other) < 0;
    }

    @Override
    public boolean $less$eq(Sort other) {
        return compareTo(other) <= 0;
    }

    @Override
    public int compare(Sort other) {
        return compareTo(other);
    }

    /**
     * Orders by {@code first}, then {@code second}.
     *
     * Uses Integer.compare instead of subtraction: {@code a - b} overflows
     * for extreme values (e.g. Integer.MIN_VALUE - 1 wraps positive),
     * which would silently produce a wrong ordering.
     */
    @Override
    public int compareTo(Sort other) {
        int byFirst = Integer.compare(this.first, other.getFirst());
        return byFirst != 0 ? byFirst : Integer.compare(this.second, other.getSecond());
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    // hashCode/equals must agree: two keys with equal components are the
    // same key for shuffle grouping purposes.
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        Sort other = (Sort) obj;
        if (first != other.first)
            return false;
        if (second != other.second)
            return false;
        return true;
    }
}
排序功能类:
package cn.hzjs.spark.secondsort;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
* 二次排序,具体的实现步骤:
* 1, 按照Ordered和Serrialiable借口实现自定义排序的Key
* 2, 将要进行二次排序的文件加载进来生成<key,value>类型的RDD
* 3, 使用sortByKey 基于自定义的key 进行二次排序
* 4, 去掉排序的key,只保留排序的结果
* @author 睡着的水 2016-8-17
*
*/
public clas
af8c
s SecondSortApp {
public static void main(String[] args) {
//初始化配置文件
SparkConf conf = new SparkConf().setAppName("SecondSortApp").setMaster("local");
//实例化SparkContext,作为程序执行的入口
JavaSparkContext sc = new JavaSparkContext(conf);
//读入源数据文件,生成第一个RDD
JavaRDD<String> lines = sc.textFile("F://sort.txt");
//以sort类作为key,数据作为value么?
JavaPairRDD<Sort, String> pairs =lines.mapToPair(new PairFunction<String, Sort, String>() {
private static final long serialVersionID = 1L;
@Override
public Tuple2<Sort, String> call(String line) throws Exception {
//基于每一行进行操作
String[] splited = line.split(" ");
//生成Sort类型的key
Sort key = new Sort(Integer.valueOf(splited[0]),Integer.valueOf(splited[1]));
//返回Tuple2类型,<> 为泛型 ,value是一行内容
return new Tuple2<Sort, String>(key,line);
}
});
JavaPairRDD<Sort, String> sorted = pairs.sortByKey();//完成二次排序
//执行第四步
JavaRDD<String> sortedLines = sorted.map(new Function<Tuple2<Sort,String>, String>() {
private static final long serialVersionID = 1L;
@Override
public String call(Tuple2<Sort, String> t) throws Exception {
return t._2;
}
});
sortedLines.foreach(new VoidFunction<String>() {
private static final long serialVersionID = 1L;
@Override
public void call(String line) throws Exception {
// TODO Auto-generated method stub
System.out.println(line);
}
});
}
}
用Scala实现二次排序:
自定义的SortKey.scala类:
package cn.hzjs.spark.secondsort
/**
 * Custom composite key for a Spark secondary sort — far more concise than
 * the equivalent Java implementation.
 *
 * Orders by `first`, breaking ties with `second`. Extends Ordered so
 * sortByKey can compare keys, and Serializable so Spark can ship them.
 *
 * 睡着的水-hzjs-2016.08.17
 */
class SortKey(val first: Int, val second: Int) extends Ordered[SortKey] with Serializable {
  /**
   * Two-level comparison. Integer.compare is used instead of subtraction:
   * `a - b` overflows for extreme values (Int.MinValue - 1 wraps positive)
   * and would silently yield a wrong ordering.
   */
  def compare(other: SortKey): Int = {
    if (this.first != other.first) {
      Integer.compare(this.first, other.first)
    } else {
      Integer.compare(this.second, other.second)
    }
  }
}
排序功能类:
package cn.hzjs.spark.secondsort
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Secondary sort in descending order:
 *  1. Define a custom key implementing Ordered and Serializable (SortKey.scala).
 *  2. Load the input file into a (key, value) pair RDD keyed by SortKey.
 *  3. Sort with sortByKey using the custom key's two-level comparison.
 *  4. Drop the key, keeping only the sorted lines.
 *
 * 睡着的水-hzjs-2016.08.17
 */
object SecondSortApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondSortApp").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("F://sort.txt")
    // Build a (SortKey, original-line) pair RDD. Split each line once and
    // reuse the result. BUG FIX: the original passed field 0 for BOTH key
    // components, so the secondary field was never used in the sort.
    val pairWithSortKey = lines.map { line =>
      val fields = line.split(" ")
      (new SortKey(fields(0).toInt, fields(1).toInt), line)
    }
    // false => descending order.
    val sorted = pairWithSortKey.sortByKey(false)
    // Discard the synthetic key, keep only the original line.
    val sortedResult = sorted.map(_._2)
    // Collect to the driver and print.
    sortedResult.collect().foreach(println)
    // Release Spark resources instead of leaking the context.
    sc.stop()
  }
}
相关文章推荐
- Spark基础排序+二次排序(java+scala)
- 快速排序的Java和Scala实现
- scala语言二次排序实现
- Spark用Java实现二次排序的自定义key
- Spark:Java实现 二次排序
- Java学习-046-日志抓取合并后排序问题解决方案之 --- log4j 二次定制,实现日志输出添加延时10ms
- Spark二次排序(Java+Scala)
- Spark入门-scala实现二次或多次排序问题
- 43.top10热门品类之使用Scala实现二次排序
- SPARK排序算法,使用Scala开发 二次排序 自定义KEY值,相比JAVA的罗嗦,Scala优雅简洁!!!
- Spark基础排序+二次排序(java+scala)
- Spark Scala 实现二次排序和相加
- Java各种排序实现(转)
- 排序算法复习(Java实现)(二): 归并排序,堆排序,桶式排序,基数排序
- java Lucene 中自定义排序的实现
- java实现 冒泡排序 插入排序 选择排序
- 直接插入排序java实现
- 冒泡排序、选择排序、插入排序、快速排序算法的时间性能分析(java实现)
- 内排序算法的java实现---直接选择排序
- [Java]实现冒泡算法,对数组元素进行排序