
Implementing partitioned data storage with a custom Partitioner

2017-09-09 22:04
In Spark, the partitioner directly determines the number of partitions in an RDD, which partition each record is assigned to during the shuffle, and the number of reduce tasks.

Notes:

(1) Only key-value RDDs have a partitioner; for a non-key-value RDD the partitioner is None (see the sketch below).

(2) Partition IDs range from 0 to numPartitions - 1; this ID determines which partition a record belongs to.
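To see note (1) concretely, here is a minimal sketch (assuming a live SparkContext named sc, as created in the program below):

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2)))
println(pairs.partitioner) // None: no shuffle has assigned a partitioner yet
val hashed = pairs.partitionBy(new HashPartitioner(2))
println(hashed.partitioner) // Some(...): partitionBy attaches a HashPartitioner
// A non key-value RDD such as sc.parallelize(Seq(1, 2, 3)) has no partitionBy
// method at all, and its partitioner field is always None.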

Reference: http://blog.csdn.net/high2011/article/details/68491115

package com.ljt.spark01.weblog

import java.net.URL

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Implementing partitioned data storage with a custom Partitioner.
 */
object UrlCountPartition {

  def main(args: Array[String]): Unit = {

    // Course hosts (for reference only; the hosts actually used below are
    // derived from the data itself via distinct()).
    val arr_course = Array("java.itcast.cn", "php.itcast.cn", "net.itcast.cn")

    val conf = new SparkConf().setAppName("AdvUrlCount")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each line into a tuple (URL, 1) and store it in RDD1.
    val RDD1 = sc.textFile("data/usercount/IT_education.log").map { x =>
      val f = x.split("\t")
      // Drop the timestamp; each occurrence of a URL becomes one tuple (url, 1).
      (f(1), 1)
    }
    // Sum the values of all tuples sharing the same key, e.g.
    // (http://php.itcast.cn/php/course.shtml,459)
    val rdd_urlCount = RDD1.reduceByKey(_ + _)

    // Extract the URL's host as the course identifier, e.g.
    // (php.itcast.cn,(http://php.itcast.cn/php/course.shtml,459))
    val rdd_urlHost = rdd_urlCount.map(f => {
      val url = f._1
      val countUrl = f._2
      val host = new URL(url).getHost
      // Use a K-V tuple so each partition can later be sorted internally.
      (host, (url, countUrl))
    }).cache() // cache keeps the data in memory; it is lazy and materializes on the first action

    // Deduplicate the hosts to get all course categories.
    val ints = rdd_urlHost.map(_._1).distinct().collect()
    // Instantiate the custom partitioner.
    val hostPartitioner = new HostPartition(ints)
    // Sort within each partition and take the top 3.
    val rdd_Partitioners = rdd_urlHost.partitionBy(hostPartitioner)
      .mapPartitions(it => {
        it.toList.sortBy(_._2._2).reverse.take(3).iterator
      })

    rdd_Partitioners.saveAsTextFile("data/out/out_partitioner")
    /**
     * ArrayBuffer((net.itcast.cn,(http://net.itcast.cn/net/course.shtml,521)), (net.itcast.cn,(http://net.itcast.cn/net/video.shtml,521)), (net.itcast.cn,(http://net.itcast.cn/net/teacher.shtml,512)), (java.itcast.cn,(http://java.itcast.cn/java/course/cloud.shtml,1028)), (java.itcast.cn,(http://java.itcast.cn/java/course/javaee.shtml,1000)), (java.itcast.cn,(http://java.itcast.cn/java/course/base.shtml,543)), (php.itcast.cn,(http://php.itcast.cn/php/video.shtml,490)), (php.itcast.cn,(http://php.itcast.cn/php/teacher.shtml,464)), (php.itcast.cn,(http://php.itcast.cn/php/course.shtml,459)))
     */
    println(rdd_Partitioners.collect().toBuffer)
    sc.stop()
  }
}
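For reference, the map step above assumes each log line is tab-separated with the timestamp in field 0 and the URL in field f(1); a hypothetical input line might look like:

20160321101954	http://java.itcast.cn/java/course/javaee.shtml

Note also that mapPartitions calls it.toList, which pulls an entire partition into memory. That is acceptable here because each partition holds a single host's URLs, but for very large partitions a bounded top-N structure would be safer.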

package com.ljt.spark01.weblog

import org.apache.spark.Partitioner

import scala.collection.mutable.HashMap

/**
 * Custom Partitioner that assigns each host its own partition.
 */
class HostPartition(ins: Array[String]) extends Partitioner {

  // Map each host to a partition index in array order.
  val parMap = new HashMap[String, Int]()
  var count = 0
  for (i <- ins) {
    parMap += (i -> count)
    count += 1
  }

  override def numPartitions: Int = ins.length

  // Unknown keys fall back to partition 0.
  override def getPartition(key: Any): Int = parMap.getOrElse(key.toString(), 0)
}
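A minimal sketch of how HostPartition behaves on its own (hypothetical host values, matching the ones used above):

val hosts = Array("java.itcast.cn", "php.itcast.cn", "net.itcast.cn")
val p = new HostPartition(hosts)
println(p.numPartitions)                 // 3
println(p.getPartition("php.itcast.cn")) // 1: hosts are numbered in array order
println(p.getPartition("unknown.host"))  // 0: getOrElse falls back to partition 0

Since every key routed through HostPartition in this job is a host produced by the map step, the fallback to partition 0 only guards against unexpected keys.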