使用Apriori算法进行关联分析
2016-06-15 13:11
274 查看
apriori.py
代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Apriori frequent-itemset mining and association-rule generation.

Itemsets are represented as frozensets so they can be used as dictionary
keys; supports and confidences are floats.
"""
# NOTE: kept from the original file. The star import can replace builtins
# such as sum/any/all/min/max with NumPy versions, so the code below
# deliberately does not use them.
from numpy import *


def loadDataSet():
    """Return the small demo transaction list used in the examples."""
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataSet):
    """Build C1, the list of all candidate itemsets of size 1.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list with one single-item frozenset per distinct item, ordered
        by item.
    """
    uniqueItems = {item for transaction in dataSet for item in transaction}
    # A real list (not a lazy map object) so callers can scan it repeatedly.
    return [frozenset([item]) for item in sorted(uniqueItems)]


def scanD(D, Ck, minSupport):
    """Compute candidate supports and drop candidates below minSupport.

    Args:
        D: iterable of transactions, each a set of items.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        (retList, supportData): retList holds the candidates whose support
        is >= minSupport; supportData maps every candidate that occurs in
        at least one transaction to its support.
    """
    # Materialize both inputs: D needs len() and Ck is scanned once per
    # transaction, neither of which works with a one-shot iterator.
    transactions = list(D)
    candidates = []
    seen = set()
    for can in Ck:
        # A candidate listed twice must not be counted twice, otherwise
        # its support could exceed 1.
        if can not in seen:
            seen.add(can)
            candidates.append(can)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The original prepended each kept itemset; reversing once yields the
    # same order without repeated front insertions.
    retList.reverse()
    return retList, supportData


def aprioriGen(Lk, k):
    """Join itemsets of size k-1 into candidate itemsets of size k.

    Two itemsets are merged when their first k-2 items (in sorted order)
    are identical, so each size-k candidate is produced exactly once.

    Args:
        Lk: sequence of frozensets, all of size k-1.
        k: size of the itemsets to generate.

    Returns:
        A list of candidate frozensets.
    """
    Lk = list(Lk)
    # Sort before slicing so the prefix does not depend on set iteration
    # order; computed once per itemset instead of once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            # Compare itemset i with itemset j (the original compared
            # itemset i with itself, so every pair was merged).
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList


def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of dataSet with the Apriori algorithm.

    Args:
        dataSet: sequence of transactions, each an iterable of items.
        minSupport: minimum support for an itemset to count as frequent.

    Returns:
        (L, supportData): L[i] is the list of frequent itemsets of size
        i+1 and the final entry of L is always an empty list;
        supportData maps every counted itemset to its support.
    """
    C1 = createC1(dataSet)
    D = [set(transaction) for transaction in dataSet]
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Stop as soon as a level produces no frequent itemsets.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData


def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules (freqSet - conseq) -> conseq for each conseq in H.

    Every rule whose confidence is >= minConf is printed and appended to
    brl as a tuple (antecedent, consequent, confidence).

    Args:
        freqSet: frequent itemset the rules are derived from.
        H: list of candidate consequents (frozensets).
        supportData: mapping from itemset to support.
        brl: list that accepted rules are appended to (mutated in place).
        minConf: minimum confidence threshold.

    Returns:
        The list of consequents whose rule met the threshold.

    Raises:
        KeyError: if freqSet or an antecedent is missing from supportData.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print('%s ---> %s conf: %s' % (antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Generate rules of freqSet with consequents larger than those in H.

    The consequents in H (all of size m) are combined into consequents of
    size m+1, those are evaluated with calcConf, and the survivors are
    combined again recursively.

    Args:
        freqSet: frequent itemset the rules are derived from.
        H: list of equally sized consequents to combine.
        supportData: mapping from itemset to support.
        brl: list that accepted rules are appended to (mutated in place).
        minConf: minimum confidence threshold.
    """
    if not H:
        return  # nothing to combine
    m = len(H[0])
    # Only continue while a consequent of size m+1 still leaves a
    # non-empty antecedent.
    if len(freqSet) > (m + 1):
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets in L.

    Args:
        L: list of frequent-itemset lists as returned by apriori().
        supportData: mapping from itemset to support.
        minConf: minimum confidence threshold.

    Returns:
        A list of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    # Start at index 1: rules need itemsets with at least two items.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            # Single-item consequents are evaluated for every itemset;
            # the original skipped them for itemsets of three or more.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            if i > 1:
                # A consequent that failed the threshold cannot be part
                # of a larger consequent that passes, so only the
                # survivors are combined further.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList,
                                minConf)
    return bigRuleList
测试:
>>> import apriori
>>> dataSet = loadDataSet()
>>> dataSet
[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
#构建第一个候选项集合
>>> C1 = apriori.createC1(dataSet)
>>> C1
[frozenset([1]), frozenset([2]), frozenset([3]), frozenset([4]), frozenset([5])]
>>> D = map(set, dataSet)
>>> D
[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]
#去掉不满足最小支持度的项集
>>> L1, suppData0 = scanD(D, C1, 0.5)
>>> L1
[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
#50%的支持度
>>> L, suppData = apriori.apriori(dataSet)
>>> L
[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]
#查看L中具体值
>>> L[0]
[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
>>> L[1]
[frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])]
>>> L[2]
[frozenset([2, 3, 5])]
>>> L[3]
[]
>>> aprioriGen(L[0], 2)
[frozenset([1, 3]), frozenset([1, 2]), frozenset([1, 5]), frozenset([2, 3]), frozenset([3, 5]), frozenset([2, 5])]
#尝试70%的支持度
>>> L2, suppData2 = apriori.apriori(dataSet, minSupport=0.7)
>>> L2
[[frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([2, 5])], []]
#生成规则
>>> rules = generateRules(L, suppData, minConf=0.7)
frozenset([1]) ---> frozenset([3]) conf: 1.0
frozenset([5]) ---> frozenset([2]) conf: 1.0
frozenset([2]) ---> frozenset([5]) conf: 1.0
frozenset([5]) ---> frozenset([2, 3]) conf: 2.0
frozenset([3]) ---> frozenset([2, 5]) conf: 2.0
frozenset([2]) ---> frozenset([3, 5]) conf: 2.0
#降低可信度阈值
>>> rules = generateRules(L, suppData, minConf=0.5)
frozenset([3]) ---> frozenset([1]) conf: 0.666666666667
frozenset([1]) ---> frozenset([3]) conf: 1.0
frozenset([5]) ---> frozenset([2]) conf: 1.0
frozenset([2]) ---> frozenset([5]) conf: 1.0
frozenset([3]) ---> frozenset([2]) conf: 0.666666666667
frozenset([2]) ---> frozenset([3]) conf: 0.666666666667
frozenset([5]) ---> frozenset([3]) conf: 0.666666666667
frozenset([3]) ---> frozenset([5]) conf: 0.666666666667
frozenset([5]) ---> frozenset([2, 3]) conf: 2.0
frozenset([3]) ---> frozenset([2, 5]) conf: 2.0
frozenset([2]) ---> frozenset([3, 5]) conf: 2.0
相关文章推荐
- JAVASCRIPT 格式化日期
- 【十三】分数的重载,加减乘除以及比较
- column cannot be null mysql
- struts2验证码实现
- Dism 错误: 1450
- Jenkins连接svn报E170001错误的解决办法
- Java中的ThreadLocal的使用--学习笔记
- 【Swift】iOS导航栏错乱的原因
- POJ1953
- Setup Compile Environment on Windows Cygwin For CC2650/CC2538 Contiki 6LowPAN
- 网红“有毒”,被“感染”的短视频、直播平台真能赢得下半场?
- 【Swift】iOS导航栏错乱的原因
- 实现复数类中的运算符重载 友元函数
- 第14周 阅读程序(2)
- 关于线程的中断机制
- 【C/C++】unsigned与二进制
- 深入理解FTP协议
- 关于Android程序优化内存释放
- 乐学成语——android(一)
- sqlplus 设置