您的位置:首页 > 其它

使用Apriori算法进行关联分析

2016-06-15 13:11 274 查看
apriori.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#coding=utf-8

from numpy import *

def loadDataSet():
    """Return the small hard-coded transaction list used by the examples.

    Each inner list is one transaction; the integers are item identifiers.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions

#构建集合C1,C1是大小为1的所有候选项的集合
def createC1(dataSet):
    """Build C1, the list of all candidate itemsets of size one.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list with one single-item frozenset per distinct item, ordered by
        item value. An empty data set yields an empty list.
    """
    # Collect distinct items in a set instead of testing membership in a
    # growing list, which was quadratic in the number of distinct items.
    distinct_items = set()
    for transaction in dataSet:
        distinct_items.update(transaction)
    # Return a real list rather than a lazy map object: callers iterate the
    # candidates once per transaction, which a one-shot iterator cannot do.
    return [frozenset([item]) for item in sorted(distinct_items)]

#计算支持度,丢掉Ck中不满足最小支持度的项集
#输入为:数据集,包含候选集的列表,最小支持度
def scanD(D, Ck, minSupport):
    """Count candidate itemsets in the transactions and keep the frequent ones.

    Args:
        D: iterable of transactions; each must be accepted by
            ``frozenset.issubset`` (e.g. a set of items).
        Ck: iterable of candidate itemsets (frozensets). Repeated candidates
            are counted only once.
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` holds the candidates
        whose support is at least ``minSupport``. ``supportData`` maps every
        candidate that occurs in at least one transaction to its support,
        including candidates below the threshold.
    """
    # Materialise both inputs: D is measured with len() and Ck is scanned
    # once per transaction, neither of which works on a one-shot iterator.
    transactions = list(D)

    # Drop repeated candidates (keeping first-seen order); a duplicate would
    # otherwise be counted several times per transaction and could produce a
    # support greater than 1.
    candidates = []
    seen = set()
    for can in Ck:
        if can not in seen:
            seen.add(can)
            candidates.append(can)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # The original inserted each hit at the head of the list; appending and
    # reversing once yields the same order without the quadratic cost.
    retList.reverse()
    return retList, supportData

#将前k-2个元素相同的集合合并
def aprioriGen(Lk, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in sorted
    order, are identical; the union of such a pair has exactly ``k`` items.

    Args:
        Lk: sequence of frozensets, each expected to hold ``k - 1`` mutually
            orderable items.
        k: size of the candidate itemsets to produce.

    Returns:
        A list with the union of every qualifying pair, in pair order
        ``(i, j)`` with ``i < j``.
    """
    itemsets = list(Lk)
    prefix_len = k - 2
    # Sort before slicing so the prefix does not depend on the iteration
    # order of the underlying set, and compute each prefix only once.
    prefixes = [sorted(itemset)[:prefix_len] for itemset in itemsets]

    retList = []
    count = len(itemsets)
    for i, left in enumerate(itemsets):
        for j in range(i + 1, count):
            # Compare the prefix of itemset i with that of itemset j. The
            # original built both prefixes from itemset i, so the test was
            # always true and every pair was merged.
            if prefixes[i] == prefixes[j]:
                retList.append(left | itemsets[j])
    return retList

#apriori算法,生成候选项集
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of a transaction data set, level by level.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.
        minSupport: minimum fraction of transactions an itemset must appear
            in to be considered frequent.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` is the list of frequent
        itemsets found at level ``i`` (level 0 holds single items); the last
        element of ``L`` is always an empty list, because each level is
        appended before the loop condition is checked. ``supportData`` is the
        merged support dictionary returned by ``scanD`` across all levels.
    """
    # Materialise the transactions once as a list of sets: the data is
    # scanned at every level and measured with len(), so a lazy map object
    # (or any other one-shot iterable) would be exhausted or rejected.
    D = [set(transaction) for transaction in dataSet]
    C1 = list(createC1(D))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # L[-1] is the level produced last (index k - 2); stop once it is empty.
    while L[-1]:
        Ck = aprioriGen(L[-1], k)
        Lk, supk = scanD(D, Ck, minSupport)
        supportData.update(supk)
        L.append(Lk)
        k += 1
    return L, supportData

#遍历H中所有项集,计算规则可信度,生成满足最小可信度要求的规则列表
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``freqSet - conseq ---> conseq`` for each item of H.

    The confidence of a rule is ``supportData[freqSet]`` divided by the
    support of its antecedent ``freqSet - conseq``.

    Args:
        freqSet: frozenset for which rules are generated.
        H: iterable of candidate consequents (frozensets).
        supportData: mapping from itemset to support; it is indexed with
            ``freqSet`` and with every antecedent.
        brl: list that receives an ``(antecedent, consequent, confidence)``
            tuple for every accepted rule; mutated in place.
        minConf: minimum confidence a rule needs to be accepted.

    Returns:
        The list of consequents whose rule reached ``minConf``.

    Raises:
        KeyError: if ``supportData`` is a plain dict that lacks ``freqSet``
            or one of the antecedents.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # A single pre-formatted string prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print('%s ---> %s conf: %s' % (antecedent, conseq, conf))
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH

#从最初的项集中生成更多的关联规则
#输入:频繁项集,可以出现在右部的元素列表H
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Generate rules for ``freqSet`` with progressively larger consequents.

    Args:
        freqSet: frequent itemset (frozenset) to derive rules from.
        H: list of candidate consequents, all of the same size.
        supportData: mapping from itemset to support, passed to ``calcConf``.
        brl: list that ``calcConf`` extends with accepted rules.
        minConf: minimum confidence a rule needs to be accepted.

    Returns:
        None. Accepted rules are appended to ``brl`` by ``calcConf``.
    """
    if not H:
        # Nothing to combine; also avoids indexing an empty list below.
        return
    m = len(H[0])  # size of the consequents currently in H
    if m == 1 and len(freqSet) > 1:
        # Single-item consequents arrive here untested (recursive calls only
        # ever pass consequents that calcConf has already evaluated).
        # The original skipped this level, so rules such as {2, 3} -> {5}
        # were never produced for itemsets with three or more items.
        # Pruning here is safe: a larger consequent can only be confident if
        # all of its sub-consequents are.
        H = calcConf(freqSet, H, supportData, brl, minConf)
    if len(freqSet) > (m + 1):
        # freqSet is large enough to move m + 1 items to the right-hand side
        # while leaving a non-empty antecedent.
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if len(Hmp1) > 1:
            # At least two surviving consequents are needed to join further.
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
#生成关联规则
def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets in ``L``.

    Args:
        L: list of levels, each a list of frozensets; level 0 is skipped.
        supportData: mapping from itemset to support.
        minConf: minimum confidence passed on to the rule evaluators.

    Returns:
        The list that ``calcConf`` / ``rulesFromConseq`` filled with rules.
    """
    rules = []
    for level, itemsets in enumerate(L):
        if level == 0:
            # Level 0 is not used for rule generation.
            continue
        needs_expansion = level > 1
        for itemset in itemsets:
            singletons = [frozenset([element]) for element in itemset]
            if needs_expansion:
                rulesFromConseq(itemset, singletons, supportData, rules, minConf)
            else:
                calcConf(itemset, singletons, supportData, rules, minConf)
    return rules


测试:

>>> import apriori
>>> dataSet = apriori.loadDataSet()
>>> dataSet
[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
#构建第一个候选项集合
>>> C1 = apriori.createC1(dataSet)
>>> C1
[frozenset([1]), frozenset([2]), frozenset([3]), frozenset([4]), frozenset([5])]
>>> D = map(set, dataSet)
>>> D
[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]
#去掉不满足最小支持度的项集
>>> L1, suppData0 = apriori.scanD(D, C1, 0.5)
>>> L1
[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
#50%的支持度
>>> L, suppData = apriori.apriori(dataSet)
>>> L
[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]
#查看L中具体值
>>> L[0]
[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
>>> L[1]
[frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])]
>>> L[2]
[frozenset([2, 3, 5])]
>>> L[3]
[]
>>> apriori.aprioriGen(L[0], 2)
[frozenset([1, 3]), frozenset([1, 2]), frozenset([1, 5]), frozenset([2, 3]), frozenset([3, 5]), frozenset([2, 5])]
#尝试70%的支持度
>>> L2, suppData2 = apriori.apriori(dataSet, minSupport=0.7)
>>> L2
[[frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([2, 5])], []]
#生成规则
>>> rules = apriori.generateRules(L, suppData, minConf=0.7)
frozenset([1]) ---> frozenset([3]) conf: 1.0
frozenset([5]) ---> frozenset([2]) conf: 1.0
frozenset([2]) ---> frozenset([5]) conf: 1.0
frozenset([5]) ---> frozenset([2, 3]) conf: 2.0
frozenset([3]) ---> frozenset([2, 5]) conf: 2.0
frozenset([2]) ---> frozenset([3, 5]) conf: 2.0
#降低可信度阈值
>>> rules = apriori.generateRules(L, suppData, minConf=0.5)
frozenset([3]) ---> frozenset([1]) conf: 0.666666666667
frozenset([1]) ---> frozenset([3]) conf: 1.0
frozenset([5]) ---> frozenset([2]) conf: 1.0
frozenset([2]) ---> frozenset([5]) conf: 1.0
frozenset([3]) ---> frozenset([2]) conf: 0.666666666667
frozenset([2]) ---> frozenset([3]) conf: 0.666666666667
frozenset([5]) ---> frozenset([3]) conf: 0.666666666667
frozenset([3]) ---> frozenset([5]) conf: 0.666666666667
frozenset([5]) ---> frozenset([2, 3]) conf: 2.0
frozenset([3]) ---> frozenset([2, 5]) conf: 2.0
frozenset([2]) ---> frozenset([3, 5]) conf: 2.0
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: