机器学习python决策树源码
2018-01-31 20:11
441 查看
from math import log
import operator


def createDataSet():
    """Return a toy dataset (two binary features + class label) and the feature names."""
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


# Compute the Shannon entropy of the class labels (last column of each row).
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the label distribution in dataSet."""
    numEntries = len(dataSet)
    labelcount = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last element of a row
        if currentLabel not in labelcount:
            labelcount[currentLabel] = 0
        labelcount[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelcount:
        # BUGFIX: cast the denominator, not the truncated quotient —
        # float(a/b) is 0.0 under integer division (Python 2 semantics).
        prob = labelcount[key] / float(numEntries)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


# Split the dataset on one feature, removing that feature column.
def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature `axis` equals `value`, with that column removed."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatureVec = featVec[:axis]          # everything before the split column
            reducedFeatureVec.extend(featVec[axis+1:])  # everything after it
            retDataSet.append(reducedFeatureVec)
    return retDataSet


# Choose the feature with the highest information gain (ID3 criterion).
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split maximizes information gain."""
    numFeature = len(dataSet[0]) - 1  # last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        # Weighted entropy after splitting on feature i.
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote)."""
    classcount = {}
    for vote in classList:
        # BUGFIX: the membership test must be against the count dict,
        # not the input list (a list has no .keys()).
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    # BUGFIX: .items must be called — sorting the bound method raised TypeError.
    sortedClassCount = sorted(classcount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Recursively build the decision tree as nested dicts:
# {feature label: {feature value: subtree-or-class, ...}}.
def createTree(dataSet, labels):
    """Build an ID3 decision tree from dataSet; `labels` names the feature columns.

    NOTE: `labels` is mutated (the chosen feature's name is deleted), matching
    the original interface — pass a copy if the caller needs it intact.
    """
    classList = [example[-1] for example in dataSet]
    # BUGFIX: parenthesis was misplaced — the original counted a boolean
    # (classList[0] == len(classList)) instead of comparing the count.
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all rows share one class: pure leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so sibling branches see the same label list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def storeTree(inputTree, filename):
    """Serialize a tree to `filename` with pickle."""
    import pickle
    # BUGFIX: pickle requires binary mode; 'w' breaks on Python 3.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load a pickled tree from `filename`."""
    import pickle
    # BUGFIX: matching binary read mode for pickle.load.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
相关文章推荐
- [Python源码解析]机器学习-决策树
- 机器学习之决策树(ID3)算法与Python实现
- Python机器学习实战之决策树分类
- 机器学习经典算法详解及Python实现--决策树(Decision Tree)
- 【机器学习实战-python3】决策树ID3
- python机器学习实战2:实现决策树
- 2018python数据分析与机器学习实战(视频+源码+课件)
- 机器学习经典算法详解及Python实现--CART分类决策树、回归树和模型树
- 3.2决策树理论2--python深度机器学习
- 用Python开始机器学习(2:决策树分类算法)
- 【机器学习笔记之二】决策树的python实现
- 决策树与随机森林(python code)---------------------------机器学习系列(二)
- python机器学习案例系列教程——决策树(ID3、C4.5、CART)
- 机器学习:决策树python实现
- Python与机器学习2——决策树只有一个名字!
- 决策树ID3和C4.5算法Python实现源码
- [机器学习][源码]机器学习实战ch3 决策树
- 机器学习 python实例完成—决策树
- 机器学习实战第三章——决策树(源码解析)
- 决策树ID3和C4.5算法Python实现源码