决策树
2016-07-19 09:08
120 查看
决策树就是构造一棵树,每个叶子节点代表一个类别,走到叶子节点就可以确定当前数据是属于哪一类。其中一种构造决策树的方法是使用ID3算法,采用了信息增益和香农熵的概念,在机器学习实战一书中,有一句话解释了信息增益,信息增益石熵的减少或者是数据无序度的减少,对于每一个类别,选取去掉后的信息增益最大的那个类别,然后根据这个类别分类,核心就是信息增益和香农熵的概念吧。
from math import log import operator def createDataSet(): dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] labels = ['no surfacing', 'flippers'] return dataSet, labels def calcShannonEnt(dataSet): numEntries = len(dataSet) labelCounts = {} for featVec in dataSet: currentLabel = featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 labelCounts[currentLabel] += 1 shannonEnt = 0.0 for key in labelCounts: prob = float(labelCounts[key])/numEntries shannonEnt -= prob * log(prob, 2) return shannonEnt def splitDataSet(dataSet, axis, value): retDataSet = [] for featVec in dataSet: if featVec[axis] == value: reducedFeatVec = featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) return retDataSet def chooseBestFeatureToSpilt(dataSet): numFeatures = len(dataSet[0]) - 1 baseEntropy = calcShannonEnt(dataSet) bestInfoGain = 0.0 bestFeature = -1 for i in range(numFeatures): featList = [example[i] for example in dataSet] uniqueVals = set(featList) newEntropy = 0.0 for value in uniqueVals: subDataSet = splitDataSet(dataSet, i, value) prob = len(subDataSet) / float(len(dataSet)) newEntropy += prob * calcShannonEnt(subDataSet) infoGain = baseEntropy - newEntropy if (infoGain > bestInfoGain): bestInfoGain = infoGain bestFeature = i return bestFeature def majorityCnt(classList): classCount={} for vote in classList: if vote not in classCount.keys(): classCount[vote] = 0 classCount[vote] += 1 sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) return sortedClassCount[0][0] def createTree(dataSet, labels): classList = [example[-1] for example in dataSet] if classList.count(classList[0]) == len(classList): return classList[0] if len(dataSet[0]) == 1: return majorityCnt(classList) bestFeat = chooseBestFeatureToSpilt(dataSet) bestFeatLabel = labels[bestFeat] myTree = {bestFeatLabel: {}} del(labels[bestFeat]) featValues = [example[bestFeat] for example in dataSet] uniqueVals = set(featValues) for value in uniqueVals: subLabels = labels[:] myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels) return myTree def classify(inputTree,featLabels,testVec): firstStr = inputTree.keys()[0] secondDict = inputTree[firstStr] featIndex = featLabels.index(firstStr) key = testVec[featIndex] valueOfFeat = secondDict[key] if isinstance(valueOfFeat, dict): classLabel = classify(valueOfFeat, featLabels, testVec) else: classLabel = valueOfFeat return classLabel def storeTree(inputTree,filename): import pickle fw = open(filename,'w') pickle.dump(inputTree,fw) fw.close() def grabTree(filename): import pickle fr = open(filename) return pickle.load(fr)
相关文章推荐
- K近邻算法
- Spring MVC @ResponseBody返回Json
- maven-compiler-plugin插件
- spring redis session共享
- Spring Security学习
- Akka入门
- mongodb mysql数据互相导入
- 项目风险管理课程学习
- js中的defer关键字
- Maven本地导入jar包
- javascript弹出带文字信息的提示框效果
- ionic登录
- .Net WebApi接口调试之Swagger集成详解
- ubuntu tar 命令
- [李景山php]thinkphp核心源码注释|functions.php
- tomcat 与 weblogic 区别
- git笔记
- BZOJ 2525: [Poi2011]Dynamite
- mysql备份之mysqldump
- 使用sqluldr2实现将Oracle数据无落地快速导入Greenplum数据库