python多线程并行实现随机森林
2015-06-24 16:32
766 查看
程序针对特征值取值范围为[-1, 1]来实现,数据预处理后即可使用。
import math
import operator
import threading
import random
def loadDataSet(filename, boundry):
    """Load a comma-separated data file and split it into train/test sets.

    Column 0 of each row is skipped (presumably an id field -- TODO confirm
    against the data file); the remaining columns (features plus trailing
    class label) are parsed as floats. Rows whose 0-based index falls inside
    the inclusive range boundry[0]..boundry[1] become the test set; all
    other rows become the training set.

    Returns (dataMat, testMat).
    """
    dataMat, testMat = [], []
    # BUG FIX: the original opened the file twice and never closed either
    # handle; a single `with` block reads the header for the field count,
    # rewinds, and guarantees the handle is closed.
    with open(filename) as fr:
        num = len(fr.readline().split(',')) - 1  # fields per row minus id column
        fr.seek(0)
        for ii, line in enumerate(fr):
            curLine = line.strip().split(',')
            # parse columns 1..num (skip the id in column 0)
            lineArr = [float(curLine[i + 1]) for i in range(num)]
            if boundry[0] <= ii <= boundry[1]:
                testMat.append(lineArr)
            else:
                dataMat.append(lineArr)
    return dataMat, testMat
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    The class label is taken from the last element of each row.
    Returns 0.0 for an empty dataSet.
    """
    numEntries = len(dataSet)
    if numEntries == 0:
        return 0.0
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # dict.get avoids the original `key not in dict.keys()` test, which
        # builds a list per row on Python 2 (quadratic overall).
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
def binSplitDataSet(dataSet, feature, value):
    """Partition dataSet on one feature.

    Returns (above, below): rows where row[feature] > value, and the rest.
    Row order within each partition is preserved.
    """
    above, below = [], []
    for row in dataSet:
        if row[feature] > value:
            above.append(row)
        else:
            below.append(row)
    return above, below
def chooseBestFeatureToSplit(dataSet):
    """Pick the best (feature, threshold) split among a random feature subset.

    Random-forest style: only sqrt(#features) randomly sampled features are
    considered. Candidate thresholds are fixed values suited to features
    scaled into [-1, 1] (see the module header note). Returns (None, None)
    when no candidate split yields positive information gain.
    """
    # BUG FIX: the last column of each row is the class label, not a
    # feature; the original included it in the candidate pool, so the tree
    # could "split" on the answer itself.
    numFeatures = len(dataSet[0]) - 1
    selectedFeatures = random.sample(range(numFeatures), int(math.sqrt(numFeatures)))
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = None
    threshold = None
    Vals = [-0.9, -0.5, 0, 0.5, 0.9]  # candidate thresholds for [-1, 1] features
    for i in selectedFeatures:
        for value in Vals:
            m0, m1 = binSplitDataSet(dataSet, i, value)
            prob1 = len(m0) / float(len(dataSet))
            prob2 = len(m1) / float(len(dataSet))
            # expected entropy after the split, weighted by branch size
            newEntropy = prob1 * calcShannonEnt(m0) + prob2 * calcShannonEnt(m1)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
                threshold = value
    return bestFeature, threshold
def majorityCnt(classList):
    """Return the most frequent label in classList (ties broken arbitrarily)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # BUG FIX: .iteritems() is Python-2-only; .items() works on both 2 and 3.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet):
    """Recursively grow a decision tree from dataSet.

    A leaf is a bare class label; an internal node is a dict
    {(featureIndex, threshold): {0: <=-branch, 1: >-branch}}.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node: every row shares one label
    bestFeat, threshold = chooseBestFeatureToSplit(dataSet)
    # BUG FIX: this guard was commented out. When no split yields positive
    # gain, chooseBestFeatureToSplit returns (None, None) and
    # binSplitDataSet would crash on row[None]; fall back to the majority label.
    if bestFeat is None:
        return majorityCnt(classList)
    rSet, lSet = binSplitDataSet(dataSet, bestFeat, threshold)
    myTree = {(bestFeat, threshold): {}}
    myTree[(bestFeat, threshold)][0] = createTree(lSet)
    myTree[(bestFeat, threshold)][1] = createTree(rSet)
    return myTree
def classify(tree, testVec):
    """Route testVec down the decision tree and return the predicted label.

    `tree` is either a bare label (leaf) handled by the caller, or an
    internal node {(featureIndex, threshold): {0: low-branch, 1: high-branch}};
    branch 1 is taken when testVec[featureIndex] > threshold.
    """
    # BUG FIX: dict.keys() is not subscriptable on Python 3; next(iter(...))
    # fetches the single (feature, threshold) key on both 2 and 3.
    node = next(iter(tree))
    branches = tree[node]
    featVal = testVec[node[0]]
    nextL = branches[1] if featVal > node[1] else branches[0]
    if isinstance(nextL, dict):
        return classify(nextL, testVec)
    return nextL
def storeTree(inputTree, filename):
    """Serialize inputTree to `filename` with pickle."""
    import pickle
    # BUG FIX: pickle requires a binary-mode file on Python 3 ('wb', not
    # 'w'); `with` guarantees the handle is flushed and closed.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load and return a pickled tree previously written by storeTree."""
    import pickle
    # BUG FIX: open in binary mode ('rb') to match the pickle format, and
    # close the handle (the original leaked it).
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def bootstrap(dataSet):
    """Draw len(dataSet) rows with replacement (one bootstrap replicate)."""
    n = len(dataSet)
    # same randint call sequence as a manual loop, expressed as a comprehension
    return [dataSet[random.randint(0, n - 1)] for _ in range(n)]
def learning(dataSet, times, trees):
    """Grow `times` trees, each on a fresh bootstrap sample, appending to `trees`."""
    for _ in range(times):
        trees.append(createTree(bootstrap(dataSet)))
def classify_RF(trees, testVecs):
    """Majority-vote each test vector across the forest; print the hit count.

    Each testVec's true label is taken from its last element; `count` is the
    number of vectors whose forest vote matches that label.
    """
    count = 0
    for testVec in testVecs:
        votes = [classify(tree, testVec) for tree in trees]
        if majorityCnt(votes) == testVec[-1]:
            count += 1
    # BUG FIX: `print count` is a Python-2-only statement; the call form
    # works on both Python 2 and 3.
    print(count)
if __name__ == '__main__':
    trainData, testData = loadDataSet('train_temp.csv', (0, 1000))
    trees, trees1, trees2, trees3 = [], [], [], []
    # Grow 10 trees in the main thread plus 10 in each of 3 worker threads.
    # NOTE(review): under CPython's GIL this gives no CPU speedup for pure-
    # Python tree building; it only parallelizes the program's structure.
    learning(trainData, 10, trees)
    workers = []
    for bucket in (trees1, trees2, trees3):
        t = threading.Thread(target=learning, args=(trainData, 10, bucket))
        t.daemon = True  # attribute form; setDaemon() is deprecated
        t.start()
        workers.append(t)
    for t in workers:
        t.join()
    trees.extend(trees1)
    trees.extend(trees2)
    trees.extend(trees3)
    # BUG FIX: plain `/` yields a float on Python 3 and float slice indices
    # raise TypeError; floor division keeps the offsets integral.
    offset = len(testData) // 4
    # Evaluate one quarter here, the other three quarters on worker threads.
    classify_RF(trees, testData[:offset])
    workers = []
    for lo, hi in ((offset, 2 * offset), (2 * offset, 3 * offset), (3 * offset, None)):
        t = threading.Thread(target=classify_RF, args=(trees, testData[lo:hi]))
        t.daemon = True
        t.start()
        workers.append(t)
    for t in workers:
        t.join()
import math
import operator
import threading
import random
def loadDataSet(filename, boundry):
    """Load a comma-separated data file and split it into train/test sets.

    Column 0 of each row is skipped (presumably an id field -- TODO confirm
    against the data file); the remaining columns (features plus trailing
    class label) are parsed as floats. Rows whose 0-based index falls inside
    the inclusive range boundry[0]..boundry[1] become the test set; all
    other rows become the training set.

    Returns (dataMat, testMat).
    """
    dataMat, testMat = [], []
    # BUG FIX: the original opened the file twice and never closed either
    # handle; a single `with` block reads the header for the field count,
    # rewinds, and guarantees the handle is closed.
    with open(filename) as fr:
        num = len(fr.readline().split(',')) - 1  # fields per row minus id column
        fr.seek(0)
        for ii, line in enumerate(fr):
            curLine = line.strip().split(',')
            # parse columns 1..num (skip the id in column 0)
            lineArr = [float(curLine[i + 1]) for i in range(num)]
            if boundry[0] <= ii <= boundry[1]:
                testMat.append(lineArr)
            else:
                dataMat.append(lineArr)
    return dataMat, testMat
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in dataSet.

    The class label is taken from the last element of each row.
    Returns 0.0 for an empty dataSet.
    """
    numEntries = len(dataSet)
    if numEntries == 0:
        return 0.0
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # dict.get avoids the original `key not in dict.keys()` test, which
        # builds a list per row on Python 2 (quadratic overall).
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
def binSplitDataSet(dataSet, feature, value):
    """Partition dataSet on one feature.

    Returns (above, below): rows where row[feature] > value, and the rest.
    Row order within each partition is preserved.
    """
    above, below = [], []
    for row in dataSet:
        if row[feature] > value:
            above.append(row)
        else:
            below.append(row)
    return above, below
def chooseBestFeatureToSplit(dataSet):
    """Pick the best (feature, threshold) split among a random feature subset.

    Random-forest style: only sqrt(#features) randomly sampled features are
    considered. Candidate thresholds are fixed values suited to features
    scaled into [-1, 1] (see the module header note). Returns (None, None)
    when no candidate split yields positive information gain.
    """
    # BUG FIX: the last column of each row is the class label, not a
    # feature; the original included it in the candidate pool, so the tree
    # could "split" on the answer itself.
    numFeatures = len(dataSet[0]) - 1
    selectedFeatures = random.sample(range(numFeatures), int(math.sqrt(numFeatures)))
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = None
    threshold = None
    Vals = [-0.9, -0.5, 0, 0.5, 0.9]  # candidate thresholds for [-1, 1] features
    for i in selectedFeatures:
        for value in Vals:
            m0, m1 = binSplitDataSet(dataSet, i, value)
            prob1 = len(m0) / float(len(dataSet))
            prob2 = len(m1) / float(len(dataSet))
            # expected entropy after the split, weighted by branch size
            newEntropy = prob1 * calcShannonEnt(m0) + prob2 * calcShannonEnt(m1)
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
                threshold = value
    return bestFeature, threshold
def majorityCnt(classList):
    """Return the most frequent label in classList (ties broken arbitrarily)."""
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # BUG FIX: .iteritems() is Python-2-only; .items() works on both 2 and 3.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet):
    """Recursively grow a decision tree from dataSet.

    A leaf is a bare class label; an internal node is a dict
    {(featureIndex, threshold): {0: <=-branch, 1: >-branch}}.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node: every row shares one label
    bestFeat, threshold = chooseBestFeatureToSplit(dataSet)
    # BUG FIX: this guard was commented out. When no split yields positive
    # gain, chooseBestFeatureToSplit returns (None, None) and
    # binSplitDataSet would crash on row[None]; fall back to the majority label.
    if bestFeat is None:
        return majorityCnt(classList)
    rSet, lSet = binSplitDataSet(dataSet, bestFeat, threshold)
    myTree = {(bestFeat, threshold): {}}
    myTree[(bestFeat, threshold)][0] = createTree(lSet)
    myTree[(bestFeat, threshold)][1] = createTree(rSet)
    return myTree
def classify(tree, testVec):
    """Route testVec down the decision tree and return the predicted label.

    `tree` is either a bare label (leaf) handled by the caller, or an
    internal node {(featureIndex, threshold): {0: low-branch, 1: high-branch}};
    branch 1 is taken when testVec[featureIndex] > threshold.
    """
    # BUG FIX: dict.keys() is not subscriptable on Python 3; next(iter(...))
    # fetches the single (feature, threshold) key on both 2 and 3.
    node = next(iter(tree))
    branches = tree[node]
    featVal = testVec[node[0]]
    nextL = branches[1] if featVal > node[1] else branches[0]
    if isinstance(nextL, dict):
        return classify(nextL, testVec)
    return nextL
def storeTree(inputTree, filename):
    """Serialize inputTree to `filename` with pickle."""
    import pickle
    # BUG FIX: pickle requires a binary-mode file on Python 3 ('wb', not
    # 'w'); `with` guarantees the handle is flushed and closed.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load and return a pickled tree previously written by storeTree."""
    import pickle
    # BUG FIX: open in binary mode ('rb') to match the pickle format, and
    # close the handle (the original leaked it).
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def bootstrap(dataSet):
    """Draw len(dataSet) rows with replacement (one bootstrap replicate)."""
    n = len(dataSet)
    # same randint call sequence as a manual loop, expressed as a comprehension
    return [dataSet[random.randint(0, n - 1)] for _ in range(n)]
def learning(dataSet, times, trees):
    """Grow `times` trees, each on a fresh bootstrap sample, appending to `trees`."""
    for _ in range(times):
        trees.append(createTree(bootstrap(dataSet)))
def classify_RF(trees, testVecs):
    """Majority-vote each test vector across the forest; print the hit count.

    Each testVec's true label is taken from its last element; `count` is the
    number of vectors whose forest vote matches that label.
    """
    count = 0
    for testVec in testVecs:
        votes = [classify(tree, testVec) for tree in trees]
        if majorityCnt(votes) == testVec[-1]:
            count += 1
    # BUG FIX: `print count` is a Python-2-only statement; the call form
    # works on both Python 2 and 3.
    print(count)
if __name__ == '__main__':
    trainData, testData = loadDataSet('train_temp.csv', (0, 1000))
    trees, trees1, trees2, trees3 = [], [], [], []
    # Grow 10 trees in the main thread plus 10 in each of 3 worker threads.
    # NOTE(review): under CPython's GIL this gives no CPU speedup for pure-
    # Python tree building; it only parallelizes the program's structure.
    learning(trainData, 10, trees)
    workers = []
    for bucket in (trees1, trees2, trees3):
        t = threading.Thread(target=learning, args=(trainData, 10, bucket))
        t.daemon = True  # attribute form; setDaemon() is deprecated
        t.start()
        workers.append(t)
    for t in workers:
        t.join()
    trees.extend(trees1)
    trees.extend(trees2)
    trees.extend(trees3)
    # BUG FIX: plain `/` yields a float on Python 3 and float slice indices
    # raise TypeError; floor division keeps the offsets integral.
    offset = len(testData) // 4
    # Evaluate one quarter here, the other three quarters on worker threads.
    classify_RF(trees, testData[:offset])
    workers = []
    for lo, hi in ((offset, 2 * offset), (2 * offset, 3 * offset), (3 * offset, None)):
        t = threading.Thread(target=classify_RF, args=(trees, testData[lo:hi]))
        t.daemon = True
        t.start()
        workers.append(t)
    for t in workers:
        t.join()
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法