您的位置:首页 > 编程语言 > Python开发

基于python的scikit-learn库实现决策树、贝叶斯、K近邻

2018-01-21 22:47 639 查看
1.决策树

  1.1 针对连续型特征的决策树实现(使用scikit-learn自带的iris数据集)

from sklearn.datasets import load_iris
from sklearn import tree
# Train a decision tree on the iris dataset bundled with scikit-learn,
# using every sample for training.
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

# Export the fitted tree in Graphviz format using the export_graphviz exporter.
# The exporter writes straight into the open handle; its return value is not
# needed, so the handle name is not rebound.
with open("iris.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)

# Predict the class of the first sample; the [:1, :] slice keeps the input
# two-dimensional (one row, all feature columns).
res = clf.predict(iris.data[:1, :])


  1.2 对于数据中的特征都是离散值的决策树

  以下是数据的结构:



  要理解变量dummy化(即one-hot编码)的概念:比如一个属性有5种取值,那么dummy化后这个属性就会展开成5列,每列对应该属性的一种取值;某个样本在某一列上为1,表示该样本的这个属性取到了对应的值,为0则表示没有取到

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO
from _csv import reader
from docutils.nodes import header
# Read the CSV file. The first line holds the column headers. For every data
# row the last column is taken as the class label and columns 1..n-2 as
# categorical features; column 0 is skipped (presumably a row id -- confirm
# against the data file).
# newline='' is the mode the csv module documentation asks for, and the
# with-block guarantees the file is closed once parsing is done.
with open('D:\\winter_python\\0119.csv', newline='') as allElectronicsData:
    reader = csv.reader(allElectronicsData)
    headers = next(reader)

    featureList = []
    labelList = []
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[-1].
            continue
        labelList.append(row[-1])
        featureList.append(
            {headers[i]: row[i] for i in range(1, len(row) - 1)}
        )

# Dummy-encode the categorical features: one 0/1 column per (feature, value).
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()

# Binarize the class labels.
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)

# Fit a decision tree that splits on information gain (entropy).
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)

# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); switch when upgrading the library.
featureNames = vec.get_feature_names()

# Export the tree in Graphviz format, labelling nodes with the dummy column
# names. The exporter writes into the handle, so its result is not kept.
with open("AllElectronics.dot", 'w') as f:
    tree.export_graphviz(clf, feature_names=featureNames, out_file=f)
print("featureNames:", featureNames)

# Predict: reuse the first encoded training row as the sample to classify.
oneRowX = dummyX[0, :]

# newRowX is the same array object as oneRowX (a view into dummyX); take a
# copy first if the sample is to be modified.
newRowX = oneRowX
predictedY = clf.predict([newRowX])
print("predictedY:" + str(predictedY))


2.朴素贝叶斯

# Sample training data: six 2-D points, three labelled 1 and three labelled 2
import numpy as np
# Gaussian naive Bayes classifier and the accuracy metric
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

features_train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
labels_train = np.array([1, 1, 1, 2, 2, 2])

# Instantiate the classifier
clf = GaussianNB()
# Train it (fit plays the role of "train")
clf.fit(features_train, labels_train)

# Predict a single test point
features_test = np.array([[-0.8, -1]])
# Labels are a 1-D vector with one entry per test sample, the same shape
# that predict() returns.
labels_test = np.array([1])
pred = clf.predict(features_test)
print(pred)

# Accuracy = number of correct predictions / number of samples
# Method 1: the estimator's own score()
accuracy = clf.score(features_test, labels_test)
# Method 2: accuracy_score takes the true labels first, then the predictions
accuracy2 = accuracy_score(labels_test, pred)


3.KNN

from sklearn.datasets import load_iris
from sklearn import neighbors
import sklearn

# Load the iris dataset bundled with scikit-learn
iris = load_iris()
# Create a k-nearest-neighbours classifier with default settings and train it
# on the whole dataset in one expression (fit returns the estimator itself)
knn = neighbors.KNeighborsClassifier().fit(iris.data, iris.target)
# Classify one hand-written sample made of four feature values
predict = knn.predict(
    [[0.1, 0.2, 0.3, 0.4]]
)
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: