
Applications of the Support Vector Machine (SVM) Algorithm

First, a simple example:

# -*- coding: utf-8 -*-

from sklearn import svm

x = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]
clf = svm.SVC(kernel='linear')
# kernel: the kernel function. The default is 'rbf'; the options are
# 'linear', 'poly', 'rbf', 'sigmoid', and 'precomputed'

clf.fit(x, y)

print(clf)

# get support vectors
print(clf.support_vectors_)  # the support vectors themselves
# get indices of support vectors
print(clf.support_)  # in [1 2], 1 is the index of [1, 1] and 2 is the index of [2, 3]
# get number of support vectors for each class
print(clf.n_support_)  # the number of support vectors in each class
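
# With the model fitted, new points can be classified with predict().
# A minimal sketch (the query point [2, 0.5] is made up for illustration):
print(clf.predict([[2, 0.5]]))  # prints [0]: the point falls on the class-0 side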


Second example:

# -*- coding:utf-8 -*-
import numpy as np
import pylab as pl  # pylab mainly provides plotting functions (a thin matplotlib interface)
from sklearn import svm
# we create 40 separable points
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# np.r_ stacks arrays by row; np.c_ stacks them by column
# randn(20, 2) draws a 20x2 matrix of standard-normal samples (mean 0, std 1);
# subtracting/adding [2, 2] shifts the two clusters to means (-2, -2) and (2, 2)
# >>> x = np.random.randn(20,2)
# >>> print(x)                |        print(x - [2, 2])
# [[-0.63145976  0.33214385]  |       [[-2.63145976 -1.66785615]
#  [-0.73478573 -0.20835977]  |        [-2.73478573 -2.20835977]
#  [-1.90850336 -0.3006437 ]  |        [-3.90850336 -2.3006437 ]
#  [-0.06334778  0.33648272]  |        [-2.06334778 -1.66351728]
#  [-1.47163982  0.46032467]  |        [-3.47163982 -1.53967533]
#  [ 0.62312495 -1.1757985 ]  |        [-1.37687505 -3.1757985 ]
#  [ 1.48181577 -0.59935329]  |        [-0.51818423 -2.59935329]
#  [ 0.85446026  0.66278357]  |        [-1.14553974 -1.33721643]
#  [ 0.64822314  0.37816488]  |        [-1.35177686 -1.62183512]
#  [-0.28784258  0.706399  ]  |        [-2.28784258 -1.293601  ]
#  [ 0.7139413  -0.25064504]  |        [-1.2860587  -2.25064504]
#  [ 1.16811313  1.16335445]  |        [-0.83188687 -0.83664555]
#  [-0.01533964 -0.10252879]  |        [-2.01533964 -2.10252879]
#  [-0.71322496 -1.76602087]  |        [-2.71322496 -3.76602087]
#  [-0.26507379  1.26459475]  |        [-2.26507379 -0.73540525]
#  [ 0.52864625 -0.62888543]  |        [-1.47135375 -2.62888543]
#  [ 0.6315711   1.74487499]  |        [-1.3684289  -0.25512501]
#  [-0.62708034 -1.66506671]  |        [-2.62708034 -3.66506671]
#  [ 1.10983563 -1.05212385]  |        [-0.89016437 -3.05212385]
#  [-0.80558621  0.1177437 ]] |        [-2.80558621 -1.8822563 ]]
Y = [0] * 20 + [1] * 20
# >>> print(Y)
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# fit the model
clf = svm.SVC(kernel='linear')
# Calling any machine-learning method in sklearn follows the same pattern: the algorithm is a
# class with fit(), predict(), and many other methods. Feed in the training samples and labels,
# plus any model parameters, and the classification result comes out directly.
# SVM can be used for classification (SVC) and also for prediction, i.e. regression (SVR);
# the sklearn svm module includes an SVR class as well.
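# A hypothetical regression sketch with the same fit/predict API (X_reg and y_reg are
# made-up names, not defined in this script):
#   from sklearn.svm import SVR
#   reg = SVR(kernel='linear').fit(X_reg, y_reg)
#   reg.predict(X_reg)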
clf.fit(X, Y)

# get the separating hyperplane
w = clf.coef_[0]  # coef_ holds the learned coefficients; print(clf.coef_) gives something like [[ 0.58170366  0.54913603]]
a = -w[0]/w[1]  # the hyperplane is w.x + b = 0, i.e. w[0]*x[0] + w[1]*x[1] + b = 0, so a is the slope of the line
xx = np.linspace(-5, 5)
# By default, linspace generates 50 evenly spaced values; the first two arguments are the start
# and the end of the sequence. An optional third argument specifies the number of elements.
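# For example (a quick illustrative check):
# >>> np.linspace(0, 1, 5)
# array([0.  , 0.25, 0.5 , 0.75, 1.  ])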
yy = a*xx - (clf.intercept_[0])/w[1]  # intercept_ holds the intercept
# print(yy) would look like this:
# [ 4.83387789  4.63169875  4.42951962  4.22734049  4.02516136  3.82298223
#   3.6208031   3.41862397  3.21644483  3.0142657   2.81208657  2.60990744
#   2.40772831  2.20554918  2.00337005  1.80119091  1.59901178  1.39683265
#   1.19465352  0.99247439  0.79029526  0.58811613  0.385937    0.18375786
#  -0.01842127 -0.2206004  -0.42277953 -0.62495866 -0.82713779 -1.02931692
#  -1.23149606 -1.43367519 -1.63585432 -1.83803345 -2.04021258 -2.24239171
#  -2.44457084 -2.64674997 -2.84892911 -3.05110824 -3.25328737 -3.4554665
#  -3.65764563 -3.85982476 -4.06200389 -4.26418303 -4.46636216 -4.66854129
#  -4.87072042 -5.07289955]

# print(clf.coef_)
# print(clf.intercept_)
#
# plot the parallels to the separating hyperplane that pass through the support vectors
b = clf.support_vectors_[0]
# support_vectors_ holds the support vectors themselves
# print(clf.support_vectors_) gives something like:
# [[-1.37971997 -1.46699614]
#  [-0.14116337 -2.42460898]
#  [ 0.88169105  0.34857216]]
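# a line with slope a through support vector b is y = a*x + (b[1] - a*b[0])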
yy_down = a*xx + (b[1] - a*b[0])
b = clf.support_vectors_[-1]
yy_up = a*xx + (b[1] - a*b[0])

print "w: ", w
print "a: ", a
# w:  [ 0.67022618  0.51039141]
# a:  -1.31316116667

# print "xx: ", xx
# print "yy: ", yy
print "support_vectors_: ", clf.support_vectors_
print "clf.coef_: ", clf.coef_
# support_vectors_:  [[-1.13165211 -0.61397661]
#  [ 2.20222797 -1.34013617]
#  [-0.99613858  0.9161059 ]]
# clf.coef_:  [[ 0.76574617  1.08549389]]

# # switching from the generic n-dimensional parameterization of the hyperplane to the 2D-specific
# # equation of a line y = a*x + b: the generic w[0]*x + w[1]*y + b = 0 can be rewritten as
# # y = -(w[0]/w[1])*x - b/w[1]
#

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')  # use pylab to plot xx against yy as a solid line
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')

pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],  # scatter draws a scatter plot;
           s=80, facecolors='none')  # hollow circles mark the support vectors
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)

pl.axis('tight')
pl.show()  # show the plot on the screen
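
# Sanity check (a sketch beyond the original example): for a linear SVM the distance
# between the two dashed margin lines equals 2/||w||
margin = 2 / np.linalg.norm(clf.coef_[0])
print("margin width:", margin)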


Third example:

# -*- coding:utf-8 -*-
from __future__ import print_function

from time import time
import logging  # used to print progress information
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
# Newer versions of sklearn moved this; it used to be:
# from sklearn.cross_validation import train_test_split
# train_test_split splits a dataset into training and test sets by a given ratio
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
# Newer versions of sklearn moved this; it used to be:
# from sklearn.grid_search import GridSearchCV
# GridSearchCV exists to tune parameters automatically: feed in the candidate parameter values
# and it returns the best result and parameter settings. It suits small datasets; once the data
# volume grows, it struggles to produce results in reasonable time.
from sklearn.metrics import classification_report
# builds a text report showing the main classification metrics
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
from sklearn.metrics import confusion_matrix  # computes the confusion matrix
# from sklearn.metrics import matthews_corrcoef  # computes the MCC
# from sklearn.metrics import roc_auc_score  # computes AUC; only defined for binary classification
# from sklearn.metrics import accuracy_score  # computes accuracy (ACC)
# from sklearn.decomposition import RandomizedPCA
# RandomizedPCA was removed in newer sklearn versions; it is replaced by
# from sklearn.decomposition import PCA (with svd_solver='randomized')
from sklearn.svm import SVC

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

###############################################################################
# Download the data, if not already on disk and load it as numpy arrays

lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
# fetch_lfw_people loads the LFW (Labeled Faces in the Wild) face-recognition dataset. Internally,
# a check_fetch_lfw helper resolves the dataset path: if the data is already on disk it returns
# the path; otherwise it downloads the archive, unpacks it, deletes the archive, and returns the
# unpacked path. The returned object exposes data, images, target, and target_names:
# the vectorized face data, the face images, each face's numeric person label, and the person names.

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape  # number of instances, plus the height and width of each image
# print(n_samples, h, w)
# 1288 50 37

# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data  # the matrix of feature vectors
n_features = X.shape[1]  # number of extracted features; index 1 is the column count
# print(X.shape)
# (1288, 1850)

# the label to predict is the id of the person
y = lfw_people.target  # the numeric label of the person shown in each instance
target_names = lfw_people.target_names  # the person names
n_classes = target_names.shape[0]  # target_names.shape[0] is the number of classes
# print(target_names)
# print(target_names.shape)
# ['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
#  'Gerhard Schroeder' 'Hugo Chavez' 'Tony Blair']
# (7,)

print("Total dataset size:")
print("n_samples: %d" % n_samples) #一共多少个实例
print("n_features: %d" % n_features) #一共提取了多少特征值
print("n_classes: %d" % n_classes) #一共分成了多少类
# Total dataset size:
# n_samples: 1288
# n_features: 1850
# n_classes: 7
#
###############################################################################
# Split into a training set and a test set using a stratified k fold

# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(  # split into train/test: two matrices and two label vectors
    X, y, test_size=0.25)
# print(X_train.shape)
# print(X_test.shape)
# (966, 1850)
# (322, 1850)

###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
# PCA dimensionality reduction: shrink the number of features
n_components = 150
# extract the top 150 eigenfaces from the 966 training faces

print("Extracting the top %d eigenfaces from %d faces"
% (n_components, X_train.shape[0]))
t0 = time()
# pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
pca = PCA(n_components=n_components, svd_solver='randomized',  # choose the randomized SVD solver
          whiten=True).fit(X_train)
# whiten is a preprocessing step that discards some information
# but often yields better downstream predictions;
# this builds the PCA model with the randomized solver
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))  # the components with the largest variance, reshaped into images

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
# transform projects X onto the reduced dimensions; once the model is fitted, any new data can be reduced the same way
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)  # balanced class weights, rbf kernel
# param_grid: a dict (or list of dicts) giving the candidate values of the parameters to optimize
clf = clf.fit(X_train_pca, y_train)  # fit() builds the model and finds the optimal hyperplane
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
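
# Optional (beyond the original script): the chosen parameters and the
# cross-validation score can also be printed directly
print("Best parameters:", clf.best_params_)
print("Best cross-validation score: %.3f" % clf.best_score_)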

###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
# builds a text report showing the main classification metrics
#                     precision    recall   f1-score   support
#
#      Ariel Sharon       1.00      0.89      0.94        18
#      Colin Powell       0.82      0.89      0.85        56
#   Donald Rumsfeld       0.96      0.77      0.85        30
#     George W Bush       0.85      0.96      0.90       151
# Gerhard Schroeder       1.00      0.64      0.78        22
#       Hugo Chavez       1.00      0.91      0.95        11
#        Tony Blair       0.96      0.74      0.83        34
#
#       avg / total       0.89      0.88      0.88       322
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
# [[ 16   1   0   1   0   0   0]
#  [  0  50   0   6   0   0   0]
#  [  0   0  23   7   0   0   0]
#  [  0   6   0 145   0   0   0]
#  [  0   0   1   6  14   0   1]
#  [  0   0   0   1   0  10   0]
#  [  0   4   0   5   0   0  25]]
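
# The accuracy_score import commented out earlier could give a single summary number, e.g.:
# from sklearn.metrics import accuracy_score
# print("accuracy: %.3f" % accuracy_score(y_test, y_pred))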
#
#
###############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())

# plot the result of the prediction on a portion of the test set

def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significant eigenfaces

eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()