支持向量机(SVM)算法应用
2017-10-05 22:12
1186 查看
第一个简单的小例子:
第二个例子:
第三个例子:
# -*- coding: utf-8 -*-
# First example: fit a linear SVM on three 2-D points and inspect which of
# them end up as support vectors.
from sklearn import svm

# Three training points in 2-D and their class labels.
x = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]

# kernel: the kernel function. The default is 'rbf'; other accepted values are
# 'linear', 'poly', 'sigmoid' and 'precomputed'.
clf = svm.SVC(kernel='linear')
clf.fit(x, y)

# Single-argument print(...) produces the same output on Python 2 and
# Python 3, whereas the bare `print clf` statement is a SyntaxError on 3.
print(clf)

# get support vectors
print(clf.support_vectors_)

# get indices of support vectors; per the original author's run this prints
# [1 2], i.e. x[1] == [1, 1] and x[2] == [2, 3]
print(clf.support_)

# get number of support vectors for each class
print(clf.n_support_)
第二个例子:
# -*- coding: utf-8 -*-
# Second example: fit a linear SVM to two separable 2-D point clouds and plot
# the separating line together with the two margin lines.
from __future__ import print_function  # makes print(a, b) identical on Python 2 and 3

import numpy as np
import pylab as pl  # pylab supplies the plotting functions used below
from sklearn import svm

# we create 40 separable points
# np.random.randn(20, 2) draws a 20x2 matrix from the standard normal
# distribution (mean 0, standard deviation 1). Subtracting / adding [2, 2]
# shifts the cluster centre to (-2, -2) / (2, 2); the spread is unchanged.
# np.r_ concatenates the two arrays by row (np.c_ would do so by column).
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Sample from one run (first rows only):
# >>> x = np.random.randn(20, 2)
# >>> print(x)                      | print(x - [2, 2])
# [[-0.63145976  0.33214385]        | [[-2.63145976 -1.66785615]
#  [-0.73478573 -0.20835977]        |  [-2.73478573 -2.20835977]
#  [-1.90850336 -0.3006437 ]        |  [-3.90850336 -2.3006437 ]
#  ...                              |  ...

# Labels: the first 20 points are class 0, the last 20 are class 1.
Y = [0] * 20 + [1] * 20

# fit the model
# Every scikit-learn estimator follows the same pattern: the algorithm is a
# class exposing fit(), predict(), etc.; pass in the training samples, the
# labels and any model parameters. SVC is the SVM classifier; the svm module
# also provides SVR for regression.
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane
# clf.coef_ holds the weight vector, shaped like [[0.58170366 0.54913603]].
w = clf.coef_[0]
# The hyperplane is w[0]*x0 + w[1]*x1 + b = 0, i.e.
# x1 = -(w[0]/w[1]) * x0 - b/w[1]; `a` is the slope of that line.
a = -w[0] / w[1]
# linspace(-5, 5) yields 50 evenly spaced values by default; a third argument
# would set the number of samples explicitly.
xx = np.linspace(-5, 5)
# clf.intercept_ holds the bias term b.
yy = a * xx - (clf.intercept_[0]) / w[1]
# yy is an array of 50 values, e.g. [4.83387789 4.63169875 ... -5.07289955]

# plot the parallels to the separating hyperplane that pass through the
# support vectors: same slope `a`, offset chosen so the line goes through the
# first (respectively last) entry of clf.support_vectors_.
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

print("w: ", w)
print("a: ", a)
# w:  [ 0.67022618  0.51039141]
# a:  -1.31316116667

print("support_vectors_: ", clf.support_vectors_)
print("clf.coef_: ", clf.coef_)
# support_vectors_:  [[-1.13165211 -0.61397661]
#  [ 2.20222797 -1.34013617]
#  [-0.99613858  0.9161059 ]]
# clf.coef_:  [[ 0.76574617  1.08549389]]

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')  # solid line: the separating hyperplane
pl.plot(xx, yy_down, 'k--')  # dashed lines: the margins
pl.plot(xx, yy_up, 'k--')

# Hollow circles highlight the support vectors.
pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)

pl.axis('tight')
pl.show()  # show the plot on the screen
第三个例子:
# -*- coding: utf-8 -*-
"""Face recognition on the LFW dataset using eigenfaces (PCA) and an RBF SVM.

The script loads the "Labeled Faces in the Wild" people subset, splits it
into training and test sets, projects the pixel vectors onto the leading PCA
components, tunes an RBF-kernel SVC with a grid search, prints the
classification report and confusion matrix for the test set, and finally
plots a gallery of predictions and of the eigenfaces.
"""
from __future__ import print_function

import logging  # used to print progress information
from time import time

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report  # text report of the main classification metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Notes on imports that changed between scikit-learn releases:
#   sklearn.cross_validation.train_test_split -> sklearn.model_selection.train_test_split
#   sklearn.grid_search.GridSearchCV          -> sklearn.model_selection.GridSearchCV
#   sklearn.decomposition.RandomizedPCA       -> PCA(svd_solver='randomized')
# GridSearchCV tunes parameters automatically: give it the candidate values
# and it reports the best combination. It suits small datasets; on large ones
# the search becomes very slow.
# Other metrics that could be imported from sklearn.metrics if needed:
# matthews_corrcoef (MCC), roc_auc_score (ROC AUC), accuracy_score (ACC).

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


###############################################################################
# Download the data, if not already on disk and load it as numpy arrays

# fetch_lfw_people loads the LFW face-recognition dataset. The returned object
# exposes data (flattened face vectors), images, target (numeric person id)
# and target_names (person names).
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape
# print(n_samples, h, w)  # 1288 50 37 in the recorded run

# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data  # feature matrix, one row per face
n_features = X.shape[1]  # number of features == number of columns
# print(X.shape)  # (1288, 1850) in the recorded run

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]
# print(target_names)
# ['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
#  'Gerhard Schroeder' 'Hugo Chavez' 'Tony Blair']
# print(target_names.shape)
# (7,)

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
# Total dataset size:
# n_samples: 1288
# n_features: 1850
# n_classes: 7


###############################################################################
# Split into a training set and a test set.
# This is a plain random 75% / 25% split; it is not stratified and no
# random_state is fixed, so results vary from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)
# print(X_train.shape)  # (966, 1850) in the recorded run
# print(X_test.shape)   # (322, 1850) in the recorded run


###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150  # number of leading eigenfaces to keep

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
# svd_solver='randomized' replaces the removed RandomizedPCA class.
# whiten=True rescales the projected components; the original author notes it
# may discard some information but tends to give better predictions.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# Each principal component reshaped back to image dimensions is an eigenface.
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
# Once the PCA model is fitted, transform() reduces any new data the same way.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))


###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
# param_grid: dict (or list of dicts) of the parameter values to search over.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# RBF kernel with balanced class weights.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
# Recorded output of one run:
#                    precision    recall  f1-score   support
#
#      Ariel Sharon       1.00      0.89      0.94        18
#      Colin Powell       0.82      0.89      0.85        56
#   Donald Rumsfeld       0.96      0.77      0.85        30
#     George W Bush       0.85      0.96      0.90       151
# Gerhard Schroeder       1.00      0.64      0.78        22
#       Hugo Chavez       1.00      0.91      0.95        11
#        Tony Blair       0.96      0.74      0.83        34
#
#       avg / total       0.89      0.88      0.88       322

print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
# [[ 16   1   0   1   0   0   0]
#  [  0  50   0   6   0   0   0]
#  [  0   0  23   7   0   0   0]
#  [  0   6   0 145   0   0   0]
#  [  0   0   1   6  14   0   1]
#  [  0   0   0   1   0  10   0]
#  [  0   4   0   5   0   0  25]]


###############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Plot a gallery of portraits in an ``n_row`` x ``n_col`` grid.

    images: sequence of flat or 2-D image arrays; each is reshaped to (h, w).
    titles: sequence of title strings, parallel to ``images``.
    h, w:   height and width used to reshape every image.
    n_row, n_col: grid dimensions.

    At most ``n_row * n_col`` images are drawn. If fewer images or titles are
    supplied, only those available are drawn and the remaining cells stay
    empty instead of raising IndexError.
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    n_plots = min(n_row * n_col, len(images), len(titles))
    for i in range(n_plots):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # Hide the axis ticks; they carry no meaning for portraits.
        plt.xticks(())
        plt.yticks(())


# plot the result of the prediction on a portion of the test set

def title(y_pred, y_test, target_names, i):
    """Return the 'predicted / true' caption for test sample ``i``.

    Only the last whitespace-separated word of each name is shown.
    """
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)


prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significative eigenfaces

eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()
相关文章推荐
- 支持向量机SVM算法原理及应用(R)
- 支持向量机(SVM)算法在人脸识别上的应用
- 第12节--支持向量机(SVM)算法在人脸识别上的应用
- 机器学习---支持向量机(SVM)算法应用(下)
- SVM(支持向量机)算法原理和实际应用
- 机器学习---支持向量机(SVM)算法应用(上)
- 机器学习教程 五.SVM(支持向量机)算法理解和应用
- 支持向量机SVM算法应用【Python实现】
- 机器学习笔记八 - SVM(Support Vector Machine,支持向量机)的剩余部分。即核技法、软间隔分类器、对SVM求解的序列最小化算法以及SVM的一些应用
- 支持向量机(SVM)算法的Python实现
- 第9节--支持向量机(SVM)算法原理
- 第10节--支持向量机(SVM)算法代码
- 支持向量机(SVM)的SMO算法详解
- 分类算法SVM(支持向量机)
- 机器学习中的算法(2)-支持向量机(SVM)基础
- (十一)机器学习中的一个常用算法SVM算法,即支持向量机Support Vector Machine(SVM)
- 数据挖掘---分类算法之支持向量机SVM
- 支持向量机SVM(Support Vector Machine)算法初解
- 机器学习中的算法(2)-支持向量机(SVM)基础-----【推荐】
- 机器学习-python通过序列最小优化算法(SMO)方法编写支持向量机(SVM)