SVM
2016-02-19 18:09
585 查看
1、svm的python实现
2、推导请参考ng讲义,SMO具体实现参考platt原文,以及《统计学习方法》
3、代码部分,以及实验数据参考《machine learning in action》
实验结果:
2、推导请参考ng讲义,SMO具体实现参考platt原文,以及《统计学习方法》
3、代码部分,以及实验数据参考《machine learning in action》
# -*- coding=utf-8 -*-
import numpy as np
import random


class SVM(object):
    """Support vector machine classifier trained with Platt's SMO algorithm.

    Labels are expected to be +1 / -1 in the last column of the data arrays.
    Derivation follows Andrew Ng's notes; SMO follows Platt's original paper
    and "Machine Learning in Action" (data/structure).
    """

    def __init__(self, train_data, test_data=None, C=200, toler=0.0001,
                 kernel="rbf", sigma=1, eta=1000):
        """
        :param train_data: array of shape (m, n+1); last column is the label
        :param test_data: optional array with the same layout as train_data
        :param C: soft-margin penalty parameter
        :param toler: KKT-violation tolerance used by the inner loop
        :param kernel: kernel function, valid values: "rbf", "linear"
        :param sigma: RBF kernel width
        :param eta: maximum number of SMO outer iterations
        """
        self.x = train_data[:, :-1]
        self.y = train_data[:, -1]
        if test_data is not None:
            self.test_x = test_data[:, :-1]
            self.test_y = test_data[:, -1]
        else:
            self.test_x = None
            self.test_y = None
        self.C = C
        self.toler = toler
        self.kernel = kernel
        self.sigma = sigma
        self.m, self.n = self.x.shape
        self.K = self.cal_kernel(self.x, self.sigma)
        self.alpha = np.zeros((self.m, 1))
        self.b = 0
        self.w = np.zeros((self.m, 1))
        self.eta = eta
        # first column is a "valid" flag, second column is the cached error
        self.errorCache = np.zeros((self.m, 2))
        self.support_vectors = None
        self.support_vectors_lables = None
        self.support_vectors_num = 0

    def Kernel(self, x, xi, sigma):
        """Return the kernel vector K(x_j, xi) for every row x_j of x.

        Always returns a 1-D array of shape (m,) for both kernels (the
        original returned (m,1) for rbf and (m,) for linear and relied on
        implicit broadcasting in cal_kernel).
        """
        m, n = x.shape
        K = np.zeros(m)
        if self.kernel == "linear":
            K = x.dot(xi.transpose())
        elif self.kernel == "rbf":
            for j in range(m):
                temp = x[j] - xi
                K[j] = temp.dot(temp.transpose())
            K = np.exp(K / (-2 * sigma ** 2))
        else:
            # typo fixed: the valid kernel names are "linear" and "rbf"
            print("wrong kernel, please input linear or rbf")
        return K

    def cal_kernel(self, x, sigma):
        """Calculate the full m-by-m kernel (Gram) matrix of the training data."""
        m, n = x.shape
        K = np.zeros((m, m))
        for i in range(m):
            K[:, i] = self.Kernel(x, x[i], sigma)
        return K

    def innerLoop(self, alpha_i):
        """The inner loop: try to optimize the pair (alpha_i, alpha_j).

        Returns 1 if the pair was changed, 0 otherwise.

        check and pick up the alpha who violates the KKT condition
        satisfy KKT condition
        1) yi*f(i) >= 1 and alpha == 0 (outside the boundary)
        2) yi*f(i) == 1 and 0 < alpha < C (on the boundary)
        3) yi*f(i) <= 1 and alpha == C (between the boundary)
        violate KKT condition
        because y[i]*E_i = y[i]*f(i) - y[i]^2 = y[i]*f(i) - 1, so
        1) if y[i]*E_i < 0, so yi*f(i) < 1, if alpha < C, violate! (alpha = C is correct)
        2) if y[i]*E_i > 0, so yi*f(i) > 1, if alpha > 0, violate! (alpha = 0 is correct)
        3) if y[i]*E_i = 0, so yi*f(i) = 1, it is on the boundary, needless to optimize
        """
        error_i = self.cal_error(alpha_i)
        if (self.y[alpha_i] * error_i < -self.toler and self.alpha[alpha_i, 0] < self.C) or \
                (self.y[alpha_i] * error_i > self.toler and self.alpha[alpha_i, 0] > 0):
            # step 1: select alpha j
            alpha_j, error_j = self.select_alpha_j(alpha_i, error_i)
            alpha_i_old = self.alpha[alpha_i, 0]
            alpha_j_old = self.alpha[alpha_j, 0]
            # step 2: calculate the boundary L and H for alpha j
            if self.y[alpha_i] != self.y[alpha_j]:
                L = max(0, self.alpha[alpha_j, 0] - self.alpha[alpha_i, 0])
                H = min(self.C, self.C + self.alpha[alpha_j, 0] - self.alpha[alpha_i, 0])
            else:
                L = max(0, self.alpha[alpha_j, 0] + self.alpha[alpha_i, 0] - self.C)
                H = min(self.C, self.alpha[alpha_j, 0] + self.alpha[alpha_i, 0])
            if L == H:
                return 0
            # step 3: calculate eta (the similarity of sample i and j)
            eta = 2.0 * self.K[alpha_i, alpha_j] - self.K[alpha_i, alpha_i] \
                - self.K[alpha_j, alpha_j]
            if eta >= 0:
                return 0
            # step 4: update alpha j
            self.alpha[alpha_j, 0] -= self.y[alpha_j] * (error_i - error_j) / eta
            # step 5: clip alpha j into [L, H]
            if self.alpha[alpha_j, 0] > H:
                self.alpha[alpha_j, 0] = H
            if self.alpha[alpha_j, 0] < L:
                self.alpha[alpha_j, 0] = L
            # step 6: if alpha j not moving enough, restore it and return
            if abs(alpha_j_old - self.alpha[alpha_j, 0]) < 0.00001:
                self.alpha[alpha_j, 0] = alpha_j_old
                return 0
            # step 7: update alpha i after optimizing alpha j
            self.alpha[alpha_i, 0] += self.y[alpha_i] * self.y[alpha_j] \
                * (alpha_j_old - self.alpha[alpha_j, 0])
            # step 8: update threshold b
            b1 = self.b - error_i \
                - self.y[alpha_i] * (self.alpha[alpha_i, 0] - alpha_i_old) * self.K[alpha_i, alpha_i] \
                - self.y[alpha_j] * (self.alpha[alpha_j, 0] - alpha_j_old) * self.K[alpha_i, alpha_j]
            b2 = self.b - error_j \
                - self.y[alpha_i] * (self.alpha[alpha_i, 0] - alpha_i_old) * self.K[alpha_i, alpha_j] \
                - self.y[alpha_j] * (self.alpha[alpha_j, 0] - alpha_j_old) * self.K[alpha_j, alpha_j]
            if 0 < self.alpha[alpha_i, 0] < self.C:
                self.b = b1
            elif 0 < self.alpha[alpha_j, 0] < self.C:
                self.b = b2
            else:
                self.b = (b1 + b2) / 2.0
            # step 9: update error cache for alpha i, j after optimizing them and b
            self.update_error(alpha_j)
            self.update_error(alpha_i)
            return 1
        else:
            return 0

    def cal_error(self, i):
        """Calculate the output error E_i = g(x_i) - y_i for sample i."""
        g_xi = (self.alpha[:, 0] * self.y).dot(self.K[:, i]) + self.b
        return g_xi - self.y[i]

    def update_error(self, alpha_k):
        """Update the error cache for alpha k after optimizing alpha k."""
        error = self.cal_error(alpha_k)
        self.errorCache[alpha_k] = [1, error]

    def select_alpha_j(self, alpha_i, error_i):
        """Select alpha j which gives the biggest step |E_i - E_j|."""
        self.errorCache[alpha_i] = [1, error_i]  # mark as valid (has been optimized)
        candidateAlphaList = np.nonzero(self.errorCache[:, 0])[0]
        maxStep = 0
        alpha_j = 0
        error_j = 0
        # find the alpha with the maximal iterative step
        if len(candidateAlphaList) > 1:
            for alpha_k in candidateAlphaList:
                if alpha_k == alpha_i:
                    continue
                error_k = self.cal_error(alpha_k)
                if abs(error_k - error_i) > maxStep:
                    maxStep = abs(error_k - error_i)
                    alpha_j = alpha_k
                    error_j = error_k
        else:
            # first time through this loop: select alpha j randomly
            alpha_j = alpha_i
            while alpha_j == alpha_i:
                alpha_j = int(random.uniform(0, self.m))
            error_j = self.cal_error(alpha_j)
        return alpha_j, error_j

    def SMO(self):
        """Platt's Sequential Minimal Optimization algorithm (outer loop)."""
        iter_num = 0  # renamed from `iter` to avoid shadowing the builtin
        alphaPairsChanged = 0
        entireSet = True
        while (iter_num < self.eta) and ((alphaPairsChanged > 0) or entireSet):
            alphaPairsChanged = 0
            if entireSet:  # go over all samples
                for i in range(self.m):
                    alphaPairsChanged += self.innerLoop(i)
                    print("fullSet, iter: %d i:%d, pairs changed %d" % (iter_num, i, alphaPairsChanged))
                iter_num += 1
            else:  # go over non-bound (railed) alphas only
                nonBoundIs = np.nonzero((self.alpha > 0) * (self.alpha < self.C))[0]
                for i in nonBoundIs:
                    alphaPairsChanged += self.innerLoop(i)
                    print("non-bound, iter: %d i:%d, pairs changed %d" % (iter_num, i, alphaPairsChanged))
                iter_num += 1
            if entireSet:
                entireSet = False  # toggle entire-set loop
            elif alphaPairsChanged == 0:
                entireSet = True
            print("iteration number: %d" % iter_num)

    def cal_w(self):
        """Calculate the primal weight vector w = sum_i alpha_i * y_i * x_i."""
        self.w = (self.alpha[:, 0] * self.y).dot(self.x)
        print(self.w.shape)
        print(self.w)

    def fit(self):
        """Train the model: run SMO, compute w and collect the support vectors."""
        self.SMO()
        self.cal_w()
        # the support vectors are the samples with alpha > 0
        index = np.nonzero(self.alpha > 0)[0]
        self.support_vectors = self.x[index, :]
        self.support_vectors_lables = self.y[index]
        self.support_vectors_alpha = self.alpha[index, 0]
        print("there are %d Support Vectors" % len(self.support_vectors))

    def predict(self, xi):
        """Predict the input x_i: positive class is 1 and negative class is -1."""
        kernel = self.Kernel(self.support_vectors, xi, self.sigma)
        return np.sign((self.support_vectors_alpha * self.support_vectors_lables).dot(kernel) + self.b)

    def accuracy(self):
        """Print classification accuracy on the training (and optional test) set."""
        num = 0.0
        for i in range(self.m):
            if self.predict(self.x[i]) == np.sign(self.y[i]):
                num += 1
        print("the training data accuracy is %f" % (num / self.m))
        if self.test_x is not None:
            num = 0.0
            for i in range(len(self.test_x)):
                if self.predict(self.test_x[i]) == np.sign(self.test_y[i]):
                    num += 1
            print("the test data accuracy is %f" % (num / len(self.test_x)))


def loadData(path):
    """Load whitespace-separated numeric data; each row is (features..., label)."""
    data = np.loadtxt(path)
    return data


# Guarded so importing this module no longer triggers file I/O and training.
if __name__ == "__main__":
    training_data = loadData("D:\\SelfLearning\\Machine Learning\\MachineLearningInAction\\machinelearninginaction\\Ch06\\testSetRBF.txt")
    test_data = loadData("D:\\SelfLearning\\Machine Learning\\MachineLearningInAction\\machinelearninginaction\\Ch06\\testSetRBF2.txt")
    svm = SVM(training_data, test_data=test_data, sigma=1.3, C=200, kernel="rbf")
    svm.fit()
    svm.accuracy()
实验结果:
the training data accuracy is 1.000000 the test data accuracy is 0.990000
相关文章推荐
- Python动态类型的学习---引用的理解
- Python3写爬虫(四)多线程实现数据爬取
- 垃圾邮件过滤器 python简单实现
- 下载并遍历 names.txt 文件,输出长度最长的回文人名。
- install and upgrade scrapy
- Scrapy的架构介绍
- Centos6 编译安装Python
- 使用Python生成Excel格式的图片
- 让Python文件也可以当bat文件运行
- [Python]推算数独
- Python中zip()函数用法举例
- Python中map()函数浅析
- Python将excel导入到mysql中
- Python在CAM软件Genesis2000中的应用
- 使用Shiboken为C++和Qt库创建Python绑定
- FREEBASIC 编译可被python调用的dll函数示例
- Python 七步捉虫法