
Machine Learning: Grouping Unlabeled Data with the K-Means Clustering Algorithm

2017-07-06 10:30
This post records the code I implemented while studying machine learning, following Machine Learning in Action (《机器学习实战》).
from numpy import *

def loadDataSet(fileName):
    dataMat = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat

def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))

def randCent(dataSet, k):
    # build k random centroids that lie within the bounds of the data set
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)  # random.rand(k, 1) generates a k x 1 array of random floats in [0, 1.0)
    return centroids

#~ datMat=mat(loadDataSet('testSet.txt'))
#~ print(randCent(datMat,2))
#~ print(distEclud(datMat[0],datMat[1]))
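Since randCent draws from NumPy's global random state, the initial centroids change on every run. Seeding that state first (my own addition, not part of the book's listing) makes the commented-out test above reproducible:

#~ random.seed(0)  # with from numpy import *, this is numpy.random.seed
#~ datMat=mat(loadDataSet('testSet.txt'))
#~ print(randCent(datMat,2))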

def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  # per point: assigned cluster index and squared error
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            # assign each point to its nearest centroid
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        #~ print(centroids)
        for cent in range(k):
            # move each centroid to the mean of the points assigned to it
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment

#~ datMat=mat(loadDataSet('testSet.txt'))
#~ myCentroids,clustAssing=kMeans(datMat,4)
#~ print(myCentroids)
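To actually see the four clusters the commented-out call above would produce, a small scatter plot helps. The helper below is my own sketch, not part of the book's listing; it assumes matplotlib is installed and that testSet.txt holds the book's two-dimensional sample points.

import matplotlib.pyplot as plt

def plotClusters(dataSet, centroids, clusterAssment, k):
    # scatter each cluster's points, then mark the centroids with large '+' signs
    for cent in range(k):
        ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
        plt.scatter(ptsInClust[:, 0].A.flatten(), ptsInClust[:, 1].A.flatten(), s=20)
    plt.scatter(centroids[:, 0].A.flatten(), centroids[:, 1].A.flatten(), marker='+', s=200, c='k')
    plt.show()

#~ datMat=mat(loadDataSet('testSet.txt'))
#~ myCentroids,clustAssing=kMeans(datMat,4)
#~ plotClusters(datMat,myCentroids,clustAssing,4)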
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  # stores each point's cluster assignment and squared error
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]  # create a list with one centroid
    for j in range(m):  # calc initial error
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :])**2
    while (len(centList) < k):
        lowestSSE = inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]  # get the data points currently in cluster i
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # kMeans produces two centroids (clusters) and the squared error of every point in the split
            sseSplit = sum(splitClustAss[:, 1])  # compare the SSE to the current minimum
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i  # the cluster to split
                bestNewCents = centroidMat  # the two new centroids produced by the split
                bestClustAss = splitClustAss.copy()  # assignments and squared errors for the points in the split
                lowestSSE = sseSplit + sseNotSplit  # lowest total SSE so far
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)  # change 1 to 3, 4, or whatever
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]  # replace a centroid with two best centroids
        centList.append(bestNewCents[1, :].tolist()[0])
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss  # reassign new clusters, and SSE
    return mat(centList), clusterAssment

datMat3=mat(loadDataSet('testSet2.txt'))
cenList,myNewAssments=biKmeans(datMat3,3)
print(cenList)
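As a quick sanity check (again my own addition, not from the book), you can count how many of the testSet2.txt points land in each of the three clusters and report the total squared error:

for cent in range(3):
    numPts = len(nonzero(myNewAssments[:, 0].A == cent)[0])
    print('cluster %d holds %d points' % (cent, numPts))
print('total SSE: ', sum(myNewAssments[:, 1]))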