kmeans 算法,python
2015-11-16 17:38
603 查看
重点内容#################################################
from numpy import *
import time
import matplotlib.pyplot as plt
return sqrt(sum(power(vector2 - vector1, 2)))
numSamples, dim = dataSet.shape
centroids = zeros((k, dim))
for i in range(k):
index = int(random.uniform(0, numSamples))
centroids[i, :] = dataSet[index, :]
return centroids
numSamples = dataSet.shape[0]
# first column stores which cluster this sample belongs to,
# second column stores the error between this sample and its centroid
clusterAssment = mat(zeros((numSamples, 2)))
clusterChanged = True
numSamples, dim = dataSet.shape
if dim != 2:
print “Sorry! I can not draw because the dimension of your data is not 2!”
return 1
dataSet = []
fileIn = open(‘E:/Dataset/testSet.txt’)
for line in fileIn.readlines():
lineArr = line.strip().split(‘\t’)
dataSet.append([float(lineArr[0]), float(lineArr[1])])
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = kmeans(dataSet, k)
showCluster(dataSet, k, centroids, clusterAssment)
#################################################
# kmeans: k-means cluster
# Author   : zouxy
# Date     : 2013-12-25
# HomePage : http://blog.csdn.net/zouxy09
# Email    : zouxy09@qq.com
#################################################
import time

from numpy import *
import matplotlib.pyplot as plt
# calculate Euclidean distance
def euclDistance(vector1, vector2):return sqrt(sum(power(vector2 - vector1, 2)))
# init centroids with random samples
def initCentroids(dataSet, k):
    """Pick k distinct random samples of dataSet as the initial centroids.

    dataSet: (numSamples, dim) matrix/array of samples.
    k      : number of clusters; must satisfy k <= numSamples.
    Returns a (k, dim) array whose rows are rows of dataSet.
    """
    numSamples, dim = dataSet.shape
    centroids = zeros((k, dim))
    # Sample indices WITHOUT replacement: the original int(random.uniform(...))
    # could pick the same row twice, which guarantees an empty cluster.
    indices = random.permutation(numSamples)[:k]
    for i in range(k):
        centroids[i, :] = dataSet[indices[i], :]
    return centroids
# k-means cluster
def kmeans(dataSet, k):
    """Cluster dataSet into k clusters with Lloyd's k-means algorithm.

    dataSet: (numSamples, dim) matrix of samples.
    k      : number of clusters.
    Returns (centroids, clusterAssment): centroids is (k, dim); clusterAssment
    is a (numSamples, 2) matrix whose first column stores which cluster each
    sample belongs to and whose second column stores the squared error between
    the sample and its centroid.
    """
    numSamples = dataSet.shape[0]
    # first column stores which cluster this sample belongs to,
    # second column stores the error between this sample and its centroid
    clusterAssment = mat(zeros((numSamples, 2)))
    clusterChanged = True

    ## step 1: init centroids
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False
        ## for each sample
        for i in range(numSamples):
            # inf, not a magic 100000.0 cap, so large-magnitude data still works
            minDist = float('inf')
            minIndex = 0
            ## step 2: find the centroid that is closest
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            ## step 3: update its cluster; only an index change means "not
            ## converged", but the error column is refreshed every pass so it
            ## never goes stale as the centroids keep moving
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        ## step 4: update centroids
        for j in range(k):
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]]
            # an empty cluster would make mean() return NaN; keep the old
            # centroid in that case
            if pointsInCluster.shape[0] > 0:
                centroids[j, :] = mean(pointsInCluster, axis=0)

    print('Congratulations, cluster complete!')
    return centroids, clusterAssment
# show your cluster, only available with 2-D data
def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot 2-D samples colored by cluster, plus centroid markers.

    Returns 1 (and plots nothing) when the data is not 2-D or k exceeds the
    number of available marker styles; otherwise shows the plot and returns None.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large! please contact Zouxy")
        return 1

    # draw all samples, style chosen by the sample's cluster index
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids on top, larger so they stand out
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)

    plt.show()
## step 1: load data
print("step 1: load data…")
dataSet = []
# 'with' closes the file even on error; the original leaked the handle
with open('E:/Dataset/testSet.txt') as fileIn:
    for line in fileIn:
        lineArr = line.strip().split('\t')
        dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering
print("step 2: clustering…")
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = kmeans(dataSet, k)

## step 3: show the result
print("step 3: show the result…")
showCluster(dataSet, k, centroids, clusterAssment)
相关文章推荐
- 聚类算法之kmeans算法java版本
- python中kmeans聚类实现代码
- kmeans python版
- 数据挖掘-聚类-K-means算法Java实现
- Mahout-kmeans命令行文本文件聚类
- Kmeans
- 基于 Sift + Kmeans + 倒排索引 的图像匹配
- mahout-kmeans笔记
- 数据挖掘之KMEANS算法
- Mahout学习——Canopy Clustering
- Kmeans算法详解及实现
- Clustering of residential areas based on residential conditions
- Kmeans 聚类算法
- C++ 实现k-means machine learning 算法 Computer Vision
- Python 实现K-means算法
- 多目标进化问题,根据PCI进行小区的聚类,Kmeans算法的改进:二次聚类
- 颜色迁移之四——模糊聚类(FCM)算法
- Mahout学习之聚类算法Kmeans
- 强算KMeans聚类算法演示器
- WPF实现KMEANS算法