吴恩达机器学习 EX7 作业 第一部分 K均值聚类

2019-04-23 17:35 495 查看



1.1、 作业介绍


1.2 导入模块和数据


import matplotlib.pyplot as plt
import numpy as np
import scipy.io as scio
from skimage import io
from skimage import img_as_float

import runkMeans as km
import kMeansInitCentroids as kmic

import importlib

# Re-import kMeansInitCentroids so that edits made to the module after the
# first import take effect in the running session. The `imp` module is
# deprecated and was removed in Python 3.12; importlib.reload replaces it.
importlib.reload(kmic)
# np.set_printoptions(formatter={'float': '{: 0.6f}'.format})


# ===================== Part 1: Find Closest Centroids =====================
# Load the exercise dataset and take its 'X' entry as the data matrix.
data = scio.loadmat('ex7data2.mat')
X = data['X']


(300, 2)
# Select an initial set of centroids
k = 3  # Three centroids
# Hand-picked starting positions for the three cluster centres
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])

1.2 findClosestCentroids.py函数


def find_closest_centroids(X, centroids):
    """Assign every example to its nearest centroid.

    Args:
        X: 2-D array with one example per row.
        centroids: 2-D array with one centroid per row and the same
            number of columns as ``X``.

    Returns:
        Integer array with one entry per row of ``X``; entry ``i`` is the
        row index in ``centroids`` of the centroid closest to ``X[i]``
        (Euclidean distance). Ties go to the lowest centroid index.

    Raises:
        ValueError: If there are examples to assign but no centroids.
    """
    K = centroids.shape[0]
    m = X.shape[0]
    if K == 0 and m > 0:
        raise ValueError('centroids must contain at least one row')

    idx = np.zeros(m, dtype=int)
    # Smallest squared distance seen so far for each example.
    best = np.full(m, np.inf)

    # Loop over the (few) centroids rather than the (many) examples; each
    # pass is a single vectorised operation and only O(m) extra memory.
    for k in range(K):
        # The square root is monotonic, so comparing squared distances
        # selects the same centroid without computing it.
        sq_dist = np.sum(np.power(X - centroids[k], 2), axis=1)
        # Strict '<' keeps the earlier centroid on ties.
        closer = sq_dist < best
        idx[closer] = k
        best[closer] = sq_dist[closer]

    return idx


# Find the closest centroids for the examples using the initial_centroids
idx = find_closest_centroids(X, initial_centroids)

print('Closest centroids for the first 3 examples: ')
print(idx[0:3])
print('(the closest centroids should be 0, 2, 1 respectively)')
Closest centroids for the first 3 examples:
[0 2 1]
(the closest centroids should be 0, 2, 1 respectively)

1.3 computeCentroids.py函数


def compute_centroids(X, idx, K, previous_centroids=None):
    """Recompute each centroid as the mean of the examples assigned to it.

    Args:
        X: 2-D array with one example per row.
        idx: Sequence with one entry per row of ``X``; ``idx[i]`` is the
            centroid index that ``X[i]`` is assigned to.
        K: Number of centroids.
        previous_centroids: Optional array with ``K`` rows. When given, a
            centroid that has no assigned examples keeps its row from
            this array.

    Returns:
        Float array with ``K`` rows and as many columns as ``X``. The row
        of a centroid with no assigned examples is taken from
        ``previous_centroids`` if supplied, otherwise it is NaN.
    """
    n = X.shape[1]
    idx = np.asarray(idx)
    # Start from NaN so an empty cluster is visibly "undefined" without
    # calling np.mean on an empty slice (which emits a RuntimeWarning).
    centroids = np.full((K, n), np.nan)

    for i in range(K):
        members = X[idx == i]
        if members.shape[0] > 0:
            # New position = mean of all examples assigned to centroid i.
            centroids[i] = np.mean(members, axis=0)
        elif previous_centroids is not None:
            centroids[i] = previous_centroids[i]

    return centroids


# ===================== Part 2: Compute Means =====================
# Recompute the k centroids from the assignments currently held in idx.
centroids = compute_centroids(X, idx, k)
print('Centroids computed after initial finding of closest centroids: \n{}'.format(centroids))
# Reference values, printed so they can be compared by eye with the result above.
print('the centroids should be')
print('[[ 2.428301 3.157924 ]')
print(' [ 5.813503 2.633656 ]')
print(' [ 7.119387 3.616684 ]]')
Centroids computed after initial finding of closest centroids:
[[2.42830111 3.15792418]
[5.81350331 2.63365645]
[7.11938687 3.6166844 ]]
the centroids should be
[[ 2.428301 3.157924 ]
[ 5.813503 2.633656 ]
[ 7.119387 3.616684 ]]

1.4 runkMeans.py函数

import numpy as np
import matplotlib.pyplot as plt
#import matplotlib.colors as colors
#import matplotlib.cm as cmx
import findClosestCentroids as fc
import computeCentroids as cc

def run_kmeans(X, initial_centroids, max_iters, plot):
    """Run K-Means for a fixed number of iterations.

    Each iteration assigns every example to its closest centroid and then
    moves each centroid to the mean of the examples assigned to it.

    Args:
        X: 2-D array with one example per row.
        initial_centroids: 2-D array with one starting centroid per row.
        max_iters: Number of iterations to run.
        plot: When truthy, draw the assignments and centroid movement
            after each assignment step and wait for ENTER before
            continuing.

    Returns:
        Tuple ``(centroids, idx)``: the centroids after the last update
        and the assignments computed in the last iteration (i.e. against
        the centroids as they were before that final update).
    """
    if plot:
        # NOTE(review): the body of this branch was missing in the source;
        # opening a fresh figure is the presumed intent - confirm.
        plt.figure()

    # Initialize values
    m = X.shape[0]
    K = initial_centroids.shape[0]  # number of centroids
    centroids = initial_centroids
    previous_centroids = centroids
    idx = np.zeros(m, dtype=int)

    # Run K-Means
    for i in range(max_iters):
        # Output progress
        print('K-Means iteration {}/{}'.format((i + 1), max_iters))

        # For each example in X, assign it to the closest centroid
        idx = fc.find_closest_centroids(X, centroids)

        # Optionally plot progress
        if plot:
            plot_progress(X, centroids, previous_centroids, idx, K, i)
            # Remember where the centroids were so the next plot can draw
            # the segment each one moved along.
            previous_centroids = centroids
            input('Press ENTER to continue')

        # Given the memberships, compute new centroids
        updated = cc.compute_centroids(X, idx, K)
        # A centroid with no assigned examples comes back as NaN; keep its
        # previous position so the NaN cannot leak into later iterations.
        lost = np.isnan(updated).any(axis=1)
        updated[lost] = centroids[lost]
        centroids = updated

    return centroids, idx
def plot_progress(X, centroids, previous, idx, K, i):
    """Plot the data, the current centroids and how each centroid moved.

    Args:
        X: Data array; its first two columns are used as plot coordinates.
        centroids: Current centroid positions, one per row.
        previous: Centroid positions from the preceding iteration.
        idx: Per-example centroid index, used to colour the points.
        K: Number of centroids (not used by this function).
        i: Zero-based iteration counter; shown in the title as ``i + 1``.
    """
    # Examples, coloured by the index of their assigned centroid.
    plt.scatter(X[:, 0], X[:, 1], c=idx, s=15)
    # Current centroid positions as black crosses.
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', c='black', s=25)

    # Join every centroid to its position on the preceding iteration.
    for row, current in enumerate(centroids):
        draw_line(current, previous[row])

    plt.title('Iteration number {}'.format(i + 1))

def draw_line(p1, p2):
    """Plot a thin black line segment from point ``p1`` to point ``p2``."""
    xs = np.array([p1[0], p2[0]])
    ys = np.array([p1[1], p2[1]])
    plt.plot(xs, ys, c='black', linewidth=1)

1.5 调用k均值聚类算法绘制聚类过程


# ===================== Part 3: K-Means Clustering =====================
# Settings for running K-Means
K = 3  # three cluster centres
max_iters = 10  # run ten iterations

initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])

# The trailing True asks run_kmeans to plot progress on every iteration.
centroids, idx = km.run_kmeans(X, initial_centroids, max_iters, True)
print('K-Means Done.')
K-Means iteration 1/10
Press ENTER to continue
K-Means iteration 2/10
Press ENTER to continue
K-Means iteration 3/10
Press ENTER to continue
K-Means iteration 4/10
Press ENTER to continue
K-Means iteration 5/10
Press ENTER to continue
K-Means iteration 6/10
Press ENTER to continue
K-Means iteration 7/10
Press ENTER to continue
K-Means iteration 8/10
Press ENTER to continue
K-Means iteration 9/10
Press ENTER to continue
K-Means iteration 10/10
Press ENTER to continue
K-Means Done.


1.6 kMeansInitCentroids.py随机初始化聚类中心函数

在运行 K-均值算法之前,我们首先要随机初始化所有的聚类中心点,下面介绍怎样做:

  1. 我们应该选择K < m,即聚类中心点的个数要小于所有训练集实例的数量
  2. 随机选择K个训练实例,然后令K个聚类中心分别与这K个训练实例相等

K-均值算法的一个问题在于,它有可能会停留在一个局部最小值处,而这取决于聚类中心初始化的情况。为了解决这个问题,我们通常需要多次运行 K-均值算法,每一次都重新进行随机初始化,最后再比较多次运行 K-均值的结果,选择代价函数最小的结果。但是如果 K 较大,这么做也可能不会有明显的改善。
import numpy as np

def kmeans_init_centroids(X, K):
    """Pick K distinct training examples to serve as initial centroids.

    Args:
        X: 2-D array with one example per row.
        K: Number of centroids to draw.

    Returns:
        Array with ``K`` rows, each a row of ``X`` taken from a different
        row index.

    Raises:
        ValueError: If ``K`` is larger than the number of rows in ``X``.
    """
    m = X.shape[0]
    if K > m:
        raise ValueError(
            'K ({}) cannot exceed the number of examples ({})'.format(K, m))

    # Sample row indices without replacement: drawing the same example
    # twice would start two centroids on the same point, and one of them
    # would end up with no examples assigned to it.
    chosen = np.random.choice(m, K, replace=False)
    return X[chosen, :]


# Draw k randomly chosen training examples to use as the starting centroids.
initial_centroids = kmic.kmeans_init_centroids(X, k)
array([[6.20295231, 2.67856179],
[6.11768055, 2.85475655],
[2.1270185 , 0.95672042]])


# Run K-Means algorithm. The 'true' at the end tells our function to plot
# the progress of K-Means
centroids, idx = km.run_kmeans(X, initial_centroids, max_iters, True)
print('K-Means Done.')
K-Means iteration 1/10
Press ENTER to continue
K-Means iteration 2/10
Press ENTER to continue
K-Means iteration 3/10
Press ENTER to continue
K-Means iteration 4/10
Press ENTER to continue
K-Means iteration 5/10
Press ENTER to continue
K-Means iteration 6/10
Press ENTER to continue
K-Means iteration 7/10
Press ENTER to continue
K-Means iteration 8/10
Press ENTER to continue
K-Means iteration 9/10
Press ENTER to continue
K-Means iteration 10/10
Press ENTER to continue
K-Means Done.

1.7 K均值聚类算法进行图像压缩


# ===================== Part 4: K-Means Clustering on Pixels =====================

# Load an image of a bird
image = io.imread('bird_small.png')
# Convert the pixel data with skimage's img_as_float before clustering.
image = img_as_float(image)

# Size of the image
img_shape = image.shape

# Reshape the image into an Nx3 matrix where N = number of pixels.
# Each row will contain the Red, Green and Blue pixel values
# This gives us our dataset matrix X that we will use K-Means on.

X = image.reshape(img_shape[0] * img_shape[1], 3)

图片的维度:128 *128 * 3

# Show the image's shape next to the shape of the flattened pixel matrix.
print('image.shape: ', image.shape, '\nX.shape: ', X.shape)
image.shape:  (128, 128, 3)
X.shape:  (16384, 3)

将128 * 128图片数据压缩到 16个聚类中心,迭代10次

K = 16
max_iters = 10

# Start from K randomly chosen pixels.
initial_centroids = kmic.kmeans_init_centroids(X, K)

# Run K-Means (False: no progress plots)
centroids, idx = km.run_kmeans(X, initial_centroids, max_iters, False)
print('K-Means Done.')
K-Means iteration 1/10
K-Means iteration 2/10
K-Means iteration 3/10
K-Means iteration 4/10
K-Means iteration 5/10
K-Means iteration 6/10
K-Means iteration 7/10
K-Means iteration 8/10
K-Means iteration 9/10
K-Means iteration 10/10
K-Means Done.


array([[0.6542609 , 0.47321336, 0.21364849],
[0.65230797, 0.57831624, 0.51264976],
[0.09712915, 0.10455034, 0.09407214],
[0.48894175, 0.4333874 , 0.42881704],
[0.06023276, 0.06591191, 0.0560684 ],
[0.26616294, 0.24824112, 0.25030182],
[0.17853   , 0.17647848, 0.16819347],
[0.43423982, 0.32894873, 0.23466724],
[0.12429066, 0.13343329, 0.12440215],
[0.52275767, 0.65194078, 0.82456983],
[0.96515704, 0.86181965, 0.63655503],
[0.85939342, 0.71613389, 0.4662387 ],
[0.80782638, 0.73124406, 0.72083682],
[0.82261625, 0.60442577, 0.31533894],
[0.07805512, 0.08465313, 0.07399391],
[0.97377854, 0.94477901, 0.82111697]])


array([11, 11, 11, ...,  6,  6,  6])


# ===================== Part 5: Image Compression =====================
print('Applying K-Means to compress an image.')

# Find closest cluster members
# (assign every row of X to its nearest centroid among the final centroids)
idx2 = find_closest_centroids(X, centroids)
Applying K-Means to compress an image.
array([11, 11, 11, ...,  6,  6,  6])

取压缩后各像素对应的聚类中心元素,压缩后只需要存储 centroids 和 idx 即可

# Replace every pixel by the colour of its assigned centroid. Use idx2, the
# assignment computed against the final centroids; idx returned by
# run_kmeans was computed before the last centroid update.
X_recovered = centroids[idx2]

恢复压缩后的数据到原始图片数据维度128 * 128 * 3,并打印压缩前和压缩后的图片

# Reshape the recovered image into proper dimensions
X_recovered = np.reshape(X_recovered, (img_shape[0], img_shape[1], 3))

# NOTE(review): the imshow calls and the left-hand title were missing from
# the source, leaving both panels empty; restored on the assumption that the
# cell shows the original next to the compressed image - confirm.
# Left panel: the original image.
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.title('Original')

# Right panel: the image rebuilt from the K centroid colours.
plt.subplot(1, 2, 2)
plt.imshow(X_recovered)
plt.title('Compressed, with {} colors'.format(K))
Text(0.5,1,'Compressed, with 16 colors')

(128, 128, 3)

1.8 K均值聚类算法对自己图像进行压缩


# Load the image file '100.jpg'
image_lm = io.imread('100.jpg')
image_lm = img_as_float(image_lm)

# Size of the image
img_lm_shape = image_lm.shape

# Reshape the image into an Nx3 matrix where N = number of pixels.
# Each row will contain the Red, Green and Blue pixel values
# This gives us our dataset matrix X_lm that we will use K-Means on.

X_lm = image_lm.reshape(img_lm_shape[0] * img_lm_shape[1], 3)
(2448, 3264, 3)
(7990272, 3)


K = 16
max_iters = 10

# Start from K randomly chosen pixels of the image.
initial_centroids_lm = kmic.kmeans_init_centroids(X_lm, K)

# Run K-Means (False: no progress plots)
centroids_lm, idx_lm = km.run_kmeans(X_lm, initial_centroids_lm, max_iters, False)
print('K-Means Done.')
K-Means iteration 1/10
K-Means iteration 2/10
K-Means iteration 3/10
K-Means iteration 4/10
K-Means iteration 5/10
K-Means iteration 6/10
K-Means iteration 7/10
K-Means iteration 8/10
K-Means iteration 9/10
K-Means iteration 10/10
K-Means Done.


(16, 3)


# Replace every pixel by the colour of its assigned centroid.
X_recovered = centroids_lm[idx_lm]

# Reshape the recovered image into proper dimensions
X_recovered = np.reshape(X_recovered, (img_lm_shape[0], img_lm_shape[1], 3))
plt.figure(figsize=(30, 30))

# NOTE(review): the imshow calls and the left-hand title were missing from
# the source, leaving both panels empty; restored on the assumption that the
# cell shows the original next to the compressed image - confirm.
# Left panel: the original image.
plt.subplot(1, 2, 1)
plt.imshow(image_lm)
plt.title('Original')

# Right panel: the image rebuilt from the K centroid colours.
plt.subplot(1, 2, 2)
plt.imshow(X_recovered)
plt.title('Compressed, with {} colors'.format(K))
Text(0.5,1,'Compressed, with 16 colors')

