"""
.. module:: kmeans
:synopsis: Provides routines to apply K-means clustering.
.. moduleauthor:: Benardi Nunes <benardinunes@gmail.com>
"""
from math import inf, sqrt
from numpy import zeros, int64, power, sum, mean, full, nan
from numpy.random import permutation
[docs]def euclidean_dist(p, q):
"""Calculates Euclidean distance between 2 n-dimensional points.
Args:
p (numpy.array): First n-dimensional point.
q (numpy.array): Second n-dimensional point.
Returns:
float: Distance between 2 points.
"""
dist = p - q
dist = power(dist, 2)
dist = sum(dist)
dist = sqrt(dist)
return dist
[docs]def find_closest_centroids(X, initial_centroids):
"""Assigns to each example the indice of the closest centroid.
Args:
X (numpy.array): Features' dataset
initial_centroids (numpy.array): List of initialized centroids.
Returns:
numpy.array: Column vector of assigned centroids' indices.
"""
m = len(X)
K = len(initial_centroids)
idx = zeros((m, 1), dtype=int64)
for i in range(m):
best_c = -1
min_dist = inf
for k in range(K):
dist = euclidean_dist(X[i, :], initial_centroids[k])
if dist < min_dist:
best_c = k
min_dist = dist
idx[i, :] = best_c
return idx
[docs]def compute_centroids(X, idx, K):
"""Computes centroids from the mean of its cluster's members.
Computes centroids from the mean of its cluster's members if there are
any members for the centroid, else it returns an array of nan.
Args:
X (numpy.array): Features' dataset
idx (numpy.array): Column vector of assigned centroids' indices.
K (int): Number of centroids.
Returns:
numpy.array: Column vector of newly computed centroids
"""
m, n = X.shape
elements = None
centroids = zeros((K, n))
for k in range(K):
elements = X[(idx == k).flatten()]
if elements.size != 0:
centroids[k] = mean(elements, axis=0)
else:
centroids[k] = full((1, n), nan)
return centroids
[docs]def init_centroids(X, K):
"""Computes centroids from the mean of its cluster's members.
Args:
X (numpy.array): Features' dataset
idx (numpy.array): Column vector of assigned centroids' indices.
K (int): Number of centroids.
Returns:
numpy.array: Column vector of centroids randomly picked from dataset
"""
centroids = permutation(X)
centroids = centroids[0:K, :]
return centroids
[docs]def run_kmeans(X, K, max_iters):
"""Applies kmeans using a single random initialization.
Args:
X (numpy.array): Features' dataset
K (int): Number of centroids.
max_iters (int): Number of times the algorithm will be fitted.
Returns:
(numpy.array, numpy.array): A 2-tuple of centroids, a column vector of
centroids, and idx, a column vector of assigned centroids' indices.
"""
centroids = init_centroids(X, K)
for _ in range(max_iters):
idx = find_closest_centroids(X, centroids)
centroids = compute_centroids(X, idx, K)
return centroids, idx
[docs]def cost_function(X, idx, centroids):
"""Calculates the cost function for K means.
Args:
X (numpy.array): Features' dataset
idx (numpy.array): Column vector of assigned centroids' indices.
Returns:
float: Computed cost
"""
cost = 0
m = len(X)
for i in range(m):
centroid = centroids[idx[i][0]]
cost += power(euclidean_dist(X[i], centroid), 2)
cost = cost / m
return cost
[docs]def run_intensive_kmeans(X, K, max_iters, n_inits):
"""Applies kmeans using multiple random initializations.
Args:
X (numpy.array): Features' dataset
K (int): Number of centroids.
max_iters (int): Number of times the algorithm will be fitted.
n_inits (int): Number of random initialization.
Returns:
(numpy.array, numpy.array): A 2-tuple of centroids, a column vector of
centroids, and idx, a column vector of assigned centroids' indices.
"""
min_cost = inf
best_idx = None
best_centroids = None
for _ in range(n_inits):
centroids, idx = run_kmeans(X, K, max_iters)
cost = cost_function(X, idx, centroids)
if cost < min_cost:
best_idx = idx
best_centroids = centroids
return best_centroids, best_idx
[docs]def elbow_method(X, K_values, max_iters, n_inits):
"""Calculates the cost for each given K.
Args:
X (numpy.array): Features' dataset
K_values (list(int)): List of possible number of centroids.
max_iters (int): Number of times the algorithm will be fitted.
n_inits (int): Number of random initialization.
Returns:
(list(float)): a list of cost values for each K.
"""
cost_values = []
for K in K_values:
centroids, idx = run_intensive_kmeans(X, K, max_iters, n_inits)
cost = cost_function(X, idx, centroids)
cost_values.append(cost)
return cost_values