Source code for touvlo.unsupv.kmeans

"""
.. module:: kmeans
    :synopsis: Provides routines to apply K-means clustering.

.. moduleauthor:: Benardi Nunes <benardinunes@gmail.com>
"""

from math import inf, sqrt

from numpy import zeros, int64, power, sum, mean, full, nan
from numpy.random import permutation


[docs]def euclidean_dist(p, q): """Calculates Euclidean distance between 2 n-dimensional points. Args: p (numpy.array): First n-dimensional point. q (numpy.array): Second n-dimensional point. Returns: float: Distance between 2 points. """ dist = p - q dist = power(dist, 2) dist = sum(dist) dist = sqrt(dist) return dist
[docs]def find_closest_centroids(X, initial_centroids): """Assigns to each example the indice of the closest centroid. Args: X (numpy.array): Features' dataset initial_centroids (numpy.array): List of initialized centroids. Returns: numpy.array: Column vector of assigned centroids' indices. """ m = len(X) K = len(initial_centroids) idx = zeros((m, 1), dtype=int64) for i in range(m): best_c = -1 min_dist = inf for k in range(K): dist = euclidean_dist(X[i, :], initial_centroids[k]) if dist < min_dist: best_c = k min_dist = dist idx[i, :] = best_c return idx
[docs]def compute_centroids(X, idx, K): """Computes centroids from the mean of its cluster's members. Computes centroids from the mean of its cluster's members if there are any members for the centroid, else it returns an array of nan. Args: X (numpy.array): Features' dataset idx (numpy.array): Column vector of assigned centroids' indices. K (int): Number of centroids. Returns: numpy.array: Column vector of newly computed centroids """ m, n = X.shape elements = None centroids = zeros((K, n)) for k in range(K): elements = X[(idx == k).flatten()] if elements.size != 0: centroids[k] = mean(elements, axis=0) else: centroids[k] = full((1, n), nan) return centroids
[docs]def init_centroids(X, K): """Computes centroids from the mean of its cluster's members. Args: X (numpy.array): Features' dataset idx (numpy.array): Column vector of assigned centroids' indices. K (int): Number of centroids. Returns: numpy.array: Column vector of centroids randomly picked from dataset """ centroids = permutation(X) centroids = centroids[0:K, :] return centroids
[docs]def run_kmeans(X, K, max_iters): """Applies kmeans using a single random initialization. Args: X (numpy.array): Features' dataset K (int): Number of centroids. max_iters (int): Number of times the algorithm will be fitted. Returns: (numpy.array, numpy.array): A 2-tuple of centroids, a column vector of centroids, and idx, a column vector of assigned centroids' indices. """ centroids = init_centroids(X, K) for _ in range(max_iters): idx = find_closest_centroids(X, centroids) centroids = compute_centroids(X, idx, K) return centroids, idx
[docs]def cost_function(X, idx, centroids): """Calculates the cost function for K means. Args: X (numpy.array): Features' dataset idx (numpy.array): Column vector of assigned centroids' indices. Returns: float: Computed cost """ cost = 0 m = len(X) for i in range(m): centroid = centroids[idx[i][0]] cost += power(euclidean_dist(X[i], centroid), 2) cost = cost / m return cost
[docs]def run_intensive_kmeans(X, K, max_iters, n_inits): """Applies kmeans using multiple random initializations. Args: X (numpy.array): Features' dataset K (int): Number of centroids. max_iters (int): Number of times the algorithm will be fitted. n_inits (int): Number of random initialization. Returns: (numpy.array, numpy.array): A 2-tuple of centroids, a column vector of centroids, and idx, a column vector of assigned centroids' indices. """ min_cost = inf best_idx = None best_centroids = None for _ in range(n_inits): centroids, idx = run_kmeans(X, K, max_iters) cost = cost_function(X, idx, centroids) if cost < min_cost: best_idx = idx best_centroids = centroids return best_centroids, best_idx
[docs]def elbow_method(X, K_values, max_iters, n_inits): """Calculates the cost for each given K. Args: X (numpy.array): Features' dataset K_values (list(int)): List of possible number of centroids. max_iters (int): Number of times the algorithm will be fitted. n_inits (int): Number of random initialization. Returns: (list(float)): a list of cost values for each K. """ cost_values = [] for K in K_values: centroids, idx = run_intensive_kmeans(X, K, max_iters, n_inits) cost = cost_function(X, idx, centroids) cost_values.append(cost) return cost_values