Bootstrapping K-Means: Getting a sense of centroid, boundary, and label uncertainties¶
Intro¶
K-Means, like many Machine Learning algorithms, can be highly sensitive to the particular realization of the training data. This sensitivity is especially pronounced in small or noisy datasets, where different samples from the same underlying population may lead to noticeably different centroid locations, cluster boundaries, and label assignments. As a result, the predictions of a K-Means model inherently carry uncertainty, an important consideration in applications where misclassification has practical consequences (e.g., assigning a customer to the wrong market segment).
A principled and widely used approach to quantify this uncertainty is bootstrap resampling. The bootstrap generates B replicated datasets by sampling, with replacement, n points from the original n. Fitting a K-Means model to each replica produces a distribution of centroid positions and cluster assignments. These distributions can then be used to derive uncertainty measures such as confidence intervals for the centroids (not implemented here) and cluster-membership probabilities for individual observations (implemented in this work).
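To make the procedure concrete, here is a minimal sketch of bootstrap membership probabilities using plain scikit-learn `KMeans` (not the `KMeansBoot` estimator used below). One subtlety is that cluster labels permute across refits; Hungarian matching of replica centroids to the reference centroids is one possible way to align them:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
k, B = 3, 50
rng = np.random.default_rng(0)

# reference fit on the full sample
ref = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
counts = np.zeros((len(X), k))  # how often each point falls in each (aligned) cluster

for _ in range(B):
    idx = rng.integers(0, len(X), len(X))  # draw n points with replacement
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X[idx])
    # align replica clusters to reference clusters via Hungarian matching on centroid distances
    _, perm = linear_sum_assignment(cdist(ref.cluster_centers_, km.cluster_centers_))
    relabel = np.argsort(perm)  # map replica label -> reference label
    counts[np.arange(len(X)), relabel[km.predict(X)]] += 1

labels_prob = counts / B  # empirical cluster-membership probabilities
```

Each row of `labels_prob` sums to 1, and rows where no entry equals 1 correspond to instances whose assignment varied across replicas.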
KMeansBoot is a scikit-learn-compatible estimator that integrates bootstrap resampling with K-Means clustering to provide uncertainty estimates for cluster memberships, centroids, and decision boundaries. Because it adheres to the scikit-learn API, it can be seamlessly integrated into pipelines that include preprocessing steps such as scaling or dimensionality reduction. This notebook demonstrates the use of KMeansBoot on several illustrative toy datasets.
Example of Usage¶
### DO THIS BEFORE IMPORTING NUMPY OR SKLEARN TO AVOID KNOWN MEMORY LEAK IN WINDOWS
import os
os.environ["OMP_NUM_THREADS"] = "1" # "1" is the safest setting
os.environ["MKL_NUM_THREADS"] = "2" # optional, often helps too
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
The scikit-learn version is 1.4.2.
Load libraries and set random seed
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.datasets import load_digits, load_iris, load_wine
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from BootstrapKMeans import KMeansBoot
random_state = 217
First we create a function to plot the results and return useful information on the instances that have membership uncertainty:
## ==========================
## functions to plot kmeans uncertainty
def get_color_code(cmaplist, labels_prob):
    n_samples, n_clusters = labels_prob.shape
    cols = np.zeros((n_samples, 3))  # array for label-uncertainty colors
    for ii in range(n_samples):
        # average color, weighted by the membership probabilities
        for jj in range(n_clusters):
            cols[ii, :] += np.array(cmaplist[jj][0:3]) * labels_prob[ii, jj]
    # clip all values between 0 and 1 to avoid floating-point rounding problems
    return np.clip(cols, 0.0, 1.0, out=cols)
def plot_pred(X_tr, X_ts, y_tr, kmeans_mod):
    '''Represent the KMeans results (labels plus centroids)
    together with the bootstrap uncertainty of labels and centroids.
    Only the first two dimensions are plotted.
    To visualize the level of membership/label uncertainty, the color of each
    instance ('col') is the average of all available colors (cmaplist),
    weighted by the label probabilities given by labels_prob.
    In addition, we store the indices ('id_unc') of the instances that show
    some level of uncertainty: if no value in a row of labels_prob equals 1,
    the assignment of that instance is uncertain.
    Input:
    -------
    X_tr: {array-like, sparse matrix} of shape (n_samples_tr, n_features)
        Training instances to cluster.
    X_ts: {array-like, sparse matrix} of shape (n_samples_ts, n_features)
        Test instances to assign labels to.
    kmeans_mod: KMeansBoot model fitted on X_tr
    Returns:
    labels_prob_tr: ndarray of shape (n_samples_tr, n_clusters)
        Probability of each point in the train set to belong to each cluster.
    id_unc_tr: ndarray of variable shape
        Indices of the observations with label uncertainty in the train set.
    labels_prob_ts: ndarray of shape (n_samples_ts, n_clusters)
        Probability of each point in the test set to belong to each cluster.
    id_unc_ts: ndarray of variable shape
        Indices of the observations with label uncertainty in the test set.
    '''
    # convert datasets to numpy arrays
    if not isinstance(X_tr, np.ndarray):
        X_tr = X_tr.to_numpy()
    if not isinstance(X_ts, np.ndarray):
        X_ts = X_ts.to_numpy()
    # get params from KMeansBoot model
    centroids = kmeans_mod.cluster_centers_
    labels_tr = kmeans_mod.labels_
    labels_prob_tr = kmeans_mod.labels_boot_prob_
    centroids_boot = kmeans_mod.cluster_centers_boot_
    id_unc_tr = kmeans_mod.uncertain_ins_
    n_clusters = centroids.shape[0]  # derive from the model instead of the notebook-global variable
    # predict label probabilities for the test set
    labels_ts, labels_prob_ts, id_unc_ts = kmeans_mod.predict(X_ts)
    ## Find uncertain labels and assign colors to probabilities
    # set number of colors in colormap to number of clusters
    cmap = plt.cm.gist_rainbow
    cmaplist = cmap(np.arange(0, 256, np.round(256/n_clusters)).astype(int))
    # for the train set
    cols_tr = get_color_code(cmaplist, labels_prob_tr)
    # for the test set
    cols_ts = get_color_code(cmaplist, labels_prob_ts)
    ### plot the results
    fig, ax = plt.subplots(1, 3, figsize=(15, 4.5))
    ## left plot (conventional KMeans results)
    # plot all observations; color according to their label/membership
    ax[0].scatter(X_tr[:, 0], X_tr[:, 1], c=cmaplist[labels_tr], s=15)
    # add centroids
    ax[0].scatter(centroids[:, 0], centroids[:, 1], c='white', edgecolor='black', marker='p', s=80)
    ax[0].set_title('KMeans centroids and labels \n(Train set)')
    # create custom legends
    leg_handles = []
    for jj in range(n_clusters):
        idx = np.where(labels_tr == jj)[0][0]
        legend = mlines.Line2D([], [], color=cmaplist[labels_tr[idx]], marker='o', ls='', label=str(labels_tr[idx]))
        leg_handles.append(legend)
    legend1 = ax[0].legend(handles=leg_handles, loc='best', fontsize=10, handletextpad=0.4, handlelength=0.75,
                           labelspacing=0.2, columnspacing=1, frameon=True, ncols=3, labelcolor='linecolor')
    ax[0].add_artist(legend1)
    ## center plot (bootstrapped KMeans results for train set)
    ax[1].scatter(X_tr[id_unc_tr, 0], X_tr[id_unc_tr, 1], c=cols_tr[id_unc_tr], s=15)
    # add bootstrapped centroids
    for bb in range(centroids_boot.shape[2]):
        ax[1].scatter(centroids_boot[:, 0, bb], centroids_boot[:, 1, bb], c='black', marker='.', alpha=0.1)
    ax[1].set_title('KMeansBoot bootstrapped centroids \nand uncertain labels (Train set)')
    ## right plot (label uncertainties for test set)
    # plot only instances with uncertainty
    ax[2].scatter(X_ts[id_unc_ts, 0], X_ts[id_unc_ts, 1], c=cols_ts[id_unc_ts], s=15)
    ax[2].set_title('KMeansBoot uncertain labels \n(Test set)')
    # match scales across panels
    ax[1].set_xlim(ax[0].get_xlim())
    ax[1].set_ylim(ax[0].get_ylim())
    ax[2].set_xlim(ax[0].get_xlim())
    ax[2].set_ylim(ax[0].get_ylim())
    return labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts
Example with artificial dataset (Blobs)¶
We generate three clusters (blobs) and split the dataset into train and test sets of equal size (just because).
n_samples = 1000
# different variances
X_blobs, y = make_blobs(n_samples = n_samples, cluster_std = [1.0, 2.5, 1.5], random_state = random_state)
# split in train and test sets
X_blobs_tr, X_blobs_ts, y_blobs_tr, _ = train_test_split(pd.DataFrame(X_blobs), y, test_size=0.5, random_state = random_state)
In a real scenario we would first have to determine the number of clusters using one of the well-known criteria (elbow plot, Silhouette, Gap statistic, etc.). Unfortunately for real-world practitioners, that number is itself affected by the train set (sampling bias). But it can, in turn, also be determined using the bootstrap, as is nicely explained here. I will skip that process and set n_clusters to 3. In addition, the number of bootstrap replicas (n_boot) is set to a decent value that is not too time consuming.
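As a quick illustration of one such criterion (a plain scikit-learn sketch, independent of KMeansBoot), the average silhouette score can be compared across candidate numbers of clusters on blobs-like data:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# blobs with heterogeneous spreads, similar in spirit to the example below
X, _ = make_blobs(n_samples=500, centers=3, cluster_std=[1.0, 2.5, 1.5],
                  random_state=217)

scores = {}
for k in range(2, 7):
    labels = KMeans(n_clusters=k, n_init=10, random_state=217).fit_predict(X)
    scores[k] = silhouette_score(X, labels)  # higher is better

best_k = max(scores, key=scores.get)
print(scores, best_k)
```

Note that repeating this on bootstrap replicas would reveal how stable the chosen k itself is, which is the sampling-bias point made above.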
# set number of bootstrap replicas and number of clusters.
n_boot = 299
n_clusters = 3
# instantiate and fit clustering
kmeans_boot_blobs = KMeansBoot(n_clusters = n_clusters,
n_boot = n_boot,
random_state = random_state,
verbose = 0).fit(X_blobs_tr)
Once fitted, we can access the parameters of the KMeansBoot model. For example:
#Coordinates of cluster centers (for full dataset)
centroids = kmeans_boot_blobs.cluster_centers_
#Labels of each point (for full dataset)
labels_tr = kmeans_boot_blobs.labels_
#Probabilities of each point to belong to the different clusters
labels_prob_tr = kmeans_boot_blobs.labels_boot_prob_
#Indices of the instances with label uncertainty
id_unc_tr = kmeans_boot_blobs.uncertain_ins_
# show membership probabilities for a few uncertain observations
df_prob = pd.DataFrame(labels_prob_tr[id_unc_tr,:], index = X_blobs_tr.index[id_unc_tr])
display(HTML("<h5>Cluster membership probabilities:</h5>"))
df_prob.head(10)
Cluster membership probabilities:
| 0 | 1 | 2 | |
|---|---|---|---|
| 746 | 0.591973 | 0.408027 | 0.000000 |
| 488 | 0.872910 | 0.127090 | 0.000000 |
| 672 | 0.073579 | 0.926421 | 0.000000 |
| 260 | 0.575251 | 0.000000 | 0.424749 |
| 510 | 0.856187 | 0.123746 | 0.020067 |
| 592 | 0.066890 | 0.933110 | 0.000000 |
| 32 | 0.090301 | 0.000000 | 0.909699 |
| 453 | 0.030100 | 0.969900 | 0.000000 |
| 547 | 0.030100 | 0.969900 | 0.000000 |
| 975 | 0.528428 | 0.000000 | 0.471572 |
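A detail worth noticing (an inference from the displayed values, not a documented property of KMeansBoot): with n_boot = 299, every probability in the table is a multiple of 1/299, consistent with each bootstrap replica casting one vote per instance. A quick sanity check:

```python
import numpy as np

n_boot = 299
# a few probabilities copied from the table above
probs = np.array([0.591973, 0.408027, 0.030100, 0.969900])
votes = probs * n_boot  # implied number of bootstrap "votes"
print(np.round(votes).astype(int))  # near-integers: 177, 122, 9, 290
```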
And make predictions of memberships and uncertainties for the test set:
# predict labels, its probabilities and uncertain assignments for test set
labels_ts, labels_prob_ts, id_unc_ts = kmeans_boot_blobs.predict(X_blobs_ts)
# show membership probabilities for a few uncertain observations
df_prob = pd.DataFrame(labels_prob_ts[id_unc_ts,:], index = X_blobs_ts.index[id_unc_ts])
display(HTML("<h5>Cluster membership probabilities:</h5>"))
df_prob.head(10)
Cluster membership probabilities:
| 0 | 1 | 2 | |
|---|---|---|---|
| 38 | 0.026756 | 0.973244 | 0.000000 |
| 969 | 0.026756 | 0.973244 | 0.000000 |
| 617 | 0.936455 | 0.000000 | 0.063545 |
| 399 | 0.013378 | 0.986622 | 0.000000 |
| 552 | 0.545151 | 0.454849 | 0.000000 |
| 286 | 0.872910 | 0.000000 | 0.127090 |
| 586 | 0.986622 | 0.010033 | 0.003344 |
| 644 | 0.501672 | 0.498328 | 0.000000 |
| 397 | 0.311037 | 0.000000 | 0.688963 |
| 352 | 0.852843 | 0.147157 | 0.000000 |
print('The percentage of uncertain train instances is {:.1f}.'.format(len(id_unc_tr)/X_blobs_tr.shape[0]*100))
print('The percentage of uncertain test instances is {:.1f}.'.format(len(id_unc_ts)/X_blobs_ts.shape[0]*100))
The percentage of uncertain train instances is 9.2. The percentage of uncertain test instances is 8.6.
Finally, we can plot the results using the above defined functions. In particular, we can plot:
- The conventional KMeans clustering results on the train set (left panel)
- The instances that KMeansBoot has identified as uncertain in the train set and the bootstrapped positions of the centroids (middle panel)
- The instances uncertainly assigned to a cluster in the test set (right panel)
labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts = plot_pred(X_blobs_tr, X_blobs_ts, y_blobs_tr, kmeans_boot_blobs)
Example with Toy Dataset (Iris):¶
# load data
X_iris, y = load_iris(return_X_y = True)
# split in train and test sets
X_iris_tr, X_iris_ts, y_iris_tr, _ = train_test_split(pd.DataFrame(X_iris), y, test_size=0.3, random_state = random_state)
# select number of clusters
n_clusters = 3
# instantiate and fit clustering
kmeans_boot_iris = KMeansBoot(n_clusters = n_clusters,
n_boot = n_boot,
random_state = random_state,
verbose = 0).fit(X_iris_tr)
labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts = plot_pred(X_iris_tr, X_iris_ts, y_iris_tr, kmeans_boot_iris)
df_prob = pd.DataFrame(labels_prob_ts[id_unc_ts,:], index = X_iris_ts.index[id_unc_ts])
display(HTML("<h5>Cluster membership probabilities:</h5>"))
df_prob.head(10)
Cluster membership probabilities:
| 0 | 1 | 2 | |
|---|---|---|---|
| 142 | 0.939799 | 0.0 | 0.060201 |
| 83 | 0.953177 | 0.0 | 0.046823 |
| 121 | 0.979933 | 0.0 | 0.020067 |
| 50 | 0.658863 | 0.0 | 0.341137 |
| 76 | 0.913043 | 0.0 | 0.086957 |
| 58 | 0.989967 | 0.0 | 0.010033 |
| 101 | 0.939799 | 0.0 | 0.060201 |
| 147 | 0.026756 | 0.0 | 0.973244 |
print('The percentage of uncertain train instances is {:.1f}.'.format(len(id_unc_tr)/X_iris_tr.shape[0]*100))
print('The percentage of uncertain test instances is {:.1f}.'.format(len(id_unc_ts)/X_iris_ts.shape[0]*100))
The percentage of uncertain train instances is 18.1. The percentage of uncertain test instances is 17.8.
Example with wine dataset (Pipeline: Standardization+PCA+K-means)¶
On the effect of changing the parameter threshold on the uncertainty determination
# load data
X_wine, y = load_wine(return_X_y = True)
# split in train and test sets
X_wine_tr, X_wine_ts, y_wine_tr, _ = train_test_split(pd.DataFrame(X_wine), y, test_size=0.2, random_state = random_state)
# select number of PCA components and clusters
n_components = 3
n_clusters = 3
# instantiate and fit pipeline
pipe = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True, random_state=random_state)),
('clustering', KMeansBoot(n_clusters = n_clusters,
n_boot = n_boot,
random_state = random_state,
verbose = 0))]).fit(X_wine_tr)
# transform data to represent first two PCs
pipe_pca = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True))])
X_pca_tr = pipe_pca.fit_transform(X_wine_tr)
X_pca_ts = pipe_pca.transform(X_wine_ts)
# plot results
labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts = plot_pred(X_pca_tr, X_pca_ts, y_wine_tr, pipe['clustering'])
Notice how this dataset has a very large uncertainty, and the cluster memberships and centroids are ill-defined:
print('The percentage of uncertain train instances is {:.1f}.'.format(len(id_unc_tr)/X_wine_tr.shape[0]*100))
print('The percentage of uncertain test instances is {:.1f}.'.format(len(id_unc_ts)/X_wine_ts.shape[0]*100))
The percentage of uncertain train instances is 21.1. The percentage of uncertain test instances is 27.8.
We can set the threshold parameter to a smaller value, so that fewer instances are flagged with membership uncertainty:
# change threshold value
threshold = 0.9 # WE CHANGE THIS
# instantiate and fit pipeline
pipe = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True, random_state = random_state)),
('clustering', KMeansBoot(n_clusters = n_clusters,
n_boot = n_boot,
threshold = threshold, #WE CHANGE THIS
random_state = random_state,
verbose = 0))]).fit(X_wine_tr)
# plot results
labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts = plot_pred(X_pca_tr, X_pca_ts, y_wine_tr, pipe['clustering'])
Notice how the number of uncertain instances has been reduced, but there is still a lot of uncertainty:
print('The percentage of uncertain train instances is {:.1f}.'.format(len(id_unc_tr)/X_wine_tr.shape[0]*100))
print('The percentage of uncertain test instances is {:.1f}.'.format(len(id_unc_ts)/X_wine_ts.shape[0]*100))
The percentage of uncertain train instances is 7.0. The percentage of uncertain test instances is 16.7.
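A plausible reading of the threshold parameter (an assumption based on the behavior above, not taken from the KMeansBoot source): an instance counts as uncertain when its largest membership probability falls below the threshold. That rule is easy to sketch:

```python
import numpy as np

def flag_uncertain(labels_prob, threshold=1.0):
    """Indices of rows whose highest membership probability is below threshold."""
    return np.where(labels_prob.max(axis=1) < threshold)[0]

labels_prob = np.array([[1.00, 0.00, 0.00],
                        [0.95, 0.05, 0.00],
                        [0.55, 0.45, 0.00]])
print(flag_uncertain(labels_prob, 1.0))  # rows 1 and 2 flagged
print(flag_uncertain(labels_prob, 0.9))  # only row 2 flagged
```

Lowering the threshold therefore shrinks the set of flagged instances, matching the drop from 21.1% to 7.0% seen above.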
Example with digits dataset (Pipeline: Standardization+PCA+K-means)¶
On the effect of changing the number of PCA components on the uncertainty determination.
Let's start by using only the first 2 PCs, which already have enough discriminative power (visually, the clusters appear quite distinct in 2D):
# load data
X_digits, y = load_digits(return_X_y = True)
# split in train and test sets
X_digits_tr, X_digits_ts, y_digits_tr, _ = train_test_split(pd.DataFrame(X_digits), y, test_size=0.2, random_state = random_state)
# select number of PCA components and clusters
n_clusters = 10
n_components = 2
# instantiate and fit pipeline
pipe = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True, random_state = random_state)),
('clustering', KMeansBoot(n_clusters = n_clusters,
n_boot = n_boot,
random_state = random_state,
verbose = 0))]).fit(X_digits_tr)
# transform data to represent first two PCs
pipe_pca = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True, random_state = random_state))])
X_pca_tr = pipe_pca.fit_transform(X_digits_tr)
X_pca_ts = pipe_pca.transform(X_digits_ts)
# plot results
labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts = plot_pred(X_pca_tr, X_pca_ts, y_digits_tr, pipe['clustering'])
KMeansBoot does a nice job clustering this dataset. Moreover, notice that, as expected, the digit 4 shows the largest dispersion in centroids. In any case, the uncertainty is large:
print('The percentage of uncertain train instances is {:.1f}.'.format(len(id_unc_tr)/X_digits_tr.shape[0]*100))
print('The percentage of uncertain test instances is {:.1f}.'.format(len(id_unc_ts)/X_digits_ts.shape[0]*100))
The percentage of uncertain train instances is 44.9. The percentage of uncertain test instances is 43.3.
We can try to improve this by increasing the number of PCs used. Beware of a few things about the digits dataset and KMeans:
- The first 2 PCs already capture very clear digit-separating directions
- Most of the variance (~90-95%) is captured by the first ~10 PCs
- The later PCs (PC10+, and especially PC20-PC63) mostly contain noise, which severely degrades KMeans, especially when bootstrapping
- KMeans struggles in high dimensions ("curse of dimensionality")
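The variance figures in the list above can be checked directly by computing the cumulative explained variance of PCA on the standardized digits data:

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X, _ = load_digits(return_X_y=True)
Xs = StandardScaler().fit_transform(X)

# cumulative fraction of total variance explained by the leading PCs
cum_var = np.cumsum(PCA().fit(Xs).explained_variance_ratio_)
print('10 PCs: {:.1%}, 20 PCs: {:.1%}'.format(cum_var[9], cum_var[19]))
```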
Then, let's try with 10 PCs:
# change number of PCA components
n_components = 10
# instantiate and fit pipeline
pipe = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True, random_state = random_state)),
('clustering', KMeansBoot(n_clusters = n_clusters,
n_boot = n_boot,
random_state = random_state,
verbose = 0))]).fit(X_digits_tr)
# transform data to represent first two PCs
pipe_pca = Pipeline([('scaler', StandardScaler()),
('PCA', PCA(n_components = n_components, whiten = True, random_state = random_state))])
X_pca_tr = pipe_pca.fit_transform(X_digits_tr)
X_pca_ts = pipe_pca.transform(X_digits_ts)
# plot results
labels_prob_tr, id_unc_tr, labels_prob_ts, id_unc_ts = plot_pred(X_pca_tr, X_pca_ts, y_digits_tr, pipe['clustering'])
Visually, the clustering looks horrendous, but that is because we are representing only the first 2 PCs. If we check the numbers, we see that the clustering is actually more accurate, since the uncertainty has been reduced:
print('The percentage of uncertain train instances is {:.1f}.'.format(len(id_unc_tr)/X_digits_tr.shape[0]*100))
print('The percentage of uncertain test instances is {:.1f}.'.format(len(id_unc_ts)/X_digits_ts.shape[0]*100))
The percentage of uncertain train instances is 21.9. The percentage of uncertain test instances is 23.9.
You can check for yourself that the best results (in terms of uncertainty) are obtained with n_components = 9.