[1]:
from sklearn.cluster import KMeans
from mr_toolkit.clustering import StratifiedClusters
import numpy as np
import matplotlib.pyplot as plt

Stratified Clustering Demo

Stratified clustering is like a hierarchical k-means clustering. A set of bins is defined, and k-means is performed independently in each bin.

We’re going to demonstrate this by taking a dataset and splitting it up into 6 clusters.

First, we’ll cluster it with standard k-means.

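Then, we’ll stratify it twice: once in x and, separately, once in y. In each case we’ll define 2 stratum boundaries, which makes 3 strata; placing 2 clusters in each stratum gives the same 6 clusters in total.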

Create some sample data

[2]:
# Palette for colouring points by cluster: the integer cluster label is used
# as an index into this array, so there is one entry per cluster (six total).
colors = np.array([
    f'tab:{shade}'
    for shade in ('red', 'orange', 'cyan', 'blue', 'pink', 'purple')
])

# Number of clusters to place in each stratum
n_clusters = 2
[3]:
# Nine 2-D sample points, one (x, y) pair per row. Rows are listed three per
# line; each line happens to hold the points of one x-range.
sample_data = np.array(
    [
        (3.0, 23), (3.5, 27), (4.5, 87),   # x below 5
        (6.0, 14), (6.2, 8), (5.3, 91),    # x between 5 and 7
        (8.4, 33), (8.7, 32), (8.9, 80),   # x above 7
    ],
    dtype=float,
)
[4]:
# Plot the raw, unclustered points: column 0 on the x axis, column 1 on the y axis.
plt.scatter(sample_data[:, 0], sample_data[:, 1])
[4]:
<matplotlib.collections.PathCollection at 0x7fe2b5bf0790>
../_images/_examples_stratified_clustering_6_1.png

Standard K-means clustering

[5]:
# Baseline: ordinary k-means over the whole dataset, asking for the same total
# number of clusters that the stratified runs produce (n_clusters per stratum,
# times 3 strata).
#
# random_state pins the centroid initialisation. Without it the labels (and
# therefore the colours in the plots below) can change from one
# "Restart & Run All" to the next.
kmeans = KMeans(n_clusters=n_clusters * 3, n_init='auto', random_state=0)

# Fit on the sample points and get one integer cluster label per row.
cluster_assignments = kmeans.fit_predict(sample_data)
[6]:
# Same scatter as before, with each point coloured by its k-means cluster label
# (the label indexes into the palette).
plt.scatter(
    sample_data[:, 0],
    sample_data[:, 1],
    color=colors[cluster_assignments],
)
[6]:
<matplotlib.collections.PathCollection at 0x7fe2b5ae6b90>
../_images/_examples_stratified_clustering_9_1.png

Stratify on the first dimension

[7]:
# Two stratum boundaries, which split the data into three strata. They are
# applied to the first coordinate (x) when the clusterer is fitted, and are
# drawn as vertical lines in the plots.
vertical_bounds = np.array([5, 7])

# n_clusters is the number of clusters placed in each stratum.
clusterer = StratifiedClusters(n_clusters, bin_bounds=vertical_bounds)
[8]:
# Fit the stratified clusterer on the sample points, stratifying on
# coordinate 0 (the x column).
clusterer.fit(sample_data, coord_to_stratify=0)
[9]:
# One integer cluster label per row of sample_data; these labels are later used
# to index the colour palette.
vertical_assignments = clusterer.predict(sample_data)
# Bare name as the last expression so the notebook displays the label array.
vertical_assignments
[9]:
array([0, 0, 1, 2, 2, 3, 5, 5, 4])
[10]:
# Colour each point by its x-stratified cluster label, then mark every stratum
# boundary with a vertical gray line.
plt.scatter(
    sample_data[:, 0],
    sample_data[:, 1],
    color=colors[vertical_assignments],
)

for x_boundary in vertical_bounds:
    plt.axvline(x=x_boundary, color='gray')
../_images/_examples_stratified_clustering_14_0.png

Stratify on the second dimension

[11]:
# Two stratum boundaries, which split the data into three strata. They are
# applied to the second coordinate (y) when the clusterer is fitted, and are
# drawn as horizontal lines in the plots.
horizontal_bounds = np.array([25, 50])

# NOTE: this rebinds `clusterer`, replacing the x-stratified instance created
# earlier; re-running an earlier predict cell after this one would use this
# new instance instead.
clusterer = StratifiedClusters(n_clusters, bin_bounds=horizontal_bounds)
[12]:
# Fit the stratified clusterer on the sample points, stratifying on
# coordinate 1 (the y column).
clusterer.fit(sample_data, coord_to_stratify=1)
[13]:
# One integer cluster label per row of sample_data; these labels are later used
# to index the colour palette.
horizontal_assignments = clusterer.predict(sample_data)
# Bare name as the last expression so the notebook displays the label array.
horizontal_assignments
[13]:
array([0, 3, 4, 1, 1, 4, 2, 2, 5])
[14]:
# Colour each point by its y-stratified cluster label, then mark every stratum
# boundary with a horizontal gray line.
plt.scatter(
    sample_data[:, 0],
    sample_data[:, 1],
    color=colors[horizontal_assignments],
)

for y_boundary in horizontal_bounds:
    plt.axhline(y=y_boundary, color='gray')
../_images/_examples_stratified_clustering_19_0.png

Compare all

[15]:
# Side-by-side comparison of the three clusterings on one row of panels:
# plain k-means, stratified in x, and stratified in y.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(18, 4))
kmeans_ax, x_strata_ax, y_strata_ax = axs

# The same point coordinates are drawn in every panel; only the colouring differs.
x_coords = sample_data[:, 0]
y_coords = sample_data[:, 1]

# Panel 1: standard k-means labels.
kmeans_ax.scatter(x_coords, y_coords, color=colors[cluster_assignments])
kmeans_ax.set_title("Standard K-means")

# Panel 2: x-stratified labels, with the stratum boundaries as vertical lines.
x_strata_ax.scatter(x_coords, y_coords, color=colors[vertical_assignments])
x_strata_ax.set_title("Stratifying in $x$")
for x_boundary in vertical_bounds:
    x_strata_ax.axvline(x=x_boundary, color='gray')

# Panel 3: y-stratified labels, with the stratum boundaries as horizontal lines.
y_strata_ax.scatter(x_coords, y_coords, color=colors[horizontal_assignments])
y_strata_ax.set_title("Stratifying in $y$")
for y_boundary in horizontal_bounds:
    y_strata_ax.axhline(y=y_boundary, color='gray')
../_images/_examples_stratified_clustering_21_0.png
[ ]: