Download No description has been provided for this image

הרצאה 12 - PCA and K-Means

Setup

In [ ]:
## Importing packages
import os # A build in package for interacting with the OS. For example to create a folder.
import numpy as np  # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd  # A package for working with data frames
import matplotlib.pyplot as plt  # A plotting package
import imageio  # A package to read and write image (is used here to save gif images)
import tabulate  # A package from pretty printing tables
from graphviz import Digraph  # A package for plothing graphs (of nodes and edges)

## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use ""%matplotlib notebook" instead
%matplotlib inline

## Setting some nice matplotlib defaults
plt.rcParams['figure.figsize'] = (4.5, 4.5)  # Set default plot's sizes
plt.rcParams['figure.dpi'] = 120  # Set default plot's dpi (increase fonts' size)
plt.rcParams['axes.grid'] = True  # Show grid by default in figures

## Auxiliary function for prining equations, pandas tables and images in cells output
from IPython.core.display import display, HTML, Latex, Markdown

## Create output folder
if not os.path.isdir('./output'):
    os.mkdir('./output')

4 Gaussians

In [ ]:
centers = np.array([[3, 3], [3, -3], [-3, 3], [-3, -3]])
std = 1

n_points = 100

rand_gen = np.random.RandomState(0)

x = (rand_gen.randn(centers.shape[0], n_points, 2) * std + centers[:, None, :]).reshape(-1, 2)
In [ ]:
## Prepare figure and plotting counters
fig, ax = plt.subplots(figsize=(5, 5))
raw_points = ax.plot(x[:, 0], x[:, 1], 'o', fillstyle='none')[0]
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.axis('equal')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_title('Data Points')

fig.savefig('./output/gaussians_data.png', dpi=240)
No description has been provided for this image
In [ ]:
from scipy.spatial import distance  # A function for efficiently calculating all the distances between points in two lists of points.
from scipy.spatial import Voronoi, voronoi_plot_2d  # Functions for plotting the Voronoi cells

## Set K
k = 4

n_samples = len(x)

## Create a random generator using a fixed seed (we fix the seed for reproducible results)
rand_gen = np.random.RandomState(9)

## Initialize the means using k random points from the dataset
means = x[rand_gen.randint(low=0, high=n_samples, size=k)]
assignment = np.zeros(n_samples, dtype=int)

## Prepare figure
raw_points.remove()
colors = plt.rcParams['axes.prop_cycle'].by_key()['color'][:4]
clusters_points = [ax.plot([], [], 
                           'o',
                           fillstyle='none',
                           color=colors[i_cluster],
                           zorder=1,
                           )[0] for i_cluster in range(k)] 
centers_points = [ax.plot(means[i_cluster, 0], means[i_cluster, 1],
                          'o',
                          markersize=10,
                          color=colors[i_cluster],
                          mec='black',
                          zorder=2,
                          )[0] for i_cluster in range(k)]
arrows = [None] * 4

## Plot initial Voronoi cells
vor = Voronoi(np.concatenate([means, [[1e3, 1e3], [1e3, -1e3], [-1e3, 1e3], [-1e3, -1e3]]], axis=0))
voronoi_plot_2d(ax=ax, vor=vor, show_points=False, show_vertices=False, line_width=1, line_alpha=0.3)
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)

i_step = 0
while True:
    i_step += 1
    assignment_old = assignment

    ## Step 1: Assign points to means
    distances = distance.cdist(x, means, 'euclidean')
    assignment = np.argmin(distances, axis=1)

    ## Stop criteria
    if (assignment == assignment_old).all():
        break

    ## Plot clusters
    ax.set_title('Step {} - Updating clusters'.format(i_step))
    for i_cluster in range(k):
        cluster_indices = assignment == i_cluster
        clusters_points[i_cluster].set_data(x[cluster_indices, 0], x[cluster_indices, 1])
        if arrows[i_cluster] is not None:
            arrows[i_cluster].remove()
            arrows[i_cluster] = None
    fig.canvas.draw()
    fig.savefig(f'./output/gaussians_step{i_step}a.png', dpi=240)
#     time.sleep(1)
    
    ## Step 2: Update means
    old_means = means.copy()  ## needed just for plotting
    for i_cluster in range(k):
        cluster_indices = assignment == i_cluster
        means[i_cluster] = x[cluster_indices].mean(axis=0)

    ## Plot means
    ax.set_title('Step {} - Updating centers'.format(i_step))
    for i_cluster in range(k):
        cluster_indices = assignment == i_cluster
        centers_points[i_cluster].set_data(means[i_cluster, 0], means[i_cluster, 1])
        if (old_means[i_cluster]  != means[i_cluster]).any():
            arrows[i_cluster] = ax.arrow(old_means[i_cluster, 0], old_means[i_cluster, 1],
                                         means[i_cluster, 0] - old_means[i_cluster, 0],
                                         means[i_cluster, 1] - old_means[i_cluster, 1],
                                         head_width=0.2,
                                         head_length=0.2,
                                         color='black',
                                         length_includes_head=True,
                                         zorder=3,
                                         )

    ## Update Voronoi cells on plot
    while(len(ax.collections)):
        ax.collections[-1].remove()
    vor = Voronoi(np.concatenate([means, [[1e3, 1e3], [1e3, -1e3], [-1e3, 1e3], [-1e3, -1e3]]], axis=0))
    voronoi_plot_2d(ax=ax, vor=vor, show_points=False, show_vertices=False, line_width=1, line_alpha=0.3)
    ax.set_xlim(-6, 6)
    ax.set_ylim(-6, 6)
    fig.canvas.draw()
#     time.sleep(1)
    fig.savefig(f'./output/gaussians_step{i_step}b.png', dpi=240)
In [ ]:
## Save plot of clusters only
ax.set_title('Clustered data Points')
for i_cluster in range(k):
    if arrows[i_cluster] is not None:
        arrows[i_cluster].remove()
        arrows[i_cluster] = None
for point in centers_points:
    point.remove()
while(len(ax.collections)):
    ax.collections[-1].remove()
fig.canvas.draw()
fig.savefig('./output/gaussians_clusters.png', dpi=240)

Results for different K's

In [ ]:
rand_gen = np.random.RandomState(0)

for k in [2, 3, 4, 10]:
    ## Initialize the means using k random points from the dataset
    means = x[rand_gen.randint(low=0, high=n_samples, size=k)]
    assignment = np.zeros(n_samples, dtype=int)

    i_step = 0
    while True:
        i_step += 1
        assignment_old = assignment

        ## Step 1: Assign points to means
        distances = distance.cdist(x, means, 'euclidean')
        assignment = np.argmin(distances, axis=1)

        ## Stop criteria
        if (assignment == assignment_old).all():
            break

        ## Step 2: Update means
        old_means = means.copy()  ## needed just for plotting
        for i_cluster in range(k):
            cluster_indices = assignment == i_cluster
            means[i_cluster] = x[cluster_indices].mean(axis=0)

    ## Plot results
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.axis('equal')
    ax.set_xlabel('$x_1$')
    ax.set_ylabel('$x_2$')
    ax.set_title('K={} clusters'.format(k))

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color'][:k]
    for i_cluster in range(k):
        cluster_indices = assignment == i_cluster
        ax.plot(x[cluster_indices, 0], x[cluster_indices, 1],
            'o',
            fillstyle='none',
            color=colors[i_cluster],
            zorder=1,
            )
        ax.plot(means[i_cluster, 0], means[i_cluster, 1],
            'o',
            markersize=10,
            color=colors[i_cluster],
            mec='black',
            zorder=2,
            )

    vor = Voronoi(np.concatenate([means, [[1e3, 1e3], [1e3, -1e3], [-1e3, 1e3], [-1e3, -1e3]]], axis=0))
    voronoi_plot_2d(ax=ax, vor=vor, show_points=False, show_vertices=False, line_width=1, line_alpha=0.3)
    ax.set_xlim(-6, 6)
    ax.set_ylim(-6, 6)
    fig.savefig(f'./output/gaussians_{k}_clusters.png', dpi=240)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# rand_gen = np.random.RandomState(1)

# k_vec = np.arange(1, 400)
# err_vec = np.zeros(k_vec.shape)

# for i_k, k in enumerate(k_vec):
#     ## Initialize the means using k random points from the dataset
#     means = x[rand_gen.randint(low=0, high=n_samples, size=k)]
#     assignment = np.zeros(n_samples, dtype=int)

#     i_step = 0
#     while True:
#         i_step += 1
#         assignment_old = assignment

#         ## Step 1: Assign points to means
#         distances = distance.cdist(x, means, 'euclidean')
#         assignment = np.argmin(distances, axis=1)

#         ## Stop criteria
#         if (assignment == assignment_old).all():
#             break

#         ## Step 2: Update means
#         old_means = means.copy()  ## needed just for plotting
#         for i_cluster in range(k):
#             cluster_indices = assignment == i_cluster
#             if np.any(cluster_indices):
#                 means[i_cluster] = x[cluster_indices].mean(axis=0)
    
#     err_vec[i_k] = np.mean(((x - means[assignment]) ** 2).sum(axis=1)) ** 0.5
In [ ]:
# ## Plot
# fig, ax = plt.subplots(figsize=(5, 5))
# ax.set_xlabel('$K$')
# ax.set_ylabel('$E\\left(K\\right)$')
# ax.set_title('$E\\left(K\\right)$ vs. $K$')

# ax.plot(k_vec, err_vec)

# fig.savefig('../media/ek_vs_k.png'.format(k))

# ax.set_xlim(1, 9)
# ax.set_ylim(0, 7)

# fig.savefig('./output/ek_vs_k_zoom.png', dpi=240)
In [ ]:
# err_vec_rel = (err_vec[:-1] - err_vec[1:]) / err_vec[:-1]

# ## Plot
# fig, ax = plt.subplots(figsize=(5, 5))
# ax.set_title('$\\frac{-\\Delta E\\left(K\\right)}{E\\left(K\\right)}$ vs. $K$')
# ax.set_xlabel('$K$')
# ax.set_ylabel('$\\frac{-\\Delta E\\left(K\\right)}{E\\left(K\\right)}$')
# plt.tight_layout()

# ax.plot(k_vec[:-1], err_vec_rel)

# # fig.savefig('../media/ek_rel_vs_k.png'.format(k))

# ax.set_xlim(1, 9)
# ax.set_ylim(0, 0.7)

# fig.savefig('./output/ek_rel_vs_k_zoom.png', dpi=240)