תרגול 12 - דוגמא מעשית
Setup¶
In [ ]:
## Importing packages
import os # A build in package for interacting with the OS. For example to create a folder.
import numpy as np # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd # A package for working with data frames
import matplotlib.pyplot as plt # A plotting package
import imageio # A package to read and write image (is used here to save gif images)
import tabulate # A package from pretty printing tables
from graphviz import Digraph # A package for plothing graphs (of nodes and edges)
import tqdm.notebook as tqdm
## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use ""%matplotlib notebook" instead
%matplotlib inline
## Setting some nice matplotlib defaults
plt.rcParams['figure.figsize'] = (4.5, 4.5) # Set default plot's sizes
plt.rcParams['figure.dpi'] = 120 # Set default plot's dpi (increase fonts' size)
plt.rcParams['axes.grid'] = True # Show grid by default in figures
## Auxiliary function for prining equations, pandas tables and images in cells output
from IPython.core.display import display, HTML, Latex, Markdown
## Create output folder
if not os.path.isdir('./output'):
os.mkdir('./output')
Data Inspection¶
In [ ]:
data_file = 'https://technion046195.netlify.app/datasets/nyc_taxi_rides.csv'
## Loading the data
dataset = pd.read_csv(data_file)
## Show the first 10 rows
dataset
Out[ ]:
Plotting Drop Off Points¶
In [ ]:
import skimage.io # Another package to heandle images
## Load and image of the streets of NY
ny_map_image = skimage.io.imread('https://technion046195.netlify.app/assets/nyc_map.png')
## The geografic bounding box of the map data as [West-longtitude East-longtitude South-latitude North-latitude]:
bbox = [582.5, 592.5, 4505.5, 4515.5]
In [ ]:
## Create the figure and axis
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
ax.grid(True)
## Plot the map
ax.imshow(ny_map_image, extent=bbox, cmap='gray', alpha=0.7)
ax.plot(dataset['dropoff_easting'], dataset['dropoff_northing'], '.', markersize=1)
fig.savefig('./output/pickups.png')
Train-Validation-Test split¶
In [ ]:
n_samples = len(dataset)
## Generate a random generator with a fixed seed
rand_gen = np.random.RandomState(1)
## Generating a vector of indices
indices = np.arange(n_samples)
## Shuffle the indices
rand_gen.shuffle(indices)
## Split the indices into 60% train / 20% validation / 20% test
n_samples_train = int(n_samples * 0.6)
n_samples_val = int(n_samples * 0.2)
train_indices = indices[:n_samples_train]
val_indices = indices[n_samples_train:(n_samples_train + n_samples_val)]
test_indices = indices[(n_samples_train + n_samples_val):]
train_set = dataset.iloc[train_indices]
val_set = dataset.iloc[val_indices]
test_set = dataset.iloc[test_indices]
K-Means¶
In [ ]:
from scipy.spatial import distance # A function for efficiently calculating all the distances between points in two lists of points.
## Set K
k = 10
## Define x (the matrix of the drop off locations)
x = train_set[['dropoff_easting', 'dropoff_northing']].values
n_samples = len(x)
## Create a random generator using a fixed seed (we fix the seed for reproducible results)
rand_gen = np.random.RandomState(0)
## Initialize the means using k random points from the dataset
means = x[rand_gen.randint(low=0, high=n_samples, size=k)]
assignment = np.zeros(n_samples, dtype=int)
## Prepare figure and plotting counters
next_axis = 0
fig, axis_list = plt.subplots(3, 3, figsize=(12, 12))
i_step = 0
next_plot_step = 1
stop_iterations = False
while not stop_iterations:
i_step += 1
assignment_old = assignment
## Step 1: Assign points to means
distances = distance.cdist(x, means, 'euclidean')
assignment = np.argmin(distances, axis=1)
## Stop criteria
if (assignment == assignment_old).all():
stop_iterations = True
## Step 2: Update means
for i_cluster in range(k):
cluster_indices = assignment == i_cluster
means[i_cluster] = x[cluster_indices].mean(axis=0)
## Plot clusters and means
if (i_step == next_plot_step) or (stop_iterations):
ax = axis_list.flat[next_axis]
ax.grid(True)
ax.set_title('Step: {}'.format(i_step))
ax.imshow(ny_map_image, extent=bbox, cmap='gray', alpha=0.7)
for i_cluster in range(k):
cluster_indices = assignment == i_cluster
ax.plot(x[cluster_indices, 0], x[cluster_indices, 1], '.', markersize=1)
ax.plot(means[:, 0], means[:, 1], 'xk', markersize=20)[0]
next_plot_step *= 2
next_axis += 1
for i in range(next_axis, len(axis_list.flat)):
axis_list.flat[i].set_visible(False)
print('Number of steps: {}'.format(i_step))
parking_lots_locations = means
fig.savefig('./output/taxis_k_means.png')
Evaluation on train set¶
$$ \hat{R}\left(\left\{\boldsymbol{c}_i\right\}\right)=\frac{1}{N}\sum_{\boldsymbol{x}_j}\min_{\boldsymbol{c}\in\left\{\boldsymbol{c}\right\}}\left\lVert\boldsymbol{x}_j-\boldsymbol{c}\right\rVert $$
In [ ]:
## Calculate distances to all parking lots
distances = distance.cdist(x, parking_lots_locations, 'euclidean')
## Calculate the average of the distances to the colsest parking lot to each point
train_risk = distances.min(axis=1).mean()
print('The average ride distance to the closest parking lots is approximately {:.02f} km'.format(train_risk))
Evaluation on test set¶
In [ ]:
## Define x for the test set
x_val = val_set[['dropoff_easting', 'dropoff_northing']].values
## Calculate distances to all parking lots
distances = distance.cdist(x_val, parking_lots_locations, 'euclidean')
## Calculate the average of the distances to the colsest parking lot to each point
val_risk = distances.min(axis=1).mean()
print('The average ride distance to the closest parking lots is approximatley {:.02f} km'.format(val_risk))
Finding The Optimal Number of Parking Lots¶
The empirical risk function:
$$ \hat{R}\left(\left\{\boldsymbol{c}_i\right\}, K\right)= 10\cdot K+300\cdot\frac{1}{N}\sum_{i=1}^K\sum_{\boldsymbol{x}_j\in S_i}\left\lVert\boldsymbol{x}_j-\boldsymbol{c}_i\right\rVert $$
In [ ]:
## import KMeans from scikit-learn
from sklearn.cluster import KMeans
cost_per_parking = 10
cost_per_avarage_distance = 300
## Define the grid of K's over we will search for our solution
k_grid = np.arange(1, 26, 1)
## Updates the datasets
x = train_set[['dropoff_easting', 'dropoff_northing']].values
x_val = val_set[['dropoff_easting', 'dropoff_northing']].values
x_test = test_set[['dropoff_easting', 'dropoff_northing']].values
## Initialize the list of the average ride distance
average_distance_array = np.zeros(len(k_grid), dtype=float)
## Create a random generator using a fixed seed
rand_gen = np.random.RandomState(0)
best_risk_so_far = np.inf
optimal_k = None
optimal_average_distance = None
optimal_centers = None
for i_k, k in enumerate(tqdm.tqdm(k_grid)):
## Calculate ceneters and clusters
kmeans = KMeans(n_clusters=k, random_state=rand_gen)
kmeans.fit(x)
centers = kmeans.cluster_centers_
## Evaluate the empirical risk
assignments = kmeans.predict(x_val)
distances = np.linalg.norm(x_val - centers[assignments], axis=1)
average_distance = distances.mean()
risk = cost_per_parking * k + cost_per_avarage_distance * average_distance
## If this is the best risk so far save the optimal results
if risk < best_risk_so_far:
best_risk_so_far = risk
optimal_k = k
optimal_average_distance = average_distance
optimal_centers = centers
## Save average distance for the later plot
average_distance_array[i_k] = average_distance
## Plot
fig, ax = plt.subplots()
ax.plot(k_grid, average_distance_array * cost_per_avarage_distance, label='Rides')
ax.plot(k_grid, k_grid * cost_per_parking, label='Parking lots')
ax.plot(k_grid, k_grid * cost_per_parking + average_distance_array * cost_per_avarage_distance, label='Total')
ax.set_title('Cost vs. K')
ax.set_ylabel('Cost [k$]')
ax.set_xlabel('K')
ax.legend()
fig.savefig('./output/taxis_scan_for_k.png')
## Calculate the test results
distances = distance.cdist(x_test, optimal_centers, 'euclidean')
test_average_distance = distances.min(axis=1).mean()
test_risk = cost_per_parking * optimal_k + cost_per_avarage_distance * test_average_distance
print('The optimal number of parking lots is: {}'.format(optimal_k))
print('The optimal average ride distance is: {:.2f} Km'.format(test_average_distance))
print('The train risk is {:.2f} k$'.format(test_risk))
Evaluation on test set¶
In [ ]:
## Calculate distances to all parking lots
distances = distance.cdist(x_test, optimal_centers, 'euclidean')
## Calculate the average distance and risk
test_average_distance = distances.min(axis=1).mean()
test_risk = cost_per_parking * optimal_k + cost_per_avarage_distance * test_average_distance
print('The optimal average ride distance is: {:.2f} Km'.format(test_average_distance))
print('The train risk is {:.2f} k$'.format(test_risk))