תרגול 5 - דוגמא מעשית
Setup¶
In [ ]:
## Importing packages
import os # A build in package for interacting with the OS. For example to create a folder.
import numpy as np # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd # A package for working with data frames
import matplotlib.pyplot as plt # A plotting package
import imageio # A package to read and write image (is used here to save gif images)
import tabulate # A package from pretty printing tables
from graphviz import Digraph # A package for plothing graphs (of nodes and edges)
## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use ""%matplotlib notebook" instead
%matplotlib inline
## Setting some nice matplotlib defaults
plt.rcParams['figure.figsize'] = (4.5, 4.5) # Set default plot's sizes
plt.rcParams['figure.dpi'] = 120 # Set default plot's dpi (increase fonts' size)
plt.rcParams['axes.grid'] = True # Show grid by default in figures
## Auxiliary function for prining equations, pandas tables and images in cells output
from IPython.core.display import display, HTML, Latex, Markdown
## Create output folder
if not os.path.isdir('./output'):
os.mkdir('./output')
Data Inspection¶
In [ ]:
data_file = 'https://technion046195.netlify.app/datasets/nyc_taxi_rides.csv'
## Loading the data
dataset = pd.read_csv(data_file)
## Print the number of rows in the data set
number_of_rows = len(dataset)
display(Markdown(f'Number of rows in the dataset: $N={number_of_rows}$'))
## Show the first 10 rows
dataset.head(10)
Out[ ]:
In [ ]:
x = dataset['duration'].values
Train-test split¶
In [ ]:
n_samples = len(x)
## Generate a random generator with a fixed seed
rand_gen = np.random.RandomState(0)
## Generating a vector of indices
indices = np.arange(n_samples)
## Shuffle the indices
rand_gen.shuffle(indices)
## Split the indices into 80% train / 20% test
n_samples_train = int(n_samples * 0.8)
train_indices = indices[:n_samples_train]
test_indices = indices[n_samples_train:]
x_train = x[train_indices]
x_test = x[test_indices]
In [ ]:
x_grid = np.arange(0, x.max() + 0.001, 0.001)
ECDF¶
In [ ]:
def calculate_ecdf(x, x_grid):
## Initialize a an array for the calculated ECDF
ecdf = np.zeros(x_grid.shape)
## Calculate the ECDF by definition
for i, duration in enumerate(x_grid): # enumerate adds an index to the list's values
ecdf[i] = (x <= duration).mean()
return ecdf
## Create the ECDF
ecdf = calculate_ecdf(x_train, x_grid)
# plot the ECDF
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(x_grid, ecdf)
ax.set_title('ECDF')
ax.set_ylabel('CDF')
ax.set_xlabel('Duration [min]')
fig.savefig('./output/nyc_duration_ecdf.png')
## Plot a zoomed in version
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(x_grid, ecdf)
ax.set_title('ECDF - Zoom in')
ax.set_ylabel('CDF')
ax.set_xlabel('Duration [min]')
ax.set_xlim(19.5, 20.5)
ax.set_ylim(0.88, 0.9);
fig.savefig('./output/nyc_duration_ecdf_zoom.png')
Smaller datasets¶
In [ ]:
## Create the figure and axis
fig, ax = plt.subplots(figsize=(5, 4))
ax.set_title('ECDF - Smaller datasets')
ax.set_ylabel('CDF')
ax.set_xlabel('Duration [min]')
## Generate a random generator for the sampling
rand_gen = np.random.RandomState(0)
for n_samples in [10, 100, 1000, len(x_train)]:
x_smaller = rand_gen.choice(x_train, n_samples, replace=False)
## Create the ECDF
ecdf = calculate_ecdf(x_smaller, x_grid)
## Plot the ECDF
ax.plot(x_grid, ecdf, label=str(n_samples))
## Add a legend
ax.legend();
fig.savefig('./output/nyc_duration_ecdf_smaller.png')
Histogram¶
In [ ]:
def calculate_hist(x, n_bins):
## Define bins
bins_edges = np.linspace(0, x.max() + 1e-9, n_bins + 1)
bins_width = np.diff(bins_edges) ## Equal to running: bins[1:] - bins[:-1]
## Calculate the histogram
hist_values = np.zeros(n_bins)
for k in range(n_bins):
hist_values[k] = \
np.sum((x >= bins_edges[k]) &
(x < bins_edges[k + 1])
) / n_samples / bins_width[k]
return hist_values, bins_edges, bins_width
## Prepare the figure
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
## Loop over the number of bins
for i, n_bins in enumerate((30, 300, 3000)):
## Calculate the histogram
hist_values, bins_edges, bins_width = calculate_hist(x_train, n_bins)
## Plot the histogram
ax = axes[i]
ax.bar(bins_edges[:-1], hist_values, width=bins_width, align='edge');
ax.set_title('# Bins = {}'.format(n_bins))
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.set_ylim(0, 0.1)
fig.savefig('./output/nyc_duration_histogram.png')
In [ ]:
## Generate a random generator for the shuffling
rand_gen = np.random.RandomState(0)
## Shuffle the array (the array is copied in order to preserve the order of the original array)
x_shuffled = x_train.copy()
rand_gen.shuffle(x_shuffled)
## Split the shuffled array
x_splited = np.split(x_shuffled, 8)
## Prepare the figure
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
## Loop over the number of bins
for i, n_bins in enumerate([30, 300, 3000]):
## Define bins
bins_edges = np.linspace(0, x.max() + 1e-9, n_bins + 1)
## Loop over the subsets
ax = axes[i]
for x_subset in x_splited:
## Plot histogram
ax.hist(x_subset, bins=bins_edges, density=True, histtype='step', fill=False);
ax.set_title('# Bins = {}'.format(n_bins))
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.set_ylim(0, 0.1)
fig.suptitle('Histograms Variability');
plt.tight_layout(rect=(0, 0, 1, 0.95))
fig.savefig('./output/nyc_duration_histogram_variance.png')
In [ ]:
## Prepare the figure
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
## Loop over the number of bins
for i, n_bins in enumerate([30, 300, 3000]):
## Define bins
bins_edges = np.linspace(0, x.max() + 1e-9, n_bins + 1)
## Initialize an array to store the histograms values
histograms = np.zeros((len(x_splited), len(bins_edges) - 1))
## Loop over the subsets
for i_subset, x_subset in enumerate(x_splited):
## Calculate the histograms
histograms[i_subset] = np.histogram(x_subset, bins=bins_edges, density=True)[0]
## Remove the mean value
histograms_error = histograms - histograms.mean(axis=0)
## Loop over the subsets
ax = axes[i]
ax.plot(bins_edges[:-1], histograms_error.T, '.')
ax.set_title('# Bins = {}'.format(n_bins))
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.set_ylim(-0.06, 0.06)
fig.suptitle('Histograms Variability');
plt.tight_layout(rect=(0, 0, 1, 0.95))
fig.savefig('./output/nyc_duration_histogram_variance_no_mean.png')
KDE¶
In [ ]:
## Calculate the suggested bandwidth
suggested_bandwidth = 1.06 * np.std(x_train) / n_samples ** (1 / 5.)
display(Markdown(r'The rule of thumb suggests a bandwidth of $1.06\ \text{std}(x) N^{-\frac{1}{5}}\approx$'
+ f'{suggested_bandwidth:.3f}'))
def calculate_gaussian_kde(x, bandwidth, x_grid):
## Calculate a matrix of (x_grid_j - x_i)^2
delta_squared = (x_grid[:, None] - x[None, :]) ** 2
## Calculate the KDE
kde = (1 / np.sqrt(2 * np.pi * bandwidth ** 2) * np.exp(- delta_squared / 2 / bandwidth ** 2)).mean(axis=1)
return kde
## Define the grid
x_grid = np.arange(0, x.max() + 0.01, 0.01)
## Prepare the figure
fig, ax = plt.subplots(figsize=(5, 4))
ax.hist(x_train, bins=300 ,density=True, label='Histogram')
## Loop over the bandwidth
for i_bandwidth, bandwidth in enumerate([0.08, 0.8, 8]):
## Calculate KDE
kde = calculate_gaussian_kde(x_train, bandwidth, x_grid)
## Plot histogram
ax.plot(x_grid, kde, linewidth=2, label=str(bandwidth))
ax.set_title('KDE')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend()
ax.set_ylim(0, 0.1)
fig.savefig('./output/nyc_duration_kde.png')
Work hour prediction¶
In [ ]:
y = (dataset['time_of_day'].values > 7) & (dataset['time_of_day'].values < 18)
y_train = y[train_indices]
y_test = y[test_indices]
Estimating $\hat{p}_{\text{y},\mathcal{D}}(y)$¶
In [ ]:
## Calculate the probability p_Y(1)
prob_y1 = y_train.mean()
display(Markdown(r'$\hat{p}_{\text{y},\mathcal{D}}(1)=$' + f'{prob_y1}'))
## Calculate the probability p_Y(0)
prob_y0 = (~y_train).mean()
display(Markdown(r'$\hat{p}_{\text{y},\mathcal{D}}(0)=$' + f'{prob_y0}'))
Blind prediction¶
In [ ]:
predictions = 1
test_score = (y_test != predictions).mean()
display(Markdown(f'The test score is: {test_score}'))
Estimating $\hat{p}_{\text{x}|\text{y},\mathcal{D}}(x|y)$¶
In [ ]:
## Import the KDE class from scikit-learn
from sklearn.neighbors import KernelDensity
## Define the grid
x_grid = np.arange(0, x.max() + 0.01, 0.01)
## Initialize a KDE estimation object
kde_func = KernelDensity(kernel='gaussian', bandwidth=0.8)
## Y=1
## Update the KDE object with the data for Y=1
kde_func.fit(x_train[y_train, None])
## Calculate the desity over the grid
kde_y1 = np.exp(kde_func.score_samples(x_grid[:, None])) ## score_samples retuns log(p(x))
## Y=0
## Update the KDE object with the data for Y=1
kde_func.fit(x_train[~y_train, None])
## Calculate the desity over the grid
kde_y0 = np.exp(kde_func.score_samples(x_grid[:, None])) ## score_samples retuns log(p(x))
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(x_grid, kde_y1, label=r'$\hat{p}_{x|y,\mathcal{D}}(x|1)$')
ax.plot(x_grid, kde_y0, label=r'$\hat{p}_{x|y,\mathcal{D}}(x|0)$')
ax.set_title('Conditional KDE')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend()
ax.set_ylim(0, 0.1)
fig.savefig('./output/nyc_work_hour_x_given_y.png')
Predictor¶
In [ ]:
## Calculate the prediction over the grid
prediction = kde_y1 * prob_y1 >= kde_y0 * prob_y0
## Plot the prediction
fig, ax = plt.subplots(figsize=(5, 3))
ax.plot(x_grid, prediction)
ax.set_title('Prediction')
ax.set_ylabel('$\hat{y}$')
ax.set_xlabel('Duration [min]')
fig.savefig('./output/nyc_work_hour_predict.png')
In [ ]:
predictions = x_test > 11.4
test_score = (y_test != predictions).mean()
display(Markdown(f'The test score is: {test_score}'))
In [ ]: