תרגול 6 - דוגמא מעשית
Setup¶
In [ ]:
## Importing packages
import os # A build in package for interacting with the OS. For example to create a folder.
import numpy as np # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd # A package for working with data frames
import matplotlib.pyplot as plt # A plotting package
import imageio # A package to read and write image (is used here to save gif images)
import tabulate # A package from pretty printing tables
from graphviz import Digraph # A package for plothing graphs (of nodes and edges)
## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use ""%matplotlib notebook" instead
%matplotlib inline
## Setting some nice matplotlib defaults
plt.rcParams['figure.figsize'] = (4.5, 4.5) # Set default plot's sizes
plt.rcParams['figure.dpi'] = 120 # Set default plot's dpi (increase fonts' size)
plt.rcParams['axes.grid'] = True # Show grid by default in figures
## Auxiliary function for prining equations, pandas tables and images in cells output
from IPython.core.display import display, HTML, Latex, Markdown
## Create output folder
if not os.path.isdir('./output'):
os.mkdir('./output')
Data Inspection¶
In [ ]:
data_file = 'https://technion046195.netlify.app/datasets/nyc_taxi_rides.csv'
## Loading the data
dataset = pd.read_csv(data_file)
## Print the number of rows in the data set
number_of_rows = len(dataset)
display(Markdown(f'Number of rows in the dataset: $N={number_of_rows}$'))
## Show the first 10 rows
dataset.head(10)
Out[ ]:
In [ ]:
x = dataset['duration'].values
Train-test split¶
In [ ]:
n_samples = len(x)
## Generate a random generator with a fixed seed
rand_gen = np.random.RandomState(0)
## Generating a vector of indices
indices = np.arange(n_samples)
## Shuffle the indices
rand_gen.shuffle(indices)
## Split the indices into 80% train / 20% test
n_samples_train = int(n_samples * 0.8)
train_indices = indices[:n_samples_train]
test_indices = indices[n_samples_train:]
x_train = x[train_indices]
x_test = x[test_indices]
In [ ]:
x_grid = np.arange(-10, 60 + 0.1, 0.1)
Attempt 1 : Normal Distribution¶
Calculating models parameters:
$$ \mu=\displaystyle{\frac{1}{N}\sum_i x_i} \\ \sigma=\sqrt{\displaystyle{\frac{1}{N}\sum_i\left(x_i-\mu\right)^2}} \\ $$
In [ ]:
# Normal distribution parameters
mu = np.sum(x) / len(x)
sigma = np.sqrt(np.sum((x - mu) ** 2) / len(x))
display(Latex('$\\mu = {:.01f}\\ \\text{{min}}$'.format(mu)))
display(Latex('$\\sigma = {:.01f}\\ \\text{{min}}$'.format(sigma)))
From here on we will use np.mean and np.std functions to calculate the mean and standard deviation.
In addition scipy.stats has a wide range of distribution models. Each model comes with a set of methods for calculating the CDF, PDF, performing MLE fit, generate samples and more.
In [ ]:
## Import the normal distribution model from SciPy
from scipy.stats import norm
## Define the normal distribution object
norm_dist = norm(mu, sigma)
## Calculate the normal distribution PDF over the grid
norm_pdf = norm_dist.pdf(x_grid)
## Prepare the figure
fig, ax = plt.subplots()
ax.hist(dataset['duration'].values, bins=300 ,density=True, label='Histogram')
ax.plot(x_grid, norm_pdf, label='Normal')
ax.set_title('Distribution of Durations')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend();
fig.savefig('./output/nyc_duration_normal.png')
Attempt 2 : Rayleigh Distribution¶
Calculating models parameters:
$$ \Leftrightarrow \sigma = \sqrt{\frac{1}{2N}\sum_i x^2} $$
In [ ]:
## Import the normal distribution model from SciPy
from scipy.stats import rayleigh
## Find the model's parameters using SciPy
_, sigma = rayleigh.fit(x, floc=0) ## equivalent to running: sigma = np.sqrt(np.sum(x ** 2) / len(x) / 2)
display(Latex('$\\sigma = {:.01f}$'.format(sigma)))
## Define the Rayleigh distribution object
rayleigh_dist = rayleigh(0, sigma)
## Calculate the Rayleigh distribution PDF over the grid
rayleigh_pdf = rayleigh_dist.pdf(x_grid)
## Prepare the figure
fig, ax = plt.subplots()
ax.hist(dataset['duration'].values, bins=300 ,density=True, label='Histogram')
ax.plot(x_grid, norm_pdf, label='Normal')
ax.plot(x_grid, rayleigh_pdf, label='Rayleigh')
ax.set_title('Distribution of Durations')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend();
fig.savefig('./output/nyc_duration_rayleigh.png')
Attempt 3 : Generalized Gamma Distribution¶
Numerical solution
In [ ]:
## Import the normal distribution model from SciPy
from scipy.stats import gengamma
## Find the model's parameters using SciPy
a, c, _, sigma = gengamma.fit(x, floc=0)
display(Latex('$a = {:.01f}$'.format(a)))
display(Latex('$c = {:.01f}$'.format(c)))
display(Latex('$\\sigma = {:.01f}$'.format(sigma)))
## Define the generalized gamma distribution object
gengamma_dist = gengamma(a, c, 0, sigma)
## Calculate the generalized gamma distribution PDF over the grid
gengamma_pdf = gengamma_dist.pdf(x_grid)
## Prepare the figure
fig, ax = plt.subplots()
ax.hist(dataset['duration'].values, bins=300 ,density=True, label='Histogram')
ax.plot(x_grid, norm_pdf, label='Normal')
ax.plot(x_grid, rayleigh_pdf, label='Rayleigh')
ax.plot(x_grid, gengamma_pdf, label='Generalized Gamma')
ax.set_title('Distribution of Durations')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend();
fig.savefig('./output/nyc_duration_generalized_gamma.png')
In [ ]: