Download No description has been provided for this image

תרגול 6 - דוגמא מעשית

Setup

In [ ]:
## Importing packages
import os # A build in package for interacting with the OS. For example to create a folder.
import numpy as np  # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd  # A package for working with data frames
import matplotlib.pyplot as plt  # A plotting package
import imageio  # A package to read and write image (is used here to save gif images)
import tabulate  # A package from pretty printing tables
from graphviz import Digraph  # A package for plothing graphs (of nodes and edges)

## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use ""%matplotlib notebook" instead
%matplotlib inline

## Setting some nice matplotlib defaults
plt.rcParams['figure.figsize'] = (4.5, 4.5)  # Set default plot's sizes
plt.rcParams['figure.dpi'] = 120  # Set default plot's dpi (increase fonts' size)
plt.rcParams['axes.grid'] = True  # Show grid by default in figures

## Auxiliary function for prining equations, pandas tables and images in cells output
from IPython.core.display import display, HTML, Latex, Markdown

## Create output folder
if not os.path.isdir('./output'):
    os.mkdir('./output')

Data Inspection

In [ ]:
data_file = 'https://technion046195.netlify.app/datasets/nyc_taxi_rides.csv'

## Loading the data
dataset = pd.read_csv(data_file)

## Print the number of rows in the data set
number_of_rows = len(dataset)
display(Markdown(f'Number of rows in the dataset: $N={number_of_rows}$'))

## Show the first 10 rows
dataset.head(10)

Number of rows in the dataset: $N=100000$

Out[ ]:
passenger_count trip_distance payment_type fare_amount tip_amount pickup_easting pickup_northing dropoff_easting dropoff_northing duration day_of_week day_of_month time_of_day
0 2 2.768065 2 9.5 0.00 586.996941 4512.979705 588.155118 4515.180889 11.516667 3 13 12.801944
1 1 3.218680 2 10.0 0.00 587.151523 4512.923924 584.850489 4512.632082 12.666667 6 16 20.961389
2 1 2.574944 1 7.0 2.49 587.005357 4513.359700 585.434188 4513.174964 5.516667 0 31 20.412778
3 1 0.965604 1 7.5 1.65 586.648975 4511.729212 586.671530 4512.554065 9.883333 1 25 13.031389
4 1 2.462290 1 7.5 1.66 586.967178 4511.894301 585.262474 4511.755477 8.683333 2 5 7.703333
5 5 1.561060 1 7.5 2.20 585.926415 4512.880385 585.168973 4511.540103 9.433333 3 20 20.667222
6 1 2.574944 1 8.0 1.00 586.731409 4515.084445 588.710175 4514.209184 7.950000 5 8 23.841944
7 1 0.804670 2 5.0 0.00 585.344614 4509.712541 585.843967 4509.545089 4.950000 5 29 15.831389
8 1 3.653202 1 10.0 1.10 585.422062 4509.477536 583.671081 4507.735573 11.066667 5 8 2.098333
9 6 1.625433 1 5.5 1.36 587.875433 4514.931073 587.701248 4513.709691 4.216667 3 13 21.783056
In [ ]:
x = dataset['duration'].values

Train-test split

In [ ]:
n_samples = len(x)

## Generate a random generator with a fixed seed
rand_gen = np.random.RandomState(0)

## Generating a vector of indices
indices = np.arange(n_samples)

## Shuffle the indices
rand_gen.shuffle(indices)

## Split the indices into 80% train / 20% test
n_samples_train = int(n_samples * 0.8)
train_indices = indices[:n_samples_train]
test_indices = indices[n_samples_train:]

x_train = x[train_indices]
x_test = x[test_indices]
In [ ]:
x_grid = np.arange(-10, 60 + 0.1, 0.1)

Attempt 1 : Normal Distribution

Calculating models parameters:

$$ \mu=\displaystyle{\frac{1}{N}\sum_i x_i} \\ \sigma=\sqrt{\displaystyle{\frac{1}{N}\sum_i\left(x_i-\mu\right)^2}} \\ $$

In [ ]:
# Normal distribution parameters
mu = np.sum(x) / len(x)
sigma = np.sqrt(np.sum((x - mu) ** 2) / len(x))

display(Latex('$\\mu = {:.01f}\\ \\text{{min}}$'.format(mu)))
display(Latex('$\\sigma = {:.01f}\\ \\text{{min}}$'.format(sigma)))
$\mu = 11.4\ \text{min}$
$\sigma = 7.0\ \text{min}$

From here on we will use np.mean and np.std functions to calculate the mean and standard deviation.

In addition scipy.stats has a wide range of distribution models. Each model comes with a set of methods for calculating the CDF, PDF, performing MLE fit, generate samples and more.

In [ ]:
## Import the normal distribution model from SciPy
from scipy.stats import norm

## Define the normal distribution object
norm_dist = norm(mu, sigma)

## Calculate the normal distribution PDF over the grid
norm_pdf = norm_dist.pdf(x_grid)

## Prepare the figure
fig, ax = plt.subplots()
ax.hist(dataset['duration'].values, bins=300 ,density=True, label='Histogram')
ax.plot(x_grid, norm_pdf, label='Normal')
ax.set_title('Distribution of Durations')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend();
fig.savefig('./output/nyc_duration_normal.png')
No description has been provided for this image

Attempt 2 : Rayleigh Distribution

Calculating models parameters:

$$ \Leftrightarrow \sigma = \sqrt{\frac{1}{2N}\sum_i x^2} $$

In [ ]:
## Import the normal distribution model from SciPy
from scipy.stats import rayleigh

## Find the model's parameters using SciPy
_, sigma = rayleigh.fit(x, floc=0)  ## equivalent to running: sigma = np.sqrt(np.sum(x ** 2) / len(x) / 2)

display(Latex('$\\sigma = {:.01f}$'.format(sigma)))

## Define the Rayleigh distribution object
rayleigh_dist = rayleigh(0, sigma)

## Calculate the Rayleigh distribution PDF over the grid
rayleigh_pdf = rayleigh_dist.pdf(x_grid)

## Prepare the figure
fig, ax = plt.subplots()
ax.hist(dataset['duration'].values, bins=300 ,density=True, label='Histogram')
ax.plot(x_grid, norm_pdf, label='Normal')
ax.plot(x_grid, rayleigh_pdf, label='Rayleigh')
ax.set_title('Distribution of Durations')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend();
fig.savefig('./output/nyc_duration_rayleigh.png')
$\sigma = 9.5$
No description has been provided for this image

Attempt 3 : Generalized Gamma Distribution

Numerical solution

In [ ]:
## Import the normal distribution model from SciPy
from scipy.stats import gengamma

## Find the model's parameters using SciPy
a, c, _, sigma  = gengamma.fit(x, floc=0)

display(Latex('$a = {:.01f}$'.format(a)))
display(Latex('$c = {:.01f}$'.format(c)))
display(Latex('$\\sigma = {:.01f}$'.format(sigma)))

## Define the generalized gamma distribution object
gengamma_dist = gengamma(a, c, 0, sigma)

## Calculate the generalized gamma distribution PDF over the grid
gengamma_pdf = gengamma_dist.pdf(x_grid)

## Prepare the figure
fig, ax = plt.subplots()
ax.hist(dataset['duration'].values, bins=300 ,density=True, label='Histogram')
ax.plot(x_grid, norm_pdf, label='Normal')
ax.plot(x_grid, rayleigh_pdf, label='Rayleigh')
ax.plot(x_grid, gengamma_pdf, label='Generalized Gamma')
ax.set_title('Distribution of Durations')
ax.set_ylabel('PDF')
ax.set_xlabel('Duration [min]')
ax.legend();
fig.savefig('./output/nyc_duration_generalized_gamma.png')
$a = 4.4$
$c = 0.8$
$\sigma = 1.6$
No description has been provided for this image
In [ ]: