תרגול 2 - דוגמא מעשית
Setup¶
In [ ]:
## Importing packages
import os # A built-in package for interacting with the OS. For example to create a folder.
import numpy as np # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd # A package for working with data frames
import matplotlib.pyplot as plt # A plotting package
import imageio # A package to read and write images (is used here to save gif images)
## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use ""%matplotlib notebook" instead
%matplotlib inline
## Setting some nice matplotlib defaults
plt.rcParams['figure.figsize'] = (4.5, 4.5) # Set default plot's sizes
plt.rcParams['figure.dpi'] = 120 # Set default plot's dpi (increase fonts' size)
plt.rcParams['axes.grid'] = True # Show grid by default in figures
## Auxiliary functions for printing equations, pandas tables and images in cells output
## (imported from the public IPython.display module; the IPython.core.display path is deprecated for these names)
from IPython.display import display, HTML, Latex, Markdown
## Create the output folder (no error if it already exists)
os.makedirs('./output', exist_ok=True)
Data Inspection¶
In [ ]:
## Location of the dataset's CSV file
data_file = 'https://technion046195.netlify.app/datasets/nyc_taxi_rides.csv'
## Read the CSV file into a data frame
dataset = pd.read_csv(data_file)
## Report how many rows were loaded
number_of_rows = dataset.shape[0]
display(Markdown(f'Number of rows in the dataset: $N={number_of_rows}$'))
## Preview the first 10 rows (rendered as the cell's output)
dataset.head(10)
Out[ ]:
In [ ]:
import skimage.io # Another package to handle images
## Load an image of the streets of NY
ny_map_image = skimage.io.imread('https://technion046195.netlify.app/assets/nyc_map.png')
## The bounding box of the map image; it is passed to imshow as extent=[left, right, bottom, top].
## NOTE(review): the values look like easting/northing coordinates (the same axes as the
## pickup_easting / pickup_northing columns plotted over the map), i.e. [West East South North],
## rather than longitude/latitude in degrees - confirm.
bbox = [582.5, 592.5, 4505.5, 4515.5]
In [ ]:
## Create a figure holding a single axes
fig, ax = plt.subplots()
ax.grid(True)
## Draw the map as a semi-transparent gray-scale background
ax.imshow(ny_map_image, extent=bbox, cmap='gray', alpha=0.7)
## Mark every pickup location with a small dot (markers only, no connecting line)
ax.plot(
    dataset['pickup_easting'],
    dataset['pickup_northing'],
    marker='.',
    linestyle='none',
    markersize=1,
)
fig.savefig('./output/pickups.png')
In [ ]:
## Print summary statistics of the duration column
print(dataset['duration'].describe())
## Normalized histogram of the durations, using about sqrt(N) bins
fig, ax = plt.subplots(figsize=(4.5, 3))
dataset['duration'].hist(
    ax=ax,
    density=True,
    bins=int(np.sqrt(dataset.shape[0])),
)
ax.set(title="Histogram of Durations", xlabel='Durations', ylabel='PDF')
fig.savefig('./output/duration_hist.png')
In [ ]:
## Pull the labels (y) and the raw measurements (x) out of the data frame as NumPy arrays
y = dataset['duration'].to_numpy()
x = dataset[[
    'pickup_easting',
    'pickup_northing',
    'dropoff_easting',
    'dropoff_northing',
]].to_numpy()
In [ ]:
## Defining augmentation
def aug_func(x: np.ndarray) -> np.ndarray:
    """Map raw coordinates to a single feature: the straight-line distance of the ride.

    Args:
        x: 2D array with at least 4 columns. Columns (0, 1) and (2, 3) are
            treated as two planar points (in this notebook: the pickup and
            dropoff easting/northing).

    Returns:
        Array of shape (N, 1) holding the Euclidean distance between the two
        points of every row.
    """
    delta_east = x[:, 0] - x[:, 2]
    delta_north = x[:, 1] - x[:, 3]
    distance = (delta_east ** 2 + delta_north ** 2) ** 0.5
    # Add a trailing axis so the result is an (N, 1) matrix rather than a 1D vector
    return distance[:, None]
## Augment the dataset
x_aug = aug_func(x)
## Calculating theta as the least-squares solution of x_aug @ theta = y.
## For a full column rank x_aug this is the same minimizer as the normal equations
## (x_aug.T @ x_aug)^-1 @ x_aug.T @ y, but it is computed without forming an explicit
## matrix inverse, which is numerically more stable.
theta = np.linalg.lstsq(x_aug, y, rcond=None)[0]
## Printing the resulting parameters
display(Markdown(r'$\boldsymbol{\theta}^*_{\mathcal{D}}=$' + f'{theta}'))

## Defining the predictor
def h(x):
    """Predict the label of every row of x using the current aug_func and theta."""
    return aug_func(x) @ theta

## Calculate the empirical risk (the mean squared error over the dataset)
risk = ((h(x) - y) ** 2).mean()
display(Markdown(f'The empirical risk is: {risk}'))
## Plotting
fig, ax = plt.subplots(figsize=(5, 5))
## Scatter the first feature of x_aug against the labels
ax.scatter(x_aug[:, 0], y, marker='.', s=1, alpha=0.3, label='data')
ax.set_xlabel('Aerial distance [Km]')
ax.set_ylabel('Duration [Min]')
ax.set_xlim(0, 12)
ax.set_ylim(0, 60)
fig.tight_layout()
## Save the data-only figure before the prediction line is added
fig.savefig('./output/duration_vs_dist.png')
## Overlay the fitted line theta[0] * distance over the displayed x-range and save again
dist_grid = np.arange(0, 12, 0.1)
ax.plot(dist_grid, dist_grid * theta[0], 'k', label='Prediction')
ax.legend(loc='upper left')
fig.savefig('./output/duration_vs_dist_pred.png')
In [ ]:
## Defining augmentation
def aug_func(x: np.ndarray) -> np.ndarray:
    """Map raw coordinates to distance plus second-order features of the first point.

    Args:
        x: 2D array with at least 4 columns. Columns (0, 1) and (2, 3) are
            treated as two planar points (in this notebook: the pickup and
            dropoff easting/northing).

    Returns:
        Array of shape (N, 7) whose columns are: the Euclidean distance
        between the two points, a constant 1, column 0, column 1, their
        product, column 0 squared and column 1 squared.
    """
    pickup_east = x[:, 0]
    pickup_north = x[:, 1]
    distance = ((pickup_east - x[:, 2]) ** 2 + (pickup_north - x[:, 3]) ** 2) ** 0.5
    features = (
        distance,
        np.ones(x.shape[0]),  # Constant (bias) term
        pickup_east,
        pickup_north,
        pickup_east * pickup_north,
        pickup_east ** 2,
        pickup_north ** 2,
    )
    return np.stack(features, axis=1)
## Augment the dataset
x_aug = aug_func(x)
## Calculating theta as the least-squares solution of x_aug @ theta = y.
## For a full column rank x_aug this is the same minimizer as the normal equations
## (x_aug.T @ x_aug)^-1 @ x_aug.T @ y, but it is computed without forming an explicit
## matrix inverse. This matters when the columns of x_aug have very different scales
## (e.g. raw coordinates next to their squares), which makes x_aug.T @ x_aug ill-conditioned.
theta = np.linalg.lstsq(x_aug, y, rcond=None)[0]
## Printing the resulting parameters
display(Markdown(r'$\boldsymbol{\theta}^*_{\mathcal{D}}=$' + f'{theta}'))

## Defining the predictor
def h(x):
    """Predict the label of every row of x using the current aug_func and theta."""
    return aug_func(x) @ theta

## Calculate the empirical risk (the mean squared error over the dataset)
risk = ((h(x) - y) ** 2).mean()
display(Markdown(f'The empirical risk is: {risk}'))
In [ ]: