תרגול 10 - דוגמא מעשית
Setup¶
In [ ]:
## Importing packages
import os # A built-in package for interacting with the OS. For example to create a folder.
import numpy as np # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd # A package for working with data frames
import matplotlib.pyplot as plt # A plotting package
import imageio # A package to read and write image (is used here to save gif images)
import tabulate # A package for pretty printing tables
from graphviz import Digraph # A package for plotting graphs (of nodes and edges)
import torch ## importing PyTorch
import torch.optim as optim ## Import the optimization module of the package
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import tqdm.notebook as tqdm
## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use "%matplotlib notebook" instead
%matplotlib inline
## Nicer matplotlib defaults, applied to every figure created from here on
plt.rcParams.update({
    'figure.figsize': (4.5, 4.5),  # Default size of a figure
    'figure.dpi': 120,  # Default dpi of a figure (increases the fonts' size)
    'axes.grid': True,  # Show a grid in every axes by default
})
## Auxiliary functions for printing equations, pandas tables and images in cells output
from IPython.core.display import display, HTML, Latex, Markdown
## Create the output folder into which the figures below are saved.
## exist_ok=True turns this into a single idempotent call: re-running the cell, or
## the folder being created between a check and the creation, no longer raises.
os.makedirs('./output', exist_ok=True)
Data Inspection¶
In [ ]:
## Location of the CSV file holding the voice dataset
data_file = 'https://technion046195.netlify.app/datasets/voice.csv'

## Read the CSV file into a pandas data frame
dataset = pd.read_csv(filepath_or_buffer=data_file)

## Leave the data frame as the last expression of the cell so the notebook renders it
dataset
Out[ ]:
The Dataset Properties¶
Let us plot the distribution of the labels and of each of the measured features in the dataset
In [ ]:
## Count the samples of each label and show the counts as a bar plot
label_counts = dataset.groupby('label').size()
fig, ax = plt.subplots()
label_counts.plot.bar(ax=ax)
ax.set(title='Label', xlabel='Label', ylabel='Number of samples')
fig.tight_layout()
fig.savefig('./output/voices_labels_dist.png', dpi=240)
In [ ]:
## Names of the measurement (feature) columns of the dataset.
## NOTE: the misspelled variable name is kept, since extract_x_y reads it by this name.
measuements_columns = [
    'meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt', 'sp.ent', 'sfm',
    'mode', 'centroid', 'meanfun', 'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom',
    'dfrange', 'modindx',
]

## Split the samples by label once, instead of re-running the queries for every feature
male_samples = dataset.query('label == "male"')
female_samples = dataset.query('label == "female"')

## Plotting the histograms: one axes per feature on a 4x5 grid
fig, ax_list = plt.subplots(4, 5, figsize=(10, 8))
for ax, feature in zip(ax_list.flat, measuements_columns):
    ax.hist(male_samples[feature].values, bins=20, alpha=0.5, label='Male')
    ax.hist(female_samples[feature].values, bins=20, alpha=0.5, label='Female')
    ax.set_title(feature)

## Label the y axis only on the first axes of each row of the grid
for ax_row in ax_list:
    ax_row[0].set_ylabel('Number of samples')

## A single legend, placed on the last axes, serves the whole grid
ax_list.flat[-1].legend()
plt.tight_layout()
fig.savefig('./output/voices_distributions.png', dpi=240)
Train-Validation-Test split¶
In [ ]:
## Shuffle the indices of the samples using a generator with a fixed seed
n_samples = len(dataset)
rand_gen = np.random.RandomState(1)
indices = np.arange(n_samples)
rand_gen.shuffle(indices)

## Sizes of the train (60%) and validation (20%) parts; the test part gets the rest
n_samples_train = int(n_samples * 0.6)
n_samples_val = int(n_samples * 0.2)

## Cut the shuffled indices at the two boundaries: train | validation | test
train_indices, val_indices, test_indices = np.split(
    indices, [n_samples_train, n_samples_train + n_samples_val])
## The full train set is the train part followed by the validation part
train_full_indices = np.concatenate((train_indices, val_indices))

## Select the rows of each part by their position in the data frame
train_full_set, train_set, val_set, test_set = (
    dataset.iloc[part_indices]
    for part_indices in (train_full_indices, train_indices, val_indices, test_indices)
)
In [ ]:
def extract_x_y(dataset):
    """Split a data frame into a measurements array and a vector of +1/-1 labels.

    Returns a tuple ``(x, y)``: ``x`` holds the values of the
    ``measuements_columns`` columns of *dataset*, and ``y`` is 1 where the
    ``label`` column equals ``'female'`` and -1 everywhere else.
    """
    x = dataset.loc[:, measuements_columns].values
    is_female = dataset['label'].values == 'female'
    ## Map the boolean mask {False, True} to the labels {-1, +1}
    y = 2 * is_female - 1
    return x, y


## Measurements and labels of each of the parts of the split
x_train_full, y_train_full = extract_x_y(train_full_set)
x_train, y_train = extract_x_y(train_set)
x_val, y_val = extract_x_y(val_set)
x_test, y_test = extract_x_y(test_set)
In [ ]:
from sklearn.svm import SVC
## Normalization statistics of each column, computed on the train set only
mean = x_train.mean(axis=0, keepdims=True)
std = x_train.std(axis=0, keepdims=True)
x_train_normalized = (x_train - mean) / std
x_test_normalized = (x_test - mean) / std

## Create the SVC object and run the learning algorithm on the normalized train set
svc = SVC(C=1.0, kernel='linear')
svc.fit(x_train_normalized, y_train)

## Evaluate on the test set: the risk is the fraction of misclassified samples
predictions = svc.predict(x_test_normalized)
test_risk = np.mean(predictions != y_test)
print(f'The test risk is: {test_risk:.3}')
In [ ]:
## The value of w^T x + b for each normalized train sample, i.e. its signed distance
## from the decision boundary up to a factor of ||w||
dist = ((x_train - mean) / std) @ svc.coef_.T + svc.intercept_

bins = np.arange(-10, 10, 0.1)
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
## The y axis of the histograms counts samples
ax.set_ylabel('Number of samples')
## extract_x_y encodes 'female' as +1 and 'male' as -1, so the legend labels must
## follow that encoding. Male is drawn first, as in the per-feature histograms, so
## each label keeps the same color in both figures.
ax.hist(dist[y_train == -1], bins, alpha=0.5, label='Male')
ax.hist(dist[y_train == 1], bins, alpha=0.5, label='Female')
## The decision boundary (solid line) and the two margins (dashed lines)
ax.plot([-1, -1], [0, 50], '--k')
ax.plot([0, 0], [0, 50], 'k')
ax.plot([1, 1], [0, 50], '--k')
ax.set_xlim(-10, 10)
ax.set_title('$w^Tx+b$')
ax.legend()
plt.tight_layout(rect=[0, 0, 1, 0.9])
fig.savefig('./output/voices_signed_dist.png', dpi=240)
In [ ]:
## Define the list of C values to test
c_list = np.logspace(-3, 3, 13)
risk_array = np.zeros((len(c_list), ))

## The normalized sets do not depend on C, so they are computed once outside the loop
x_train_norm = (x_train - mean) / std
x_val_norm = (x_val - mean) / std

## Train on the train set and evaluate on the validation set for each C
for i_c, c in enumerate(c_list):
    svc = SVC(C=c, kernel='linear')
    svc.fit(x_train_norm, y_train)
    predictions = svc.predict(x_val_norm)
    risk_array[i_c] = (y_val != predictions).mean()

## Extract the optimal C value (np.argmin picks the first one if several are minimal)
optimal_index = np.argmin(risk_array)
optimal_c = c_list[optimal_index]
print(f'The optimal C is {optimal_c}')

## Re-learn the model with the optimal C on the full train set (train + validation)
## and evaluate it on the test set.
## NOTE: the normalization still uses the mean and std of the train part alone.
svc = SVC(C=optimal_c, kernel='linear')
svc.fit((x_train_full - mean) / std, y_train_full)
predictions = svc.predict((x_test - mean) / std)
test_risk = (y_test != predictions).mean()
print(f'The test risk is: {test_risk:.2}')

## Plot risk vs. C
fig, ax = plt.subplots()
ax.set_xscale('log')
ax.plot(c_list, risk_array)
ax.plot(optimal_c, risk_array[optimal_index], '.r')
## The swept hyper-parameter is C, so the x axis is labelled accordingly
ax.set_xlabel('$C$')
ax.set_ylabel('Risk')
ax.set_title('Risk vs. $C$')
fig.savefig('./output/voices_selecting_c.png', dpi=240)
In [ ]: