הרצאה 5 - Bagging & boosting

Setup¶

## Importing packages
import os # A built-in package for interacting with the OS. For example, to create a folder.
import numpy as np  # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd  # A package for working with data frames
import matplotlib.pyplot as plt  # A plotting package
import imageio  # A package to read and write images (used here to save gif images)
import tabulate  # A package for pretty-printing tables
from graphviz import Digraph  # A package for plotting graphs (of nodes and edges)

## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use "%matplotlib notebook" instead
%matplotlib inline

## Notebook-wide matplotlib defaults, applied in a single update
plt.rcParams.update({
    'figure.figsize': (4.5, 4.5),  # Default size of new figures
    'figure.dpi': 120,  # Default dpi (a higher dpi renders larger fonts)
    'axes.grid': True,  # Draw a grid in every axes unless turned off
})

## Auxiliary functions for printing equations, pandas tables and images in cells output
## (taken from the public `IPython.display` module; importing them from
##  `IPython.core.display` is deprecated)
from IPython.display import display, HTML, Latex, Markdown

## Create the output folder; `exist_ok=True` makes this a no-op when the folder
## is already there, so no separate existence check is needed
os.makedirs('./output', exist_ok=True)

Credit Card Fraud Detection¶

## Parameters of the three isotropic Gaussian clusters (mean vector, scalar std)
mean_legit, std_legit = np.array([54, 54]), 18
mean_fraud1, std_fraud1 = np.array([27, 27]), 7.2
# mean_fraud2 = np.array([81, 81])
mean_fraud2, std_fraud2 = np.array([27, 27]), 7.2

## Number of samples drawn from each cluster
n_legit, n_fraud1, n_fraud2 = 200, 25, 25

## Random generator with a fixed seed
rand_gen = np.random.RandomState(1)

## Features: draw the clusters one after the other (legit, fraud1, fraud2) and
## stack them along the samples axis
x = np.concatenate([
    rand_gen.randn(n_cluster, 2) * std_cluster + mean_cluster
    for n_cluster, mean_cluster, std_cluster in (
        (n_legit, mean_legit, std_legit),
        (n_fraud1, mean_fraud1, std_fraud1),
        (n_fraud2, mean_fraud2, std_fraud2),
    )
], axis=0)

## Labels: +1 for the legit samples, -1 for both fraud clusters
y = np.concatenate([
    np.ones(n_legit, dtype=int),
    -np.ones(n_fraud1 + n_fraud2, dtype=int),
])

## A 300x300 grid of 2D points covering [0, 100] x [0, 100]; the last axis holds
## the two coordinates of each grid point
_grid_axis = np.linspace(0, 100, 300)
x_grid = np.stack(np.meshgrid(_grid_axis, _grid_axis), axis=-1)

from matplotlib.colors import ListedColormap
def plot_grid_predict(ax, h, x_grid, extent=None):
    """Draw the predictions of `h` over a grid as a faint background image.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    h : callable that takes an (N, 2) array of points and returns N predictions.
    x_grid : array of shape (rows, cols, 2) holding the grid points; it is
        flattened to (rows * cols, 2) before being passed to `h`.
    extent : optional `[left, right, bottom, top]` data limits of the image.
        When omitted it is read from the corners of `x_grid`, so the image
        lines up with the grid it was evaluated on.

    Returns
    -------
    The image object returned by `ax.imshow`.
    """
    if extent is None:
        # Assumes the `np.meshgrid` layout used in this notebook: the first
        # coordinate varies along columns and the second along rows.
        extent = [
            x_grid[0, 0, 0], x_grid[0, -1, 0],
            x_grid[0, 0, 1], x_grid[-1, 0, 1],
        ]

    # Two-colour map built from the first two entries of the tab10 palette
    cmap = ListedColormap(plt.cm.tab10([0, 1]))
    grid_predict = h(x_grid.reshape(-1, 2)).reshape(x_grid.shape[:2])
    # The predictions are negated before being mapped to colours; the image is
    # drawn semi-transparent and below the other artists (zorder=-1).
    img_obj = ax.imshow(-grid_predict, extent=extent,
              origin='lower',
              cmap=cmap,
              alpha=0.2,
              interpolation='nearest',
              zorder=-1,
              )
    return img_obj

Train-Test Split¶

## Shuffle the sample indices with a fixed seed, then cut them 80% / 20%
n_samples = x.shape[0]
rand_gen = np.random.RandomState(1)
indices = np.arange(n_samples)
rand_gen.shuffle(indices)  # In-place shuffle

## The first 80% of the shuffled indices form the train set, the rest the test set
n_samples_train = int(n_samples * 0.8)
train_indices, test_indices = indices[:n_samples_train], indices[n_samples_train:]

x_train, y_train = x[train_indices], y[train_indices]
x_test, y_test = x[test_indices], y[test_indices]

The dataset¶

## Scatter plot of the train set, one marker series per class
fig, ax = plt.subplots(figsize=(5, 5))
for class_label, class_name in ((1, 'Legit'), (-1, 'Fraud')):
    class_points = x_train[y_train == class_label]
    ax.plot(class_points[:, 0], class_points[:, 1], 'x', label=class_name, ms=7, mew=2)
ax.set(
    xlabel='Distance from home [Km]',
    ylabel='Price [$]',
    xlim=(0, 100),
    ylim=(0, 100),
)
plt.tight_layout()
ax.legend(loc='upper left')
# fig.savefig('./output/transactions_dataset.png', dpi=240)

<matplotlib.legend.Legend at 0x7f75cfe659e8>

Boosting¶

class stump(object):
    """A decision stump: thresholds a single feature and predicts +1 / -1.

    Without `flip`, samples whose feature `dim` is greater than or equal to
    `treshold` are predicted +1 and the rest -1; with `flip` the two
    predictions are swapped.
    """

    def __init__(self, dim, treshold, flip=False):
        self.dim, self.treshold, self.flip = dim, treshold, flip

    def __call__(self, x):
        """Return the +1 / -1 prediction for every row of the 2D array `x`."""
        above = x[:, self.dim] >= self.treshold
        sign = -1 if self.flip else 1
        # Map the boolean mask {False, True} to {-1, +1}, then apply the flip
        return sign * (above * 2 - 1)

## AdaBoost with decision stumps, fitted on the train set.
## NOTE: from this point on `x` / `y` refer to the train set only.
x = x_train
y = y_train

n_rounds = 100  # Number of boosting rounds (one stump is added per round)
eps = 1e-10  # Keeps the weighted error away from 0 and 1 so that alpha stays finite

## Start from uniform sample weights
w = np.ones(x.shape[0])
w /= w.sum()

h_list = []  # The selected stumps
alpha_list = []  # The weight of each selected stump

total_score_list = []  # Train misclassification rate of the ensemble after each round
bound_list = []  # Mean of exp(-y * ensemble output) after each round

## Running weighted sum of the stumps' outputs. Adding one term per round, in the
## same order, gives the same value as re-summing all the stumps every round.
margin = np.zeros(x.shape[0])

for _ in range(n_rounds):
    ## Exhaustive search: every sample value in every dimension is tried as a
    ## threshold, both as a regular and as a flipped stump. The stump with the
    ## lowest weighted error is kept (the first one found wins ties).
    best_h = None
    best_score = np.inf
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            candidate = stump(j, x[i, j])
            score = (candidate(x) != y) @ w
            if score < best_score:
                best_score = score
                best_h = candidate

            ## The flipped stump errs exactly where the regular one is right
            if (1 - score) < best_score:
                best_score = 1 - score
                best_h = stump(j, x[i, j], flip=True)

    h = best_h
    ## A stump with zero weighted error would give alpha = inf and turn the
    ## weights into NaNs, so the error is clamped before taking the log.
    score = min(max(best_score, eps), 1 - eps)
    alpha = 0.5 * np.log((1 - score) / score)

    ## Re-weight the samples: misclassified samples gain weight
    h_x = h(x)
    w = w * np.exp(-alpha * y * h_x)
    w /= w.sum()

    h_list.append(h)
    alpha_list.append(alpha)

    ## Evaluate the ensemble built so far on the train set
    margin += alpha * h_x
    y_hat = np.sign(margin)
    total_score = (y_hat != y).mean()
    total_score_list.append(total_score)

    ## y is +1 / -1, so y * margin equals the sum of alpha * y * h(x) over the stumps
    bound_term = y * margin
    bound = np.exp(-bound_term).mean()
    bound_list.append(bound)
    


def h_total(x):
    """Predict with the boosted ensemble: the sign of the weighted stump votes.

    Uses the module-level `h_list` (stumps) and `alpha_list` (their weights).
    `x` is a 2D array with one sample per row; one value is returned per row.
    """
    weighted_votes = (weight * learner(x) for learner, weight in zip(h_list, alpha_list))
    # Start the sum from a zeros vector so an empty ensemble still yields an array
    return np.sign(sum(weighted_votes, np.zeros(x.shape[0])))

## Train set on top of the ensemble's predictions over the grid
fig, ax = plt.subplots(figsize=(5, 5))
for class_label, class_name in ((1, 'Legit'), (-1, 'Fraud')):
    class_points = x_train[y_train == class_label]
    ax.plot(class_points[:, 0], class_points[:, 1], 'x', label=class_name, ms=7, mew=2)
plot_grid_predict(ax, h_total, x_grid)
ax.set(
    xlabel='Distance from home [Km]',
    ylabel='Price [$]',
    xlim=(0, 100),
    ylim=(0, 100),
)
plt.tight_layout()
ax.legend(loc='upper left')


## One small figure per stump, for the first five stumps that were selected
for h in h_list[:5]:
    fig, ax = plt.subplots(figsize=(3, 3))
    for class_label, class_name in ((1, 'Legit'), (-1, 'Fraud')):
        class_points = x_train[y_train == class_label]
        ax.plot(class_points[:, 0], class_points[:, 1], 'x', label=class_name, ms=7, mew=2)
    plot_grid_predict(ax, h, x_grid)
    ax.set(
        xlabel='Distance from home [Km]',
        ylabel='Price [$]',
        xlim=(0, 100),
        ylim=(0, 100),
    )
    plt.tight_layout()
    ax.legend(loc='upper left')

## Train error of the ensemble and the recorded bound, per number of stumps
fig, ax = plt.subplots(figsize=(6, 4))
for curve, curve_name in ((total_score_list, 'Score'), (bound_list, 'Bound')):
    ax.plot(curve, label=curve_name)
ax.set_xlabel('Number of predictors')
# ax.set_ylim(5, 6)
plt.tight_layout()
ax.legend();

## Train set on top of the final ensemble's predictions over the grid
fig, ax = plt.subplots(figsize=(5, 5))
legit_points = x_train[y_train == 1]
fraud_points = x_train[y_train == -1]
ax.plot(legit_points[:, 0], legit_points[:, 1], 'x', label='Legit', ms=7, mew=2)
ax.plot(fraud_points[:, 0], fraud_points[:, 1], 'x', label='Fraud', ms=7, mew=2)
plot_grid_predict(ax, h_total, x_grid)
ax.set(xlabel='Distance from home [Km]', ylabel='Price [$]')
ax.set(xlim=(0, 100), ylim=(0, 100))
plt.tight_layout()
ax.legend(loc='upper left')

<matplotlib.legend.Legend at 0x7f75cfd69668>