Download No description has been provided for this image

הרצאה 7 - שיערוך פילוג בשיטות לא פרמטריות

Setup

In [ ]:
## Importing packages
import os # A built-in package for interacting with the OS. For example to create a folder.
import numpy as np  # Numerical package (mainly multi-dimensional arrays and linear algebra)
import pandas as pd  # A package for working with data frames
import matplotlib.pyplot as plt  # A plotting package
import imageio  # A package to read and write images (is used here to save gif images)
import tabulate  # A package for pretty printing tables
from graphviz import Digraph  # A package for plotting graphs (of nodes and edges)

## Setup matplotlib to output figures into the notebook
## - To make the figures interactive (zoomable, tooltip, etc.) use "%matplotlib notebook" instead
%matplotlib inline

## Setting some nice matplotlib defaults (applied in one batch update)
plt.rcParams.update({
    'figure.figsize': (4.5, 4.5),  # Default size of a figure
    'figure.dpi': 120,  # Default dpi of a figure (increases the fonts' size)
    'axes.grid': True,  # Show a grid by default in figures
})

## Auxiliary functions for printing equations, pandas tables and images in cells output
## (imported from the public IPython.display module; the IPython.core.display path is deprecated)
from IPython.display import display, HTML, Latex, Markdown

## Create the output folder (a no-op if it already exists, so the cell can be re-run safely)
os.makedirs('./output', exist_ok=True)

Drive time distribution

In [ ]:
## The observed drive times (the dataset)
x = np.asarray([55, 68, 75, 50, 72, 84, 65, 58, 74, 66])
## A dense grid starting at 45 with a step of 0.01 (stops before 90), used for evaluating the estimators
x_grid = np.arange(45, 90, 0.01)

ECDF

In [ ]:
## Empirical CDF: for every grid point, the fraction of samples which are smaller than or equal to it
ecdf = np.mean(x[None, :] <= x_grid[:, None], axis=1)

fig, ax = plt.subplots(figsize=(5, 3))
ax.plot(x_grid, ecdf, linewidth=4)
ax.set(title='ECDF', xlabel='x', ylabel='Estimated CDF', xlim=(45, 90), ylim=(0, 1.1))
fig.tight_layout()
fig.savefig('./output/drive_time_ecdf.png', dpi=240)
No description has been provided for this image
In [ ]:
fig, ax = plt.subplots(figsize=(5, 3))
## The derivative of the ECDF: a spike of weight 1/N at the location of each sample
ax.stem(x, np.full(x.shape, 1 / x.shape[0]))
ax.set(title=r'$\frac{d}{dx}$  ECDF', xlabel='x', ylabel='Estimated PDF', xlim=(45, 90), ylim=(0, 0.2))
fig.tight_layout()
fig.savefig('./output/drive_time_diff_ecdf.png', dpi=240)
No description has been provided for this image

Histogram

In [ ]:
fig, ax = plt.subplots(figsize=(5, 3))
## 6 equally spaced edges over [45, 90] define the 5 bins
ax.hist(x, bins=np.linspace(45, 90, 6), density=True, histtype='step', linewidth=4)
ax.set(title='5 bins histogram', xlabel='x', ylabel='Estimated PDF', xlim=(40, 95), ylim=(0, 0.05))
fig.tight_layout()
fig.savefig('./output/drive_time_hist_5.png', dpi=240)
No description has been provided for this image
In [ ]:
fig, ax = plt.subplots(figsize=(5, 3))
## 2 edges (45 and 90) define a single bin covering the whole range
ax.hist(x, bins=np.linspace(45, 90, 2), density=True, histtype='step', linewidth=4)
ax.set(title='1 bin histogram', xlabel='x', ylabel='Estimated PDF', xlim=(40, 95), ylim=(0, 0.05))
fig.tight_layout()
fig.savefig('./output/drive_time_hist_1.png', dpi=240)
No description has been provided for this image
In [ ]:
fig, ax = plt.subplots(figsize=(5, 3))
## 101 equally spaced edges over [45, 90] define the 100 bins
ax.hist(x, bins=np.linspace(45, 90, 101), density=True, histtype='step', linewidth=2)
ax.set(title='100 bins histogram', xlabel='x', ylabel='Estimated PDF', xlim=(40, 95), ylim=(0, 0.3))
fig.tight_layout()
fig.savefig('./output/drive_time_hist_100.png', dpi=240)
No description has been provided for this image

KDE

In [ ]:
fig, ax = plt.subplots(figsize=(5, 3))
## The starting point of the KDE: a spike of weight 1/N at the location of each sample
ax.stem(x, np.full(x.shape, 1 / x.shape[0]))
ax.set(title='Delta  PDF', xlabel='x', ylabel='Estimated PDF', xlim=(45, 90), ylim=(0, 0.2))
fig.tight_layout()
fig.savefig('./output/drive_time_delta_pdf.png', dpi=240)
No description has been provided for this image
In [ ]:
## Standard Gaussian kernel (zero mean, unit variance)
kernel = lambda u: 1 / (2 * np.pi) ** 0.5 * np.exp(-u ** 2 / 2)

h = 2  # The kernel's bandwidth

fig, ax = plt.subplots(figsize=(5, 3))
for sample in x:
    ## A single kernel centred on the sample, stretched by h and weighted by 1/N
    sub_pdf = 1 / h * kernel((x_grid - sample) / h) / x.shape[0]
    ax.plot(x_grid, sub_pdf)
ax.set(title='Deltas -> Gaussians', xlabel='x', xlim=(45, 90), ylim=(0, 0.08))
fig.tight_layout()
fig.savefig('./output/drive_time_kernels.png', dpi=240)
No description has been provided for this image
In [ ]:
## Standard Gaussian kernel (zero mean, unit variance)
kernel = lambda u: 1 / (2 * np.pi) ** 0.5 * np.exp(-u ** 2 / 2)

h = 2  # The kernel's bandwidth

pdf = np.zeros_like(x_grid)
fig, ax = plt.subplots(figsize=(5, 3))
for sample in x:
    ## A single kernel centred on the sample, stretched by h and weighted by 1/N
    sub_pdf = 1 / h * kernel((x_grid - sample) / h) / x.shape[0]
    ax.plot(x_grid, sub_pdf, color='gray', alpha=0.5)
    ## The KDE is the sum of all the per-sample kernels
    pdf += sub_pdf
ax.plot(x_grid, pdf)
ax.set(title='KDE', xlabel='x', ylabel='Estimated PDF', xlim=(45, 90), ylim=(0, 0.08))
fig.tight_layout()
fig.savefig('./output/drive_time_kde.png', dpi=240)
No description has been provided for this image
In [ ]:
## Standard Gaussian kernel (zero mean, unit variance)
kernel = lambda u: 1 / (2 * np.pi) ** 0.5 * np.exp(-u ** 2 / 2)

h = 1  # A narrower bandwidth

pdf = np.zeros_like(x_grid)
fig, ax = plt.subplots(figsize=(5, 3))
for sample in x:
    ## A single kernel centred on the sample, stretched by h and weighted by 1/N
    sub_pdf = 1 / h * kernel((x_grid - sample) / h) / x.shape[0]
    ax.plot(x_grid, sub_pdf, color='gray', alpha=0.5)
    ## The KDE is the sum of all the per-sample kernels
    pdf += sub_pdf
ax.plot(x_grid, pdf)
ax.set(title='KDE - h=1', xlabel='x', ylabel='Estimated PDF', xlim=(45, 90), ylim=(0, 0.08))
fig.tight_layout()
fig.savefig('./output/drive_time_kde_h_1.png', dpi=240)
No description has been provided for this image
In [ ]:
## Standard Gaussian kernel (zero mean, unit variance)
kernel = lambda u: 1 / (2 * np.pi) ** 0.5 * np.exp(-u ** 2 / 2)

h = 4  # A wider bandwidth

pdf = np.zeros_like(x_grid)
fig, ax = plt.subplots(figsize=(5, 3))
for sample in x:
    ## A single kernel centred on the sample, stretched by h and weighted by 1/N
    sub_pdf = 1 / h * kernel((x_grid - sample) / h) / x.shape[0]
    ax.plot(x_grid, sub_pdf, color='gray', alpha=0.5)
    ## The KDE is the sum of all the per-sample kernels
    pdf += sub_pdf
ax.plot(x_grid, pdf)
ax.set(title='KDE - h=4', xlabel='x', ylabel='Estimated PDF', xlim=(45, 90), ylim=(0, 0.08))
fig.tight_layout()
fig.savefig('./output/drive_time_kde_h_4.png', dpi=240)
No description has been provided for this image

Gaussian distribution

In [ ]:
rand_gen = np.random.RandomState(1)

x_grid = np.arange(-3, 3.01, 0.01)
## The PDF of the standard normal distribution evaluated on the grid
true_pdf = 1 / ((2 * np.pi) ** 0.5) * np.exp(-x_grid ** 2 / 2)

fig, axes = plt.subplots(1, 3, figsize=(9, 3))
for ax, k in zip(axes, [3, 7, 21]):
    ## A large sample, so the histogram shown is close to its expected value
    x = rand_gen.randn(100000)
    ax.hist(x, bins=np.linspace(-3, 3, k + 1), density=True, histtype='step', linewidth=3, label='Mean histogram')
    ax.plot(x_grid, true_pdf, linewidth=3, label='True PDF')
    ax.set(title=f'Bias of k={k}', xlabel='x', xlim=(-3, 3), ylim=(0, 0.5))
axes[0].set_ylabel('Estimated PDF')
axes[-1].legend(loc='lower right')
fig.tight_layout()
fig.savefig('./output/gaussian_hist_bias.png', dpi=240)
No description has been provided for this image
In [ ]:
fig, axes = plt.subplots(3, 3, figsize=(9, 9))
## One row per number of bins; every column shows a freshly drawn small dataset
for row_axes, k in zip(axes, [3, 7, 21]):
    for ax in row_axes:
        x = rand_gen.randn(50)
        ax.hist(x, bins=np.linspace(-3, 3, k + 1), density=True, histtype='step', linewidth=3, label='Histogram')
        ax.plot(x_grid, 1 / ((2 * np.pi) ** 0.5) * np.exp(-x_grid ** 2 / 2), linewidth=3, label='True PDF')
        ax.plot(x, np.zeros_like(x), 'dr', ms=5, label='dataset')
        ax.set(xlabel='x', xlim=(-3, 3), ylim=(0, 0.5))
    row_axes[1].set_title(f'Variance of k={k}')
    row_axes[0].set_ylabel('PDF')
axes[-1, -1].legend(loc='center right')
fig.tight_layout()
fig.savefig('./output/gaussian_hist_variance.png', dpi=240)
No description has been provided for this image

Credit Card Fraud Detection

In [ ]:
## Parameters of the clusters (a mean vector and a single std per cluster)
mean_legit, std_legit = np.array([54, 54]), 18
mean_fraud1, std_fraud1 = np.array([27, 27]), 7.2
mean_fraud2, std_fraud2 = np.array([81, 81]), 7.2

## Number of samples per cluster
n_legit, n_fraud1, n_fraud2 = 200, 25, 25

rand_gen = np.random.RandomState(1)

## Draw the clusters one after the other (legit, fraud1, fraud2) and stack them into a single array
clusters = [
    (n_legit, std_legit, mean_legit),
    (n_fraud1, std_fraud1, mean_fraud1),
    (n_fraud2, std_fraud2, mean_fraud2),
]
x = np.concatenate([rand_gen.randn(n, 2) * std + mean for n, std, mean in clusters], axis=0)

## Labels: False for the first n_legit samples (legit), True for the rest (fraud)
y = np.arange(n_legit + n_fraud1 + n_fraud2) >= n_legit

## A 300x300 grid over [0, 100]^2; the last axis holds the two coordinates of each grid point
grid_axis = np.linspace(0, 100, 300)
x_grid = np.stack(np.meshgrid(grid_axis, grid_axis), axis=2)
In [ ]:
from matplotlib.colors import ListedColormap
def plot_grid_predict(ax, h, x_grid):
    """Draw the predictions of a classifier over a 2D grid as a semi-transparent background image.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    h : callable which receives an array of points of shape (n_points, 2) and returns one
        prediction per point (0 / False or 1 / True).
    x_grid : array of shape (rows, cols, 2) holding the coordinates of the grid points
        (e.g. the stacked output of np.meshgrid).

    Returns
    -------
    The image object created by ax.imshow.
    """
    cmap = ListedColormap(plt.cm.tab10([0, 1]))
    grid_predict = h(x_grid.reshape(-1, 2)).reshape(x_grid.shape[:2])
    ## Take the image's extent from the grid itself instead of assuming the [0, 100] x [0, 100] range
    extent = [
        x_grid[..., 0].min(), x_grid[..., 0].max(),
        x_grid[..., 1].min(), x_grid[..., 1].max(),
    ]
    img_obj = ax.imshow(grid_predict, extent=extent,
              origin='lower',
              cmap=cmap,
              ## Pin the color scale so that class 0 / 1 always map to the first / second color,
              ## even when the whole grid is predicted as a single class
              vmin=0,
              vmax=1,
              alpha=0.2,
              interpolation='nearest',
              zorder=-1,
              )
    return img_obj

Train-Test Split

In [ ]:
n_samples = x.shape[0]

## A random generator with a fixed seed, used for shuffling a vector of all the indices in place
rand_gen = np.random.RandomState(1)
indices = np.arange(n_samples)
rand_gen.shuffle(indices)

## The first 80% of the shuffled indices are used for the train set and the remaining 20% for the test set
n_samples_train = int(n_samples * 0.8)
train_indices, test_indices = indices[:n_samples_train], indices[n_samples_train:]

x_train, y_train = x[train_indices], y[train_indices]
x_test, y_test = x[test_indices], y[test_indices]

The dataset

In [ ]:
fig, ax = plt.subplots(figsize=(5, 5))
## One scatter per class: legit (y = False) first and then fraud (y = True)
for class_mask, class_label in ((~y_train, 'Legit'), (y_train, 'Fraud')):
    ax.plot(x_train[class_mask, 0], x_train[class_mask, 1], 'x', label=class_label, ms=7, mew=2)
ax.set(xlabel='Distance from home [Km]', ylabel='Price [$]', xlim=(0, 100), ylim=(0, 100))
fig.tight_layout()
ax.legend(loc='upper left')
fig.savefig('./output/transactions_dataset.png', dpi=240)
No description has been provided for this image

Illustration

In [ ]:
def gt_pfd_legit(x):
    """Ground-truth PDF of the legit transactions: an isotropic 2D Gaussian.

    Uses the module-level parameters mean_legit (the mean vector) and std_legit (the std
    shared by both coordinates).

    Parameters
    ----------
    x : array of shape (n_points, 2) of points to evaluate the PDF at.

    Returns
    -------
    Array of shape (n_points,) with the value of the PDF at each point.
    """
    sq_dist = ((x - mean_legit[None, :]) ** 2).sum(axis=1)
    ## The normalization constant of a 2D isotropic Gaussian is 2 * pi * std^2
    ## (a square root over 2 * pi belongs only to the 1D case)
    pdf = np.exp(-sq_dist / 2 / std_legit ** 2) / (2 * np.pi) / std_legit ** 2
    return pdf

def gt_pfd_fraud(x):
    """Ground-truth PDF of the fraud transactions: a mixture of two isotropic 2D Gaussians.

    Uses the module-level parameters of the two fraud clusters (mean_fraud1 / std_fraud1 and
    mean_fraud2 / std_fraud2). Each component is weighted by its share of the fraud samples
    (n_fraud1 and n_fraud2), so the mixture integrates to 1.

    Parameters
    ----------
    x : array of shape (n_points, 2) of points to evaluate the PDF at.

    Returns
    -------
    Array of shape (n_points,) with the value of the PDF at each point.
    """
    def _gaussian_2d(mean, std):
        ## An isotropic 2D Gaussian; its normalization constant is 2 * pi * std^2
        sq_dist = ((x - mean[None, :]) ** 2).sum(axis=1)
        return np.exp(-sq_dist / 2 / std ** 2) / (2 * np.pi) / std ** 2

    n_fraud = n_fraud1 + n_fraud2
    pdf = n_fraud1 / n_fraud * _gaussian_2d(mean_fraud1, std_fraud1) \
          + n_fraud2 / n_fraud * _gaussian_2d(mean_fraud2, std_fraud2)
    return pdf



fig = plt.figure(figsize=(10, 5))

## One 3D surface per class: (the function of the ground-truth PDF, the panel's title)
panels = [
    (gt_pfd_legit, 'Legit PDF - $p_{x|y}(x|0)$'),
    (gt_pfd_fraud, 'Fraud PDF - $p_{x|y}(x|1)$'),
]
for panel_index, (pdf_func, panel_title) in enumerate(panels, start=1):
    ## Evaluate on the flattened grid and fold the result back into the grid's 2D layout
    pdf = pdf_func(x_grid.reshape(-1, 2)).reshape(x_grid.shape[0], x_grid.shape[1])
    ax = fig.add_subplot(1, 2, panel_index, projection='3d')
    ax.plot_surface(x_grid[:, :, 0], x_grid[:, :, 1], pdf, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)
    ax.set_title(panel_title)
    ax.set_xlabel('Distance from home [Km]')
    ax.set_ylabel('Price [$]')

fig.savefig('./output/transactions_gt_pdf.png', dpi=240)
No description has been provided for this image

KDE

In [ ]:
fig, ax = plt.subplots(figsize=(5, 5))
## Only the legit (y = False) samples of the train set
legit_mask = ~y_train
ax.plot(x_train[legit_mask, 0], x_train[legit_mask, 1], 'x', label='Legit', ms=7, mew=2)
ax.set(xlabel='Distance from home [Km]', ylabel='Price [$]', title='y = 0', xlim=(0, 100), ylim=(0, 100))
fig.tight_layout()
fig.savefig('./output/transactions_dataset_legit.png', dpi=240)
No description has been provided for this image
In [ ]:
def kernel(x):
    """Standard multivariate Gaussian kernel (zero mean, identity covariance).

    Parameters
    ----------
    x : array of shape (n_points, d) of points to evaluate the kernel at.

    Returns
    -------
    Array of shape (n_points,) with the value of the kernel at each point.
    """
    ## The normalization constant of a d-dimensional standard Gaussian is (2 * pi)^(d / 2),
    ## which makes the kernel integrate to 1
    dim = x.shape[1]
    return 1 / (2 * np.pi) ** (dim / 2) * np.exp(-(x ** 2).sum(axis=1) / 2)
def kde(x_grid, x, kernel, h):
    """Kernel density estimation evaluated at a given set of points.

    Parameters
    ----------
    x_grid : array of shape (n_points, d) of points to evaluate the estimated PDF at.
    x : array of shape (n_samples, d) holding the dataset.
    kernel : callable which receives an array of shape (n_points, d) and returns an array of
        shape (n_points,).
    h : the bandwidth of the kernel.

    Returns
    -------
    Array of shape (n_points,) with the estimated PDF at each point.
    """
    n_samples = x.shape[0]
    density = np.zeros(x_grid.shape[0])
    for sample in x:
        ## The sample's kernel, stretched by h in each of the d dimensions and weighted by 1/N
        density += 1 / h ** x.shape[1] * kernel((x_grid - sample) / h) / n_samples
    return density

h = 4  # The kernel's bandwidth (also used later on for the predictions)

## KDE of the legit class, evaluated on the flattened grid and folded back into the grid's 2D layout
pdf_legit = kde(x_grid.reshape(-1, 2), x_train[~y_train], kernel, h).reshape(x_grid.shape[0], x_grid.shape[1])
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(x_grid[:, :, 0], x_grid[:, :, 1], pdf_legit, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
## A raw string is used so that the LaTeX backslashes (\hat, \mathcal) are not parsed as
## (invalid) Python escape sequences
ax.set_title(r'Legit KDE - $\hat{p}_{x|y,\mathcal{D}}(x|0)$')
ax.set_xlabel('Distance from home [Km]')
ax.set_ylabel('Price [$]')
fig.savefig('./output/transactions_kde_legit.png', dpi=240)

## KDE of the fraud class
pdf_fraud = kde(x_grid.reshape(-1, 2), x_train[y_train], kernel, h).reshape(x_grid.shape[0], x_grid.shape[1])
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_surface(x_grid[:, :, 0], x_grid[:, :, 1], pdf_fraud, cmap=plt.cm.coolwarm, linewidth=0, antialiased=False)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
ax.set_title(r'Fraud KDE - $\hat{p}_{x|y,\mathcal{D}}(x|1)$')
ax.set_xlabel('Distance from home [Km]')
ax.set_ylabel('Price [$]')
fig.savefig('./output/transactions_kde_fraud.png', dpi=240)
No description has been provided for this image
No description has been provided for this image

Predictions

In [ ]:
def h_func(x_in):
    ## Predict fraud where the prior-weighted fraud density is larger than the legit one.
    ## The priors match the class proportions of the generated dataset:
    ## 50 of the 250 samples are fraud (0.2) and 200 are legit (0.8)
    fraud_score = kde(x_in, x_train[y_train], kernel, h) * 0.2
    legit_score = kde(x_in, x_train[~y_train], kernel, h) * 0.8
    return fraud_score > legit_score

fig, ax = plt.subplots(figsize=(5, 5))
## The whole dataset (train and test), one scatter per class, over the predicted regions
for class_mask, class_label in ((~y, 'Legit'), (y, 'Fraud')):
    ax.plot(x[class_mask, 0], x[class_mask, 1], 'x', label=class_label, ms=7, mew=2)
plot_grid_predict(ax, h_func, x_grid)
ax.set(xlabel='Distance from home [Km]', ylabel='Price [$]', xlim=(0, 100), ylim=(0, 100))
fig.tight_layout()
fig.savefig('./output/transactions_kde_predictions.png', dpi=240)
No description has been provided for this image
In [ ]:
## The score reported here is the misclassification rate: the fraction of the test samples
## whose prediction differs from their label
y_hat = h_func(x_test)
test_score = np.mean(y_hat != y_test)

display(Markdown(f'The test score is {test_score:.3f}'))

The test score is 0.120

In [ ]: