|
import os |
|
|
|
import pandas as pd |
|
from sklearn.ensemble import IsolationForest |
|
|
|
import numpy as np |
|
from sklearn.model_selection import train_test_split |
|
import gradio as gr |
|
import matplotlib.pyplot as plt |
|
from skops import hub_utils |
|
import pickle |
|
import time |
|
|
|
|
|
|
|
|
|
# Toy 2-D dataset: two Gaussian inlier clusters plus uniformly scattered outliers.
n_samples, n_outliers = 120, 40
rng = np.random.RandomState(0)

# Matrix applied to the first cluster's samples to deform it.
covariance = np.array([[0.5, -0.1], [0.7, 0.4]])

# The draw order below (cluster_1, cluster_2, outliers) fixes the random
# stream, so it must not be rearranged.
cluster_1 = (0.4 * rng.randn(n_samples, 2)) @ covariance + np.array([2, 2])
cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])
outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))

# Rows are stacked inliers first, outliers last.
X = np.vstack((cluster_1, cluster_2, outliers))

# Ground-truth labels in the same row order: 1 for inliers, -1 for outliers.
y = np.hstack(
    (
        np.full(2 * n_samples, 1, dtype=int),
        np.full(n_outliers, -1, dtype=int),
    )
)
|
|
|
def load_hf_model_hub():
    """
    Fetch the pretrained model repository and unpickle the model.

    The Hub repository ``sklearn-docs/anomaly-detection`` is downloaded into
    the local ``downloaded-model`` directory, after which
    ``isolation_forest.pkl`` from that directory is loaded with ``pickle``.

    Returns:
        The object stored in ``downloaded-model/isolation_forest.pkl``.
    """
    repo_id = "sklearn-docs/anomaly-detection"
    download_repo = "downloaded-model"
    model_path = os.path.join(download_repo, "isolation_forest.pkl")

    # Only download when the model file is not already on disk: skops'
    # download rejects a destination directory that already has content,
    # which made every second start of the app fail.
    if not os.path.isfile(model_path):
        hub_utils.download(repo_id=repo_id, dst=download_repo)
        # Pause kept from the original implementation.
        # NOTE(review): presumably lets the files settle on disk; confirm
        # whether it is still needed.
        time.sleep(2)

    # NOTE(security): unpickling executes arbitrary code from the file.
    # Only keep this if the Hub repository above is trusted.
    with open(model_path, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model
|
|
|
|
|
|
|
def visualize_input_data():
    """
    Draw the toy dataset as a scatter plot coloured by ground-truth label.

    Reads the module-level arrays ``X`` (point coordinates, two columns) and
    ``y`` (labels used for colouring).

    Returns:
        The matplotlib figure (figure number 1) containing the plot.
    """
    fig = plt.figure(1, facecolor="w", figsize=(5, 5))
    # plt.figure(1) returns the already-existing figure on later calls, so it
    # is cleared first; otherwise each call would stack another scatter and
    # legend on top of the previous ones.
    fig.clf()
    ax = fig.add_subplot(111)

    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    handles, _ = scatter.legend_elements()
    ax.axis("square")
    # Handles come back ordered by ascending label value (-1, then 1),
    # which is why "outliers" is listed before "inliers".
    ax.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
    ax.set_title("Gaussian inliers with \nuniformly distributed outliers")
    return fig
|
|
|
|
|
|
|
|
|
|
|
# Page title; passed to gr.Blocks and rendered as the top-level heading.
title = " An example using IsolationForest for anomaly detection."

# Short descriptive blurbs about the demo.
description1 = (
    "The isolation forest is an Ensemble of Isolation trees and it isolates "
    "the data points using recursive random partitioning."
)
# Outliers are isolated with fewer splits than inliers; the earlier wording
# ("greater") stated the opposite.
description2 = (
    "In case of outliers the number of splits required is lower than "
    "those required for inliers."
)
description3 = (
    "We will use the toy dataset as given in the scikit-learn page for "
    "Isolation Forest."
)
|
|
|
# Build the demo UI: an introduction followed by three tabs (input data,
# decision boundary, path length), then start the app.
# NOTE(review): the explanatory Markdown of each tab is placed below its Row,
# inside the Tab; confirm this matches the intended layout.
with gr.Blocks(title=title) as demo:
    gr.Markdown(f" # {title}")
    # Outliers need fewer splits to isolate than inliers; the text previously
    # said "greater", contradicting the "Plot Path" explanation below.
    gr.Markdown(
        """
        The isolation forest is an ensemble of isolation trees and it isolates the data points using recursive random partitioning.
        In case of outliers, the number of splits required is lower than those required for inliers.
        We will use the toy dataset for our educational demo as given in the scikit-learn page for Isolation Forest.

        """
    )

    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py).")

    # Downloads the model repository into ./downloaded-model; the images shown
    # in the tabs below are read from that same directory, so this has to run
    # before they are referenced.
    loaded_model = load_hf_model_hub()

    with gr.Tab("Visualize Input dataset"):
        btn = gr.Button(value="Visualize input dataset")
        with gr.Row():
            # The Plot component is created in place as the click output.
            btn.click(
                visualize_input_data,
                outputs=gr.Plot(label='Visualizing input dataset'),
            )

        gr.Markdown(
            """
            ## Data Generation
            We generate 2 clusters one spherical and the other slightly deformed, from Standard Normal distribution
            For the sake of consistency inliers are assigned a ground label of 1 and outliers are assigned a label -1.
            The plot is a visualization of the clusters of the input dataset.

            """
        )

    with gr.Tab("Plot Decision Boundary"):
        with gr.Row():
            image_decision = gr.Image('./downloaded-model/decision_boundary.png')
        gr.Markdown(
            """
            ## Plot the Discrete Decision Boundary
            We plot the discrete decision boundary.
            The background colour represents whether a sample in that given area is predicted to be an outlier or not.
            The scatter plot displays the true labels

            """
        )

    with gr.Tab("Plot Path"):
        with gr.Row():
            image_path = gr.Image('./downloaded-model/plot_path.png')
        gr.Markdown(
            """
            ## Plot the path length of the decision boundary
            By setting the `response_method="decision_function"`, the background of the `DecisionBoundaryDisplay` represents
            the measure of the normality of an observation.

            Normality of Observation = Path Length/Number of Forests of Random Trees


            The RHS of the above equation is given by the number of splits required to isolate a given sample
            Such score is given by the path length averaged over a forest of random trees, which itself is given by the depth of
            the leaf (or equivalently the number of splits) required to isolate a given sample.

            When a forest of random trees collectively produces short path lengths for isolating some particular samples,
            they are more likely to have anomalies, and the measure of normality is close to 0.
            Similarly, large paths correspond to values close to 1 and are more likely to be inliers.

            """
        )

demo.launch()