import pickle
import time

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
from skops import hub_utils
from sklearn.ensemble import IsolationForest
# Data preparation
n_samples, n_outliers = 120, 40
rng = np.random.RandomState(0)
covariance = np.array([[0.5, -0.1], [0.7, 0.4]])
cluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # slightly deformed cluster
cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical cluster
outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))
X = np.concatenate([cluster_1, cluster_2, outliers])  # 120 + 120 + 40 = 280 samples in 2D
# Ground-truth labels: 1 for inliers, -1 for outliers
y = np.concatenate(
    [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]
)
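
# A minimal sanity check on the generated data: shapes follow from the
# construction above (2 * 120 inliers + 40 outliers = 280 samples in 2D).
assert X.shape == (2 * n_samples + n_outliers, 2)
assert y.shape == (2 * n_samples + n_outliers,)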

def load_hf_model_hub():
    '''
    Download the directory containing the pretrained model
    and its files from the model repository, and return
    the unpickled model.
    '''
    repo_id = "sklearn-docs/anomaly-detection"
    download_repo = "downloaded-model"
    hub_utils.download(repo_id=repo_id, dst=download_repo)
    time.sleep(2)  # give the download a moment to finish writing to disk
    with open("./downloaded-model/isolation_forest.pkl", "rb") as f:
        loaded_model = pickle.load(f)
    return loaded_model
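
# Offline fallback sketch (not called by the app): if the Hub download is
# unavailable, an equivalent model could be fit locally. The hyperparameters
# follow the linked scikit-learn example and are an assumption; the pickled
# model on the Hub may have been fit differently.
def fit_local_model():
    return IsolationForest(max_samples=100, random_state=0).fit(X)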

# Visualize the data as a scatter plot
def visualize_input_data():
    fig = plt.figure(1, facecolor="w", figsize=(5, 5))
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    handles, _ = scatter.legend_elements()
    plt.axis("square")
    plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
    plt.title("Gaussian inliers with \nuniformly distributed outliers")
    return fig
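
# Hedged sketch of how the static PNGs displayed in the tabs below could be
# regenerated (the app itself shows pre-rendered images from the downloaded
# repository). Assumes scikit-learn >= 1.1 for DecisionBoundaryDisplay; the
# function name is ours, not part of the original example.
def plot_decision_boundary_sketch(model, response_method="predict"):
    from sklearn.inspection import DecisionBoundaryDisplay

    # "predict" draws the discrete inlier/outlier boundary;
    # "decision_function" shades the path-length-based normality measure.
    disp = DecisionBoundaryDisplay.from_estimator(
        model, X, response_method=response_method, alpha=0.5
    )
    disp.ax_.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    disp.ax_.set_title("IsolationForest decision boundary")
    return disp.figure_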
title = " An example using IsolationForest for anomaly detection."
description1 = "The isolation forest is an Ensemble of Isolation trees and it isolates the data points using recursive random partitioning."
description2 = "In case of outliers the number of splits required is greater than those required for inliers."
description3 = "We will use the toy dataset as given in the scikit-learn page for Isolation Forest."
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        """
        An isolation forest is an ensemble of isolation trees that isolates data points through recursive random partitioning.
        Outliers require fewer splits to isolate than inliers, so they tend to sit closer to the root of each tree.
        For this educational demo we use the toy dataset from the scikit-learn Isolation Forest example.
        """)
gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py).")
loaded_model = load_hf_model_hub()
with gr.Tab("Visualize Input dataset"):
btn = gr.Button(value="Visualize input dataset")
with gr.Row():
btn.click(visualize_input_data, outputs= gr.Plot(label='Visualizing input dataset') )
        gr.Markdown(
            """
            ## Data Generation
            We generate two clusters from a standard normal distribution: one spherical and the other slightly deformed.
            For consistency, inliers are assigned a ground-truth label of 1 and outliers a label of -1.
            The plot visualizes the clusters of the input dataset.
            """)
with gr.Tab("Plot Decision Boundary"):
with gr.Row():
image_decision = gr.Image('./downloaded-model/decision_boundary.png')
        gr.Markdown(
            """
            ## Plot the Discrete Decision Boundary
            The background colour represents whether a sample in a given area is predicted to be an outlier or not.
            The scatter plot displays the true labels.
            """)
with gr.Tab("Plot Path"):
with gr.Row():
image_path = gr.Image('./downloaded-model/plot_path.png')
        gr.Markdown(
            """
            ## Plot the Path Length Decision Boundary
            By setting `response_method="decision_function"`, the background of the `DecisionBoundaryDisplay` represents
            the measure of normality of an observation. That score is given by the path length averaged over a forest of
            random trees, which itself is the depth of the leaf (or, equivalently, the number of splits) required to
            isolate a given sample.
            When a forest of random trees collectively produces short path lengths for isolating particular samples,
            those samples are likely to be anomalies, and the measure of normality is close to 0.
            Similarly, long paths correspond to values close to 1 and are more likely to be inliers.
            """)
demo.launch()