Spaces:

sklearn-docs
/

IsolationForest-Model-for-Anomaly-Detection

Runtime error

App Files Files Community

Jayabalambika committed on Apr 25, 2023

Commit

f1346cf

1 Parent(s): 776c009

Create app.py

Browse files

incorporated review comments

Files changed (1) hide show

app.py +130 -0

app.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import os
+import pandas as pd
+from sklearn.ensemble import IsolationForest
+import numpy as np
+from sklearn.model_selection import train_test_split
+import gradio as gr
+import matplotlib.pyplot as plt
+from skops import hub_utils
+import pickle
+import time
+#Data preparation
+n_samples, n_outliers = 120, 40
+rng = np.random.RandomState(0)
+covariance = np.array([[0.5, -0.1], [0.7, 0.4]])
+cluster_1 = 0.4 * rng.randn(n_samples, 2) @ covariance + np.array([2, 2])  # general deformed cluster
+cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])  # spherical cluster
+outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))
+X = np.concatenate([cluster_1, cluster_2, outliers]) #120+120+40 = 280 with 2D
+y = np.concatenate(
+    [np.ones((2 * n_samples), dtype=int), -np.ones((n_outliers), dtype=int)]
+)
+def load_hf_model_hub():
+    '''
+    Load the directory containing pretrained model
+    and files from the model repository
+    '''
+    repo_id="sklearn-docs/anomaly-detection"
+    download_repo = "downloaded-model"
+    hub_utils.download(repo_id=repo_id, dst=download_repo)
+    time.sleep(2)
+    loaded_model = pickle.load(open('./downloaded-model/isolation_forest.pkl', 'rb'))
+    return loaded_model
+#Visualize the data as a scatter plot
+def visualize_input_data():
+    fig = plt.figure(1, facecolor="w", figsize=(5, 5))
+    scatter = plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
+    handles, labels = scatter.legend_elements()
+    plt.axis("square")
+    plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
+    plt.title("Gaussian inliers with \nuniformly distributed outliers")
+    return fig
+# Page title: passed to gr.Blocks(title=...) and rendered as the top heading.
+title = " An example using IsolationForest for anomaly detection."
+# NOTE(review): description1-3 are not referenced anywhere in the visible
+# code; the intro Markdown in the UI carries similar text inline. Confirm
+# nothing else uses them before removing.
+description1 = "The isolation forest is an Ensemble of Isolation trees and it isolates the data points using recursive random partitioning."
+description2 = "In case of outliers the number of splits required is greater than those required for inliers."
+description3 = "We will use the toy dataset as given in the scikit-learn page for Isolation Forest."
+with gr.Blocks(title=title) as demo:
+    gr.Markdown(f" # {title}")
+    gr.Markdown(
+    """
+    The isolation forest is an ensemble of isolation trees and it isolates the data points using recursive random partitioning.
+    In case of outliers, the number of splits required is greater than those required for inliers.
+    We will use the toy dataset for our educational demo as given in the scikit-learn page for Isolation Forest.
+    """)
+    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py).")
+    loaded_model = load_hf_model_hub()
+    with gr.Tab("Visualize Input dataset"):
+        btn = gr.Button(value="Visualize input dataset")
+        with gr.Row():
+            btn.click(visualize_input_data, outputs= gr.Plot(label='Visualizing input dataset') )
+            # out = gr.Textbox(label="explaination of the loss function")
+            gr.Markdown(
+            """
+            # Data Generation
+            We generate 2 clusters one spherical and the other slightly deformed, from Standard Normal distribution
+            For the sake of consistency inliers are assigned a ground label of 1 and outliers are assigned a label -1.
+            The plot is a visualization of the clusters of the input dataset.
+            """)
+    with gr.Tab("**Plot Decision Boundary**"):
+      # btn_decision = gr.Button(value="Plot decision boundary")
+      # btn_decision.click(plot_decision_boundary, outputs= gr.Plot(label='Plot decision boundary') )
+      with gr.Row():
+        image_decision = gr.Image('./downloaded-model/decision_boundary.png')
+        gr.Markdown(
+        """
+        # Plot the Discrete Decision Boundary
+        We plot the discrete decision boundary.
+        The background colour represents whether a sample in that given area is predicted to be an outlier or not.
+        The scatter plot displays the true labels
+        """)
+    with gr.Tab("Plot Path"):
+      with gr.Row():
+        image_path = gr.Image('./downloaded-model/plot_path.png')
+        gr.Markdown(
+        """
+        # Plot the path length of the decision boundary
+        By setting the response_method="decision_function", the background of the DecisionBoundaryDisplay represents
+        the measure of the normality of an observation.
+        Normality of Observation = path length/(Number_of_forests_of_random trees) - Eqn.1
+        The RHS of the above equation Eqn.1 is given by the number of splits required to isolate a given sample
+        Such score is given by the path length averaged over a forest of random trees, which itself is given by the depth of
+        the leaf (or equivalently the number of splits)
+        required to isolate a given sample.
+        When a forest of random trees collectively produces short path lengths for isolating some particular samples,
+        they are highly likely to be anomalies and the measure of normality is close to 0.
+        Similarly, large paths correspond to values close to 1 and are more likely to be inliers.
+        """)
+    gr.Markdown( f"## Success")
+demo.launch()