Jayabalambika committed on
Commit
f1346cf
·
1 Parent(s): 776c009

Create app.py

Browse files

incorporated review comments

Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pandas as pd
4
+ from sklearn.ensemble import IsolationForest
5
+
6
+ import numpy as np
7
+ from sklearn.model_selection import train_test_split
8
+ import gradio as gr
9
+ import matplotlib.pyplot as plt
10
+ from skops import hub_utils
11
+ import pickle
12
+ import time
13
+
14
+
15
+
16
# Data preparation: two Gaussian inlier clusters plus uniformly scattered
# outliers, all drawn from one seeded generator so the dataset is reproducible.
n_samples, n_outliers = 120, 40
rng = np.random.RandomState(0)
covariance = np.array([[0.5, -0.1], [0.7, 0.4]])

# General deformed cluster: scaled noise pushed through `covariance`,
# then shifted to (2, 2).
cluster_1 = (0.4 * rng.randn(n_samples, 2)) @ covariance + np.array([2, 2])
# Spherical cluster: scaled noise shifted to (-2, -2).
cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])
# Outliers: uniform over the square [-4, 4) x [-4, 4).
outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))

# Stack in the order cluster_1, cluster_2, outliers: 120 + 120 + 40 = 280 rows.
X = np.concatenate((cluster_1, cluster_2, outliers))
# Ground-truth labels in the same order: 1 for inliers, -1 for outliers.
y = np.concatenate(
    (
        np.full(2 * n_samples, 1, dtype=int),
        np.full(n_outliers, -1, dtype=int),
    )
)
28
+
29
def load_hf_model_hub():
    """Download the pretrained model repository and return the unpickled model.

    The files of the ``sklearn-docs/anomaly-detection`` repository are
    downloaded into the local ``downloaded-model`` directory, then
    ``isolation_forest.pkl`` from that directory is unpickled.

    Returns:
        The object stored in ``isolation_forest.pkl``.
    """
    repo_id = "sklearn-docs/anomaly-detection"
    download_repo = "downloaded-model"
    hub_utils.download(repo_id=repo_id, dst=download_repo)
    # NOTE(review): pause kept from the original code; presumably it lets the
    # download settle before the files are read -- confirm it is still needed.
    time.sleep(2)

    # Build the path from the download directory so the two cannot drift apart.
    model_path = os.path.join(download_repo, "isolation_forest.pkl")
    # SECURITY: unpickling can execute arbitrary code; this is only acceptable
    # while the repository named above is trusted.
    with open(model_path, "rb") as model_file:
        # The context manager closes the handle even if unpickling fails.
        return pickle.load(model_file)
40
+
41
+ #Visualize the data as a scatter plot
42
+
43
def visualize_input_data():
    """Draw the input dataset as a scatter plot coloured by its true class.

    Reads the module-level arrays ``X`` (point coordinates) and ``y``
    (labels used for the colours).

    Returns:
        The matplotlib figure containing the scatter plot.
    """
    fig = plt.figure(1, facecolor="w", figsize=(5, 5))
    # Figure number 1 is reused on every call, so wipe it first; otherwise each
    # call would stack another scatter on top of the previous ones.
    fig.clf()
    ax = fig.add_subplot(111)

    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    # Only the handles are needed; the generated labels are replaced below.
    handles, _ = scatter.legend_elements()
    ax.axis("square")
    ax.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
    ax.set_title("Gaussian inliers with \nuniformly distributed outliers")
    return fig
51
+
52
+
53
+
54
+
55
+
56
# Text shown in the page title and the top-level heading of the demo.
title = " An example using IsolationForest for anomaly detection."
description1 = "The isolation forest is an Ensemble of Isolation trees and it isolates the data points using recursive random partitioning."
description2 = "In case of outliers the number of splits required is greater than those required for inliers."
description3 = "We will use the toy dataset as given in the scikit-learn page for Isolation Forest."

# Build the Gradio interface: an introduction followed by three tabs.
with gr.Blocks(title=title) as demo:

    gr.Markdown(f" # {title}")
    gr.Markdown(
        """
        The isolation forest is an ensemble of isolation trees and it isolates the data points using recursive random partitioning.
        In case of outliers, the number of splits required is greater than those required for inliers.
        We will use the toy dataset for our educational demo as given in the scikit-learn page for Isolation Forest.

        """)

    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py).")

    # Must run before the image components below are created: the download
    # fills ./downloaded-model/, which is where the PNG files are read from.
    loaded_model = load_hf_model_hub()

    with gr.Tab("Visualize Input dataset"):
        btn = gr.Button(value="Visualize input dataset")
        with gr.Row():
            # Clicking the button renders the scatter plot into a new Plot.
            btn.click(visualize_input_data, outputs=gr.Plot(label='Visualizing input dataset'))
            gr.Markdown(
                """
                # Data Generation
                We generate 2 clusters one spherical and the other slightly deformed, from Standard Normal distribution
                For the sake of consistency inliers are assigned a ground label of 1 and outliers are assigned a label -1.
                The plot is a visualization of the clusters of the input dataset.

                """)

    # Plain-text label, consistent with the other tabs (no Markdown markers).
    with gr.Tab("Plot Decision Boundary"):
        with gr.Row():
            image_decision = gr.Image('./downloaded-model/decision_boundary.png')
            gr.Markdown(
                """
                # Plot the Discrete Decision Boundary
                We plot the discrete decision boundary.
                The background colour represents whether a sample in that given area is predicted to be an outlier or not.
                The scatter plot displays the true labels

                """)

    with gr.Tab("Plot Path"):
        with gr.Row():
            image_path = gr.Image('./downloaded-model/plot_path.png')
            gr.Markdown(
                """
                # Plot the path length of the decision boundary
                By setting the response_method="decision_function", the background of the DecisionBoundaryDisplay represents
                the measure of the normality of an observation.

                Normality of Observation = path length/(Number_of_forests_of_random trees) - Eqn.1


                The RHS of the above equation Eqn.1 is given by the number of splits required to isolate a given sample
                Such score is given by the path length averaged over a forest of random trees, which itself is given by the depth of
                the leaf (or equivalently the number of splits)
                required to isolate a given sample.

                When a forest of random trees collectively produces short path lengths for isolating some particular samples,
                they are highly likely to be anomalies and the measure of normality is close to 0.
                Similarly, large paths correspond to values close to 1 and are more likely to be inliers.

                """)

    gr.Markdown("## Success")

demo.launch()