Spaces:

sklearn-docs
/

IsolationForest-Model-for-Anomaly-Detection

Runtime error

App Files Files Community

IsolationForest-Model-for-Anomaly-Detection / app.py

merve HF staff

Update app.py

9e3ec65 almost 2 years ago

raw

history blame contribute delete

5.27 kB

	import os

	import pandas as pd
	from sklearn.ensemble import IsolationForest

	import numpy as np
	from sklearn.model_selection import train_test_split
	import gradio as gr
	import matplotlib.pyplot as plt
	from skops import hub_utils
	import pickle
	import time



	# Data preparation: two Gaussian inlier blobs plus uniformly scattered noise.
	n_samples, n_outliers = 120, 40
	rng = np.random.RandomState(0)

	# NOTE: the three draws below share one seeded generator, so they must stay
	# in this order -- reordering them would change every sampled point.
	covariance = np.array([[0.5, -0.1], [0.7, 0.4]])
	# Blob sheared by `covariance` and shifted to (2, 2).
	cluster_1 = (0.4 * rng.randn(n_samples, 2)) @ covariance + np.array([2, 2])
	# Unsheared blob shifted to (-2, -2).
	cluster_2 = 0.3 * rng.randn(n_samples, 2) + np.array([-2, -2])
	# Noise drawn uniformly between -4 and 4 on both axes.
	outliers = rng.uniform(low=-4, high=4, size=(n_outliers, 2))

	# Feature matrix: 120 + 120 + 40 = 280 rows, 2 columns.
	X = np.vstack((cluster_1, cluster_2, outliers))
	# Ground truth: +1 for the 2 * n_samples inlier rows, -1 for the outlier rows.
	y = np.repeat(np.array([1, -1], dtype=int), [2 * n_samples, n_outliers])

	def load_hf_model_hub():
	    """Download the pretrained model repository and return the unpickled model.

	    The Hub repository ``sklearn-docs/anomaly-detection`` is fetched into the
	    local ``downloaded-model`` directory (only when the model file is not
	    already there), and ``isolation_forest.pkl`` inside it is unpickled.

	    Returns:
	        The object stored in ``isolation_forest.pkl``.
	    """
	    repo_id = "sklearn-docs/anomaly-detection"
	    download_repo = "downloaded-model"
	    # Build the path from `download_repo` so the two cannot drift apart.
	    model_path = os.path.join(".", download_repo, "isolation_forest.pkl")

	    # Skip the download when a previous run already fetched the model.
	    # NOTE(review): skops' download appears to reject a non-empty `dst`,
	    # which would make every re-run in the same directory fail -- confirm.
	    if not os.path.isfile(model_path):
	        hub_utils.download(repo_id=repo_id, dst=download_repo)
	        # Pause kept from the original implementation.
	        # TODO(review): confirm whether it is still needed.
	        time.sleep(2)

	    # SECURITY: unpickling executes arbitrary code. Only load files from a
	    # repository you trust.
	    with open(model_path, "rb") as model_file:
	        return pickle.load(model_file)

	#Visualize the data as a scatter plot

	def visualize_input_data():
	    """Draw the toy dataset as a scatter plot coloured by true class.

	    Reads the module-level ``X`` (first two columns are plotted) and ``y``
	    (used as the colour of each point).

	    Returns:
	        The matplotlib figure (figure number 1) containing the plot.
	    """
	    fig = plt.figure(1, facecolor="w", figsize=(5, 5))
	    # plt.figure(1) hands back the same figure on every call. Clear it so
	    # repeated calls do not stack scatters and legends on the old axes.
	    fig.clf()
	    ax = fig.add_subplot(111)

	    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
	    # Only the handles are needed; the legend text is supplied explicitly.
	    handles, _ = scatter.legend_elements()
	    ax.axis("square")
	    ax.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
	    ax.set_title("Gaussian inliers with \nuniformly distributed outliers")
	    return fig





	# Page title; also passed to gr.Blocks and rendered as the top heading.
	title = " An example using IsolationForest for anomaly detection."
	# Short descriptive sentences about the demo.
	description1 = (
	    "The isolation forest is an Ensemble of Isolation trees and it isolates "
	    "the data points using recursive random partitioning."
	)
	description2 = (
	    "In case of outliers the number of splits required is greater than "
	    "those required for inliers."
	)
	description3 = (
	    "We will use the toy dataset as given in the scikit-learn page "
	    "for Isolation Forest."
	)

	# Gradio UI: a heading and intro, then three tabs (input data, decision
	# boundary image, path-length image).
	# NOTE(review): the nesting below was reconstructed from syntax because the
	# original indentation was lost; confirm what each gr.Row should contain.
	with gr.Blocks(title=title) as demo:
	    gr.Markdown(f" # {title}")
	    gr.Markdown(
	        """
	The isolation forest is an ensemble of isolation trees and it isolates the data points using recursive random partitioning.
	In case of outliers, the number of splits required is greater than those required for inliers.
	We will use the toy dataset for our educational demo as given in the scikit-learn page for Isolation Forest.

	""")

	    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html#sphx-glr-auto-examples-ensemble-plot-isolation-forest-py).")

	    # The returned model is not used in this block. The call also downloads
	    # into ./downloaded-model, where the image tabs below read their PNGs,
	    # so it must run before those gr.Image components are created.
	    loaded_model = load_hf_model_hub()

	    with gr.Tab("Visualize Input dataset"):
	        btn = gr.Button(value="Visualize input dataset")
	        with gr.Row():
	            # Create the output component explicitly so the callback's
	            # target is easy to see.
	            input_plot = gr.Plot(label='Visualizing input dataset')
	            btn.click(visualize_input_data, outputs=input_plot)
	            gr.Markdown(
	                """
	## Data Generation
	We generate 2 clusters one spherical and the other slightly deformed, from Standard Normal distribution
	For the sake of consistency inliers are assigned a ground label of 1 and outliers are assigned a label -1.
	The plot is a visualization of the clusters of the input dataset.

	""")

	    with gr.Tab("Plot Decision Boundary"):
	        with gr.Row():
	            image_decision = gr.Image('./downloaded-model/decision_boundary.png')
	            gr.Markdown(
	                """
	## Plot the Discrete Decision Boundary
	We plot the discrete decision boundary.
	The background colour represents whether a sample in that given area is predicted to be an outlier or not.
	The scatter plot displays the true labels

	""")

	    with gr.Tab("Plot Path"):
	        with gr.Row():
	            image_path = gr.Image('./downloaded-model/plot_path.png')
	            gr.Markdown(
	                """
	## Plot the path length of the decision boundary
	By setting the `response_method="decision_function"`, the background of the `DecisionBoundaryDisplay` represents
	the measure of the normality of an observation.

	Normality of Observation = Path Length/Number of Forests of Random Trees


	The RHS of the above equation is given by the number of splits required to isolate a given sample
	Such score is given by the path length averaged over a forest of random trees, which itself is given by the depth of
	the leaf (or equivalently the number of splits) required to isolate a given sample.

	When a forest of random trees collectively produces short path lengths for isolating some particular samples,
	they are more likely to have anomalies, and the measure of normality is close to 0.
	Similarly, large paths correspond to values close to 1 and are more likely to be inliers.

	""")

	# Start the server only when run as a script, so importing this module
	# (for tests or tooling) does not launch it.
	if __name__ == "__main__":
	    demo.launch()