|
import gradio as gr |
|
import numpy as np |
|
from sklearn.datasets import load_diabetes |
|
from sklearn.linear_model import RidgeCV |
|
from sklearn.feature_selection import SelectFromModel |
|
from time import time |
|
from sklearn.feature_selection import SequentialFeatureSelector |
|
import matplotlib |
|
matplotlib.use("Agg") |
|
import matplotlib.pyplot as plt |
|
|
|
def select_features(method, num_features):
    """Select features of the diabetes dataset with the requested strategy.

    A ``RidgeCV`` estimator is fitted on the full dataset and then used either
    for importance-based selection (``SelectFromModel``) or for greedy
    sequential selection (``SequentialFeatureSelector``).

    Args:
        method: ``"model"``, ``"sfs-forward"`` or ``"sfs-backward"``. Any other
            value (including ``None``) produces a prompt message instead of
            running a selection.
        num_features: Number of features to keep; coerced with ``int``.

    Returns:
        A ``(message, figure)`` tuple. ``message`` is a human-readable string
        naming the selected features and the time the selector's ``fit`` took.
        ``figure`` is a matplotlib figure with the coefficient bar chart for
        ``"model"`` and ``None`` in every other case.
    """
    sfs_directions = {"sfs-forward": "forward", "sfs-backward": "backward"}

    # Validate before doing any work: without this guard an unrecognised
    # method falls through every branch and the return statement fails on
    # unbound locals.
    if method != "model" and method not in sfs_directions:
        return "Please select a feature selection method.", None

    n_selected = int(num_features)

    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target
    feature_names = np.array(diabetes.feature_names)
    ridge = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X, y)

    fig = None
    if method == "model":
        tic = time()
        # threshold=-inf disables the importance cut-off, so max_features
        # alone decides how many features are kept.
        selector = SelectFromModel(
            ridge, threshold=-np.inf, max_features=n_selected
        ).fit(X, y)
        toc = time()

        fig, ax = plt.subplots()
        ax.bar(height=np.abs(ridge.coef_), x=feature_names)
        ax.set_title("Feature importances via coefficients")
        ax.set_ylabel("Importance coefficient")
        ax.set_xlabel("Features")
        # Drop the figure from pyplot's global registry so repeated callbacks
        # do not accumulate open figures; the Figure object stays usable.
        plt.close(fig)
    else:
        tic = time()
        try:
            selector = SequentialFeatureSelector(
                ridge,
                n_features_to_select=n_selected,
                direction=sfs_directions[method],
            ).fit(X, y)
        except ValueError as err:
            # NOTE(review): recent scikit-learn releases appear to require
            # n_features_to_select < n_features; report instead of crashing.
            return f"Sequential feature selection failed: {err}", None
        toc = time()

    selected_features = feature_names[selector.get_support()]
    execution_time = toc - tic
    return f"Selected the following features: {', '.join(selected_features)} in {execution_time:.3f} seconds", fig
|
|
|
title = "Selecting features with Sequential Feature Selection"

# Build the demo page: an explanatory header, the two input controls, and the
# text/plot outputs filled in by select_features.
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown("""
This app demonstrates feature selection techniques using model based selection and sequential feature selection.\n\n
Model based selection is based on feature importance. Each feature is assigned a score on how much influence they have on the model output.
The feature with highest score is considered the most important feature.\n\n
Sequential feature selection is based on greedy approach. In greedy approach, the feature is added or removed to the selected features at each iteration
based on the model performance score.\n\n
This app uses Ridge estimator and the diabetes dataset from sklearn. Diabetes dataset consist of quantitative measure of diabetes progression and
10 following variables obtained from 442 diabetes patients:
1. Age (age)
2. Sex (sex)
3. Body mass index (bmi)
4. Average blood pressure (bp)
5. Total serum cholesterol (s1)
6. Low-density lipoproteins (s2)
7. High-density lipoproteins (s3)
8. Total cholesterol / HDL (s4)
9. Possibly log of serum triglycerides level (s5)
10. Blood sugar level (s6)\n\n
This app is developed based on [scikit-learn example](https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html#sphx-glr-auto-examples-feature-selection-plot-select-from-model-diabetes-py)
""")

    # Start with a method selected so the callback never receives None.
    method = gr.Radio(
        ["model", "sfs-forward", "sfs-backward"], value="model", label="Method"
    )
    num_features = gr.Slider(minimum=2, maximum=10, step=1, label="Number of features")
    output = gr.Textbox(label="Selected features")
    plot = gr.Plot(label="Feature importance plot")

    # Recompute when either control changes; wiring only the slider would
    # leave the outputs stale after the user picks a different method.
    for control in (method, num_features):
        control.change(
            fn=select_features,
            inputs=[method, num_features],
            outputs=[output, plot],
        )

demo.launch()
|
|
|
|