File size: 6,521 Bytes
3a81273 669d333 3a81273 d747737 3a81273 ea34489 3a81273 34728e8 3a81273 aecb8c3 3a81273 721e7c1 aecb8c3 3a81273 f663429 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import os
import joblib
import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from typing import List, Dict, Any
# Where the serialized models and the survey dataset live (relative paths).
MODEL_DIR = "models"
DATA_DIR = "datasets"
DATA_FILE = "cleaned_survey_results_public.csv"

# Display names of the models the app loads and reports on.
MODEL_NAMES = ["CatBoost Regressor", "LGBM Regressor"]
def load_models(model_names: List[str]) -> Dict[str, Any]:
    """Read the named models from ``MODEL_DIR`` and return them keyed by name.

    A model's file is its display name with the spaces removed plus a
    ``.joblib`` suffix. A model that fails to load is reported through
    ``st.error`` and left out of the result; the remaining models are still
    loaded.
    """
    loaded: Dict[str, Any] = {}
    for name in model_names:
        filename = f"{name.replace(' ', '')}.joblib"
        model_path = os.path.join(MODEL_DIR, filename)
        try:
            model = joblib.load(model_path)
        except Exception as e:
            st.error(f"Error loading model {name}: {str(e)}")
        else:
            # Only successfully loaded models make it into the mapping.
            loaded[name] = model
    return loaded
# Load every configured model from disk.
models = load_models(MODEL_NAMES)

# Read the cleaned survey data.
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)

# Separate the target column from the features, then hold out 10% of the rows
# with a fixed seed so the split is reproducible.
y = df['Salary']
X = df.drop(columns=['Salary'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=123
)

# Distinct values found in the data for each categorical sidebar input.
input_choices = {
    column: df[column].unique().tolist()
    for column in ('MainBranch', 'Country', 'EducationLevel', 'RemoteWork')
}

# Defaults and upper bounds for the numeric sidebar inputs. The bounds are
# 1.5x the largest value seen in the data.
default_comp = float(df['CompTotal'].mean())  # Default CompTotal
max_comp = float(df['CompTotal'].max() * 1.5)
default_years = 3.0  # Default years of experience
max_years = float(df['YearsOfExperience'].max() * 1.5)
# Prediction plus per-model performance metrics
def load_and_predict(sample: pd.DataFrame) -> pd.DataFrame:
    """Predict the salary for ``sample`` with every loaded model.

    For each entry of the module-level ``models`` dict, the model predicts the
    salary for the first row of ``sample`` and is scored against
    ``X_train`` / ``y_train`` (R2 as a percentage, MAE and MSE). A model that
    raises during prediction or scoring is reported through ``st.error`` and
    skipped.

    Args:
        sample: Feature frame; only the prediction for its first row is
            reported.

    Returns:
        One row per model that succeeded, sorted by ``'R2 Score (%)'`` in
        descending order with a fresh 0..n-1 index. If no model produced a
        result, the frame is empty but still carries the result columns.
    """
    result_columns = [
        'Model',
        'Predicted Salary',
        'R2 Score (%)',
        'Mean Absolute Error',
        'Mean Squared Error',
    ]
    results = []
    for name, model in models.items():
        try:
            salary_pred = model.predict(sample)[0]
            # NOTE(review): the metrics are computed on the training split and
            # X_test / y_test are not used in the visible code; training
            # scores tend to overstate accuracy - confirm this is intended.
            y_train_pred = model.predict(X_train)
            results.append({
                'Model': name,
                'Predicted Salary': salary_pred,
                'R2 Score (%)': r2_score(y_train, y_train_pred) * 100,
                'Mean Absolute Error': mean_absolute_error(y_train, y_train_pred),
                'Mean Squared Error': mean_squared_error(y_train, y_train_pred),
            })
        except Exception as e:
            st.error(f"Error during prediction with model {name}: {str(e)}")

    if not results:
        # A frame built from an empty list has no columns, so sorting by
        # 'R2 Score (%)' would raise KeyError before the caller can check
        # ``.empty``. Return an empty frame with the expected columns instead.
        return pd.DataFrame(columns=result_columns)

    return (
        pd.DataFrame(results)
        .sort_values(by='R2 Score (%)', ascending=False)
        .reset_index(drop=True)
    )
# Streamlit UI setup
# NOTE(review): on Streamlit versions that require set_page_config to be the
# first Streamlit command, an st.error emitted earlier (e.g. while loading
# models) would trip that check - confirm against the deployed version.
# The icon is the robot-face emoji; it had been stored as mis-decoded bytes.
st.set_page_config(page_title="Developer Salary Prediction App", page_icon="🤖", layout="wide")
st.title("🤖 **Developer Salary Prediction**")

# Sidebar inputs: categorical fields offer the values found in the dataset,
# numeric fields are bounded by the pre-computed maxima.
st.sidebar.header("Input Information")
mainbranch = st.sidebar.selectbox("**MainBranch**", options=input_choices['MainBranch'])
country = st.sidebar.selectbox("**Country**", options=input_choices['Country'])
educationlevel = st.sidebar.selectbox("**Education Level**", options=input_choices['EducationLevel'])
remotework = st.sidebar.selectbox("**Remote Work**", options=input_choices['RemoteWork'])
comptotal = st.sidebar.number_input("**CompTotal**", min_value=0.0, max_value=max_comp, value=default_comp)
yearsofexperience = st.sidebar.number_input("**Years of Experience**", min_value=0.0, max_value=max_years, value=default_years)

# Handling predictions: build a one-row frame from the sidebar values and
# show the per-model results only when at least one model produced a row.
if st.sidebar.button(label=':rainbow[Predict Salary]'):
    input_data = pd.DataFrame(
        [[mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience]],
        columns=['MainBranch', 'Country', 'EducationLevel', 'RemoteWork', 'CompTotal', 'YearsOfExperience'])
    results_df = load_and_predict(input_data)
    if not results_df.empty:
        st.write("### Prediction Results:")
        st.dataframe(results_df)
# Disclaimer Section
st.markdown("---")
# The description names the models actually listed in MODEL_NAMES
# (CatBoost and LGBM); it previously mentioned models the app does not load.
st.text('''
>> Developer Salary Prediction App <<
This Streamlit application predicts developer salary using multiple machine learning models, namely the CatBoost and LGBM regressors.
Users can input developer information through a user-friendly interface, which includes fields such as country, education level, and years of experience.
> Features:
**Input Components**:
- **MainBranch**: Select your main area of expertise in development, such as software engineering, data science, or web development. This selection may influence salary expectations based on the branch's demand and trends.
- **Country**: Choose your country from the dropdown list. Regions often exhibit varying salary scales due to economic factors, the cost of living, and market demand for tech workers.
- **Education Level**: Indicate the highest level of education you have completed. Higher educational qualifications often correlate with higher earning potential in the tech industry.
- **Remote Work**: Specify whether you primarily work remotely, on-site, or in a hybrid setup. Remote work setups can affect salary offers, especially if hiring companies are based in different geographic areas.
- **CompTotal**: Enter your expected total compensation, which includes salary, bonuses, and other benefits. This field is crucial for setting a base for salary predictions and facilitates comparisons.
- **Years of Experience**: Provide the number of years you've been in a coding-related job. Generally, more years of experience are associated with higher salaries due to skill accumulation and professional development.
**Data Processing**:
- The app employs a pre-processed dataset, cleaned and prepared for model training.
- It utilizes features including country, education level, and years of experience for predictions.
- Models are loaded from disk, obtaining predictions based on user-provided input.
**Prediction**: The app performs predictions with loaded models and calculates performance metrics like R2 score.
**Results Display**: The predicted salary and model performance metrics are presented in a user-friendly format.
> Usage:
Fill out the developer information and click "Predict Salary" to derive insights on anticipated salary and model performance.
> Disclaimer:
This application serves educational purposes. Predictions are not guaranteed to be accurate.
''')