File size: 6,521 Bytes
3a81273
 
 
 
 
669d333
3a81273
 
 
 
 
 
 
 
 
 
d747737
 
 
 
 
 
 
 
 
 
 
 
 
 
3a81273
 
 
ea34489
3a81273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34728e8
3a81273
 
 
 
 
 
 
aecb8c3
 
3a81273
 
 
721e7c1
aecb8c3
 
3a81273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f663429
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import joblib
import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from typing import List, Dict, Any

# Locations on disk (relative to the working directory) and the display names
# of the models the app loads. Each display name, with spaces removed, is the
# stem of the corresponding ".joblib" file inside MODEL_DIR.
MODEL_DIR: str = "models"
DATA_DIR: str = "datasets"
DATA_FILE: str = "cleaned_survey_results_public.csv"
MODEL_NAMES: List[str] = ["CatBoost Regressor", "LGBM Regressor"]

def load_models(model_names: List[str]) -> Dict[str, Any]:
    """Deserialize the named models from ``MODEL_DIR``.

    Each display name is mapped to ``<name without spaces>.joblib`` inside
    ``MODEL_DIR``. A model that fails to load is reported through
    ``st.error`` and left out of the result; loading continues with the
    remaining names.

    Args:
        model_names: Display names of the models to load.

    Returns:
        Mapping from display name to the loaded model object, containing
        only the models that loaded successfully.
    """
    loaded: Dict[str, Any] = {}
    for model_name in model_names:
        file_name = model_name.replace(' ', '') + ".joblib"
        model_path = os.path.join(MODEL_DIR, file_name)
        try:
            estimator = joblib.load(model_path)
        except Exception as exc:
            # Best-effort: surface the failure in the UI and keep going.
            st.error(f"Error loading model {model_name}: {str(exc)}")
        else:
            loaded[model_name] = estimator
    return loaded

# Deserialize every configured model once at import time.
models = load_models(MODEL_NAMES)

# Read the cleaned survey dataset from DATA_DIR.
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)

# 'Salary' is the regression target; every other column is a feature.
y = df['Salary']
X = df.drop(columns=['Salary'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=123
)

# Options offered by the sidebar select boxes: the distinct values seen in
# the dataset for each categorical column.
input_choices = {
    column: df[column].unique().tolist()
    for column in ('MainBranch', 'Country', 'EducationLevel', 'RemoteWork')
}

# Defaults and upper bounds for the numeric inputs. Upper bounds are 1.5x the
# largest value observed in the dataset.
default_comp = float(df['CompTotal'].mean())  # dataset mean of CompTotal
max_comp = float(df['CompTotal'].max() * 1.5)
default_years = 3.0  # fixed default for years of experience
max_years = float(df['YearsOfExperience'].max() * 1.5)

# Include more metrics for model performance
def load_and_predict(sample: pd.DataFrame) -> pd.DataFrame:
    """Predict the salary for ``sample`` with every loaded model.

    For each entry in the module-level ``models`` mapping, the first
    prediction for ``sample`` is recorded together with R2, mean absolute
    error and mean squared error computed on ``X_train`` / ``y_train``.
    A model that raises during prediction is reported through ``st.error``
    and skipped.

    Args:
        sample: Feature frame for the developer to score; only the first
            predicted value is used.

    Returns:
        One row per successful model, sorted by ``'R2 Score (%)'`` in
        descending order with a fresh index. When no model produced a
        result, an empty frame with the same columns is returned.
    """
    result_columns = [
        'Model',
        'Predicted Salary',
        'R2 Score (%)',
        'Mean Absolute Error',
        'Mean Squared Error',
    ]
    results = []

    for name, model in models.items():
        try:
            salary_pred = model.predict(sample)[0]
            # NOTE(review): these metrics are measured on the training split;
            # the held-out X_test / y_test split is not used here — confirm
            # this is the intended figure to show users.
            y_train_pred = model.predict(X_train)

            results.append({
                'Model': name,
                'Predicted Salary': salary_pred,
                'R2 Score (%)': r2_score(y_train, y_train_pred) * 100,
                'Mean Absolute Error': mean_absolute_error(y_train, y_train_pred),
                'Mean Squared Error': mean_squared_error(y_train, y_train_pred),
            })
        except Exception as e:
            st.error(f"Error during prediction with model {name}: {str(e)}")

    # Passing the columns explicitly keeps 'R2 Score (%)' present even when
    # `results` is empty; without it sort_values raises KeyError.
    results_df = pd.DataFrame(results, columns=result_columns)
    return results_df.sort_values(by='R2 Score (%)', ascending=False).reset_index(drop=True)

# Streamlit UI setup
# NOTE(review): Streamlit documents set_page_config as the first Streamlit
# command on a page. load_models() runs earlier in this script and calls
# st.error when a model fails to load — confirm the deployed Streamlit version
# tolerates that ordering.
# The icon/title emoji had been mis-decoded into mojibake; it is the
# money-mouth face (U+1F911).
st.set_page_config(page_title="Developer Salary Prediction App", page_icon="🤑", layout="wide")
st.title("🤑 **Developer Salary Prediction**")

# Sidebar inputs: categorical fields offer the values found in the dataset,
# numeric fields are bounded by the pre-computed maxima.
st.sidebar.header("Input Information")
mainbranch = st.sidebar.selectbox("**MainBranch**", options=input_choices['MainBranch'])
country = st.sidebar.selectbox("**Country**", options=input_choices['Country'])
educationlevel = st.sidebar.selectbox("**Education Level**", options=input_choices['EducationLevel'])
remotework = st.sidebar.selectbox("**Remote Work**", options=input_choices['RemoteWork'])
comptotal = st.sidebar.number_input("**CompTotal**", min_value=0.0, max_value=max_comp, value=default_comp)
yearsofexperience = st.sidebar.number_input("**Years of Experience**", min_value=0.0, max_value=max_years, value=default_years)

# Handling predictions: build a one-row frame from the sidebar values and
# score it with every loaded model when the button is pressed.
if st.sidebar.button(label=':rainbow[Predict Salary]'):
    input_data = pd.DataFrame(
        [[mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience]],
        columns=['MainBranch', 'Country', 'EducationLevel', 'RemoteWork', 'CompTotal', 'YearsOfExperience'])

    results_df = load_and_predict(input_data)

    # Nothing is shown when no model produced a result.
    if not results_df.empty:
        st.write("### Prediction Results:")
        st.dataframe(results_df)

# Disclaimer Section
# The model list in the description mirrors MODEL_NAMES (CatBoost and LGBM);
# keep the two in sync when models are added or removed.
st.markdown("---")
st.text('''
    >> Developer Salary Prediction App <<
    This Streamlit application predicts developer salary using multiple machine learning models including CatBoost and LGBM regressors. 
    Users can input developer information through a user-friendly interface, which includes fields such as country, education level, and years of experience.
        
    > Features:
        **Input Components**: 
        - **MainBranch**: Select your main area of expertise in development, such as software engineering, data science, or web development. This selection may influence salary expectations based on the branch's demand and trends.
        
        - **Country**: Choose your country from the dropdown list. Regions often exhibit varying salary scales due to economic factors, the cost of living, and market demand for tech workers.
        
        - **Education Level**: Indicate the highest level of education you have completed. Higher educational qualifications often correlate with higher earning potential in the tech industry.
        
        - **Remote Work**: Specify whether you primarily work remotely, on-site, or in a hybrid setup. Remote work setups can affect salary offers, especially if hiring companies are based in different geographic areas.
        
        - **CompTotal**: Enter your expected total compensation, which includes salary, bonuses, and other benefits. This field is crucial for setting a base for salary predictions and facilitates comparisons.
        
        - **Years of Experience**: Provide the number of years you've been in a coding-related job. Generally, more years of experience are associated with higher salaries due to skill accumulation and professional development.
        
        **Data Processing**: 
        - The app employs a pre-processed dataset, cleaned and prepared for model training. 
        - It utilizes features including country, education level, and years of experience for predictions.
        - Models are loaded from disk, obtaining predictions based on user-provided input.

        **Prediction**: The app performs predictions with loaded models and calculates performance metrics like R2 score.
        **Results Display**: The predicted salary and model performance metrics are presented in a user-friendly format.
        
    > Usage: 
       Fill out the developer information and click "Predict Salary" to derive insights on anticipated salary and model performance.
       
    > Disclaimer: 
       This application serves educational purposes. Predictions are not guaranteed to be accurate.
''')