Update app.py
Browse files
app.py
CHANGED
@@ -1,137 +1,135 @@
|
|
1 |
-
import os
|
2 |
-
import joblib
|
3 |
-
import pandas as pd
|
4 |
-
import streamlit as st
|
5 |
-
from sklearn.model_selection import train_test_split
|
6 |
-
from sklearn.metrics import r2_score
|
7 |
-
from typing import List, Dict, Any
|
8 |
-
|
9 |
-
# Constants for directories and file names
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
]
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
'
|
47 |
-
'
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
'
|
70 |
-
|
71 |
-
|
72 |
-
})
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
st.sidebar.
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
- **
|
113 |
-
|
114 |
-
- **
|
115 |
-
|
116 |
-
- **
|
117 |
-
|
118 |
-
- **
|
119 |
-
|
120 |
-
- **
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
> Disclaimer:
|
136 |
-
This application serves educational purposes. Predictions are not guaranteed to be accurate.
|
137 |
''')
|
|
|
1 |
+
import os
|
2 |
+
import joblib
|
3 |
+
import pandas as pd
|
4 |
+
import streamlit as st
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
from sklearn.metrics import r2_score
|
7 |
+
from typing import List, Dict, Any
|
8 |
+
|
9 |
+
# Constants for directories and file names
|
10 |
+
# Directory holding the serialized models, and the dataset's directory / file name.
MODEL_DIR = "models"
DATA_DIR = "datasets"
DATA_FILE = "cleaned_survey_results_public.csv"

# Display names of the regressors the app loads and compares.
MODEL_NAMES = ["CatBoost Regressor", "XGBoost Regressor", "LGBM Regressor"]
|
18 |
+
|
19 |
+
def load_models(model_names: List[str]) -> Dict[str, Any]:
    """Deserialize each named model from ``MODEL_DIR``.

    The file for a model is its display name with the spaces removed plus a
    ``.joblib`` suffix. A model that cannot be loaded is reported through
    ``st.error`` and left out of the result.

    Args:
        model_names: Display names of the models to load.

    Returns:
        Mapping from display name to the loaded model object, containing
        only the models that loaded successfully.
    """
    loaded: Dict[str, Any] = {}
    for model_name in model_names:
        file_name = model_name.replace(' ', '') + ".joblib"
        model_path = os.path.join(MODEL_DIR, file_name)
        try:
            estimator = joblib.load(model_path)
        except Exception as exc:
            # Report the failure in the UI and carry on with the other models.
            st.error(f"Error loading model {model_name}: {exc}")
            continue
        loaded[model_name] = estimator
    return loaded
|
29 |
+
|
30 |
+
# Load models
|
31 |
+
# Load every configured model; load_models reports failures itself
models = load_models(MODEL_NAMES)

# Read the survey dataset
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)

# Separate the 'Salary' target from the predictors and hold out 10% of the rows
y = df['Salary']
X = df.drop(columns=['Salary'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)

# Distinct values found in the data for each categorical input
input_choices = {
    column: df[column].unique().tolist()
    for column in ('MainBranch', 'Country', 'EducationLevel', 'RemoteWork')
}

# Default values and upper bounds (1.5x the observed maximum) for the numeric inputs
default_comp = float(df['CompTotal'].mean())
max_comp = float(df['CompTotal'].max() * 1.5)
default_years = 3.0
max_years = float(df['YearsOfExperience'].max() * 1.5)

# Each model's predictions on the training split, computed at script start
y_train_predictions = {
    model_name: estimator.predict(X_train)
    for model_name, estimator in models.items()
}
|
58 |
+
|
59 |
+
def load_and_predict(sample: pd.DataFrame) -> pd.DataFrame:
    """Predict the salary for ``sample`` with every loaded model.

    Each model in the module-level ``models`` mapping predicts on ``sample``
    and the first predicted value is kept. A model whose prediction raises is
    reported through ``st.error`` and skipped.

    Args:
        sample: Frame passed unchanged to each model's ``predict``.

    Returns:
        A frame with columns ``Model``, ``Predicted Salary`` and
        ``R2 Score (%)``, sorted by R2 score in descending order with a fresh
        index. The frame is empty (but still has those columns) when no model
        produced a prediction.
    """
    result_columns = ['Model', 'Predicted Salary', 'R2 Score (%)']
    results = []

    for name, model in models.items():
        try:
            salary_pred = model.predict(sample)[0]
            results.append({
                'Model': name,
                'Predicted Salary': salary_pred,
                # NOTE(review): this R2 is computed on the training split
                # (y_train vs. y_train_predictions), not on held-out data.
                'R2 Score (%)': r2_score(y_train, y_train_predictions[name]) * 100,
            })
        except Exception as e:
            st.error(f"Error during prediction with model {name}: {str(e)}")

    # Naming the columns explicitly keeps the sort key present even when
    # ``results`` is empty; without it sort_values raises KeyError.
    results_df = pd.DataFrame(results, columns=result_columns)
    return results_df.sort_values(by='R2 Score (%)', ascending=False).reset_index(drop=True)
|
75 |
+
|
76 |
+
# Streamlit UI setup
|
77 |
+
# Page configuration and title
st.set_page_config(page_title="Developer Salary Prediction App", page_icon="🤑", layout="wide")
st.title("🤑 **Developer Salary Prediction**")

# Sidebar: one widget per model feature, followed by the predict button
with st.sidebar:
    st.header("Input Information")
    mainbranch = st.selectbox("**MainBranch**", options=input_choices['MainBranch'])
    country = st.selectbox("**Country**", options=input_choices['Country'])
    educationlevel = st.selectbox("**Education Level**", options=input_choices['EducationLevel'])
    remotework = st.selectbox("**Remote Work**", options=input_choices['RemoteWork'])
    comptotal = st.number_input("**CompTotal**", min_value=0.0, max_value=max_comp, value=default_comp)
    yearsofexperience = st.number_input("**Years of Experience**", min_value=0.0, max_value=max_years, value=default_years)
    predict_clicked = st.button(label=':rainbow[Predict Salary]')

# On click, build a one-row frame from the inputs and show each model's prediction
if predict_clicked:
    feature_names = ['MainBranch', 'Country', 'EducationLevel', 'RemoteWork', 'CompTotal', 'YearsOfExperience']
    feature_values = [mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience]
    input_data = pd.DataFrame([feature_values], columns=feature_names)

    results_df = load_and_predict(input_data)

    # Nothing is rendered when no model returned a prediction
    if not results_df.empty:
        st.write("### Prediction Results:")
        st.dataframe(results_df)
|
100 |
+
|
101 |
+
# Disclaimer Section
|
102 |
+
st.markdown("---")
|
103 |
+
st.text('''
|
104 |
+
>> Developer Salary Prediction App <<
|
105 |
+
This Streamlit application predicts developer salary using multiple machine learning models including CatBoost, XGBoost, and LGBM regressors.
|
106 |
+
Users can input developer information through a user-friendly interface, which includes fields such as country, education level, and years of experience.
|
107 |
+
|
108 |
+
> Features:
|
109 |
+
**Input Components**:
|
110 |
+
- **MainBranch**: Select your main area of expertise in development, such as software engineering, data science, or web development. This selection may influence salary expectations based on the branch's demand and trends.
|
111 |
+
|
112 |
+
- **Country**: Choose your country from the dropdown list. Regions often exhibit varying salary scales due to economic factors, the cost of living, and market demand for tech workers.
|
113 |
+
|
114 |
+
- **Education Level**: Indicate the highest level of education you have completed. Higher educational qualifications often correlate with higher earning potential in the tech industry.
|
115 |
+
|
116 |
+
- **Remote Work**: Specify whether you primarily work remotely, on-site, or in a hybrid setup. Remote work setups can affect salary offers, especially if hiring companies are based in different geographic areas.
|
117 |
+
|
118 |
+
- **CompTotal**: Enter your expected total compensation, which includes salary, bonuses, and other benefits. This field is crucial for setting a base for salary predictions and facilitates comparisons.
|
119 |
+
|
120 |
+
- **Years of Experience**: Provide the number of years you've been in a coding-related job. Generally, more years of experience are associated with higher salaries due to skill accumulation and professional development.
|
121 |
+
|
122 |
+
**Data Processing**:
|
123 |
+
- The app employs a pre-processed dataset, cleaned and prepared for model training.
|
124 |
+
- It utilizes features including country, education level, and years of experience for predictions.
|
125 |
+
- Models are loaded from disk, obtaining predictions based on user-provided input.
|
126 |
+
|
127 |
+
**Prediction**: The app performs predictions with loaded models and calculates performance metrics like R2 score.
|
128 |
+
**Results Display**: The predicted salary and model performance metrics are presented in a user-friendly format.
|
129 |
+
|
130 |
+
> Usage:
|
131 |
+
Fill out the developer information and click "Predict Salary" to derive insights on anticipated salary and model performance.
|
132 |
+
|
133 |
+
> Disclaimer:
|
134 |
+
This application serves educational purposes. Predictions are not guaranteed to be accurate.
|
|
|
|
|
135 |
''')
|