AfshinMA committed on
Commit
3a81273
·
verified ·
1 Parent(s): f663429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -136
app.py CHANGED
@@ -1,137 +1,135 @@
1
- import os
2
- import joblib
3
- import pandas as pd
4
- import streamlit as st
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.metrics import r2_score
7
- from typing import List, Dict, Any
8
-
9
- # Constants for directories and file names
10
- DIR = "C:\\Users\\Afshin\\Desktop\\10_Projects\\Project_3_Developer_Salary_Prediction\\"
11
- # Constants for directories and file names
12
- MODEL_DIR = DIR + 'models'
13
- DATA_DIR = DIR + 'datasets'
14
- DATA_FILE = 'cleaned_survey_results_public_v2.csv'
15
- MODEL_NAMES = [
16
- #'CatBoost Regressor',
17
- 'XGBoost Regressor',
18
- 'LGBM Regressor',
19
- ]
20
-
21
- def load_models(model_names: List[str]) -> Dict[str, Any]:
22
- """Load machine learning models from disk."""
23
- models = {}
24
- for name in model_names:
25
- path = os.path.join(MODEL_DIR, f"{name.replace(' ', '')}.joblib")
26
- try:
27
- models[name] = joblib.load(path)
28
- except Exception as e:
29
- st.error(f"Error loading model {name}: {str(e)}")
30
- return models
31
-
32
- # Load models
33
- models = load_models(MODEL_NAMES)
34
-
35
- # Load dataset
36
- data_path = os.path.join(DATA_DIR, DATA_FILE)
37
- df = pd.read_csv(data_path)
38
-
39
- # Prepare features and target
40
- X = df.drop(columns=['Salary'])
41
- y = df['Salary']
42
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)
43
-
44
- # Pre-defined input choices
45
- input_choices = {
46
- 'MainBranch': df.MainBranch.unique().tolist(),
47
- 'Country': X.Country.unique().tolist(),
48
- 'EducationLevel': X.EducationLevel.unique().tolist(),
49
- 'RemoteWork': df.RemoteWork.unique().tolist(),
50
- }
51
-
52
- # Pre-computed statistics for default values
53
- default_comp = float(df.CompTotal.mean()) # Default CompTotal
54
- max_comp = float(df.CompTotal.max() * 1.5)
55
- default_years = 3.0 # Default years of experience
56
- max_years = float(df.YearsOfExperience.max() * 1.5)
57
-
58
- # Precompute predictions for training set
59
- y_train_predictions = {name: model.predict(X_train) for name, model in models.items()}
60
-
61
- def load_and_predict(sample: pd.DataFrame) -> pd.DataFrame:
62
- """Predict salary using loaded models and evaluate statistics."""
63
- results = []
64
-
65
- for name, model in models.items():
66
- try:
67
- salary_pred = model.predict(sample)[0]
68
- results.append({
69
- 'Model': name,
70
- 'Predicted Salary': salary_pred,
71
- 'R2 Score (%)': r2_score(y_train, y_train_predictions[name]) * 100,
72
- })
73
- except Exception as e:
74
- st.error(f"Error during prediction with model {name}: {str(e)}")
75
-
76
- return pd.DataFrame(results).sort_values(by='R2 Score (%)', ascending=False).reset_index(drop=True)
77
-
78
- # Streamlit UI setup
79
- st.set_page_config(page_title="Developer Salary Prediction App", page_icon="🤑", layout="wide")
80
- st.title("🤑 **Developer Salary Prediction**")
81
-
82
- # Sidebar inputs
83
- st.sidebar.header("Input Information")
84
- mainbranch = st.sidebar.selectbox("**MainBranch**", options=input_choices['MainBranch'])
85
- country = st.sidebar.selectbox("**Country**", options=input_choices['Country'])
86
- educationlevel = st.sidebar.selectbox("**Education Level**", options=input_choices['EducationLevel'])
87
- remotework = st.sidebar.selectbox("**Remote Work**", options=input_choices['RemoteWork'])
88
- comptotal = st.sidebar.number_input("**CompTotal**", min_value=0.0, max_value=max_comp, value=default_comp)
89
- yearsofexperience = st.sidebar.number_input("**Years of Experience**", min_value=0.0, max_value=max_years, value=default_years)
90
-
91
- # Handling predictions
92
- if st.sidebar.button(label=':rainbow[Predict Salary]'):
93
- input_data = pd.DataFrame(
94
- [[mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience]],
95
- columns=['MainBranch', 'Country', 'EducationLevel', 'RemoteWork', 'CompTotal', 'YearsOfExperience'])
96
-
97
- results_df = load_and_predict(input_data)
98
-
99
- if not results_df.empty:
100
- st.write("### Prediction Results:")
101
- st.dataframe(results_df)
102
-
103
- # Disclaimer Section
104
- st.markdown("---")
105
- st.text('''
106
- >> Developer Salary Prediction App <<
107
- This Streamlit application predicts developer salary using multiple machine learning models including LGBM, XGBoost, and Random Forest regressors.
108
- Users can input developer information through a user-friendly interface, which includes fields such as country, education level, and years of experience.
109
-
110
- > Features:
111
- **Input Components**:
112
- - **MainBranch**: Select your main area of expertise in development, such as software engineering, data science, or web development. This selection may influence salary expectations based on the branch's demand and trends.
113
-
114
- - **Country**: Choose your country from the dropdown list. Regions often exhibit varying salary scales due to economic factors, the cost of living, and market demand for tech workers.
115
-
116
- - **Education Level**: Indicate the highest level of education you have completed. Higher educational qualifications often correlate with higher earning potential in the tech industry.
117
-
118
- - **Remote Work**: Specify whether you primarily work remotely, on-site, or in a hybrid setup. Remote work setups can affect salary offers, especially if hiring companies are based in different geographic areas.
119
-
120
- - **CompTotal**: Enter your expected total compensation, which includes salary, bonuses, and other benefits. This field is crucial for setting a base for salary predictions and facilitates comparisons.
121
-
122
- - **Years of Experience**: Provide the number of years you've been in a coding-related job. Generally, more years of experience are associated with higher salaries due to skill accumulation and professional development.
123
-
124
- **Data Processing**:
125
- - The app employs a pre-processed dataset, cleaned and prepared for model training.
126
- - It utilizes features including country, education level, and years of experience for predictions.
127
- - Models are loaded from disk, obtaining predictions based on user-provided input.
128
-
129
- **Prediction**: The app performs predictions with loaded models and calculates performance metrics like R2 score.
130
- **Results Display**: The predicted salary and model performance metrics are presented in a user-friendly format.
131
-
132
- > Usage:
133
- Fill out the developer information and click "Predict Salary" to derive insights on anticipated salary and model performance.
134
-
135
- > Disclaimer:
136
- This application serves educational purposes. Predictions are not guaranteed to be accurate.
137
  ''')
 
1
+ import os
2
+ import joblib
3
+ import pandas as pd
4
+ import streamlit as st
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import r2_score
7
+ from typing import List, Dict, Any
8
+
9
+ # Constants for directories and file names
10
+ MODEL_DIR = 'models'
11
+ DATA_DIR = 'datasets'
12
+ DATA_FILE = 'cleaned_survey_results_public.csv'
13
+ MODEL_NAMES = [
14
+ 'CatBoost Regressor',
15
+ 'XGBoost Regressor',
16
+ 'LGBM Regressor',
17
+ ]
18
+
19
+ def load_models(model_names: List[str]) -> Dict[str, Any]:
20
+ """Load machine learning models from disk."""
21
+ models = {}
22
+ for name in model_names:
23
+ path = os.path.join(MODEL_DIR, f"{name.replace(' ', '')}.joblib")
24
+ try:
25
+ models[name] = joblib.load(path)
26
+ except Exception as e:
27
+ st.error(f"Error loading model {name}: {str(e)}")
28
+ return models
29
+
30
+ # Load models
31
+ models = load_models(MODEL_NAMES)
32
+
33
+ # Load dataset
34
+ data_path = os.path.join(DATA_DIR, DATA_FILE)
35
+ df = pd.read_csv(data_path)
36
+
37
+ # Prepare features and target
38
+ X = df.drop(columns=['Salary'])
39
+ y = df['Salary']
40
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=123)
41
+
42
+ # Pre-defined input choices
43
+ input_choices = {
44
+ 'MainBranch': df.MainBranch.unique().tolist(),
45
+ 'Country': X.Country.unique().tolist(),
46
+ 'EducationLevel': X.EducationLevel.unique().tolist(),
47
+ 'RemoteWork': df.RemoteWork.unique().tolist(),
48
+ }
49
+
50
+ # Pre-computed statistics for default values
51
+ default_comp = float(df.CompTotal.mean()) # Default CompTotal
52
+ max_comp = float(df.CompTotal.max() * 1.5)
53
+ default_years = 3.0 # Default years of experience
54
+ max_years = float(df.YearsOfExperience.max() * 1.5)
55
+
56
+ # Precompute predictions for training set
57
+ y_train_predictions = {name: model.predict(X_train) for name, model in models.items()}
58
+
59
+ def load_and_predict(sample: pd.DataFrame) -> pd.DataFrame:
60
+ """Predict salary using loaded models and evaluate statistics."""
61
+ results = []
62
+
63
+ for name, model in models.items():
64
+ try:
65
+ salary_pred = model.predict(sample)[0]
66
+ results.append({
67
+ 'Model': name,
68
+ 'Predicted Salary': salary_pred,
69
+ 'R2 Score (%)': r2_score(y_train, y_train_predictions[name]) * 100,
70
+ })
71
+ except Exception as e:
72
+ st.error(f"Error during prediction with model {name}: {str(e)}")
73
+
74
+ return pd.DataFrame(results).sort_values(by='R2 Score (%)', ascending=False).reset_index(drop=True)
75
+
76
+ # Streamlit UI setup
77
+ st.set_page_config(page_title="Developer Salary Prediction App", page_icon="🤑", layout="wide")
78
+ st.title("🤑 **Developer Salary Prediction**")
79
+
80
+ # Sidebar inputs
81
+ st.sidebar.header("Input Information")
82
+ mainbranch = st.sidebar.selectbox("**MainBranch**", options=input_choices['MainBranch'])
83
+ country = st.sidebar.selectbox("**Country**", options=input_choices['Country'])
84
+ educationlevel = st.sidebar.selectbox("**Education Level**", options=input_choices['EducationLevel'])
85
+ remotework = st.sidebar.selectbox("**Remote Work**", options=input_choices['RemoteWork'])
86
+ comptotal = st.sidebar.number_input("**CompTotal**", min_value=0.0, max_value=max_comp, value=default_comp)
87
+ yearsofexperience = st.sidebar.number_input("**Years of Experience**", min_value=0.0, max_value=max_years, value=default_years)
88
+
89
+ # Handling predictions
90
+ if st.sidebar.button(label=':rainbow[Predict Salary]'):
91
+ input_data = pd.DataFrame(
92
+ [[mainbranch, country, educationlevel, remotework, comptotal, yearsofexperience]],
93
+ columns=['MainBranch', 'Country', 'EducationLevel', 'RemoteWork', 'CompTotal', 'YearsOfExperience'])
94
+
95
+ results_df = load_and_predict(input_data)
96
+
97
+ if not results_df.empty:
98
+ st.write("### Prediction Results:")
99
+ st.dataframe(results_df)
100
+
101
+ # Disclaimer Section
102
+ st.markdown("---")
103
+ st.text('''
104
+ >> Developer Salary Prediction App <<
105
+ This Streamlit application predicts developer salary using multiple machine learning models including LGBM, XGBoost, and Random Forest regressors.
106
+ Users can input developer information through a user-friendly interface, which includes fields such as country, education level, and years of experience.
107
+
108
+ > Features:
109
+ **Input Components**:
110
+ - **MainBranch**: Select your main area of expertise in development, such as software engineering, data science, or web development. This selection may influence salary expectations based on the branch's demand and trends.
111
+
112
+ - **Country**: Choose your country from the dropdown list. Regions often exhibit varying salary scales due to economic factors, the cost of living, and market demand for tech workers.
113
+
114
+ - **Education Level**: Indicate the highest level of education you have completed. Higher educational qualifications often correlate with higher earning potential in the tech industry.
115
+
116
+ - **Remote Work**: Specify whether you primarily work remotely, on-site, or in a hybrid setup. Remote work setups can affect salary offers, especially if hiring companies are based in different geographic areas.
117
+
118
+ - **CompTotal**: Enter your expected total compensation, which includes salary, bonuses, and other benefits. This field is crucial for setting a base for salary predictions and facilitates comparisons.
119
+
120
+ - **Years of Experience**: Provide the number of years you've been in a coding-related job. Generally, more years of experience are associated with higher salaries due to skill accumulation and professional development.
121
+
122
+ **Data Processing**:
123
+ - The app employs a pre-processed dataset, cleaned and prepared for model training.
124
+ - It utilizes features including country, education level, and years of experience for predictions.
125
+ - Models are loaded from disk, obtaining predictions based on user-provided input.
126
+
127
+ **Prediction**: The app performs predictions with loaded models and calculates performance metrics like R2 score.
128
+ **Results Display**: The predicted salary and model performance metrics are presented in a user-friendly format.
129
+
130
+ > Usage:
131
+ Fill out the developer information and click "Predict Salary" to derive insights on anticipated salary and model performance.
132
+
133
+ > Disclaimer:
134
+ This application serves educational purposes. Predictions are not guaranteed to be accurate.
 
 
135
  ''')