mijgis committed on
Commit
b997311
·
1 Parent(s): 6c50d77

Add train file

Browse files
Files changed (1) hide show
  1. train.py +51 -0
train.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fit a linear-regression pipeline on ``insurance.csv`` and serialize it.

Running this script:

1. loads ``insurance.csv`` into a DataFrame,
2. selects the numeric and categorical feature columns and the ``charges``
   target column,
3. holds out 20% of the rows as a test split (fixed ``random_state``),
4. fits a pipeline that standard-scales the numeric columns, one-hot
   encodes the categorical columns and feeds the result into
   ``LinearRegression``,
5. prints the R-squared score on the held-out split, and
6. dumps the fitted pipeline to ``model.joblib``.
"""
import sklearn
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Load the raw data ------------------------------------------------------
df = pd.read_csv("insurance.csv")

# --- Column roles -----------------------------------------------------------
target = 'charges'
numeric_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

print("Creating data subsets")

# Numeric columns first, then categorical ones (same order as the
# concatenation of the two lists above).
feature_columns = [*numeric_features, *categorical_features]
X = df[feature_columns]
y = df[target]

# 80/20 train/test split; the seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=128)

# --- Preprocessing + estimator ----------------------------------------------
numeric_step = (StandardScaler(), numeric_features)
# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
categorical_step = (OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = make_column_transformer(numeric_step, categorical_step)

model_linear_regression = LinearRegression(n_jobs=-1)

print("Estimating Model Pipeline")

model_pipeline = make_pipeline(preprocessor, model_linear_regression)
model_pipeline.fit(Xtrain, ytrain)

# --- Evaluation on the held-out split ---------------------------------------
print("Logging Metrics")
test_predictions = model_pipeline.predict(Xtest)
r_squared = r2_score(ytest, test_predictions)
print(f"R-squared: {r_squared}")

# --- Persist the fitted pipeline --------------------------------------------
print("Serializing Model")

saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)