# Predict flight cancellation probabilities and potential delay time given flight details

## Data Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pickle

In [2]:
df = pd.read_csv('data-coordinates.csv', dtype={'CANCELLATION_CODE': 'string'})

In [3]:
df = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 
        'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED']]

In [4]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CANCELLED
0,1,17,7,DL,1114,BOS,TPA,1547,1907,20.0,0.0
1,1,17,7,DL,1126,SDF,ATL,1541,1705,267.0,0.0
2,1,17,7,DL,1173,SRQ,MSP,1410,1649,46.0,0.0
3,1,17,7,DL,1205,MSP,BWI,1840,2153,19.0,0.0
4,1,17,7,DL,1216,ORD,ATL,805,1104,32.0,0.0


In [5]:
df.dtypes

MONTH                  int64
DAY_OF_MONTH           int64
DAY_OF_WEEK            int64
OP_UNIQUE_CARRIER     object
OP_CARRIER_FL_NUM      int64
ORIGIN                object
DEST                  object
CRS_DEP_TIME           int64
CRS_ARR_TIME           int64
ARR_DELAY            float64
CANCELLED            float64
dtype: object

In [6]:
df.columns.values

array(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',
       'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED'], dtype=object)

In [7]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,ARR_DELAY,CANCELLED
0,1,17,7,DL,1114,BOS,TPA,1547,1907,20.0,0.0
1,1,17,7,DL,1126,SDF,ATL,1541,1705,267.0,0.0
2,1,17,7,DL,1173,SRQ,MSP,1410,1649,46.0,0.0
3,1,17,7,DL,1205,MSP,BWI,1840,2153,19.0,0.0
4,1,17,7,DL,1216,ORD,ATL,805,1104,32.0,0.0


In [8]:
df['ARR_DELAY'] = df['ARR_DELAY'].fillna(0)

In [9]:
delay_X = df.drop(["ARR_DELAY", "CANCELLED"], axis=1)
delay_Y = df["ARR_DELAY"].copy()

cancel_X = delay_X.copy()
cancel_Y = df["CANCELLED"].copy()

In [10]:
num_features = list(delay_X.drop(["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"], axis=1))
cat_features = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]

full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(), cat_features)
])

In [11]:
print(delay_X.shape)
delay_X = full_pipeline.fit_transform(delay_X)
cancel_X = full_pipeline.fit_transform(cancel_X)

(1141693, 9)


In [12]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y = train_test_split(delay_X, delay_Y, test_size=0.1)
cancel_train_X, cancel_test_X, cancel_train_Y, cancel_test_Y = train_test_split(cancel_X, cancel_Y, test_size=0.1)

### Save Transformer

In [13]:
delay_X.shape

(1141693, 655)

In [14]:
filename = 'saved_models/data_transformer.sav'
pickle.dump(full_pipeline, open(filename, 'wb'))

In [15]:
loaded_model = pickle.load(open(filename, 'rb'))

## Train Model: Delay

### Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
lin_reg = LinearRegression()
lin_reg.fit(delay_train_X, delay_train_Y)

In [18]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

delay_test_predictions = lin_reg.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 91.86769748120687, Mean absolute error: 47.6789239375552


### Retrain on all data

In [19]:
lin_reg = LinearRegression()
lin_reg.fit(delay_X, delay_Y)

filename = 'saved_models/delay_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

In [20]:
loaded_model = pickle.load(open(filename, 'rb'))

delay_test_predictions = loaded_model.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 91.78534982074099, Mean absolute error: 47.64491297947435


### Neural Network

In [21]:
import math
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

In [22]:
neural = Sequential()

neural.add(Dense(128, kernel_initializer='normal',input_dim = delay_train_X.shape[1], activation='relu'))
neural.add(Dense(256, kernel_initializer='normal',activation='relu'))
neural.add(Dropout(0.2))
neural.add(Dense(64, kernel_initializer='normal',activation='relu'))
neural.add(Dense(1, kernel_initializer='normal',activation='linear'))

neural.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
neural.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
neural.fit(delay_train_X.toarray() , np.array(delay_train_Y) , epochs=5, batch_size=32, validation_split = 0.2)

Epoch 1/5
[1m25689/25689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 10ms/step - loss: 8421.3994 - mean_squared_error: 8421.3994 - val_loss: 8140.2700 - val_mean_squared_error: 8140.2700
Epoch 2/5
[1m25689/25689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 6ms/step - loss: 8218.5732 - mean_squared_error: 8218.5732 - val_loss: 8155.1226 - val_mean_squared_error: 8155.1226
Epoch 3/5
[1m25689/25689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 6ms/step - loss: 8051.7744 - mean_squared_error: 8051.7744 - val_loss: 8119.2329 - val_mean_squared_error: 8119.2329
Epoch 4/5
[1m25689/25689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 7ms/step - loss: 8149.7393 - mean_squared_error: 8149.7393 - val_loss: 8103.2837 - val_mean_squared_error: 8103.2837
Epoch 5/5
[1m25689/25689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 7ms/step - loss: 8080.3198 - mean_squared_error: 8080.3198 - val_loss: 8113.9531 - val_mean_squared_error: 8113.9531


<keras.src.callbacks.history.History at 0x1a128e5f4d0>

In [24]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

delay_test_predictions = neural.predict(delay_test_X)
neural_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
neural_rmse = np.sqrt(neural_mse)
neural_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {neural_rmse}, Mean absolute error: {neural_mae}')

[1m3568/3568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step
Root mean squared error: 91.42243092164492, Mean absolute error: 45.856728016541844


## Train Model: Cancel

### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
clf = LogisticRegression(max_iter=1000)
clf.fit(cancel_train_X, cancel_train_Y)

In [27]:
import numpy as np
from sklearn.metrics import accuracy_score

cancel_test_predictions = clf.predict(cancel_test_X)
acc = accuracy_score(cancel_test_Y, cancel_test_predictions)

print(f'Accuracy: {acc}')

Accuracy: 0.905833406323903


In [28]:
clf.predict_proba(cancel_test_X)

array([[0.95112009, 0.04887991],
       [0.91754329, 0.08245671],
       [0.94939641, 0.05060359],
       ...,
       [0.91064547, 0.08935453],
       [0.7672516 , 0.2327484 ],
       [0.80113653, 0.19886347]])

### Retrain on all data

In [29]:
clf = LogisticRegression(max_iter=1000)
clf.fit(cancel_X, cancel_Y)

filename = 'saved_models/cancel_log_reg.sav'
pickle.dump(clf, open(filename, 'wb'))

In [30]:
loaded_model = pickle.load(open(filename, 'rb'))

cancel_test_predictions = loaded_model.predict(cancel_test_X)
acc = accuracy_score(cancel_test_Y, cancel_test_predictions)

print(f'Accuracy: {acc}')

Accuracy: 0.905833406323903


In [31]:
loaded_model.predict_proba(cancel_test_X)[0][1]

0.04856708387620738

# Predict Delay Reason

In [32]:
df = pd.read_csv('data-coordinates.csv', dtype={'CANCELLATION_CODE': 'string'})

In [33]:
df = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 
        'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]


In [34]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,ARR_DELAY
0,1,17,7,DL,1114,BOS,TPA,1547,1907,0.0,0.0,20.0,0.0,0.0,20.0
1,1,17,7,DL,1126,SDF,ATL,1541,1705,267.0,0.0,0.0,0.0,0.0,267.0
2,1,17,7,DL,1173,SRQ,MSP,1410,1649,46.0,0.0,0.0,0.0,0.0,46.0
3,1,17,7,DL,1205,MSP,BWI,1840,2153,0.0,0.0,19.0,0.0,0.0,19.0
4,1,17,7,DL,1216,ORD,ATL,805,1104,0.0,0.0,32.0,0.0,0.0,32.0


In [35]:
df.dtypes

MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
OP_UNIQUE_CARRIER       object
OP_CARRIER_FL_NUM        int64
ORIGIN                  object
DEST                    object
CRS_DEP_TIME             int64
CRS_ARR_TIME             int64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
ARR_DELAY              float64
dtype: object

In [36]:
df.columns.values

array(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',
       'CRS_ARR_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
       'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY'], dtype=object)

In [37]:
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,ARR_DELAY
0,1,17,7,DL,1114,BOS,TPA,1547,1907,0.0,0.0,20.0,0.0,0.0,20.0
1,1,17,7,DL,1126,SDF,ATL,1541,1705,267.0,0.0,0.0,0.0,0.0,267.0
2,1,17,7,DL,1173,SRQ,MSP,1410,1649,46.0,0.0,0.0,0.0,0.0,46.0
3,1,17,7,DL,1205,MSP,BWI,1840,2153,0.0,0.0,19.0,0.0,0.0,19.0
4,1,17,7,DL,1216,ORD,ATL,805,1104,0.0,0.0,32.0,0.0,0.0,32.0


In [38]:
df = df.dropna(subset=['ARR_DELAY'])
delay_Y = df['ARR_DELAY'].copy()
df = df.drop(['ARR_DELAY'], axis=1)

In [39]:
delay_X = df.drop(['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'], axis=1)

print(delay_X.shape)
delay_carrier_Y = df["CARRIER_DELAY"].copy()
delay_weather_Y = df["WEATHER_DELAY"].copy()
delay_nas_Y = df["NAS_DELAY"].copy()
delay_security_Y = df["SECURITY_DELAY"].copy()
delay_late_aircraft_Y = df["LATE_AIRCRAFT_DELAY"].copy()

(1034569, 9)


In [40]:
num_features = list(delay_X.drop(["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"], axis=1))
cat_features = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]

full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(), cat_features)
])

In [41]:
delay_X = full_pipeline.fit_transform(delay_X)

In [42]:
delay_X.shape

(1034569, 655)

# Weather delay

### Linear Regression

In [43]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \
        = train_test_split(delay_X, delay_weather_Y, test_size=0.1)

In [44]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# Option 1: Remove rows with NaN values in `delay_train_Y`
valid_indices = ~np.isnan(delay_train_Y)
delay_train_X = delay_train_X[valid_indices]
delay_train_Y = delay_train_Y[valid_indices]

In [45]:
lin_reg = LinearRegression()
lin_reg.fit(delay_train_X, delay_train_Y)

In [46]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict the test set
delay_test_predictions = lin_reg.predict(delay_test_X)

# Check for NaN in delay_test_Y
print("NaN values in delay_test_Y:", np.isnan(delay_test_Y).sum())

# Remove rows with NaN in delay_test_Y
valid_indices_Y = ~np.isnan(delay_test_Y)
delay_test_Y = delay_test_Y[valid_indices_Y]
delay_test_X = delay_test_X[valid_indices_Y]  # Update X to match cleaned Y

# Predict again with cleaned data
delay_test_predictions = lin_reg.predict(delay_test_X)

# Check for NaN in delay_test_predictions
print("NaN values in delay_test_predictions:", np.isnan(delay_test_predictions).sum())

# Remove rows with NaN in predictions
valid_indices_pred = ~np.isnan(delay_test_predictions)
delay_test_Y = delay_test_Y[valid_indices_pred]
delay_test_predictions = delay_test_predictions[valid_indices_pred]

# Now calculate the metrics
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')


NaN values in delay_test_Y: 0
NaN values in delay_test_predictions: 0
Root mean squared error: 35.957841679118246, Mean absolute error: 8.61170248980199


### Retrain on all data

In [47]:
# Impute missing values in delay_weather_Y with the mean
imputer = SimpleImputer(strategy='mean')
delay_weather_Y = imputer.fit_transform(delay_weather_Y.values.reshape(-1, 1)).ravel()

lin_reg = LinearRegression()
lin_reg.fit(delay_X, delay_weather_Y)

filename = 'saved_models/delay_weather_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

In [48]:
loaded_model = pickle.load(open(filename, 'rb'))

delay_test_predictions = loaded_model.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 35.88746420327346, Mean absolute error: 8.608681302973599


# Carrier delay

### Linear Regression

In [49]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \
        = train_test_split(delay_X, delay_weather_Y, test_size=0.1)

In [50]:
imputer = SimpleImputer(strategy='mean')
delay_carrier_Y = imputer.fit_transform(delay_carrier_Y.values.reshape(-1, 1)).ravel()

lin_reg = LinearRegression()
lin_reg.fit(delay_X, delay_carrier_Y)

filename = 'saved_models/delay_carrier_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

In [51]:
loaded_model = pickle.load(open(filename, 'rb'))

delay_test_predictions = loaded_model.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 43.11811662110843, Mean absolute error: 28.557974007733517


# NAS delay

### Linear Regression

In [52]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \
        = train_test_split(delay_X, delay_weather_Y, test_size=0.1)

In [53]:
imputer = SimpleImputer(strategy='mean')
delay_nas_Y = imputer.fit_transform(delay_nas_Y.values.reshape(-1, 1)).ravel()

lin_reg = LinearRegression()
lin_reg.fit(delay_X, delay_nas_Y)

filename = 'saved_models/delay_nas_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

In [54]:
loaded_model = pickle.load(open(filename, 'rb'))

delay_test_predictions = loaded_model.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 35.57074424286895, Mean absolute error: 14.59878143835819


# Security delay

### Linear Regression

In [55]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \
        = train_test_split(delay_X, delay_security_Y, test_size=0.1)

In [56]:
imputer = SimpleImputer(strategy='mean')
delay_security_Y = imputer.fit_transform(delay_security_Y.values.reshape(-1, 1)).ravel()

lin_reg = LinearRegression()
lin_reg.fit(delay_X, delay_security_Y)

filename = 'saved_models/delay_security_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

In [57]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load your model (assuming `filename` is defined)
loaded_model = pickle.load(open(filename, 'rb'))

# Make predictions
delay_test_predictions = loaded_model.predict(delay_test_X)

# Check for NaNs in the actual and predicted values
print("NaNs in delay_test_Y:", np.isnan(delay_test_Y).sum())
print("NaNs in delay_test_predictions:", np.isnan(delay_test_predictions).sum())

# Remove NaNs from the data
mask = ~np.isnan(delay_test_Y) & ~np.isnan(delay_test_predictions)
delay_test_Y_clean = delay_test_Y[mask]
delay_test_predictions_clean = delay_test_predictions[mask]

# Calculate metrics
lin_mse = mean_squared_error(delay_test_Y_clean, delay_test_predictions_clean)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y_clean, delay_test_predictions_clean)

# Print metrics
print(f'MSE: {lin_mse}')
print(f'RMSE: {lin_rmse}')
print(f'MAE: {lin_mae}')


NaNs in delay_test_Y: 0
NaNs in delay_test_predictions: 0
MSE: 11.504999499784063
RMSE: 3.3919020474925365
MAE: 0.4101716247563961


# Late aircraft delay

### Linear Regression

In [58]:
delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \
        = train_test_split(delay_X, delay_security_Y, test_size=0.1)

In [59]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pickle

# Check for NaNs in the target variable
print("NaNs in delay_late_aircraft_Y:", np.isnan(delay_late_aircraft_Y).sum())

# Remove rows with NaNs in the target variable
mask = ~np.isnan(delay_late_aircraft_Y)
delay_X_clean = delay_X[mask]
delay_late_aircraft_Y_clean = delay_late_aircraft_Y[mask]

# Fit the model
lin_reg = LinearRegression()
lin_reg.fit(delay_X_clean, delay_late_aircraft_Y_clean)

# Save the model
filename = 'saved_models/delay_late_aircraft_lin_reg.sav'
pickle.dump(lin_reg, open(filename, 'wb'))

print("Model saved successfully.")


NaNs in delay_late_aircraft_Y: 0
Model saved successfully.


In [60]:
loaded_model = pickle.load(open(filename, 'rb'))

delay_test_predictions = loaded_model.predict(delay_test_X)
lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 26.30732868456468, Mean absolute error: 23.99399248512569


# Predict

In [61]:
weather_filename = 'saved_models/delay_weather_lin_reg.sav'
carrier_filename = 'saved_models/delay_carrier_lin_reg.sav'
nas_filename = 'saved_models/delay_nas_lin_reg.sav'
security_filename = 'saved_models/delay_security_lin_reg.sav'
late_aircraft_filename = 'saved_models/delay_late_aircraft_lin_reg.sav'

weather_loaded_model = pickle.load(open(weather_filename, 'rb'))
carrier_loaded_model = pickle.load(open(carrier_filename, 'rb'))
nas_loaded_model = pickle.load(open(nas_filename, 'rb'))
security_loaded_model = pickle.load(open(security_filename, 'rb'))
late_aircraft_loaded_model = pickle.load(open(late_aircraft_filename, 'rb'))

In [62]:
arr_delay_prediction = weather_loaded_model.predict(delay_X) + \
                        carrier_loaded_model.predict(delay_X) + \
                        nas_loaded_model.predict(delay_X) + \
                        security_loaded_model.predict(delay_X) + \
                        late_aircraft_loaded_model.predict(delay_X)

In [63]:
lin_mse = mean_squared_error(delay_Y, arr_delay_prediction)
lin_rmse = np.sqrt(lin_mse)
lin_mae = mean_absolute_error(delay_Y, arr_delay_prediction)
print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')

Root mean squared error: 93.2178331053021, Mean absolute error: 48.62179687850236
