{ "cells": [ { "cell_type": "markdown", "id": "d6f86767", "metadata": {}, "source": [ "# Predict flight cancellation probabilities and potential delay time given flight details" ] }, { "cell_type": "markdown", "id": "1b22f7ca", "metadata": {}, "source": [ "## Data Preprocessing" ] }, { "cell_type": "code", "execution_count": 1, "id": "11248582", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "import numpy as np\n", "import pickle" ] }, { "cell_type": "code", "execution_count": 2, "id": "1d3035c7", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data-coordinates.csv', dtype={'CANCELLATION_CODE': 'string'})" ] }, { "cell_type": "code", "execution_count": 3, "id": "78c9ce8a", "metadata": {}, "outputs": [], "source": [ "df = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', \n", " 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED']]" ] }, { "cell_type": "code", "execution_count": 4, "id": "b17c0cce", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONTHDAY_OF_MONTHDAY_OF_WEEKOP_UNIQUE_CARRIEROP_CARRIER_FL_NUMORIGINDESTCRS_DEP_TIMECRS_ARR_TIMEARR_DELAYCANCELLED
01177DL1114BOSTPA1547190720.00.0
11177DL1126SDFATL15411705267.00.0
21177DL1173SRQMSP1410164946.00.0
31177DL1205MSPBWI1840215319.00.0
41177DL1216ORDATL805110432.00.0
\n", "
" ], "text/plain": [ " MONTH DAY_OF_MONTH DAY_OF_WEEK OP_UNIQUE_CARRIER OP_CARRIER_FL_NUM \\\n", "0 1 17 7 DL 1114 \n", "1 1 17 7 DL 1126 \n", "2 1 17 7 DL 1173 \n", "3 1 17 7 DL 1205 \n", "4 1 17 7 DL 1216 \n", "\n", " ORIGIN DEST CRS_DEP_TIME CRS_ARR_TIME ARR_DELAY CANCELLED \n", "0 BOS TPA 1547 1907 20.0 0.0 \n", "1 SDF ATL 1541 1705 267.0 0.0 \n", "2 SRQ MSP 1410 1649 46.0 0.0 \n", "3 MSP BWI 1840 2153 19.0 0.0 \n", "4 ORD ATL 805 1104 32.0 0.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "25b8e814", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MONTH int64\n", "DAY_OF_MONTH int64\n", "DAY_OF_WEEK int64\n", "OP_UNIQUE_CARRIER object\n", "OP_CARRIER_FL_NUM int64\n", "ORIGIN object\n", "DEST object\n", "CRS_DEP_TIME int64\n", "CRS_ARR_TIME int64\n", "ARR_DELAY float64\n", "CANCELLED float64\n", "dtype: object" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 6, "id": "a08d243d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',\n", " 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',\n", " 'CRS_ARR_TIME', 'ARR_DELAY', 'CANCELLED'], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.values" ] }, { "cell_type": "code", "execution_count": 7, "id": "77d9a008", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONTHDAY_OF_MONTHDAY_OF_WEEKOP_UNIQUE_CARRIEROP_CARRIER_FL_NUMORIGINDESTCRS_DEP_TIMECRS_ARR_TIMEARR_DELAYCANCELLED
01177DL1114BOSTPA1547190720.00.0
11177DL1126SDFATL15411705267.00.0
21177DL1173SRQMSP1410164946.00.0
31177DL1205MSPBWI1840215319.00.0
41177DL1216ORDATL805110432.00.0
\n", "
" ], "text/plain": [ " MONTH DAY_OF_MONTH DAY_OF_WEEK OP_UNIQUE_CARRIER OP_CARRIER_FL_NUM \\\n", "0 1 17 7 DL 1114 \n", "1 1 17 7 DL 1126 \n", "2 1 17 7 DL 1173 \n", "3 1 17 7 DL 1205 \n", "4 1 17 7 DL 1216 \n", "\n", " ORIGIN DEST CRS_DEP_TIME CRS_ARR_TIME ARR_DELAY CANCELLED \n", "0 BOS TPA 1547 1907 20.0 0.0 \n", "1 SDF ATL 1541 1705 267.0 0.0 \n", "2 SRQ MSP 1410 1649 46.0 0.0 \n", "3 MSP BWI 1840 2153 19.0 0.0 \n", "4 ORD ATL 805 1104 32.0 0.0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "7c8e1109", "metadata": {}, "outputs": [], "source": [ "df['ARR_DELAY'] = df['ARR_DELAY'].fillna(0)" ] }, { "cell_type": "code", "execution_count": 9, "id": "0abd26cb", "metadata": {}, "outputs": [], "source": [ "delay_X = df.drop([\"ARR_DELAY\", \"CANCELLED\"], axis=1)\n", "delay_Y = df[\"ARR_DELAY\"].copy()\n", "\n", "cancel_X = delay_X.copy()\n", "cancel_Y = df[\"CANCELLED\"].copy()" ] }, { "cell_type": "code", "execution_count": 10, "id": "466d2ee8", "metadata": {}, "outputs": [], "source": [ "num_features = list(delay_X.drop([\"OP_UNIQUE_CARRIER\", \"ORIGIN\", \"DEST\"], axis=1))\n", "cat_features = [\"OP_UNIQUE_CARRIER\", \"ORIGIN\", \"DEST\"]\n", "\n", "full_pipeline = ColumnTransformer([\n", " (\"num\", StandardScaler(), num_features),\n", " (\"cat\", OneHotEncoder(), cat_features)\n", "])" ] }, { "cell_type": "code", "execution_count": 11, "id": "566c37f9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1141693, 9)\n" ] } ], "source": [ "print(delay_X.shape)\n", "delay_X = full_pipeline.fit_transform(delay_X)\n", "cancel_X = full_pipeline.fit_transform(cancel_X)" ] }, { "cell_type": "code", "execution_count": 12, "id": "8b0db49e", "metadata": {}, "outputs": [], "source": [ "delay_train_X, delay_test_X, delay_train_Y, delay_test_Y = train_test_split(delay_X, delay_Y, test_size=0.1)\n", "cancel_train_X, 
cancel_test_X, cancel_train_Y, cancel_test_Y = train_test_split(cancel_X, cancel_Y, test_size=0.1)" ] }, { "cell_type": "markdown", "id": "a9f5e2ba", "metadata": {}, "source": [ "### Save Transformer" ] }, { "cell_type": "code", "execution_count": 13, "id": "11ed00ce", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1141693, 655)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "delay_X.shape" ] }, { "cell_type": "code", "execution_count": 14, "id": "d52bb750", "metadata": {}, "outputs": [], "source": [ "filename = 'saved_models/data_transformer.sav'\n", "# Use a context manager so the pickle is flushed and the handle closed before the file is read back.\n", "with open(filename, 'wb') as transformer_file:\n", "    pickle.dump(full_pipeline, transformer_file)" ] }, { "cell_type": "code", "execution_count": 15, "id": "31af79c4", "metadata": {}, "outputs": [], "source": [ "# Round-trip check of the file written above.\n", "# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.\n", "with open(filename, 'rb') as transformer_file:\n", "    loaded_model = pickle.load(transformer_file)" ] }, { "cell_type": "markdown", "id": "1c269456", "metadata": {}, "source": [ "## Train Model: Delay" ] }, { "cell_type": "markdown", "id": "24f6fc7b", "metadata": {}, "source": [ "### Linear Regression" ] }, { "cell_type": "code", "execution_count": 16, "id": "75d1a7f7", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": 17, "id": "30960676", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LinearRegression()" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lin_reg = LinearRegression()\n", "lin_reg.fit(delay_train_X, delay_train_Y)" ] }, { "cell_type": "code", "execution_count": 18, "id": "edc4afaf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Root mean squared error: 91.86769748120687, Mean absolute error: 47.6789239375552\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import mean_absolute_error\n", "\n", "delay_test_predictions = lin_reg.predict(delay_test_X)\n", "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n", "lin_rmse = np.sqrt(lin_mse)\n", "lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n", "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')" ] }, { "cell_type": "markdown", "id": "b406b4d3", "metadata": {}, "source": [ "### Retrain on all data" ] }, { "cell_type": "code", "execution_count": 19, "id": "0feec2e9", "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", "lin_reg.fit(delay_X, delay_Y)\n", "\n", "filename = 'saved_models/delay_lin_reg.sav'\n", "pickle.dump(lin_reg, open(filename, 'wb'))" ] }, { "cell_type": "code", "execution_count": 20, "id": "dbfca124", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Root mean squared error: 91.78534982074099, Mean absolute error: 47.64491297947435\n" ] } ], "source": [ "loaded_model = pickle.load(open(filename, 'rb'))\n", "\n", "delay_test_predictions = loaded_model.predict(delay_test_X)\n", "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n", "lin_rmse = np.sqrt(lin_mse)\n", "lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n", "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')" ] }, { "cell_type": "markdown", "id": 
"1f8b7a72", "metadata": {}, "source": [ "### Neural Network" ] }, { "cell_type": "code", "execution_count": 21, "id": "d0a91f04", "metadata": {}, "outputs": [], "source": [ "import math\n", "import pandas as pd\n", "import tensorflow as tf\n", "from tensorflow.keras import Model\n", "from tensorflow.keras import Sequential\n", "from tensorflow.keras.optimizers import Adam\n", "from sklearn.preprocessing import StandardScaler\n", "from tensorflow.keras.layers import Dense, Dropout\n", "from sklearn.model_selection import train_test_split\n", "from tensorflow.keras.losses import MeanSquaredLogarithmicError" ] }, { "cell_type": "code", "execution_count": 22, "id": "b7237fa1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Admin\\Documents\\final-project-flight-never-delay\\env\\Lib\\site-packages\\keras\\src\\layers\\core\\dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", " super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n" ] }, { "data": { "text/html": [ "
Model: \"sequential\"\n",
       "
\n" ], "text/plain": [ "\u001b[1mModel: \"sequential\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
       "┃ Layer (type)                     Output Shape                  Param # ┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
       "│ dense (Dense)                   │ (None, 128)            │        83,968 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_1 (Dense)                 │ (None, 256)            │        33,024 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dropout (Dropout)               │ (None, 256)            │             0 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_2 (Dense)                 │ (None, 64)             │        16,448 │\n",
       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
       "│ dense_3 (Dense)                 │ (None, 1)              │            65 │\n",
       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", "│ dense (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m83,968\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_1 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m33,024\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dropout (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_2 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m16,448\u001b[0m │\n", "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", "│ dense_3 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m65\u001b[0m │\n", "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Total params: 133,505 (521.50 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m133,505\u001b[0m (521.50 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Trainable params: 133,505 (521.50 KB)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m133,505\u001b[0m (521.50 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Non-trainable params: 0 (0.00 B)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "neural = Sequential()\n", "\n", "neural.add(Dense(128, kernel_initializer='normal',input_dim = delay_train_X.shape[1], activation='relu'))\n", "neural.add(Dense(256, kernel_initializer='normal',activation='relu'))\n", "neural.add(Dropout(0.2))\n", "neural.add(Dense(64, kernel_initializer='normal',activation='relu'))\n", "neural.add(Dense(1, kernel_initializer='normal',activation='linear'))\n", "\n", "neural.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])\n", "neural.summary()" ] }, { "cell_type": "code", "execution_count": 23, "id": "14d410ea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "\u001b[1m25689/25689\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m349s\u001b[0m 10ms/step - loss: 8421.3994 - mean_squared_error: 8421.3994 - val_loss: 8140.2700 - val_mean_squared_error: 8140.2700\n", "Epoch 2/5\n", "\u001b[1m25689/25689\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m154s\u001b[0m 6ms/step - loss: 8218.5732 - mean_squared_error: 8218.5732 - val_loss: 8155.1226 - val_mean_squared_error: 8155.1226\n", "Epoch 3/5\n", "\u001b[1m25689/25689\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m157s\u001b[0m 6ms/step - loss: 8051.7744 - mean_squared_error: 8051.7744 - val_loss: 8119.2329 - val_mean_squared_error: 8119.2329\n", "Epoch 4/5\n", "\u001b[1m25689/25689\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m220s\u001b[0m 7ms/step - loss: 8149.7393 - mean_squared_error: 8149.7393 - val_loss: 8103.2837 - val_mean_squared_error: 8103.2837\n", "Epoch 5/5\n", "\u001b[1m25689/25689\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m169s\u001b[0m 7ms/step - loss: 8080.3198 - 
mean_squared_error: 8080.3198 - val_loss: 8113.9531 - val_mean_squared_error: 8113.9531\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "neural.fit(delay_train_X.toarray() , np.array(delay_train_Y) , epochs=5, batch_size=32, validation_split = 0.2)" ] }, { "cell_type": "code", "execution_count": 24, "id": "7c2cdc90", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m3568/3568\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m18s\u001b[0m 5ms/step\n", "Root mean squared error: 91.42243092164492, Mean absolute error: 45.856728016541844\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import mean_absolute_error\n", "\n", "delay_test_predictions = neural.predict(delay_test_X)\n", "neural_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n", "neural_rmse = np.sqrt(neural_mse)\n", "neural_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n", "print(f'Root mean squared error: {neural_rmse}, Mean absolute error: {neural_mae}')" ] }, { "cell_type": "markdown", "id": "f8454baa", "metadata": {}, "source": [ "## Train Model: Cancel" ] }, { "cell_type": "markdown", "id": "ffc936ab", "metadata": {}, "source": [ "### Logistic Regression" ] }, { "cell_type": "code", "execution_count": 25, "id": "e6dfa3e0", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 26, "id": "d51db2cc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression(max_iter=1000)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf = LogisticRegression(max_iter=1000)\n", "clf.fit(cancel_train_X, cancel_train_Y)" ] }, { "cell_type": "code", "execution_count": 27, "id": "48bcecb1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.905833406323903\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.metrics import accuracy_score\n", "\n", "cancel_test_predictions = clf.predict(cancel_test_X)\n", "acc = accuracy_score(cancel_test_Y, cancel_test_predictions)\n", "\n", "print(f'Accuracy: {acc}')" ] }, { "cell_type": "code", "execution_count": 28, "id": "67fcaded", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.95112009, 0.04887991],\n", " [0.91754329, 0.08245671],\n", " [0.94939641, 0.05060359],\n", " ...,\n", " [0.91064547, 0.08935453],\n", " [0.7672516 , 0.2327484 ],\n", " [0.80113653, 0.19886347]])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.predict_proba(cancel_test_X)" ] }, { "cell_type": "markdown", "id": "47f9fa97", "metadata": {}, "source": [ "### Retrain on all data" ] }, { "cell_type": "code", "execution_count": 29, "id": "14f504ac", "metadata": {}, "outputs": [], "source": [ "clf = LogisticRegression(max_iter=1000)\n", "clf.fit(cancel_X, cancel_Y)\n", "\n", "filename = 'saved_models/cancel_log_reg.sav'\n", "pickle.dump(clf, open(filename, 'wb'))" ] }, { "cell_type": "code", "execution_count": 30, "id": "750e40a3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.905833406323903\n" ] } ], "source": [ "loaded_model = pickle.load(open(filename, 'rb'))\n", "\n", "cancel_test_predictions = loaded_model.predict(cancel_test_X)\n", "acc = accuracy_score(cancel_test_Y, cancel_test_predictions)\n", "\n", "print(f'Accuracy: {acc}')" ] }, { "cell_type": "code", 
"execution_count": 31, "id": "bb7020cb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.04856708387620738" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loaded_model.predict_proba(cancel_test_X)[0][1]" ] }, { "cell_type": "markdown", "id": "95f1b200", "metadata": {}, "source": [ "# Predict Delay Reason" ] }, { "cell_type": "code", "execution_count": 32, "id": "1f1ec790", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('data-coordinates.csv', dtype={'CANCELLATION_CODE': 'string'})" ] }, { "cell_type": "code", "execution_count": 33, "id": "5e49873c", "metadata": {}, "outputs": [], "source": [ "df = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', \n", " 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY']]\n" ] }, { "cell_type": "code", "execution_count": 34, "id": "b8f5fa24", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONTHDAY_OF_MONTHDAY_OF_WEEKOP_UNIQUE_CARRIEROP_CARRIER_FL_NUMORIGINDESTCRS_DEP_TIMECRS_ARR_TIMECARRIER_DELAYWEATHER_DELAYNAS_DELAYSECURITY_DELAYLATE_AIRCRAFT_DELAYARR_DELAY
01177DL1114BOSTPA154719070.00.020.00.00.020.0
11177DL1126SDFATL15411705267.00.00.00.00.0267.0
21177DL1173SRQMSP1410164946.00.00.00.00.046.0
31177DL1205MSPBWI184021530.00.019.00.00.019.0
41177DL1216ORDATL80511040.00.032.00.00.032.0
\n", "
" ], "text/plain": [ " MONTH DAY_OF_MONTH DAY_OF_WEEK OP_UNIQUE_CARRIER OP_CARRIER_FL_NUM \\\n", "0 1 17 7 DL 1114 \n", "1 1 17 7 DL 1126 \n", "2 1 17 7 DL 1173 \n", "3 1 17 7 DL 1205 \n", "4 1 17 7 DL 1216 \n", "\n", " ORIGIN DEST CRS_DEP_TIME CRS_ARR_TIME CARRIER_DELAY WEATHER_DELAY \\\n", "0 BOS TPA 1547 1907 0.0 0.0 \n", "1 SDF ATL 1541 1705 267.0 0.0 \n", "2 SRQ MSP 1410 1649 46.0 0.0 \n", "3 MSP BWI 1840 2153 0.0 0.0 \n", "4 ORD ATL 805 1104 0.0 0.0 \n", "\n", " NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY ARR_DELAY \n", "0 20.0 0.0 0.0 20.0 \n", "1 0.0 0.0 0.0 267.0 \n", "2 0.0 0.0 0.0 46.0 \n", "3 19.0 0.0 0.0 19.0 \n", "4 32.0 0.0 0.0 32.0 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 35, "id": "10b500d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MONTH int64\n", "DAY_OF_MONTH int64\n", "DAY_OF_WEEK int64\n", "OP_UNIQUE_CARRIER object\n", "OP_CARRIER_FL_NUM int64\n", "ORIGIN object\n", "DEST object\n", "CRS_DEP_TIME int64\n", "CRS_ARR_TIME int64\n", "CARRIER_DELAY float64\n", "WEATHER_DELAY float64\n", "NAS_DELAY float64\n", "SECURITY_DELAY float64\n", "LATE_AIRCRAFT_DELAY float64\n", "ARR_DELAY float64\n", "dtype: object" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 36, "id": "4b6b5151", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',\n", " 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME',\n", " 'CRS_ARR_TIME', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',\n", " 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'ARR_DELAY'], dtype=object)" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.values" ] }, { "cell_type": "code", "execution_count": 37, "id": "b10d9cab", "metadata": {}, "outputs": [ { 
"data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MONTHDAY_OF_MONTHDAY_OF_WEEKOP_UNIQUE_CARRIEROP_CARRIER_FL_NUMORIGINDESTCRS_DEP_TIMECRS_ARR_TIMECARRIER_DELAYWEATHER_DELAYNAS_DELAYSECURITY_DELAYLATE_AIRCRAFT_DELAYARR_DELAY
01177DL1114BOSTPA154719070.00.020.00.00.020.0
11177DL1126SDFATL15411705267.00.00.00.00.0267.0
21177DL1173SRQMSP1410164946.00.00.00.00.046.0
31177DL1205MSPBWI184021530.00.019.00.00.019.0
41177DL1216ORDATL80511040.00.032.00.00.032.0
\n", "
" ], "text/plain": [ " MONTH DAY_OF_MONTH DAY_OF_WEEK OP_UNIQUE_CARRIER OP_CARRIER_FL_NUM \\\n", "0 1 17 7 DL 1114 \n", "1 1 17 7 DL 1126 \n", "2 1 17 7 DL 1173 \n", "3 1 17 7 DL 1205 \n", "4 1 17 7 DL 1216 \n", "\n", " ORIGIN DEST CRS_DEP_TIME CRS_ARR_TIME CARRIER_DELAY WEATHER_DELAY \\\n", "0 BOS TPA 1547 1907 0.0 0.0 \n", "1 SDF ATL 1541 1705 267.0 0.0 \n", "2 SRQ MSP 1410 1649 46.0 0.0 \n", "3 MSP BWI 1840 2153 0.0 0.0 \n", "4 ORD ATL 805 1104 0.0 0.0 \n", "\n", " NAS_DELAY SECURITY_DELAY LATE_AIRCRAFT_DELAY ARR_DELAY \n", "0 20.0 0.0 0.0 20.0 \n", "1 0.0 0.0 0.0 267.0 \n", "2 0.0 0.0 0.0 46.0 \n", "3 19.0 0.0 0.0 19.0 \n", "4 32.0 0.0 0.0 32.0 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 38, "id": "5482e792", "metadata": {}, "outputs": [], "source": [ "df = df.dropna(subset=['ARR_DELAY'])\n", "delay_Y = df['ARR_DELAY'].copy()\n", "df = df.drop(['ARR_DELAY'], axis=1)" ] }, { "cell_type": "code", "execution_count": 39, "id": "89ee251f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1034569, 9)\n" ] } ], "source": [ "delay_X = df.drop(['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'], axis=1)\n", "\n", "print(delay_X.shape)\n", "delay_carrier_Y = df[\"CARRIER_DELAY\"].copy()\n", "delay_weather_Y = df[\"WEATHER_DELAY\"].copy()\n", "delay_nas_Y = df[\"NAS_DELAY\"].copy()\n", "delay_security_Y = df[\"SECURITY_DELAY\"].copy()\n", "delay_late_aircraft_Y = df[\"LATE_AIRCRAFT_DELAY\"].copy()" ] }, { "cell_type": "code", "execution_count": 40, "id": "9cf7780b", "metadata": {}, "outputs": [], "source": [ "num_features = list(delay_X.drop([\"OP_UNIQUE_CARRIER\", \"ORIGIN\", \"DEST\"], axis=1))\n", "cat_features = [\"OP_UNIQUE_CARRIER\", \"ORIGIN\", \"DEST\"]\n", "\n", "full_pipeline = ColumnTransformer([\n", " (\"num\", StandardScaler(), num_features),\n", " (\"cat\", OneHotEncoder(), 
cat_features)\n", "])" ] }, { "cell_type": "code", "execution_count": 41, "id": "94094eb9", "metadata": {}, "outputs": [], "source": [ "delay_X = full_pipeline.fit_transform(delay_X)" ] }, { "cell_type": "code", "execution_count": 42, "id": "1011cecc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1034569, 655)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "delay_X.shape" ] }, { "cell_type": "markdown", "id": "b25cac0b", "metadata": {}, "source": [ "# Weather delay" ] }, { "cell_type": "markdown", "id": "e4e3ab1d", "metadata": {}, "source": [ "### Linear Regression" ] }, { "cell_type": "code", "execution_count": 43, "id": "dc8cbc21", "metadata": {}, "outputs": [], "source": [ "delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \\\n", " = train_test_split(delay_X, delay_weather_Y, test_size=0.1)" ] }, { "cell_type": "code", "execution_count": 44, "id": "af48c608", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LinearRegression\n", "\n", "# Option 1: Remove rows with NaN values in `delay_train_Y`\n", "valid_indices = ~np.isnan(delay_train_Y)\n", "delay_train_X = delay_train_X[valid_indices]\n", "delay_train_Y = delay_train_Y[valid_indices]" ] }, { "cell_type": "code", "execution_count": 45, "id": "b951a132", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LinearRegression()" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lin_reg = LinearRegression()\n", "lin_reg.fit(delay_train_X, delay_train_Y)" ] }, { "cell_type": "code", "execution_count": 46, "id": "a081986a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NaN values in delay_test_Y: 0\n", "NaN values in delay_test_predictions: 0\n", "Root mean squared error: 35.957841679118246, Mean absolute error: 8.61170248980199\n" ] } ], "source": [ "import numpy as np\n", "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", "\n", "# Predict the test set\n", "delay_test_predictions = lin_reg.predict(delay_test_X)\n", "\n", "# Check for NaN in delay_test_Y\n", "print(\"NaN values in delay_test_Y:\", np.isnan(delay_test_Y).sum())\n", "\n", "# Remove rows with NaN in delay_test_Y\n", "valid_indices_Y = ~np.isnan(delay_test_Y)\n", "delay_test_Y = delay_test_Y[valid_indices_Y]\n", "delay_test_X = delay_test_X[valid_indices_Y] # Update X to match cleaned Y\n", "\n", "# Predict again with cleaned data\n", "delay_test_predictions = lin_reg.predict(delay_test_X)\n", "\n", "# Check for NaN in delay_test_predictions\n", "print(\"NaN values in delay_test_predictions:\", np.isnan(delay_test_predictions).sum())\n", "\n", "# Remove rows with NaN in predictions\n", "valid_indices_pred = ~np.isnan(delay_test_predictions)\n", "delay_test_Y = delay_test_Y[valid_indices_pred]\n", "delay_test_predictions = delay_test_predictions[valid_indices_pred]\n", "\n", "# Now calculate the metrics\n", "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n", "lin_rmse = np.sqrt(lin_mse)\n", "lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n", "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')\n" ] }, { "cell_type": "markdown", "id": "0864b592", "metadata": {}, "source": [ "### Retrain on all data" ] }, { "cell_type": 
"code", "execution_count": 47, "id": "f6b1a71d", "metadata": {}, "outputs": [], "source": [
  "# Mean-impute missing weather-delay targets. np.asarray accepts both the\n",
  "# original pandas object and the already-imputed array, so re-running this\n",
  "# cell is safe.\n",
  "imputer = SimpleImputer(strategy='mean')\n",
  "delay_weather_Y = imputer.fit_transform(np.asarray(delay_weather_Y).reshape(-1, 1)).ravel()\n",
  "\n",
  "lin_reg = LinearRegression()\n",
  "lin_reg.fit(delay_X, delay_weather_Y)\n",
  "\n",
  "# The context manager closes (and flushes) the file once the model is written.\n",
  "filename = 'saved_models/delay_weather_lin_reg.sav'\n",
  "with open(filename, 'wb') as model_file:\n",
  "    pickle.dump(lin_reg, model_file)"
] },
{ "cell_type": "code", "execution_count": 48, "id": "2d802ec4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Root mean squared error: 35.88746420327346, Mean absolute error: 8.608681302973599\n" ] } ], "source": [
  "with open(filename, 'rb') as model_file:\n",
  "    loaded_model = pickle.load(model_file)\n",
  "\n",
  "delay_test_predictions = loaded_model.predict(delay_test_X)\n",
  "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n",
  "lin_rmse = np.sqrt(lin_mse)\n",
  "lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n",
  "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')"
] },
{ "cell_type": "markdown", "id": "f55b409f", "metadata": {}, "source": [ "# Carrier delay" ] },
{ "cell_type": "markdown", "id": "06130c89", "metadata": {}, "source": [ "### Linear Regression" ] },
{ "cell_type": "code", "execution_count": 49, "id": "1bfc8368", "metadata": {}, "outputs": [], "source": [
  "# Hold out 10% of the rows paired with the carrier-delay target, so the\n",
  "# evaluation below scores the carrier model on the quantity it predicts.\n",
  "# A fixed seed keeps the split reproducible.\n",
  "delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \\\n",
  "    = train_test_split(delay_X, delay_carrier_Y, test_size=0.1, random_state=42)"
] },
{ "cell_type": "code", "execution_count": 50, "id": "4025a6bb", "metadata": {}, "outputs": [], "source": [
  "# Mean-impute missing carrier-delay targets (np.asarray keeps this re-runnable).\n",
  "imputer = SimpleImputer(strategy='mean')\n",
  "delay_carrier_Y = imputer.fit_transform(np.asarray(delay_carrier_Y).reshape(-1, 1)).ravel()\n",
  "\n",
  "# The saved model is fit on all rows.\n",
  "lin_reg = LinearRegression()\n",
  "lin_reg.fit(delay_X, delay_carrier_Y)\n",
  "\n",
  "filename = 'saved_models/delay_carrier_lin_reg.sav'\n",
  "with open(filename, 'wb') as model_file:\n",
  "    pickle.dump(lin_reg, model_file)"
] },
{ "cell_type": "code", "execution_count": null, "id": "939d79a1", "metadata": {}, "outputs": [], "source": [
  "with open(filename, 'rb') as model_file:\n",
  "    loaded_model = pickle.load(model_file)\n",
  "\n",
  "# Score only held-out rows whose carrier delay is known. The saved model was\n",
  "# fit on all rows, these included, so this checks the pickle round-trip\n",
  "# rather than out-of-sample accuracy.\n",
  "known_target = ~np.isnan(delay_test_Y)\n",
  "delay_test_X = delay_test_X[known_target]\n",
  "delay_test_Y = delay_test_Y[known_target]\n",
  "\n",
  "delay_test_predictions = loaded_model.predict(delay_test_X)\n",
  "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n",
  "lin_rmse = np.sqrt(lin_mse)\n",
  "lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n",
  "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')"
] },
{ "cell_type": "markdown", "id": "63cf8e1c", "metadata": {}, "source": [ "# NAS delay" ] },
{ "cell_type": "markdown", "id": "de91752d", "metadata": {}, "source": [ "### Linear Regression" ] },
{ "cell_type": "code", "execution_count": 52, "id": "5521dfd4", "metadata": {}, "outputs": [], "source": [
  "# Hold out 10% of the rows paired with the NAS-delay target, so the\n",
  "# evaluation below scores the NAS model on the quantity it predicts.\n",
  "# A fixed seed keeps the split reproducible.\n",
  "delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \\\n",
  "    = train_test_split(delay_X, delay_nas_Y, test_size=0.1, random_state=42)"
] },
{ "cell_type": "code", "execution_count": 53, "id": "91c8a921", "metadata": {}, "outputs": [], "source": [
  "# Mean-impute missing NAS-delay targets (np.asarray keeps this re-runnable).\n",
  "imputer = SimpleImputer(strategy='mean')\n",
  "delay_nas_Y = imputer.fit_transform(np.asarray(delay_nas_Y).reshape(-1, 1)).ravel()\n",
  "\n",
  "# The saved model is fit on all rows.\n",
  "lin_reg = LinearRegression()\n",
  "lin_reg.fit(delay_X, delay_nas_Y)\n",
  "\n",
  "filename = 'saved_models/delay_nas_lin_reg.sav'\n",
  "with open(filename, 'wb') as model_file:\n",
  "    pickle.dump(lin_reg, model_file)"
] },
{ "cell_type": "code", "execution_count": null, "id": "541c97bc", "metadata": {}, "outputs": [], "source": [
  "with open(filename, 'rb') as model_file:\n",
  "    loaded_model = pickle.load(model_file)\n",
  "\n",
  "# Score only held-out rows whose NAS delay is known. The saved model was fit\n",
  "# on all rows, these included, so this checks the pickle round-trip rather\n",
  "# than out-of-sample accuracy.\n",
  "known_target = ~np.isnan(delay_test_Y)\n",
  "delay_test_X = delay_test_X[known_target]\n",
  "delay_test_Y = delay_test_Y[known_target]\n",
  "\n",
  "delay_test_predictions = loaded_model.predict(delay_test_X)\n",
  "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n",
  "lin_rmse = np.sqrt(lin_mse)\n", 
"lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n", "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')" ] },
{ "cell_type": "markdown", "id": "b8cfb1ac", "metadata": {}, "source": [ "# Security delay" ] },
{ "cell_type": "markdown", "id": "dee91d12", "metadata": {}, "source": [ "### Linear Regression" ] },
{ "cell_type": "code", "execution_count": 55, "id": "9190679b", "metadata": {}, "outputs": [], "source": [
  "# Hold out 10% of the rows paired with the security-delay target.\n",
  "# A fixed seed keeps the split reproducible.\n",
  "delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \\\n",
  "    = train_test_split(delay_X, delay_security_Y, test_size=0.1, random_state=42)"
] },
{ "cell_type": "code", "execution_count": 56, "id": "04d30dda", "metadata": {}, "outputs": [], "source": [
  "# Mean-impute missing security-delay targets. np.asarray accepts both the\n",
  "# original pandas object and the already-imputed array, so re-running this\n",
  "# cell is safe.\n",
  "imputer = SimpleImputer(strategy='mean')\n",
  "delay_security_Y = imputer.fit_transform(np.asarray(delay_security_Y).reshape(-1, 1)).ravel()\n",
  "\n",
  "# The saved model is fit on all rows.\n",
  "lin_reg = LinearRegression()\n",
  "lin_reg.fit(delay_X, delay_security_Y)\n",
  "\n",
  "# The context manager closes (and flushes) the file once the model is written.\n",
  "filename = 'saved_models/delay_security_lin_reg.sav'\n",
  "with open(filename, 'wb') as model_file:\n",
  "    pickle.dump(lin_reg, model_file)"
] },
{ "cell_type": "code", "execution_count": 57, "id": "39464110", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NaNs in delay_test_Y: 0\n", "NaNs in delay_test_predictions: 0\n", "MSE: 11.504999499784063\n", "RMSE: 3.3919020474925365\n", "MAE: 0.4101716247563961\n" ] } ], "source": [
  "# Reload the security-delay model saved by the previous cell.\n",
  "with open(filename, 'rb') as model_file:\n",
  "    loaded_model = pickle.load(model_file)\n",
  "\n",
  "delay_test_predictions = loaded_model.predict(delay_test_X)\n",
  "\n",
  "# Report, then exclude, rows where either the target or the prediction is NaN.\n",
  "print(\"NaNs in delay_test_Y:\", np.isnan(delay_test_Y).sum())\n",
  "print(\"NaNs in delay_test_predictions:\", np.isnan(delay_test_predictions).sum())\n",
  "mask = ~np.isnan(delay_test_Y) & ~np.isnan(delay_test_predictions)\n",
  "delay_test_Y_clean = delay_test_Y[mask]\n",
  "delay_test_predictions_clean = delay_test_predictions[mask]\n",
  "\n",
  "lin_mse = mean_squared_error(delay_test_Y_clean, delay_test_predictions_clean)\n",
  "lin_rmse = np.sqrt(lin_mse)\n",
  "lin_mae = mean_absolute_error(delay_test_Y_clean, delay_test_predictions_clean)\n",
  "\n",
  "print(f'MSE: {lin_mse}')\n",
  "print(f'RMSE: {lin_rmse}')\n",
  "print(f'MAE: {lin_mae}')"
] },
{ "cell_type": "markdown", "id": "9ba985d5", "metadata": {}, "source": [ "# Late aircraft delay" ] },
{ "cell_type": "markdown", "id": "7db45f5a", "metadata": {}, "source": [ "### Linear Regression" ] },
{ "cell_type": "code", "execution_count": 58, "id": "15fd1416", "metadata": {}, "outputs": [], "source": [
  "# Hold out 10% of the rows paired with the late-aircraft-delay target, so the\n",
  "# evaluation below scores the late-aircraft model on the quantity it predicts.\n",
  "# A fixed seed keeps the split reproducible.\n",
  "delay_train_X, delay_test_X, delay_train_Y, delay_test_Y \\\n",
  "    = train_test_split(delay_X, delay_late_aircraft_Y, test_size=0.1, random_state=42)"
] },
{ "cell_type": "code", "execution_count": 59, "id": "6d129bb4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NaNs in delay_late_aircraft_Y: 0\n", "Model saved successfully.\n" ] } ], "source": [
  "# Rows with a missing target are dropped here rather than imputed.\n",
  "print(\"NaNs in delay_late_aircraft_Y:\", np.isnan(delay_late_aircraft_Y).sum())\n",
  "mask = ~np.isnan(delay_late_aircraft_Y)\n",
  "delay_X_clean = delay_X[mask]\n",
  "delay_late_aircraft_Y_clean = delay_late_aircraft_Y[mask]\n",
  "\n",
  "lin_reg = LinearRegression()\n",
  "lin_reg.fit(delay_X_clean, delay_late_aircraft_Y_clean)\n",
  "\n",
  "# The context manager closes (and flushes) the file once the model is written.\n",
  "filename = 'saved_models/delay_late_aircraft_lin_reg.sav'\n",
  "with open(filename, 'wb') as model_file:\n",
  "    pickle.dump(lin_reg, model_file)\n",
  "\n",
  "print(\"Model saved successfully.\")"
] },
{ "cell_type": "code", "execution_count": null, "id": "1082c822", "metadata": {}, "outputs": [], "source": [
  "with open(filename, 'rb') as model_file:\n",
  "    loaded_model = pickle.load(model_file)\n",
  "\n",
  "# Score only held-out rows whose late-aircraft delay is known. The saved model\n",
  "# was fit on every row with a known target, these included, so this checks the\n",
  "# pickle round-trip rather than out-of-sample accuracy.\n",
  "known_target = ~np.isnan(delay_test_Y)\n",
  "delay_test_X = delay_test_X[known_target]\n",
  "delay_test_Y = delay_test_Y[known_target]\n",
  "\n",
  "delay_test_predictions = loaded_model.predict(delay_test_X)\n",
  "lin_mse = mean_squared_error(delay_test_Y, delay_test_predictions)\n",
  "lin_rmse = np.sqrt(lin_mse)\n",
  "lin_mae = mean_absolute_error(delay_test_Y, delay_test_predictions)\n",
  "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')"
] },
{ "cell_type": "markdown", "id": "607e4aea", "metadata": {}, "source": [ "# Predict" ] },
{ "cell_type": "code", "execution_count": 61, "id": "cdbe5d2c", "metadata": {}, "outputs": [], "source": [
  "def load_model(path):\n",
  "    \"\"\"Unpickle the model stored at `path`, closing the file afterwards.\n",
  "\n",
  "    Only use this on files written by this notebook: unpickling untrusted\n",
  "    data can execute arbitrary code.\n",
  "    \"\"\"\n",
  "    with open(path, 'rb') as model_file:\n",
  "        return pickle.load(model_file)\n",
  "\n",
  "\n",
  "weather_filename = 'saved_models/delay_weather_lin_reg.sav'\n",
  "carrier_filename = 'saved_models/delay_carrier_lin_reg.sav'\n",
  "nas_filename = 'saved_models/delay_nas_lin_reg.sav'\n",
  "security_filename = 'saved_models/delay_security_lin_reg.sav'\n",
  "late_aircraft_filename = 'saved_models/delay_late_aircraft_lin_reg.sav'\n",
  "\n",
  "weather_loaded_model = load_model(weather_filename)\n",
  "carrier_loaded_model = load_model(carrier_filename)\n",
  "nas_loaded_model = load_model(nas_filename)\n",
  "security_loaded_model = load_model(security_filename)\n",
  "late_aircraft_loaded_model = load_model(late_aircraft_filename)"
] },
{ "cell_type": "code", "execution_count": 62, "id": "950ef1d5", "metadata": {}, "outputs": [], "source": [ "arr_delay_prediction = weather_loaded_model.predict(delay_X) + \\\n", " carrier_loaded_model.predict(delay_X) + \\\n", " nas_loaded_model.predict(delay_X) + \\\n", " security_loaded_model.predict(delay_X) + \\\n", " late_aircraft_loaded_model.predict(delay_X)" ] },
{ "cell_type": "code", "execution_count": 63, "id": "26c130a2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Root mean squared error: 93.2178331053021, Mean absolute error: 48.62179687850236\n" ] } ], "source": [ "lin_mse = mean_squared_error(delay_Y, arr_delay_prediction)\n", "lin_rmse = np.sqrt(lin_mse)\n", "lin_mae = mean_absolute_error(delay_Y, arr_delay_prediction)\n", "print(f'Root mean squared error: {lin_rmse}, Mean absolute error: {lin_mae}')" ] },
{ "cell_type": "code", "execution_count": null, "id": "58f184d2", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }