diff --git "a/training/src/.ipynb_checkpoints/MSML-602-Final-Project-checkpoint.ipynb" "b/training/src/.ipynb_checkpoints/MSML-602-Final-Project-checkpoint.ipynb"
new file mode 100644--- /dev/null
+++ "b/training/src/.ipynb_checkpoints/MSML-602-Final-Project-checkpoint.ipynb"
@@ -0,0 +1,1085 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "fb7daf67-de7e-4626-a194-417aa210c959",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "!pip install keras tensorflow seaborn requests-cache"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c005096b-eaca-4244-998d-a92338d22902",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sqlite3\n",
+ "import IPython\n",
+ "import IPython.display\n",
+ "import matplotlib as mpl\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import tensorflow as tf\n",
+ "import requests\n",
+ "import requests_cache\n",
+ "from requests_cache.backends.sqlite import SQLiteCache\n",
+ "import sqlite3\n",
+ "import datetime\n",
+ "from datetime import date, timedelta, timezone\n",
+ "import time\n",
+ "import pytz\n",
+ "\n",
+ "local_tz = pytz.timezone('America/New_York')\n",
+ "\n",
+ "\n",
+ "mpl.rcParams['figure.figsize'] = (8, 6)\n",
+ "mpl.rcParams['axes.grid'] = False\n",
+ "\n",
+ "# initialize cache\n",
+ "requests_cache.install_cache('./data/weather_api_cache')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "2e054aef-ce49-47a8-ba20-dc2e6928600e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting Date: 2022-12-07\n",
+ "--------------------------------\n",
+ "Already up to date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " temp | \n",
+ " obs_timestamp | \n",
+ " pressure | \n",
+ " wspd | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 50.0 | \n",
+ " 2000-01-02 00:51:00 | \n",
+ " 30.22 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 45.0 | \n",
+ " 2000-01-02 01:51:00 | \n",
+ " 30.21 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 47.0 | \n",
+ " 2000-01-02 02:51:00 | \n",
+ " 30.21 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 46.0 | \n",
+ " 2000-01-02 03:51:00 | \n",
+ " 30.21 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 47.0 | \n",
+ " 2000-01-02 04:51:00 | \n",
+ " 30.20 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " temp obs_timestamp pressure wspd\n",
+ "0 50.0 2000-01-02 00:51:00 30.22 NaN\n",
+ "1 45.0 2000-01-02 01:51:00 30.21 5.0\n",
+ "2 47.0 2000-01-02 02:51:00 30.21 5.0\n",
+ "3 46.0 2000-01-02 03:51:00 30.21 NaN\n",
+ "4 47.0 2000-01-02 04:51:00 30.20 5.0"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "# Read the key from the API_KEY environment variable when it is set, so the credential\n",
+ "# does not have to live in the notebook. The literal is only a fallback that keeps\n",
+ "# existing runs working. NOTE(review): rotate this key and drop the fallback.\n",
+ "DATE_FORMAT = \"%Y-%m-%d %H:%M:%S\"\n",
+ "API_KEY = os.environ.get(\"API_KEY\", \"e1f10a1e78da46f5b10a1e78da96f525\")\n",
+ "BASE_URL = \"https://api.weather.com/v1/location/KDCA:9:US/observations/historical.json?apiKey={api_key}&units=e&startDate={start_date}&endDate={end_date}\"\n",
+ "\n",
+ "# Field descriptions here\n",
+ "# https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/511371/1/LapamonpinyoEtAl_engrXiv_2021.pdf \n",
+ "# Sample\n",
+ "# {\n",
+ "# 'key': 'KDCA', 'class': 'observation', 'expire_time_gmt': 946709460, 'obs_id': 'KDCA', 'obs_name': 'Washington/Natl', \n",
+ "# 'valid_time_gmt': 946702260, 'day_ind': 'N', 'temp': 41, 'wx_icon': 33, 'icon_extd': 3300, 'wx_phrase': 'Fair', 'pressure_tend': None,\n",
+ "# 'pressure_desc': None, 'dewPt': 34, 'heat_index': 41, 'rh': 76, 'pressure': 30.19, 'vis': 5, 'wc': 41, 'wdir': None, \n",
+ "# 'wdir_cardinal': 'CALM', 'gust': None, 'wspd': None, 'max_temp': 57, 'min_temp': 41, 'precip_total': None, \n",
+ "# 'precip_hrly': None, 'snow_hrly': None, 'uv_desc': 'Low', 'feels_like': 41, 'uv_index': 0, 'qualifier': None, 'qualifier_svrty': None,\n",
+ "# 'blunt_phrase': None, 'terse_phrase': None, 'clds': 'CLR', 'water_temp': None, 'primary_wave_period': None, 'primary_wave_height': None, \n",
+ "# 'primary_swell_period': None, 'primary_swell_height': None, 'primary_swell_direction': None, 'secondary_swell_period': None, \n",
+ "# 'secondary_swell_height': None, 'secondary_swell_direction': None\n",
+ "# }\n",
+ "\n",
+ "conn = sqlite3.connect(\"./data/weather-raw.db\")\n",
+ "cur = conn.cursor()\n",
+ "\n",
+ "\n",
+ "def create_weather_table(cur, table_name):\n",
+ "    \"\"\"Create the observation table and its lookup indexes if they are missing.\n",
+ "\n",
+ "    Safe to call repeatedly: every statement uses IF NOT EXISTS.\n",
+ "\n",
+ "    Args:\n",
+ "        cur: sqlite3 cursor the DDL statements are executed on.\n",
+ "        table_name: table to create. It is formatted into the SQL as an identifier,\n",
+ "            so it must be a trusted, valid SQLite name.\n",
+ "    \"\"\"\n",
+ "    # NOTE: the column is spelled precip_hourly while the API field is precip_hrly;\n",
+ "    # kept as-is so databases created by earlier runs stay compatible.\n",
+ "    cur.execute(\"\"\"\n",
+ "        CREATE TABLE IF NOT EXISTS {table_name}(\n",
+ "            key, class, expire_time_gmt, obs_id, obs_name, valid_time_gmt INTEGER NOT NULL PRIMARY KEY, day_ind, temp, wx_icon, icon_extd, \n",
+ "            wx_phrase, pressure_tend, pressure_desc, dewPt, heat_index, rh, pressure, vis, wc, wdir, wdir_cardinal, \n",
+ "            gust, wspd, max_temp, min_temp, precip_total, precip_hourly, snow_hrly, uv_desc, feels_like, uv_index,\n",
+ "            qualifier, qualifier_svrty, blunt_phrase, terse_phrase, clds, water_temp, primary_wave_period, \n",
+ "            primary_wave_height, primary_swell_period, primary_swell_height, primary_swell_direction, \n",
+ "            secondary_swell_period, secondary_swell_height, secondary_swell_direction, obs_timestamp)\n",
+ "    \"\"\".format(table_name=table_name))\n",
+ "    # Index the table that was just created (not a hard-coded one) and put the table\n",
+ "    # name in each index name, because SQLite index names are unique per database.\n",
+ "    # Expressions use the bare column name: CREATE INDEX has no table alias to refer to.\n",
+ "    index_expressions = {\n",
+ "        \"obs_timestamp\": \"obs_timestamp\",\n",
+ "        \"obs_minute\": \"STRFTIME('%M', obs_timestamp)\",  # %M is minutes, not month\n",
+ "        \"obs_date\": \"STRFTIME('%Y-%m-%d', obs_timestamp)\",\n",
+ "    }\n",
+ "    for suffix, expression in index_expressions.items():\n",
+ "        cur.execute(\n",
+ "            \"CREATE INDEX IF NOT EXISTS idx_{table_name}_{suffix} ON {table_name}({expression})\".format(\n",
+ "                table_name=table_name, suffix=suffix, expression=expression\n",
+ "            )\n",
+ "        )\n",
+ " \n",
+ "# Create the table for the raw observations (if it doesn't exist already)\n",
+ "create_weather_table(cur, \"weather_raw\")\n",
+ "\n",
+ "\n",
+ "# Resume from the date of the newest stored observation; an empty table starts at 2000-01-01.\n",
+ "cur.execute(\"SELECT MAX(obs_timestamp) FROM weather_raw\")\n",
+ "(max_date,) = cur.fetchone()\n",
+ "if max_date:\n",
+ "    target_date = datetime.datetime.strptime(max_date, DATE_FORMAT).date()\n",
+ "else:\n",
+ "    target_date = date(2000, 1, 1)\n",
+ "\n",
+ "print(f\"Starting Date: {target_date}\") \n",
+ "print(\"--------------------------------\")\n",
+ "defaults = {\n",
+ " 'key': None,'class': None,'expire_time_gmt': None,'obs_id': None,'obs_name': None,'valid_time_gmt': None,\n",
+ " 'day_ind': None,'temp': None,'wx_icon': None,'icon_extd': None,'wx_phrase': None,'pressure_tend': None,\n",
+ " 'pressure_desc': None,'dewPt': None,'heat_index': None,'rh': None,'pressure': None,'vis': None,'wc': None,\n",
+ " 'wdir': None,'wdir_cardinal': None,'gust': None,'wspd': None,'max_temp': None,'min_temp': None,'precip_total': None,\n",
+ " 'precip_hrly': None,'snow_hrly': None,'uv_desc': None,'feels_like': None,'uv_index': None,'qualifier': None,\n",
+ " 'qualifier_svrty': None,'blunt_phrase': None,'terse_phrase': None,'clds': None,'water_temp': None,\n",
+ " 'primary_wave_period': None,'primary_wave_height': None,'primary_swell_period': None,'primary_swell_height': None,\n",
+ " 'primary_swell_direction': None,'secondary_swell_period': None,'secondary_swell_height': None,'secondary_swell_direction': None\n",
+ "}\n",
+ "\n",
+ "\n",
+ "INSERT_SQL = \"\"\"\n",
+ "INSERT OR IGNORE INTO weather_raw VALUES (\n",
+ " :key, :class, :expire_time_gmt, :obs_id, :obs_name, :valid_time_gmt, :day_ind, :temp, :wx_icon, :icon_extd, :wx_phrase,\n",
+ " :pressure_tend, :pressure_desc, :dewPt, :heat_index, :rh, :pressure, :vis, :wc, :wdir, :wdir_cardinal,\n",
+ " :gust, :wspd, :max_temp, :min_temp, :precip_total, :precip_hrly, :snow_hrly, :uv_desc, :feels_like, :uv_index,\n",
+ " :qualifier, :qualifier_svrty, :blunt_phrase, :terse_phrase, :clds, :water_temp, :primary_wave_period,\n",
+ " :primary_wave_height, :primary_swell_period, :primary_swell_height, :primary_swell_direction,\n",
+ " :secondary_swell_period, :secondary_swell_height, :secondary_swell_direction, :obs_timestamp\n",
+ ")\n",
+ "\"\"\"\n",
+ "\n",
+ "def augment_data(rec):\n",
+ "    \"\"\"Add a formatted obs_timestamp string to an observation record.\n",
+ "\n",
+ "    The record is modified in place and also returned so the function can be mapped\n",
+ "    over a sequence of records.\n",
+ "    \"\"\"\n",
+ "    # fromtimestamp() without a tz argument converts to the machine's local time zone.\n",
+ "    # NOTE(review): local_tz is defined next to the imports but not used here - confirm\n",
+ "    # which zone obs_timestamp is meant to be in.\n",
+ "    observed_at = datetime.datetime.fromtimestamp(rec[\"valid_time_gmt\"])\n",
+ "    rec[\"obs_timestamp\"] = observed_at.strftime(DATE_FORMAT)\n",
+ "    return rec\n",
+ "\n",
+ "# Download one day per request, from target_date up to (but not including) today.\n",
+ "today = datetime.date.today()\n",
+ "if target_date >= today:\n",
+ "    print(\"Already up to date!\")\n",
+ "\n",
+ "# `<` rather than `!=`: a start date that is already past today must not loop forever.\n",
+ "while target_date < today:\n",
+ "    fetched_date = target_date\n",
+ "    # Advance first so that every exit path of this iteration moves on to the next day.\n",
+ "    target_date = fetched_date + timedelta(days=1)\n",
+ "\n",
+ "    # A single day is requested, so startDate and endDate are the same day.\n",
+ "    day_str = fetched_date.strftime(\"%Y%m%d\")\n",
+ "    target_url = BASE_URL.format(api_key=API_KEY, start_date=day_str, end_date=day_str)\n",
+ "    res = requests.get(target_url)\n",
+ "\n",
+ "    data = res.json()\n",
+ "    if \"observations\" not in data:\n",
+ "        # Report the day that was actually requested, not the next one.\n",
+ "        print(f\"Date: {fetched_date} NF\", end=\"\\r\")\n",
+ "        continue\n",
+ "\n",
+ "    # Fill fields missing from the response with their defaults so every named\n",
+ "    # parameter of INSERT_SQL is bound, then add the derived obs_timestamp.\n",
+ "    params = [\n",
+ "        augment_data({k: d.get(k, defaults[k]) for k in defaults})\n",
+ "        for d in data[\"observations\"]\n",
+ "    ]\n",
+ "\n",
+ "    cur.executemany(INSERT_SQL, params)\n",
+ "    conn.commit()\n",
+ "    # time.sleep(0.05)\n",
+ "    # was_cached = \"Cache HIT\" if res.from_cache else \"Cache MISS\"\n",
+ "    print(f\"Date: {fetched_date} OK\", end=\"\\r\")\n",
+ "\n",
+ "# Select the hourly (:51/:52) observations of every day that has a complete set of them.\n",
+ "# A day qualifies only with exactly 24 such rows AND 24 distinct hours: COUNT(*) alone\n",
+ "# would also accept a day where one hour is reported twice and another hour is missing.\n",
+ "SQL_CLEANED_DATA = \"\"\"\n",
+ "    WITH RECURSIVE generate_series(x) AS (\n",
+ "        SELECT 0\n",
+ "        UNION ALL\n",
+ "        SELECT x+1 FROM generate_series LIMIT 24\n",
+ "    ), distinct_dates AS (\n",
+ "        SELECT DISTINCT DATE(obs_timestamp) AS obs_date\n",
+ "        FROM weather_raw wr \n",
+ "        WHERE obs_timestamp >= '2000-01-01'\n",
+ "    ), hours AS (\n",
+ "        SELECT x AS hrs FROM generate_series\n",
+ "    )\n",
+ "    SELECT w.temp, w.obs_timestamp, w.pressure, w.wspd\n",
+ "    FROM weather_raw w \n",
+ "    JOIN (\n",
+ "        SELECT d.obs_date, COUNT(*)\n",
+ "        FROM distinct_dates d \n",
+ "        CROSS JOIN hours h \n",
+ "        INNER JOIN (SELECT * FROM weather_raw WHERE STRFTIME('%M', obs_timestamp) in ('51', '52')) wr \n",
+ "            ON DATE(STRFTIME('%Y-%m-%d', wr.obs_timestamp)) = d.obs_date \n",
+ "            AND CAST(STRFTIME('%H', wr.obs_timestamp) AS INTEGER) = h.hrs\n",
+ "        GROUP BY d.obs_date\n",
+ "        HAVING COUNT(*) = 24 AND COUNT(DISTINCT h.hrs) = 24\n",
+ "    ) d ON d.obs_date = STRFTIME('%Y-%m-%d', w.obs_timestamp) \n",
+ "    WHERE STRFTIME('%M', w.obs_timestamp) in ('51', '52');\n",
+ "\"\"\"\n",
+ "df = pd.read_sql(SQL_CLEANED_DATA, conn, parse_dates=[\"obs_timestamp\"]) \n",
+ "df.head() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7233dd6b-241b-46c6-a206-184b33d43792",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " temp | \n",
+ " pressure | \n",
+ " wspd | \n",
+ " day_sin | \n",
+ " day_cos | \n",
+ " year_sin | \n",
+ " year_cos | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 45.0 | \n",
+ " 30.21 | \n",
+ " 5.0 | \n",
+ " 0.465615 | \n",
+ " 0.884988 | \n",
+ " 0.013798 | \n",
+ " 0.999905 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 47.0 | \n",
+ " 30.21 | \n",
+ " 5.0 | \n",
+ " 0.678801 | \n",
+ " 0.734323 | \n",
+ " 0.014514 | \n",
+ " 0.999895 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 47.0 | \n",
+ " 30.20 | \n",
+ " 5.0 | \n",
+ " 0.955020 | \n",
+ " 0.296542 | \n",
+ " 0.015948 | \n",
+ " 0.999873 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 44.0 | \n",
+ " 30.19 | \n",
+ " 3.0 | \n",
+ " 0.999229 | \n",
+ " 0.039260 | \n",
+ " 0.016664 | \n",
+ " 0.999861 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 48.0 | \n",
+ " 30.18 | \n",
+ " 12.0 | \n",
+ " 0.975342 | \n",
+ " -0.220697 | \n",
+ " 0.017381 | \n",
+ " 0.999849 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " temp pressure wspd day_sin day_cos year_sin year_cos\n",
+ "1 45.0 30.21 5.0 0.465615 0.884988 0.013798 0.999905\n",
+ "2 47.0 30.21 5.0 0.678801 0.734323 0.014514 0.999895\n",
+ "4 47.0 30.20 5.0 0.955020 0.296542 0.015948 0.999873\n",
+ "5 44.0 30.19 3.0 0.999229 0.039260 0.016664 0.999861\n",
+ "6 48.0 30.18 12.0 0.975342 -0.220697 0.017381 0.999849"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def prepare_dataframe(_df):\n",
+ "    \"\"\"Clean the observations and add periodic time-of-day / time-of-year features.\n",
+ "\n",
+ "    Args:\n",
+ "        _df: frame with temp, pressure, wspd and obs_timestamp columns.\n",
+ "\n",
+ "    Returns:\n",
+ "        A (features, date_time) tuple: the float-typed, NaN-free frame sorted by\n",
+ "        obs_timestamp with day_sin/day_cos/year_sin/year_cos added, and the\n",
+ "        obs_timestamp column that was removed from it.\n",
+ "    \"\"\"\n",
+ "    seconds_per_day = 24*60*60\n",
+ "    seconds_per_year = (365.2425)*seconds_per_day\n",
+ "\n",
+ "    numeric_columns = {'temp': 'float', 'pressure': 'float', 'wspd': 'float'}\n",
+ "    features = _df.astype(numeric_columns).dropna().sort_values(by=['obs_timestamp'])\n",
+ "\n",
+ "    # Take the timestamps out of the feature frame and express them as epoch seconds.\n",
+ "    date_time = features.pop('obs_timestamp')\n",
+ "    timestamp_s = date_time.map(pd.Timestamp.timestamp)\n",
+ "\n",
+ "    # One sin/cos pair per period turns the timestamp into a smooth cyclic signal.\n",
+ "    periodic_columns = (\n",
+ "        ('day_sin', 'day_cos', seconds_per_day),\n",
+ "        ('year_sin', 'year_cos', seconds_per_year),\n",
+ "    )\n",
+ "    for sin_name, cos_name, period in periodic_columns:\n",
+ "        features[sin_name] = np.sin(timestamp_s * (2 * np.pi / period))\n",
+ "        features[cos_name] = np.cos(timestamp_s * (2 * np.pi / period))\n",
+ "\n",
+ "    return features, date_time\n",
+ "\n",
+ "# Field descriptions here\n",
+ "# https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/511371/1/LapamonpinyoEtAl_engrXiv_2021.pdf \n",
+ "df, date_time = prepare_dataframe(df)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "049cf91d-30f9-4c42-b705-fdfb48f0d1e2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_64/4039997715.py:7: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.\n",
+ " plot_features.index = date_time[-480:]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " count | \n",
+ " mean | \n",
+ " std | \n",
+ " min | \n",
+ " 25% | \n",
+ " 50% | \n",
+ " 75% | \n",
+ " max | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " temp | \n",
+ " 179231.0 | \n",
+ " 59.138720 | \n",
+ " 17.768787 | \n",
+ " 6.000000 | \n",
+ " 45.000000 | \n",
+ " 60.000000 | \n",
+ " 74.000000 | \n",
+ " 118.000000 | \n",
+ "
\n",
+ " \n",
+ " pressure | \n",
+ " 179231.0 | \n",
+ " 30.034228 | \n",
+ " 0.216623 | \n",
+ " 28.610000 | \n",
+ " 29.900000 | \n",
+ " 30.030000 | \n",
+ " 30.170000 | \n",
+ " 30.860000 | \n",
+ "
\n",
+ " \n",
+ " wspd | \n",
+ " 179231.0 | \n",
+ " 8.984768 | \n",
+ " 4.745124 | \n",
+ " 0.000000 | \n",
+ " 6.000000 | \n",
+ " 8.000000 | \n",
+ " 12.000000 | \n",
+ " 45.000000 | \n",
+ "
\n",
+ " \n",
+ " day_sin | \n",
+ " 179231.0 | \n",
+ " -0.015076 | \n",
+ " 0.706559 | \n",
+ " -0.999391 | \n",
+ " -0.731354 | \n",
+ " -0.034899 | \n",
+ " 0.681998 | \n",
+ " 0.999391 | \n",
+ "
\n",
+ " \n",
+ " day_cos | \n",
+ " 179231.0 | \n",
+ " 0.007189 | \n",
+ " 0.707461 | \n",
+ " -0.999391 | \n",
+ " -0.681998 | \n",
+ " 0.034899 | \n",
+ " 0.731354 | \n",
+ " 0.999391 | \n",
+ "
\n",
+ " \n",
+ " year_sin | \n",
+ " 179231.0 | \n",
+ " 0.004124 | \n",
+ " 0.709370 | \n",
+ " -1.000000 | \n",
+ " -0.705328 | \n",
+ " 0.007787 | \n",
+ " 0.717489 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ " year_cos | \n",
+ " 179231.0 | \n",
+ " -0.003987 | \n",
+ " 0.704817 | \n",
+ " -1.000000 | \n",
+ " -0.708857 | \n",
+ " -0.004902 | \n",
+ " 0.696913 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " count mean std min 25% 50% \\\n",
+ "temp 179231.0 59.138720 17.768787 6.000000 45.000000 60.000000 \n",
+ "pressure 179231.0 30.034228 0.216623 28.610000 29.900000 30.030000 \n",
+ "wspd 179231.0 8.984768 4.745124 0.000000 6.000000 8.000000 \n",
+ "day_sin 179231.0 -0.015076 0.706559 -0.999391 -0.731354 -0.034899 \n",
+ "day_cos 179231.0 0.007189 0.707461 -0.999391 -0.681998 0.034899 \n",
+ "year_sin 179231.0 0.004124 0.709370 -1.000000 -0.705328 0.007787 \n",
+ "year_cos 179231.0 -0.003987 0.704817 -1.000000 -0.708857 -0.004902 \n",
+ "\n",
+ " 75% max \n",
+ "temp 74.000000 118.000000 \n",
+ "pressure 30.170000 30.860000 \n",
+ "wspd 12.000000 45.000000 \n",
+ "day_sin 0.681998 0.999391 \n",
+ "day_cos 0.731354 0.999391 \n",
+ "year_sin 0.717489 1.000000 \n",
+ "year_cos 0.696913 1.000000 "
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "