import os

import pandas as pd

def load_datasets():
    """Load all pickled datasets and return them as DataFrames."""
    base_dir = os.getcwd()

    train_path = os.path.join(base_dir, 'process_files', 'generation.pkl')
    client_path = os.path.join(base_dir, 'process_files', 'client.pkl')
    historical_weather_path = os.path.join(base_dir, 'process_files', 'historical_weather.pkl')
    electricity_prices_path = os.path.join(base_dir, 'process_files', 'electricity_prices.pkl')
    gas_prices_path = os.path.join(base_dir, 'process_files', 'gas_prices.pkl')

    # Fail fast with a clear message if any input file is missing.
    for path in [train_path, client_path, historical_weather_path, electricity_prices_path, gas_prices_path]:
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

    train = pd.read_pickle(train_path)
    client = pd.read_pickle(client_path)
    historical_weather = pd.read_pickle(historical_weather_path)
    electricity_prices = pd.read_pickle(electricity_prices_path)
    gas_prices = pd.read_pickle(gas_prices_path)

    return train, client, historical_weather, electricity_prices, gas_prices

def add_time_series_col(client, historical_weather, electricity_prices, gas_prices):
    """Shift each dataset's timestamps to the date on which the data becomes available."""
    # Client data arrives with a 3-day lag, historical weather with a 2-day lag,
    # and both price feeds with a 1-day lag.
    client['datetime'] = pd.to_datetime(client['date']) + pd.Timedelta(days=3)
    historical_weather['datetime'] += pd.Timedelta(days=2)
    electricity_prices['datetime'] = pd.to_datetime(electricity_prices['forecast_date']) + pd.Timedelta(days=1)
    gas_prices['datetime'] = pd.to_datetime(gas_prices['forecast_date']) + pd.Timedelta(days=1)

    # Drop the original date columns now that 'datetime' carries the availability date.
    client = client.drop(['date'], axis=1)
    electricity_prices = electricity_prices.drop(['forecast_date'], axis=1)
    gas_prices = gas_prices.drop(['forecast_date'], axis=1)

    return client, historical_weather, electricity_prices, gas_prices

def merge_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Merge the train, historical weather, electricity price, client, and gas price DataFrames on their datetime/date columns."""
    # Hourly datasets join on the full timestamp.
    merged = train.merge(historical_weather, on='datetime', how='left') \
                  .merge(electricity_prices, on='datetime', how='left')

    # Client and gas price data are daily, so join them on the calendar date instead.
    merged['date'] = merged['datetime'].dt.floor('D')
    client['date'] = client['datetime'].dt.floor('D')
    client = client.drop('datetime', axis=1)
    gas_prices['date'] = gas_prices['datetime'].dt.floor('D')
    gas_prices = gas_prices.drop('datetime', axis=1)

    merged = merged.merge(client, on='date', how='outer') \
                   .merge(gas_prices, on='date', how='outer')

    merged = merged.drop(['date'], axis=1)

    return merged

def reorder_columns(df, column_order=None):
    """Reorder the columns of the DataFrame."""
    if column_order is None:
        column_order = [
            'datetime', 'target', 'temperature', 'dewpoint', 'rain', 'snowfall',
            'surface_pressure', 'cloudcover_total', 'cloudcover_low', 'cloudcover_mid',
            'cloudcover_high', 'windspeed_10m', 'winddirection_10m',
            'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
            'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh',
            'eic_count', 'installed_capacity'
        ]
    return df[column_order]

def save_datasets_to_pickle(datasets, paths=None):
    """Save each dataset in the datasets list to the corresponding path in the paths list as a pickle file."""
    if paths is None:
        import root
        # os.path.join avoids a missing separator if DIR_DATA_STAGE lacks a trailing slash.
        paths = [
            os.path.join(root.DIR_DATA_STAGE, 'merged_df.pkl'),
        ]

    # Make sure every target directory exists before writing.
    for path in paths:
        os.makedirs(os.path.dirname(path), exist_ok=True)

    for dataset, path in zip(datasets, paths):
        dataset.to_pickle(path)

def drop_first_3_days(df, column, threshold_column, threshold_nans=70):
    """Drop the first 3 days of data if threshold_column holds more than threshold_nans missing values."""
    nulos = df[threshold_column].isna().sum()  # number of NaNs in the threshold column

    if nulos > threshold_nans:
        fecha_minima = df[column].min()               # earliest timestamp in the data
        limite = fecha_minima + pd.Timedelta(days=3)  # cutoff: 3 days after the start
        df = df[df[column] >= limite]

    return df
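
# Hypothetical usage sketch: drop_first_3_days is not wired into merging_datasets
# below, but it could be applied to the merged frame, e.g.
#   merged = drop_first_3_days(merged, column='datetime', threshold_column='target')
# where 'datetime' and 'target' are the columns produced by merge_datasets above.
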
def feature_selection(df):
    """Drop the weather and price columns that are not kept as model features."""
    cols_2_drop = ['dewpoint', 'cloudcover_low', 'cloudcover_mid',
                   'cloudcover_high', 'direct_solar_radiation',
                   'diffuse_radiation', 'lowest_price_per_mwh',
                   'highest_price_per_mwh', 'eic_count']
    # columns= already implies axis=1; avoid mixing inplace mutation with a return value.
    df = df.drop(columns=cols_2_drop)
    return df

def set_datetime_index(df):
    """Set 'datetime' as the index and enforce a regular hourly frequency."""
    df = df.set_index('datetime')
    # asfreq('h') inserts NaN rows for any hours missing from the index.
    df = df.asfreq('h')
    return df

def merging_datasets():
    """Run the full pipeline: load, align availability dates, merge, reorder, select features, and index by datetime."""
    train, client, historical_weather, electricity_prices, gas_prices = load_datasets()

    client, historical_weather, electricity_prices, gas_prices = add_time_series_col(client, historical_weather, electricity_prices, gas_prices)

    merged = merge_datasets(train, client, historical_weather, electricity_prices, gas_prices)

    merged = reorder_columns(merged)

    merged = feature_selection(merged)

    merged = set_datetime_index(merged)

    return merged
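

# Minimal usage sketch. Assumes the script is run from the project root so that
# load_datasets() can find process_files/, and that a root module defining
# DIR_DATA_STAGE exists for save_datasets_to_pickle's default output path.
if __name__ == '__main__':
    merged = merging_datasets()
    save_datasets_to_pickle([merged])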