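"""Build the merged modeling dataset.

Loads the pickled generation, client, historical weather, electricity price and
gas price datasets, aligns them on their data-availability dates, merges them
into a single hourly DataFrame, and prepares it for modeling.
"""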
import os

import pandas as pd

def load_datasets():
    """Load all datasets and return them as dataframes."""
    # Base directory: the current working directory
    base_dir = os.getcwd()
    # Build the absolute path of each file
    train_path = os.path.join(base_dir, 'process_files', 'generation.pkl')
    client_path = os.path.join(base_dir, 'process_files', 'client.pkl')
    historical_weather_path = os.path.join(base_dir, 'process_files', 'historical_weather.pkl')
    electricity_prices_path = os.path.join(base_dir, 'process_files', 'electricity_prices.pkl')
    gas_prices_path = os.path.join(base_dir, 'process_files', 'gas_prices.pkl')
    # Check that every file exists before trying to load it
    for path in [train_path, client_path, historical_weather_path, electricity_prices_path, gas_prices_path]:
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")
    # Load the files
    train = pd.read_pickle(train_path)
    client = pd.read_pickle(client_path)
    historical_weather = pd.read_pickle(historical_weather_path)
    electricity_prices = pd.read_pickle(electricity_prices_path)
    gas_prices = pd.read_pickle(gas_prices_path)
    return train, client, historical_weather, electricity_prices, gas_prices

def add_time_series_col(client, historical_weather, electricity_prices, gas_prices):
    """Shift each dataset's timestamp to the date when its data becomes available."""
    client['datetime'] = pd.to_datetime(client['date']) + pd.Timedelta(days=3)
    historical_weather['datetime'] += pd.Timedelta(days=2)
    electricity_prices['datetime'] = pd.to_datetime(electricity_prices['forecast_date']) + pd.Timedelta(days=1)
    gas_prices['datetime'] = pd.to_datetime(gas_prices['forecast_date']) + pd.Timedelta(days=1)
    # Drop the original date columns after the availability adjustments
    client = client.drop(['date'], axis=1)
    electricity_prices = electricity_prices.drop(['forecast_date'], axis=1)
    gas_prices = gas_prices.drop(['forecast_date'], axis=1)
    return client, historical_weather, electricity_prices, gas_prices

def merge_datasets(train, client, historical_weather, electricity_prices, gas_prices):
    """Merge train, client, historical weather, gas prices and electricity prices on the datetime column."""
    merged = train.merge(historical_weather, on='datetime', how='left') \
                  .merge(electricity_prices, on='datetime', how='left')
    # Derive a calendar-day key so the daily datasets (client, gas prices) can be joined
    merged['date'] = merged['datetime'].dt.floor('D')
    client['date'] = client['datetime'].dt.floor('D')
    client = client.drop('datetime', axis=1)
    gas_prices['date'] = gas_prices['datetime'].dt.floor('D')
    gas_prices = gas_prices.drop('datetime', axis=1)
    merged = merged.merge(client, on='date', how='outer') \
                   .merge(gas_prices, on='date', how='outer')
    # Drop the helper column once the daily merges are done
    merged = merged.drop(['date'], axis=1)
    return merged

def reorder_columns(df, column_order=None):
    """Reorder columns of the DataFrame."""
    if column_order is None:
        column_order = [
            'datetime', 'target', 'temperature', 'dewpoint', 'rain', 'snowfall',
            'surface_pressure', 'cloudcover_total', 'cloudcover_low', 'cloudcover_mid',
            'cloudcover_high', 'windspeed_10m', 'winddirection_10m',
            'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
            'lowest_price_per_mwh', 'highest_price_per_mwh', 'euros_per_mwh',
            'eic_count', 'installed_capacity'
        ]
    return df[column_order]

def save_datasets_to_pickle(datasets, paths=None):
    """Save each dataset in `datasets` to the corresponding path in `paths` as a pickle file."""
    if paths is None:
        import root
        paths = [
            root.DIR_DATA_STAGE + 'merged_df.pkl',
        ]
    # Create the parent folders if they do not exist
    for path in paths:
        os.makedirs(os.path.dirname(path), exist_ok=True)
    # Save each dataset to its respective path
    for dataset, path in zip(datasets, paths):
        dataset.to_pickle(path)

def drop_first_3_days(df, column, threshold_column, threshold_nans=70):
    """Drop the first 3 days of the dataset if the NaN threshold is exceeded."""
    # Count null values in the threshold column
    nulos = df[threshold_column].isna().sum()
    # If the threshold is exceeded, drop the first 3 days
    if nulos > threshold_nans:
        # Initial date
        fecha_minima = df[column].min()
        # Cut-off date
        limite = fecha_minima + pd.Timedelta(days=3)
        # Keep only rows from the cut-off date onward
        df = df[df[column] >= limite]
    return df
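
# Note: the 3-day window mirrors the largest availability lag applied in
# add_time_series_col (client data is shifted by +3 days), which is what leaves
# NaNs at the start of the merged frame.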

def feature_selection(df):
    """Drop columns that are not used as model features."""
    cols_2_drop = ['dewpoint', 'cloudcover_low', 'cloudcover_mid',
                   'cloudcover_high', 'direct_solar_radiation',
                   'diffuse_radiation', 'lowest_price_per_mwh',
                   'highest_price_per_mwh', 'eic_count']
    return df.drop(columns=cols_2_drop)

def set_datetime_index(df):
    """Index the DataFrame by datetime with an explicit hourly frequency."""
    df = df.set_index('datetime')
    # asfreq('h') conforms the index to hourly steps, inserting NaN rows for any missing hours
    df = df.asfreq('h')
    return df

def merging_datasets():
    """Run the full load-merge-clean pipeline and return the merged DataFrame."""
    # Read datasets
    train, client, historical_weather, electricity_prices, gas_prices = load_datasets()
    # Prepare date columns for merging
    client, historical_weather, electricity_prices, gas_prices = add_time_series_col(
        client, historical_weather, electricity_prices, gas_prices)
    # Merge datasets
    merged = merge_datasets(train, client, historical_weather, electricity_prices, gas_prices)
    # Reorder dataset columns
    merged = reorder_columns(merged)
    # Feature selection
    merged = feature_selection(merged)
    # Set datetime index
    merged = set_datetime_index(merged)
    return merged
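
# Minimal usage sketch (an assumed entry point, not part of the original module):
# run the pipeline, trim the NaN warm-up period, and persist the result. Note
# that the default save path relies on the project-local `root` module.
if __name__ == '__main__':
    merged_df = merging_datasets()
    # drop_first_3_days expects the datetime as a column, so reset the index first
    merged_df = merged_df.reset_index()
    merged_df = drop_first_3_days(merged_df, column='datetime',
                                  threshold_column='installed_capacity')
    # Restore the hourly datetime index before saving
    merged_df = set_datetime_index(merged_df)
    save_datasets_to_pickle([merged_df])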