Multivariate long time-series forecasting with XGBoost for CityLearn2023
# import libraries
"""
Author: Chia E. Tungom
Date: 2023-10-10 10:31
email: chemago99@yahoo.com
"""
import numpy as np
import time
import os
from tqdm.auto import tqdm
import json
import pandas as pd
from citylearn.citylearn import CityLearnEnv
"""
This is only a reference script provided to allow you
to do local evaluation. The evaluator **DOES NOT**
use this script for orchestrating the evaluations.
"""
Configure CityLearn Environment
This configuration gives us the data and environment variables that make up the simulation.
The configuration uses a JSON
configuration file that can be modified as needed.
# Directory that holds the CityLearn data files and schemas.
data_dir = './data/'
# Path of the JSON schema used to configure the simulation.
SCHEMA = os.path.join(data_dir, 'schemas/warm_up/schema.json')
# Build the simulation environment from the schema.
env = CityLearnEnv(SCHEMA)
# Notebook-style inspection of the first list of observation names
# (the value is discarded when this runs as a plain script).
env.observation_names[0][:]
Getting Prediction Variables from the environment
The environment is composed of several buildings and each building has its own load, DHW, and cooling demand.
To get the variables from the environment, we need to specify the building and the variable as follows:
env.buildings[building_index].energy_simulation.variable_name
- building_index and variable_name are the index of the building and the name of the variable to predict, respectively
The building variables include
- non_shiftable_load, dhw_demand, cooling_demand
# Reset the environment before reading from it.
env.reset()
# Notebook-style peek at the first four non-shiftable load values of building 0
# (the value is discarded when this runs as a plain script).
env.buildings[0].energy_simulation.non_shiftable_load[:4]
# Solar generation 10 steps ahead of the current time step, for buildings 0 and 1.
print(env.buildings[0].energy_simulation.solar_generation[env.time_step+10])
print(env.buildings[1].energy_simulation.solar_generation[env.time_step+10])
# print(env.buildings[2].energy_simulation.carbon_intnsity.carbon[env.time_step+10])
# Notebook-style inspection of the building-level solar_generation attribute.
env.buildings[0].solar_generation # solar_generation#[env.time_step+10]
Getting the Environment Data
To get the observations of a given building at a given step, use
- `env.buildings[building_index].observations()`
To get observations for the entire environment, use
- `env.step(action)`
Note that in the environment-wide observations, the observations of individual buildings are not explicitly separated, but we can see that the observations for
'indoor_dry_bulb_temperature',
'non_shiftable_load',
'solar_generation',
'dhw_storage_soc',
'electrical_storage_soc',
'net_electricity_consumption',
'cooling_demand',
'dhw_demand',
'occupant_count',
'indoor_dry_bulb_temperature_set_point',
'power_outage'
are unique to each building, while the others are shared. These are well aligned from the second building onward, being the last 11*(number of buildings - 1) observations in the array. For the first building, there is a mix-up with the electricity price.
# Reset the simulation and keep the initial environment-wide observations.
observations = env.reset()
for i in range(1):
    # All-zero action vector with three entries per building.
    actions = np.zeros((1, len(env.buildings) * 3))
    # Compare the per-building view with the environment-wide view.
    print("\n ====== BUILDING OBSERVATION ========= \n ", env.buildings[0].observations())
    print("\n ========= ENVIRONMENT OBSERVATION ========= \n ", observations)
    # Advance the simulation by one step and fetch the next observations.
    observations, _, done, _ = env.step(actions)
Convert individual building data to a dataframe
We use pandas to convert the data into a dataframe. We will add extra features so that we can predict future time steps.
# Reset the environment and capture the observations it returns.
observations = env.reset()
# create an empty dataframe with column names
print(env.time_step)
# check the decision variable by indexing the building variable at the time step
print(env.buildings[0].energy_simulation.dhw_demand[env.time_step])
# NOTE(review): identical to the previous line — possibly meant to print a different variable.
print(env.buildings[0].energy_simulation.dhw_demand[env.time_step])
print(env.buildings[0].heating_device)
# Names of the observations exposed by building 0.
print(env.buildings[0].observations().keys())
# Hand-written list of observation column names.
# NOTE(review): `cols` is not referenced by any other visible code.
cols = ['day_type', 'hour', 'outdoor_dry_bulb_temperature', 'outdoor_dry_bulb_temperature_predicted_6h',
'outdoor_dry_bulb_temperature_predicted_12h', 'outdoor_dry_bulb_temperature_predicted_24h',
'diffuse_solar_irradiance', 'diffuse_solar_irradiance_predicted_6h', 'diffuse_solar_irradiance_predicted_12h',
'diffuse_solar_irradiance_predicted_24h', 'direct_solar_irradiance', 'direct_solar_irradiance_predicted_6h',
'direct_solar_irradiance_predicted_12h', 'direct_solar_irradiance_predicted_24h', 'carbon_intensity',
'indoor_dry_bulb_temperature', 'non_shiftable_load', 'solar_generation', 'dhw_storage_soc',
'electrical_storage_soc', 'net_electricity_consumption', 'electricity_pricing', 'electricity_pricing_predicted_6h',
'electricity_pricing_predicted_12h', 'electricity_pricing_predicted_24h', 'cooling_demand', 'dhw_demand',
'occupant_count', 'indoor_dry_bulb_temperature_set_point', 'power_outage']
# Subset of column names, presumably candidates for removal.
# NOTE(review): `drop` is not referenced by any other visible code.
drop = ['day_type', 'hour', 'outdoor_dry_bulb_temperature',
'diffuse_solar_irradiance', 'direct_solar_irradiance', 'non_shiftable_load', 'solar_generation', 'dhw_storage_soc',
'electrical_storage_soc', 'net_electricity_consumption', 'electricity_pricing', 'cooling_demand', 'dhw_demand',
'occupant_count', 'indoor_dry_bulb_temperature_set_point']
def building_observation_dataframe(environment, building: int, forward_steps: int = 48, drop_cols: list = None, keep_cols=None):
    """Build a look-ahead dataframe for one building at the current time step.

    The building's current observations are repeated ``forward_steps`` times
    (one row per look-ahead step). Each row then gets the calendar features
    of its own step and the target values one step further ahead.

    Parameters
    ----------
    environment : CityLearn-like environment exposing ``buildings`` and
        ``time_step``.
    building : index of the building to build the dataframe for.
    forward_steps : number of look-ahead rows to generate.
    drop_cols : optional column names to remove from the result; names that
        are not present are ignored.
    keep_cols : optional column names to restrict the result to (applied
        after ``drop_cols``); names that are not present are ignored.

    Returns
    -------
    pandas.DataFrame with ``forward_steps`` rows.

    Raises
    ------
    IndexError if the look-ahead window runs past the end of the simulation
    series.
    """
    # Always read from the environment that was passed in, never from a
    # module-level global.
    target = environment.buildings[building]
    simulation = target.energy_simulation
    now = environment.time_step

    def _window(series, first_offset):
        # Values of `series` for `forward_steps` consecutive steps starting
        # at `now + first_offset`.
        return [series[now + first_offset + k] for k in range(forward_steps)]

    # One identical row per look-ahead step (DataFrame.append no longer
    # exists in pandas >= 2.0, so the frame is built directly).
    observation = target.observations()
    generated_df = pd.DataFrame(
        [list(observation.values())] * forward_steps,
        columns=list(observation.keys()),
    )

    # === add deterministic future features ======
    generated_df["day_type"] = [int(v) for v in _window(simulation.day_type, 0)]
    generated_df["hour"] = [int(v) for v in _window(simulation.hour, 0)]
    generated_df["step_count"] = list(range(1, forward_steps + 1))

    # === add target future features one step ahead =============
    generated_df["future_electric_load"] = _window(simulation.non_shiftable_load, 1)
    generated_df["future_cooling_demand"] = _window(simulation.cooling_demand, 1)
    generated_df["future_dhw_demand"] = _window(simulation.dhw_demand, 1)

    if drop_cols:
        generated_df = generated_df.drop(columns=list(drop_cols), errors="ignore")
    if keep_cols:
        generated_df = generated_df[[c for c in keep_cols if c in generated_df.columns]]
    return generated_df
# Notebook-style preview of the look-ahead dataframe for building 0 with the
# default forward_steps (the result is discarded when run as a plain script).
building_observation_dataframe(env, 0)
Build a Training and Testing Dataset
For every building and every time step, we build the look-ahead dataframe and combine all of them into one dataset used for training.
# Notebook-style inspection of the building list
# (the value is discarded when this runs as a plain script).
env.buildings
# Reset the environment and capture the observations it returns.
observations = env.reset()
# Number of look-ahead steps passed to compose_dataset below.
forward_time = 48
# Compose Dataset
def compose_dataset(env, forward_time):
    """Build one dataset covering every building over the simulation period.

    The environment is reset, then stepped with all-zero actions. At each
    step the look-ahead dataframe of every building is collected. The last
    ``forward_time`` steps are skipped because their look-ahead window would
    run past the end of the simulation.

    Parameters
    ----------
    env : CityLearn environment to sample from (it is reset and stepped).
    forward_time : number of look-ahead steps per sample.

    Returns
    -------
    pandas.DataFrame with all per-step, per-building frames stacked. It is
    empty (observation columns only) when no step can be sampled.
    """
    env.reset()
    total_time_steps = env.time_steps
    columns = list(env.buildings[0].observations().keys())
    # The idle action never changes, so build it once.
    idle_actions = np.zeros((1, len(env.buildings) * 3))

    # Collect the frames and concatenate once; concatenating inside the loop
    # copies the whole dataset on every iteration.
    frames = []
    for _ in range(total_time_steps - forward_time):
        for building in range(len(env.buildings)):
            frames.append(building_observation_dataframe(env, building, forward_steps=forward_time))
        env.step(idle_actions)

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, axis=0, ignore_index=True)
# Notebook-style run of the dataset builder
# (the returned dataframe is discarded when run as a plain script).
compose_dataset(env, forward_time)
class generate_building_data:
    """Build forecasting datasets from a CityLearn environment.

    Every sample repeats the observations of one time step ``forward_steps``
    times and attaches the calendar features and target values of the
    following steps. All reads go through the environment handed to the
    constructor, never through a module-level global.
    """

    def __init__(self, environment, forward_steps):
        """Store the environment and the look-ahead horizon.

        environment: CityLearn-like environment (``buildings``,
            ``time_steps``, ``time_step``, ``reset()``, ``step()``).
        forward_steps: number of look-ahead rows generated per sample.
        """
        self.environment = environment
        self.forward_steps = forward_steps
        self.total_time_steps = environment.time_steps
        # All-zero action vector with three entries per building.
        self.actions = np.zeros((1, len(environment.buildings) * 3))

    def building_observation_dataframe(self, building):
        """Build the look-ahead dataframe of one building at the current step.

        building: index of the building to build the dataframe for.

        Returns a dataframe with ``forward_steps`` rows. Raises IndexError if
        the look-ahead window runs past the end of the simulation series.
        """
        site = self.environment.buildings[building]
        series = site.energy_simulation
        now = self.environment.time_step
        horizon = self.forward_steps

        # === generate repeated rows =======================
        # Built directly; DataFrame.append no longer exists in pandas >= 2.0.
        snapshot = site.observations()
        building_df = pd.DataFrame(
            [list(snapshot.values())] * horizon,
            columns=list(snapshot.keys()),
        )

        # === add deterministic future features ============
        building_df["day_type"] = [int(series.day_type[now + k]) for k in range(horizon)]
        building_df["hour"] = [int(series.hour[now + k]) for k in range(horizon)]
        building_df["step_count"] = list(range(1, horizon + 1))

        # === add target future features one step ahead ======
        building_df["future_electric_load"] = [series.non_shiftable_load[now + k] for k in range(1, horizon + 1)]
        building_df["future_cooling_demand"] = [series.cooling_demand[now + k] for k in range(1, horizon + 1)]
        building_df["future_dhw_demand"] = [series.dhw_demand[now + k] for k in range(1, horizon + 1)]
        # NOTE(review): unlike the other future_* columns this one always
        # reads building 0, starts at offset 0 and truncates to int. Kept
        # as-is; confirm whether that is intended.
        solar = self.environment.buildings[0].energy_simulation.solar_generation
        building_df["future_solar_generation"] = [int(solar[now + k]) for k in range(horizon)]
        building_df["building"] = building  # can be used to get solar generation data
        return building_df

    def building_data(self, buildings: list, shuffle=True):
        """Build a dataset for the given buildings over the simulation period.

        buildings: indices of the buildings to include.
        shuffle: shuffle the rows with a fixed seed (42) when True.

        The environment is reset and then advanced one step per sample with
        all-zero actions, so each sample reflects a different time step.

        Returns a dataframe; it is empty (observation columns only) when no
        step can be sampled.
        """
        self.environment.reset()
        columns = list(self.environment.buildings[buildings[0]].observations().keys())

        frames = []
        # The last `forward_steps` steps cannot be sampled: their look-ahead
        # window would run past the end of the simulation.
        for _ in range(self.total_time_steps - self.forward_steps):
            for building in buildings:
                frames.append(self.building_observation_dataframe(building))
            # Advance the simulation; without this every iteration would
            # sample the same time step.
            self.environment.step(self.actions)

        if not frames:
            return pd.DataFrame(columns=columns)
        buildings_dataset = pd.concat(frames, axis=0, ignore_index=True)
        if shuffle:
            buildings_dataset = buildings_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
        return buildings_dataset

    def environment_data(self, shuffle=True):
        """Build an environment-level dataset over the simulation period.

        shuffle: shuffle the rows with a fixed seed (42) when True.

        Environment-wide observations are recorded while stepping with
        all-zero actions. Each recorded step is then expanded into
        ``forward_steps`` rows carrying the following steps' day type, hour,
        solar generation and carbon intensity.

        Returns a dataframe; it is empty when no look-ahead window fits.
        """
        horizon = self.forward_steps
        observations = self.environment.reset()
        cols = list(self.environment.observation_names[0])

        # Record the observations of every step that can be sampled.
        rows = []
        for _ in range(self.total_time_steps - horizon):
            rows.extend(observations)
            observations, _, done, _ = self.environment.step(self.actions)
        initial_data = pd.DataFrame(rows, columns=cols)

        # === get entire column data =======================
        solars = list(initial_data.solar_generation.values)
        carbons = list(initial_data.carbon_intensity.values)
        days = list(initial_data.day_type.values)
        hours = list(initial_data.hour.values)

        frames = []
        for start in range(len(initial_data) - horizon):
            ahead = slice(start + 1, start + horizon + 1)
            # Repeat the current observation once per look-ahead step.
            step_df = initial_data.iloc[[start] * horizon].reset_index(drop=True)
            # === add deterministic future features ============
            step_df["day_type"] = days[ahead]
            step_df["hour"] = hours[ahead]
            step_df["step_count"] = list(range(1, horizon + 1))
            # === add target future features one step ahead ======
            step_df["future_solar_generation"] = solars[ahead]
            step_df["future_carbon_intensity"] = carbons[ahead]
            frames.append(step_df)

        if not frames:
            # No complete window: return an empty frame with the expected
            # columns instead of failing on an unbound variable.
            extra = ["step_count", "future_solar_generation", "future_carbon_intensity"]
            return pd.DataFrame(columns=cols + extra)

        env_data = pd.concat(frames, axis=0, ignore_index=True)
        if shuffle:
            env_data = env_data.sample(frac=1, random_state=42).reset_index(drop=True)
        return env_data
env.reset()
# Dataset generator with forward_steps=1 (one look-ahead row per sample).
data = generate_building_data(env, 1)
# Environment-level dataset, kept in chronological order.
neighbourhood_data = data.environment_data( shuffle = False)
# Building-level dataset for buildings 0, 1 and 2, kept in generation order.
building_data = data.building_data(buildings = [0,1,2], shuffle = False)
# Cast the calendar/step features to integers.
building_data["day_type"] = building_data["day_type"].astype(int)
building_data["hour"] = building_data["hour"].astype(int)
building_data["step_count"] = building_data["step_count"].astype(int)
# Notebook-style inspection of current vs. future solar generation
# (the value is discarded when this runs as a plain script).
neighbourhood_data[['solar_generation', "future_solar_generation"]]
# Candidate feature and target column names.
# NOTE(review): `keep` is not referenced by any other visible code.
keep = ['day_type', 'hour', 'step_count',
'outdoor_dry_bulb_temperature', 'solar_generation', 'dhw_storage_soc',
'cooling_demand', 'dhw_demand', 'occupant_count',
'electrical_storage_soc', 'net_electricity_consumption',
'non_shiftable_load', 'future_electric_load', "future_dhw_demand", "future_cooling_demand"]
# keep = ['day_type', 'hour', 'step_count', 'non_shiftable_load', 'future_electric_load']
# df['day_type'] = df['day_type'].astype(int)
def getTrainSplitData(df, train_size = 0.7):
    """Split a dataframe into leading train rows and trailing test rows.

    The split is positional and keeps row order: the first
    ``int(len(df) * train_size)`` rows form the train set and the rest form
    the test set. Index labels are ignored, so dataframes without a default
    0..n-1 index are split correctly as well.

    Parameters
    ----------
    df : pandas.DataFrame to split.
    train_size : fraction of rows (0..1) assigned to the train set.

    Returns
    -------
    (train, test) tuple of dataframes.
    """
    split_at = int(len(df) * train_size)  # position of the first test row
    train = df.iloc[:split_at]
    test = df.iloc[split_at:]
    return train, test
# generated_df['future_electric_load'] = generated_df['future_electric_load'].astype(float).round(5)
# Split the building dataset with train_size=0.6; rows are not shuffled here.
train, test = getTrainSplitData(building_data, train_size = 0.6)
# Notebook-style inspection of the column dtypes
# (the value is discarded when this runs as a plain script).
train.dtypes
class generate_model:
    """Placeholder model wrapper that only stores the dataset it is given."""

    def __init__(self, dataset):
        # Keep a reference to the dataset for later use.
        self.dataset = dataset

    def train_model(self):
        """Training is not implemented yet; does nothing and returns None."""
        return None
Predict Electric Load (note: the code below currently uses `future_cooling_demand` as the target)
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
# Column the regressor is trained to predict.
target_col = "future_cooling_demand"
# Every future_* column holds values from later time steps. None of them is
# known at prediction time, so all are removed from the features (not only
# the target) to avoid leaking the future into the model.
future_cols = [col for col in train.columns if str(col).startswith("future_")]
X_train, y_train = train.drop(columns=future_cols), train[target_col]
X_test, y_test = test.drop(columns=future_cols), test[target_col]

# eval_metric is configured on the estimator next to early_stopping_rounds;
# recent XGBoost releases no longer accept it as a fit() argument.
reg = XGBRegressor(base_score=0.5, booster='dart',
                   n_estimators=100,
                   early_stopping_rounds=50,
                   eval_metric='mae',
                   enable_categorical=True,
                   objective='reg:squarederror',
                   max_depth=100,
                   learning_rate=0.1)
# The last eval_set entry (the test split) drives early stopping.
reg.fit(X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=20)

# Split-count ("weight") importance per feature, highest first.
feature_important = reg.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)
data#.nlargest(40, columns="score").plot(kind='barh', figsize = (20,10)) ## plot top 40 features

# feature_importances_ is ordered like the columns the model was fitted on,
# so names must come from X_train (train.columns still contains the targets).
sorted_idx = np.argsort(reg.feature_importances_)[::-1]
for index in sorted_idx:
    print([X_train.columns[index], reg.feature_importances_[index]])
Content
Comments
You must login before you can post a comment.