Integrate multivariate time series forecasting (#254)

* Integrate multivariate time series forecasting, now supports
continuous and categorical variables

- update data.py to transform time series data
- update search space
- update documentation to reflect changes
- update test_forecast.py
- rename 'forecast' task to 'ts_forecast' task

* update automl.py and test_forecast.py

* update forecast notebook

* update README.md and setup.py

* update ml.py and test_forecast.py

- make "ds" and "y" constant variables

* replace constants with constant variables

* bump version to 0.7.0

* update setup.py
- support 'forecast' and 'ts_forecast'

* update automl.py and data.py
- support 'forecast' and 'ts_forecast' tasks
This commit is contained in:
Kevin Chen 2021-10-30 12:48:57 -04:00 committed by GitHub
parent e0155c2339
commit 519bfc2a18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 482 additions and 201 deletions

View File

@ -127,7 +127,7 @@ print(automl.model)
* Time series forecasting.
```python
# pip install flaml[forecast]
# pip install flaml[ts_forecast]
import numpy as np
from flaml import AutoML
X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
@ -136,8 +136,8 @@ automl = AutoML()
automl.fit(X_train=X_train[:72], # a single column of timestamp
y_train=y_train, # value for each timestamp
period=12, # time horizon to forecast, e.g., 12 months
task='forecast', time_budget=15, # time budget in seconds
log_file_name="test/forecast.log",
task='ts_forecast', time_budget=15, # time budget in seconds
log_file_name="test/ts_forecast.log",
)
print(automl.predict(X_train[72:]))
```

View File

@ -36,7 +36,7 @@ from .config import (
N_SPLITS,
SAMPLE_MULTIPLY_FACTOR,
)
from .data import concat, CLASSIFICATION
from .data import concat, CLASSIFICATION, TS_FORECAST, FORECAST
from . import tune
from .training_log import training_log_reader, training_log_writer
@ -428,10 +428,22 @@ class AutoML:
Args:
X_test: A numpy array of featurized instances, shape n * m,
or for 'forecasting' task:
a pandas dataframe with one column of timestamp values
or an integer n for the predict steps (only valid when
the estimator is arima or sarimax).
or for 'ts_forecast' task:
a pandas dataframe with the first column containing
timestamp values (datetime type) or an integer n for
the predict steps (only valid when the estimator is
arima or sarimax). Other columns in the dataframe
are assumed to be exogenous variables (categorical
or numeric).
.. code-block:: python
multivariate_X_test = pd.DataFrame({
'timeStamp': pd.date_range(start='1/1/2022', end='1/07/2022'),
'categorical_col': ['yes', 'yes', 'no', 'no', 'yes', 'no', 'yes'],
'continuous_col': [105, 107, 120, 118, 110, 112, 115]
})
model.predict(multivariate_X_test)
Returns:
An array-like of shape n * 1 -- each element is a predicted
@ -472,14 +484,12 @@ class AutoML:
def _preprocess(self, X):
if isinstance(X, int):
return X
if self._state.task == "forecast":
if self._state.task == TS_FORECAST:
X = pd.DataFrame(X)
X = X.rename(columns={X.columns[0]: "ds"})
else:
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X)
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X, self._state.task)
return X
def _validate_data(
@ -493,23 +503,6 @@ class AutoML:
groups_val=None,
groups=None,
):
if self._state.task == "forecast":
if dataframe is not None and label is not None:
dataframe = dataframe.copy()
dataframe = dataframe.rename(columns={label[0]: "ds", label[1]: "y"})
elif dataframe is not None:
assert "ds" in dataframe and "y" in dataframe, (
"For forecasting task, dataframe must have columns "
'"ds" and "y" with the dates and values respectively.'
)
elif (X_train_all is not None) and (y_train_all is not None):
dataframe = pd.DataFrame(X_train_all)
dataframe = dataframe.rename(columns={dataframe.columns[0]: "ds"})
dataframe["y"] = pd.Series(y_train_all)
X_train_all = None
y_train_all = None
label = "y"
if X_train_all is not None and y_train_all is not None:
assert (
isinstance(X_train_all, np.ndarray)
@ -525,6 +518,8 @@ class AutoML:
assert (
X_train_all.size != 0 and y_train_all.size != 0
), "Input data must not be empty."
if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
if isinstance(y_train_all, np.ndarray):
y_train_all = y_train_all.flatten()
assert (
@ -532,6 +527,10 @@ class AutoML:
), "# rows in X_train must match length of y_train."
self._df = isinstance(X_train_all, pd.DataFrame)
self._nrow, self._ndim = X_train_all.shape
if self._state.task == TS_FORECAST:
X_train_all = pd.DataFrame(X_train_all)
assert X_train_all[X_train_all.columns[0]].dtype.name == 'datetime64[ns]', (
f"For '{TS_FORECAST}' task, the first column must contain timestamp values.")
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
assert isinstance(
@ -539,12 +538,15 @@ class AutoML:
), "dataframe must be a pandas DataFrame"
assert label in dataframe.columns, "label must a column name in dataframe"
self._df = True
if self._state.task == TS_FORECAST:
assert dataframe[dataframe.columns[0]].dtype.name == 'datetime64[ns]', (
f"For '{TS_FORECAST}' task, the first column must contain timestamp values.")
X = dataframe.drop(columns=label)
self._nrow, self._ndim = X.shape
y = dataframe[label]
else:
raise ValueError("either X_train+y_train or dataframe+label are required")
if issparse(X_train_all) or self._state.task == "forecast":
if issparse(X_train_all):
self._transformer = self._label_transformer = False
self._X_train_all, self._y_train_all = X, y
else:
@ -578,11 +580,11 @@ class AutoML:
X_val.shape[0] == y_val.shape[0]
), "# rows in X_val must match length of y_val."
if self._transformer:
self._state.X_val = self._transformer.transform(X_val)
self._state.X_val = self._transformer.transform(X_val, self._state.task)
else:
self._state.X_val = X_val
if self._label_transformer:
self._state.y_val = self._label_transformer.transform(y_val)
self._state.y_val = self._label_transformer.transform(y_val, self._state.task)
else:
self._state.y_val = y_val
else:
@ -668,7 +670,7 @@ class AutoML:
if X_val is None and eval_method == "holdout":
# if eval_method = holdout, make holdout data
if self._split_type == "time":
if self._state.task == "forecast":
if self._state.task == TS_FORECAST:
num_samples = X_train_all.shape[0]
period = self._state.fit_kwargs["period"]
assert (
@ -826,7 +828,7 @@ class AutoML:
)
elif self._split_type == "time":
# logger.info("Using TimeSeriesSplit")
if self._state.task == "forecast":
if self._state.task == TS_FORECAST:
period = self._state.fit_kwargs["period"]
if period * (n_splits + 1) > y_train_all.size:
n_splits = int(y_train_all.size / period - 1)
@ -861,7 +863,7 @@ class AutoML:
record_id: An integer of the record ID in the file,
0 corresponds to the first trial
task: A string of the task type,
'binary', 'multi', 'regression', 'forecast', 'rank'
'binary', 'multi', 'regression', 'ts_forecast', 'rank'
Returns:
An estimator object for the given configuration
@ -908,20 +910,24 @@ class AutoML:
Args:
log_file_name: A string of the log file name
X_train: A numpy array of training data in shape n*m
For 'ts_forecast' task, the first column of X_train
must be the timestamp column (datetime type). Other
columns in the dataframe are assumed to be exogenous
variables (categorical or numeric).
y_train: A numpy array of labels in shape n*1
dataframe: A dataframe of training data including label column.
For 'forecast' task, dataframe must be specified and should
have two columns: timestamp and value.
label: A str of the label column name for 'classification' or
'regression' task, e.g., 'label';
or a tuple of strings for timestamp and value columns for
'forecasting' task, e.g., ('timestamp', 'value').
For 'ts_forecast' task, dataframe must be specified and should
have at least two columns: timestamp and label, where the first
column is the timestamp column (datetime type). Other columns
in the dataframe are assumed to be exogenous variables
(categorical or numeric).
label: A str of the label column name, e.g., 'label';
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
time_budget: A float number of the time budget in seconds.
task: A string of the task type, e.g.,
'classification', 'regression', 'forecast', 'rank'.
'classification', 'regression', 'ts_forecast', 'rank'.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
@ -931,7 +937,7 @@ class AutoML:
None, 'stratified', 'uniform', 'time', 'group']. None -> stratified.
For regression tasks, valid choices are [None, 'uniform', 'time'].
None -> uniform.
For time series forecasting, must be None or 'time'.
For ts_forecast tasks, must be None or 'time'.
For ranking task, must be None or 'group'.
groups: None or array-like | Group labels (with matching length to
y_train) or groups counts (with sum equal to length of y_train)
@ -951,7 +957,10 @@ class AutoML:
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight.
"""
self._state.task = task
if task == FORECAST:
self._state.task = TS_FORECAST
else:
self._state.task = task
self._state.fit_kwargs = fit_kwargs
self._validate_data(X_train, y_train, dataframe, label, groups=groups)
@ -1037,12 +1046,12 @@ class AutoML:
elif self._state.task == "regression":
assert split_type in [None, "uniform", "time", "group"]
self._split_type = split_type or "uniform"
elif self._state.task == "forecast":
elif self._state.task == TS_FORECAST:
assert split_type in [None, "time"]
self._split_type = "time"
assert isinstance(
self._state.fit_kwargs.get("period"), int
), "missing a required integer 'period' for forecast."
), f"missing a required integer 'period' for '{TS_FORECAST}' task."
elif self._state.task == "rank":
assert (
self._state.groups is not None
@ -1298,16 +1307,16 @@ class AutoML:
Args:
X_train: A numpy array or a pandas dataframe of training data in
shape (n, m). For 'forecast' task, X_train should contain a
single column of timestamps.
shape (n, m). For 'ts_forecast' task, the first column of X_train
must be the timestamp column (datetime type). Other columns in
the dataframe are assumed to be exogenous variables (categorical or numeric).
y_train: A numpy array or a pandas series of labels in shape (n, ).
dataframe: A dataframe of training data including label column.
For 'forecast' task, dataframe must be specified and should
have two columns: timestamp and value.
label: A str of the label column name for 'classification' or
'regression' task, e.g., 'label';
or a tuple of strings for timestamp and value columns for
'forecasting' task, e.g., ('timestamp', 'value').
For 'ts_forecast' task, dataframe must be specified and must have
at least two columns, timestamp and label, where the first
column is the timestamp column (datetime type). Other columns in
the dataframe are assumed to be exogenous variables (categorical or numeric).
label: A str of the label column name, e.g., 'label';
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
@ -1330,7 +1339,7 @@ class AutoML:
which returns a float number as the minimization objective,
and a dictionary as the metrics to log.
task: A string of the task type, e.g.,
'classification', 'regression', 'forecast', 'rank'.
'classification', 'regression', 'ts_forecast', 'rank'.
n_jobs: An integer of the number of threads for training.
log_file_name: A string of the log file name.
estimator_list: A list of strings for estimator names, or 'auto'
@ -1386,7 +1395,7 @@ class AutoML:
None, 'stratified', 'uniform', 'time']. None -> stratified.
For regression tasks, valid choices are [None, 'uniform', 'time'].
None -> uniform.
For time series forecasting, must be None or 'time'.
For ts_forecast tasks, must be None or 'time'.
For ranking task, must be None or 'group'.
hpo_method: str or None, default=None | The hyperparameter
optimization method. By default, CFO is used for sequential
@ -1433,10 +1442,13 @@ class AutoML:
size when sample=True.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight. Include period as
a key word argument for 'forecast' task.
a key word argument for 'ts_forecast' task.
"""
self._state._start_time_flag = self._start_time_flag = time.time()
self._state.task = task
if task == FORECAST:
self._state.task = TS_FORECAST
else:
self._state.task = task
self._state.log_training_metric = log_training_metric
self._state.fit_kwargs = fit_kwargs
self._state.weight_val = sample_weight_val
@ -1488,7 +1500,7 @@ class AutoML:
metric = "roc_auc"
elif "multi" in self._state.task:
metric = "log_loss"
elif self._state.task == "forecast":
elif self._state.task == TS_FORECAST:
metric = "mape"
elif self._state.task == "rank":
metric = "ndcg"
@ -1515,7 +1527,7 @@ class AutoML:
logger.info(f"Minimizing error metric: {error_metric}")
if "auto" == estimator_list:
if self._state.task == "forecast":
if self._state.task == TS_FORECAST:
try:
import prophet
@ -2132,7 +2144,7 @@ class AutoML:
elif self._retrain_final:
# reset time budget for retraining
self._state.time_from_start -= self._state.time_budget
if self._state.task == "forecast" or (
if self._state.task == TS_FORECAST or (
self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)
and self._selected.best_config_sample_size == self._state.data_size

View File

@ -12,6 +12,10 @@ from .training_log import training_log_reader
from datetime import datetime
CLASSIFICATION = ("binary", "multi", "classification")
TS_FORECAST = "ts_forecast"
TS_TIMESTAMP_COL = "ds"
TS_VALUE_COL = "y"
FORECAST = "forecast"
def load_openml_dataset(
@ -212,6 +216,11 @@ class DataTransformer:
n = X.shape[0]
cat_columns, num_columns, datetime_columns = [], [], []
drop = False
if task == TS_FORECAST:
X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
ds_col = X.pop(TS_TIMESTAMP_COL)
if isinstance(y, pd.Series):
y = y.rename(TS_VALUE_COL)
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name in ("object", "category"):
@ -270,6 +279,8 @@ class DataTransformer:
X[column] = X[column].fillna(np.nan)
num_columns.append(column)
X = X[cat_columns + num_columns]
if task == TS_FORECAST:
X.insert(0, TS_TIMESTAMP_COL, ds_col)
if cat_columns:
X[cat_columns] = X[cat_columns].astype("category")
if num_columns:
@ -312,7 +323,7 @@ class DataTransformer:
self.label_transformer = None
return X, y
def transform(self, X):
def transform(self, X, task):
X = X.copy()
if isinstance(X, pd.DataFrame):
cat_columns, num_columns, datetime_columns = (
@ -320,6 +331,9 @@ class DataTransformer:
self._num_columns,
self._datetime_columns,
)
if task == TS_FORECAST:
X = X.rename(columns={X.columns[0]: TS_TIMESTAMP_COL})
ds_col = X.pop(TS_TIMESTAMP_COL)
if datetime_columns:
for column in datetime_columns:
tmp_dt = X[column].dt
@ -344,6 +358,8 @@ class DataTransformer:
X[column] = X[column].map(datetime.toordinal)
del tmp_dt
X = X[cat_columns + num_columns].copy()
if task == TS_FORECAST:
X.insert(0, TS_TIMESTAMP_COL, ds_col)
for column in cat_columns:
if X[column].dtype.name == "object":
X[column] = X[column].fillna("__NAN__")

View File

@ -33,7 +33,7 @@ from .model import (
ARIMA,
SARIMAX,
)
from .data import CLASSIFICATION, group_counts
from .data import CLASSIFICATION, group_counts, TS_FORECAST, TS_VALUE_COL
import logging
@ -313,8 +313,8 @@ def evaluate_model_CV(
groups = kf.groups
kf = kf.split(X_train_split, y_train_split, groups)
shuffle = False
elif isinstance(kf, TimeSeriesSplit) and task == "forecast":
y_train_all = pd.DataFrame(y_train_all, columns=["y"])
elif isinstance(kf, TimeSeriesSplit) and task == TS_FORECAST:
y_train_all = pd.DataFrame(y_train_all, columns=[TS_VALUE_COL])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
shuffle = False

View File

@ -11,7 +11,7 @@ from sklearn.linear_model import LogisticRegression
from scipy.sparse import issparse
import pandas as pd
from . import tune
from .data import group_counts, CLASSIFICATION
from .data import group_counts, CLASSIFICATION, TS_FORECAST, TS_TIMESTAMP_COL, TS_VALUE_COL
import logging
@ -871,22 +871,22 @@ class KNeighborsEstimator(BaseEstimator):
return X
class Prophet(BaseEstimator):
class Prophet(SKLearnEstimator):
@classmethod
def search_space(cls, **params):
space = {
"changepoint_prior_scale": {
"domain": tune.loguniform(lower=0.001, upper=1000),
"init_value": 0.01,
"domain": tune.loguniform(lower=0.001, upper=0.05),
"init_value": 0.05,
"low_cost_init_value": 0.001,
},
"seasonality_prior_scale": {
"domain": tune.loguniform(lower=0.01, upper=100),
"init_value": 1,
"domain": tune.loguniform(lower=0.01, upper=10),
"init_value": 10,
},
"holidays_prior_scale": {
"domain": tune.loguniform(lower=0.01, upper=100),
"init_value": 1,
"domain": tune.loguniform(lower=0.01, upper=10),
"init_value": 10,
},
"seasonality_mode": {
"domain": tune.choice(["additive", "multiplicative"]),
@ -895,15 +895,15 @@ class Prophet(BaseEstimator):
}
return space
def __init__(self, task="forecast", n_jobs=1, **params):
def __init__(self, task=TS_FORECAST, n_jobs=1, **params):
super().__init__(task, **params)
def _join(self, X_train, y_train):
assert "ds" in X_train, (
"Dataframe for training forecast model must have column"
' "ds" with the dates in X_train.'
assert TS_TIMESTAMP_COL in X_train, (
"Dataframe for training ts_forecast model must have column"
f' "{TS_TIMESTAMP_COL}" with the dates in X_train.'
)
y_train = pd.DataFrame(y_train, columns=["y"])
y_train = pd.DataFrame(y_train, columns=[TS_VALUE_COL])
train_df = X_train.join(y_train)
return train_df
@ -912,7 +912,14 @@ class Prophet(BaseEstimator):
current_time = time.time()
train_df = self._join(X_train, y_train)
model = Prophet(**self.params).fit(train_df)
train_df = self._preprocess(train_df)
cols = list(train_df)
cols.remove(TS_TIMESTAMP_COL)
cols.remove(TS_VALUE_COL)
model = Prophet(**self.params)
for regressor in cols:
model.add_regressor(regressor)
model.fit(train_df)
train_time = time.time() - current_time
self._model = model
return train_time
@ -921,9 +928,11 @@ class Prophet(BaseEstimator):
if isinstance(X_test, int):
raise ValueError(
"predict() with steps is only supported for arima/sarimax."
" For Prophet, pass a dataframe with a date colum named ds."
" For Prophet, pass a dataframe with the first column containing"
" the timestamp values."
)
if self._model is not None:
X_test = self._preprocess(X_test)
forecast = self._model.predict(X_test)
return forecast["yhat"]
else:
@ -949,7 +958,7 @@ class ARIMA(Prophet):
},
"q": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"init_value": 2,
"init_value": 1,
"low_cost_init_value": 0,
},
}
@ -957,8 +966,8 @@ class ARIMA(Prophet):
def _join(self, X_train, y_train):
train_df = super()._join(X_train, y_train)
train_df.index = pd.to_datetime(train_df["ds"])
train_df = train_df.drop("ds", axis=1)
train_df.index = pd.to_datetime(train_df[TS_TIMESTAMP_COL])
train_df = train_df.drop(TS_TIMESTAMP_COL, axis=1)
return train_df
def fit(self, X_train, y_train, budget=None, **kwargs):
@ -969,12 +978,20 @@ class ARIMA(Prophet):
current_time = time.time()
train_df = self._join(X_train, y_train)
model = ARIMA_estimator(
train_df,
order=(self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False,
enforce_invertibility=False,
)
train_df = self._preprocess(train_df)
cols = list(train_df)
cols.remove(TS_VALUE_COL)
regressors = cols
if regressors:
model = ARIMA_estimator(
train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=(
self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False, enforce_invertibility=False)
else:
model = ARIMA_estimator(
train_df, order=(
self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False, enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
@ -985,12 +1002,20 @@ class ARIMA(Prophet):
if isinstance(X_test, int):
forecast = self._model.forecast(steps=X_test)
elif isinstance(X_test, pd.DataFrame):
first_col = X_test.pop(TS_TIMESTAMP_COL)
X_test.insert(0, TS_TIMESTAMP_COL, first_col)
start = X_test.iloc[0, 0]
end = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start, end=end)
if len(X_test.columns) > 1:
regressors = list(X_test)
regressors.remove(TS_TIMESTAMP_COL)
X_test = self._preprocess(X_test)
forecast = self._model.predict(start=start, end=end, exog=X_test[regressors])
else:
forecast = self._model.predict(start=start, end=end)
else:
raise ValueError(
"X_test needs to be either a pd.Dataframe with dates as column ds)"
"X_test needs to be either a pd.Dataframe with dates as the first column"
" or an int number of periods for predict()."
)
return forecast
@ -1014,7 +1039,7 @@ class SARIMAX(ARIMA):
},
"q": {
"domain": tune.quniform(lower=0, upper=10, q=1),
"init_value": 2,
"init_value": 1,
"low_cost_init_value": 0,
},
"P": {
@ -1040,22 +1065,36 @@ class SARIMAX(ARIMA):
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator
current_time = time.time()
train_df = self._join(X_train, y_train)
model = SARIMAX_estimator(
train_df,
order=(self.params["p"], self.params["d"], self.params["q"]),
seasonality_order=(
self.params["P"],
self.params["D"],
self.params["Q"],
self.params["s"],
),
enforce_stationarity=False,
enforce_invertibility=False,
)
train_df = self._preprocess(train_df)
regressors = list(train_df)
regressors.remove(TS_VALUE_COL)
if regressors:
model = SARIMAX_estimator(
train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=(
self.params["p"], self.params["d"], self.params["q"]),
seasonality_order=(
self.params["P"],
self.params["D"],
self.params["Q"],
self.params["s"]),
enforce_stationarity=False, enforce_invertibility=False)
else:
model = SARIMAX_estimator(
train_df, order=(
self.params["p"], self.params["d"], self.params["q"]),
seasonality_order=(
self.params["P"],
self.params["D"],
self.params["Q"],
self.params["s"]),
enforce_stationarity=False, enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model

View File

@ -1 +1 @@
__version__ = "0.6.9"
__version__ = "0.7.0"

View File

@ -2,13 +2,14 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Time Series Forecasting with FLAML Library"
],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Introduction\n",
"\n",
@ -20,32 +21,33 @@
" - In this notebook, we demonstrate how to use FLAML library to tune hyperparameters of XGBoost with a regression example.\n",
"\n",
"FLAML requires Python>=3.6. To run this notebook example, please install flaml with the notebook and forecast option:\n"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!pip install flaml[notebook,forecast]"
],
"metadata": {},
"outputs": [],
"metadata": {}
"source": [
"!pip install flaml[notebook,ts_forecast]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Forecast Problem\r\n",
"\r\n",
"### Load data and preprocess\r\n",
"\r\n",
"## 2. Forecast Problem\n",
"\n",
"### Load data and preprocess\n",
"\n",
"Import co2 data from statsmodel. The dataset is from “Atmospheric CO2 from Continuous Air Samples at Mauna Loa Observatory, Hawaii, U.S.A.,” which collected CO2 samples from March 1958 to December 2001. The task is to predict monthly CO2 samples."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import statsmodels.api as sm\n",
"data = sm.datasets.co2.load_pandas()\n",
@ -55,149 +57,149 @@
"data = data.fillna(data.bfill()) # makes sure there are no missing values\n",
"data = data.to_frame().reset_index()\n",
"# data = data.rename(columns={'index': 'ds', 'co2': 'y'})"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# split the data into a train dataframe and X_test and y_test dataframes, where the number of samples for test is equal to\n",
"# the number of periods the user wants to predict\n",
"num_samples = data.shape[0]\n",
"time_horizon = 12\n",
"split_idx = num_samples - time_horizon\n",
"X_train = data[:split_idx] # X_train is a dataframe with two columns: time and value\n",
"X_test = data[split_idx:]['index'].to_frame('ds') # X_test is a dataframe with dates for prediction\n",
"train_df = data[:split_idx] # train_df is a dataframe with two columns: timestamp and label\n",
"X_test = data[split_idx:]['index'].to_frame() # X_test is a dataframe with dates for prediction\n",
"y_test = data[split_idx:]['co2'] # y_test is a series of the values corresponding to the dates for prediction"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run FLAML\r\n",
"\r\n",
"### Run FLAML\n",
"\n",
"In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them."
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"''' import AutoML class from flaml package '''\n",
"from flaml import AutoML\n",
"automl = AutoML()"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"settings = {\n",
" \"time_budget\": 180, # total running time in seconds\n",
" \"metric\": 'mape', # primary metric for validation: 'mape' is generally used for forecast tasks\n",
" \"task\": 'forecast', # task type\n",
" \"task\": 'ts_forecast', # task type\n",
" \"log_file_name\": 'CO2_forecast.log', # flaml log file\n",
" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
" \"seed\": 7654321, # random seed\n",
"}"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''The main flaml automl API'''\n",
"automl.fit(dataframe=X_train, # training data\n",
" label=('index', 'co2'), # For 'forecast' task, label should be a tuple of strings for timestamp and value columns\n",
"automl.fit(dataframe=train_df, # training data\n",
" label='co2', # For 'ts_forecast' task, label is the name of the value column; the first dataframe column must hold the timestamps\n",
" **settings, \n",
" period=time_horizon) # key word argument 'period' must be included for forecast task)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Best model and metric"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"''' retrieve best config and best learner'''\n",
"print('Best ML leaner:', automl.best_estimator)\n",
"print('Best hyperparmeter config:', automl.best_config)\n",
"print(f'Best mape on validation data: {automl.best_loss}')\n",
"print(f'Training duration of best run: {automl.best_config_train_time}s')"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(automl.model.estimator)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"''' pickle and save the automl object '''\n",
"import pickle\n",
"with open('automl.pkl', 'wb') as f:\n",
" pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"''' compute predictions of testing dataset '''\n",
"flaml_y_pred = automl.predict(X_test)\n",
"print('Predicted labels', flaml_y_pred)\n",
"print('True labels', y_test)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"''' compute different metric values on testing dataset'''\n",
"from flaml.ml import sklearn_metric_loss_score\n",
"print('mape', '=', sklearn_metric_loss_score('mape', flaml_y_pred, y_test))"
],
"outputs": [],
"metadata": {}
"print('mape', '=', sklearn_metric_loss_score('mape', y_predict=flaml_y_pred, y_true=y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Log history"
],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
@ -205,13 +207,13 @@
"\n",
"for config in config_history:\n",
" print(config)"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
@ -222,13 +224,13 @@
"plt.scatter(time_history, 1 - np.array(valid_loss_history))\n",
"plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n",
"plt.show()"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"plt.plot(X_test, y_test, label='Actual level')\n",
@ -236,32 +238,30 @@
"plt.xlabel('Date')\n",
"plt.ylabel('CO2 Levels')\n",
"plt.legend()"
],
"outputs": [],
"metadata": {}
]
}
],
"metadata": {
"interpreter": {
"hash": "8b6c8c3ba4bafbc4530f534c605c8412f25bf61ef13254e4f377ccd42b838aa4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.0 64-bit ('blend': conda)"
"display_name": "Python 3.8.10 64-bit ('python38': conda)",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"interpreter": {
"hash": "0cfea3304185a9579d09e0953576b57c8581e46e6ebc6dfeb681bc5a511f7544"
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@ -81,6 +81,7 @@ setuptools.setup(
"tensorboardX<=2.2",
"torch",
],
"ts_forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
"forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
},
classifiers=[

View File

@ -23,10 +23,10 @@ def test_forecast_automl(budget=5):
settings = {
"time_budget": budget, # total running time in seconds
"metric": "mape", # primary metric
"task": "forecast", # task type
"task": "ts_forecast", # task type
"log_file_name": "test/CO2_forecast.log", # flaml log file
"eval_method": "holdout",
"label": ("ds", "y"),
"label": "y",
}
"""The main flaml automl API"""
try:
@ -75,7 +75,7 @@ def test_forecast_automl(budget=5):
print(automl.max_resource)
print(automl.min_resource)
X_train = df["ds"]
X_train = df[["ds"]]
y_train = df["y"]
automl = AutoML()
try:
@ -93,39 +93,252 @@ def test_forecast_automl(budget=5):
def test_numpy():
    """Fit a univariate ``ts_forecast`` task on plain numpy arrays.

    ``X_train`` is a single column of monthly timestamps and ``y_train`` holds
    one random value per timestamp. The first 72 samples are used for fitting
    and the remaining timestamps are passed to ``predict``.

    If importing ``prophet`` raises ``ImportError``, a fresh ``AutoML`` instance
    is fitted with the estimator list restricted to ``arima`` and ``sarimax``.
    """
    X_train = np.arange("2014-01", "2021-01", dtype="datetime64[M]")
    # One label per timestamp; sized from X_train so the two cannot drift apart.
    y_train = np.random.random(size=len(X_train))
    automl = AutoML()
    try:
        # Imported only to probe availability; the ImportError handler below
        # switches to estimators that do not need prophet.
        import prophet  # noqa: F401

        automl.fit(
            X_train=X_train[:72],  # a single column of timestamp
            y_train=y_train[:72],  # value for each timestamp
            period=12,  # time horizon to forecast, e.g., 12 months
            task="ts_forecast",
            time_budget=3,  # time budget in seconds
            log_file_name="test/ts_forecast.log",
        )
        print(automl.predict(X_train[72:]))
    except ImportError:
        print("not using prophet due to ImportError")
        automl = AutoML()
        automl.fit(
            X_train=X_train[:72],  # a single column of timestamp
            y_train=y_train[:72],  # value for each timestamp
            period=12,  # time horizon to forecast, e.g., 12 months
            task="ts_forecast",
            time_budget=1,  # time budget in seconds
            estimator_list=["arima", "sarimax"],
            log_file_name="test/ts_forecast.log",
        )
        print(automl.predict(X_train[72:]))
        # an alternative way to specify predict steps for arima/sarimax
        print(automl.predict(12))
def load_multi_dataset(
    source="https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/nyc_energy_consumption.csv",
):
    """Load the multivariate time series forecasting dataset at daily frequency.

    Args:
        source: Anything ``pandas.read_csv`` accepts (URL, local path or
            file-like object). Defaults to the public NYC energy consumption
            CSV. The data must provide a ``timeStamp`` column plus ``temp``
            and ``precip`` columns.

    Returns:
        A ``pandas.DataFrame`` with one row per calendar day: a ``timeStamp``
        column followed by the daily mean of every other column. Gaps in
        ``temp`` and ``precip`` are forward-filled and the last two days are
        dropped.
    """
    import pandas as pd

    df = pd.read_csv(source)
    # preprocessing data: index by timestamp and down-sample to daily means
    df["timeStamp"] = pd.to_datetime(df["timeStamp"])
    df = df.set_index("timeStamp").resample("D").mean()
    # Series.ffill() replaces the deprecated fillna(method="ffill") spelling.
    df["temp"] = df["temp"].ffill()
    df["precip"] = df["precip"].ffill()
    # last two rows are NaN for 'demand' column so remove them
    df = df.iloc[:-2]
    return df.reset_index()
def test_multivariate_forecast_num(budget=5):
    """Run a ``ts_forecast`` task with numerical regressors (``temp``, ``precip``).

    The last 180 rows of the energy dataset are held out for testing and the
    rest is used to fit ``AutoML`` with ``demand`` as the label. The test then
    prints the best learner, pickles the fitted object, predicts on the
    held-out rows, reports MAPE and replays the search log.

    Args:
        budget: Total time budget for the search, in seconds.
    """
    import pickle

    from flaml.data import get_output_from_log
    from flaml.ml import sklearn_metric_loss_score

    time_horizon = 180
    data = load_multi_dataset()

    # split data into train and test
    cutoff = data.shape[0] - time_horizon
    train_df, holdout_df = data[:cutoff], data[cutoff:]
    # test dataframe must contain values for the regressors / multivariate variables
    X_test = holdout_df[["timeStamp", "temp", "precip"]]
    y_test = holdout_df["demand"]

    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "mape",  # primary metric
        "task": "ts_forecast",  # task type
        "log_file_name": "test/energy_forecast_numerical.log",  # flaml log file
        "eval_method": "holdout",
        "log_type": "all",
        "label": "demand",
    }

    # The main flaml automl API
    try:
        # Imported only to probe availability; without prophet the search is
        # restricted to the arima / sarimax estimators below.
        import prophet  # noqa: F401

        automl.fit(dataframe=train_df, period=time_horizon, **settings)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            dataframe=train_df,
            period=time_horizon,
            estimator_list=["arima", "sarimax"],
            **settings,
        )

    # retrieve best config and best learner
    print("Best ML leaner:", automl.best_estimator)
    print("Best hyperparmeter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)

    # pickle and save the automl object
    with open("automl.pkl", "wb") as pkl_file:
        pickle.dump(automl, pkl_file, pickle.HIGHEST_PROTOCOL)

    # compute predictions of testing dataset
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)

    # compute different metric values on testing dataset
    print("mape", "=", sklearn_metric_loss_score("mape", y_pred, y_test))

    # Replay the search log; only the per-trial configs are printed.
    log_output = get_output_from_log(
        filename=settings["log_file_name"], time_budget=budget
    )
    _times, _best_losses, _losses, config_history, _metrics = log_output
    for trial_config in config_history:
        print(trial_config)

    print(automl.prune_attr)
    print(automl.max_resource)
    print(automl.min_resource)
    # To inspect the forecast visually, plot X_test["timeStamp"] against both
    # y_test ("Actual Demand") and y_pred ("FLAML Forecast") with matplotlib.
def load_multi_dataset_cat(time_horizon):
    """Build train/test frames with categorical regressors from the energy data.

    Two features are derived from ``timeStamp`` and ``temp``:

    * ``season``: ``"winter"``, ``"spring"``, ``"summer"`` or ``"fall"``, with
      boundaries on Mar 20, Jun 21, Sep 22 and Dec 21.
    * ``above_monthly_avg``: 1 when the day's ``temp`` is strictly greater
      than the mean ``temp`` of its calendar month over the whole dataset,
      otherwise 0.

    Args:
        time_horizon: Number of trailing rows held out as the test set.

    Returns:
        A ``(train_df, test_df)`` tuple. ``train_df`` has the columns
        ``timeStamp``, ``demand``, ``season`` and ``above_monthly_avg``;
        ``test_df`` additionally keeps the ``temp`` and ``month`` columns.
    """
    df = load_multi_dataset()
    # Work on an explicit copy so the new columns are written to an
    # independent frame rather than to a derived selection.
    df = df[["timeStamp", "demand", "temp"]].copy()

    # feature engineering - use discrete values to denote different categories
    def season(date):
        """Return the season name for a timestamp."""
        month_day = (date.month, date.day)
        spring = (3, 20)
        summer = (6, 21)
        fall = (9, 22)
        winter = (12, 21)
        if month_day < spring or month_day >= winter:
            return "winter"
        if month_day < summer:
            return "spring"
        if month_day < fall:
            return "summer"
        return "fall"

    # Add the month column explicitly instead of as a side effect of a helper.
    df["month"] = df["timeStamp"].dt.month
    df["season"] = df["timeStamp"].apply(season)

    # Broadcast each calendar month's mean temperature back onto its rows and
    # compare column-wise; a NaN on either side compares False and yields 0.
    monthly_avg = df.groupby("month")["temp"].transform("mean")
    df["above_monthly_avg"] = (df["temp"] > monthly_avg).astype("int64")

    # split data into train and test
    split_idx = df.shape[0] - time_horizon
    # drop() returns a new frame, so nothing is deleted from a slice of df.
    train_df = df.iloc[:split_idx].drop(columns=["temp", "month"])
    test_df = df.iloc[split_idx:]
    return train_df, test_df
def test_multivariate_forecast_cat(budget=5):
    """Run a ``ts_forecast`` task with categorical regressors.

    The regressors are ``season`` and ``above_monthly_avg``. The last 180 rows
    of the energy dataset are held out for testing and the rest is used to fit
    ``AutoML`` with ``demand`` as the label. The test then prints the best
    learner, pickles the fitted object, predicts on the held-out rows, reports
    several error metrics and replays the search log.

    Args:
        budget: Total time budget for the search, in seconds.
    """
    import pickle

    from flaml.data import get_output_from_log
    from flaml.ml import sklearn_metric_loss_score

    time_horizon = 180
    train_df, test_df = load_multi_dataset_cat(time_horizon)
    print(train_df)
    # test dataframe must contain values for the regressors / multivariate variables
    X_test = test_df[["timeStamp", "season", "above_monthly_avg"]]
    y_test = test_df["demand"]

    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "mape",  # primary metric
        "task": "ts_forecast",  # task type
        # Dedicated log file: this test previously reused the numerical
        # test's "energy_forecast_numerical.log" name.
        "log_file_name": "test/energy_forecast_categorical.log",  # flaml log file
        "eval_method": "holdout",
        "log_type": "all",
        "label": "demand",
    }

    # The main flaml automl API
    try:
        # Imported only to probe availability; without prophet the search is
        # restricted to the arima / sarimax estimators below.
        import prophet  # noqa: F401

        automl.fit(dataframe=train_df, **settings, period=time_horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            dataframe=train_df,
            **settings,
            estimator_list=["arima", "sarimax"],
            period=time_horizon,
        )

    # retrieve best config and best learner
    print("Best ML leaner:", automl.best_estimator)
    print("Best hyperparmeter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)

    # pickle and save the automl object
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

    # compute predictions of testing dataset
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)

    # compute different metric values on testing dataset
    for metric_name in ("mape", "rmse", "mse", "mae"):
        print(
            metric_name,
            "=",
            sklearn_metric_loss_score(metric_name, y_pred, y_test),
        )

    # Replay the search log; only the per-trial configs are printed.
    _times, _best_losses, _losses, config_history, _metrics = get_output_from_log(
        filename=settings["log_file_name"], time_budget=budget
    )
    for config in config_history:
        print(config)

    print(automl.prune_attr)
    print(automl.max_resource)
    print(automl.min_resource)
    # To inspect the forecast visually, plot X_test["timeStamp"] against both
    # y_test ("Actual Demand") and y_pred ("FLAML Forecast") with matplotlib.
if __name__ == "__main__":
    # When executed as a script, run the forecasting tests one after another
    # with a 60-second search budget each.
    for run_test in (
        test_forecast_automl,
        test_multivariate_forecast_num,
        test_multivariate_forecast_cat,
    ):
        run_test(60)