* added 'forecast' task with estimators ['fbprophet', 'arima', 'sarimax']

* update setup.py

* add TimeSeriesSplit to the 'regression' and 'classification' tasks

* add 'time' split_type for the 'classification' and 'regression' tasks

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>

* feature importance

* variable name

* Update test/test_split.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/test_forecast.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* handle prophet installation failure on Windows

* upload flaml_forecast.ipynb

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>
Kevin Chen committed on 2021-08-23 16:26:46 -04:00 (via GitHub)
parent 6270353458
commit 3d0a3d26a2
14 changed files with 2613 additions and 1021 deletions

.github/workflows/python-package.yml

@ -41,7 +41,7 @@ jobs:
- name: If linux or mac, install ray
if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'
run: |
pip install -e .[ray]
pip install -e .[ray,forecast]
pip install 'tensorboardX<=2.2'
- name: Lint with flake8
run: |

flaml/automl.py

@ -10,7 +10,7 @@ from functools import partial
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold, GroupKFold
RepeatedKFold, GroupKFold, TimeSeriesSplit
from sklearn.utils import shuffle
import pandas as pd
@ -25,6 +25,7 @@ from . import tune
from .training_log import training_log_reader, training_log_writer
import logging
logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
@ -360,11 +361,15 @@ class AutoML:
return self._trained_estimator.classes_.tolist()
return None
def predict(self, X_test):
def predict(self, X_test, freq=None):
'''Predict label from features.
Args:
X_test: A numpy array of featurized instances, shape n * m.
X_test: A numpy array of featurized instances, shape n * m,
or a pandas dataframe with a single timestamp column
for the 'forecast' task.
freq: str or pandas offset, default=None | The frequency of the
time-series.
Returns:
A numpy array of shape n * 1 -- each element is a predicted class
@ -375,8 +380,14 @@ class AutoML:
"No estimator is trained. Please run fit with enough budget.")
return None
X_test = self._preprocess(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1:
if self._state.task == 'forecast':
X_test_df = pd.DataFrame(X_test)
X_test_col = list(X_test_df.columns)[0]
X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
else:
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
y_pred = y_pred.flatten()
if self._label_transformer:
return self._label_transformer.inverse_transform(pd.Series(
@ -408,6 +419,25 @@ class AutoML:
def _validate_data(self, X_train_all, y_train_all, dataframe, label,
X_val=None, y_val=None):
if self._state.task == 'forecast':
if dataframe is not None and label is not None:
dataframe = dataframe.copy()
dataframe = dataframe.rename(columns={label[0]: 'ds', label[1]: 'y'})
elif dataframe is not None:
if ('ds' not in dataframe) or ('y' not in dataframe):
raise ValueError(
'For the forecast task, dataframe must have columns "ds" and "y" '
'with the dates and values respectively.'
)
elif (X_train_all is not None) and (y_train_all is not None):
dataframe = pd.DataFrame(X_train_all)
time_col = list(dataframe.columns)[0]
dataframe = dataframe.rename(columns={time_col: 'ds'})
dataframe['y'] = pd.Series(y_train_all)
X_train_all = None
y_train_all = None
label = 'y'
if X_train_all is not None and y_train_all is not None:
if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)):
@ -440,7 +470,7 @@ class AutoML:
else:
raise ValueError(
"either X_train+y_train or dataframe+label are required")
if issparse(X_train_all):
if issparse(X_train_all) or self._state.task == 'forecast':
self._transformer = self._label_transformer = False
self._X_train_all, self._y_train_all = X, y
else:
@ -482,7 +512,8 @@ class AutoML:
def _prepare_data(self,
eval_method,
split_ratio,
n_splits):
n_splits,
period=None):
X_val, y_val = self._state.X_val, self._state.y_val
if issparse(X_val):
X_val = X_val.tocsr()
@ -490,8 +521,9 @@ class AutoML:
self._X_train_all, self._y_train_all
if issparse(X_train_all):
X_train_all = X_train_all.tocsr()
if self._state.task != 'regression' and self._state.fit_kwargs.get(
'sample_weight') is None:
if (self._state.task == 'binary:logistic' or self._state.task == 'multi:softmax') \
and self._state.fit_kwargs.get('sample_weight') is None \
and self._split_type != 'time':
# logger.info(f"label {pd.unique(y_train_all)}")
label_set, counts = np.unique(y_train_all, return_counts=True)
# augment rare classes
@ -518,19 +550,21 @@ class AutoML:
count += rare_count
logger.info(
f"class {label} augmented from {rare_count} to {count}")
if 'sample_weight' in self._state.fit_kwargs:
X_train_all, y_train_all, self._state.fit_kwargs[
'sample_weight'] = shuffle(
SHUFFLE_SPLIT_TYPES = ['uniform', 'stratified']
if self._split_type in SHUFFLE_SPLIT_TYPES:
if 'sample_weight' in self._state.fit_kwargs:
X_train_all, y_train_all, self._state.fit_kwargs[
'sample_weight'] = shuffle(
X_train_all, y_train_all,
self._state.fit_kwargs['sample_weight'],
random_state=RANDOM_SEED)
elif hasattr(self._state, 'groups') and self._state.groups is not None:
X_train_all, y_train_all, self._state.groups = shuffle(
X_train_all, y_train_all, self._state.groups,
random_state=RANDOM_SEED)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED)
elif hasattr(self._state, 'groups') and self._state.groups is not None:
X_train_all, y_train_all, self._state.groups = shuffle(
X_train_all, y_train_all, self._state.groups,
random_state=RANDOM_SEED)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED)
if self._df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
@ -539,7 +573,31 @@ class AutoML:
X_train, y_train = X_train_all, y_train_all
if X_val is None:
# if eval_method = holdout, make holdout data
if self._state.task != 'regression' and eval_method == 'holdout':
if eval_method == 'holdout' and self._split_type == 'time':
if 'period' in self._state.fit_kwargs:
num_samples = X_train_all.shape[0]
split_idx = num_samples - self._state.fit_kwargs.get('period')
X_train = X_train_all[:split_idx]
y_train = y_train_all[:split_idx]
X_val = X_train_all[split_idx:]
y_val = y_train_all[split_idx:]
else:
if 'sample_weight' in self._state.fit_kwargs:
X_train, X_val, y_train, y_val, self._state.fit_kwargs[
'sample_weight'], self._state.weight_val = \
train_test_split(
X_train_all,
y_train_all,
self._state.fit_kwargs['sample_weight'],
test_size=split_ratio,
shuffle=False)
else:
X_train, X_val, y_train, y_val = train_test_split(
X_train_all,
y_train_all,
test_size=split_ratio,
shuffle=False)
elif self._state.task != 'regression' and eval_method == 'holdout':
# for classification, make sure the labels are complete in both
# training and validation data
label_set, first = np.unique(y_train_all, return_index=True)
@ -624,6 +682,13 @@ class AutoML:
f"requires input data with at least {n_splits*2} examples.")
self._state.kf = RepeatedStratifiedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
elif self._split_type == "time":
logger.info("Using TimeSeriesSplit")
if self._state.task == 'forecast':
self._state.kf = TimeSeriesSplit(
n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
else:
self._state.kf = TimeSeriesSplit(n_splits=n_splits)
else:
logger.info("Using RepeatedKFold")
self._state.kf = RepeatedKFold(
@ -762,10 +827,15 @@ class AutoML:
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform"]
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
self._split_type = "time"
if record_id >= 0:
eval_method = 'cv'
elif eval_method == 'auto':
@ -1011,15 +1081,22 @@ class AutoML:
Args:
X_train: A numpy array or a pandas dataframe of training data in
shape (n, m)
For the 'forecast' task, X_train should contain the timestamps
y_train: A numpy array or a pandas series of labels in shape (n,)
For the 'forecast' task, y_train should contain the corresponding values
dataframe: A dataframe of training data including label column
label: A str of the label column name
For the 'forecast' task, dataframe must be specified and should
have two columns: timestamp and value
label: A str of the label column name for 'classification' or
'regression' task, or a tuple of strings for the timestamp and
value columns for the 'forecast' task
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
metric: A string of the metric name or a function,
e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2'
'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mape', 'mae', 'mse', 'r2'
for the 'forecast' task, use 'mape'
if passing a customized metric function, the function needs to
have the following signature:
@ -1034,7 +1111,7 @@ class AutoML:
which returns a float number as the minimization objective,
and a tuple of floats or a dictionary as the metrics to log
task: A string of the task type, e.g.,
'classification', 'regression'
'classification', 'regression', 'forecast'
n_jobs: An integer of the number of threads for training
log_file_name: A string of the log file name
estimator_list: A list of strings for estimator names, or 'auto'
@ -1085,7 +1162,8 @@ class AutoML:
hyperparameter configurations for the corresponding estimators.
seed: int or None, default=None | The random seed for np.random.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight.
the searched learners, such as sample_weight. For the 'forecast'
task, include period (the forecast horizon) as a keyword argument.
'''
self._start_time_flag = time.time()
self._state.task = task
@ -1093,6 +1171,7 @@ class AutoML:
self._state.fit_kwargs = fit_kwargs
self._state.weight_val = sample_weight_val
self._state.groups = groups
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
@ -1106,10 +1185,19 @@ class AutoML:
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform"]
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
if split_type is not None and split_type != 'time':
raise ValueError("split_type must be 'time' when task is 'forecast'. ")
self._split_type = "time"
if self._state.task == 'forecast' and self._state.fit_kwargs.get('period') is None:
raise TypeError("missing 1 required argument for 'forecast' task: 'period'. ")
if eval_method == 'auto' or self._state.X_val is not None:
eval_method = self._decide_eval_method(time_budget)
self._state.eval_method = eval_method
@ -1122,7 +1210,11 @@ class AutoML:
self._retrain_full = retrain_full and (
eval_method == 'holdout' and self._state.X_val is None)
self._prepare_data(eval_method, split_ratio, n_splits)
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs.get('period'))
self._sample = sample and eval_method != 'cv' and (
MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
if 'auto' == metric:
@ -1130,6 +1222,8 @@ class AutoML:
metric = 'roc_auc'
elif 'multi' in self._state.task:
metric = 'log_loss'
elif self._state.task == 'forecast':
metric = 'mape'
else:
metric = 'r2'
self._state.metric = metric
@ -1146,6 +1240,8 @@ class AutoML:
estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
if 'regression' != self._state.task:
estimator_list += ['lrl1']
if self._state.task == 'forecast':
estimator_list = ['fbprophet', 'arima', 'sarimax']
for estimator_name in estimator_list:
if estimator_name not in self._state.learner_classes:
self.add_learner(
@ -1237,7 +1333,7 @@ class AutoML:
elif 'bs' == self._hpo_method:
from flaml import BlendSearch as SearchAlgo
elif 'cfocat' == self._hpo_method:
from flaml import CFOCat as SearchAlgo
from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
else:
raise NotImplementedError(
f"hpo_method={self._hpo_method} is not recognized. "

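Taken together, the _validate_data and fit changes above accept three equivalent input shapes for the new task. A minimal sketch (df, raw_df, the 'date'/'sales' column names, and the 12-step horizon are all hypothetical):

from flaml import AutoML

automl = AutoML()
settings = {'task': 'forecast', 'metric': 'mape', 'time_budget': 60,
            'eval_method': 'holdout', 'period': 12}  # 'period' is now required for 'forecast'

# 1) a dataframe that already uses the canonical 'ds'/'y' column names
automl.fit(dataframe=df, **settings)
# 2) a dataframe plus a (timestamp, value) tuple as label; columns are renamed to 'ds'/'y'
automl.fit(dataframe=raw_df, label=('date', 'sales'), **settings)
# 3) X_train holding the timestamps and y_train the values
automl.fit(X_train=raw_df[['date']], y_train=raw_df['sales'], **settings)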
flaml/data.py

@ -120,7 +120,7 @@ def get_output_from_log(filename, time_budget):
time_budget: A float of the time budget in seconds
Returns:
training_time_list: A list of the finished time of each logged iter
search_time_list: A list of the finished time of each logged iter
best_error_list:
A list of the best validation error after each logged iter
error_list: A list of the validation error of each logged iter
@ -132,9 +132,8 @@ def get_output_from_log(filename, time_budget):
best_config = None
best_learner = None
best_val_loss = float('+inf')
training_duration = 0.0
training_time_list = []
search_time_list = []
config_list = []
best_error_list = []
error_list = []
@ -143,7 +142,6 @@ def get_output_from_log(filename, time_budget):
with training_log_reader(filename) as reader:
for record in reader.records():
time_used = record.total_search_time
training_duration = time_used
val_loss = record.validation_loss
config = record.config
learner = record.learner.split('_')[0]
@ -156,7 +154,7 @@ def get_output_from_log(filename, time_budget):
best_config = config
best_learner = learner
best_config_list.append(best_config)
training_time_list.append(training_duration)
search_time_list.append(time_used)
best_error_list.append(best_val_loss)
logged_metric_list.append(train_loss)
error_list.append(val_loss)
@ -166,7 +164,7 @@ def get_output_from_log(filename, time_budget):
"Best Learner": best_learner,
"Best Hyper-parameters": best_config})
return (training_time_list, best_error_list, error_list, config_list,
return (search_time_list, best_error_list, error_list, config_list,
logged_metric_list)

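Callers of get_output_from_log are unaffected positionally; the first returned list is simply renamed to reflect that it records cumulative search time. A hedged sketch of consuming it (the log file name is borrowed from the tests below):

from flaml.data import get_output_from_log

search_time, best_err, err, configs, metrics = get_output_from_log(
    filename='CO2_forecast.log', time_budget=60)
for t, e in zip(search_time, best_err):
    print(f'{t:.1f}s -> best validation loss {e:.4f}')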
flaml/ml.py

@ -9,12 +9,12 @@ import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold
f1_score, mean_absolute_percentage_error
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
ExtraTreeEstimator, KNeighborsEstimator)
ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
import logging
logger = logging.getLogger(__name__)
@ -42,6 +42,12 @@ def get_estimator_class(task, estimator_name):
estimator_class = ExtraTreeEstimator
elif 'kneighbor' == estimator_name:
estimator_class = KNeighborsEstimator
elif 'prophet' in estimator_name:
estimator_class = FBProphet
elif estimator_name == 'arima':
estimator_class = ARIMA
elif estimator_name == 'sarimax':
estimator_class = SARIMAX
else:
raise ValueError(
estimator_name + ' is not a built-in learner. '
@ -57,7 +63,7 @@ def sklearn_metric_loss_score(
Args:
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
'roc_auc_ovo', 'log_loss', 'f1', 'ap', 'micro_f1', 'macro_f1'
'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
y_predict: A 1d or 2d numpy array of the predictions which can be
used to calculate the metric. E.g., 2d for log_loss and 1d
for others.
@ -95,6 +101,9 @@ def sklearn_metric_loss_score(
elif 'log_loss' in metric_name:
score = log_loss(
y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif 'mape' in metric_name:
score = mean_absolute_percentage_error(
y_true, y_predict, sample_weight=sample_weight)
elif 'micro_f1' in metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='micro')
@ -111,18 +120,20 @@ def sklearn_metric_loss_score(
metric_name + ' is not a built-in metric, '
'currently built-in metrics are: '
'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
'log_loss, f1, micro_f1, macro_f1, ap. '
'log_loss, mape, f1, micro_f1, macro_f1, ap. '
'please pass a customized metric function to AutoML.fit(metric=func)')
return score
def get_y_pred(estimator, X, eval_metric, obj):
def get_y_pred(estimator, X, eval_metric, obj, freq=None):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[
:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
y_pred = estimator.predict_proba(X)
elif eval_metric == 'mape':
y_pred = estimator.predict(X, freq=freq)
else:
y_pred = estimator.predict(X)
return y_pred
@ -201,15 +212,21 @@ def evaluate_model_CV(
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if task == 'regression':
labels = None
else:
if task == 'binary:logistic' or task == 'multi:softmax':
labels = np.unique(y_train_all)
else:
labels = None
shuffle_folds = not isinstance(kf, TimeSeriesSplit)  # keep chronological order for time-series CV
if isinstance(kf, RepeatedStratifiedKFold):
kf = kf.split(X_train_split, y_train_split)
elif isinstance(kf, GroupKFold):
kf = kf.split(X_train_split, y_train_split, kf.groups)
elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
y_train_all = pd.DataFrame(y_train_all, columns=['y'])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
elif isinstance(kf, TimeSeriesSplit):
kf = kf.split(X_train_split, y_train_split)
else:
kf = kf.split(X_train_split)
rng = np.random.RandomState(2020)
@ -221,7 +238,8 @@ def evaluate_model_CV(
else:
weight = weight_val = None
for train_index, val_index in kf:
train_index = rng.permutation(train_index)
if shuffle_folds:  # kf is a generator here; check the flag captured before splitting
train_index = rng.permutation(train_index)
if isinstance(X_train_all, pd.DataFrame):
X_train, X_val = X_train_split.iloc[
train_index], X_train_split.iloc[val_index]

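The new 'mape' branch routes to sklearn's mean_absolute_percentage_error (added in scikit-learn 0.24, which this assumes). A quick sketch with made-up numbers:

from flaml.ml import sklearn_metric_loss_score

y_true = [100.0, 110.0, 120.0]
y_pred = [102.0, 108.0, 123.0]
# mean(|error| / |truth|) over the three points
print(sklearn_metric_loss_score('mape', y_pred, y_true))  # ~0.0211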
flaml/model.py

@ -15,6 +15,7 @@ import pandas as pd
from . import tune
import logging
logger = logging.getLogger(__name__)
@ -635,7 +636,6 @@ class LRL2Classifier(SKLearnEstimator):
class CatBoostEstimator(BaseEstimator):
_time_per_iter = None
_train_size = 0
@ -834,3 +834,222 @@ class KNeighborsEstimator(BaseEstimator):
X = X.drop(cat_columns, axis=1)
X = X.to_numpy()
return X
class FBProphet(BaseEstimator):
@classmethod
def search_space(cls, **params):
space = {
'changepoint_prior_scale': {
'domain': tune.loguniform(lower=0.001, upper=1000),
'init_value': 0.01,
'low_cost_init_value': 0.001,
},
'seasonality_prior_scale': {
'domain': tune.loguniform(lower=0.01, upper=100),
'init_value': 1,
},
'holidays_prior_scale': {
'domain': tune.loguniform(lower=0.01, upper=100),
'init_value': 1,
},
'seasonality_mode': {
'domain': tune.choice(['additive', 'multiplicative']),
'init_value': 'multiplicative',
}
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
from prophet import Prophet
current_time = time.time()
model = Prophet(**self.params).fit(train_df)
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
future = self._model.make_future_dataframe(periods=X_test, freq=freq)
forecast = self._model.predict(future)
elif isinstance(X_test, pd.DataFrame):
forecast = self._model.predict(X_test)
else:
raise ValueError(
"either X_test(pd.DataFrame with dates for predictions, column ds) or "
"X_test(int number of periods) + freq are required.")
return forecast['yhat']
else:
return np.ones(X_test.shape[0])
class ARIMA(BaseEstimator):
@classmethod
def search_space(cls, **params):
space = {
'p': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'd': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'q': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
}
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
import warnings
warnings.filterwarnings("ignore")
current_time = time.time()
model = ARIMA_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
enforce_stationarity=False,
enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
else:
raise ValueError(
"either X_test(pd.DataFrame with dates for predictions, column ds) or "
"X_test(int number of periods) + freq are required.")
return forecast
else:
return np.ones(X_test.shape[0])
class SARIMAX(BaseEstimator):
@classmethod
def search_space(cls, **params):
space = {
'p': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'd': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'q': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 2,
'low_cost_init_value': 0,
},
'P': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 1,
'low_cost_init_value': 0,
},
'D': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 1,
'low_cost_init_value': 0,
},
'Q': {
'domain': tune.quniform(lower=0, upper=10, q=1),
'init_value': 1,
'low_cost_init_value': 0,
},
's': {
'domain': tune.choice([1, 4, 6, 12]),
'init_value': 12,
}
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator
current_time = time.time()
model = SARIMAX_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
seasonal_order=(self.params['P'], self.params['D'], self.params['Q'], self.params['s']),
enforce_stationarity=False,
enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
else:
raise ValueError(
"either X_test(pd.DataFrame with dates for predictions, column ds) or "
"X_test(int number of periods) + freq are required.")
return forecast
else:
return np.ones(X_test.shape[0])

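All three estimators above share one predict() contract: either a DataFrame of future timestamps in a 'ds' column, or an integer horizon together with a pandas frequency string. A hedged sketch (est stands in for any fitted forecast learner from this file):

import pandas as pd

future = pd.DataFrame({'ds': pd.date_range('2001-01-01', periods=12, freq='MS')})
y_hat = est.predict(future)         # explicit future dates
y_hat = est.predict(12, freq='MS')  # or horizon length + frequency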
flaml/searcher/blendsearch.py

@ -165,7 +165,8 @@ class BlendSearch(Searcher):
min_resource, max_resource, reduction_factor, self.cost_attr, seed)
self._is_ls_ever_converged = False
self._subspace = {} # the subspace for each trial id
self._init_search()
if space:
self._init_search()
def set_search_properties(self,
metric: Optional[str] = None,

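The one-line guard above lets BlendSearch be constructed without a search space, deferring _init_search until a space arrives (e.g., through set_search_properties when tune.run passes its config). A hedged sketch:

from flaml import BlendSearch

algo = BlendSearch(metric='mean_loss', mode='min')  # no space yet; _init_search is deferred
# later: tune.run(trainable, config=search_space, search_alg=algo, ...) supplies the space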
flaml/version.py

@ -1 +1 @@
__version__ = "0.5.12"
__version__ = "0.5.13"

3 file diffs suppressed because one or more lines are too long

setup.py

@ -56,6 +56,7 @@ setuptools.setup(
"torch==1.8.1",
"datasets==1.4.1",
"azure-storage-blob",
"statsmodels>=0.12.2"
],
"blendsearch": [
"optuna==2.8.0"
@ -79,6 +80,10 @@ setuptools.setup(
"datasets==1.4.1",
"tensorboardX<=2.2",
"torch"
],
"forecast": [
"prophet>=1.0.1",
"statsmodels>=0.12.2"
]
},
classifiers=[

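With the new extra, forecasting support installs via pip install flaml[forecast]. A hedged sketch of the optional-import pattern the tests below lean on, since prophet can fail to build (e.g., on Windows, per the commit message):

try:
    import prophet  # pulled in by the 'forecast' extra (prophet>=1.0.1)
    import statsmodels  # statsmodels>=0.12.2
except ImportError:
    # fall back to the statsmodels-only learners ('arima', 'sarimax')
    print("install with: pip install 'flaml[forecast]'")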
test/test_forecast.py (new file, 119 lines)

@ -0,0 +1,119 @@
def test_forecast_automl_df(budget=5):
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
data = data.rename(columns={'index': 'ds', 'co2': 'y'})
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]
X_test = data[split_idx:]['ds'].to_frame()
y_test = data[split_idx:]['y'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": 'mape', # primary metric
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
except ImportError:
automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
print(f'Best mape on validation data: {automl.best_loss}')
print(f'Training duration of best run: {automl.best_config_train_time}s')
print(automl.model.estimator)
''' pickle and save the automl object '''
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
print(automl.prune_attr)
print(automl.max_resource)
print(automl.min_resource)
def test_forecast_automl_Xy(budget=5):
# using X_train and y_train
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]['index'].to_frame()
y_train = data[:split_idx]['co2']
X_test = data[split_idx:]['index'].to_frame()
y_test = data[split_idx:]['co2'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": 'mape', # primary metric
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
except ImportError:
automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
print(f'Best mape on validation data: {automl.best_loss}')
print(f'Training duration of best run: {automl.best_config_train_time}s')
print(automl.model.estimator)
''' pickle and save the automl object '''
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
print(automl.prune_attr)
print(automl.max_resource)
print(automl.min_resource)
if __name__ == "__main__":
test_forecast_automl_df(60)
test_forecast_automl_Xy(60)

test/test_split.py

@ -6,10 +6,12 @@ from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
dataset = "credit"
dataset = "credit-g"
def _test(split_type):
from sklearn.externals._arff import ArffException
automl = AutoML()
automl_settings = {
@ -22,9 +24,17 @@ def _test(split_type):
"split_type": split_type,
}
X, y = fetch_openml(name=dataset, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
if split_type != 'time':
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
else:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
shuffle=False)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
pred = automl.predict(X_test)
@ -37,6 +47,10 @@ def _test_uniform():
_test(split_type="uniform")
def test_time():
_test(split_type="time")
def test_groups():
from sklearn.externals._arff import ArffException
try:

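For classification and regression, split_type='time' keeps rows in chronological order and validates on the tail instead of shuffling, as exercised above. A minimal sketch (X, y are hypothetical, time-ordered arrays):

from flaml import AutoML

automl = AutoML()
automl.fit(X_train=X, y_train=y, task='regression', split_type='time',
           eval_method='holdout', time_budget=30)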
test/tune/example.py (new file, 52 lines)

@ -0,0 +1,52 @@
import time
def evaluation_fn(step, width, height):
return (0.1 + width * step / 100)**(-1) + height * 0.1
def easy_objective(config):
from ray import tune
# Hyperparameters
width, height = config["width"], config["height"]
for step in range(config["steps"]):
# Iterative training function - can be any arbitrary training procedure
intermediate_score = evaluation_fn(step, width, height)
# Feed the score back to Tune.
tune.report(iterations=step, mean_loss=intermediate_score)
time.sleep(0.1)
def test_blendsearch_tune(smoke_test=True):
try:
from ray import tune
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.flaml import BlendSearch
except ImportError:
print('ray[tune] is not installed, skipping test')
return
algo = BlendSearch()
algo = ConcurrencyLimiter(algo, max_concurrent=4)
scheduler = AsyncHyperBandScheduler()
analysis = tune.run(
easy_objective,
metric="mean_loss",
mode="min",
search_alg=algo,
scheduler=scheduler,
num_samples=10 if smoke_test else 100,
config={
"steps": 100,
"width": tune.uniform(0, 20),
"height": tune.uniform(-100, 100),
# This is an ignored parameter.
"activation": tune.choice(["relu", "tanh"])
})
print("Best hyperparameters found were: ", analysis.best_config)
if __name__ == "__main__":
test_blendsearch_tune(False)