mirror of https://github.com/microsoft/autogen.git
Forecast (#162)
* added 'forecast' task with estimators ['fbprophet', 'arima', 'sarimax']
* update setup.py
* add TimeSeriesSplit to 'regression' and 'classification' task
* add 'time' split_type for 'classification' and 'regression' task

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>

* feature importance
* variable name
* Update test/test_split.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* Update test/test_forecast.py

Co-authored-by: Chi Wang <wang.chi@microsoft.com>

* prophet installation fail in windows
* upload flaml_forecast.ipynb

Signed-off-by: Kevin Chen <chenkevin.8787@gmail.com>
This commit is contained in:
parent
6270353458
commit
3d0a3d26a2
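For orientation, here is a minimal end-to-end sketch of the feature this commit adds, distilled from the test/test_forecast.py test introduced further down; it assumes the new 'forecast' extra is installed (pip install flaml[forecast]) and uses a small illustrative time budget.

import statsmodels.api as sm
from flaml import AutoML

# Monthly CO2 series reshaped into the 'ds'/'y' frame the 'forecast' task expects.
data = sm.datasets.co2.load_pandas().data['co2'].resample('MS').mean()
data = data.fillna(data.bfill()).to_frame().reset_index()
data = data.rename(columns={'index': 'ds', 'co2': 'y'})

automl = AutoML()
automl.fit(dataframe=data[:-12], task='forecast', metric='mape',
           eval_method='holdout', split_type='time',
           time_budget=15, period=12, freq='M')
print(automl.predict(data[-12:]['ds'].to_frame()))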
@@ -41,7 +41,7 @@ jobs:
    - name: If linux or mac, install ray
      if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'
      run: |
-        pip install -e .[ray]
+        pip install -e .[ray,forecast]
        pip install 'tensorboardX<=2.2'
    - name: Lint with flake8
      run: |
160  flaml/automl.py
@@ -10,7 +10,7 @@ from functools import partial
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
-    RepeatedKFold, GroupKFold
+    RepeatedKFold, GroupKFold, TimeSeriesSplit
from sklearn.utils import shuffle
import pandas as pd
@@ -25,6 +25,7 @@ from . import tune
from .training_log import training_log_reader, training_log_writer

import logging

logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
    '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
@@ -360,11 +361,15 @@ class AutoML:
            return self._trained_estimator.classes_.tolist()
        return None

-    def predict(self, X_test):
+    def predict(self, X_test, freq=None):
        '''Predict label from features.

        Args:
-            X_test: A numpy array of featurized instances, shape n * m.
+            X_test: A numpy array of featurized instances, shape n * m,
+                or a pandas dataframe with one column of timestamp values
+                for the 'forecast' task.
+            freq: str or pandas offset, default=None | The frequency of the
+                time series.

        Returns:
            A numpy array of shape n * 1 -- each element is a predicted class
@@ -375,8 +380,14 @@ class AutoML:
                "No estimator is trained. Please run fit with enough budget.")
            return None
        X_test = self._preprocess(X_test)
-        y_pred = self._trained_estimator.predict(X_test)
-        if y_pred.ndim > 1:
+        if self._state.task == 'forecast':
+            X_test_df = pd.DataFrame(X_test)
+            X_test_col = list(X_test.columns)[0]
+            X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
+            y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
+        else:
+            y_pred = self._trained_estimator.predict(X_test)
+        if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
            y_pred = y_pred.flatten()
        if self._label_transformer:
            return self._label_transformer.inverse_transform(pd.Series(
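As a toy illustration of the renaming predict() now performs for the 'forecast' task (the column name 'timestamp' here is made up), the single timestamp column is mapped to the Prophet-style 'ds' name before the trained estimator is called:

import pandas as pd

X_test = pd.DataFrame({'timestamp': pd.date_range('2021-01-01', periods=3, freq='MS')})
X_test_df = X_test.rename(columns={list(X_test.columns)[0]: 'ds'})
print(list(X_test_df.columns))  # ['ds']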
@@ -408,6 +419,25 @@ class AutoML:

    def _validate_data(self, X_train_all, y_train_all, dataframe, label,
                       X_val=None, y_val=None):
+        if self._state.task == 'forecast':
+            if dataframe is not None and label is not None:
+                dataframe = dataframe.copy()
+                dataframe = dataframe.rename(columns={label[0]: 'ds', label[1]: 'y'})
+            elif dataframe is not None:
+                if ('ds' not in dataframe) or ('y' not in dataframe):
+                    raise ValueError(
+                        'For forecasting task, Dataframe must have columns "ds" and "y" '
+                        'with the dates and values respectively.'
+                    )
+            elif (X_train_all is not None) and (y_train_all is not None):
+                dataframe = pd.DataFrame(X_train_all)
+                time_col = list(dataframe.columns)[0]
+                dataframe = dataframe.rename(columns={time_col: 'ds'})
+                dataframe['y'] = pd.Series(y_train_all)
+                X_train_all = None
+                y_train_all = None
+                label = 'y'
+
        if X_train_all is not None and y_train_all is not None:
            if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
                    or isinstance(X_train_all, pd.DataFrame)):
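The branch above accepts two equivalent input forms for the 'forecast' task. A sketch with made-up column names ('month', 'sales') showing that both reduce to the same 'ds'/'y' frame:

import pandas as pd

dates = pd.date_range('2020-01-01', periods=6, freq='MS')
values = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

# Form 1: a dataframe plus a (timestamp, value) label tuple; the columns get renamed.
df = pd.DataFrame({'month': dates, 'sales': values}).rename(
    columns={'month': 'ds', 'sales': 'y'})

# Form 2: X_train holds the timestamps, y_train the values; the frame is rebuilt.
frame = pd.DataFrame(dates.to_frame(index=False))
frame = frame.rename(columns={list(frame.columns)[0]: 'ds'})
frame['y'] = pd.Series(values)

assert list(df.columns) == list(frame.columns) == ['ds', 'y']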
@@ -440,7 +470,7 @@ class AutoML:
        else:
            raise ValueError(
                "either X_train+y_train or dataframe+label are required")
-        if issparse(X_train_all):
+        if issparse(X_train_all) or self._state.task == 'forecast':
            self._transformer = self._label_transformer = False
            self._X_train_all, self._y_train_all = X, y
        else:
@@ -482,7 +512,8 @@ class AutoML:
    def _prepare_data(self,
                      eval_method,
                      split_ratio,
-                      n_splits):
+                      n_splits,
+                      period=None):
        X_val, y_val = self._state.X_val, self._state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
@@ -490,8 +521,9 @@ class AutoML:
            self._X_train_all, self._y_train_all
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
-        if self._state.task != 'regression' and self._state.fit_kwargs.get(
-                'sample_weight') is None:
+        if (self._state.task == 'binary:logistic' or self._state.task == 'multi:softmax') \
+                and self._state.fit_kwargs.get('sample_weight') is None \
+                and self._split_type != 'time':
            # logger.info(f"label {pd.unique(y_train_all)}")
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes
@@ -518,19 +550,21 @@ class AutoML:
                    count += rare_count
                    logger.info(
                        f"class {label} augmented from {rare_count} to {count}")
-        if 'sample_weight' in self._state.fit_kwargs:
-            X_train_all, y_train_all, self._state.fit_kwargs[
-                'sample_weight'] = shuffle(
-                    X_train_all, y_train_all,
-                    self._state.fit_kwargs['sample_weight'],
-                    random_state=RANDOM_SEED)
-        elif hasattr(self._state, 'groups') and self._state.groups is not None:
-            X_train_all, y_train_all, self._state.groups = shuffle(
-                X_train_all, y_train_all, self._state.groups,
-                random_state=RANDOM_SEED)
-        else:
-            X_train_all, y_train_all = shuffle(
-                X_train_all, y_train_all, random_state=RANDOM_SEED)
+        SHUFFLE_SPLIT_TYPES = ['uniform', 'stratified']
+        if self._split_type in SHUFFLE_SPLIT_TYPES:
+            if 'sample_weight' in self._state.fit_kwargs:
+                X_train_all, y_train_all, self._state.fit_kwargs[
+                    'sample_weight'] = shuffle(
+                        X_train_all, y_train_all,
+                        self._state.fit_kwargs['sample_weight'],
+                        random_state=RANDOM_SEED)
+            elif hasattr(self._state, 'groups') and self._state.groups is not None:
+                X_train_all, y_train_all, self._state.groups = shuffle(
+                    X_train_all, y_train_all, self._state.groups,
+                    random_state=RANDOM_SEED)
+            else:
+                X_train_all, y_train_all = shuffle(
+                    X_train_all, y_train_all, random_state=RANDOM_SEED)
        if self._df:
            X_train_all.reset_index(drop=True, inplace=True)
            if isinstance(y_train_all, pd.Series):
@@ -539,7 +573,31 @@ class AutoML:
        X_train, y_train = X_train_all, y_train_all
        if X_val is None:
            # if eval_method = holdout, make holdout data
-            if self._state.task != 'regression' and eval_method == 'holdout':
+            if eval_method == 'holdout' and self._split_type == 'time':
+                if 'period' in self._state.fit_kwargs:
+                    num_samples = X_train_all.shape[0]
+                    split_idx = num_samples - self._state.fit_kwargs.get('period')
+                    X_train = X_train_all[:split_idx]
+                    y_train = y_train_all[:split_idx]
+                    X_val = X_train_all[split_idx:]
+                    y_val = y_train_all[split_idx:]
+                else:
+                    if 'sample_weight' in self._state.fit_kwargs:
+                        X_train, X_val, y_train, y_val, self._state.fit_kwargs[
+                            'sample_weight'], self._state.weight_val = \
+                            train_test_split(
+                                X_train_all,
+                                y_train_all,
+                                self._state.fit_kwargs['sample_weight'],
+                                test_size=split_ratio,
+                                shuffle=False)
+                    else:
+                        X_train, X_val, y_train, y_val = train_test_split(
+                            X_train_all,
+                            y_train_all,
+                            test_size=split_ratio,
+                            shuffle=False)
+            elif self._state.task != 'regression' and eval_method == 'holdout':
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = np.unique(y_train_all, return_index=True)
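The time-based holdout above is plain index arithmetic: the last period rows become the validation fold. A quick check of the split sizes:

num_samples, period = 100, 12
split_idx = num_samples - period
train_rows = range(num_samples)[:split_idx]
val_rows = range(num_samples)[split_idx:]
assert len(train_rows) == 88 and len(val_rows) == 12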
@@ -624,6 +682,13 @@ class AutoML:
                    f"requires input data with at least {n_splits*2} examples.")
                self._state.kf = RepeatedStratifiedKFold(
                    n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
+        elif self._split_type == "time":
+            logger.info("Using TimeSeriesSplit")
+            if self._state.task == 'forecast':
+                self._state.kf = TimeSeriesSplit(
+                    n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
+            else:
+                self._state.kf = TimeSeriesSplit(n_splits=n_splits)
        else:
            logger.info("Using RepeatedKFold")
            self._state.kf = RepeatedKFold(
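The cv branch above relies on scikit-learn's TimeSeriesSplit; with test_size set to the forecast period (the test_size argument requires scikit-learn 0.24 or newer), every fold validates on the next period points. A standalone sketch:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(30).reshape(-1, 1)
for train_idx, val_idx in TimeSeriesSplit(n_splits=3, test_size=6).split(X):
    # Each validation window starts right after the training window ends.
    print(train_idx[-1], val_idx[0], val_idx[-1])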
@@ -762,10 +827,15 @@ class AutoML:
        if self._state.task == 'classification':
            self._state.task = get_classification_objective(
                len(np.unique(self._y_train_all)))
-            assert split_type in ["stratified", "uniform"]
+            assert split_type in ["stratified", "uniform", "time"]
            self._split_type = split_type
-        else:
-            self._split_type = "uniform"
+        elif self._state.task == 'regression':
+            if split_type in ["uniform", "time"]:
+                self._split_type = split_type
+            else:
+                self._split_type = "uniform"
+        elif self._state.task == 'forecast':
+            self._split_type = "time"
        if record_id >= 0:
            eval_method = 'cv'
        elif eval_method == 'auto':
@@ -1011,15 +1081,22 @@ class AutoML:
        Args:
            X_train: A numpy array or a pandas dataframe of training data in
                shape (n, m)
+                For the 'forecast' task, X_train should contain the timestamps
            y_train: A numpy array or a pandas series of labels in shape (n,)
+                For the 'forecast' task, y_train should contain the values
            dataframe: A dataframe of training data including label column
-            label: A str of the label column name
+                For the 'forecast' task, dataframe must be specified and should
+                have two columns: timestamp and value
+            label: A str of the label column name for the 'classification' or
+                'regression' task, or a tuple of strings for the timestamp and
+                value columns for the 'forecast' task
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
-                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2'
+                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mape', 'mae', 'mse', 'r2'
+                For the 'forecast' task, use 'mape'
                if passing a customized metric function, the function needs to
                have the following signature:
@@ -1034,7 +1111,7 @@ class AutoML:
                which returns a float number as the minimization objective,
                and a tuple of floats or a dictionary as the metrics to log
            task: A string of the task type, e.g.,
-                'classification', 'regression'
+                'classification', 'regression', 'forecast'
            n_jobs: An integer of the number of threads for training
            log_file_name: A string of the log file name
            estimator_list: A list of strings for estimator names, or 'auto'
@@ -1085,7 +1162,8 @@ class AutoML:
                hyperparameter configurations for the corresponding estimators.
            seed: int or None, default=None | The random seed for np.random.
            **fit_kwargs: Other key word arguments to pass to fit() function of
-                the searched learners, such as sample_weight.
+                the searched learners, such as sample_weight. Include period as
+                a key word argument for the 'forecast' task.
        '''
        self._start_time_flag = time.time()
        self._state.task = task
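A hypothetical fit() call exercising the documented arguments, including the (timestamp, value) label tuple and the required period keyword; the toy frame and its 'month'/'sales' column names are made up for illustration:

import pandas as pd
from flaml import AutoML

train_df = pd.DataFrame({'month': pd.date_range('2018-01-01', periods=48, freq='MS'),
                         'sales': range(48)})
automl = AutoML()
automl.fit(dataframe=train_df, label=('month', 'sales'), task='forecast',
           metric='mape', eval_method='holdout', split_type='time',
           period=12, time_budget=15)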
@@ -1093,6 +1171,7 @@ class AutoML:
        self._state.fit_kwargs = fit_kwargs
        self._state.weight_val = sample_weight_val
        self._state.groups = groups

        self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
        self._search_states = {}  # key: estimator name; value: SearchState
        self._random = np.random.RandomState(RANDOM_SEED)
@@ -1106,10 +1185,19 @@ class AutoML:
        if self._state.task == 'classification':
            self._state.task = get_classification_objective(
                len(np.unique(self._y_train_all)))
-            assert split_type in ["stratified", "uniform"]
+            assert split_type in ["stratified", "uniform", "time"]
            self._split_type = split_type
-        else:
-            self._split_type = "uniform"
+        elif self._state.task == 'regression':
+            if split_type in ["uniform", "time"]:
+                self._split_type = split_type
+            else:
+                self._split_type = "uniform"
+        elif self._state.task == 'forecast':
+            if split_type is not None and split_type != 'time':
+                raise ValueError("split_type must be 'time' when task is 'forecast'.")
+            self._split_type = "time"
+        if self._state.task == 'forecast' and self._state.fit_kwargs.get('period') is None:
+            raise TypeError("missing 1 required argument for 'forecast' task: 'period'.")
        if eval_method == 'auto' or self._state.X_val is not None:
            eval_method = self._decide_eval_method(time_budget)
        self._state.eval_method = eval_method
@@ -1122,7 +1210,11 @@ class AutoML:

        self._retrain_full = retrain_full and (
            eval_method == 'holdout' and self._state.X_val is None)
-        self._prepare_data(eval_method, split_ratio, n_splits)
+        if self._state.task != 'forecast':
+            self._prepare_data(eval_method, split_ratio, n_splits)
+        else:
+            self._prepare_data(eval_method, split_ratio, n_splits,
+                               period=self._state.fit_kwargs.get('period'))
        self._sample = sample and eval_method != 'cv' and (
            MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
        if 'auto' == metric:
@@ -1130,6 +1222,8 @@ class AutoML:
                metric = 'roc_auc'
            elif 'multi' in self._state.task:
                metric = 'log_loss'
+            elif self._state.task == 'forecast':
+                metric = 'mape'
            else:
                metric = 'r2'
        self._state.metric = metric
@@ -1146,6 +1240,8 @@ class AutoML:
            estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
            if 'regression' != self._state.task:
                estimator_list += ['lrl1']
+            if self._state.task == 'forecast':
+                estimator_list = ['fbprophet', 'arima', 'sarimax']
        for estimator_name in estimator_list:
            if estimator_name not in self._state.learner_classes:
                self.add_learner(
@@ -1237,7 +1333,7 @@ class AutoML:
            elif 'bs' == self._hpo_method:
                from flaml import BlendSearch as SearchAlgo
            elif 'cfocat' == self._hpo_method:
-                from flaml import CFOCat as SearchAlgo
+                from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
            else:
                raise NotImplementedError(
                    f"hpo_method={self._hpo_method} is not recognized. "
flaml/data.py
@@ -120,7 +120,7 @@ def get_output_from_log(filename, time_budget):
        time_budget: A float of the time budget in seconds

    Returns:
-        training_time_list: A list of the finished time of each logged iter
+        search_time_list: A list of the finished time of each logged iter
        best_error_list:
            A list of the best validation error after each logged iter
        error_list: A list of the validation error of each logged iter
@@ -132,9 +132,8 @@ def get_output_from_log(filename, time_budget):
    best_config = None
    best_learner = None
    best_val_loss = float('+inf')
-    training_duration = 0.0

-    training_time_list = []
+    search_time_list = []
    config_list = []
    best_error_list = []
    error_list = []
@@ -143,7 +142,6 @@ def get_output_from_log(filename, time_budget):
    with training_log_reader(filename) as reader:
        for record in reader.records():
            time_used = record.total_search_time
-            training_duration = time_used
            val_loss = record.validation_loss
            config = record.config
            learner = record.learner.split('_')[0]
@@ -156,7 +154,7 @@ def get_output_from_log(filename, time_budget):
                best_config = config
                best_learner = learner
                best_config_list.append(best_config)
-            training_time_list.append(training_duration)
+            search_time_list.append(time_used)
            best_error_list.append(best_val_loss)
            logged_metric_list.append(train_loss)
            error_list.append(val_loss)
@@ -166,7 +164,7 @@ def get_output_from_log(filename, time_budget):
            "Best Learner": best_learner,
            "Best Hyper-parameters": best_config})

-    return (training_time_list, best_error_list, error_list, config_list,
+    return (search_time_list, best_error_list, error_list, config_list,
            logged_metric_list)
38  flaml/ml.py
@@ -9,12 +9,12 @@ import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
-    f1_score
-from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold
+    f1_score, mean_absolute_percentage_error
+from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
    XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
    LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
-    ExtraTreeEstimator, KNeighborsEstimator)
+    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)

import logging
logger = logging.getLogger(__name__)
@@ -42,6 +42,12 @@ def get_estimator_class(task, estimator_name):
        estimator_class = ExtraTreeEstimator
    elif 'kneighbor' == estimator_name:
        estimator_class = KNeighborsEstimator
+    elif 'prophet' in estimator_name:
+        estimator_class = FBProphet
+    elif estimator_name == 'arima':
+        estimator_class = ARIMA
+    elif estimator_name == 'sarimax':
+        estimator_class = SARIMAX
    else:
        raise ValueError(
            estimator_name + ' is not a built-in learner. '
@@ -57,7 +63,7 @@ def sklearn_metric_loss_score(
    Args:
        metric_name: A string of the metric name, one of
            'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
-            'roc_auc_ovo', 'log_loss', 'f1', 'ap', 'micro_f1', 'macro_f1'
+            'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
        y_predict: A 1d or 2d numpy array of the predictions which can be
            used to calculate the metric. E.g., 2d for log_loss and 1d
            for others.
@@ -95,6 +101,9 @@ def sklearn_metric_loss_score(
    elif 'log_loss' in metric_name:
        score = log_loss(
            y_true, y_predict, labels=labels, sample_weight=sample_weight)
+    elif 'mape' in metric_name:
+        score = mean_absolute_percentage_error(
+            y_true, y_predict)
    elif 'micro_f1' in metric_name:
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average='micro')
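A small sanity check of the mape branch added above, using scikit-learn directly; since mape is already a loss, sklearn_metric_loss_score should return the same value because the branch passes it through unchanged.

from sklearn.metrics import mean_absolute_percentage_error

y_true = [100.0, 110.0, 120.0]
y_pred = [90.0, 115.0, 132.0]
print(mean_absolute_percentage_error(y_true, y_pred))  # ~0.082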
@@ -111,18 +120,20 @@ def sklearn_metric_loss_score(
            metric_name + ' is not a built-in metric, '
            'currently built-in metrics are: '
            'r2, rmse, mae, mse, accuracy, roc_auc, roc_auc_ovr, roc_auc_ovo,'
-            'log_loss, f1, micro_f1, macro_f1, ap. '
+            'log_loss, mape, f1, micro_f1, macro_f1, ap. '
            'please pass a customized metric function to AutoML.fit(metric=func)')
    return score


-def get_y_pred(estimator, X, eval_metric, obj):
+def get_y_pred(estimator, X, eval_metric, obj, freq=None):
    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
        y_pred_classes = estimator.predict_proba(X)
        y_pred = y_pred_classes[
            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
        y_pred = estimator.predict_proba(X)
+    elif eval_metric == 'mape':
+        y_pred = estimator.predict(X, freq=freq)
    else:
        y_pred = estimator.predict(X)
    return y_pred
@@ -201,15 +212,21 @@ def evaluate_model_CV(
    valid_fold_num = total_fold_num = 0
    n = kf.get_n_splits()
    X_train_split, y_train_split = X_train_all, y_train_all
-    if task == 'regression':
-        labels = None
-    else:
+    if task == 'binary:logistic' or task == 'multi:softmax':
        labels = np.unique(y_train_all)
+    else:
+        labels = None

    if isinstance(kf, RepeatedStratifiedKFold):
        kf = kf.split(X_train_split, y_train_split)
    elif isinstance(kf, GroupKFold):
        kf = kf.split(X_train_split, y_train_split, kf.groups)
+    elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
+        y_train_all = pd.DataFrame(y_train_all, columns=['y'])
+        train = X_train_all.join(y_train_all)
+        kf = kf.split(train)
+    elif isinstance(kf, TimeSeriesSplit):
+        kf = kf.split(X_train_split, y_train_split)
    else:
        kf = kf.split(X_train_split)
    rng = np.random.RandomState(2020)
@@ -221,7 +238,8 @@ def evaluate_model_CV(
    else:
        weight = weight_val = None
    for train_index, val_index in kf:
-        train_index = rng.permutation(train_index)
+        if not isinstance(kf, TimeSeriesSplit):
+            train_index = rng.permutation(train_index)
        if isinstance(X_train_all, pd.DataFrame):
            X_train, X_val = X_train_split.iloc[
                train_index], X_train_split.iloc[val_index]
221  flaml/model.py
@@ -15,6 +15,7 @@ import pandas as pd
from . import tune

import logging

logger = logging.getLogger(__name__)
@@ -635,7 +636,6 @@ class LRL2Classifier(SKLearnEstimator):


class CatBoostEstimator(BaseEstimator):

    _time_per_iter = None
    _train_size = 0
@@ -834,3 +834,222 @@ class KNeighborsEstimator(BaseEstimator):
            X = X.drop(cat_columns, axis=1)
            X = X.to_numpy()
        return X


class FBProphet(BaseEstimator):

    @classmethod
    def search_space(cls, **params):
        space = {
            'changepoint_prior_scale': {
                'domain': tune.loguniform(lower=0.001, upper=1000),
                'init_value': 0.01,
                'low_cost_init_value': 0.001,
            },
            'seasonality_prior_scale': {
                'domain': tune.loguniform(lower=0.01, upper=100),
                'init_value': 1,
            },
            'holidays_prior_scale': {
                'domain': tune.loguniform(lower=0.01, upper=100),
                'init_value': 1,
            },
            'seasonality_mode': {
                'domain': tune.choice(['additive', 'multiplicative']),
                'init_value': 'multiplicative',
            }
        }
        return space

    def fit(self, X_train, y_train, budget=None, **kwargs):
        y_train = pd.DataFrame(y_train, columns=['y'])
        train_df = X_train.join(y_train)

        if ('ds' not in train_df) or ('y' not in train_df):
            raise ValueError(
                'Dataframe for training forecast model must have columns "ds" and "y" '
                'with the dates and values respectively.'
            )

        if 'n_jobs' in self.params:
            self.params.pop('n_jobs')

        from prophet import Prophet

        current_time = time.time()
        model = Prophet(**self.params).fit(train_df)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X_test, freq=None):
        if self._model is not None:
            if isinstance(X_test, int) and freq is not None:
                future = self._model.make_future_dataframe(periods=X_test, freq=freq)
                forecast = self._model.predict(future)
            elif isinstance(X_test, pd.DataFrame):
                forecast = self._model.predict(X_test)
            else:
                raise ValueError(
                    "either X_test (a pd.DataFrame with the prediction dates in column 'ds') "
                    "or X_test (an int number of periods) plus freq is required.")
            return forecast['yhat']
        else:
            return np.ones(X_test.shape[0])
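For reference, a minimal prophet sketch (assuming the prophet package is installed) of the ds/y frame and make_future_dataframe call the FBProphet wrapper above builds on:

import pandas as pd
from prophet import Prophet

df = pd.DataFrame({'ds': pd.date_range('2020-01-01', periods=36, freq='MS'),
                   'y': range(36)})
m = Prophet(seasonality_mode='additive').fit(df)
future = m.make_future_dataframe(periods=12, freq='MS')
print(m.predict(future)[['ds', 'yhat']].tail())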
class ARIMA(BaseEstimator):

    @classmethod
    def search_space(cls, **params):
        space = {
            'p': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 2,
                'low_cost_init_value': 0,
            },
            'd': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 2,
                'low_cost_init_value': 0,
            },
            'q': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 2,
                'low_cost_init_value': 0,
            }
        }
        return space

    def fit(self, X_train, y_train, budget=None, **kwargs):
        y_train = pd.DataFrame(y_train, columns=['y'])
        train_df = X_train.join(y_train)

        if ('ds' not in train_df) or ('y' not in train_df):
            raise ValueError(
                'Dataframe for training forecast model must have columns "ds" and "y" '
                'with the dates and values respectively.'
            )

        train_df.index = pd.to_datetime(train_df['ds'])
        train_df = train_df.drop('ds', axis=1)

        if 'n_jobs' in self.params:
            self.params.pop('n_jobs')

        from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
        import warnings
        warnings.filterwarnings("ignore")

        current_time = time.time()
        model = ARIMA_estimator(train_df,
                                order=(self.params['p'], self.params['d'], self.params['q']),
                                enforce_stationarity=False,
                                enforce_invertibility=False)
        model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X_test, freq=None):
        if self._model is not None:
            if isinstance(X_test, int) and freq is not None:
                forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
            elif isinstance(X_test, pd.DataFrame):
                start_date = X_test.iloc[0, 0]
                end_date = X_test.iloc[-1, 0]
                forecast = self._model.predict(start=start_date, end=end_date)
            else:
                raise ValueError(
                    "either X_test (a pd.DataFrame with the prediction dates in column 'ds') "
                    "or X_test (an int number of periods) plus freq is required.")
            return forecast
        else:
            return np.ones(X_test.shape[0])
class SARIMAX(BaseEstimator):

    @classmethod
    def search_space(cls, **params):
        space = {
            'p': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 2,
                'low_cost_init_value': 0,
            },
            'd': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 2,
                'low_cost_init_value': 0,
            },
            'q': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 2,
                'low_cost_init_value': 0,
            },
            'P': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 1,
                'low_cost_init_value': 0,
            },
            'D': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 1,
                'low_cost_init_value': 0,
            },
            'Q': {
                'domain': tune.quniform(lower=0, upper=10, q=1),
                'init_value': 1,
                'low_cost_init_value': 0,
            },
            's': {
                'domain': tune.choice([1, 4, 6, 12]),
                'init_value': 12,
            }
        }
        return space

    def fit(self, X_train, y_train, budget=None, **kwargs):
        y_train = pd.DataFrame(y_train, columns=['y'])
        train_df = X_train.join(y_train)

        if ('ds' not in train_df) or ('y' not in train_df):
            raise ValueError(
                'Dataframe for training forecast model must have columns "ds" and "y" '
                'with the dates and values respectively.'
            )

        train_df.index = pd.to_datetime(train_df['ds'])
        train_df = train_df.drop('ds', axis=1)

        if 'n_jobs' in self.params:
            self.params.pop('n_jobs')

        from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator

        current_time = time.time()
        model = SARIMAX_estimator(
            train_df,
            order=(self.params['p'], self.params['d'], self.params['q']),
            seasonal_order=(self.params['P'], self.params['D'],
                            self.params['Q'], self.params['s']),
            enforce_stationarity=False,
            enforce_invertibility=False)
        model = model.fit()
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def predict(self, X_test, freq=None):
        if self._model is not None:
            if isinstance(X_test, int) and freq is not None:
                forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
            elif isinstance(X_test, pd.DataFrame):
                start_date = X_test.iloc[0, 0]
                end_date = X_test.iloc[-1, 0]
                forecast = self._model.predict(start=start_date, end=end_date)
            else:
                raise ValueError(
                    "either X_test (a pd.DataFrame with the prediction dates in column 'ds') "
                    "or X_test (an int number of periods) plus freq is required.")
            return forecast
        else:
            return np.ones(X_test.shape[0])
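A minimal statsmodels sketch (not FLAML-specific) of the call the SARIMAX wrapper above constructs internally; note that seasonal_order is the statsmodels keyword for the (P, D, Q, s) seasonal component:

import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX as SarimaxModel

y = pd.Series(np.sin(np.arange(48) * 2 * np.pi / 12) + 0.05 * np.arange(48),
              index=pd.date_range('2020-01-01', periods=48, freq='MS'))
model = SarimaxModel(y, order=(2, 1, 2), seasonal_order=(1, 0, 1, 12),
                     enforce_stationarity=False,
                     enforce_invertibility=False).fit(disp=False)
print(model.forecast(steps=12))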
@ -165,7 +165,8 @@ class BlendSearch(Searcher):
|
|||
min_resource, max_resource, reduction_factor, self.cost_attr, seed)
|
||||
self._is_ls_ever_converged = False
|
||||
self._subspace = {} # the subspace for each trial id
|
||||
self._init_search()
|
||||
if space:
|
||||
self._init_search()
|
||||
|
||||
def set_search_properties(self,
|
||||
metric: Optional[str] = None,
|
||||
|
|
|
flaml/version.py
@@ -1 +1 @@
-__version__ = "0.5.12"
+__version__ = "0.5.13"
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
5  setup.py
@@ -56,6 +56,7 @@ setuptools.setup(
            "torch==1.8.1",
            "datasets==1.4.1",
            "azure-storage-blob",
+            "statsmodels>=0.12.2"
        ],
        "blendsearch": [
            "optuna==2.8.0"
@@ -79,6 +80,10 @@ setuptools.setup(
            "datasets==1.4.1",
            "tensorboardX<=2.2",
            "torch"
        ],
+        "forecast": [
+            "prophet>=1.0.1",
+            "statsmodels>=0.12.2"
+        ]
    },
    classifiers=[
test/test_forecast.py
@@ -0,0 +1,119 @@
def test_forecast_automl_df(budget=5):
    # using dataframe
    import statsmodels.api as sm
    data = sm.datasets.co2.load_pandas()
    data = data.data
    data = data['co2'].resample('MS').mean()
    data = data.fillna(data.bfill())
    data = data.to_frame().reset_index()
    data = data.rename(columns={'index': 'ds', 'co2': 'y'})
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    X_train = data[:split_idx]
    X_test = data[split_idx:]['ds'].to_frame()
    y_test = data[split_idx:]['y'].to_frame()
    ''' import AutoML class from flaml package '''
    from flaml import AutoML
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": 'mape',  # primary metric
        "task": 'forecast',  # task type
        "log_file_name": 'CO2_forecast.log',  # flaml log file
        "eval_method": "holdout",
        "split_type": 'time'
    }
    '''The main flaml automl API'''
    try:
        automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
    except ImportError:
        automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
    ''' retrieve best config and best learner '''
    print('Best ML learner:', automl.best_estimator)
    print('Best hyperparameter config:', automl.best_config)
    print(f'Best mape on validation data: {automl.best_loss}')
    print(f'Training duration of best run: {automl.best_config_train_time}s')
    print(automl.model.estimator)
    ''' pickle and save the automl object '''
    import pickle
    with open('automl.pkl', 'wb') as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    ''' compute predictions of testing dataset '''
    y_pred = automl.predict(X_test)
    print('Predicted labels', y_pred)
    print('True labels', y_test)
    ''' compute different metric values on testing dataset '''
    from flaml.ml import sklearn_metric_loss_score
    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
    from flaml.data import get_output_from_log
    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.prune_attr)
    print(automl.max_resource)
    print(automl.min_resource)


def test_forecast_automl_Xy(budget=5):
    # using X_train and y_train
    import statsmodels.api as sm
    data = sm.datasets.co2.load_pandas()
    data = data.data
    data = data['co2'].resample('MS').mean()
    data = data.fillna(data.bfill())
    data = data.to_frame().reset_index()
    num_samples = data.shape[0]
    time_horizon = 12
    split_idx = num_samples - time_horizon
    X_train = data[:split_idx]['index'].to_frame()
    y_train = data[:split_idx]['co2']
    X_test = data[split_idx:]['index'].to_frame()
    y_test = data[split_idx:]['co2'].to_frame()
    ''' import AutoML class from flaml package '''
    from flaml import AutoML
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": 'mape',  # primary metric
        "task": 'forecast',  # task type
        "log_file_name": 'CO2_forecast.log',  # flaml log file
        "eval_method": "holdout",
        "split_type": 'time'
    }
    '''The main flaml automl API'''
    try:
        automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
    except ImportError:
        automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
    ''' retrieve best config and best learner '''
    print('Best ML learner:', automl.best_estimator)
    print('Best hyperparameter config:', automl.best_config)
    print(f'Best mape on validation data: {automl.best_loss}')
    print(f'Training duration of best run: {automl.best_config_train_time}s')
    print(automl.model.estimator)
    ''' pickle and save the automl object '''
    import pickle
    with open('automl.pkl', 'wb') as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    ''' compute predictions of testing dataset '''
    y_pred = automl.predict(X_test)
    print('Predicted labels', y_pred)
    print('True labels', y_test)
    ''' compute different metric values on testing dataset '''
    from flaml.ml import sklearn_metric_loss_score
    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
    from flaml.data import get_output_from_log
    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.prune_attr)
    print(automl.max_resource)
    print(automl.min_resource)


if __name__ == "__main__":
    test_forecast_automl_df(60)
    test_forecast_automl_Xy(60)
test/test_split.py
@@ -6,10 +6,12 @@ from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


-dataset = "credit"
+dataset = "credit-g"


def _test(split_type):
+    from sklearn.externals._arff import ArffException

    automl = AutoML()

    automl_settings = {
@@ -22,9 +24,17 @@ def _test(split_type):
        "split_type": split_type,
    }

-    X, y = fetch_openml(name=dataset, return_X_y=True)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
-                                                         random_state=42)
+    try:
+        X, y = fetch_openml(name=dataset, return_X_y=True)
+    except (ArffException, ValueError):
+        from sklearn.datasets import load_wine
+        X, y = load_wine(return_X_y=True)
+    if split_type != 'time':
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
+                                                             random_state=42)
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
+                                                             shuffle=False)
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

    pred = automl.predict(X_test)
@@ -37,6 +47,10 @@ def _test_uniform():
    _test(split_type="uniform")


+def test_time():
+    _test(split_type="time")
+
+
def test_groups():
    from sklearn.externals._arff import ArffException
    try:
@@ -0,0 +1,52 @@
import time


def evaluation_fn(step, width, height):
    return (0.1 + width * step / 100)**(-1) + height * 0.1


def easy_objective(config):
    from ray import tune
    # Hyperparameters
    width, height = config["width"], config["height"]

    for step in range(config["steps"]):
        # Iterative training function - can be any arbitrary training procedure
        intermediate_score = evaluation_fn(step, width, height)
        # Feed the score back to Tune.
        tune.report(iterations=step, mean_loss=intermediate_score)
        time.sleep(0.1)


def test_blendsearch_tune(smoke_test=True):
    try:
        from ray import tune
        from ray.tune.suggest import ConcurrencyLimiter
        from ray.tune.schedulers import AsyncHyperBandScheduler
        from ray.tune.suggest.flaml import BlendSearch
    except ImportError:
        print('ray[tune] is not installed, skipping test')
        return
    algo = BlendSearch()
    algo = ConcurrencyLimiter(algo, max_concurrent=4)
    scheduler = AsyncHyperBandScheduler()
    analysis = tune.run(
        easy_objective,
        metric="mean_loss",
        mode="min",
        search_alg=algo,
        scheduler=scheduler,
        num_samples=10 if smoke_test else 100,
        config={
            "steps": 100,
            "width": tune.uniform(0, 20),
            "height": tune.uniform(-100, 100),
            # This is an ignored parameter.
            "activation": tune.choice(["relu", "tanh"])
        })

    print("Best hyperparameters found were: ", analysis.best_config)


if __name__ == "__main__":
    test_blendsearch_tune(False)