remove catboost training dir; ensemble api; blendsearch for hierarchical space; ranking task; forecast improvement (#178)

* remove catboost training dir

* close #48

* bs for hierarchical space. close #85

* retrain for hierarchical space

* clean ml (#180)

Co-authored-by: Qingyun Wu <qxw5138@psu.edu>

* support ranking task

* examples

* cv shuffle

* forecast api and implementation cleaner

* period constraints

* delete groups after fit
Chi Wang 2021-09-01 16:25:04 -07:00 committed by GitHub
parent 1bc8786dcb
commit 6ab0730793
21 changed files with 1399 additions and 1126 deletions


@ -65,7 +65,7 @@ tune.run(train_with_config, config={…}, low_cost_partial_config={…}, time_bu
## Advantages
* For classification and regression tasks, find quality models with lower computational resources.
* For common machine learning tasks like classification and regression, find quality models with small computational resources.
* Users can choose their desired customizability: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), full customization (arbitrary training and evaluation code).
* Allow human guidance in hyperparameter tuning to respect prior knowledge of certain subspaces while still being able to explore other subspaces. Read more about the
hyperparameter optimization methods
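As a hedged sketch of the "medium customization" level mentioned above, the snippet below passes a custom metric function using the extended signature this commit introduces (the extra `config` and `groups_test`/`groups_train` arguments); the dataset and the logged values are illustrative assumptions, not part of the diff.
```python
from flaml import AutoML
from sklearn.datasets import load_iris
from sklearn.metrics import log_loss

def custom_metric(X_test, y_test, estimator, labels,
                  X_train, y_train, weight_test=None, weight_train=None,
                  config=None, groups_test=None, groups_train=None):
    # first return value is minimized; the dict is logged alongside it
    val_loss = log_loss(y_test, estimator.predict_proba(X_test), labels=labels)
    train_loss = log_loss(y_train, estimator.predict_proba(X_train), labels=labels)
    return val_loss, {"val_loss": val_loss, "train_loss": train_loss}

X, y = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(X, y, task='classification', metric=custom_metric, time_budget=10)
```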
@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
## Examples
A basic classification example.
- A basic classification example.
```python
from flaml import AutoML
@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
print(automl.model)
```
A basic regression example.
- A basic regression example.
```python
from flaml import AutoML
@ -123,6 +123,39 @@ print(automl.predict(X_train))
print(automl.model)
```
- Time series forecasting.
```python
# pip install flaml[forecast]
import numpy as np
from flaml import AutoML
X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
y_train = np.random.random(size=72)
automl = AutoML()
automl.fit(X_train=X_train[:72],  # a single column of timestamp
           y_train=y_train,  # value for each timestamp
           period=12,  # time horizon to forecast, e.g., 12 months
           task='forecast', time_budget=15,  # time budget in seconds
           log_file_name="test/forecast.log",
           )
print(automl.predict(X_train[72:]))
```
- Learning to rank.
```python
from sklearn.datasets import fetch_openml
from flaml import AutoML
X_train, y_train = fetch_openml(name="credit-g", return_X_y=True)
# not a real learning to rank dataset
groups = [200] * 4 + [100] * 2  # group counts
automl = AutoML()
automl.fit(
    X_train, y_train, groups=groups,
    task='rank', time_budget=10,  # in seconds
)
```
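As noted in the `groups` docstring further down this diff, `groups` may be given either as per-sample group labels or as group counts; a minimal sketch of the equivalence (array values are illustrative), mirroring how `_validate_data` expands counts into labels:
```python
import numpy as np

counts = [200] * 4 + [100] * 2  # group counts, as in the example above
# equivalent per-sample group labels: one label per training example
labels = np.concatenate([[i] * c for i, c in enumerate(counts)])
assert len(labels) == sum(counts)
```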
More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/).
## Documentation


@ -10,7 +10,7 @@ from functools import partial
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold, GroupKFold, TimeSeriesSplit
RepeatedKFold, GroupKFold, TimeSeriesSplit, GroupShuffleSplit
from sklearn.utils import shuffle
import pandas as pd
import logging
@ -94,13 +94,13 @@ class SearchState:
else:
self.sample_size = self.data_size
obj = result['val_loss']
train_loss = result['train_loss']
metric_for_logging = result['metric_for_logging']
time2eval = result['time_total_s']
trained_estimator = result['trained_estimator']
del result['trained_estimator'] # free up RAM
else:
obj, time2eval, trained_estimator = np.inf, 0.0, None
train_loss = config = None
metric_for_logging = config = None
self.trial_time = time2eval
self.total_time_used += time_used
self.total_iter += 1
@ -126,7 +126,8 @@ class SearchState:
self.trained_estimator.cleanup()
if trained_estimator:
self.trained_estimator = trained_estimator
self.train_loss, self.val_loss, self.config = train_loss, obj, config
self.metric_for_logging, self.val_loss, self.config = \
metric_for_logging, obj, config
def get_hist_config_sig(self, sample_size, config):
config_values = tuple([config[k] for k in self._hp_names])
@ -144,7 +145,7 @@ class AutoMLState:
def _prepare_sample_train_data(self, sample_size):
full_size = len(self.y_train)
sampled_weight = None
sampled_weight = groups = None
if sample_size <= full_size:
if isinstance(self.X_train, pd.DataFrame):
sampled_X_train = self.X_train.iloc[:sample_size]
@ -154,12 +155,16 @@ class AutoMLState:
weight = self.fit_kwargs.get('sample_weight')
if weight is not None:
sampled_weight = weight[:sample_size]
if self.groups is not None:
groups = self.groups[:sample_size]
else:
sampled_X_train = self.X_train_all
sampled_y_train = self.y_train_all
if 'sample_weight' in self.fit_kwargs:
sampled_weight = self.sample_weight_all
return sampled_X_train, sampled_y_train, sampled_weight
if self.groups is not None:
groups = self.groups_all
return sampled_X_train, sampled_y_train, sampled_weight, groups
def _compute_with_config_base(self,
estimator,
@ -168,13 +173,15 @@ class AutoMLState:
sample_size = int(config_w_resource['FLAML_sample_size'])
else:
sample_size = self.data_size
sampled_X_train, sampled_y_train, sampled_weight = \
sampled_X_train, sampled_y_train, sampled_weight, groups = \
self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = self.fit_kwargs['sample_weight']
self.fit_kwargs['sample_weight'] = sampled_weight
else:
weight = None
if groups is not None:
self.fit_kwargs['groups'] = groups
config = config_w_resource.copy()
if 'FLAML_sample_size' in config:
del config['FLAML_sample_size']
@ -182,13 +189,14 @@ class AutoMLState:
budget = time_left if sample_size == self.data_size else \
time_left / 2 * sample_size / self.data_size
trained_estimator, val_loss, train_loss, _, pred_time = \
trained_estimator, val_loss, metric_for_logging, _, pred_time = \
compute_estimator(
sampled_X_train,
sampled_y_train,
self.X_val,
self.y_val,
self.weight_val,
self.groups_val,
min(budget, self.train_time_limit),
self.kf,
config,
@ -204,7 +212,7 @@ class AutoMLState:
result = {
'pred_time': pred_time,
'wall_clock_time': time.time() - self._start_time_flag,
'train_loss': train_loss,
'metric_for_logging': metric_for_logging,
'val_loss': val_loss,
'trained_estimator': trained_estimator
}
@ -216,19 +224,23 @@ class AutoMLState:
def _train_with_config(
self, estimator, config_w_resource, sample_size=None
):
config = config_w_resource.copy()
if not sample_size:
sample_size = config_w_resource['FLAML_sample_size']
config = config_w_resource.get('ml', config_w_resource).copy()
if 'FLAML_sample_size' in config:
if not sample_size:
sample_size = config['FLAML_sample_size']
del config['FLAML_sample_size']
if "learner" in config:
del config['learner']
assert sample_size is not None
sampled_X_train, sampled_y_train, sampled_weight = \
sampled_X_train, sampled_y_train, sampled_weight, groups = \
self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = self.fit_kwargs['sample_weight']
self.fit_kwargs['sample_weight'] = sampled_weight
else:
weight = None
if groups is not None:
self.fit_kwargs['groups'] = groups
budget = None if self.time_budget is None else (
self.time_budget - self.time_from_start)
estimator, train_time = train_estimator(
@ -368,18 +380,18 @@ class AutoML:
return self._trained_estimator.classes_.tolist()
return None
def predict(self, X_test, freq=None):
def predict(self, X_test):
'''Predict label from features.
Args:
X_test: A numpy array of featurized instances, shape n * m,
or a pandas dataframe with one column with timestamp values
for 'forecasting' task.
freq: str or pandas offset, default=None | The frequency of the
time-series.
or for 'forecasting' task:
a pandas dataframe with one column of timestamp values
or an integer n for the predict steps (only valid when
the estimator is arima or sarimax).
Returns:
A numpy array of shape n * 1 - - each element is a predicted class
An array-like of shape n * 1 -- each element is a predicted
label for an instance.
'''
if self._trained_estimator is None:
@ -387,13 +399,7 @@ class AutoML:
"No estimator is trained. Please run fit with enough budget.")
return None
X_test = self._preprocess(X_test)
if self._state.task == 'forecast':
X_test_df = pd.DataFrame(X_test)
X_test_col = list(X_test.columns)[0]
X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
else:
y_pred = self._trained_estimator.predict(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
y_pred = y_pred.flatten()
if self._label_transformer:
@ -418,14 +424,20 @@ class AutoML:
return proba
def _preprocess(self, X):
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X)
if isinstance(X, int):
return X
if self._state.task == 'forecast':
X = pd.DataFrame(X)
X = X.rename(columns={X.columns[0]: 'ds'})
else:
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X)
return X
def _validate_data(self, X_train_all, y_train_all, dataframe, label,
X_val=None, y_val=None):
X_val=None, y_val=None, groups_val=None, groups=None):
if self._state.task == 'forecast':
if dataframe is not None and label is not None:
dataframe = dataframe.copy()
@ -433,13 +445,11 @@ class AutoML:
elif dataframe is not None:
if ('ds' not in dataframe) or ('y' not in dataframe):
raise ValueError(
'For forecasting task, Dataframe must have columns "ds" and "y" '
'with the dates and values respectively.'
)
'For forecasting task, dataframe must have columns "ds" and "y" '
'with the dates and values respectively.')
elif (X_train_all is not None) and (y_train_all is not None):
dataframe = pd.DataFrame(X_train_all)
time_col = list(dataframe.columns)[0]
dataframe = dataframe.rename(columns={time_col: 'ds'})
dataframe = dataframe.rename(columns={dataframe.columns[0]: 'ds'})
dataframe['y'] = pd.Series(y_train_all)
X_train_all = None
y_train_all = None
@ -515,12 +525,23 @@ class AutoML:
self._state.y_val = y_val
else:
self._state.X_val = self._state.y_val = None
if groups is not None and len(groups) != self._nrow:
# groups is given as group counts
self._state.groups = np.concatenate(
[[i] * c for i, c in enumerate(groups)])
assert len(self._state.groups) == self._nrow, \
"the sum of group counts must match the number of examples"
self._state.groups_val = np.concatenate(
[[i] * c for i, c in enumerate(groups_val)]
) if groups_val is not None else None
else:
self._state.groups_val = groups_val
self._state.groups = groups
def _prepare_data(self,
eval_method,
split_ratio,
n_splits,
period=None):
n_splits):
X_val, y_val = self._state.X_val, self._state.y_val
if issparse(X_val):
X_val = X_val.tocsr()
@ -564,25 +585,25 @@ class AutoML:
random_state=RANDOM_SEED)
self._state.fit_kwargs[
'sample_weight'] = self._state.sample_weight_all
elif hasattr(self._state, 'groups') and self._state.groups is not None:
X_train_all, y_train_all, self._state.groups = shuffle(
X_train_all, y_train_all, self._state.groups,
random_state=RANDOM_SEED)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED)
if self._df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
if self._df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
X_train, y_train = X_train_all, y_train_all
if X_val is None:
self._state.groups_all = self._state.groups
if X_val is None and eval_method == 'holdout':
# if eval_method = holdout, make holdout data
if eval_method == 'holdout' and self._split_type == 'time':
if 'period' in self._state.fit_kwargs:
if self._split_type == 'time':
if self._state.task == 'forecast':
num_samples = X_train_all.shape[0]
split_idx = num_samples - self._state.fit_kwargs.get('period')
period = self._state.fit_kwargs['period']
assert period < num_samples, (
f"period={period}>#examples={num_samples}")
split_idx = num_samples - period
X_train = X_train_all[:split_idx]
y_train = y_train_all[:split_idx]
X_val = X_train_all[split_idx:]
@ -603,7 +624,21 @@ class AutoML:
y_train_all,
test_size=split_ratio,
shuffle=False)
elif self._state.task != 'regression' and eval_method == 'holdout':
elif self._state.task == 'rank':
gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio,
random_state=RANDOM_SEED)
for train_idx, val_idx in gss.split(X_train_all, y_train_all,
self._state.groups):
if self._df:
X_train, X_val = X_train_all.iloc[
train_idx], X_train_all.iloc[val_idx]
else:
X_train, X_val = X_train_all[
train_idx], X_train_all[val_idx]
y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
self._state.groups, self._state.groups_val = self._state.groups[
train_idx], self._state.groups[val_idx]
elif self._state.task != 'regression':
# for classification, make sure the labels are complete in both
# training and validation data
label_set, first = np.unique(y_train_all, return_index=True)
@ -617,8 +652,7 @@ class AutoML:
X_first = X_train_all.iloc[first] if self._df else X_train_all[
first]
X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
y_rest = y_train_all.iloc[rest] if isinstance(
y_train_all, pd.Series) else y_train_all[rest]
y_rest = y_train_all[rest]
stratify = y_rest if self._split_type == 'stratified' else \
None
if 'sample_weight' in self._state.fit_kwargs:
@ -647,7 +681,7 @@ class AutoML:
X_val = concat(X_first, X_val)
y_val = concat(label_set, y_val) if self._df else \
np.concatenate([label_set, y_val])
elif eval_method == 'holdout' and self._state.task == 'regression':
elif self._state.task == 'regression':
if 'sample_weight' in self._state.fit_kwargs:
X_train, X_val, y_train, y_val, self._state.fit_kwargs[
'sample_weight'], self._state.weight_val = \
@ -669,16 +703,16 @@ class AutoML:
self._state.y_val = (X_train, y_train, X_val, y_val)
self._state.X_train_all = X_train_all
self._state.y_train_all = y_train_all
if hasattr(self._state, 'groups') and self._state.groups is not None:
logger.info("Using GroupKFold")
assert len(self._state.groups) == y_train_all.size, \
if self._split_type == 'group':
# logger.info("Using GroupKFold")
assert len(self._state.groups_all) == y_train_all.size, \
"the length of groups must match the number of examples"
assert len(np.unique(self._state.groups)) >= n_splits, \
assert len(np.unique(self._state.groups_all)) >= n_splits, \
"the number of groups must be equal or larger than n_splits"
self._state.kf = GroupKFold(n_splits)
self._state.kf.groups = self._state.groups
self._state.kf.groups = self._state.groups_all
elif self._split_type == "stratified":
logger.info("Using StratifiedKFold")
# logger.info("Using StratifiedKFold")
assert y_train_all.size >= n_splits, (
f"{n_splits}-fold cross validation"
f" requires input data with at least {n_splits} examples.")
@ -688,14 +722,22 @@ class AutoML:
self._state.kf = RepeatedStratifiedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
elif self._split_type == "time":
logger.info("Using TimeSeriesSplit")
# logger.info("Using TimeSeriesSplit")
if self._state.task == 'forecast':
period = self._state.fit_kwargs['period']
if period * (n_splits + 1) > y_train_all.size:
n_splits = int(y_train_all.size / period - 1)
assert n_splits >= 2, (
f"cross validation for forecasting period={period}"
f" requires input data with at least {3 * period} examples.")
logger.info(
f"Using nsplits={n_splits} due to data size limit.")
self._state.kf = TimeSeriesSplit(
n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
n_splits=n_splits, test_size=period)
else:
self._state.kf = TimeSeriesSplit(n_splits=n_splits)
else:
logger.info("Using RepeatedKFold")
# logger.info("Using RepeatedKFold")
self._state.kf = RepeatedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
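A quick worked example of the forecast fold adjustment above, with illustrative numbers: 72 training samples and a forecast period of 20 cannot support the requested 5 folds, so n_splits is reduced.
```python
# illustrative numbers only
n_samples, period, n_splits = 72, 20, 5
if period * (n_splits + 1) > n_samples:      # 20 * 6 = 120 > 72
    n_splits = int(n_samples / period - 1)   # -> 2, still >= 2 so CV proceeds
```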
@ -745,7 +787,8 @@ class AutoML:
eval_method='auto',
split_ratio=SPLIT_RATIO,
n_splits=N_SPLITS,
split_type="stratified",
split_type=None,
groups=None,
n_jobs=1,
train_best=True,
train_full=False,
@ -754,31 +797,51 @@ class AutoML:
'''Retrain from log file
Args:
time_budget: A float number of the time budget in seconds
log_file_name: A string of the log file name
X_train: A numpy array of training data in shape n*m
y_train: A numpy array of labels in shape n*1
dataframe: A dataframe of training data including label column.
For 'forecast' task, dataframe must be specified and should
have two columns: timestamp and value.
label: A str of the label column name for 'classification' or
'regression' task, e.g., 'label';
or a tuple of strings for timestamp and value columns for
'forecasting' task, e.g., ('timestamp', 'value').
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
time_budget: A float number of the time budget in seconds.
task: A string of the task type, e.g.,
'classification', 'regression'
'classification', 'regression', 'forecast', 'rank'.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout']
split_ratio: A float of the validation data percentage for holdout
n_splits: An integer of the number of folds for cross-validation
n_jobs: An integer of the number of threads for training
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
n_splits: An integer of the number of folds for cross-validation.
split_type: str or None, default=None | the data split type.
For classification tasks, valid choices are [
None, 'stratified', 'uniform', 'time']. None -> stratified.
For regression tasks, valid choices are [None, 'uniform', 'time'].
None -> uniform.
For time series forecasting, must be None or 'time'.
For ranking task, must be None or 'group'.
groups: None or array-like | Group labels (with matching length to
y_train) or group counts (with sum equal to length of y_train)
for training data.
n_jobs: An integer of the number of threads for training.
train_best: A boolean of whether to train the best config in the
time budget; if false, train the last config in the budget
time budget; if false, train the last config in the budget.
train_full: A boolean of whether to train on the full data. If true,
eval_method and sample_size in the log file will be ignored
eval_method and sample_size in the log file will be ignored.
record_id: the ID of the training log record from which the model will
be retrained. By default `record_id = -1` which means this will be
ignored. `record_id = 0` corresponds to the first trial, and
when `record_id >= 0`, `time_budget` will be ignored.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight
the searched learners, such as sample_weight.
'''
self._state.task = task
self._state.fit_kwargs = fit_kwargs
self._validate_data(X_train, y_train, dataframe, label)
self._validate_data(X_train, y_train, dataframe, label, groups=groups)
logger.info('log file name {}'.format(log_file_name))
@ -829,24 +892,17 @@ class AutoML:
# Partially copied from fit() function
# Initialize some attributes required for retrain_from_log
self._state.task = task
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
self._split_type = "time"
self._decide_split_type(split_type)
if record_id >= 0:
eval_method = 'cv'
elif eval_method == 'auto':
eval_method = self._decide_eval_method(time_budget)
self.modelcount = 0
self._prepare_data(eval_method, split_ratio, n_splits)
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs['period'])
self._state.time_budget = None
self._state.n_jobs = n_jobs
self._trained_estimator = self._state._train_with_config(
@ -854,6 +910,26 @@ class AutoML:
logger.info('retrain from log succeeded')
return training_duration
def _decide_split_type(self, split_type):
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in [None, "stratified", "uniform", "time"]
self._split_type = split_type or "stratified"
elif self._state.task == 'regression':
assert split_type in [None, "uniform", "time"]
self._split_type = split_type or "uniform"
elif self._state.task == 'forecast':
assert split_type in [None, "time"]
self._split_type = "time"
assert isinstance(self._state.fit_kwargs.get('period'), int), (
"missing a required integer 'period' for forecast.")
elif self._state.task == 'rank':
assert self._state.groups is not None, \
'groups must be specified for ranking task.'
assert split_type in [None, "group"]
self._split_type = 'group'
def _decide_eval_method(self, time_budget):
if self._state.X_val is not None:
return 'holdout'
@ -1020,7 +1096,7 @@ class AutoML:
else:
return {'pred_time': 0,
'wall_clock_time': None,
'train_loss': np.inf,
'metric_for_logging': np.inf,
'val_loss': np.inf,
'trained_estimator': None
}
@ -1065,10 +1141,11 @@ class AutoML:
X_val=None,
y_val=None,
sample_weight_val=None,
groups_val=None,
groups=None,
verbose=1,
retrain_full=True,
split_type="stratified",
split_type=None,
learner_selector='sample',
hpo_method=None,
starting_points={},
@ -1104,14 +1181,15 @@ class AutoML:
def custom_metric(
X_test, y_test, estimator, labels,
X_train, y_train, weight_test=None, weight_train=None
X_train, y_train, weight_test=None, weight_train=None,
config=None, groups_test=None, groups_train=None,
):
return metric_to_minimize, metrics_to_log
which returns a float number as the minimization objective,
and a tuple of floats or a dictionary as the metrics to log.
task: A string of the task type, e.g.,
'classification', 'regression', 'forecast'.
'classification', 'regression', 'forecast', 'rank'.
n_jobs: An integer of the number of threads for training.
log_file_name: A string of the log file name.
estimator_list: A list of strings for estimator names, or 'auto'
@ -1125,6 +1203,10 @@ class AutoML:
max_iter: An integer of the maximal number of iterations.
sample: A boolean of whether to sample the training data during
search.
ensemble: boolean or dict | default=False. Whether to perform
ensemble after search. Can be a dict with keys 'passthrough'
and 'final_estimator' to specify the passthrough and
final_estimator in the stacker.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
@ -1144,9 +1226,13 @@ class AutoML:
X_val: None or a numpy array or a pandas dataframe of validation data.
y_val: None or a numpy array or a pandas series of validation labels.
sample_weight_val: None or a numpy array of the sample weight of
validation data.
groups: None or an array-like of shape (n,) | Group labels for the
samples used while splitting the dataset into train/valid set.
validation data of the same shape as y_val.
groups_val: None or array-like | group labels (with matching length
to y_val) or group counts (with sum equal to length of y_val)
for validation data. Need to be consistent with groups.
groups: None or array-like | Group labels (with matching length to
y_train) or group counts (with sum equal to length of y_train)
for training data.
verbose: int, default=1 | Controls the verbosity, higher means more
messages.
retrain_full: bool or str, default=True | whether to retrain the
@ -1154,6 +1240,13 @@ class AutoML:
True - retrain only after search finishes; False - no retraining;
'budget' - do best effort to retrain without violating the time
budget.
split_type: str or None, default=None | the data split type.
For classification tasks, valid choices are [
None, 'stratified', 'uniform', 'time']. None -> stratified.
For regression tasks, valid choices are [None, 'uniform', 'time'].
None -> uniform.
For time series forecasting, must be None or 'time'.
For ranking task, must be None or 'group'.
hpo_method: str or None, default=None | The hyperparameter
optimization method. When it is None, CFO is used.
No need to set when using flaml's default search space or using
@ -1182,9 +1275,9 @@ class AutoML:
self._state.log_training_metric = log_training_metric
self._state.fit_kwargs = fit_kwargs
self._state.weight_val = sample_weight_val
self._state.groups = groups
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val,
groups_val, groups)
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
if seed is not None:
@ -1194,24 +1287,7 @@ class AutoML:
self.verbose = verbose
if verbose == 0:
logger.setLevel(logging.WARNING)
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
if split_type is not None and split_type != 'time':
raise ValueError(
"split_type must be 'time' when task is 'forecast'.")
self._split_type = "time"
if self._state.fit_kwargs.get('period') is None:
raise TypeError(
"missing 1 required argument for 'forecast' task: 'period'.")
self._decide_split_type(split_type)
if eval_method == 'auto' or self._state.X_val is not None:
eval_method = self._decide_eval_method(time_budget)
self._state.eval_method = eval_method
@ -1227,12 +1303,8 @@ class AutoML:
self._retrain_final = retrain_full is True and (
eval_method == 'holdout' and self._state.X_val is None) or (
eval_method == 'cv')
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs['period'])
self._sample = sample and eval_method != 'cv' and (
self._prepare_data(eval_method, split_ratio, n_splits)
self._sample = sample and task != 'rank' and eval_method != 'cv' and (
MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
if 'auto' == metric:
if 'binary' in self._state.task:
@ -1241,11 +1313,13 @@ class AutoML:
metric = 'log_loss'
elif self._state.task == 'forecast':
metric = 'mape'
elif self._state.task == 'rank':
metric = 'ndcg'
else:
metric = 'r2'
self._state.metric = metric
if metric in ['r2', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'ap', 'micro_f1', 'macro_f1']:
'f1', 'ap', 'micro_f1', 'macro_f1', 'ndcg']:
error_metric = f"1-{metric}"
elif isinstance(metric, str):
error_metric = metric
@ -1256,6 +1330,8 @@ class AutoML:
if 'auto' == estimator_list:
if self._state.task == 'forecast':
estimator_list = ['fbprophet', 'arima', 'sarimax']
elif self._state.task == 'rank':
estimator_list = ['lgbm', 'xgboost']
else:
estimator_list = [
'lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
@ -1278,7 +1354,9 @@ class AutoML:
logger.info("List of ML learners in AutoML Run: {}".format(
estimator_list))
self.estimator_list = estimator_list
self._hpo_method = hpo_method or 'cfo'
self._hpo_method = hpo_method or (
'cfo' if n_concurrent_trials == 1 or len(estimator_list) == 1
else 'bs')
self._state.time_budget = time_budget
self._active_estimators = estimator_list.copy()
self._ensemble = ensemble
@ -1315,7 +1393,8 @@ class AutoML:
del self._X_train_all, self._y_train_all, self._state.kf
del self._state.X_train, self._state.X_train_all, self._state.X_val
del self._state.y_train, self._state.y_train_all, self._state.y_val
del self._sample_weight_full, self._state.fit_kwargs, self._state.groups
del self._sample_weight_full, self._state.fit_kwargs
del self._state.groups, self._state.groups_all, self._state.groups_val
for state in self._search_states.values():
if state.trained_estimator:
del state.trained_estimator
@ -1363,8 +1442,7 @@ class AutoML:
del p[k]
search_alg = SearchAlgo(max_concurrent=self._n_concurrent_trials,
points_to_evaluate=points_to_evaluate
)
points_to_evaluate=points_to_evaluate)
else:
search_alg = SearchAlgo(
metric='val_loss',
@ -1387,7 +1465,8 @@ class AutoML:
analysis = ray.tune.run(
self.trainable, search_alg=search_alg, config=self.search_space,
metric='val_loss', mode='min', resources_per_trial=resources_per_trial,
time_budget_s=self._state.time_budget, num_samples=self._max_iter)
time_budget_s=self._state.time_budget, num_samples=self._max_iter,
verbose=self.verbose)
# logger.info([trial.last_result for trial in analysis.trials])
trials = sorted((trial for trial in analysis.trials if trial.last_result
and trial.last_result['wall_clock_time'] is not None),
@ -1421,7 +1500,7 @@ class AutoML:
if (better or self._log_type == 'all') and self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
@ -1591,7 +1670,7 @@ class AutoML:
if self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
@ -1604,8 +1683,8 @@ class AutoML:
with mlflow.start_run(nested=True):
mlflow.log_metric('iter_counter',
self._iter_per_learner[estimator])
mlflow.log_param('train_loss',
search_state.train_loss)
mlflow.log_param('metric_for_logging',
search_state.metric_for_logging)
mlflow.log_metric('trial_time',
search_state.trial_time)
mlflow.log_metric('wall_clock_time',
@ -1702,7 +1781,9 @@ class AutoML:
for search_state in self._search_states.values())
if self._trained_estimator:
logger.info(f'selected model: {self._trained_estimator.model}')
if self._ensemble:
if self._ensemble and self._state.task in (
'binary:logistic', 'multi:softmax', 'regression',
):
search_states = list(x for x in self._search_states.items()
if x[1].trained_estimator)
search_states.sort(key=lambda x: x[1].best_loss)
@ -1714,15 +1795,20 @@ class AutoML:
logger.info(estimators)
if len(estimators) <= 1:
return
if self._state.task != "regression":
if self._state.task in ('binary:logistic', 'multi:softmax'):
from sklearn.ensemble import StackingClassifier as Stacker
for e in estimators:
e[1]._estimator_type = 'classifier'
else:
from sklearn.ensemble import StackingRegressor as Stacker
best_m = self._trained_estimator
stacker = Stacker(estimators, best_m, n_jobs=self._state.n_jobs,
passthrough=True)
if isinstance(self._ensemble, dict):
final_estimator = self._ensemble.get(
'final_estimator', self._trained_estimator)
passthrough = self._ensemble.get('passthrough', True)
else:
final_estimator = self._trained_estimator
passthrough = True
stacker = Stacker(
estimators, final_estimator, n_jobs=self._state.n_jobs,
passthrough=passthrough)
if self._sample_weight_full is not None:
self._state.fit_kwargs[
'sample_weight'] = self._sample_weight_full
@ -1734,9 +1820,11 @@ class AutoML:
elif self._retrain_final:
# reset time budget for retraining
self._state.time_from_start -= self._state.time_budget
if (self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)) \
and self._selected.best_config_sample_size == self._state.data_size:
if self._state.task == 'forecast' or (
self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)
and self._selected.best_config_sample_size == self._state.data_size
):
self._trained_estimator, \
retrain_time = self._state._train_with_config(
self._best_estimator,
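For reference, a hedged usage sketch of the dict form of `ensemble` documented earlier in this file (the dataset and final estimator choice here are illustrative assumptions):
```python
from flaml import AutoML
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
automl = AutoML()
automl.fit(
    X, y, task='classification', time_budget=60,
    # dict form: pick the stacker's final_estimator and whether the original
    # features are passed through to it (defaults: best searched model, True)
    ensemble={'final_estimator': LogisticRegression(), 'passthrough': False},
)
```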


@ -146,7 +146,7 @@ def get_output_from_log(filename, time_budget):
config = record.config
learner = record.learner.split('_')[0]
sample_size = record.sample_size
train_loss = record.logged_metric
metric = record.logged_metric
if time_used < time_budget and np.isfinite(val_loss):
if val_loss < best_val_loss:
@ -156,7 +156,7 @@ def get_output_from_log(filename, time_budget):
best_config_list.append(best_config)
search_time_list.append(time_used)
best_error_list.append(best_val_loss)
logged_metric_list.append(train_loss)
logged_metric_list.append(metric)
error_list.append(val_loss)
config_list.append({"Current Learner": learner,
"Current Sample": sample_size,
@ -242,8 +242,12 @@ class DataTransformer:
X[cat_columns] = X[cat_columns].astype('category')
if num_columns:
X_num = X[num_columns]
if drop and np.issubdtype(X_num.columns.dtype, np.integer):
if np.issubdtype(X_num.columns.dtype, np.integer) and (
drop or min(X_num.columns) != 0
or max(X_num.columns) != X_num.shape[1] - 1
):
X_num.columns = range(X_num.shape[1])
drop = True
else:
drop = False
from sklearn.impute import SimpleImputer
@ -257,12 +261,12 @@ class DataTransformer:
cat_columns, num_columns, datetime_columns
self._drop = drop
if task == 'regression':
self.label_transformer = None
else:
if task in ('binary:logistic', 'multi:softmax'):
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()
y = self.label_transformer.fit_transform(y)
else:
self.label_transformer = None
return X, y
def transform(self, X):
@ -302,3 +306,8 @@ class DataTransformer:
X_num.columns = range(X_num.shape[1])
X[num_columns] = self.transformer.transform(X_num)
return X
def group_counts(groups):
_, i, c = np.unique(groups, return_counts=True, return_index=True)
return c[np.argsort(i)]
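A small illustrative check of the new `group_counts` helper; it returns counts in order of first appearance of each group label (example values are made up):
```python
import numpy as np
from flaml.data import group_counts

print(group_counts(np.array(['b', 'b', 'a', 'a', 'a', 'c'])))  # -> [2 3 1]
```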


@ -4,17 +4,17 @@
'''
import time
from joblib.externals.cloudpickle.cloudpickle import instance
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score, mean_absolute_percentage_error
f1_score, mean_absolute_percentage_error, ndcg_score
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
from .data import group_counts
import logging
logger = logging.getLogger(__name__)
@ -56,26 +56,29 @@ def get_estimator_class(task, estimator_name):
def sklearn_metric_loss_score(
metric_name, y_predict, y_true, labels=None, sample_weight=None
metric_name, y_predict, y_true, labels=None, sample_weight=None,
groups=None,
):
'''Loss using the specified metric
Args:
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg',
'micro_f1', 'macro_f1'.
y_predict: A 1d or 2d numpy array of the predictions which can be
used to calculate the metric. E.g., 2d for log_loss and 1d
for others.
y_true: A 1d numpy array of the true labels
labels: A 1d numpy array of the unique labels
sample_weight: A 1d numpy array of the sample weight
y_true: A 1d numpy array of the true labels.
labels: A 1d numpy array of the unique labels.
sample_weight: A 1d numpy array of the sample weight.
groups: A 1d numpy array of the group labels.
Returns:
score: A float number of the loss, the lower the better
score: A float number of the loss, the lower the better.
'''
metric_name = metric_name.lower()
if 'r2' in metric_name:
if 'r2' == metric_name:
score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'rmse':
score = np.sqrt(mean_squared_error(
@ -98,26 +101,40 @@ def sklearn_metric_loss_score(
elif metric_name == 'roc_auc_ovo':
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
elif 'log_loss' in metric_name:
elif 'log_loss' == metric_name:
score = log_loss(
y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif 'mape' in metric_name:
elif 'mape' == metric_name:
try:
score = mean_absolute_percentage_error(
y_true, y_predict)
except ValueError:
return np.inf
elif 'micro_f1' in metric_name:
elif 'micro_f1' == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='micro')
elif 'macro_f1' in metric_name:
elif 'macro_f1' == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='macro')
elif 'f1' in metric_name:
elif 'f1' == metric_name:
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
elif 'ap' in metric_name:
elif 'ap' == metric_name:
score = 1 - average_precision_score(
y_true, y_predict, sample_weight=sample_weight)
elif 'ndcg' in metric_name:
if '@' in metric_name:
k = int(metric_name.split('@', 1)[-1])
counts = group_counts(groups)
score = 0
psum = 0
for c in counts:
score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
np.asarray([y_predict[psum:psum + c]]), k=k)
psum += c
score /= len(counts)
score += 1
else:
score = 1 - ndcg_score([y_true], [y_predict])
else:
raise ValueError(
metric_name + ' is not a built-in metric, '
@ -128,92 +145,60 @@ def sklearn_metric_loss_score(
return score
def get_y_pred(estimator, X, eval_metric, obj, freq=None):
def get_y_pred(estimator, X, eval_metric, obj):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[
:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
y_pred = estimator.predict_proba(X)
elif eval_metric == 'mape':
y_pred = estimator.predict(X, freq=freq)
else:
y_pred = estimator.predict(X)
return y_pred
def get_test_loss(
estimator, X_train, y_train, X_test, y_test, weight_test,
eval_metric, obj, labels=None, budget=None, log_training_metric=False, fit_kwargs={}
):
def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
groups_test, eval_metric, obj, labels=None, budget=None,
log_training_metric=False, fit_kwargs={}):
start = time.time()
# if groups_test is not None:
# fit_kwargs['groups_val'] = groups_test
# fit_kwargs['X_val'] = X_test
# fit_kwargs['y_val'] = y_test
estimator.fit(X_train, y_train, budget, **fit_kwargs)
if isinstance(eval_metric, str):
pred_start = time.time()
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
pred_time = (time.time() - pred_start) / X_test.shape[0]
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
labels, weight_test)
labels, weight_test, groups_test)
if log_training_metric:
test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
train_loss = sklearn_metric_loss_score(
eval_metric, test_pred_y,
y_train, labels, fit_kwargs.get('sample_weight'))
metric_for_logging = sklearn_metric_loss_score(
eval_metric, test_pred_y, y_train, labels,
fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
else:
train_loss = None
metric_for_logging = None
else: # customized metric function
test_loss, metrics = eval_metric(
X_test, y_test, estimator, labels, X_train, y_train,
weight_test, fit_kwargs.get('sample_weight'))
X_test, y_test, estimator, labels, X_train, y_train, weight_test,
fit_kwargs.get('sample_weight'), config, groups_test,
fit_kwargs.get('groups'))
if isinstance(metrics, dict):
pred_time = metrics.get('pred_time', 0)
train_loss = metrics
metric_for_logging = metrics
train_time = time.time() - start
return test_loss, train_time, train_loss, pred_time
return test_loss, metric_for_logging, train_time, pred_time
def train_model(estimator, X_train, y_train, budget, fit_kwargs={}):
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
return train_time
def evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val,
budget, kf, task, eval_method, eval_metric, best_val_loss, log_training_metric=False,
fit_kwargs={}
):
if 'holdout' in eval_method:
val_loss, train_loss, train_time, pred_time = evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val, weight_val, budget,
task, eval_metric, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
else:
val_loss, train_loss, train_time, pred_time = evaluate_model_CV(
estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time, pred_time
def evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val,
weight_val, budget, task, eval_metric, log_training_metric=False,
fit_kwargs={}
):
val_loss, train_time, train_loss, pred_time = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric,
task, budget=budget, log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time, pred_time
def evaluate_model_CV(
estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss, log_training_metric=False, fit_kwargs={}
):
def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss,
log_training_metric=False, fit_kwargs={}):
start_time = time.time()
total_val_loss = 0
total_train_loss = None
train_loss = None
total_metric = None
metric = None
train_time = pred_time = 0
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
@ -222,15 +207,19 @@ def evaluate_model_CV(
labels = np.unique(y_train_all)
else:
labels = None
groups = None
shuffle = True
if isinstance(kf, RepeatedStratifiedKFold):
kf = kf.split(X_train_split, y_train_split)
elif isinstance(kf, GroupKFold):
kf = kf.split(X_train_split, y_train_split, kf.groups)
groups = kf.groups
kf = kf.split(X_train_split, y_train_split, groups)
shuffle = False
elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
y_train_all = pd.DataFrame(y_train_all, columns=['y'])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
shuffle = False
elif isinstance(kf, TimeSeriesSplit):
kf = kf.split(X_train_split, y_train_split)
else:
@ -244,7 +233,7 @@ def evaluate_model_CV(
else:
weight = weight_val = None
for train_index, val_index in kf:
if not isinstance(kf, TimeSeriesSplit):
if shuffle:
train_index = rng.permutation(train_index)
if isinstance(X_train_all, pd.DataFrame):
X_train, X_val = X_train_split.iloc[
@ -252,19 +241,19 @@ def evaluate_model_CV(
else:
X_train, X_val = X_train_split[
train_index], X_train_split[val_index]
if isinstance(y_train_all, pd.Series):
y_train, y_val = y_train_split.iloc[
train_index], y_train_split.iloc[val_index]
else:
y_train, y_val = y_train_split[
train_index], y_train_split[val_index]
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
estimator.cleanup()
if weight is not None:
fit_kwargs['sample_weight'], weight_val = weight[
train_index], weight[val_index]
val_loss_i, train_time_i, train_loss_i, pred_time_i = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val,
eval_metric, task, labels, budget_per_train,
if groups is not None:
fit_kwargs['groups'] = groups[train_index]
groups_val = groups[val_index]
else:
groups_val = None
val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
config, estimator, X_train, y_train, X_val, y_val, weight_val,
groups_val, eval_metric, task, labels, budget_per_train,
log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
if weight is not None:
fit_kwargs['sample_weight'] = weight
@ -272,16 +261,16 @@ def evaluate_model_CV(
total_fold_num += 1
total_val_loss += val_loss_i
if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_train_loss, list):
total_train_loss = [
total_train_loss[i] + v for i, v in enumerate(train_loss_i)]
elif isinstance(total_train_loss, dict):
total_train_loss = {
k: total_train_loss[k] + v for k, v in train_loss_i.items()}
elif total_train_loss is not None:
total_train_loss += train_loss_i
if isinstance(total_metric, list):
total_metric = [
total_metric[i] + v for i, v in enumerate(metric_i)]
elif isinstance(total_metric, dict):
total_metric = {
k: total_metric[k] + v for k, v in metric_i.items()}
elif total_metric is not None:
total_metric += metric_i
else:
total_train_loss = train_loss_i
total_metric = metric_i
train_time += train_time_i
pred_time += pred_time_i
if valid_fold_num == n:
@ -293,22 +282,22 @@ def evaluate_model_CV(
val_loss = np.max(val_loss_list)
n = total_fold_num
if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_train_loss, list):
train_loss = [v / n for v in total_train_loss]
elif isinstance(total_train_loss, dict):
train_loss = {k: v / n for k, v in total_train_loss.items()}
if isinstance(total_metric, list):
metric = [v / n for v in total_metric]
elif isinstance(total_metric, dict):
metric = {k: v / n for k, v in total_metric.items()}
else:
train_loss = total_train_loss / n
metric = total_metric / n
pred_time /= n
# budget -= time.time() - start_time
# if val_loss < best_val_loss and budget > budget_per_train:
# estimator.cleanup()
# estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
return val_loss, train_loss, train_time, pred_time
return val_loss, metric, train_time, pred_time
def compute_estimator(
X_train, y_train, X_val, y_val, weight_val, budget, kf,
X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
config_dic, task, estimator_name, eval_method, eval_metric,
best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
fit_kwargs={}
@ -317,11 +306,17 @@ def compute_estimator(
task, estimator_name)
estimator = estimator_class(
**config_dic, task=task, n_jobs=n_jobs)
val_loss, train_loss, train_time, pred_time = evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task,
eval_method, eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
return estimator, val_loss, train_loss, train_time, pred_time
if 'holdout' in eval_method:
val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
groups_val, eval_metric, task, budget=budget,
log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
else:
val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
config_dic, estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
return estimator, val_loss, metric_for_logging, train_time, pred_time
def train_estimator(
@ -333,8 +328,7 @@ def train_estimator(
task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if X_train is not None:
train_time = train_model(
estimator, X_train, y_train, budget, fit_kwargs)
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
else:
estimator = estimator.estimator_class(**estimator.params)
train_time = time.time() - start_time
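To illustrate the new grouped `ndcg@k` branch of `sklearn_metric_loss_score`, a minimal sketch with made-up relevance labels and scores for two query groups of three items each:
```python
import numpy as np
from flaml.ml import sklearn_metric_loss_score

y_true = np.array([3, 2, 0, 1, 0, 2])               # relevance labels
y_pred = np.array([0.9, 0.1, 0.2, 0.3, 0.2, 0.8])   # model scores
groups = np.array([0, 0, 0, 1, 1, 1])               # two query groups
# returns 1 - mean NDCG@2 over the groups (lower is better)
loss = sklearn_metric_loss_score('ndcg@2', y_pred, y_true, groups=groups)
print(loss)
```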


@ -3,16 +3,18 @@
* Licensed under the MIT License.
'''
import warnings
import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
from lightgbm import LGBMClassifier, LGBMRegressor, LGBMRanker
from scipy.sparse import issparse
import pandas as pd
from . import tune
from .data import group_counts
import logging
@ -45,8 +47,8 @@ class BaseEstimator:
self._estimator_type = params['_estimator_type']
del self.params['_estimator_type']
else:
self._estimator_type = "regressor" if task == 'regression' \
else "classifier"
self._estimator_type = "classifier" if task in (
'binary:logistic', 'multi:softmax') else "regressor"
def get_params(self, deep=False):
params = self.params.copy()
@ -81,6 +83,18 @@ class BaseEstimator:
def _fit(self, X_train, y_train, **kwargs):
current_time = time.time()
if 'groups' in kwargs:
kwargs = kwargs.copy()
if self._task == 'rank':
kwargs['group'] = group_counts(kwargs['groups'])
# groups_val = kwargs.get('groups_val')
# if groups_val is not None:
# kwargs['eval_group'] = [group_counts(groups_val)]
# kwargs['eval_set'] = [
# (kwargs['X_val'], kwargs['y_val'])]
# kwargs['verbose'] = False
# del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val']
del kwargs['groups']
X_train = self._preprocess(X_train)
model = self.estimator_class(**self.params)
model.fit(X_train, y_train, **kwargs)
@ -255,12 +269,14 @@ class LGBMEstimator(BaseEstimator):
if "objective" not in self.params:
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
if 'regression' in task:
if 'regression' == task:
objective = 'regression'
elif 'binary' in task:
objective = 'binary'
elif 'multi' in task:
objective = 'multiclass'
elif 'rank' == task:
objective = 'lambdarank'
else:
objective = 'regression'
self.params["objective"] = objective
@ -276,8 +292,10 @@ class LGBMEstimator(BaseEstimator):
self.params['verbose'] = -1
# if "subsample_freq" not in self.params:
# self.params['subsample_freq'] = 1
if 'regression' in task:
if 'regression' == task:
self.estimator_class = LGBMRegressor
elif 'rank' == task:
self.estimator_class = LGBMRanker
else:
self.estimator_class = LGBMClassifier
self._time_per_iter = None
@ -488,8 +506,10 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
'use_label_encoder': params.get('use_label_encoder', False),
})
if 'regression' in task:
if 'regression' == task:
self.estimator_class = xgb.XGBRegressor
elif 'rank' == task:
self.estimator_class = xgb.XGBRanker
else:
self.estimator_class = xgb.XGBClassifier
self._time_per_iter = None
@ -716,7 +736,9 @@ class CatBoostEstimator(BaseEstimator):
return params
def fit(self, X_train, y_train, budget=None, **kwargs):
import shutil
start_time = time.time()
train_dir = f'catboost_{str(start_time)}'
n_iter = self.params["n_estimators"]
X_train = self._preprocess(X_train)
if isinstance(X_train, pd.DataFrame):
@ -730,16 +752,19 @@ class CatBoostEstimator(BaseEstimator):
CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel = self.estimator_class(
train_dir=train_dir, **self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._t1 = time.time() - start_time
if CatBoostEstimator._t1 >= budget:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
shutil.rmtree(train_dir, ignore_errors=True)
return CatBoostEstimator._t1
self.params["n_estimators"] = 4
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel = self.estimator_class(
train_dir=train_dir, **self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._time_per_iter = (
@ -752,6 +777,7 @@ class CatBoostEstimator(BaseEstimator):
"n_estimators"]:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
shutil.rmtree(train_dir, ignore_errors=True)
return time.time() - start_time
if budget:
train_times = 1
@ -769,13 +795,14 @@ class CatBoostEstimator(BaseEstimator):
else:
weight = None
from catboost import Pool
model = self.estimator_class(**self.params)
model = self.estimator_class(train_dir=train_dir, **self.params)
model.fit(
X_tr, y_tr, cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:],
cat_features=cat_features),
**kwargs) # model.get_best_iteration()
shutil.rmtree(train_dir, ignore_errors=True)
if weight is not None:
kwargs['sample_weight'] = weight
self._model = model
@ -862,44 +889,43 @@ class FBProphet(BaseEstimator):
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
def __init__(self, task='forecast', **params):
if 'n_jobs' in params:
params.pop('n_jobs')
super().__init__(task, **params)
def _join(self, X_train, y_train):
assert 'ds' in X_train, (
'Dataframe for training forecast model must have column'
' "ds" with the dates in X_train.')
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
return train_df
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
def fit(self, X_train, y_train, budget=None, **kwargs):
from prophet import Prophet
current_time = time.time()
train_df = self._join(X_train, y_train)
model = Prophet(**self.params).fit(train_df)
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
def predict(self, X_test):
if isinstance(X_test, int):
raise ValueError(
"predict() with steps is only supported for arima/sarimax."
" For FBProphet, pass a dataframe with a date colum named ds.")
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
future = self._model.make_future_dataframe(periods=X_test, freq=freq)
forecast = self._model.predict(future)
elif isinstance(X_test, pd.DataFrame):
forecast = self._model.predict(X_test)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds) or"
"X_test(int number of periods)+freq are required.")
forecast = self._model.predict(X_test)
return forecast['yhat']
else:
warnings.warn(
"Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X_test.shape[0])
class ARIMA(BaseEstimator):
class ARIMA(FBProphet):
@classmethod
def search_space(cls, **params):
space = {
@ -921,55 +947,45 @@ class ARIMA(BaseEstimator):
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
def _join(self, X_train, y_train):
train_df = super()._join(X_train, y_train)
train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)
return train_df
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
def fit(self, X_train, y_train, budget=None, **kwargs):
from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
import warnings
warnings.filterwarnings("ignore")
current_time = time.time()
model = ARIMA_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
enforce_stationarity=False,
enforce_invertibility=False)
train_df = self._join(X_train, y_train)
model = ARIMA_estimator(
train_df, order=(
self.params['p'], self.params['d'], self.params['q']),
enforce_stationarity=False, enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
def predict(self, X_test):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
if isinstance(X_test, int):
forecast = self._model.forecast(steps=X_test)
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
start = X_test.iloc[0, 0]
end = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start, end=end)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds) or"
"X_test(int number of periods)+freq are required.")
"X_test needs to be either a pd.Dataframe with dates as column ds)"
" or an int number of periods for predict().")
return forecast
else:
return np.ones(X_test.shape[0])
return np.ones(X_test if isinstance(X_test, int)
else X_test.shape[0])
class SARIMAX(BaseEstimator):
class SARIMAX(ARIMA):
@classmethod
def search_space(cls, **params):
space = {
@ -1011,47 +1027,17 @@ class SARIMAX(BaseEstimator):
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator
current_time = time.time()
model = SARIMAX_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
seasonal_order=(self.params['P'], self.params['D'], self.params['Q'], self.params['s']),
enforce_stationarity=False,
enforce_invertibility=False)
train_df = self._join(X_train, y_train)
model = SARIMAX_estimator(
train_df, order=(
self.params['p'], self.params['d'], self.params['q']),
seasonal_order=(
self.params['P'], self.params['D'], self.params['Q'],
self.params['s']),
enforce_stationarity=False, enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds)"
"or X_test(int number of periods)+freq are required.")
return forecast
else:
return np.ones(X_test.shape[0])

View File

@ -8,19 +8,20 @@ import numpy as np
import time
import pickle
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest import Searcher
from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch
from ray.tune.utils.util import flatten_dict
from ray.tune.utils.util import unflatten_dict
except (ImportError, AssertionError):
from .suggestion import Searcher
from .suggestion import OptunaSearch as GlobalSearch
from .variant_generator import flatten_dict
from ..tune.trial import unflatten_dict
from .search_thread import SearchThread
from .flow2 import FLOW2
from ..tune.space import add_cost_to_space, normalize # TODO: , define_by_run_func
from ..tune.space import add_cost_to_space, indexof, normalize, define_by_run_func
import logging
logger = logging.getLogger(__name__)
@ -133,9 +134,8 @@ class BlendSearch(Searcher):
if global_search_alg is not None:
self._gs = global_search_alg
elif getattr(self, '__name__', None) != 'CFO':
gs_space = space
# TODO: when define_by_run is supported
# gs_space = define_by_run_func(space)
from functools import partial
gs_space = partial(define_by_run_func, space=space)
try:
gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
if experimental:
@ -198,7 +198,10 @@ class BlendSearch(Searcher):
# reset search when metric or mode changed
self._ls.set_search_properties(metric, mode)
if self._gs is not None:
self._gs.set_search_properties(metric, mode)
self._gs = GlobalSearch(
space=self._gs._space, metric=metric, mode=mode,
sampler=self._gs._sampler)
self._gs.space = self._ls.space
self._init_search()
if config:
if 'time_budget_s' in config:
@ -312,9 +315,11 @@ class BlendSearch(Searcher):
self._expand_admissible_region(
self._ls_bound_min, self._ls_bound_max,
self._subspace.get(trial_id, self._ls.space))
if self._gs is not None and self._experimental:
# TODO: key match for hierarchical space
self._gs.add_evaluated_point(flatten_dict(config), objective)
# if self._gs is not None and self._experimental:
# # TODO: recover when supported
# converted = convert_key(config, self._gs.space)
# logger.info(converted)
# self._gs.add_evaluated_point(converted, objective)
elif metric_constraint_satisfied and self._create_condition(
result):
# thread creator
@ -339,7 +344,6 @@ class BlendSearch(Searcher):
del self._subspace[trial_id]
def _create_thread(self, config, result, space):
# logger.info(f"create local search thread from {config}")
self._search_thread_pool[self._thread_count] = SearchThread(
self._ls.mode,
self._ls.create(
@ -349,26 +353,29 @@ class BlendSearch(Searcher):
)
self._thread_count += 1
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max, space)
unflatten_dict(config), self._ls_bound_min, self._ls_bound_max, space,
self._ls.space)
def _update_admissible_region(
self, config, admissible_min, admissible_max, space: Dict = {}
self, config, admissible_min, admissible_max, subspace: Dict = {},
space: Dict = {}
):
# update admissible region
normalized_config = normalize(config, space, config, {})
normalized_config = normalize(config, subspace, config, {})
for key in admissible_min:
value = normalized_config[key]
if isinstance(admissible_max[key], list):
choice = space[key]['_choice_']
domain = space[key]
choice = indexof(domain, value)
self._update_admissible_region(
value,
admissible_min[key][choice], admissible_max[key][choice],
space[key]
subspace[key], domain[choice]
)
elif isinstance(value, dict):
self._update_admissible_region(
value,
admissible_min[key], admissible_max[key], space[key])
value, admissible_min[key], admissible_max[key],
subspace[key], space[key])
else:
if value > admissible_max[key]:
admissible_max[key] = value
@ -514,7 +521,8 @@ class BlendSearch(Searcher):
return None
use_rs = 1
if choice or self._valid(
config, space, self._gs_admissible_min, self._gs_admissible_max):
config, self._ls.space, space, self._gs_admissible_min,
self._gs_admissible_max):
# LS or valid or no backup choice
self._trial_proposed_by[trial_id] = choice
self._search_thread_pool[choice].running += use_rs
@ -542,10 +550,11 @@ class BlendSearch(Searcher):
# temporarily relax admissible region for parallel proposals
self._update_admissible_region(
config, self._gs_admissible_min, self._gs_admissible_max,
space)
space, self._ls.space)
else:
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max, space)
config, self._ls_bound_min, self._ls_bound_max, space,
self._ls.space)
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
signature = self._ls.config_signature(config, space)
@ -632,11 +641,6 @@ class BlendSearch(Searcher):
top_thread_id = backup_thread_id = 0
priority1 = priority2 = self._search_thread_pool[0].priority
for thread_id, thread in self._search_thread_pool.items():
# if thread_id:
# print(
# f"priority of thread {thread_id}={thread.priority}")
# logger.debug(
# f"thread {thread_id}.can_suggest={thread.can_suggest}")
if thread_id and thread.can_suggest:
priority = thread.priority
if priority > priority1:
@ -647,21 +651,29 @@ class BlendSearch(Searcher):
backup_thread_id = thread_id
return top_thread_id, backup_thread_id
def _valid(self, config: Dict, space: Dict, lower: Dict, upper: Dict) -> bool:
def _valid(self, config: Dict, space: Dict, subspace: Dict,
lower: Dict, upper: Dict) -> bool:
''' config validator
'''
normalized_config = normalize(config, space, config, {})
normalized_config = normalize(config, subspace, config, {})
for key, lb in lower.items():
if key in config:
value = normalized_config[key]
if isinstance(lb, list):
subspace = space[key]['_choice_']
domain = space[key]
index = indexof(domain, value)
nestedspace = subspace[key]
lb = lb[index]
ub = upper[key][index]
elif isinstance(lb, dict):
subspace = space[key]
nestedspace = subspace[key]
domain = space[key]
ub = upper[key]
else:
subspace = None
if subspace:
valid = self._valid(value, subspace, lb, upper[key])
nestedspace = None
if nestedspace:
valid = self._valid(
value, domain, nestedspace, lb, ub)
if not valid:
return False
elif (value + self._ls.STEPSIZE < lower[key]
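
The blendsearch changes above (define-by-run global search space, recursive admissible-region updates, and the hierarchical config validator) are what enable tuning over hierarchical search spaces, where a categorical choice carries its own nested sub-space. A minimal sketch of such a space with flaml.tune; the parameter names and the trainable are hypothetical placeholders, not code from this diff:

```python
from flaml import tune

def train(config):
    # config["model"] is the dict sampled from the nested choice below
    model = config["model"]
    # ... train something with model-specific hyperparameters ...
    tune.report(val_loss=0.1)  # placeholder objective

space = {
    # hierarchical space: each category carries its own sub-space
    "model": tune.choice([
        {"name": "lgbm", "n_estimators": tune.randint(4, 1000)},
        {"name": "rf", "max_leaves": tune.randint(4, 100)},
    ]),
    "learning_rate": tune.loguniform(1e-4, 1.0),
}

analysis = tune.run(
    train, config=space, metric="val_loss", mode="min",
    time_budget_s=10, num_samples=-1)  # BlendSearch is used when no search_alg is passed
```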

View File

@ -543,8 +543,9 @@ class FLOW2(Searcher):
return False
for key in self._unordered_cat_hp:
# unordered cat choice is hard to reach by chance
if config1[key] != config2[key]:
if config1[key] != config2.get(key):
return False
delta = np.array(
[incumbent1[key] - incumbent2[key] for key in self._tunable_keys])
[incumbent1[key] - incumbent2.get(key, np.inf)
for key in self._tunable_keys])
return np.linalg.norm(delta) <= self.step

View File

@ -12,7 +12,7 @@ try:
except (ImportError, AssertionError):
from .suggestion import Searcher
from .flow2 import FLOW2
from ..tune.space import (add_cost_to_space, unflatten_hierarchical)
from ..tune.space import unflatten_hierarchical
import logging
logger = logging.getLogger(__name__)
@ -46,10 +46,6 @@ class SearchThread:
self.cost_attr = cost_attr
if search_alg:
self.space = self._space = search_alg.space # unflattened space
# TODO: remove when define_by_run is supported
if not isinstance(self._search_alg, FLOW2):
# remember const config
self._const = add_cost_to_space(self.space, {}, {})
@classmethod
def set_eps(cls, time_budget_s):
@ -63,8 +59,6 @@ class SearchThread:
else:
try:
config = self._search_alg.suggest(trial_id)
# TODO: remove when define_by_run is supported
config.update(self._const)
config, self.space = unflatten_hierarchical(config, self._space)
except FloatingPointError:
logger.warning(

View File

@ -17,9 +17,12 @@ This source file is adapted here because ray does not fully support Windows.
Copyright (c) Microsoft Corporation.
'''
import time
import functools
import warnings
import copy
import logging
from typing import Any, Dict, Optional, Union, List, Tuple
from typing import Any, Dict, Optional, Union, List, Tuple, Callable
import pickle
from .variant_generator import parse_spec_vars
from ..tune.sample import Categorical, Domain, Float, Integer, LogUniform, \
@ -332,13 +335,16 @@ class ConcurrencyLimiter(Searcher):
try:
import optuna as ot
from optuna.trial import TrialState as OptunaTrialState
from optuna.distributions import BaseDistribution as OptunaDistribution
from optuna.samplers import BaseSampler
from optuna.trial import TrialState as OptunaTrialState
from optuna.trial import Trial as OptunaTrial
except ImportError:
ot = None
OptunaTrialState = None
OptunaDistribution = None
BaseSampler = None
OptunaTrialState = None
OptunaTrial = None
# (Optional) Default (anonymous) metric when using tune.report(x)
DEFAULT_METRIC = "_metric"
@ -346,6 +352,78 @@ DEFAULT_METRIC = "_metric"
# (Auto-filled) The index of this training iteration.
TRAINING_ITERATION = "training_iteration"
# print a warning if define by run function takes longer than this to execute
DEFINE_BY_RUN_WARN_THRESHOLD_S = 1 # 1 is arbitrary
def validate_warmstart(parameter_names: List[str],
points_to_evaluate: List[Union[List, Dict]],
evaluated_rewards: List,
validate_point_name_lengths: bool = True):
"""Generic validation of a Searcher's warm start functionality.
Raises exceptions in case of type and length mismatches between
parameters.
If ``validate_point_name_lengths`` is False, the equality of lengths
between ``points_to_evaluate`` and ``parameter_names`` will not be
validated.
"""
if points_to_evaluate:
if not isinstance(points_to_evaluate, list):
raise TypeError(
"points_to_evaluate expected to be a list, got {}.".format(
type(points_to_evaluate)))
for point in points_to_evaluate:
if not isinstance(point, (dict, list)):
raise TypeError(
f"points_to_evaluate expected to include list or dict, "
f"got {point}.")
if validate_point_name_lengths and (
not len(point) == len(parameter_names)):
raise ValueError("Dim of point {}".format(point)
+ " and parameter_names {}".format(
parameter_names) + " do not match.")
if points_to_evaluate and evaluated_rewards:
if not isinstance(evaluated_rewards, list):
raise TypeError(
"evaluated_rewards expected to be a list, got {}.".format(
type(evaluated_rewards)))
if not len(evaluated_rewards) == len(points_to_evaluate):
raise ValueError(
"Dim of evaluated_rewards {}".format(evaluated_rewards)
+ " and points_to_evaluate {}".format(points_to_evaluate)
+ " do not match.")
class _OptunaTrialSuggestCaptor:
"""Utility to capture returned values from Optuna's suggest_ methods.
This will wrap around the ``optuna.Trial`` object and decorate all
``suggest_`` callables with a function capturing the returned value,
which will be saved in the ``captured_values`` dict.
"""
def __init__(self, ot_trial: OptunaTrial) -> None:
self.ot_trial = ot_trial
self.captured_values: Dict[str, Any] = {}
def _get_wrapper(self, func: Callable) -> Callable:
@functools.wraps(func)
def wrapper(*args, **kwargs):
# name is always the first arg for suggest_ methods
name = kwargs.get("name", args[0])
ret = func(*args, **kwargs)
self.captured_values[name] = ret
return ret
return wrapper
def __getattr__(self, item_name: str) -> Any:
item = getattr(self.ot_trial, item_name)
if item_name.startswith("suggest_") and callable(item):
return self._get_wrapper(item)
return item
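
A short sketch (assuming optuna >= 2.6 for the ask interface) of what the captor does: it proxies a trial and records every value returned by a suggest_ call under its parameter name, which is how sampled values from a define-by-run function are merged into the suggested config below:

```python
import optuna

study = optuna.create_study()
ot_trial = study.ask()
captor = _OptunaTrialSuggestCaptor(ot_trial)
lr = captor.suggest_float("lr", 1e-4, 1e-1, log=True)
depth = captor.suggest_int("depth", 2, 10)
print(captor.captured_values)  # e.g. {'lr': 0.0031, 'depth': 7}
```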
class OptunaSearch(Searcher):
"""A wrapper around Optuna to provide trial suggestions.
@ -355,16 +433,20 @@ class OptunaSearch(Searcher):
This Searcher is a thin wrapper around Optuna's search algorithms.
You can pass any Optuna sampler, which will be used to generate
hyperparameter suggestions.
Please note that this wrapper does not support define-by-run, so the
search space will be configured before running the optimization. You will
also need to use a Tune trainable (e.g. using the function API) with
this wrapper.
For defining the search space, use ``ray.tune.suggest.optuna.param``
(see example).
Args:
space (list): Hyperparameter search space definition for Optuna's
sampler. This is a list, and samples for the parameters will
be obtained in order.
space (dict|Callable): Hyperparameter search space definition for
Optuna's sampler. This can be either a :class:`dict` with
parameter names as keys and ``optuna.distributions`` as values,
or a Callable - in which case, it should be a define-by-run
function using ``optuna.trial`` to obtain the hyperparameter
values. The function should return either a :class:`dict` of
constant values with names as keys, or None.
For more information, see https://optuna.readthedocs.io\
/en/stable/tutorial/10_key_features/002_configurations.html.
.. warning::
No actual computation should take place in the define-by-run
function. Instead, put the training logic inside the function
or class trainable passed to ``tune.run``.
metric (str): The training result objective value attribute. If None
but a mode was passed, the anonymous metric `_metric` will be used
per default.
@ -411,15 +493,28 @@ class OptunaSearch(Searcher):
metric="loss",
mode="min")
tune.run(trainable, search_alg=optuna_search)
# Equivalent Optuna define-by-run function approach:
def define_search_space(trial: optuna.Trial):
trial.suggest_float("a", 6, 8)
trial.suggest_float("b", 1e-4, 1e-2, log=True)
# training logic goes into trainable, this is just
# for search space definition
optuna_search = OptunaSearch(
define_search_space,
metric="loss",
mode="min")
tune.run(trainable, search_alg=optuna_search)
.. versionadded:: 0.8.8
"""
def __init__(self,
space: Optional[Union[Dict, List[Tuple]]] = None,
space: Optional[Union[Dict[str, "OptunaDistribution"], List[
Tuple], Callable[["OptunaTrial"], Optional[Dict[
str, Any]]]]] = None,
metric: Optional[str] = None,
mode: Optional[str] = None,
points_to_evaluate: Optional[List[Dict]] = None,
sampler: Optional[BaseSampler] = None,
sampler: Optional["BaseSampler"] = None,
seed: Optional[int] = None,
evaluated_rewards: Optional[List] = None):
assert ot is not None, (
@ -490,6 +585,11 @@ class OptunaSearch(Searcher):
load_if_exists=True)
if self._points_to_evaluate:
validate_warmstart(
self._space,
self._points_to_evaluate,
self._evaluated_rewards,
validate_point_name_lengths=not callable(self._space))
if self._evaluated_rewards:
for point, reward in zip(self._points_to_evaluate,
self._evaluated_rewards):
@ -512,6 +612,37 @@ class OptunaSearch(Searcher):
self._setup_study(mode)
return True
def _suggest_from_define_by_run_func(
self, func: Callable[["OptunaTrial"], Optional[Dict[str, Any]]],
ot_trial: "OptunaTrial") -> Dict:
captor = _OptunaTrialSuggestCaptor(ot_trial)
time_start = time.time()
ret = func(captor)
time_taken = time.time() - time_start
if time_taken > DEFINE_BY_RUN_WARN_THRESHOLD_S:
warnings.warn(
"Define-by-run function passed in the `space` argument "
f"took {time_taken} seconds to "
"run. Ensure that actual computation, training takes "
"place inside Tune's train functions or Trainables "
"passed to `tune.run`.")
if ret is not None:
if not isinstance(ret, dict):
raise TypeError(
"The return value of the define-by-run function "
"passed in the `space` argument should be "
"either None or a `dict` with `str` keys. "
f"Got {type(ret)}.")
if not all(isinstance(k, str) for k in ret.keys()):
raise TypeError(
"At least one of the keys in the dict returned by the "
"define-by-run function passed in the `space` argument "
"was not a `str`.")
return {
**captor.captured_values,
**ret
} if ret else captor.captured_values
def suggest(self, trial_id: str) -> Optional[Dict]:
if not self._space:
raise RuntimeError(
@ -538,6 +669,14 @@ class OptunaSearch(Searcher):
ot_trial, fn)(*args, **kwargs)
for (fn, args, kwargs) in self._space
}
elif callable(self._space):
if trial_id not in self._ot_trials:
self._ot_trials[trial_id] = self._ot_study.ask()
ot_trial = self._ot_trials[trial_id]
params = self._suggest_from_define_by_run_func(
self._space, ot_trial)
else:
# Use Optuna ask interface (since version 2.6.0)
if trial_id not in self._ot_trials:

View File

@ -26,6 +26,9 @@ def define_by_run_func(
for key, domain in space.items():
if path:
key = path + '/' + key
if isinstance(domain, dict):
config.update(define_by_run_func(trial, domain, key))
continue
if not isinstance(domain, sample.Domain):
config[key] = domain
continue
@ -57,7 +60,7 @@ def define_by_run_func(
trial.suggest_int(
key, domain.lower,
domain.upper - int(bool(not quantize)),
step=quantize or 1, log=True)
log=True)
elif isinstance(sampler, sample.Uniform):
# Upper bound should be inclusive for quantization and
# exclusive otherwise
@ -76,7 +79,7 @@ def define_by_run_func(
if isinstance(choice, dict):
key += f":{index}"
# the suffix needs to be removed from the final config
config[key] = define_by_run_func(trial, choice, key)
config.update(define_by_run_func(trial, choice, key))
else:
raise ValueError(
"Optuna search does not support parameters of type "
@ -87,6 +90,32 @@ def define_by_run_func(
return config
def convert_key(
conf: Dict, space: Dict, path: str = ""
) -> Optional[Dict[str, Any]]:
"""Convert config keys to define-by-run keys.
Returns:
A dict with converted keys.
"""
config = {}
for key, domain in space.items():
value = conf[key]
if path:
key = path + '/' + key
if isinstance(domain, dict):
config.update(convert_key(conf[key], domain, key))
elif isinstance(domain, sample.Categorical):
index = indexof(domain, value)
config[key + '_choice_'] = index
if isinstance(value, dict):
key += f":{index}"
config.update(convert_key(value, domain.categories[index], key))
else:
config[key] = value
return config
def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
'''unflatten hierarchical config'''
hier = {}
@ -101,12 +130,18 @@ def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
hier[true_key], subspace[true_key] = unflatten_hierarchical(
value, space[true_key][choice])
else:
if key.endswith("_choice_"):
key = key[:-8]
domain = space.get(key)
if domain is not None:
subspace[key] = domain
if isinstance(domain, sample.Domain):
sampler = domain.sampler
if isinstance(sampler, sample.Quantized):
if isinstance(domain, sample.Categorical):
value = domain.categories[value]
if isinstance(value, dict):
continue
elif isinstance(sampler, sample.Quantized):
q = sampler.q
sampler = sampler.sampler
if isinstance(sampler, sample.LogUniform):
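
The helpers above share one flattening convention for hierarchical configs: the index chosen from a categorical is recorded under '<key>_choice_', and parameters of the chosen sub-space are recorded under '<key>:<index>/<subkey>'. An illustrative pair of configs (hypothetical keys and values) of the kind that define_by_run_func/convert_key produce and unflatten_hierarchical maps back:

```python
# flattened, define-by-run style keys
flat_config = {
    "model_choice_": 0,           # index of the category chosen for "model"
    "model:0/n_estimators": 120,  # parameter inside the chosen sub-space
    "learning_rate": 0.1,
}
# the corresponding hierarchical config seen by the trainable / local search
hier_config = {
    "model": {"n_estimators": 120},
    "learning_rate": 0.1,
}
```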

File diff suppressed because one or more lines are too long

View File

@ -124,7 +124,8 @@
"source": [
"settings = {\n",
" \"time_budget\": 60, # total running time in seconds\n",
" \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']\n",
" \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\n",
" # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\n",
" \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # list of ML learners\n",
" \"task\": 'classification', # task type \n",
" \"sample\": False, # whether to subsample training data\n",
@ -265,7 +266,7 @@
"execution_count": null,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename = settings['log_file_name'], time_budget = 60)\n",
"\n",
"for config in config_history:\n",

View File

@ -104,10 +104,7 @@
" \"metric\": 'mape', # primary metric for validation: 'mape' is generally used for forecast tasks\n",
" \"task\": 'forecast', # task type\n",
" \"log_file_name\": 'CO2_forecast.log', # flaml log file\n",
" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
" # \"estimator_list\": [\"sarimax\"],\n",
" # \"verbose\": 3,\n",
" \"split_type\": 'time' # for foretask task, 'split_type' has to be 'time'\n",
" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
"}"
],
"outputs": [],
@ -1355,7 +1352,7 @@
"execution_count": 11,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=300)\n",
"\n",
"for config in config_history:\n",

View File

@ -445,7 +445,7 @@
"execution_count": 11,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n",
"\n",
"for config in config_history:\n",

View File

@ -362,7 +362,7 @@
"execution_count": 10,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n",
"\n",
"for config in config_history:\n",

View File

@ -62,7 +62,7 @@ setuptools.setup(
"optuna==2.8.0"
],
"ray": [
"ray[tune]==1.5.1",
"ray[tune]==1.6.0",
"pyyaml<5.3.1",
],
"azureml": [
@ -75,7 +75,7 @@ setuptools.setup(
"vowpalwabbit",
],
"nlp": [
"ray[tune]>=1.5.1",
"ray[tune]>=1.6.0",
"transformers",
"datasets==1.4.1",
"tensorboardX<=2.2",

View File

@ -111,7 +111,8 @@ class MyLargeLGBM(LGBMEstimator):
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
weight_test=None, weight_train=None):
weight_test=None, weight_train=None, config=None,
groups_test=None, groups_train=None):
from sklearn.metrics import log_loss
import time
start = time.time()
@ -162,7 +163,10 @@ class TestAutoML(unittest.TestCase):
"sample": True, # whether to subsample training data
"log_file_name": "test/wine.log",
"log_training_metric": True, # whether to log training metric
"ensemble": True,
"ensemble": {
"final_estimator": MyRegularizedGreedyForest(),
"passthrough": False,
},
"n_jobs": 1,
}
@ -274,9 +278,9 @@ class TestAutoML(unittest.TestCase):
task='multi')
print(estimator)
time_history, best_valid_loss_history, valid_loss_history, \
config_history, train_loss_history = get_output_from_log(
config_history, metric_history = get_output_from_log(
filename=automl_settings['log_file_name'], time_budget=6)
print(train_loss_history)
print(metric_history)
def test_classification(self, as_frame=False):
automl_experiment = AutoML()
@ -496,6 +500,30 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_parallel(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
"task": 'regression',
"log_file_name": "test/boston.log",
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 2,
"hpo_method": hpo_method,
}
X_train, y_train = load_boston(return_X_y=True)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_parallel_xgboost(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {

View File

@ -1,20 +1,19 @@
def test_forecast_automl_df(budget=5):
import numpy as np
from flaml import AutoML
def test_forecast_automl(budget=5):
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
data = data.rename(columns={'index': 'ds', 'co2': 'y'})
data = sm.datasets.co2.load_pandas().data['co2'].resample('MS').mean()
data = data.fillna(data.bfill()).to_frame().reset_index().rename(
columns={'index': 'ds', 'co2': 'y'})
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]
X_test = data[split_idx:]['ds'].to_frame()
y_test = data[split_idx:]['y'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
df = data[:split_idx]
X_test = data[split_idx:]['ds']
y_test = data[split_idx:]['y']
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
@ -22,13 +21,14 @@ def test_forecast_automl_df(budget=5):
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
automl.fit(dataframe=df, **settings, period=time_horizon)
except ImportError:
automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
print("not using FBProphet due to ImportError")
automl.fit(dataframe=df, **settings, estimator_list=[
'arima', 'sarimax'], period=time_horizon)
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
@ -47,7 +47,7 @@ def test_forecast_automl_df(budget=5):
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
@ -55,65 +55,46 @@ def test_forecast_automl_df(budget=5):
print(automl.max_resource)
print(automl.min_resource)
def test_forecast_automl_Xy(budget=5):
# using X_train and y_train
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]['index'].to_frame()
y_train = data[:split_idx]['co2']
X_test = data[split_idx:]['index'].to_frame()
y_test = data[split_idx:]['co2'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
X_train = df['ds']
y_train = df['y']
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": 'mape', # primary metric
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
except ImportError:
automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
print(f'Best mape on validation data: {automl.best_loss}')
print(f'Training duration of best run: {automl.best_config_train_time}s')
print(automl.model.estimator)
''' pickle and save the automl object '''
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
print(automl.prune_attr)
print(automl.max_resource)
print(automl.min_resource)
print("not using FBProphet due to ImportError")
automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=[
'arima', 'sarimax'], period=time_horizon)
def test_numpy():
X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
y_train = np.random.random(size=72)
automl = AutoML()
try:
automl.fit(
X_train=X_train[:60], # a single column of timestamp
y_train=y_train, # value for each timestamp
period=12, # time horizon to forecast, e.g., 12 months
task='forecast', time_budget=3, # time budget in seconds
log_file_name="test/forecast.log")
print(automl.predict(X_train[60:]))
print(automl.predict(12))
except ValueError:
print("ValueError for FBProphet is raised as expected.")
except ImportError:
print("not using FBProphet due to ImportError")
automl = AutoML()
automl.fit(
X_train=X_train[:72], # a single column of timestamp
y_train=y_train, # value for each timestamp
period=12, # time horizon to forecast, e.g., 12 months
task='forecast', time_budget=1, # time budget in seconds
estimator_list=['arima', 'sarimax'],
log_file_name="test/forecast.log")
print(automl.predict(X_train[72:]))
# an alternative way to specify predict steps for arima/sarimax
print(automl.predict(12))
if __name__ == "__main__":
test_forecast_automl_df(60)
test_forecast_automl_Xy(60)
test_forecast_automl(60)

View File

@ -42,7 +42,7 @@ def test_automl(budget=5, dataset_format='dataframe'):
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=60)
for config in config_history:
print(config)

View File

@ -62,11 +62,11 @@ class TestLogging(unittest.TestCase):
config = automl.best_config.copy()
config['learner'] = automl.best_estimator
automl.trainable({"ml": config})
from flaml import tune, CFO
from flaml import tune, BlendSearch
from flaml.automl import size
from functools import partial
search_alg = CFO(
metric='val_loss',
search_alg = BlendSearch(
metric='val_loss', mode='min',
space=automl.search_space,
low_cost_partial_config=automl.low_cost_partial_config,
points_to_evaluate=automl.points_to_evaluate,

View File

@ -74,5 +74,41 @@ def test_groups():
automl.fit(X, y, **automl_settings)
def test_rank():
from sklearn.externals._arff import ArffException
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
y = y.cat.codes
import numpy as np
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "rank",
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"eval_method": "cv",
"groups": np.array( # group labels
[0] * 200 + [1] * 200 + [2] * 200 + [3] * 200 + [4] * 100 + [5] * 100),
"learner_selector": "roundrobin",
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "rank",
"metric": "ndcg@5", # 5 can be replaced by any number
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"groups": [200] * 4 + [100] * 2, # alternative way: group counts
# "estimator_list": ['lgbm', 'xgboost'], # list of ML learners
"learner_selector": "roundrobin",
}
automl.fit(X, y, **automl_settings)
if __name__ == "__main__":
unittest.main()