mirror of https://github.com/microsoft/autogen.git
remove catboost training dir; ensemble api; blendsearch for hierarchical space; ranking task; forecast improvement (#178)
* remove catboost training dir
* close #48
* bs for hierarchical space. close #85
* retrain for hierarchical space
* clean ml (#180)
* support ranking task
* examples
* cv shuffle
* forecast api and implementation cleaner
* period constraints
* delete groups after fit

Co-authored-by: Qingyun Wu <qxw5138@psu.edu>
This commit is contained in:
parent 1bc8786dcb
commit 6ab0730793
39 README.md
@@ -65,7 +65,7 @@ tune.run(train_with_config, config={…}, low_cost_partial_config={…}, time_bu
 ## Advantages

-* For classification and regression tasks, find quality models with lower computational resources.
+* For common machine learning tasks like classification and regression, find quality models with small computational resources.
 * Users can choose their desired customizability: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), full customization (arbitrary training and evaluation code).
 * Allow human guidance in hyperparameter tuning to respect prior on certain subspaces but also able to explore other subspaces. Read more about the
 hyperparameter optimization methods
@@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
 ## Examples

-A basic classification example.
+- A basic classification example.

 ```python
 from flaml import AutoML
@@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
 print(automl.model)
 ```

-A basic regression example.
+- A basic regression example.

 ```python
 from flaml import AutoML
@@ -123,6 +123,39 @@ print(automl.predict(X_train))
 print(automl.model)
 ```

+- Time series forecasting.
+
+```python
+# pip install flaml[forecast]
+import numpy as np
+from flaml import AutoML
+X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
+y_train = np.random.random(size=72)
+automl = AutoML()
+automl.fit(X_train=X_train[:72],  # a single column of timestamp
+           y_train=y_train,  # value for each timestamp
+           period=12,  # time horizon to forecast, e.g., 12 months
+           task='forecast', time_budget=15,  # time budget in seconds
+           log_file_name="test/forecast.log",
+           )
+print(automl.predict(X_train[72:]))
+```
+- Learning to rank.
+
+```python
+from sklearn.datasets import fetch_openml
+from flaml import AutoML
+X_train, y_train = fetch_openml(name="credit-g", return_X_y=True)
+# not a real learning-to-rank dataset
+groups = [200] * 4 + [100] * 2  # group counts
+automl = AutoML()
+automl.fit(
+    X_train, y_train, groups=groups,
+    task='rank', time_budget=10,  # in seconds
+)
+```
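Note: the `groups` argument accepts either per-sample group labels (length matching `y_train`) or per-group counts (summing to the length of `y_train`), per the `fit()` docstring changed below. A minimal sketch of the equivalence, using the same illustrative numbers:

```python
import numpy as np

counts = [200] * 4 + [100] * 2  # group counts, sum == len(y_train)
labels = np.concatenate(        # equivalent per-sample group labels
    [[i] * c for i, c in enumerate(counts)])
assert len(labels) == sum(counts) == 1000
```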
 More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/).

 ## Documentation
368 flaml/automl.py
@@ -10,7 +10,7 @@ from functools import partial
 import numpy as np
 from scipy.sparse import issparse
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
-    RepeatedKFold, GroupKFold, TimeSeriesSplit
+    RepeatedKFold, GroupKFold, TimeSeriesSplit, GroupShuffleSplit
 from sklearn.utils import shuffle
 import pandas as pd
 import logging
@@ -94,13 +94,13 @@ class SearchState:
            else:
                self.sample_size = self.data_size
            obj = result['val_loss']
-           train_loss = result['train_loss']
+           metric_for_logging = result['metric_for_logging']
            time2eval = result['time_total_s']
            trained_estimator = result['trained_estimator']
            del result['trained_estimator']  # free up RAM
        else:
            obj, time2eval, trained_estimator = np.inf, 0.0, None
-           train_loss = config = None
+           metric_for_logging = config = None
        self.trial_time = time2eval
        self.total_time_used += time_used
        self.total_iter += 1
@@ -126,7 +126,8 @@ class SearchState:
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
-       self.train_loss, self.val_loss, self.config = train_loss, obj, config
+       self.metric_for_logging, self.val_loss, self.config = \
+           metric_for_logging, obj, config

    def get_hist_config_sig(self, sample_size, config):
        config_values = tuple([config[k] for k in self._hp_names])
@@ -144,7 +145,7 @@ class AutoMLState:

    def _prepare_sample_train_data(self, sample_size):
        full_size = len(self.y_train)
-       sampled_weight = None
+       sampled_weight = groups = None
        if sample_size <= full_size:
            if isinstance(self.X_train, pd.DataFrame):
                sampled_X_train = self.X_train.iloc[:sample_size]
@@ -154,12 +155,16 @@ class AutoMLState:
            weight = self.fit_kwargs.get('sample_weight')
            if weight is not None:
                sampled_weight = weight[:sample_size]
+           if self.groups is not None:
+               groups = self.groups[:sample_size]
        else:
            sampled_X_train = self.X_train_all
            sampled_y_train = self.y_train_all
            if 'sample_weight' in self.fit_kwargs:
                sampled_weight = self.sample_weight_all
-       return sampled_X_train, sampled_y_train, sampled_weight
+           if self.groups is not None:
+               groups = self.groups_all
+       return sampled_X_train, sampled_y_train, sampled_weight, groups

    def _compute_with_config_base(self,
                                  estimator,
@@ -168,13 +173,15 @@ class AutoMLState:
            sample_size = int(config_w_resource['FLAML_sample_size'])
        else:
            sample_size = self.data_size
-       sampled_X_train, sampled_y_train, sampled_weight = \
+       sampled_X_train, sampled_y_train, sampled_weight, groups = \
            self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = self.fit_kwargs['sample_weight']
            self.fit_kwargs['sample_weight'] = sampled_weight
        else:
            weight = None
+       if groups is not None:
+           self.fit_kwargs['groups'] = groups
        config = config_w_resource.copy()
        if 'FLAML_sample_size' in config:
            del config['FLAML_sample_size']
@@ -182,13 +189,14 @@ class AutoMLState:
        budget = time_left if sample_size == self.data_size else \
            time_left / 2 * sample_size / self.data_size

-       trained_estimator, val_loss, train_loss, _, pred_time = \
+       trained_estimator, val_loss, metric_for_logging, _, pred_time = \
            compute_estimator(
                sampled_X_train,
                sampled_y_train,
                self.X_val,
                self.y_val,
                self.weight_val,
+               self.groups_val,
                min(budget, self.train_time_limit),
                self.kf,
                config,
@@ -204,7 +212,7 @@ class AutoMLState:
        result = {
            'pred_time': pred_time,
            'wall_clock_time': time.time() - self._start_time_flag,
-           'train_loss': train_loss,
+           'metric_for_logging': metric_for_logging,
            'val_loss': val_loss,
            'trained_estimator': trained_estimator
        }
@@ -216,19 +224,23 @@ class AutoMLState:
    def _train_with_config(
        self, estimator, config_w_resource, sample_size=None
    ):
-       config = config_w_resource.copy()
-       if not sample_size:
-           sample_size = config_w_resource['FLAML_sample_size']
+       config = config_w_resource.get('ml', config_w_resource).copy()
+       if 'FLAML_sample_size' in config:
+           if not sample_size:
+               sample_size = config['FLAML_sample_size']
+           del config['FLAML_sample_size']
+       if "learner" in config:
+           del config['learner']
+       assert sample_size is not None
-       sampled_X_train, sampled_y_train, sampled_weight = \
+       sampled_X_train, sampled_y_train, sampled_weight, groups = \
            self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = self.fit_kwargs['sample_weight']
            self.fit_kwargs['sample_weight'] = sampled_weight
        else:
            weight = None
+       if groups is not None:
+           self.fit_kwargs['groups'] = groups
        budget = None if self.time_budget is None else (
            self.time_budget - self.time_from_start)
        estimator, train_time = train_estimator(
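With the hierarchical search space from this PR ("bs for hierarchical space"), the sampled config nests the learner's hyperparameters under an 'ml' key, which the `.get('ml', config_w_resource)` above unwraps. A sketch of the shape being handled, with illustrative hyperparameter values:

```python
# illustrative hierarchical config; 'learner' and 'FLAML_sample_size'
# are stripped before the estimator is constructed
config_w_resource = {
    'ml': {
        'learner': 'lgbm',
        'n_estimators': 100,
        'learning_rate': 0.1,
        'FLAML_sample_size': 10000,
    }
}
config = config_w_resource.get('ml', config_w_resource).copy()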
@@ -368,18 +380,18 @@ class AutoML:
            return self._trained_estimator.classes_.tolist()
        return None

-   def predict(self, X_test, freq=None):
+   def predict(self, X_test):
        '''Predict label from features.

        Args:
            X_test: A numpy array of featurized instances, shape n * m,
-               or a pandas dataframe with one column with timestamp values
-               for 'forecasting' task.
-           freq: str or pandas offset, default=None | The frequency of the
-               time-series.
+               or for 'forecasting' task:
+               a pandas dataframe with one column of timestamp values
+               or an integer n for the predict steps (only valid when
+               the estimator is arima or sarimax).

        Returns:
-           A numpy array of shape n * 1 -- each element is a predicted class
+           An array-like of shape n * 1 -- each element is a predicted
+               label for an instance.
        '''
        if self._trained_estimator is None:
@@ -387,13 +399,7 @@ class AutoML:
                "No estimator is trained. Please run fit with enough budget.")
            return None
        X_test = self._preprocess(X_test)
-       if self._state.task == 'forecast':
-           X_test_df = pd.DataFrame(X_test)
-           X_test_col = list(X_test.columns)[0]
-           X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
-           y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
-       else:
-           y_pred = self._trained_estimator.predict(X_test)
+       y_pred = self._trained_estimator.predict(X_test)
        if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
            y_pred = y_pred.flatten()
        if self._label_transformer:
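Per the updated docstring, a fitted forecast model can be queried either with a timestamp column or, when the selected estimator is arima or sarimax, with an integer step count. A usage sketch (assumes `automl` was already fit with `task='forecast'` as in the README example above):

```python
# predict the next 12 periods; the integer form is only valid when the
# selected estimator is arima or sarimax, per the docstring above
y_next = automl.predict(12)
```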
@@ -418,14 +424,20 @@ class AutoML:
        return proba

    def _preprocess(self, X):
-       if issparse(X):
-           X = X.tocsr()
-       if self._transformer:
-           X = self._transformer.transform(X)
+       if isinstance(X, int):
+           return X
+       if self._state.task == 'forecast':
+           X = pd.DataFrame(X)
+           X = X.rename(columns={X.columns[0]: 'ds'})
+       else:
+           if issparse(X):
+               X = X.tocsr()
+           if self._transformer:
+               X = self._transformer.transform(X)
        return X

    def _validate_data(self, X_train_all, y_train_all, dataframe, label,
-                      X_val=None, y_val=None):
+                      X_val=None, y_val=None, groups_val=None, groups=None):
        if self._state.task == 'forecast':
            if dataframe is not None and label is not None:
                dataframe = dataframe.copy()
@@ -433,13 +445,11 @@ class AutoML:
        elif dataframe is not None:
            if ('ds' not in dataframe) or ('y' not in dataframe):
                raise ValueError(
-                   'For forecasting task, Dataframe must have columns "ds" and "y" '
-                   'with the dates and values respectively.'
-               )
+                   'For forecasting task, dataframe must have columns "ds" and "y" '
+                   'with the dates and values respectively.')
        elif (X_train_all is not None) and (y_train_all is not None):
            dataframe = pd.DataFrame(X_train_all)
-           time_col = list(dataframe.columns)[0]
-           dataframe = dataframe.rename(columns={time_col: 'ds'})
+           dataframe = dataframe.rename(columns={dataframe.columns[0]: 'ds'})
            dataframe['y'] = pd.Series(y_train_all)
            X_train_all = None
            y_train_all = None
@@ -515,12 +525,23 @@ class AutoML:
            self._state.y_val = y_val
        else:
            self._state.X_val = self._state.y_val = None
+       if groups is not None and len(groups) != self._nrow:
+           # groups is given as group counts
+           self._state.groups = np.concatenate(
+               [[i] * c for i, c in enumerate(groups)])
+           assert len(self._state.groups) == self._nrow, \
+               "the sum of group counts must match the number of examples"
+           self._state.groups_val = np.concatenate(
+               [[i] * c for i, c in enumerate(groups_val)]
+           ) if groups_val is not None else None
+       else:
+           self._state.groups_val = groups_val
+           self._state.groups = groups

    def _prepare_data(self,
                      eval_method,
                      split_ratio,
-                     n_splits,
-                     period=None):
+                     n_splits):
        X_val, y_val = self._state.X_val, self._state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
@@ -564,25 +585,25 @@ class AutoML:
                    random_state=RANDOM_SEED)
                self._state.fit_kwargs[
                    'sample_weight'] = self._state.sample_weight_all
+           elif hasattr(self._state, 'groups') and self._state.groups is not None:
+               X_train_all, y_train_all, self._state.groups = shuffle(
+                   X_train_all, y_train_all, self._state.groups,
+                   random_state=RANDOM_SEED)
            else:
                X_train_all, y_train_all = shuffle(
                    X_train_all, y_train_all, random_state=RANDOM_SEED)
-               if self._df:
-                   X_train_all.reset_index(drop=True, inplace=True)
-                   if isinstance(y_train_all, pd.Series):
-                       y_train_all.reset_index(drop=True, inplace=True)
+           if self._df:
+               X_train_all.reset_index(drop=True, inplace=True)
+               if isinstance(y_train_all, pd.Series):
+                   y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
-       if X_val is None:
+       self._state.groups_all = self._state.groups
+       if X_val is None and eval_method == 'holdout':
            # if eval_method = holdout, make holdout data
-           if eval_method == 'holdout' and self._split_type == 'time':
-               if 'period' in self._state.fit_kwargs:
+           if self._split_type == 'time':
+               if self._state.task == 'forecast':
                    num_samples = X_train_all.shape[0]
-                   split_idx = num_samples - self._state.fit_kwargs.get('period')
+                   period = self._state.fit_kwargs['period']
+                   assert period < num_samples, (
+                       f"period={period}>#examples={num_samples}")
+                   split_idx = num_samples - period
                    X_train = X_train_all[:split_idx]
                    y_train = y_train_all[:split_idx]
                    X_val = X_train_all[split_idx:]
@@ -603,7 +624,21 @@ class AutoML:
                    y_train_all,
                    test_size=split_ratio,
                    shuffle=False)
-           elif self._state.task != 'regression' and eval_method == 'holdout':
+           elif self._state.task == 'rank':
+               gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio,
+                                       random_state=RANDOM_SEED)
+               for train_idx, val_idx in gss.split(X_train_all, y_train_all,
+                                                   self._state.groups):
+                   if self._df:
+                       X_train, X_val = X_train_all.iloc[
+                           train_idx], X_train_all.iloc[val_idx]
+                   else:
+                       X_train, X_val = X_train_all[
+                           train_idx], X_train_all[val_idx]
+                   y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
+                   self._state.groups, self._state.groups_val = self._state.groups[
+                       train_idx], self._state.groups[val_idx]
+           elif self._state.task != 'regression':
                # for classification, make sure the labels are complete in both
                # training and validation data
                label_set, first = np.unique(y_train_all, return_index=True)
@@ -617,8 +652,7 @@ class AutoML:
                X_first = X_train_all.iloc[first] if self._df else X_train_all[
                    first]
                X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
-               y_rest = y_train_all.iloc[rest] if isinstance(
-                   y_train_all, pd.Series) else y_train_all[rest]
+               y_rest = y_train_all[rest]
                stratify = y_rest if self._split_type == 'stratified' else \
                    None
                if 'sample_weight' in self._state.fit_kwargs:
@@ -647,7 +681,7 @@ class AutoML:
                X_val = concat(X_first, X_val)
                y_val = concat(label_set, y_val) if self._df else \
                    np.concatenate([label_set, y_val])
-           elif eval_method == 'holdout' and self._state.task == 'regression':
+           elif self._state.task == 'regression':
                if 'sample_weight' in self._state.fit_kwargs:
                    X_train, X_val, y_train, y_val, self._state.fit_kwargs[
                        'sample_weight'], self._state.weight_val = \
@@ -669,16 +703,16 @@ class AutoML:
                self._state.y_val = (X_train, y_train, X_val, y_val)
        self._state.X_train_all = X_train_all
        self._state.y_train_all = y_train_all
-       if hasattr(self._state, 'groups') and self._state.groups is not None:
-           logger.info("Using GroupKFold")
-           assert len(self._state.groups) == y_train_all.size, \
+       if self._split_type == 'group':
+           # logger.info("Using GroupKFold")
+           assert len(self._state.groups_all) == y_train_all.size, \
                "the length of groups must match the number of examples"
-           assert len(np.unique(self._state.groups)) >= n_splits, \
+           assert len(np.unique(self._state.groups_all)) >= n_splits, \
                "the number of groups must be equal or larger than n_splits"
            self._state.kf = GroupKFold(n_splits)
-           self._state.kf.groups = self._state.groups
+           self._state.kf.groups = self._state.groups_all
        elif self._split_type == "stratified":
-           logger.info("Using StratifiedKFold")
+           # logger.info("Using StratifiedKFold")
            assert y_train_all.size >= n_splits, (
                f"{n_splits}-fold cross validation"
                f" requires input data with at least {n_splits} examples.")
@@ -688,14 +722,22 @@ class AutoML:
            self._state.kf = RepeatedStratifiedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        elif self._split_type == "time":
-           logger.info("Using TimeSeriesSplit")
+           # logger.info("Using TimeSeriesSplit")
+           if self._state.task == 'forecast':
+               period = self._state.fit_kwargs['period']
+               if period * (n_splits + 1) > y_train_all.size:
+                   n_splits = int(y_train_all.size / period - 1)
+                   assert n_splits >= 2, (
+                       f"cross validation for forecasting period={period}"
+                       f" requires input data with at least {3 * period} examples.")
+                   logger.info(
+                       f"Using nsplits={n_splits} due to data size limit.")
                self._state.kf = TimeSeriesSplit(
-               n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
+                   n_splits=n_splits, test_size=period)
+           else:
+               self._state.kf = TimeSeriesSplit(n_splits=n_splits)
        else:
-           logger.info("Using RepeatedKFold")
+           # logger.info("Using RepeatedKFold")
            self._state.kf = RepeatedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
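The new forecast branch caps the number of CV folds by the data size: when `period * (n_splits + 1) > len(y)`, folds are reduced, and at least `3 * period` examples are required. A quick arithmetic sketch with illustrative numbers:

```python
period, n_examples, n_splits = 12, 40, 3
if period * (n_splits + 1) > n_examples:     # 48 > 40
    n_splits = int(n_examples / period - 1)  # int(2.33) == 2
assert n_splits >= 2  # needs at least 3 * period = 36 examples
# each fold's test window is `period` steps:
# TimeSeriesSplit(n_splits=2, test_size=12)
```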
@@ -745,7 +787,8 @@ class AutoML:
                         eval_method='auto',
                         split_ratio=SPLIT_RATIO,
                         n_splits=N_SPLITS,
-                        split_type="stratified",
+                        split_type=None,
+                        groups=None,
                         n_jobs=1,
                         train_best=True,
                         train_full=False,
@@ -754,31 +797,51 @@ class AutoML:
        '''Retrain from log file

        Args:
-           time_budget: A float number of the time budget in seconds
            log_file_name: A string of the log file name
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
+           dataframe: A dataframe of training data including label column.
+               For 'forecast' task, dataframe must be specified and should
+               have two columns: timestamp and value.
+           label: A str of the label column name for 'classification' or
+               'regression' task, e.g., 'label';
+               or a tuple of strings for timestamp and value columns for
+               'forecasting' task, e.g., ('timestamp', 'value').
+               Note: If X_train and y_train are provided,
+               dataframe and label are ignored;
+               If not, dataframe and label must be provided.
+           time_budget: A float number of the time budget in seconds.
            task: A string of the task type, e.g.,
-               'classification', 'regression'
+               'classification', 'regression', 'forecast', 'rank'.
            eval_method: A string of resampling strategy, one of
-               ['auto', 'cv', 'holdout']
-           split_ratio: A float of the validation data percentage for holdout
-           n_splits: An integer of the number of folds for cross-validation
-           n_jobs: An integer of the number of threads for training
+               ['auto', 'cv', 'holdout'].
+           split_ratio: A float of the validation data percentage for holdout.
+           n_splits: An integer of the number of folds for cross-validation.
+           split_type: str or None, default=None | the data split type.
+               For classification tasks, valid choices are [
+               None, 'stratified', 'uniform', 'time']. None -> stratified.
+               For regression tasks, valid choices are [None, 'uniform', 'time'].
+               None -> uniform.
+               For time series forecasting, must be None or 'time'.
+               For ranking task, must be None or 'group'.
+           groups: None or array-like | Group labels (with matching length to
+               y_train) or group counts (with sum equal to length of y_train)
+               for training data.
+           n_jobs: An integer of the number of threads for training.
            train_best: A boolean of whether to train the best config in the
-               time budget; if false, train the last config in the budget
+               time budget; if false, train the last config in the budget.
            train_full: A boolean of whether to train on the full data. If true,
-               eval_method and sample_size in the log file will be ignored
+               eval_method and sample_size in the log file will be ignored.
            record_id: the ID of the training log record from which the model will
                be retrained. By default `record_id = -1` which means this will be
                ignored. `record_id = 0` corresponds to the first trial, and
                when `record_id >= 0`, `time_budget` will be ignored.
            **fit_kwargs: Other keyword arguments to pass to fit() function of
-               the searched learners, such as sample_weight
+               the searched learners, such as sample_weight.
        '''
        self._state.task = task
        self._state.fit_kwargs = fit_kwargs
-       self._validate_data(X_train, y_train, dataframe, label)
+       self._validate_data(X_train, y_train, dataframe, label, groups=groups)

        logger.info('log file name {}'.format(log_file_name))
@@ -829,24 +892,17 @@ class AutoML:
        # Partially copied from fit() function
        # Initialize some attributes required for retrain_from_log
        self._state.task = task
-       if self._state.task == 'classification':
-           self._state.task = get_classification_objective(
-               len(np.unique(self._y_train_all)))
-           assert split_type in ["stratified", "uniform", "time"]
-           self._split_type = split_type
-       elif self._state.task == 'regression':
-           if split_type in ["uniform", "time"]:
-               self._split_type = split_type
-           else:
-               self._split_type = "uniform"
-       elif self._state.task == 'forecast':
-           self._split_type = "time"
+       self._decide_split_type(split_type)
        if record_id >= 0:
            eval_method = 'cv'
        elif eval_method == 'auto':
            eval_method = self._decide_eval_method(time_budget)
        self.modelcount = 0
-       if self._state.task != 'forecast':
-           self._prepare_data(eval_method, split_ratio, n_splits)
-       else:
-           self._prepare_data(eval_method, split_ratio, n_splits,
-                              period=self._state.fit_kwargs['period'])
+       self._prepare_data(eval_method, split_ratio, n_splits)
        self._state.time_budget = None
        self._state.n_jobs = n_jobs
        self._trained_estimator = self._state._train_with_config(
@@ -854,6 +910,26 @@ class AutoML:
        logger.info('retrain from log succeeded')
        return training_duration

+   def _decide_split_type(self, split_type):
+       if self._state.task == 'classification':
+           self._state.task = get_classification_objective(
+               len(np.unique(self._y_train_all)))
+           assert split_type in [None, "stratified", "uniform", "time"]
+           self._split_type = split_type or "stratified"
+       elif self._state.task == 'regression':
+           assert split_type in [None, "uniform", "time"]
+           self._split_type = split_type or "uniform"
+       elif self._state.task == 'forecast':
+           assert split_type in [None, "time"]
+           self._split_type = "time"
+           assert isinstance(self._state.fit_kwargs.get('period'), int), (
+               "missing a required integer 'period' for forecast.")
+       elif self._state.task == 'rank':
+           assert self._state.groups is not None, \
+               'groups must be specified for ranking task.'
+           assert split_type in [None, "group"]
+           self._split_type = 'group'

    def _decide_eval_method(self, time_budget):
        if self._state.X_val is not None:
            return 'holdout'
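A summary of the defaults the new `_decide_split_type` produces (derived from the branch above); the `fit` call is an illustrative override, assuming data as in the README examples:

```python
# task -> default split_type when split_type=None
# 'classification' -> 'stratified'  (also accepts 'uniform', 'time')
# 'regression'     -> 'uniform'     (also accepts 'time')
# 'forecast'       -> 'time'        (requires an integer fit kwarg `period`)
# 'rank'           -> 'group'       (requires `groups`)
automl.fit(X_train, y_train, task='classification', split_type='uniform')
```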
@@ -1020,7 +1096,7 @@ class AutoML:
        else:
            return {'pred_time': 0,
                    'wall_clock_time': None,
-                   'train_loss': np.inf,
+                   'metric_for_logging': np.inf,
                    'val_loss': np.inf,
                    'trained_estimator': None
                    }
@@ -1065,10 +1141,11 @@ class AutoML:
            X_val=None,
            y_val=None,
            sample_weight_val=None,
+           groups_val=None,
            groups=None,
            verbose=1,
            retrain_full=True,
-           split_type="stratified",
+           split_type=None,
            learner_selector='sample',
            hpo_method=None,
            starting_points={},
@@ -1104,14 +1181,15 @@ class AutoML:

            def custom_metric(
                X_test, y_test, estimator, labels,
-               X_train, y_train, weight_test=None, weight_train=None
+               X_train, y_train, weight_test=None, weight_train=None,
+               config=None, groups_test=None, groups_train=None,
            ):
                return metric_to_minimize, metrics_to_log

            which returns a float number as the minimization objective,
            and a tuple of floats or a dictionary as the metrics to log.
            task: A string of the task type, e.g.,
-               'classification', 'regression', 'forecast'.
+               'classification', 'regression', 'forecast', 'rank'.
            n_jobs: An integer of the number of threads for training.
            log_file_name: A string of the log file name.
            estimator_list: A list of strings for estimator names, or 'auto'
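For reference, a minimal custom metric matching the widened signature above; the body is an illustrative sketch, not code from this commit:

```python
from sklearn.metrics import log_loss

def custom_metric(X_test, y_test, estimator, labels,
                  X_train, y_train, weight_test=None, weight_train=None,
                  config=None, groups_test=None, groups_train=None):
    # objective to minimize, computed on validation data
    test_loss = log_loss(y_test, estimator.predict_proba(X_test),
                         labels=labels, sample_weight=weight_test)
    # extra metrics to log alongside the objective
    train_loss = log_loss(y_train, estimator.predict_proba(X_train),
                          labels=labels, sample_weight=weight_train)
    return test_loss, {'test_loss': test_loss, 'train_loss': train_loss}
```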
@@ -1125,6 +1203,10 @@ class AutoML:
            max_iter: An integer of the maximal number of iterations.
            sample: A boolean of whether to sample the training data during
                search.
+           ensemble: boolean or dict | default=False. Whether to perform
+               ensemble after search. Can be a dict with keys 'passthrough'
+               and 'final_estimator' to specify the passthrough and
+               final_estimator in the stacker.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
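A usage sketch of the new ensemble dict form (values illustrative, assuming data as in the README examples):

```python
from sklearn.linear_model import LogisticRegression

automl.fit(X_train, y_train, task='classification', time_budget=60,
           ensemble={
               'final_estimator': LogisticRegression(),  # stacker's final estimator
               'passthrough': False,  # feed only base-learner predictions
           })
```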
@@ -1144,9 +1226,13 @@ class AutoML:
            X_val: None or a numpy array or a pandas dataframe of validation data.
            y_val: None or a numpy array or a pandas series of validation labels.
            sample_weight_val: None or a numpy array of the sample weight of
-               validation data.
-           groups: None or an array-like of shape (n,) | Group labels for the
-               samples used while splitting the dataset into train/valid set.
+               validation data of the same shape as y_val.
+           groups_val: None or array-like | group labels (with matching length
+               to y_val) or group counts (with sum equal to length of y_val)
+               for validation data. Need to be consistent with groups.
+           groups: None or array-like | Group labels (with matching length to
+               y_train) or group counts (with sum equal to length of y_train)
+               for training data.
            verbose: int, default=1 | Controls the verbosity, higher means more
                messages.
            retrain_full: bool or str, default=True | whether to retrain the
@@ -1154,6 +1240,13 @@ class AutoML:
                True - retrain only after search finishes; False - no retraining;
                'budget' - do best effort to retrain without violating the time
                budget.
+           split_type: str or None, default=None | the data split type.
+               For classification tasks, valid choices are [
+               None, 'stratified', 'uniform', 'time']. None -> stratified.
+               For regression tasks, valid choices are [None, 'uniform', 'time'].
+               None -> uniform.
+               For time series forecasting, must be None or 'time'.
+               For ranking task, must be None or 'group'.
            hpo_method: str or None, default=None | The hyperparameter
                optimization method. When it is None, CFO is used.
                No need to set when using flaml's default search space or using
@@ -1182,9 +1275,9 @@ class AutoML:
        self._state.log_training_metric = log_training_metric
        self._state.fit_kwargs = fit_kwargs
        self._state.weight_val = sample_weight_val
        self._state.groups = groups

-       self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
+       self._validate_data(X_train, y_train, dataframe, label, X_val, y_val,
+                           groups_val, groups)
        self._search_states = {}  # key: estimator name; value: SearchState
        self._random = np.random.RandomState(RANDOM_SEED)
        if seed is not None:
@@ -1194,24 +1287,7 @@ class AutoML:
        self.verbose = verbose
        if verbose == 0:
            logger.setLevel(logging.WARNING)
-       if self._state.task == 'classification':
-           self._state.task = get_classification_objective(
-               len(np.unique(self._y_train_all)))
-           assert split_type in ["stratified", "uniform", "time"]
-           self._split_type = split_type
-       elif self._state.task == 'regression':
-           if split_type in ["uniform", "time"]:
-               self._split_type = split_type
-           else:
-               self._split_type = "uniform"
-       elif self._state.task == 'forecast':
-           if split_type is not None and split_type != 'time':
-               raise ValueError(
-                   "split_type must be 'time' when task is 'forecast'.")
-           self._split_type = "time"
-           if self._state.fit_kwargs.get('period') is None:
-               raise TypeError(
-                   "missing 1 required argument for 'forecast' task: 'period'.")
+       self._decide_split_type(split_type)
        if eval_method == 'auto' or self._state.X_val is not None:
            eval_method = self._decide_eval_method(time_budget)
        self._state.eval_method = eval_method
@@ -1227,12 +1303,8 @@ class AutoML:
        self._retrain_final = retrain_full is True and (
            eval_method == 'holdout' and self._state.X_val is None) or (
            eval_method == 'cv')
-       if self._state.task != 'forecast':
-           self._prepare_data(eval_method, split_ratio, n_splits)
-       else:
-           self._prepare_data(eval_method, split_ratio, n_splits,
-                              period=self._state.fit_kwargs['period'])
-       self._sample = sample and eval_method != 'cv' and (
+       self._prepare_data(eval_method, split_ratio, n_splits)
+       self._sample = sample and task != 'rank' and eval_method != 'cv' and (
            MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
        if 'auto' == metric:
            if 'binary' in self._state.task:
@@ -1241,11 +1313,13 @@ class AutoML:
                metric = 'log_loss'
            elif self._state.task == 'forecast':
                metric = 'mape'
+           elif self._state.task == 'rank':
+               metric = 'ndcg'
            else:
                metric = 'r2'
        self._state.metric = metric
        if metric in ['r2', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
-                     'f1', 'ap', 'micro_f1', 'macro_f1']:
+                     'f1', 'ap', 'micro_f1', 'macro_f1', 'ndcg']:
            error_metric = f"1-{metric}"
        elif isinstance(metric, str):
            error_metric = metric
@@ -1256,6 +1330,8 @@ class AutoML:
        if 'auto' == estimator_list:
            if self._state.task == 'forecast':
                estimator_list = ['fbprophet', 'arima', 'sarimax']
+           elif self._state.task == 'rank':
+               estimator_list = ['lgbm', 'xgboost']
            else:
                estimator_list = [
                    'lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
@@ -1278,7 +1354,9 @@ class AutoML:
        logger.info("List of ML learners in AutoML Run: {}".format(
            estimator_list))
        self.estimator_list = estimator_list
-       self._hpo_method = hpo_method or 'cfo'
+       self._hpo_method = hpo_method or (
+           'cfo' if n_concurrent_trials == 1 or len(estimator_list) == 1
+           else 'bs')
        self._state.time_budget = time_budget
        self._active_estimators = estimator_list.copy()
        self._ensemble = ensemble
@@ -1315,7 +1393,8 @@ class AutoML:
        del self._X_train_all, self._y_train_all, self._state.kf
        del self._state.X_train, self._state.X_train_all, self._state.X_val
        del self._state.y_train, self._state.y_train_all, self._state.y_val
-       del self._sample_weight_full, self._state.fit_kwargs, self._state.groups
+       del self._sample_weight_full, self._state.fit_kwargs
+       del self._state.groups, self._state.groups_all, self._state.groups_val
        for state in self._search_states.values():
            if state.trained_estimator:
                del state.trained_estimator
@@ -1363,8 +1442,7 @@ class AutoML:
                    del p[k]

            search_alg = SearchAlgo(max_concurrent=self._n_concurrent_trials,
-                                   points_to_evaluate=points_to_evaluate
-                                   )
+                                   points_to_evaluate=points_to_evaluate)
        else:
            search_alg = SearchAlgo(
                metric='val_loss',
@@ -1387,7 +1465,8 @@ class AutoML:
        analysis = ray.tune.run(
            self.trainable, search_alg=search_alg, config=self.search_space,
            metric='val_loss', mode='min', resources_per_trial=resources_per_trial,
-           time_budget_s=self._state.time_budget, num_samples=self._max_iter)
+           time_budget_s=self._state.time_budget, num_samples=self._max_iter,
+           verbose=self.verbose)
        # logger.info([trial.last_result for trial in analysis.trials])
        trials = sorted((trial for trial in analysis.trials if trial.last_result
                         and trial.last_result['wall_clock_time'] is not None),
@@ -1421,7 +1500,7 @@ class AutoML:
            if (better or self._log_type == 'all') and self._training_log:
                self._training_log.append(
                    self._iter_per_learner[estimator],
-                   search_state.train_loss,
+                   search_state.metric_for_logging,
                    search_state.trial_time,
                    self._state.time_from_start,
                    search_state.val_loss,
@@ -1591,7 +1670,7 @@ class AutoML:
                if self._training_log:
                    self._training_log.append(
                        self._iter_per_learner[estimator],
-                       search_state.train_loss,
+                       search_state.metric_for_logging,
                        search_state.trial_time,
                        self._state.time_from_start,
                        search_state.val_loss,
@@ -1604,8 +1683,8 @@ class AutoML:
                    with mlflow.start_run(nested=True):
                        mlflow.log_metric('iter_counter',
                                          self._iter_per_learner[estimator])
-                       mlflow.log_param('train_loss',
-                                        search_state.train_loss)
+                       mlflow.log_param('metric_for_logging',
+                                        search_state.metric_for_logging)
                        mlflow.log_metric('trial_time',
                                          search_state.trial_time)
                        mlflow.log_metric('wall_clock_time',
@@ -1702,7 +1781,9 @@ class AutoML:
            for search_state in self._search_states.values())
        if self._trained_estimator:
            logger.info(f'selected model: {self._trained_estimator.model}')
-       if self._ensemble:
+       if self._ensemble and self._state.task in (
+           'binary:logistic', 'multi:softmax', 'regression',
+       ):
            search_states = list(x for x in self._search_states.items()
                                 if x[1].trained_estimator)
            search_states.sort(key=lambda x: x[1].best_loss)
@@ -1714,15 +1795,20 @@ class AutoML:
            logger.info(estimators)
            if len(estimators) <= 1:
                return
-           if self._state.task != "regression":
+           if self._state.task in ('binary:logistic', 'multi:softmax'):
                from sklearn.ensemble import StackingClassifier as Stacker
                for e in estimators:
                    e[1]._estimator_type = 'classifier'
            else:
                from sklearn.ensemble import StackingRegressor as Stacker
-           best_m = self._trained_estimator
-           stacker = Stacker(estimators, best_m, n_jobs=self._state.n_jobs,
-                             passthrough=True)
+           if isinstance(self._ensemble, dict):
+               final_estimator = self._ensemble.get(
+                   'final_estimator', self._trained_estimator)
+               passthrough = self._ensemble.get('passthrough', True)
+           else:
+               final_estimator = self._trained_estimator
+               passthrough = True
+           stacker = Stacker(
+               estimators, final_estimator, n_jobs=self._state.n_jobs,
+               passthrough=passthrough)
            if self._sample_weight_full is not None:
                self._state.fit_kwargs[
                    'sample_weight'] = self._sample_weight_full
@@ -1734,9 +1820,11 @@ class AutoML:
        elif self._retrain_final:
            # reset time budget for retraining
            self._state.time_from_start -= self._state.time_budget
-           if (self._state.time_budget - self._state.time_from_start
-                   > self._selected.est_retrain_time(self.data_size_full)) \
-                   and self._selected.best_config_sample_size == self._state.data_size:
+           if self._state.task == 'forecast' or (
+               self._state.time_budget - self._state.time_from_start
+               > self._selected.est_retrain_time(self.data_size_full)
+               and self._selected.best_config_sample_size == self._state.data_size
+           ):
                self._trained_estimator, \
                    retrain_time = self._state._train_with_config(
                        self._best_estimator,
flaml/data.py

@@ -146,7 +146,7 @@ def get_output_from_log(filename, time_budget):
            config = record.config
            learner = record.learner.split('_')[0]
            sample_size = record.sample_size
-           train_loss = record.logged_metric
+           metric = record.logged_metric

            if time_used < time_budget and np.isfinite(val_loss):
                if val_loss < best_val_loss:
@@ -156,7 +156,7 @@ def get_output_from_log(filename, time_budget):
                best_config_list.append(best_config)
            search_time_list.append(time_used)
            best_error_list.append(best_val_loss)
-           logged_metric_list.append(train_loss)
+           logged_metric_list.append(metric)
            error_list.append(val_loss)
            config_list.append({"Current Learner": learner,
                                "Current Sample": sample_size,
@@ -242,8 +242,12 @@ class DataTransformer:
            X[cat_columns] = X[cat_columns].astype('category')
        if num_columns:
            X_num = X[num_columns]
-           if drop and np.issubdtype(X_num.columns.dtype, np.integer):
+           if np.issubdtype(X_num.columns.dtype, np.integer) and (
+               drop or min(X_num.columns) != 0
+               or max(X_num.columns) != X_num.shape[1] - 1
+           ):
                X_num.columns = range(X_num.shape[1])
+               drop = True
+           else:
+               drop = False
            from sklearn.impute import SimpleImputer
@@ -257,12 +261,12 @@ class DataTransformer:
            cat_columns, num_columns, datetime_columns
        self._drop = drop

-       if task == 'regression':
-           self.label_transformer = None
-       else:
+       if task in ('binary:logistic', 'multi:softmax'):
            from sklearn.preprocessing import LabelEncoder
            self.label_transformer = LabelEncoder()
            y = self.label_transformer.fit_transform(y)
+       else:
+           self.label_transformer = None
        return X, y

    def transform(self, X):
@@ -302,3 +306,8 @@ class DataTransformer:
                X_num.columns = range(X_num.shape[1])
            X[num_columns] = self.transformer.transform(X_num)
        return X


+def group_counts(groups):
+    _, i, c = np.unique(groups, return_counts=True, return_index=True)
+    return c[np.argsort(i)]
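A quick usage sketch of the new helper with an illustrative input: it maps per-sample group labels to per-group sizes, in order of first appearance, which is the format LightGBM's ranker expects.

```python
import numpy as np
from flaml.data import group_counts

print(group_counts(np.array([1, 1, 1, 2, 2, 3])))  # -> [3 2 1]
```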
206 flaml/ml.py
@@ -4,17 +4,17 @@
'''

import time
-from joblib.externals.cloudpickle.cloudpickle import instance
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
    accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
-   f1_score, mean_absolute_percentage_error
+   f1_score, mean_absolute_percentage_error, ndcg_score
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
    XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
    LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
    ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
+from .data import group_counts

import logging
logger = logging.getLogger(__name__)
@@ -56,26 +56,29 @@ def get_estimator_class(task, estimator_name):


def sklearn_metric_loss_score(
-   metric_name, y_predict, y_true, labels=None, sample_weight=None
+   metric_name, y_predict, y_true, labels=None, sample_weight=None,
+   groups=None,
):
    '''Loss using the specified metric

    Args:
        metric_name: A string of the metric name, one of
            'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
-           'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
+           'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg',
+           'micro_f1', 'macro_f1'.
        y_predict: A 1d or 2d numpy array of the predictions which can be
            used to calculate the metric. E.g., 2d for log_loss and 1d
            for others.
-       y_true: A 1d numpy array of the true labels
-       labels: A 1d numpy array of the unique labels
-       sample_weight: A 1d numpy array of the sample weight
+       y_true: A 1d numpy array of the true labels.
+       labels: A 1d numpy array of the unique labels.
+       sample_weight: A 1d numpy array of the sample weight.
+       groups: A 1d numpy array of the group labels.

    Returns:
-       score: A float number of the loss, the lower the better
+       score: A float number of the loss, the lower the better.
    '''
    metric_name = metric_name.lower()
-   if 'r2' in metric_name:
+   if 'r2' == metric_name:
        score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
    elif metric_name == 'rmse':
        score = np.sqrt(mean_squared_error(
@@ -98,26 +101,40 @@ def sklearn_metric_loss_score(
    elif metric_name == 'roc_auc_ovo':
        score = 1.0 - roc_auc_score(
            y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
-   elif 'log_loss' in metric_name:
+   elif 'log_loss' == metric_name:
        score = log_loss(
            y_true, y_predict, labels=labels, sample_weight=sample_weight)
-   elif 'mape' in metric_name:
+   elif 'mape' == metric_name:
        try:
            score = mean_absolute_percentage_error(
                y_true, y_predict)
        except ValueError:
            return np.inf
-   elif 'micro_f1' in metric_name:
+   elif 'micro_f1' == metric_name:
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average='micro')
-   elif 'macro_f1' in metric_name:
+   elif 'macro_f1' == metric_name:
        score = 1 - f1_score(
            y_true, y_predict, sample_weight=sample_weight, average='macro')
-   elif 'f1' in metric_name:
+   elif 'f1' == metric_name:
        score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
-   elif 'ap' in metric_name:
+   elif 'ap' == metric_name:
        score = 1 - average_precision_score(
            y_true, y_predict, sample_weight=sample_weight)
+   elif 'ndcg' in metric_name:
+       if '@' in metric_name:
+           k = int(metric_name.split('@', 1)[-1])
+           counts = group_counts(groups)
+           score = 0
+           psum = 0
+           for c in counts:
+               score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
+                                   np.asarray([y_predict[psum:psum + c]]), k=k)
+               psum += c
+           score /= len(counts)
+           score += 1
+       else:
+           score = 1 - ndcg_score([y_true], [y_predict])
    else:
        raise ValueError(
            metric_name + ' is not a built-in metric, '
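A usage sketch of the new grouped `ndcg@k` form with illustrative values: the returned loss is one minus the mean per-group NDCG at the cutoff parsed from the metric name.

```python
import numpy as np
from flaml.ml import sklearn_metric_loss_score

y_true = np.array([3, 2, 1, 0, 1, 2])
y_pred = np.array([0.9, 0.8, 0.2, 0.1, 0.3, 0.7])
groups = np.array([0, 0, 0, 1, 1, 1])  # two query groups of 3 docs each
loss = sklearn_metric_loss_score('ndcg@2', y_pred, y_true, groups=groups)
print(loss)  # 1 - mean per-group ndcg@2
```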
@@ -128,92 +145,60 @@ def sklearn_metric_loss_score(
    return score


-def get_y_pred(estimator, X, eval_metric, obj, freq=None):
+def get_y_pred(estimator, X, eval_metric, obj):
    if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
        y_pred_classes = estimator.predict_proba(X)
        y_pred = y_pred_classes[
            :, 1] if y_pred_classes.ndim > 1 else y_pred_classes
    elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
        y_pred = estimator.predict_proba(X)
-   elif eval_metric == 'mape':
-       y_pred = estimator.predict(X, freq=freq)
    else:
        y_pred = estimator.predict(X)
    return y_pred


-def get_test_loss(
-   estimator, X_train, y_train, X_test, y_test, weight_test,
-   eval_metric, obj, labels=None, budget=None, log_training_metric=False, fit_kwargs={}
-):
+def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
+                  groups_test, eval_metric, obj, labels=None, budget=None,
+                  log_training_metric=False, fit_kwargs={}):

    start = time.time()
+   # if groups_test is not None:
+   #     fit_kwargs['groups_val'] = groups_test
+   #     fit_kwargs['X_val'] = X_test
+   #     fit_kwargs['y_val'] = y_test
    estimator.fit(X_train, y_train, budget, **fit_kwargs)
    if isinstance(eval_metric, str):
        pred_start = time.time()
        test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
        pred_time = (time.time() - pred_start) / X_test.shape[0]
        test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
-                                             labels, weight_test)
+                                             labels, weight_test, groups_test)
        if log_training_metric:
            test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
-           train_loss = sklearn_metric_loss_score(
-               eval_metric, test_pred_y,
-               y_train, labels, fit_kwargs.get('sample_weight'))
+           metric_for_logging = sklearn_metric_loss_score(
+               eval_metric, test_pred_y, y_train, labels,
+               fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
        else:
-           train_loss = None
+           metric_for_logging = None
    else:  # customized metric function
        test_loss, metrics = eval_metric(
-           X_test, y_test, estimator, labels, X_train, y_train,
-           weight_test, fit_kwargs.get('sample_weight'))
+           X_test, y_test, estimator, labels, X_train, y_train, weight_test,
+           fit_kwargs.get('sample_weight'), config, groups_test,
+           fit_kwargs.get('groups'))
        if isinstance(metrics, dict):
            pred_time = metrics.get('pred_time', 0)
-       train_loss = metrics
+       metric_for_logging = metrics
    train_time = time.time() - start
-   return test_loss, train_time, train_loss, pred_time
+   return test_loss, metric_for_logging, train_time, pred_time


-def train_model(estimator, X_train, y_train, budget, fit_kwargs={}):
-   train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
-   return train_time
-
-
-def evaluate_model(
-   estimator, X_train, y_train, X_val, y_val, weight_val,
-   budget, kf, task, eval_method, eval_metric, best_val_loss, log_training_metric=False,
-   fit_kwargs={}
-):
-   if 'holdout' in eval_method:
-       val_loss, train_loss, train_time, pred_time = evaluate_model_holdout(
-           estimator, X_train, y_train, X_val, y_val, weight_val, budget,
-           task, eval_metric, log_training_metric=log_training_metric,
-           fit_kwargs=fit_kwargs)
-   else:
-       val_loss, train_loss, train_time, pred_time = evaluate_model_CV(
-           estimator, X_train, y_train, budget, kf, task,
-           eval_metric, best_val_loss, log_training_metric=log_training_metric,
-           fit_kwargs=fit_kwargs)
-   return val_loss, train_loss, train_time, pred_time
-
-
-def evaluate_model_holdout(
-   estimator, X_train, y_train, X_val, y_val,
-   weight_val, budget, task, eval_metric, log_training_metric=False,
-   fit_kwargs={}
-):
-   val_loss, train_time, train_loss, pred_time = get_test_loss(
-       estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric,
-       task, budget=budget, log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
-   return val_loss, train_loss, train_time, pred_time
-
-
-def evaluate_model_CV(
-   estimator, X_train_all, y_train_all, budget, kf,
-   task, eval_metric, best_val_loss, log_training_metric=False, fit_kwargs={}
-):
+def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
+                      task, eval_metric, best_val_loss,
+                      log_training_metric=False, fit_kwargs={}):
    start_time = time.time()
    total_val_loss = 0
-   total_train_loss = None
-   train_loss = None
+   total_metric = None
+   metric = None
    train_time = pred_time = 0
    valid_fold_num = total_fold_num = 0
    n = kf.get_n_splits()
@@ -222,15 +207,19 @@ def evaluate_model_CV(
        labels = np.unique(y_train_all)
    else:
        labels = None

+   groups = None
+   shuffle = True
    if isinstance(kf, RepeatedStratifiedKFold):
        kf = kf.split(X_train_split, y_train_split)
    elif isinstance(kf, GroupKFold):
-       kf = kf.split(X_train_split, y_train_split, kf.groups)
+       groups = kf.groups
+       kf = kf.split(X_train_split, y_train_split, groups)
+       shuffle = False
+   elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
+       y_train_all = pd.DataFrame(y_train_all, columns=['y'])
+       train = X_train_all.join(y_train_all)
+       kf = kf.split(train)
+       shuffle = False
    elif isinstance(kf, TimeSeriesSplit):
        kf = kf.split(X_train_split, y_train_split)
    else:
@@ -244,7 +233,7 @@ def evaluate_model_CV(
    else:
        weight = weight_val = None
    for train_index, val_index in kf:
-       if not isinstance(kf, TimeSeriesSplit):
+       if shuffle:
            train_index = rng.permutation(train_index)
        if isinstance(X_train_all, pd.DataFrame):
            X_train, X_val = X_train_split.iloc[
@@ -252,19 +241,19 @@ def evaluate_model_CV(
        else:
            X_train, X_val = X_train_split[
                train_index], X_train_split[val_index]
-       if isinstance(y_train_all, pd.Series):
-           y_train, y_val = y_train_split.iloc[
-               train_index], y_train_split.iloc[val_index]
-       else:
-           y_train, y_val = y_train_split[
-               train_index], y_train_split[val_index]
+       y_train, y_val = y_train_split[train_index], y_train_split[val_index]
        estimator.cleanup()
        if weight is not None:
            fit_kwargs['sample_weight'], weight_val = weight[
                train_index], weight[val_index]
-       val_loss_i, train_time_i, train_loss_i, pred_time_i = get_test_loss(
-           estimator, X_train, y_train, X_val, y_val, weight_val,
-           eval_metric, task, labels, budget_per_train,
+       if groups is not None:
+           fit_kwargs['groups'] = groups[train_index]
+           groups_val = groups[val_index]
+       else:
+           groups_val = None
+       val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
+           config, estimator, X_train, y_train, X_val, y_val, weight_val,
+           groups_val, eval_metric, task, labels, budget_per_train,
            log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
        if weight is not None:
            fit_kwargs['sample_weight'] = weight
@@ -272,16 +261,16 @@ def evaluate_model_CV(
        total_fold_num += 1
        total_val_loss += val_loss_i
        if log_training_metric or not isinstance(eval_metric, str):
-           if isinstance(total_train_loss, list):
-               total_train_loss = [
-                   total_train_loss[i] + v for i, v in enumerate(train_loss_i)]
-           elif isinstance(total_train_loss, dict):
-               total_train_loss = {
-                   k: total_train_loss[k] + v for k, v in train_loss_i.items()}
-           elif total_train_loss is not None:
-               total_train_loss += train_loss_i
+           if isinstance(total_metric, list):
+               total_metric = [
+                   total_metric[i] + v for i, v in enumerate(metric_i)]
+           elif isinstance(total_metric, dict):
+               total_metric = {
+                   k: total_metric[k] + v for k, v in metric_i.items()}
+           elif total_metric is not None:
+               total_metric += metric_i
            else:
-               total_train_loss = train_loss_i
+               total_metric = metric_i
        train_time += train_time_i
        pred_time += pred_time_i
        if valid_fold_num == n:
@@ -293,22 +282,22 @@ def evaluate_model_CV(
        val_loss = np.max(val_loss_list)
    n = total_fold_num
    if log_training_metric or not isinstance(eval_metric, str):
-       if isinstance(total_train_loss, list):
-           train_loss = [v / n for v in total_train_loss]
-       elif isinstance(total_train_loss, dict):
-           train_loss = {k: v / n for k, v in total_train_loss.items()}
+       if isinstance(total_metric, list):
+           metric = [v / n for v in total_metric]
+       elif isinstance(total_metric, dict):
+           metric = {k: v / n for k, v in total_metric.items()}
        else:
-           train_loss = total_train_loss / n
+           metric = total_metric / n
    pred_time /= n
    # budget -= time.time() - start_time
    # if val_loss < best_val_loss and budget > budget_per_train:
    #     estimator.cleanup()
    #     estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
-   return val_loss, train_loss, train_time, pred_time
+   return val_loss, metric, train_time, pred_time


def compute_estimator(
-   X_train, y_train, X_val, y_val, weight_val, budget, kf,
+   X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
    config_dic, task, estimator_name, eval_method, eval_metric,
    best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
    fit_kwargs={}
@@ -317,11 +306,17 @@ def compute_estimator(
        task, estimator_name)
    estimator = estimator_class(
        **config_dic, task=task, n_jobs=n_jobs)
-   val_loss, train_loss, train_time, pred_time = evaluate_model(
-       estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task,
-       eval_method, eval_metric, best_val_loss, log_training_metric=log_training_metric,
-       fit_kwargs=fit_kwargs)
-   return estimator, val_loss, train_loss, train_time, pred_time
+   if 'holdout' in eval_method:
+       val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
+           config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
+           groups_val, eval_metric, task, budget=budget,
+           log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
+   else:
+       val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
+           config_dic, estimator, X_train, y_train, budget, kf, task,
+           eval_metric, best_val_loss, log_training_metric=log_training_metric,
+           fit_kwargs=fit_kwargs)
+   return estimator, val_loss, metric_for_logging, train_time, pred_time


def train_estimator(
@@ -333,8 +328,7 @@ def train_estimator(
        task, estimator_name)
    estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
    if X_train is not None:
-       train_time = train_model(
-           estimator, X_train, y_train, budget, fit_kwargs)
+       train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
    else:
        estimator = estimator.estimator_class(**estimator.params)
    train_time = time.time() - start_time
184 flaml/model.py
@@ -3,16 +3,18 @@
* Licensed under the MIT License.
'''

import warnings
import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
-from lightgbm import LGBMClassifier, LGBMRegressor
+from lightgbm import LGBMClassifier, LGBMRegressor, LGBMRanker
from scipy.sparse import issparse
import pandas as pd
from . import tune
+from .data import group_counts

import logging
@@ -45,8 +47,8 @@ class BaseEstimator:
            self._estimator_type = params['_estimator_type']
            del self.params['_estimator_type']
        else:
-           self._estimator_type = "regressor" if task == 'regression' \
-               else "classifier"
+           self._estimator_type = "classifier" if task in (
+               'binary:logistic', 'multi:softmax') else "regressor"

    def get_params(self, deep=False):
        params = self.params.copy()
@ -81,6 +83,18 @@ class BaseEstimator:
|
|||
def _fit(self, X_train, y_train, **kwargs):
|
||||
|
||||
current_time = time.time()
|
||||
if 'groups' in kwargs:
|
||||
kwargs = kwargs.copy()
|
||||
if self._task == 'rank':
|
||||
kwargs['group'] = group_counts(kwargs['groups'])
|
||||
# groups_val = kwargs.get('groups_val')
|
||||
# if groups_val is not None:
|
||||
# kwargs['eval_group'] = [group_counts(groups_val)]
|
||||
# kwargs['eval_set'] = [
|
||||
# (kwargs['X_val'], kwargs['y_val'])]
|
||||
# kwargs['verbose'] = False
|
||||
# del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val']
|
||||
del kwargs['groups']
|
||||
X_train = self._preprocess(X_train)
|
||||
model = self.estimator_class(**self.params)
|
||||
model.fit(X_train, y_train, **kwargs)
|
||||
|
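The `group_counts` helper used above turns per-row group labels into the per-group counts that LightGBM's ranker expects in its `group` argument. A minimal sketch of that conversion (assuming contiguous group labels, as `flaml.data.group_counts` does):

```python
import numpy as np

def group_counts_sketch(groups):
    # Count the size of each run of identical group labels,
    # e.g. [0, 0, 0, 1, 1, 2] -> [3, 2, 1], the format that
    # LGBMRanker's `group` argument expects.
    _, counts = np.unique(groups, return_counts=True)
    return counts

print(group_counts_sketch(np.array([0, 0, 0, 1, 1, 2])))  # [3 2 1]
```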
@@ -255,12 +269,14 @@ class LGBMEstimator(BaseEstimator):
         if "objective" not in self.params:
             # Default: ‘regression’ for LGBMRegressor,
             # ‘binary’ or ‘multiclass’ for LGBMClassifier
-            if 'regression' in task:
+            if 'regression' == task:
                 objective = 'regression'
             elif 'binary' in task:
                 objective = 'binary'
             elif 'multi' in task:
                 objective = 'multiclass'
+            elif 'rank' == task:
+                objective = 'lambdarank'
             else:
                 objective = 'regression'
             self.params["objective"] = objective

@@ -276,8 +292,10 @@ class LGBMEstimator(BaseEstimator):
             self.params['verbose'] = -1
         # if "subsample_freq" not in self.params:
         #     self.params['subsample_freq'] = 1
-        if 'regression' in task:
+        if 'regression' == task:
             self.estimator_class = LGBMRegressor
+        elif 'rank' == task:
+            self.estimator_class = LGBMRanker
         else:
             self.estimator_class = LGBMClassifier
         self._time_per_iter = None
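With `task='rank'`, the wrapper now resolves to `LGBMRanker` with a `lambdarank` objective. A standalone sketch of what that amounts to (synthetic data; group sizes must sum to the number of rows):

```python
import numpy as np
from lightgbm import LGBMRanker

X = np.random.random((600, 5))
y = np.random.randint(5, size=600)   # relevance labels per row
group = [200] * 2 + [100] * 2        # per-query group sizes, sum = 600

ranker = LGBMRanker(objective='lambdarank', n_estimators=10)
ranker.fit(X, y, group=group)        # the same call BaseEstimator._fit issues
print(ranker.predict(X[:5]))
```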
@@ -488,8 +506,10 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
             'use_label_encoder': params.get('use_label_encoder', False),
         })

-        if 'regression' in task:
+        if 'regression' == task:
             self.estimator_class = xgb.XGBRegressor
+        elif 'rank' == task:
+            self.estimator_class = xgb.XGBRanker
         else:
             self.estimator_class = xgb.XGBClassifier
         self._time_per_iter = None
@@ -716,7 +736,9 @@ class CatBoostEstimator(BaseEstimator):
         return params

     def fit(self, X_train, y_train, budget=None, **kwargs):
+        import shutil
         start_time = time.time()
+        train_dir = f'catboost_{str(start_time)}'
         n_iter = self.params["n_estimators"]
         X_train = self._preprocess(X_train)
         if isinstance(X_train, pd.DataFrame):

@@ -730,16 +752,19 @@ class CatBoostEstimator(BaseEstimator):
                 CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
             # measure the time per iteration
             self.params["n_estimators"] = 1
-            CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
+            CatBoostEstimator._smallmodel = self.estimator_class(
+                train_dir=train_dir, **self.params)
             CatBoostEstimator._smallmodel.fit(
                 X_train, y_train, cat_features=cat_features, **kwargs)
             CatBoostEstimator._t1 = time.time() - start_time
             if CatBoostEstimator._t1 >= budget:
                 self.params["n_estimators"] = n_iter
                 self._model = CatBoostEstimator._smallmodel
+                shutil.rmtree(train_dir, ignore_errors=True)
                 return CatBoostEstimator._t1
             self.params["n_estimators"] = 4
-            CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
+            CatBoostEstimator._smallmodel = self.estimator_class(
+                train_dir=train_dir, **self.params)
             CatBoostEstimator._smallmodel.fit(
                 X_train, y_train, cat_features=cat_features, **kwargs)
             CatBoostEstimator._time_per_iter = (

@@ -752,6 +777,7 @@ class CatBoostEstimator(BaseEstimator):
                     "n_estimators"]:
                 self.params["n_estimators"] = n_iter
                 self._model = CatBoostEstimator._smallmodel
+                shutil.rmtree(train_dir, ignore_errors=True)
                 return time.time() - start_time
             if budget:
                 train_times = 1

@@ -769,13 +795,14 @@ class CatBoostEstimator(BaseEstimator):
             else:
                 weight = None
             from catboost import Pool
-            model = self.estimator_class(**self.params)
+            model = self.estimator_class(train_dir=train_dir, **self.params)
             model.fit(
                 X_tr, y_tr, cat_features=cat_features,
                 eval_set=Pool(
                     data=X_train[n:], label=y_train[n:],
                     cat_features=cat_features),
                 **kwargs)   # model.get_best_iteration()
+            shutil.rmtree(train_dir, ignore_errors=True)
             if weight is not None:
                 kwargs['sample_weight'] = weight
             self._model = model
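This is the "remove catboost training dir" part of the commit: CatBoost writes per-fit artifacts to disk under `train_dir`, so each fit now gets a unique directory that is deleted once training finishes. A minimal sketch of the pattern (assuming the `catboost` package is installed; the toy data is illustrative):

```python
import shutil
import time
from catboost import CatBoostRegressor

train_dir = f'catboost_{time.time()}'           # unique directory per fit
model = CatBoostRegressor(
    n_estimators=5, train_dir=train_dir, verbose=False)
model.fit([[0.0], [1.0], [2.0]], [0.0, 1.0, 2.0])
shutil.rmtree(train_dir, ignore_errors=True)    # drop the on-disk artifacts
```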
@@ -862,44 +889,43 @@ class FBProphet(BaseEstimator):
         }
         return space

-    def fit(self, X_train, y_train, budget=None, **kwargs):
+    def __init__(self, task='forecast', **params):
+        if 'n_jobs' in params:
+            params.pop('n_jobs')
+        super().__init__(task, **params)
+
+    def _join(self, X_train, y_train):
+        assert 'ds' in X_train, (
+            'Dataframe for training forecast model must have column'
+            ' "ds" with the dates in X_train.')
         y_train = pd.DataFrame(y_train, columns=['y'])
         train_df = X_train.join(y_train)
+        return train_df

-        if ('ds' not in train_df) or ('y' not in train_df):
-            raise ValueError(
-                'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
-                'values respectively.'
-            )
-
-        if 'n_jobs' in self.params:
-            self.params.pop('n_jobs')
-
+    def fit(self, X_train, y_train, budget=None, **kwargs):
         from prophet import Prophet

         current_time = time.time()
+        train_df = self._join(X_train, y_train)
         model = Prophet(**self.params).fit(train_df)
         train_time = time.time() - current_time
         self._model = model
         return train_time

-    def predict(self, X_test, freq=None):
+    def predict(self, X_test):
+        if isinstance(X_test, int):
+            raise ValueError(
+                "predict() with steps is only supported for arima/sarimax."
+                " For FBProphet, pass a dataframe with a date colum named ds.")
         if self._model is not None:
-            if isinstance(X_test, int) and freq is not None:
-                future = self._model.make_future_dataframe(periods=X_test, freq=freq)
-                forecast = self._model.predict(future)
-            elif isinstance(X_test, pd.DataFrame):
-                forecast = self._model.predict(X_test)
-            else:
-                raise ValueError(
-                    "either X_test(pd.Dataframe with dates for predictions, column ds) or"
-                    "X_test(int number of periods)+freq are required.")
+            forecast = self._model.predict(X_test)
             return forecast['yhat']
         else:
            warnings.warn(
                 "Estimator is not fit yet. Please run fit() before predict().")
             return np.ones(X_test.shape[0])
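`fit`/`_join` expect an `X_train` dataframe with a `ds` date column plus the target values. A hedged sketch of the data shape, calling Prophet directly (assuming the `prophet` package is installed; `ds`/`y` are Prophet's own column conventions):

```python
import numpy as np
import pandas as pd
from prophet import Prophet

X_train = pd.DataFrame(
    {'ds': pd.date_range('2020-01-01', periods=24, freq='MS')})
y_train = np.random.random(24)

# what _join builds: X_train plus a 'y' column, aligned on the index
train_df = X_train.join(pd.DataFrame(y_train, columns=['y']))
model = Prophet().fit(train_df)
future = X_train[18:]                 # dates to predict, column 'ds'
print(model.predict(future)['yhat'])
```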
-class ARIMA(BaseEstimator):
+class ARIMA(FBProphet):
     @classmethod
     def search_space(cls, **params):
         space = {

@@ -921,55 +947,45 @@ class ARIMA(BaseEstimator):
         }
         return space

-    def fit(self, X_train, y_train, budget=None, **kwargs):
-        y_train = pd.DataFrame(y_train, columns=['y'])
-        train_df = X_train.join(y_train)
-
-        if ('ds' not in train_df) or ('y' not in train_df):
-            raise ValueError(
-                'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
-                'values respectively.'
-            )
-
+    def _join(self, X_train, y_train):
+        train_df = super()._join(X_train, y_train)
         train_df.index = pd.to_datetime(train_df['ds'])
         train_df = train_df.drop('ds', axis=1)
+        return train_df

-        if 'n_jobs' in self.params:
-            self.params.pop('n_jobs')
-
+    def fit(self, X_train, y_train, budget=None, **kwargs):
         from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
         import warnings
         warnings.filterwarnings("ignore")

         current_time = time.time()
-        model = ARIMA_estimator(train_df,
-                                order=(self.params['p'], self.params['d'], self.params['q']),
-                                enforce_stationarity=False,
-                                enforce_invertibility=False)
+        train_df = self._join(X_train, y_train)
+        model = ARIMA_estimator(
+            train_df, order=(
+                self.params['p'], self.params['d'], self.params['q']),
+            enforce_stationarity=False, enforce_invertibility=False)
         model = model.fit()
         train_time = time.time() - current_time
         self._model = model
         return train_time

-    def predict(self, X_test, freq=None):
+    def predict(self, X_test):
         if self._model is not None:
-            if isinstance(X_test, int) and freq is not None:
-                forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
+            if isinstance(X_test, int):
+                forecast = self._model.forecast(steps=X_test)
             elif isinstance(X_test, pd.DataFrame):
-                start_date = X_test.iloc[0, 0]
-                end_date = X_test.iloc[-1, 0]
-                forecast = self._model.predict(start=start_date, end=end_date)
+                start = X_test.iloc[0, 0]
+                end = X_test.iloc[-1, 0]
+                forecast = self._model.predict(start=start, end=end)
             else:
                 raise ValueError(
-                    "either X_test(pd.Dataframe with dates for predictions, column ds) or"
-                    "X_test(int number of periods)+freq are required.")
+                    "X_test needs to be either a pd.Dataframe with dates as column ds)"
+                    " or an int number of periods for predict().")
             return forecast
         else:
-            return np.ones(X_test.shape[0])
+            return np.ones(X_test if isinstance(X_test, int)
+                           else X_test.shape[0])
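The reworked ARIMA wrapper accepts either an int horizon or a dataframe of dates, no longer requiring a separate `freq` argument. A standalone sketch against statsmodels directly (synthetic monthly series; `order=(1, 1, 1)` is an arbitrary choice, not a flaml default):

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

train = pd.Series(
    np.random.random(72),
    index=pd.date_range('2014-01-01', periods=72, freq='MS'))
model = ARIMA(train, order=(1, 1, 1),
              enforce_stationarity=False, enforce_invertibility=False).fit()

print(model.forecast(steps=12))           # int horizon, as in predict(12)
print(model.predict(start='2020-01-01',   # or an explicit date range,
                    end='2020-12-01'))    # as the dataframe branch does
```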
-class SARIMAX(BaseEstimator):
+class SARIMAX(ARIMA):
     @classmethod
     def search_space(cls, **params):
         space = {

@@ -1011,47 +1027,17 @@ class SARIMAX(BaseEstimator):
         return space

     def fit(self, X_train, y_train, budget=None, **kwargs):
-        y_train = pd.DataFrame(y_train, columns=['y'])
-        train_df = X_train.join(y_train)
-
-        if ('ds' not in train_df) or ('y' not in train_df):
-            raise ValueError(
-                'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
-                'values respectively.'
-            )
-
-        train_df.index = pd.to_datetime(train_df['ds'])
-        train_df = train_df.drop('ds', axis=1)
-
-        if 'n_jobs' in self.params:
-            self.params.pop('n_jobs')
-
         from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator

         current_time = time.time()
-        model = SARIMAX_estimator(train_df,
-                                  order=(self.params['p'], self.params['d'], self.params['q']),
-                                  seasonality_order=(self.params['P'], self.params['D'], self.params['Q'], self.params['s']),
-                                  enforce_stationarity=False,
-                                  enforce_invertibility=False)
+        train_df = self._join(X_train, y_train)
+        model = SARIMAX_estimator(
+            train_df, order=(
+                self.params['p'], self.params['d'], self.params['q']),
+            seasonality_order=(
+                self.params['P'], self.params['D'], self.params['Q'],
+                self.params['s']),
+            enforce_stationarity=False, enforce_invertibility=False)
         model = model.fit()
         train_time = time.time() - current_time
         self._model = model
         return train_time
-
-    def predict(self, X_test, freq=None):
-        if self._model is not None:
-            if isinstance(X_test, int) and freq is not None:
-                forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
-            elif isinstance(X_test, pd.DataFrame):
-                start_date = X_test.iloc[0, 0]
-                end_date = X_test.iloc[-1, 0]
-                forecast = self._model.predict(start=start_date, end=end_date)
-            else:
-                raise ValueError(
-                    "either X_test(pd.Dataframe with dates for predictions, column ds)"
-                    "or X_test(int number of periods)+freq are required.")
-            return forecast
-        else:
-            return np.ones(X_test.shape[0])
@@ -8,19 +8,20 @@ import numpy as np
 import time
 import pickle

 try:
     from ray import __version__ as ray_version
     assert ray_version >= '1.0.0'
     from ray.tune.suggest import Searcher
     from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch
-    from ray.tune.utils.util import flatten_dict
+    from ray.tune.utils.util import unflatten_dict
 except (ImportError, AssertionError):
     from .suggestion import Searcher
     from .suggestion import OptunaSearch as GlobalSearch
-    from .variant_generator import flatten_dict
+    from ..tune.trial import unflatten_dict
 from .search_thread import SearchThread
 from .flow2 import FLOW2
-from ..tune.space import add_cost_to_space, normalize  # TODO: , define_by_run_func
+from ..tune.space import add_cost_to_space, indexof, normalize, define_by_run_func

 import logging
 logger = logging.getLogger(__name__)
@@ -133,9 +134,8 @@ class BlendSearch(Searcher):
         if global_search_alg is not None:
             self._gs = global_search_alg
         elif getattr(self, '__name__', None) != 'CFO':
-            gs_space = space
-            # TODO: when define_by_run is supported
-            # gs_space = define_by_run_func(space)
+            from functools import partial
+            gs_space = partial(define_by_run_func, space=space)
             try:
                 gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
                 if experimental:

@@ -198,7 +198,10 @@ class BlendSearch(Searcher):
             # reset search when metric or mode changed
             self._ls.set_search_properties(metric, mode)
             if self._gs is not None:
-                self._gs.set_search_properties(metric, mode)
+                self._gs = GlobalSearch(
+                    space=self._gs._space, metric=metric, mode=mode,
+                    sampler=self._gs._sampler)
+                self._gs.space = self._ls.space
             self._init_search()
         if config:
             if 'time_budget_s' in config:

@@ -312,9 +315,11 @@ class BlendSearch(Searcher):
                 self._expand_admissible_region(
                     self._ls_bound_min, self._ls_bound_max,
                     self._subspace.get(trial_id, self._ls.space))
-            if self._gs is not None and self._experimental:
-                # TODO: key match for hierarchical space
-                self._gs.add_evaluated_point(flatten_dict(config), objective)
+            # if self._gs is not None and self._experimental:
+            #     # TODO: recover when supported
+            #     converted = convert_key(config, self._gs.space)
+            #     logger.info(converted)
+            #     self._gs.add_evaluated_point(converted, objective)
         elif metric_constraint_satisfied and self._create_condition(
                 result):
             # thread creator
@@ -339,7 +344,6 @@ class BlendSearch(Searcher):
             del self._subspace[trial_id]

     def _create_thread(self, config, result, space):
-        # logger.info(f"create local search thread from {config}")
         self._search_thread_pool[self._thread_count] = SearchThread(
             self._ls.mode,
             self._ls.create(

@@ -349,26 +353,29 @@ class BlendSearch(Searcher):
         )
         self._thread_count += 1
         self._update_admissible_region(
-            config, self._ls_bound_min, self._ls_bound_max, space)
+            unflatten_dict(config), self._ls_bound_min, self._ls_bound_max, space,
+            self._ls.space)

     def _update_admissible_region(
-        self, config, admissible_min, admissible_max, space: Dict = {}
+        self, config, admissible_min, admissible_max, subspace: Dict = {},
+        space: Dict = {}
     ):
         # update admissible region
-        normalized_config = normalize(config, space, config, {})
+        normalized_config = normalize(config, subspace, config, {})
         for key in admissible_min:
             value = normalized_config[key]
             if isinstance(admissible_max[key], list):
-                choice = space[key]['_choice_']
+                domain = space[key]
+                choice = indexof(domain, value)
                 self._update_admissible_region(
                     value,
                     admissible_min[key][choice], admissible_max[key][choice],
-                    space[key]
+                    subspace[key], domain[choice]
                 )
             elif isinstance(value, dict):
                 self._update_admissible_region(
-                    value,
-                    admissible_min[key], admissible_max[key], space[key])
+                    value, admissible_min[key], admissible_max[key],
+                    subspace[key], space[key])
             else:
                 if value > admissible_max[key]:
                     admissible_max[key] = value
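The admissible region is a per-key [min, max] box that BlendSearch widens as configs are evaluated; with hierarchical spaces it now recurses into the chosen sub-dict. A simplified, self-contained toy of the flat recursion (the real version also normalizes values and handles lists of categorical choices):

```python
def update_admissible_region(config, admissible_min, admissible_max):
    # Widen the per-key [min, max] box to cover the new config,
    # recursing into nested dicts (hierarchical subspaces).
    for key, value in config.items():
        if isinstance(value, dict):
            update_admissible_region(
                value, admissible_min[key], admissible_max[key])
        else:
            admissible_min[key] = min(admissible_min[key], value)
            admissible_max[key] = max(admissible_max[key], value)

lo = {'lr': 0.1, 'nn': {'layers': 2}}
hi = {'lr': 0.1, 'nn': {'layers': 2}}
update_admissible_region({'lr': 0.3, 'nn': {'layers': 4}}, lo, hi)
print(lo, hi)  # mins unchanged; maxes widen: lr 0.1 -> 0.3, layers 2 -> 4
```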
@@ -514,7 +521,8 @@ class BlendSearch(Searcher):
                 return None
             use_rs = 1
         if choice or self._valid(
-                config, space, self._gs_admissible_min, self._gs_admissible_max):
+                config, self._ls.space, space, self._gs_admissible_min,
+                self._gs_admissible_max):
             # LS or valid or no backup choice
             self._trial_proposed_by[trial_id] = choice
             self._search_thread_pool[choice].running += use_rs

@@ -542,10 +550,11 @@ class BlendSearch(Searcher):
             # temporarily relax admissible region for parallel proposals
             self._update_admissible_region(
                 config, self._gs_admissible_min, self._gs_admissible_max,
-                space)
+                space, self._ls.space)
         else:
             self._update_admissible_region(
-                config, self._ls_bound_min, self._ls_bound_max, space)
+                config, self._ls_bound_min, self._ls_bound_max, space,
+                self._ls.space)
             self._gs_admissible_min.update(self._ls_bound_min)
             self._gs_admissible_max.update(self._ls_bound_max)
         signature = self._ls.config_signature(config, space)
@@ -632,11 +641,6 @@ class BlendSearch(Searcher):
         top_thread_id = backup_thread_id = 0
         priority1 = priority2 = self._search_thread_pool[0].priority
         for thread_id, thread in self._search_thread_pool.items():
-            # if thread_id:
-            #     print(
-            #         f"priority of thread {thread_id}={thread.priority}")
-            # logger.debug(
-            #     f"thread {thread_id}.can_suggest={thread.can_suggest}")
             if thread_id and thread.can_suggest:
                 priority = thread.priority
                 if priority > priority1:

@@ -647,21 +651,29 @@ class BlendSearch(Searcher):
                 backup_thread_id = thread_id
         return top_thread_id, backup_thread_id

-    def _valid(self, config: Dict, space: Dict, lower: Dict, upper: Dict) -> bool:
+    def _valid(self, config: Dict, space: Dict, subspace: Dict,
+               lower: Dict, upper: Dict) -> bool:
         ''' config validator
         '''
-        normalized_config = normalize(config, space, config, {})
+        normalized_config = normalize(config, subspace, config, {})
         for key, lb in lower.items():
             if key in config:
                 value = normalized_config[key]
                 if isinstance(lb, list):
-                    subspace = space[key]['_choice_']
+                    domain = space[key]
+                    index = indexof(domain, value)
+                    nestedspace = subspace[key]
+                    lb = lb[index]
+                    ub = upper[key][index]
                 elif isinstance(lb, dict):
-                    subspace = space[key]
+                    nestedspace = subspace[key]
+                    domain = space[key]
+                    ub = upper[key]
                 else:
-                    subspace = None
-                if subspace:
-                    valid = self._valid(value, subspace, lb, upper[key])
+                    nestedspace = None
+                if nestedspace:
+                    valid = self._valid(
+                        value, domain, nestedspace, lb, ub)
                     if not valid:
                         return False
                 elif (value + self._ls.STEPSIZE < lower[key]
@@ -543,8 +543,9 @@ class FLOW2(Searcher):
             return False
         for key in self._unordered_cat_hp:
             # unordered cat choice is hard to reach by chance
-            if config1[key] != config2[key]:
+            if config1[key] != config2.get(key):
                 return False
         delta = np.array(
-            [incumbent1[key] - incumbent2[key] for key in self._tunable_keys])
+            [incumbent1[key] - incumbent2.get(key, np.inf)
+             for key in self._tunable_keys])
         return np.linalg.norm(delta) <= self.step
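With hierarchical spaces, two configs need not share every key, so the reachability test now uses `.get`: a missing unordered categorical fails the equality check, and a missing tunable key contributes an infinite delta, putting the point out of reach. A self-contained sketch of that distance test:

```python
import numpy as np

def reachable(incumbent1, incumbent2, tunable_keys, step):
    # Distance between two normalized incumbents; keys absent from
    # incumbent2 contribute an infinite delta, i.e. "not reachable".
    delta = np.array(
        [incumbent1[k] - incumbent2.get(k, np.inf) for k in tunable_keys])
    return np.linalg.norm(delta) <= step

a = {'x': 0.5, 'y': 0.5}
print(reachable(a, {'x': 0.6, 'y': 0.5}, ['x', 'y'], step=0.2))  # True
print(reachable(a, {'x': 0.6}, ['x', 'y'], step=0.2))            # False: 'y' missing
```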
@@ -12,7 +12,7 @@ try:
 except (ImportError, AssertionError):
     from .suggestion import Searcher
 from .flow2 import FLOW2
-from ..tune.space import (add_cost_to_space, unflatten_hierarchical)
+from ..tune.space import unflatten_hierarchical

 import logging
 logger = logging.getLogger(__name__)

@@ -46,10 +46,6 @@ class SearchThread:
         self.cost_attr = cost_attr
         if search_alg:
             self.space = self._space = search_alg.space  # unflattened space
-            # TODO: remove when define_by_run is supported
-            if not isinstance(self._search_alg, FLOW2):
-                # remember const config
-                self._const = add_cost_to_space(self.space, {}, {})

     @classmethod
     def set_eps(cls, time_budget_s):

@@ -63,8 +59,6 @@ class SearchThread:
         else:
             try:
                 config = self._search_alg.suggest(trial_id)
-                # TODO: remove when define_by_run is supported
-                config.update(self._const)
                 config, self.space = unflatten_hierarchical(config, self._space)
             except FloatingPointError:
                 logger.warning(
@@ -17,9 +17,12 @@ This source file is adapted here because ray does not fully support Windows.

 Copyright (c) Microsoft Corporation.
 '''
+import time
+import functools
+import warnings
 import copy
 import logging
-from typing import Any, Dict, Optional, Union, List, Tuple
+from typing import Any, Dict, Optional, Union, List, Tuple, Callable
 import pickle
 from .variant_generator import parse_spec_vars
 from ..tune.sample import Categorical, Domain, Float, Integer, LogUniform, \

@@ -332,13 +335,16 @@ class ConcurrencyLimiter(Searcher):

 try:
     import optuna as ot
-    from optuna.trial import TrialState as OptunaTrialState
+    from optuna.distributions import BaseDistribution as OptunaDistribution
     from optuna.samplers import BaseSampler
+    from optuna.trial import TrialState as OptunaTrialState
+    from optuna.trial import Trial as OptunaTrial
 except ImportError:
     ot = None
-    OptunaTrialState = None
+    OptunaDistribution = None
     BaseSampler = None
+    OptunaTrialState = None
+    OptunaTrial = None

 # (Optional) Default (anonymous) metric when using tune.report(x)
 DEFAULT_METRIC = "_metric"

@@ -346,6 +352,78 @@ DEFAULT_METRIC = "_metric"
 # (Auto-filled) The index of this training iteration.
 TRAINING_ITERATION = "training_iteration"

+# print a warning if define by run function takes longer than this to execute
+DEFINE_BY_RUN_WARN_THRESHOLD_S = 1  # 1 is arbitrary
+def validate_warmstart(parameter_names: List[str],
+                       points_to_evaluate: List[Union[List, Dict]],
+                       evaluated_rewards: List,
+                       validate_point_name_lengths: bool = True):
+    """Generic validation of a Searcher's warm start functionality.
+    Raises exceptions in case of type and length mismatches between
+    parameters.
+    If ``validate_point_name_lengths`` is False, the equality of lengths
+    between ``points_to_evaluate`` and ``parameter_names`` will not be
+    validated.
+    """
+    if points_to_evaluate:
+        if not isinstance(points_to_evaluate, list):
+            raise TypeError(
+                "points_to_evaluate expected to be a list, got {}.".format(
+                    type(points_to_evaluate)))
+        for point in points_to_evaluate:
+            if not isinstance(point, (dict, list)):
+                raise TypeError(
+                    f"points_to_evaluate expected to include list or dict, "
+                    f"got {point}.")
+
+            if validate_point_name_lengths and (
+                    not len(point) == len(parameter_names)):
+                raise ValueError("Dim of point {}".format(point)
+                                 + " and parameter_names {}".format(
+                                     parameter_names) + " do not match.")
+
+    if points_to_evaluate and evaluated_rewards:
+        if not isinstance(evaluated_rewards, list):
+            raise TypeError(
+                "evaluated_rewards expected to be a list, got {}.".format(
+                    type(evaluated_rewards)))
+        if not len(evaluated_rewards) == len(points_to_evaluate):
+            raise ValueError(
+                "Dim of evaluated_rewards {}".format(evaluated_rewards)
+                + " and points_to_evaluate {}".format(points_to_evaluate)
+                + " do not match.")
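A quick illustration of what the validator accepts and rejects (standalone, assuming the function above is in scope; it mirrors the checks shown):

```python
# Passing case: two warm-start points, one reward each.
validate_warmstart(
    parameter_names=['a', 'b'],
    points_to_evaluate=[{'a': 1, 'b': 2}, [3, 4]],
    evaluated_rewards=[0.9, 0.7])

# Failing case: point dimension does not match parameter_names.
try:
    validate_warmstart(['a', 'b'], [{'a': 1}], [])
except ValueError as e:
    print(e)  # Dim of point {'a': 1} and parameter_names ['a', 'b'] do not match.
```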
+class _OptunaTrialSuggestCaptor:
+    """Utility to capture returned values from Optuna's suggest_ methods.
+    This will wrap around the ``optuna.Trial`` object and decorate all
+    ``suggest_`` callables with a function capturing the returned value,
+    which will be saved in the ``captured_values`` dict.
+    """
+
+    def __init__(self, ot_trial: OptunaTrial) -> None:
+        self.ot_trial = ot_trial
+        self.captured_values: Dict[str, Any] = {}
+
+    def _get_wrapper(self, func: Callable) -> Callable:
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # name is always the first arg for suggest_ methods
+            name = kwargs.get("name", args[0])
+            ret = func(*args, **kwargs)
+            self.captured_values[name] = ret
+            return ret
+
+        return wrapper
+
+    def __getattr__(self, item_name: str) -> Any:
+        item = getattr(self.ot_trial, item_name)
+        if item_name.startswith("suggest_") and callable(item):
+            return self._get_wrapper(item)
+        return item
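How the captor is used: a define-by-run function runs against the wrapped trial, and every sampled value is recorded as a side effect. A sketch, assuming optuna is installed and the class above is importable in the same module:

```python
import optuna

study = optuna.create_study()
captor = _OptunaTrialSuggestCaptor(study.ask())  # wrap a fresh optuna trial

def define_search_space(trial):
    trial.suggest_float("a", 6, 8)
    trial.suggest_float("b", 1e-4, 1e-2, log=True)

define_search_space(captor)
print(captor.captured_values)  # e.g. {'a': 7.3..., 'b': 0.0004...}
```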
 class OptunaSearch(Searcher):
     """A wrapper around Optuna to provide trial suggestions.

@@ -355,16 +433,20 @@ class OptunaSearch(Searcher):
     This Searcher is a thin wrapper around Optuna's search algorithms.
     You can pass any Optuna sampler, which will be used to generate
     hyperparameter suggestions.
-    Please note that this wrapper does not support define-by-run, so the
-    search space will be configured before running the optimization. You will
-    also need to use a Tune trainable (e.g. using the function API) with
-    this wrapper.
-    For defining the search space, use ``ray.tune.suggest.optuna.param``
-    (see example).
     Args:
-        space (list): Hyperparameter search space definition for Optuna's
-            sampler. This is a list, and samples for the parameters will
-            be obtained in order.
+        space (dict|Callable): Hyperparameter search space definition for
+            Optuna's sampler. This can be either a :class:`dict` with
+            parameter names as keys and ``optuna.distributions`` as values,
+            or a Callable - in which case, it should be a define-by-run
+            function using ``optuna.trial`` to obtain the hyperparameter
+            values. The function should return either a :class:`dict` of
+            constant values with names as keys, or None.
+            For more information, see https://optuna.readthedocs.io\
+            /en/stable/tutorial/10_key_features/002_configurations.html.
+            .. warning::
+                No actual computation should take place in the define-by-run
+                function. Instead, put the training logic inside the function
+                or class trainable passed to ``tune.run``.
         metric (str): The training result objective value attribute. If None
             but a mode was passed, the anonymous metric `_metric` will be used
             per default.

@@ -411,15 +493,28 @@ class OptunaSearch(Searcher):
             metric="loss",
             mode="min")
         tune.run(trainable, search_alg=optuna_search)
+        # Equivalent Optuna define-by-run function approach:
+        def define_search_space(trial: optuna.Trial):
+            trial.suggest_float("a", 6, 8)
+            trial.suggest_float("b", 1e-4, 1e-2, log=True)
+            # training logic goes into trainable, this is just
+            # for search space definition
+        optuna_search = OptunaSearch(
+            define_search_space,
+            metric="loss",
+            mode="min")
+        tune.run(trainable, search_alg=optuna_search)
     .. versionadded:: 0.8.8
     """
     def __init__(self,
-                 space: Optional[Union[Dict, List[Tuple]]] = None,
+                 space: Optional[Union[Dict[str, "OptunaDistribution"], List[
+                     Tuple], Callable[["OptunaTrial"], Optional[Dict[
+                         str, Any]]]]] = None,
                  metric: Optional[str] = None,
                  mode: Optional[str] = None,
                  points_to_evaluate: Optional[List[Dict]] = None,
-                 sampler: Optional[BaseSampler] = None,
+                 sampler: Optional["BaseSampler"] = None,
                  seed: Optional[int] = None,
                  evaluated_rewards: Optional[List] = None):
         assert ot is not None, (

@@ -490,6 +585,11 @@ class OptunaSearch(Searcher):
             load_if_exists=True)

         if self._points_to_evaluate:
+            validate_warmstart(
+                self._space,
+                self._points_to_evaluate,
+                self._evaluated_rewards,
+                validate_point_name_lengths=not callable(self._space))
             if self._evaluated_rewards:
                 for point, reward in zip(self._points_to_evaluate,
                                          self._evaluated_rewards):

@@ -512,6 +612,37 @@ class OptunaSearch(Searcher):
         self._setup_study(mode)
         return True
+    def _suggest_from_define_by_run_func(
+            self, func: Callable[["OptunaTrial"], Optional[Dict[str, Any]]],
+            ot_trial: "OptunaTrial") -> Dict:
+        captor = _OptunaTrialSuggestCaptor(ot_trial)
+        time_start = time.time()
+        ret = func(captor)
+        time_taken = time.time() - time_start
+        if time_taken > DEFINE_BY_RUN_WARN_THRESHOLD_S:
+            warnings.warn(
+                "Define-by-run function passed in the `space` argument "
+                f"took {time_taken} seconds to "
+                "run. Ensure that actual computation, training takes "
+                "place inside Tune's train functions or Trainables "
+                "passed to `tune.run`.")
+        if ret is not None:
+            if not isinstance(ret, dict):
+                raise TypeError(
+                    "The return value of the define-by-run function "
+                    "passed in the `space` argument should be "
+                    "either None or a `dict` with `str` keys. "
+                    f"Got {type(ret)}.")
+            if not all(isinstance(k, str) for k in ret.keys()):
+                raise TypeError(
+                    "At least one of the keys in the dict returned by the "
+                    "define-by-run function passed in the `space` argument "
+                    "was not a `str`.")
+        return {
+            **captor.captured_values,
+            **ret
+        } if ret else captor.captured_values

     def suggest(self, trial_id: str) -> Optional[Dict]:
         if not self._space:
             raise RuntimeError(

@@ -538,6 +669,14 @@ class OptunaSearch(Searcher):
                     ot_trial, fn)(*args, **kwargs)
                 for (fn, args, kwargs) in self._space
             }
+        elif callable(self._space):
+            if trial_id not in self._ot_trials:
+                self._ot_trials[trial_id] = self._ot_study.ask()
+
+            ot_trial = self._ot_trials[trial_id]
+
+            params = self._suggest_from_define_by_run_func(
+                self._space, ot_trial)
         else:
             # Use Optuna ask interface (since version 2.6.0)
             if trial_id not in self._ot_trials:
@@ -26,6 +26,9 @@ def define_by_run_func(
     for key, domain in space.items():
         if path:
             key = path + '/' + key
+        if isinstance(domain, dict):
+            config.update(define_by_run_func(trial, domain, key))
+            continue
         if not isinstance(domain, sample.Domain):
             config[key] = domain
             continue

@@ -57,7 +60,7 @@ def define_by_run_func(
                 trial.suggest_int(
                     key, domain.lower,
                     domain.upper - int(bool(not quantize)),
-                    step=quantize or 1, log=True)
+                    log=True)
             elif isinstance(sampler, sample.Uniform):
                 # Upper bound should be inclusive for quantization and
                 # exclusive otherwise
@@ -76,7 +79,7 @@ def define_by_run_func(
             if isinstance(choice, dict):
                 key += f":{index}"
                 # the suffix needs to be removed from the final config
-                config[key] = define_by_run_func(trial, choice, key)
+                config.update(define_by_run_func(trial, choice, key))
         else:
             raise ValueError(
                 "Optuna search does not support parameters of type "

@@ -87,6 +90,32 @@ def define_by_run_func(
     return config
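`define_by_run_func` now recurses into plain-dict subspaces, flattening nested keys with '/' separators. A hedged sketch of the kind of hierarchical space this enables (names and the toy objective are illustrative, not from the commit):

```python
from flaml import tune

# A conditional (hierarchical) space: each choice under 'model' is
# itself a dict of tunable domains, selected per trial.
space = {
    'model': tune.choice([
        {'name': 'lgbm', 'num_leaves': tune.randint(4, 64)},
        {'name': 'xgboost', 'max_depth': tune.randint(3, 10)},
    ]),
    'learning_rate': tune.loguniform(1e-4, 1e-1),
}

def train(config):
    model = config['model']
    # toy objective: prefer lr near 1e-2 and small trees
    loss = abs(config['learning_rate'] - 1e-2) \
        + model.get('num_leaves', model.get('max_depth', 0)) / 100
    tune.report(loss=loss)

analysis = tune.run(train, config=space, metric='loss', mode='min',
                    num_samples=10, use_ray=False)
print(analysis.best_config)
```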
+def convert_key(
+    conf: Dict, space: Dict, path: str = ""
+) -> Optional[Dict[str, Any]]:
+    """Convert config keys to define-by-run keys.
+
+    Returns:
+        A dict with converted keys.
+    """
+    config = {}
+    for key, domain in space.items():
+        value = conf[key]
+        if path:
+            key = path + '/' + key
+        if isinstance(domain, dict):
+            config.update(convert_key(conf[key], domain, key))
+        elif isinstance(domain, sample.Categorical):
+            index = indexof(domain, value)
+            config[key + '_choice_'] = index
+            if isinstance(value, dict):
+                key += f":{index}"
+                config.update(convert_key(value, domain.categories[index], key))
+        else:
+            config[key] = value
+    return config
 def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
     '''unflatten hierarchical config'''
     hier = {}

@@ -101,12 +130,18 @@ def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
             hier[true_key], subspace[true_key] = unflatten_hierarchical(
                 value, space[true_key][choice])
         else:
+            if key.endswith("_choice_"):
+                key = key[:-8]
             domain = space.get(key)
             if domain is not None:
                 subspace[key] = domain
                 if isinstance(domain, sample.Domain):
                     sampler = domain.sampler
-                    if isinstance(sampler, sample.Quantized):
+                    if isinstance(domain, sample.Categorical):
+                        value = domain.categories[value]
+                        if isinstance(value, dict):
+                            continue
+                    elif isinstance(sampler, sample.Quantized):
                         q = sampler.q
                         sampler = sampler.sampler
                         if isinstance(sampler, sample.LogUniform):
File diff suppressed because one or more lines are too long
@@ -124,7 +124,8 @@
 "source": [
 "settings = {\n",
 " \"time_budget\": 60, # total running time in seconds\n",
-" \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']\n",
+" \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\n",
+" # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\n",
 " \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # list of ML learners\n",
 " \"task\": 'classification', # task type \n",
 " \"sample\": False, # whether to subsample training data\n",

@@ -265,7 +266,7 @@
 "execution_count": null,
 "source": [
 "from flaml.data import get_output_from_log\n",
-"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
+"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
 "    get_output_from_log(filename = settings['log_file_name'], time_budget = 60)\n",
 "\n",
 "for config in config_history:\n",
@@ -104,10 +104,7 @@
 " \"metric\": 'mape', # primary metric for validation: 'mape' is generally used for forecast tasks\n",
 " \"task\": 'forecast', # task type\n",
 " \"log_file_name\": 'CO2_forecast.log', # flaml log file\n",
-" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
-" # \"estimator_list\": [\"sarimax\"],\n",
-" # \"verbose\": 3,\n",
-" \"split_type\": 'time' # for forecast task, 'split_type' has to be 'time'\n",
+" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
 "}"
 ],
 "outputs": [],

@@ -1355,7 +1352,7 @@
 "execution_count": 11,
 "source": [
 "from flaml.data import get_output_from_log\n",
-"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
+"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
 "    get_output_from_log(filename=settings['log_file_name'], time_budget=300)\n",
 "\n",
 "for config in config_history:\n",
@@ -445,7 +445,7 @@
 "execution_count": 11,
 "source": [
 "from flaml.data import get_output_from_log\n",
-"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
+"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
 "    get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n",
 "\n",
 "for config in config_history:\n",
@@ -362,7 +362,7 @@
 "execution_count": 10,
 "source": [
 "from flaml.data import get_output_from_log\n",
-"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
+"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
 "    get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n",
 "\n",
 "for config in config_history:\n",
4  setup.py
@@ -62,7 +62,7 @@ setuptools.setup(
         "optuna==2.8.0"
     ],
     "ray": [
-        "ray[tune]==1.5.1",
+        "ray[tune]==1.6.0",
         "pyyaml<5.3.1",
     ],
     "azureml": [

@@ -75,7 +75,7 @@ setuptools.setup(
         "vowpalwabbit",
     ],
     "nlp": [
-        "ray[tune]>=1.5.1",
+        "ray[tune]>=1.6.0",
         "transformers",
         "datasets==1.4.1",
         "tensorboardX<=2.2",
@@ -111,7 +111,8 @@ class MyLargeLGBM(LGBMEstimator):


 def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
-                  weight_test=None, weight_train=None):
+                  weight_test=None, weight_train=None, config=None,
+                  groups_test=None, groups_train=None):
     from sklearn.metrics import log_loss
     import time
     start = time.time()
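Custom metric functions now also receive the trial `config` and group arguments. A hedged sketch of a metric with the new signature (FLAML expects a loss to minimize plus a dict of values to log; the body here is illustrative, not the test's actual implementation):

```python
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
                  weight_test=None, weight_train=None, config=None,
                  groups_test=None, groups_train=None):
    # Return (loss_to_minimize, metrics_to_log).
    from sklearn.metrics import log_loss
    y_pred = estimator.predict_proba(X_test)
    test_loss = log_loss(y_test, y_pred, labels=labels,
                         sample_weight=weight_test)
    return test_loss, {'test_loss': test_loss}
```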
@@ -162,7 +163,10 @@ class TestAutoML(unittest.TestCase):
             "sample": True,  # whether to subsample training data
             "log_file_name": "test/wine.log",
             "log_training_metric": True,  # whether to log training metric
-            "ensemble": True,
+            "ensemble": {
+                "final_estimator": MyRegularizedGreedyForest(),
+                "passthrough": False,
+            },
             "n_jobs": 1,
         }

@@ -274,9 +278,9 @@ class TestAutoML(unittest.TestCase):
             task='multi')
         print(estimator)
         time_history, best_valid_loss_history, valid_loss_history, \
-            config_history, train_loss_history = get_output_from_log(
+            config_history, metric_history = get_output_from_log(
                 filename=automl_settings['log_file_name'], time_budget=6)
-        print(train_loss_history)
+        print(metric_history)
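This exercises the new ensemble API from the commit: `ensemble` now accepts a dict of options for the final stacker instead of just a bool. A hedged sketch of the call shape (standard sklearn estimator swapped in for the test's custom learner):

```python
from flaml import AutoML
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(
    X, y, task='classification', time_budget=10,
    ensemble={
        # options forwarded to sklearn's stacking ensemble
        'final_estimator': LogisticRegression(),
        'passthrough': False,
    },
)
```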
     def test_classification(self, as_frame=False):
         automl_experiment = AutoML()

@@ -496,6 +500,30 @@ class TestAutoML(unittest.TestCase):
         print(automl_experiment.best_iteration)
         print(automl_experiment.best_estimator)

+    def test_parallel(self, hpo_method=None):
+        automl_experiment = AutoML()
+        automl_settings = {
+            "time_budget": 10,
+            "task": 'regression',
+            "log_file_name": "test/boston.log",
+            "log_type": "all",
+            "n_jobs": 1,
+            "n_concurrent_trials": 2,
+            "hpo_method": hpo_method,
+        }
+        X_train, y_train = load_boston(return_X_y=True)
+        try:
+            automl_experiment.fit(X_train=X_train, y_train=y_train,
+                                  **automl_settings)
+            print(automl_experiment.predict(X_train))
+            print(automl_experiment.model)
+            print(automl_experiment.config_history)
+            print(automl_experiment.model_history)
+            print(automl_experiment.best_iteration)
+            print(automl_experiment.best_estimator)
+        except ImportError:
+            return
+
     def test_parallel_xgboost(self, hpo_method=None):
         automl_experiment = AutoML()
         automl_settings = {
@@ -1,20 +1,19 @@
-def test_forecast_automl_df(budget=5):
+import numpy as np
+from flaml import AutoML
+
+
+def test_forecast_automl(budget=5):
     # using dataframe
     import statsmodels.api as sm
-    data = sm.datasets.co2.load_pandas()
-    data = data.data
-    data = data['co2'].resample('MS').mean()
-    data = data.fillna(data.bfill())
-    data = data.to_frame().reset_index()
-    data = data.rename(columns={'index': 'ds', 'co2': 'y'})
+    data = sm.datasets.co2.load_pandas().data['co2'].resample('MS').mean()
+    data = data.fillna(data.bfill()).to_frame().reset_index().rename(
+        columns={'index': 'ds', 'co2': 'y'})
     num_samples = data.shape[0]
     time_horizon = 12
     split_idx = num_samples - time_horizon
-    X_train = data[:split_idx]
-    X_test = data[split_idx:]['ds'].to_frame()
-    y_test = data[split_idx:]['y'].to_frame()
-    ''' import AutoML class from flaml package '''
-    from flaml import AutoML
+    df = data[:split_idx]
+    X_test = data[split_idx:]['ds']
+    y_test = data[split_idx:]['y']
     automl = AutoML()
     settings = {
         "time_budget": budget,  # total running time in seconds

@@ -22,13 +21,14 @@ def test_forecast_automl_df(budget=5):
         "task": 'forecast',  # task type
         "log_file_name": 'CO2_forecast.log',  # flaml log file
         "eval_method": "holdout",
-        "split_type": 'time'
     }
     '''The main flaml automl API'''
     try:
-        automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
+        automl.fit(dataframe=df, **settings, period=time_horizon)
     except ImportError:
-        automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
+        print("not using FBProphet due to ImportError")
+        automl.fit(dataframe=df, **settings, estimator_list=[
+            'arima', 'sarimax'], period=time_horizon)
     ''' retrieve best config and best learner'''
     print('Best ML leaner:', automl.best_estimator)
     print('Best hyperparmeter config:', automl.best_config)

@@ -47,7 +47,7 @@ def test_forecast_automl_df(budget=5):
     from flaml.ml import sklearn_metric_loss_score
     print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
     from flaml.data import get_output_from_log
-    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
+    time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
         get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
     for config in config_history:
         print(config)

@@ -55,65 +55,46 @@ def test_forecast_automl_df(budget=5):
     print(automl.max_resource)
     print(automl.min_resource)

-def test_forecast_automl_Xy(budget=5):
-    # using X_train and y_train
-    import statsmodels.api as sm
-    data = sm.datasets.co2.load_pandas()
-    data = data.data
-    data = data['co2'].resample('MS').mean()
-    data = data.fillna(data.bfill())
-    data = data.to_frame().reset_index()
-    num_samples = data.shape[0]
-    time_horizon = 12
-    split_idx = num_samples - time_horizon
-    X_train = data[:split_idx]['index'].to_frame()
-    y_train = data[:split_idx]['co2']
-    X_test = data[split_idx:]['index'].to_frame()
-    y_test = data[split_idx:]['co2'].to_frame()
-    ''' import AutoML class from flaml package '''
-    from flaml import AutoML
+    X_train = df['ds']
+    y_train = df['y']
     automl = AutoML()
     settings = {
         "time_budget": budget,  # total running time in seconds
         "metric": 'mape',  # primary metric
         "task": 'forecast',  # task type
         "log_file_name": 'CO2_forecast.log',  # flaml log file
         "eval_method": "holdout",
-        "split_type": 'time'
     }
     '''The main flaml automl API'''
     try:
-        automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
+        automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
     except ImportError:
-        automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
-    ''' retrieve best config and best learner'''
-    print('Best ML leaner:', automl.best_estimator)
-    print('Best hyperparmeter config:', automl.best_config)
-    print(f'Best mape on validation data: {automl.best_loss}')
-    print(f'Training duration of best run: {automl.best_config_train_time}s')
-    print(automl.model.estimator)
-    ''' pickle and save the automl object '''
-    import pickle
-    with open('automl.pkl', 'wb') as f:
-        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
-    ''' compute predictions of testing dataset '''
-    y_pred = automl.predict(X_test)
-    print('Predicted labels', y_pred)
-    print('True labels', y_test)
-    ''' compute different metric values on testing dataset'''
-    from flaml.ml import sklearn_metric_loss_score
-    print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
-    from flaml.data import get_output_from_log
-    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
-        get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
-    for config in config_history:
-        print(config)
-    print(automl.prune_attr)
-    print(automl.max_resource)
-    print(automl.min_resource)
+        print("not using FBProphet due to ImportError")
+        automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=[
+            'arima', 'sarimax'], period=time_horizon)


+def test_numpy():
+    X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
+    y_train = np.random.random(size=72)
+    automl = AutoML()
+    try:
+        automl.fit(
+            X_train=X_train[:60],  # a single column of timestamp
+            y_train=y_train,  # value for each timestamp
+            period=12,  # time horizon to forecast, e.g., 12 months
+            task='forecast', time_budget=3,  # time budget in seconds
+            log_file_name="test/forecast.log")
+        print(automl.predict(X_train[60:]))
+        print(automl.predict(12))
+    except ValueError:
+        print("ValueError for FBProphet is raised as expected.")
+    except ImportError:
+        print("not using FBProphet due to ImportError")
+        automl = AutoML()
+        automl.fit(
+            X_train=X_train[:72],  # a single column of timestamp
+            y_train=y_train,  # value for each timestamp
+            period=12,  # time horizon to forecast, e.g., 12 months
+            task='forecast', time_budget=1,  # time budget in seconds
+            estimator_list=['arima', 'sarimax'],
+            log_file_name="test/forecast.log")
+        print(automl.predict(X_train[72:]))
+        # an alternative way to specify predict steps for arima/sarimax
+        print(automl.predict(12))


 if __name__ == "__main__":
-    test_forecast_automl_df(60)
-    test_forecast_automl_Xy(60)
+    test_forecast_automl(60)
@@ -42,7 +42,7 @@ def test_automl(budget=5, dataset_format='dataframe'):
     print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
     print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
     from flaml.data import get_output_from_log
-    time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
+    time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
         get_output_from_log(filename=settings['log_file_name'], time_budget=60)
     for config in config_history:
         print(config)
@@ -62,11 +62,11 @@ class TestLogging(unittest.TestCase):
             config = automl.best_config.copy()
             config['learner'] = automl.best_estimator
             automl.trainable({"ml": config})
-            from flaml import tune, CFO
+            from flaml import tune, BlendSearch
             from flaml.automl import size
             from functools import partial
-            search_alg = CFO(
-                metric='val_loss',
+            search_alg = BlendSearch(
+                metric='val_loss', mode='min',
                 space=automl.search_space,
                 low_cost_partial_config=automl.low_cost_partial_config,
                 points_to_evaluate=automl.points_to_evaluate,
@@ -74,5 +74,41 @@ def test_groups():
     automl.fit(X, y, **automl_settings)


+def test_rank():
+    from sklearn.externals._arff import ArffException
+    try:
+        X, y = fetch_openml(name=dataset, return_X_y=True)
+        y = y.cat.codes
+    except (ArffException, ValueError):
+        from sklearn.datasets import load_wine
+        X, y = load_wine(return_X_y=True)
+    import numpy as np
+    automl = AutoML()
+    automl_settings = {
+        "time_budget": 2,
+        "task": "rank",
+        "log_file_name": "test/{}.log".format(dataset),
+        "model_history": True,
+        "eval_method": "cv",
+        "groups": np.array(  # group labels
+            [0] * 200 + [1] * 200 + [2] * 200 + [3] * 200 + [4] * 100 + [5] * 100),
+        "learner_selector": "roundrobin",
+    }
+    automl.fit(X, y, **automl_settings)
+
+    automl = AutoML()
+    automl_settings = {
+        "time_budget": 2,
+        "task": "rank",
+        "metric": "ndcg@5",  # 5 can be replaced by any number
+        "log_file_name": "test/{}.log".format(dataset),
+        "model_history": True,
+        "groups": [200] * 4 + [100] * 2,  # alternative way: group counts
+        # "estimator_list": ['lgbm', 'xgboost'],  # list of ML learners
+        "learner_selector": "roundrobin",
+    }
+    automl.fit(X, y, **automl_settings)
+
+
 if __name__ == "__main__":
     unittest.main()