warmstart blendsearch (#186)

* increase test coverage

* use define by run only when needed

* warmstart bs

* classification -> binary, multi

* warm start with evaluated rewards (see the sketch after this list)

* data transformer; resource attr for gs

* BlendSearchTuner bug fix and unittest

* bug fix

* docstr and import

* task type
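
A minimal sketch of the warm-start API this commit adds (hyperparameter names, ranges, and reward values below are illustrative, not taken from the commit): BlendSearch now accepts evaluated_rewards alongside points_to_evaluate, and mode must be set for the rewards to be used.

    from flaml import BlendSearch, tune

    # search space and previously evaluated configs (illustrative values)
    space = {
        "n_estimators": tune.randint(4, 1000),
        "learning_rate": tune.loguniform(1e-3, 1.0),
    }
    points_to_evaluate = [
        {"n_estimators": 100, "learning_rate": 0.1},
        {"n_estimators": 4, "learning_rate": 0.5},
    ]
    evaluated_rewards = [0.91, 0.78]  # one reward per point, in the same order

    searcher = BlendSearch(
        metric="accuracy", mode="max",  # mode is required for the rewards to be consumed
        space=space,
        points_to_evaluate=points_to_evaluate,
        evaluated_rewards=evaluated_rewards,
    )
    # pass `searcher` as search_alg to flaml.tune.run or ray.tune.run as usual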
Chi Wang 2021-09-04 01:42:21 -07:00 committed by GitHub
parent 5fdfa2559b
commit e46573a01d
26 changed files with 599 additions and 707 deletions

View File

@ -5,7 +5,6 @@
'''
import time
from typing import Callable, Optional
import warnings
from functools import partial
import numpy as np
from scipy.sparse import issparse
@ -144,9 +143,8 @@ class SearchState:
class AutoMLState:
def _prepare_sample_train_data(self, sample_size):
full_size = len(self.y_train)
sampled_weight = groups = None
if sample_size <= full_size:
if sample_size <= self.data_size:
if isinstance(self.X_train, pd.DataFrame):
sampled_X_train = self.X_train.iloc[:sample_size]
else:
@ -225,13 +223,13 @@ class AutoMLState:
self, estimator, config_w_resource, sample_size=None
):
if not sample_size:
sample_size = config_w_resource['FLAML_sample_size']
sample_size = config_w_resource.get(
'FLAML_sample_size', len(self.y_train_all))
config = config_w_resource.get('ml', config_w_resource).copy()
if 'FLAML_sample_size' in config:
del config['FLAML_sample_size']
if "learner" in config:
del config['learner']
assert sample_size is not None
del config["learner"]
sampled_X_train, sampled_y_train, sampled_weight, groups = \
self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
@ -316,10 +314,7 @@ class AutoML:
'''An object with `predict()` and `predict_proba()` method (for
classification), storing the best trained model.
'''
if self._trained_estimator:
return self._trained_estimator
else:
return None
return self.__dict__.get('_trained_estimator')
def best_model_for_estimator(self, estimator_name):
'''Return the best model found for a particular estimator
@ -331,11 +326,8 @@ class AutoML:
An object with `predict()` and `predict_proba()` method (for
classification), storing the best trained model for estimator_name.
'''
if estimator_name in self._search_states:
state = self._search_states[estimator_name]
if hasattr(state, 'trained_estimator'):
return state.trained_estimator
return None
state = self._search_states.get(estimator_name)
return state and getattr(state, 'trained_estimator', None)
@property
def best_estimator(self):
@ -374,10 +366,12 @@ class AutoML:
@property
def classes_(self):
'''A list of n_classes elements for class labels.'''
if self._label_transformer:
return self._label_transformer.classes_.tolist()
if self._trained_estimator:
return self._trained_estimator.classes_.tolist()
attr = getattr(self, "label_transformer", None)
if attr:
return attr.classes_.tolist()
attr = getattr(self, "_trained_estimator", None)
if attr:
return attr.classes_.tolist()
return None
def predict(self, X_test):
@ -394,12 +388,13 @@ class AutoML:
An array-like of shape n * 1 - - each element is a predicted
label for an instance.
'''
if self._trained_estimator is None:
warnings.warn(
estimator = getattr(self, "_trained_estimator", None)
if estimator is None:
logger.warning(
"No estimator is trained. Please run fit with enough budget.")
return None
X_test = self._preprocess(X_test)
y_pred = self._trained_estimator.predict(X_test)
y_pred = estimator.predict(X_test)
if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
y_pred = y_pred.flatten()
if self._label_transformer:
@ -443,10 +438,9 @@ class AutoML:
dataframe = dataframe.copy()
dataframe = dataframe.rename(columns={label[0]: 'ds', label[1]: 'y'})
elif dataframe is not None:
if ('ds' not in dataframe) or ('y' not in dataframe):
raise ValueError(
'For forecasting task, dataframe must have columns "ds" and "y" '
'with the dates and values respectively.')
assert 'ds' in dataframe and 'y' in dataframe, (
'For forecasting task, dataframe must have columns '
'"ds" and "y" with the dates and values respectively.')
elif (X_train_all is not None) and (y_train_all is not None):
dataframe = pd.DataFrame(X_train_all)
dataframe = dataframe.rename(columns={dataframe.columns[0]: 'ds'})
@ -456,30 +450,29 @@ class AutoML:
label = 'y'
if X_train_all is not None and y_train_all is not None:
if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)):
raise ValueError(
"X_train_all must be a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
if not (isinstance(y_train_all, np.ndarray)
or isinstance(y_train_all, pd.Series)):
raise ValueError(
"y_train_all must be a numpy array or a pandas series.")
if X_train_all.size == 0 or y_train_all.size == 0:
raise ValueError("Input data must not be empty.")
assert (
isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)), (
"X_train_all must be a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
assert (
isinstance(y_train_all, np.ndarray)
or isinstance(y_train_all, pd.Series)), (
"y_train_all must be a numpy array or a pandas series.")
assert X_train_all.size != 0 and y_train_all.size != 0, (
"Input data must not be empty.")
if isinstance(y_train_all, np.ndarray):
y_train_all = y_train_all.flatten()
if X_train_all.shape[0] != y_train_all.shape[0]:
raise ValueError(
"# rows in X_train must match length of y_train.")
assert X_train_all.shape[0] == y_train_all.shape[0], (
"# rows in X_train must match length of y_train.")
self._df = isinstance(X_train_all, pd.DataFrame)
self._nrow, self._ndim = X_train_all.shape
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
if not isinstance(dataframe, pd.DataFrame):
raise ValueError("dataframe must be a pandas DataFrame")
if label not in dataframe.columns:
raise ValueError("label must a column name in dataframe")
assert isinstance(dataframe, pd.DataFrame), (
"dataframe must be a pandas DataFrame")
assert label in dataframe.columns, (
"label must a column name in dataframe")
self._df = True
X = dataframe.drop(columns=label)
self._nrow, self._ndim = X.shape
@ -498,23 +491,21 @@ class AutoML:
self._label_transformer = self._transformer.label_transformer
self._sample_weight_full = self._state.fit_kwargs.get('sample_weight')
if X_val is not None and y_val is not None:
if not (isinstance(X_val, np.ndarray) or issparse(X_val)
or isinstance(X_val, pd.DataFrame)):
raise ValueError(
"X_val must be None, a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
if not (isinstance(y_val, np.ndarray)
or isinstance(y_val, pd.Series)):
raise ValueError(
"y_val must be None, a numpy array or a pandas series.")
if X_val.size == 0 or y_val.size == 0:
raise ValueError(
"Validation data are expected to be nonempty. "
"Use None for X_val and y_val if no validation data.")
assert (
isinstance(X_val, np.ndarray) or issparse(X_val)
or isinstance(X_val, pd.DataFrame)), (
"X_val must be None, a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
assert (
isinstance(y_val, np.ndarray) or isinstance(y_val, pd.Series)
), "y_val must be None, a numpy array or a pandas series."
assert X_val.size != 0 and y_val.size != 0, (
"Validation data are expected to be nonempty. "
"Use None for X_val and y_val if no validation data.")
if isinstance(y_val, np.ndarray):
y_val = y_val.flatten()
if X_val.shape[0] != y_val.shape[0]:
raise ValueError("# rows in X_val must match length of y_val.")
assert X_val.shape[0] == y_val.shape[0], (
"# rows in X_val must match length of y_val.")
if self._transformer:
self._state.X_val = self._transformer.transform(X_val)
else:
@ -548,7 +539,7 @@ class AutoML:
X_train_all, y_train_all = self._X_train_all, self._y_train_all
if issparse(X_train_all):
X_train_all = X_train_all.tocsr()
if self._state.task in ('binary:logistic', 'multi:softmax') \
if self._state.task in ('binary', 'multi') \
and self._state.fit_kwargs.get('sample_weight') is None \
and self._split_type != 'time':
# logger.info(f"label {pd.unique(y_train_all)}")
@ -638,7 +629,7 @@ class AutoML:
y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
self._state.groups, self._state.groups_val = self._state.groups[
train_idx], self._state.groups[val_idx]
elif self._state.task != 'regression':
elif self._state.task in ('binary', 'multi'):
# for classification, make sure the labels are complete in both
# training and validation data
label_set, first = np.unique(y_train_all, return_index=True)
@ -760,7 +751,7 @@ class AutoML:
record_id: An integer of the record ID in the file,
0 corresponds to the first trial
task: A string of the task type,
'binary', 'multi', or 'regression'
'binary', 'multi', 'regression', 'forecast', 'rank'
Returns:
An estimator object for the given configuration
@ -875,9 +866,10 @@ class AutoML:
best_val_loss = val_loss
sample_size = size
if not training_duration:
logger.warning(
f"No estimator found within time_budget={time_budget}")
from .model import BaseEstimator as Estimator
self._trained_estimator = Estimator()
self._trained_estimator.model = None
return training_duration
if not best:
return
@ -898,11 +890,7 @@ class AutoML:
elif eval_method == 'auto':
eval_method = self._decide_eval_method(time_budget)
self.modelcount = 0
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs['period'])
self._prepare_data(eval_method, split_ratio, n_splits)
self._state.time_budget = None
self._state.n_jobs = n_jobs
self._trained_estimator = self._state._train_with_config(
@ -911,9 +899,10 @@ class AutoML:
return training_duration
def _decide_split_type(self, split_type):
if self._state.task in ('classification', 'binary', 'multi'):
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
if self._state.task in ('binary', 'multi'):
assert split_type in [None, "stratified", "uniform", "time"]
self._split_type = split_type or "stratified"
elif self._state.task == 'regression':
@ -1248,13 +1237,14 @@ class AutoML:
For time series forecasting, must be None or 'time'.
For ranking task, must be None or 'group'.
hpo_method: str or None, default=None | The hyperparameter
optimization method. When it is None, CFO is used.
optimization method. By default, CFO is used for sequential
search and BlendSearch is used for parallel search.
No need to set when using flaml's default search space or using
a simple customized search space. When set to 'bs', BlendSearch
is used. BlendSearch can be tried when the search space is
complex, for example, containing multiple disjoint, discontinuous
subspaces. When set to 'random' and the argument 'n_concurrent_trials'
is larger than 1, RandomSearch is used.
subspaces. When set to 'random' and the argument
`n_concurrent_trials` is larger than 1, random search is used.
starting_points: A dictionary to specify the starting hyperparameter
config for the estimators.
Keys are the name of the estimators, and values are the starting
@ -1355,8 +1345,7 @@ class AutoML:
estimator_list))
self.estimator_list = estimator_list
self._hpo_method = hpo_method or (
'cfo' if n_concurrent_trials == 1 or len(estimator_list) == 1
else 'bs')
'cfo' if n_concurrent_trials == 1 else 'bs')
self._state.time_budget = time_budget
self._active_estimators = estimator_list.copy()
self._ensemble = ensemble
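
The hunk above changes the default so that hpo_method falls back to CFO only when n_concurrent_trials == 1 and to BlendSearch otherwise, matching the updated docstring. A hedged usage sketch (dataset and budget are illustrative):

    from flaml import AutoML
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    automl = AutoML()
    # sequential search: CFO is used by default
    automl.fit(X, y, task="classification", time_budget=60)
    # parallel search (requires flaml[ray]): BlendSearch is used by default
    automl.fit(X, y, task="classification", time_budget=60, n_concurrent_trials=2)
    # or force a method explicitly
    automl.fit(X, y, task="classification", time_budget=60, hpo_method="bs")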
@ -1379,14 +1368,16 @@ class AutoML:
if self._best_estimator:
logger.info("fit succeeded")
logger.info(f"Time taken to find the best model: {self._time_taken_best_iter}")
if self._time_taken_best_iter >= time_budget * 0.7 and not all(
if self._hpo_method in ('cfo', 'bs') and (
self._time_taken_best_iter >= time_budget * 0.7) and not all(
state.search_alg and state.search_alg.searcher.is_ls_ever_converged
for state in self._search_states.values()
):
logger.warn("Time taken to find the best model is {0:.0f}% of the "
"provided time budget and not all estimators' hyperparameter "
"search converged. Consider increasing the time budget.".format(
self._time_taken_best_iter / time_budget * 100))
logger.warning(
"Time taken to find the best model is {0:.0f}% of the "
"provided time budget and not all estimators' hyperparameter "
"search converged. Consider increasing the time budget.".format(
self._time_taken_best_iter / time_budget * 100))
if not keep_search_state:
# release space
@ -1413,20 +1404,16 @@ class AutoML:
"Please run pip install flaml[ray]")
if self._hpo_method in ('cfo', 'grid'):
from flaml import CFO as SearchAlgo
elif 'optuna' == self._hpo_method:
from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
elif 'bs' == self._hpo_method:
from flaml import BlendSearch as SearchAlgo
elif 'cfocat' == self._hpo_method:
from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
elif 'random' == self._hpo_method:
from ray.tune.suggest import BasicVariantGenerator as SearchAlgo
from ray.tune.sample import Domain as RayDomain
from .tune.sample import Domain
from ray.tune.sample import Domain
else:
raise NotImplementedError(
f"hpo_method={self._hpo_method} is not recognized. "
"'cfo' and 'bs' are supported.")
space = self.search_space
if self._hpo_method == 'random':
# Any point in points_to_evaluate must consist of hyperparameters
# that are tunable, which can be identified by checking whether
@ -1434,19 +1421,19 @@ class AutoML:
# the 'Domain' class from flaml or ray.tune
points_to_evaluate = self.points_to_evaluate.copy()
to_del = []
for k, v in self.search_space.items():
if not (isinstance(v, Domain) or isinstance(v, RayDomain)):
for k, v in space.items():
if not isinstance(v, Domain):
to_del.append(k)
for k in to_del:
for p in points_to_evaluate:
del p[k]
search_alg = SearchAlgo(max_concurrent=self._n_concurrent_trials,
points_to_evaluate=points_to_evaluate)
if k in p:
del p[k]
search_alg = SearchAlgo(
max_concurrent=self._n_concurrent_trials,
points_to_evaluate=points_to_evaluate)
else:
search_alg = SearchAlgo(
metric='val_loss',
space=self.search_space,
metric='val_loss', space=space,
low_cost_partial_config=self.low_cost_partial_config,
points_to_evaluate=self.points_to_evaluate,
cat_hp_cost=self.cat_hp_cost,
@ -1463,7 +1450,7 @@ class AutoML:
resources_per_trial = {
"cpu": self._state.n_jobs} if self._state.n_jobs > 1 else None
analysis = ray.tune.run(
self.trainable, search_alg=search_alg, config=self.search_space,
self.trainable, search_alg=search_alg, config=space,
metric='val_loss', mode='min', resources_per_trial=resources_per_trial,
time_budget_s=self._state.time_budget, num_samples=self._max_iter,
verbose=self.verbose)
@ -1521,6 +1508,7 @@ class AutoML:
from flaml import CFO as SearchAlgo
elif 'optuna' == self._hpo_method:
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
except (ImportError, AssertionError):
@ -1600,7 +1588,9 @@ class AutoML:
else:
algo = SearchAlgo(
metric='val_loss', mode='min', space=search_space,
points_to_evaluate=points_to_evaluate,
points_to_evaluate=points_to_evaluate
if len(search_state.init_config) == len(
search_space) else None,
)
search_state.search_alg = ConcurrencyLimiter(algo,
max_concurrent=1)
@ -1710,13 +1700,16 @@ class AutoML:
search_state.best_loss,
self._best_estimator,
self._state.best_loss))
if all(state.search_alg and state.search_alg.searcher.is_ls_ever_converged
for state in self._search_states.values()) and (
self._state.time_from_start
> self._warn_threshold * self._time_taken_best_iter):
logger.warn("All estimator hyperparameters local search has converged at least once, "
f"and the total search time exceeds {self._warn_threshold} times the time taken "
"to find the best model.")
if self._hpo_method in ('cfo', 'bs') and all(
state.search_alg and state.search_alg.searcher.is_ls_ever_converged
for state in self._search_states.values()) and (
self._state.time_from_start
> self._warn_threshold * self._time_taken_best_iter):
logger.warning(
"All estimator hyperparameters local search has "
"converged at least once, and the total search time "
f"exceeds {self._warn_threshold} times the time taken "
"to find the best model.")
self._warn_threshold *= 10
else:
logger.info(f"no enough budget for learner {estimator}")
@ -1766,6 +1759,8 @@ class AutoML:
self._best_estimator = None
self._retrained_config = {}
self._warn_threshold = 10
self._selected = None
self.modelcount = 0
if self._n_concurrent_trials == 1:
self._search_sequential()
@ -1782,7 +1777,7 @@ class AutoML:
if self._trained_estimator:
logger.info(f'selected model: {self._trained_estimator.model}')
if self._ensemble and self._state.task in (
'binary:logistic', 'multi:softmax', 'regression',
'binary', 'multi', 'regression',
):
search_states = list(x for x in self._search_states.items()
if x[1].trained_estimator)
@ -1795,7 +1790,7 @@ class AutoML:
logger.info(estimators)
if len(estimators) <= 1:
return
if self._state.task in ('binary:logistic', 'multi:softmax'):
if self._state.task in ('binary', 'multi'):
from sklearn.ensemble import StackingClassifier as Stacker
else:
from sklearn.ensemble import StackingRegressor as Stacker
@ -1838,9 +1833,6 @@ class AutoML:
else:
logger.info(
"not retraining because the time budget is too small.")
else:
self._selected = self._trained_estimator = None
self.modelcount = 0
if self.model and mlflow is not None and mlflow.active_run():
mlflow.sklearn.log_model(self.model, 'best_model')
@ -1886,8 +1878,7 @@ class AutoML:
speed = delta_loss / delta_time
if speed:
estimated_cost = max(2 * gap / speed, estimated_cost)
if estimated_cost == 0:
estimated_cost = 1e-10
estimated_cost = estimated_cost or 1e-10
inv.append(1 / estimated_cost)
else:
estimated_cost = self._eci[i]

View File

@ -261,7 +261,7 @@ class DataTransformer:
cat_columns, num_columns, datetime_columns
self._drop = drop
if task in ('binary:logistic', 'multi:softmax'):
if task in ('binary', 'multi', 'classification'):
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()
y = self.label_transformer.fit_transform(y)

View File

@ -24,7 +24,7 @@ def get_estimator_class(task, estimator_name):
''' when adding a new learner, need to add an elif branch '''
if 'xgboost' == estimator_name:
if 'regression' in task:
if 'regression' == task:
estimator_class = XGBoostEstimator
else:
estimator_class = XGBoostSklearnEstimator
@ -179,7 +179,8 @@ def _eval_estimator(config, estimator, X_train, y_train, X_test, y_test, weight_
fit_kwargs.get('groups'))
if isinstance(metric_for_logging, dict):
pred_time = metric_for_logging.get('pred_time', 0)
test_pred_y = None # eval_metric may return test_pred_y but not necessarily. Setting None for now.
test_pred_y = None
# eval_metric may return test_pred_y but not necessarily. Setting None for now.
return test_loss, metric_for_logging, pred_time, test_pred_y
@ -193,10 +194,10 @@ def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_te
# fit_kwargs['X_val'] = X_test
# fit_kwargs['y_val'] = y_test
estimator.fit(X_train, y_train, budget, **fit_kwargs)
test_loss, metric_for_logging, pred_time, _ = _eval_estimator(config, estimator,
X_train, y_train, X_test, y_test,
weight_test, groups_test, eval_metric, obj,
labels, log_training_metric, fit_kwargs)
test_loss, metric_for_logging, pred_time, _ = _eval_estimator(
config, estimator, X_train, y_train, X_test, y_test,
weight_test, groups_test, eval_metric, obj,
labels, log_training_metric, fit_kwargs)
train_time = time.time() - start
return test_loss, metric_for_logging, train_time, pred_time
@ -212,7 +213,7 @@ def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if task == 'binary:logistics' or task == 'multi:softmax':
if task in ('binary', 'multi'):
labels = np.unique(y_train_all)
else:
labels = None
@ -346,9 +347,9 @@ def train_estimator(
def get_classification_objective(num_labels: int) -> str:
if num_labels == 2:
objective_name = 'binary:logistic'
objective_name = 'binary'
else:
objective_name = 'multi:softmax'
objective_name = 'multi'
return objective_name
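
For reference, the renamed task labels returned by the helper above (a small sketch, assuming the helper is imported from flaml.ml as in the upstream repo):

    from flaml.ml import get_classification_objective

    assert get_classification_objective(2) == "binary"   # two labels
    assert get_classification_objective(10) == "multi"   # more than two labels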

View File

@ -3,7 +3,6 @@
* Licensed under the MIT License.
'''
import warnings
import numpy as np
import xgboost as xgb
import time
@ -31,12 +30,12 @@ class BaseEstimator:
for both regression and classification
'''
def __init__(self, task='binary:logistic', **params):
def __init__(self, task='binary', **params):
'''Constructor
Args:
task: A string of the task type, one of
'binary:logistic', 'multi:softmax', 'regression'
'binary', 'multi', 'regression', 'rank', 'forecast'
n_jobs: An integer of the number of parallel threads
params: A dictionary of the hyperparameter names and values
'''
@ -48,7 +47,7 @@ class BaseEstimator:
del self.params['_estimator_type']
else:
self._estimator_type = "classifier" if task in (
'binary:logistic', 'multi:softmax') else "regressor"
'binary', 'multi') else "regressor"
def get_params(self, deep=False):
params = self.params.copy()
@ -145,11 +144,10 @@ class BaseEstimator:
Each element at (i,j) is the probability for instance i to be in
class j
'''
if 'regression' in self._task:
raise ValueError('Regression tasks do not support predict_prob')
else:
X_test = self._preprocess(X_test)
return self._model.predict_proba(X_test)
assert self._task in ('binary', 'multi'), (
'predict_prob() only for classification task.')
X_test = self._preprocess(X_test)
return self._model.predict_proba(X_test)
def cleanup(self):
pass
@ -193,7 +191,7 @@ class BaseEstimator:
class SKLearnEstimator(BaseEstimator):
def __init__(self, task='binary:logistic', **params):
def __init__(self, task='binary', **params):
super().__init__(task, **params)
def _preprocess(self, X):
@ -264,21 +262,18 @@ class LGBMEstimator(BaseEstimator):
n_estimators = int(round(config['n_estimators']))
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
def __init__(self, task='binary:logistic', log_max_bin=8, **params):
def __init__(self, task='binary', log_max_bin=8, **params):
super().__init__(task, **params)
if "objective" not in self.params:
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
if 'regression' == task:
objective = 'regression'
elif 'binary' in task:
objective = 'regression'
if 'binary' in task:
objective = 'binary'
elif 'multi' in task:
objective = 'multiclass'
elif 'rank' == task:
objective = 'lambdarank'
else:
objective = 'regression'
self.params["objective"] = objective
if "n_estimators" in self.params:
self.params["n_estimators"] = int(round(self.params["n_estimators"]))
@ -477,7 +472,7 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
return XGBoostEstimator.cost_relative2lgbm()
def __init__(
self, task='binary:logistic', n_jobs=1,
self, task='binary', n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0,
min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
@ -506,11 +501,10 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
'use_label_encoder': params.get('use_label_encoder', False),
})
if 'regression' == task:
self.estimator_class = xgb.XGBRegressor
elif 'rank' == task:
self.estimator_class = xgb.XGBRegressor
if 'rank' == task:
self.estimator_class = xgb.XGBRanker
else:
elif task in ('binary', 'multi'):
self.estimator_class = xgb.XGBClassifier
self._time_per_iter = None
self._train_size = 0
@ -543,7 +537,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
'low_cost_init_value': 4,
},
}
if task != 'regression':
if task in ('binary', 'multi'):
space['criterion'] = {
'domain': tune.choice(['gini', 'entropy']),
# 'init_value': 'gini',
@ -555,7 +549,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
return 2.0
def __init__(
self, task='binary:logistic', n_jobs=1,
self, task='binary', n_jobs=1,
n_estimators=4, max_features=1.0, criterion='gini', max_leaves=4,
**params
):
@ -569,9 +563,8 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
'max_features': float(max_features),
"max_leaf_nodes": params.get('max_leaf_nodes', int(round(max_leaves))),
})
if 'regression' in task:
self.estimator_class = RandomForestRegressor
else:
self.estimator_class = RandomForestRegressor
if task in ('binary', 'multi'):
self.estimator_class = RandomForestClassifier
self.params['criterion'] = criterion
@ -586,7 +579,7 @@ class ExtraTreeEstimator(RandomForestEstimator):
def cost_relative2lgbm(cls):
return 1.9
def __init__(self, task='binary:logistic', **params):
def __init__(self, task='binary', **params):
super().__init__(task, **params)
if 'regression' in task:
self.estimator_class = ExtraTreesRegressor
@ -610,7 +603,7 @@ class LRL1Classifier(SKLearnEstimator):
return 160
def __init__(
self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
self, task='binary', n_jobs=1, tol=0.0001, C=1.0,
**params
):
super().__init__(task, **params)
@ -621,11 +614,9 @@ class LRL1Classifier(SKLearnEstimator):
'solver': params.get("solver", 'saga'),
'n_jobs': n_jobs,
})
if 'regression' in task:
self.estimator_class = None
raise NotImplementedError('LR does not support regression task')
else:
self.estimator_class = LogisticRegression
assert task in ('binary', 'multi'), (
'LogisticRegression for classification task only')
self.estimator_class = LogisticRegression
class LRL2Classifier(SKLearnEstimator):
@ -639,7 +630,7 @@ class LRL2Classifier(SKLearnEstimator):
return 25
def __init__(
self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
self, task='binary', n_jobs=1, tol=0.0001, C=1.0,
**params
):
super().__init__(task, **params)
@ -650,11 +641,9 @@ class LRL2Classifier(SKLearnEstimator):
'solver': params.get("solver", 'lbfgs'),
'n_jobs': n_jobs,
})
if 'regression' in task:
self.estimator_class = None
raise NotImplementedError('LR does not support regression task')
else:
self.estimator_class = LogisticRegression
assert task in ('binary', 'multi'), (
'LogisticRegression for classification task only')
self.estimator_class = LogisticRegression
class CatBoostEstimator(BaseEstimator):
@ -711,7 +700,7 @@ class CatBoostEstimator(BaseEstimator):
return X
def __init__(
self, task='binary:logistic', n_jobs=1,
self, task='binary', n_jobs=1,
n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
):
super().__init__(task, **params)
@ -723,10 +712,9 @@ class CatBoostEstimator(BaseEstimator):
'verbose': params.get('verbose', False),
'random_seed': params.get("random_seed", 10242048),
})
if 'regression' in task:
from catboost import CatBoostRegressor
self.estimator_class = CatBoostRegressor
else:
from catboost import CatBoostRegressor
self.estimator_class = CatBoostRegressor
if task in ('binary', 'multi'):
from catboost import CatBoostClassifier
self.estimator_class = CatBoostClassifier
@ -831,7 +819,7 @@ class KNeighborsEstimator(BaseEstimator):
return 30
def __init__(
self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
self, task='binary', n_jobs=1, n_neighbors=5, **params
):
super().__init__(task, **params)
self.params.update({
@ -839,10 +827,9 @@ class KNeighborsEstimator(BaseEstimator):
'weights': params.get('weights', 'distance'),
'n_jobs': n_jobs,
})
if 'regression' in task:
from sklearn.neighbors import KNeighborsRegressor
self.estimator_class = KNeighborsRegressor
else:
from sklearn.neighbors import KNeighborsRegressor
self.estimator_class = KNeighborsRegressor
if task in ('binary', 'multi'):
from sklearn.neighbors import KNeighborsClassifier
self.estimator_class = KNeighborsClassifier
@ -920,7 +907,7 @@ class FBProphet(BaseEstimator):
forecast = self._model.predict(X_test)
return forecast['yhat']
else:
warnings.warn(
logger.warning(
"Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X_test.shape[0])
@ -954,8 +941,9 @@ class ARIMA(FBProphet):
return train_df
def fit(self, X_train, y_train, budget=None, **kwargs):
from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
import warnings
warnings.filterwarnings("ignore")
from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
current_time = time.time()
train_df = self._join(X_train, y_train)
model = ARIMA_estimator(

View File

@ -29,12 +29,11 @@ class AutoTransformers:
.. code-block:: python
autohf = AutoTransformers()
autohf_settings = {"resources_per_trial": {"cpu": 1},
"num_samples": -1,
"time_budget": 100000,
"ckpt_per_epoch": 1,
"fp16": False,
}
autohf_settings = {
"resources_per_trial": {"cpu": 1, "gpu": 1},
"num_samples": -1,
"time_budget": 60,
}
validation_metric, analysis = autohf.fit(**autohf_settings)
@ -45,10 +44,11 @@ class AutoTransformers:
search_space = {}
if mode == "grid":
# TODO add test
for each_hp in config_json.keys():
this_config = config_json[each_hp]
assert isinstance(this_config, dict) or isinstance(this_config, list), \
"config of " + each_hp + " must be dict or list"
"config of " + each_hp + " must be dict or list for grid search"
search_space[each_hp] = ray.tune.grid_search(this_config)
else:
for each_hp in config_json.keys():
@ -85,10 +85,6 @@ class AutoTransformers:
search_space_hpo_json,
mode=self.jobid_config.mod)
@staticmethod
def _wrapper(func, *args): # with star
return func(*args)
@staticmethod
def _get_split_name(data_raw, fold_name=None):
if fold_name:
@ -179,7 +175,7 @@ class AutoTransformers:
data_raw = load_dataset(JobID.dataset_list_to_str(self.jobid_config.dat),
self.jobid_config.subdat)
else:
data_raw = AutoTransformers._wrapper(load_dataset, *self.jobid_config.dat)
data_raw = load_dataset(*self.jobid_config.dat)
self._train_name, self._dev_name, self._test_name = AutoTransformers._get_split_name(
data_raw,
@ -349,6 +345,7 @@ class AutoTransformers:
return training_args_config, per_model_config
def _objective(self, config, reporter, checkpoint_dir=None):
# TODO add test
from transformers.trainer_utils import set_seed
self._set_transformers_verbosity(self._transformers_verbose)
@ -827,6 +824,7 @@ class AutoTransformers:
test_trainer = TrainerForAutoTransformers(best_model, training_args)
if self.jobid_config.spt == "ori":
# TODO add test
if "label" in self.test_dataset.features.keys():
self.test_dataset.remove_columns_("label")
print("Cleaning the existing label column from test data")

View File

@ -1,2 +1,2 @@
from .trial_scheduler import TrialScheduler, FIFOScheduler
from .trial_scheduler import TrialScheduler
from .online_scheduler import OnlineScheduler, OnlineSuccessiveDoublingScheduler, ChaChaScheduler

View File

@ -1,12 +1,12 @@
import numpy as np
import logging
from typing import Optional, Dict
from flaml.scheduler import FIFOScheduler, TrialScheduler
from typing import Dict
from flaml.scheduler import TrialScheduler
from flaml.tune import Trial
logger = logging.getLogger(__name__)
class OnlineScheduler(FIFOScheduler):
class OnlineScheduler(TrialScheduler):
"""Implementation of the OnlineFIFOSchedulers.
Methods:

View File

@ -17,10 +17,8 @@ This source file is adapted here because ray does not fully support Windows.
Copyright (c) Microsoft Corporation.
'''
from typing import Dict, Optional
from flaml.tune import trial_runner
from flaml.tune.result import DEFAULT_METRIC
from flaml.tune.trial import Trial
@ -31,127 +29,10 @@ class TrialScheduler:
PAUSE = "PAUSE" #: Status for pausing trial execution
STOP = "STOP" #: Status for stopping trial execution
_metric = None
@property
def metric(self):
return self._metric
def set_search_properties(self, metric: Optional[str],
mode: Optional[str]) -> bool:
"""Pass search properties to scheduler.
This method acts as an alternative to instantiating schedulers
that react to metrics with their own `metric` and `mode` parameters.
Args:
metric (str): Metric to optimize
mode (str): One of ["min", "max"]. Direction to optimize.
"""
if self._metric and metric:
return False
if metric:
self._metric = metric
if self._metric is None:
# Per default, use anonymous metric
self._metric = DEFAULT_METRIC
return True
def on_trial_add(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial):
"""Called when a new trial is added to the trial runner."""
raise NotImplementedError
def on_trial_error(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial):
"""Notification for the error of trial.
This will only be called when the trial is in the RUNNING state."""
raise NotImplementedError
def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial, result: Dict) -> str:
"""Called on each intermediate result returned by a trial.
At this point, the trial scheduler can make a decision by returning
one of CONTINUE, PAUSE, and STOP. This will only be called when the
trial is in the RUNNING state."""
raise NotImplementedError
def on_trial_complete(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial, result: Dict):
"""Notification for the completion of trial.
This will only be called when the trial is in the RUNNING state and
either completes naturally or by manual termination."""
raise NotImplementedError
def on_trial_remove(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial):
"""Called to remove trial.
This is called when the trial is in PAUSED or PENDING state. Otherwise,
call `on_trial_complete`."""
raise NotImplementedError
def choose_trial_to_run(
self, trial_runner: "trial_runner.TrialRunner") -> Optional[Trial]:
"""Called to choose a new trial to run.
This should return one of the trials in trial_runner that is in
the PENDING or PAUSED state. This function must be idempotent.
If no trial is ready, return None."""
raise NotImplementedError
def debug_string(self) -> str:
"""Returns a human readable message for printing to the console."""
raise NotImplementedError
def save(self, checkpoint_path: str):
"""Save trial scheduler to a checkpoint"""
raise NotImplementedError
def restore(self, checkpoint_path: str):
"""Restore trial scheduler from checkpoint."""
raise NotImplementedError
class FIFOScheduler(TrialScheduler):
"""Simple scheduler that just runs trials in submission order."""
def on_trial_add(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial):
pass
def on_trial_error(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial):
pass
def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial, result: Dict) -> str:
return TrialScheduler.CONTINUE
def on_trial_complete(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial, result: Dict):
pass
def on_trial_remove(self, trial_runner: "trial_runner.TrialRunner",
trial: Trial):
pass
def choose_trial_to_run(
self, trial_runner: "trial_runner.TrialRunner") -> Optional[Trial]:
for trial in trial_runner.get_trials():
if (trial.status == Trial.PENDING
and trial_runner.has_resources_for_trial(trial)):
return trial
for trial in trial_runner.get_trials():
if (trial.status == Trial.PAUSED
and trial_runner.has_resources_for_trial(trial)):
return trial
return None
def debug_string(self) -> str:
return "Using FIFO scheduling algorithm."

View File

@ -14,14 +14,14 @@ try:
assert ray_version >= '1.0.0'
from ray.tune.suggest import Searcher
from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch
from ray.tune.utils.util import unflatten_dict
except (ImportError, AssertionError):
from .suggestion import Searcher
from .suggestion import OptunaSearch as GlobalSearch
from ..tune.trial import unflatten_dict
from ..tune.trial import unflatten_dict, flatten_dict
from .search_thread import SearchThread
from .flow2 import FLOW2
from ..tune.space import add_cost_to_space, indexof, normalize, define_by_run_func
from ..tune.space import (
add_cost_to_space, indexof, normalize, define_by_run_func)
import logging
logger = logging.getLogger(__name__)
@ -40,9 +40,10 @@ class BlendSearch(Searcher):
metric: Optional[str] = None,
mode: Optional[str] = None,
space: Optional[dict] = None,
points_to_evaluate: Optional[List[dict]] = None,
low_cost_partial_config: Optional[dict] = None,
cat_hp_cost: Optional[dict] = None,
points_to_evaluate: Optional[List[dict]] = None,
evaluated_rewards: Optional[List] = None,
prune_attr: Optional[str] = None,
min_resource: Optional[float] = None,
max_resource: Optional[float] = None,
@ -61,7 +62,6 @@ class BlendSearch(Searcher):
mode: A string in ['min', 'max'] to specify the objective as
minimization or maximization.
space: A dictionary to specify the search space.
points_to_evaluate: Initial parameter suggestions to be run first.
low_cost_partial_config: A dictionary from a subset of
controlled dimensions to the initial low-cost values.
e.g.,
@ -80,6 +80,13 @@ class BlendSearch(Searcher):
i.e., the relative cost of the
three choices of 'tree_method' is 1, 1 and 2 respectively.
points_to_evaluate: Initial parameter suggestions to be run first.
evaluated_rewards (list): If you have previously evaluated the
parameters passed in as points_to_evaluate you can avoid
re-running those trials by passing in the reward attributes
as a list so the optimiser can be told the results without
needing to re-compute the trial. Must be the same length as
points_to_evaluate.
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
@ -122,7 +129,20 @@ class BlendSearch(Searcher):
"consider providing low-cost values for cost-related hps via "
"'low_cost_partial_config'."
)
self._points_to_evaluate = points_to_evaluate or []
if evaluated_rewards and mode:
self._points_to_evaluate = []
self._evaluated_rewards = []
best = max(evaluated_rewards) if mode == 'max' else min(
evaluated_rewards)
# only keep the best points as start points
for i, r in enumerate(evaluated_rewards):
if r == best:
p = points_to_evaluate[i]
self._points_to_evaluate.append(p)
self._evaluated_rewards.append(r)
else:
self._points_to_evaluate = points_to_evaluate or []
self._evaluated_rewards = evaluated_rewards or []
self._config_constraints = config_constraints
self._metric_constraints = metric_constraints
if self._metric_constraints:
@ -131,40 +151,45 @@ class BlendSearch(Searcher):
self._cat_hp_cost = cat_hp_cost or {}
if space:
add_cost_to_space(space, init_config, self._cat_hp_cost)
self._ls = self.LocalSearch(
init_config, metric, mode, space, prune_attr,
min_resource, max_resource, reduction_factor, self.cost_attr, seed)
if global_search_alg is not None:
self._gs = global_search_alg
elif getattr(self, '__name__', None) != 'CFO':
from functools import partial
gs_space = partial(define_by_run_func, space=space)
if space and self._ls.hierarchical:
from functools import partial
gs_space = partial(define_by_run_func, space=space)
evaluated_rewards = None  # not supported by define-by-run
else:
gs_space = space
gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
if experimental:
import optuna as ot
sampler = ot.samplers.TPESampler(
seed=seed, multivariate=True, group=True)
else:
sampler = None
try:
gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
if experimental:
import optuna as ot
sampler = ot.samplers.TPESampler(
seed=seed, multivariate=True, group=True)
else:
sampler = None
self._gs = GlobalSearch(
space=gs_space, metric=metric, mode=mode, seed=gs_seed,
sampler=sampler, points_to_evaluate=points_to_evaluate,
evaluated_rewards=evaluated_rewards)
except ValueError:
self._gs = GlobalSearch(
space=gs_space, metric=metric, mode=mode, seed=gs_seed,
sampler=sampler)
except TypeError:
self._gs = GlobalSearch(space=gs_space, metric=metric, mode=mode)
self._gs.space = space
else:
self._gs = None
self._experimental = experimental
if getattr(self, '__name__', None) == 'CFO' and points_to_evaluate and len(
points_to_evaluate) > 1:
self._points_to_evaluate) > 1:
# use the best config in points_to_evaluate as the start point
self._candidate_start_points = {}
self._started_from_low_cost = not low_cost_partial_config
else:
self._candidate_start_points = None
self._ls = self.LocalSearch(
init_config, metric, mode, space, prune_attr,
min_resource, max_resource, reduction_factor, self.cost_attr, seed)
self._is_ls_ever_converged = False
self._subspace = {} # the subspace for each trial id
if space:
self._init_search()
@ -187,6 +212,7 @@ class BlendSearch(Searcher):
if not self._ls.space:
# the search space can be set only once
if self._gs is not None:
# define-by-run is not supported via set_search_properties
self._gs.set_search_properties(metric, mode, config)
self._gs.space = config
if config:
@ -216,6 +242,8 @@ class BlendSearch(Searcher):
def _init_search(self):
'''initialize the search
'''
self._is_ls_ever_converged = False
self._subspace = {} # the subspace for each trial id
self._metric_target = np.inf * self._ls.metric_op
self._search_thread_pool = {
# id: int -> thread: SearchThread
@ -239,6 +267,7 @@ class BlendSearch(Searcher):
else:
self._metric_constraint_satisfied = True
self._metric_constraint_penalty = None
self.best_resource = self._ls.min_resource
def save(self, checkpoint_path: str):
''' save states to a checkpoint path
@ -295,10 +324,11 @@ class BlendSearch(Searcher):
trial_id, result, error)
del self._trial_proposed_by[trial_id]
if result:
config = {}
for key, value in result.items():
if key.startswith('config/'):
config[key[7:]] = value
config = result.get('config', {})
if not config:
for key, value in result.items():
if key.startswith('config/'):
config[key[7:]] = value
signature = self._ls.config_signature(
config, self._subspace.get(trial_id, {}))
if error: # remove from result cache
@ -309,17 +339,22 @@ class BlendSearch(Searcher):
objective = result[self._ls.metric]
if (objective - self._metric_target) * self._ls.metric_op < 0:
self._metric_target = objective
if self._ls.resource:
self._best_resource = config[self._ls.prune_attr]
if thread_id:
if not self._metric_constraint_satisfied:
# no point has been found to satisfy metric constraint
self._expand_admissible_region(
self._ls_bound_min, self._ls_bound_max,
self._subspace.get(trial_id, self._ls.space))
# if self._gs is not None and self._experimental:
# # TODO: recover when supported
# converted = convert_key(config, self._gs.space)
# logger.info(converted)
# self._gs.add_evaluated_point(converted, objective)
if self._gs is not None and self._experimental and (
not self._ls.hierarchical):
self._gs.add_evaluated_point(
flatten_dict(config), objective)
# TODO: recover when supported
# converted = convert_key(config, self._gs.space)
# logger.info(converted)
# self._gs.add_evaluated_point(converted, objective)
elif metric_constraint_satisfied and self._create_condition(
result):
# thread creator
@ -496,10 +531,12 @@ class BlendSearch(Searcher):
'''
if self._init_used and not self._points_to_evaluate:
choice, backup = self._select_thread()
if choice < 0: # timeout
return None
# if choice < 0: # timeout
# return None
config = self._search_thread_pool[choice].suggest(trial_id)
if choice and config is None:
if not choice and config is not None and self._ls.resource:
config[self._ls.prune_attr] = self.best_resource
elif choice and config is None:
# local search thread finishes
if self._search_thread_pool[choice].converged:
self._expand_admissible_region(
@ -544,9 +581,6 @@ class BlendSearch(Searcher):
self._trial_proposed_by[trial_id] = backup
choice = backup
if not choice: # global search
if self._ls._resource:
# TODO: min or median?
config[self._ls.prune_attr] = self._ls.min_resource
# temporarily relax admissible region for parallel proposals
self._update_admissible_region(
config, self._gs_admissible_min, self._gs_admissible_max,
@ -563,22 +597,35 @@ class BlendSearch(Searcher):
else: # use init config
if self._candidate_start_points is not None and self._points_to_evaluate:
self._candidate_start_points[trial_id] = None
init_config = self._points_to_evaluate.pop(
0) if self._points_to_evaluate else self._ls.init_config
reward = None
if self._points_to_evaluate:
init_config = self._points_to_evaluate.pop(0)
if self._evaluated_rewards:
reward = self._evaluated_rewards.pop(0)
else:
init_config = self._ls.init_config
config, space = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max)
config_signature = self._ls.config_signature(config, space)
result = self._result.get(config_signature)
if result: # tried before
return None
elif result is None: # not tried before
self._result[config_signature] = {}
else: # running but no result yet
return None
if reward is None:
config_signature = self._ls.config_signature(config, space)
result = self._result.get(config_signature)
if result: # tried before
return None
elif result is None: # not tried before
self._result[config_signature] = {}
else: # running but no result yet
return None
self._init_used = True
self._trial_proposed_by[trial_id] = 0
self._search_thread_pool[0].running += 1
self._subspace[trial_id] = space
if reward is not None:
result = {
self._metric: reward, self.cost_attr: 1,
'config': config
}
self.on_trial_complete(trial_id, result)
return None
return config
def _should_skip(self, choice, trial_id, config, space) -> bool:
@ -694,79 +741,88 @@ except (ImportError, AssertionError):
try:
from nni.tuner import Tuner as NNITuner
from nni.utils import extract_scalar_reward
class BlendSearchTuner(BlendSearch, NNITuner):
'''Tuner class for NNI
'''
def receive_trial_result(self, parameter_id, parameters, value,
**kwargs):
'''
Receive trial's final result.
parameter_id: int
parameters: object created by 'generate_parameters()'
value: final metrics of the trial, including default metric
'''
result = {}
for key, value in parameters.items():
result['config/' + key] = value
reward = extract_scalar_reward(value)
result[self._metric] = reward
# if nni does not report training cost,
# using sequence as an approximation.
# if no sequence, using a constant 1
result[self.cost_attr] = value.get(self.cost_attr, value.get(
'sequence', 1))
self.on_trial_complete(str(parameter_id), result)
...
def generate_parameters(self, parameter_id, **kwargs) -> Dict:
'''
Returns a set of trial (hyper-)parameters, as a serializable object
parameter_id: int
'''
return self.suggest(str(parameter_id))
...
def update_search_space(self, search_space):
'''
Tuners are advised to support updating search space at run-time.
If a tuner can only set search space once before generating first hyper-parameters,
it should explicitly document this behaviour.
search_space: JSON object created by experiment owner
'''
config = {}
for key, value in search_space.items():
v = value.get("_value")
_type = value['_type']
if _type == 'choice':
config[key] = choice(v)
elif _type == 'randint':
config[key] = randint(v[0], v[1] - 1)
elif _type == 'uniform':
config[key] = uniform(v[0], v[1])
elif _type == 'quniform':
config[key] = quniform(v[0], v[1], v[2])
elif _type == 'loguniform':
config[key] = loguniform(v[0], v[1])
elif _type == 'qloguniform':
config[key] = qloguniform(v[0], v[1], v[2])
elif _type == 'normal':
config[key] = randn(v[1], v[2])
elif _type == 'qnormal':
config[key] = qrandn(v[1], v[2], v[3])
else:
raise ValueError(
f'unsupported type in search_space {_type}')
self._ls.set_search_properties(None, None, config)
if self._gs is not None:
self._gs.set_search_properties(None, None, config)
self._init_search()
except ImportError:
class BlendSearchTuner(BlendSearch):
class NNITuner:
pass
def extract_scalar_reward(x: Dict):
return x.get('reward')
class BlendSearchTuner(BlendSearch, NNITuner):
'''Tuner class for NNI
'''
def receive_trial_result(self, parameter_id, parameters, value,
**kwargs):
'''
Receive trial's final result.
parameter_id: int
parameters: object created by 'generate_parameters()'
value: final metrics of the trial, including default metric
'''
result = {}
for k, v in parameters.items():
result['config/' + k] = v
reward = extract_scalar_reward(value)
result[self._metric] = reward
# if nni does not report training cost,
# using sequence as an approximation.
# if no sequence, using a constant 1
result[self.cost_attr] = value.get(self.cost_attr, value.get(
'sequence', 1))
self.on_trial_complete(str(parameter_id), result)
...
def generate_parameters(self, parameter_id, **kwargs) -> Dict:
'''
Returns a set of trial (hyper-)parameters, as a serializable object
parameter_id: int
'''
return self.suggest(str(parameter_id))
...
def update_search_space(self, search_space):
'''
Tuners are advised to support updating search space at run-time.
If a tuner can only set search space once before generating first hyper-parameters,
it should explicitly document this behaviour.
search_space: JSON object created by experiment owner
'''
config = {}
for key, value in search_space.items():
v = value.get("_value")
_type = value['_type']
if _type == 'choice':
config[key] = choice(v)
elif _type == 'randint':
config[key] = randint(*v)
elif _type == 'uniform':
config[key] = uniform(*v)
elif _type == 'quniform':
config[key] = quniform(*v)
elif _type == 'loguniform':
config[key] = loguniform(*v)
elif _type == 'qloguniform':
config[key] = qloguniform(*v)
elif _type == 'normal':
config[key] = randn(*v)
elif _type == 'qnormal':
config[key] = qrandn(*v)
else:
raise ValueError(
f'unsupported type in search_space {_type}')
add_cost_to_space(config, {}, {})
self._ls = self.LocalSearch(
{}, self._ls.metric, self._mode, config, cost_attr=self.cost_attr,
seed=self._ls.seed)
if self._gs is not None:
self._gs = GlobalSearch(
space=config, metric=self._metric, mode=self._mode,
sampler=self._gs._sampler)
self._gs.space = config
self._init_search()
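
A hedged sketch of the NNI-style search space that update_search_space() above converts into flaml domains (parameter names, ranges, and the metric name are illustrative):

    search_space = {
        "learning_rate": {"_type": "loguniform", "_value": [1e-4, 1e-1]},
        "num_leaves": {"_type": "randint", "_value": [4, 128]},
        "booster": {"_type": "choice", "_value": ["gbtree", "dart"]},
    }
    tuner = BlendSearchTuner(metric="default", mode="max")
    tuner.update_search_space(search_space)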
class CFO(BlendSearchTuner):
''' class for CFO algorithm

View File

@ -15,8 +15,9 @@ try:
from ray.tune.utils.util import flatten_dict, unflatten_dict
except (ImportError, AssertionError):
from .suggestion import Searcher
from .variant_generator import generate_variants, flatten_dict, unflatten_dict
from .variant_generator import generate_variants
from ..tune import sample
from ..tune.trial import flatten_dict, unflatten_dict
from ..tune.space import complete_config, denormalize, normalize
@ -95,7 +96,7 @@ class FLOW2(Searcher):
self.space = space or {}
self._space = flatten_dict(self.space, prevent_delimiter=True)
self._random = np.random.RandomState(seed)
self._seed = seed
self.seed = seed
self.init_config = init_config
self.best_config = flatten_dict(init_config)
self.prune_attr = prune_attr
@ -142,7 +143,7 @@ class FLOW2(Searcher):
self._bounded_keys.append(key)
if not hier:
self._space_keys = sorted(self._tunable_keys)
self._hierarchical = hier
self.hierarchical = hier
if (self.prune_attr and self.prune_attr not in self._space
and self.max_resource):
self.min_resource = self.min_resource or self._min_resource()
@ -253,10 +254,10 @@ class FLOW2(Searcher):
init_config, self.metric, self.mode,
space, self.prune_attr,
self.min_resource, self.max_resource,
self.resource_multiple_factor, self.cost_attr, self._seed + 1)
self.resource_multiple_factor, self.cost_attr, self.seed + 1)
flow2.best_obj = obj * self.metric_op # minimize internally
flow2.cost_incumbent = cost
self._seed += 1
self.seed += 1
return flow2
def normalize(self, config, recursive=False) -> Dict:
@ -502,7 +503,7 @@ class FLOW2(Searcher):
value_list = []
# self._space_keys doesn't contain keys with const values,
# e.g., "eval_metric": ["logloss", "error"].
keys = sorted(config.keys()) if self._hierarchical else self._space_keys
keys = sorted(config.keys()) if self.hierarchical else self._space_keys
for key in keys:
value = config[key]
if key == self.prune_attr:
@ -510,7 +511,7 @@ class FLOW2(Searcher):
else:
# key must be in space
domain = space[key]
if self._hierarchical:
if self.hierarchical:
# can't remove constant for hierarchical search space,
# e.g., learner
if not (domain is None or type(domain) in (str, int, float)

View File

@ -12,7 +12,7 @@ try:
except (ImportError, AssertionError):
from .suggestion import Searcher
from .flow2 import FLOW2
from ..tune.space import unflatten_hierarchical
from ..tune.space import add_cost_to_space, unflatten_hierarchical
import logging
logger = logging.getLogger(__name__)
@ -46,6 +46,11 @@ class SearchThread:
self.cost_attr = cost_attr
if search_alg:
self.space = self._space = search_alg.space # unflattened space
if self.space and not isinstance(search_alg, FLOW2) and isinstance(
search_alg._space, dict
):
# remember const config
self._const = add_cost_to_space(self.space, {}, {})
@classmethod
def set_eps(cls, time_budget_s):
@ -59,7 +64,12 @@ class SearchThread:
else:
try:
config = self._search_alg.suggest(trial_id)
config, self.space = unflatten_hierarchical(config, self._space)
if isinstance(self._search_alg._space, dict):
config.update(self._const)
else:
# define by run
config, self.space = unflatten_hierarchical(
config, self._space)
except FloatingPointError:
logger.warning(
'The global search method raises FloatingPointError. '

View File

@ -91,15 +91,6 @@ class Searcher:
mode: Optional[str] = None,
max_concurrent: Optional[int] = None,
use_early_stopped_trials: Optional[bool] = None):
if use_early_stopped_trials is False:
raise DeprecationWarning(
"Early stopped trials are now always used. If this is a "
"problem, file an issue: https://github.com/ray-project/ray.")
if max_concurrent is not None:
logger.warning(
"DeprecationWarning: `max_concurrent` is deprecated for this "
"search algorithm. Use tune.suggest.ConcurrencyLimiter() "
"instead. This will raise an error in future versions of Ray.")
self._metric = metric
self._mode = mode
@ -152,83 +143,6 @@ class Searcher:
"""
pass
def on_trial_complete(self,
trial_id: str,
result: Optional[Dict] = None,
error: bool = False):
"""Notification for the completion of trial.
Typically, this method is used for notifying the underlying
optimizer of the result.
Args:
trial_id (str): A unique string ID for the trial.
result (dict): Dictionary of metrics for current training progress.
Note that the result dict may include NaNs or
may not include the optimization metric. It is up to the
subclass implementation to preprocess the result to
avoid breaking the optimization process. Upon errors, this
may also be None.
error (bool): True if the training process raised an error.
"""
raise NotImplementedError
def suggest(self, trial_id: str) -> Optional[Dict]:
"""Queries the algorithm to retrieve the next set of parameters.
Arguments:
trial_id (str): Trial ID used for subsequent notifications.
Returns:
dict | FINISHED | None: Configuration for a trial, if possible.
If FINISHED is returned, Tune will be notified that
no more suggestions/configurations will be provided.
If None is returned, Tune will skip the querying of the
searcher for this step.
"""
raise NotImplementedError
def save(self, checkpoint_path: str):
"""Save state to path for this search algorithm.
Args:
checkpoint_path (str): File where the search algorithm
state is saved. This path should be used later when
restoring from file.
Example:
.. code-block:: python
search_alg = Searcher(...)
analysis = tune.run(
cost,
num_samples=5,
search_alg=search_alg,
name=self.experiment_name,
local_dir=self.tmpdir)
search_alg.save("./my_favorite_path.pkl")
.. versionchanged:: 0.8.7
Save is automatically called by `tune.run`. You can use
`restore_from_dir` to restore from an experiment directory
such as `~/ray_results/trainable`.
"""
raise NotImplementedError
def restore(self, checkpoint_path: str):
"""Restore state for this search algorithm
Args:
checkpoint_path (str): File where the search algorithm
state is saved. This path should be the same
as the one provided to "save".
Example:
.. code-block:: python
search_alg.save("./my_favorite_path.pkl")
search_alg2 = Searcher(...)
search_alg2 = ConcurrencyLimiter(search_alg2, 1)
search_alg2.restore(checkpoint_path)
tune.run(cost, num_samples=5, search_alg=search_alg2)
"""
raise NotImplementedError
def get_state(self) -> Dict:
raise NotImplementedError
def set_state(self, state: Dict):
raise NotImplementedError
@property
def metric(self) -> str:
"""The training result objective value attribute."""
@ -536,14 +450,6 @@ class OptunaSearch(Searcher):
# Flatten to support nested dicts
space = flatten_dict(space, "/")
# Deprecate: 1.5
if isinstance(space, list):
logger.warning(
"Passing lists of `param.suggest_*()` calls to OptunaSearch "
"as a search space is deprecated and will be removed in "
"a future release of Ray. Please pass a dict mapping "
"to `optuna.distributions` objects instead.")
self._space = space
self._points_to_evaluate = points_to_evaluate or []

View File

@ -19,57 +19,16 @@ Copyright (c) Microsoft Corporation.
'''
import copy
import logging
from collections.abc import Mapping
from typing import Any, Dict, Generator, List, Optional, Tuple
from typing import Any, Dict, Generator, List, Tuple
import numpy
import random
from ..tune.sample import Categorical, Domain, Function
from ..tune.sample import Categorical, Domain
logger = logging.getLogger(__name__)
def flatten_dict(dt, delimiter="/", prevent_delimiter=False):
dt = copy.deepcopy(dt)
if prevent_delimiter and any(delimiter in key for key in dt):
# Raise if delimiter is any of the keys
raise ValueError(
"Found delimiter `{}` in key when trying to flatten array."
"Please avoid using the delimiter in your specification.")
while any(isinstance(v, dict) for v in dt.values()):
remove = []
add = {}
for key, value in dt.items():
if isinstance(value, dict):
for subkey, v in value.items():
if prevent_delimiter and delimiter in subkey:
# Raise if delimiter is in any of the subkeys
raise ValueError(
"Found delimiter `{}` in key when trying to "
"flatten array. Please avoid using the delimiter "
"in your specification.")
add[delimiter.join([key, str(subkey)])] = v
remove.append(key)
dt.update(add)
for k in remove:
del dt[k]
return dt
def unflatten_dict(dt, delimiter="/"):
"""Unflatten dict. Does not support unflattening lists."""
dict_type = type(dt)
out = dict_type()
for key, val in dt.items():
path = key.split(delimiter)
item = out
for k in path[:-1]:
item = item.setdefault(k, dict_type())
item[path[-1]] = val
return out
class TuneError(Exception):
"""General error class raised by ray.tune."""
pass
@ -84,16 +43,9 @@ def generate_variants(
variants in combination:
"activation": grid_search(["relu", "tanh"])
"learning_rate": grid_search([1e-3, 1e-4, 1e-5])
Lambda functions: These are evaluated to produce a concrete value, and
can express dependencies or conditional distributions between values.
They can also be used to express random search (e.g., by calling
into the `random` or `np` module).
"cpu": lambda spec: spec.config.num_workers
"batch_size": lambda spec: random.uniform(1, 1000)
Finally, to support defining specs in plain JSON / YAML, grid search
and lambda functions can also be defined alternatively as follows:
can also be defined alternatively as follows:
"activation": {"grid_search": ["relu", "tanh"]}
"cpu": {"eval": "spec.config.num_workers"}
Use `format_vars` to format the returned dict of hyperparameters.
Yields:
(Dict of resolved variables, Spec object)
@ -242,10 +194,6 @@ def _try_resolve(v) -> Tuple[bool, Any]:
if isinstance(v, Domain):
# Domain to sample from
return False, v
elif isinstance(v, dict) and len(v) == 1 and "eval" in v:
# Lambda function in eval syntax
return False, Function(
lambda spec: eval(v["eval"], _STANDARD_IMPORTS, {"spec": spec}))
elif isinstance(v, dict) and len(v) == 1 and "grid_search" in v:
# Grid search values
grid_values = v["grid_search"]

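The `flatten_dict`/`unflatten_dict` helpers shown earlier in this section move between nested and delimiter-joined key spaces. Below is a compact, self-contained re-implementation of the same round trip, written only to illustrate the behavior; it is not the library code.

.. code-block:: python

    def flatten(dt, delimiter="/", path=""):
        # Join nested keys with the delimiter, e.g. {"a": {"b": 1}} -> {"a/b": 1}.
        out = {}
        for k, v in dt.items():
            key = f"{path}{delimiter}{k}" if path else str(k)
            if isinstance(v, dict):
                out.update(flatten(v, delimiter, key))
            else:
                out[key] = v
        return out

    def unflatten(dt, delimiter="/"):
        # Rebuild the nested structure from delimiter-joined keys.
        out = {}
        for key, val in dt.items():
            path = key.split(delimiter)
            item = out
            for k in path[:-1]:
                item = item.setdefault(k, {})
            item[path[-1]] = val
        return out

    nested = {"model": {"lr": 0.1, "layers": {"n": 2}}, "seed": 1}
    flat = flatten(nested)
    assert flat == {"model/lr": 0.1, "model/layers/n": 2, "seed": 1}
    assert unflatten(flat) == nested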

@ -325,11 +325,6 @@ class Categorical(Domain):
new.set_sampler(self._Uniform())
return new
def grid(self):
new = copy(self)
new.set_sampler(Grid())
return new
def __len__(self):
return len(self.categories)
@ -344,55 +339,6 @@ class Categorical(Domain):
return f"{self.categories}"
class Function(Domain):
class _CallSampler(BaseSampler):
def sample(self,
domain: "Function",
spec: Optional[Union[List[Dict], Dict]] = None,
size: int = 1):
if domain.pass_spec:
items = [
domain.func(spec[i] if isinstance(spec, list) else spec)
for i in range(size)
]
else:
items = [domain.func() for i in range(size)]
return items if len(items) > 1 else domain.cast(items[0])
default_sampler_cls = _CallSampler
def __init__(self, func: Callable):
sig = signature(func)
pass_spec = True # whether we should pass `spec` when calling `func`
try:
sig.bind({})
except TypeError:
pass_spec = False
if not pass_spec:
try:
sig.bind()
except TypeError as exc:
raise ValueError(
"The function passed to a `Function` parameter must be "
"callable with either 0 or 1 parameters.") from exc
self.pass_spec = pass_spec
self.func = func
def is_function(self):
return True
def is_valid(self, value: Any):
return True  # This is user-defined, so let's not assume anything
@property
def domain_str(self):
return f"{self.func}()"
class Quantized(Sampler):
def __init__(self, sampler: Sampler, q: Union[float, int]):
self.sampler = sampler
@ -439,22 +385,6 @@ class PolynomialExpansionSet:
return "PolynomialExpansionSet"
# TODO (krfricke): Remove tune.function
def function(func):
logger.warning(
"DeprecationWarning: wrapping {} with tune.function() is no "
"longer needed".format(func))
return func
def sample_from(func: Callable[[Dict], Any]):
"""Specify that tune should sample configuration values from this function.
Arguments:
func: A callable function to draw a sample from.
"""
return Function(func)
def uniform(lower: float, upper: float):
"""Sample a float value uniformly between ``lower`` and ``upper``.
Sampling from ``tune.uniform(1, 10)`` is equivalent to sampling from


@ -90,30 +90,30 @@ def define_by_run_func(
return config
def convert_key(
conf: Dict, space: Dict, path: str = ""
) -> Optional[Dict[str, Any]]:
"""Convert config keys to define-by-run keys.
# def convert_key(
# conf: Dict, space: Dict, path: str = ""
# ) -> Optional[Dict[str, Any]]:
# """Convert config keys to define-by-run keys.
Returns:
A dict with converted keys.
"""
config = {}
for key, domain in space.items():
value = conf[key]
if path:
key = path + '/' + key
if isinstance(domain, dict):
config.update(convert_key(conf[key], domain, key))
elif isinstance(domain, sample.Categorical):
index = indexof(domain, value)
config[key + '_choice_'] = index
if isinstance(value, dict):
key += f":{index}"
config.update(convert_key(value, domain.categories[index], key))
else:
config[key] = value
return config
# Returns:
# A dict with converted keys.
# """
# config = {}
# for key, domain in space.items():
# value = conf[key]
# if path:
# key = path + '/' + key
# if isinstance(domain, dict):
# config.update(convert_key(conf[key], domain, key))
# elif isinstance(domain, sample.Categorical):
# index = indexof(domain, value)
# config[key + '_choice_'] = index
# if isinstance(value, dict):
# key += f":{index}"
# config.update(convert_key(value, domain.categories[index], key))
# else:
# config[key] = value
# return config
def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
@ -306,10 +306,8 @@ def normalize(
elif str(sampler) == 'Normal':
# N(mean, sd) -> N(0,1)
config_norm[key] = (value - sampler.mean) / sampler.sd
else:
# TODO? elif str(sampler) == 'Base': # sample.Function._CallSampler
# e.g., {test: sample_from(lambda spec: randn(10, 2).sample() * 0.01)}
config_norm[key] = value
# else:
# config_norm[key] = value
return config_norm
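As a quick worked example of the `Normal` branch kept above, a value drawn from N(mean, sd) is mapped to the unit normal by subtracting the mean and dividing by the standard deviation (the numbers below are purely illustrative):

.. code-block:: python

    value, mean, sd = 2.5, 2.0, 0.5
    normalized = (value - mean) / sd   # 1.0, i.e. one standard deviation above the mean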


@ -13,6 +13,7 @@ try:
from ray.tune.analysis import ExperimentAnalysis as EA
except (ImportError, AssertionError):
from .analysis import ExperimentAnalysis as EA
from .result import DEFAULT_METRIC
import logging
logger = logging.getLogger(__name__)
@ -33,7 +34,7 @@ class ExperimentAnalysis(EA):
super().__init__(self, None, trials, metric, mode)
except (TypeError, ValueError):
self.trials = trials
self.default_metric = metric or '_default_anonymous_metric'
self.default_metric = metric or DEFAULT_METRIC
self.default_mode = mode
@ -82,7 +83,7 @@ def report(_metric=None, **kwargs):
if _verbose == 2:
logger.info(f"result: {kwargs}")
if _metric:
result['_default_anonymous_metric'] = _metric
result[DEFAULT_METRIC] = _metric
trial = _runner.running_trial
if _running_trial == trial:
_training_iteration += 1
@ -105,12 +106,13 @@ def report(_metric=None, **kwargs):
def run(training_function,
config: Optional[dict] = None,
points_to_evaluate: Optional[List[dict]] = None,
low_cost_partial_config: Optional[dict] = None,
cat_hp_cost: Optional[dict] = None,
metric: Optional[str] = None,
mode: Optional[str] = None,
time_budget_s: Union[int, float, datetime.timedelta] = None,
points_to_evaluate: Optional[List[dict]] = None,
evaluated_rewards: Optional[List] = None,
prune_attr: Optional[str] = None,
min_resource: Optional[float] = None,
max_resource: Optional[float] = None,
@ -155,8 +157,6 @@ def run(training_function,
Args:
training_function: A user-defined training function.
config: A dictionary to specify the search space.
points_to_evaluate: A list of initial hyperparameter
configurations to run first.
low_cost_partial_config: A dictionary from a subset of
controlled dimensions to the initial low-cost values.
e.g.,
@ -179,6 +179,14 @@ def run(training_function,
mode: A string in ['min', 'max'] to specify the objective as
minimization or maximization.
time_budget_s: A float of the time budget in seconds.
points_to_evaluate: A list of initial hyperparameter
configurations to run first.
evaluated_rewards (list): If you have previously evaluated the
parameters passed in as points_to_evaluate, you can avoid
re-running those trials by passing in the reward attributes
as a list, so the optimizer can be told the results without
needing to re-compute the trial. Must be the same length as
points_to_evaluate.
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
@ -259,9 +267,10 @@ def run(training_function,
if search_alg is None:
from ..searcher.blendsearch import BlendSearch
search_alg = BlendSearch(
metric=metric or '_default_anonymous_metric', mode=mode,
metric=metric or DEFAULT_METRIC, mode=mode,
space=config,
points_to_evaluate=points_to_evaluate,
evaluated_rewards=evaluated_rewards,
low_cost_partial_config=low_cost_partial_config,
cat_hp_cost=cat_hp_cost,
prune_attr=prune_attr,

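Putting the updated `run()` arguments together, here is a hedged sketch of warm-starting `flaml.tune.run` with previously evaluated configurations. The toy objective and search space are invented for illustration; the argument names follow the signature above and the nested-tuning test at the end of this change.

.. code-block:: python

    from flaml import tune

    def objective(config):
        # Toy objective: minimize (a - 4)^2 + (b - a)^2.
        tune.report(obj=(config["a"] - 4) ** 2 + (config["b"] - config["a"]) ** 2)

    points_to_evaluate = [{"a": 3, "b": 0.99}, {"a": 2, "b": 0.99}]
    # Objective values for the points above, computed in an earlier run,
    # in the same order and of the same length as points_to_evaluate.
    evaluated_rewards = [
        (p["a"] - 4) ** 2 + (p["b"] - p["a"]) ** 2 for p in points_to_evaluate
    ]

    analysis = tune.run(
        objective,
        config={"a": tune.randint(1, 8), "b": tune.uniform(0, 1)},
        low_cost_partial_config={"a": 1},
        points_to_evaluate=points_to_evaluate,
        evaluated_rewards=evaluated_rewards,
        metric="obj",
        mode="min",
        time_budget_s=5,
        num_samples=-1,
    )
    print(analysis.trials[-1].last_result)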

@ -842,12 +842,12 @@
"class MyRegularizedGreedyForest(SKLearnEstimator):\n",
"\n",
"\n",
" def __init__(self, task='binary:logistic', n_jobs=1, **params):\n",
" def __init__(self, task='binary', n_jobs=1, **params):\n",
" '''Constructor\n",
" \n",
" Args:\n",
" task: A string of the task type, one of\n",
" 'binary:logistic', 'multi:softmax', 'regression'\n",
" 'binary', 'multi', 'regression'\n",
" n_jobs: An integer of the number of parallel threads\n",
" params: A dictionary of the hyperparameter names and values\n",
" '''\n",
@ -855,7 +855,7 @@
" super().__init__(task, **params)\n",
"\n",
" '''task=regression for RGFRegressor; \n",
" binary:logistic and multiclass:softmax for RGFClassifier'''\n",
" binary or multiclass for RGFClassifier'''\n",
" if 'regression' in task:\n",
" self.estimator_class = RGFRegressor\n",
" else:\n",

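For context, the custom estimator defined in the notebook cell above is plugged into FLAML by registering it with `add_learner` and restricting the search to it, as the tests below also do. A brief sketch; the dataset and budget are placeholders, and `MyRegularizedGreedyForest` is assumed to be defined as above.

.. code-block:: python

    from sklearn.datasets import load_iris
    from flaml import AutoML

    automl = AutoML()
    # Register the custom learner defined above under a short name.
    automl.add_learner(learner_name="rgf", learner_class=MyRegularizedGreedyForest)

    X_train, y_train = load_iris(return_X_y=True)
    automl.fit(
        X_train=X_train, y_train=y_train,
        task="classification",      # the estimator-level task becomes 'binary' or 'multi'
        estimator_list=["rgf"],     # search only over the custom learner
        time_budget=10,             # seconds; placeholder budget
    )
    print(automl.best_config)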

@ -17,7 +17,7 @@ from flaml import tune
class MyRegularizedGreedyForest(SKLearnEstimator):
def __init__(self, task='binary:logistic', n_jobs=1, max_leaf=4,
def __init__(self, task='binary', n_jobs=1, max_leaf=4,
n_iter=1, n_tree_search=1, opt_interval=1, learning_rate=1.0,
min_samples_leaf=1, **params):
@ -264,6 +264,7 @@ class TestAutoML(unittest.TestCase):
"model_history": True,
"sample_weight": np.ones(len(y)),
"pred_time_limit": 1e-5,
"ensemble": True,
}
automl_experiment.fit(**automl_settings)
print(automl_experiment.classes_)
@ -382,23 +383,25 @@ class TestAutoML(unittest.TestCase):
def test_roc_auc_ovr(self):
automl_experiment = AutoML()
X_train, y_train = load_iris(return_X_y=True)
automl_settings = {
"time_budget": 2,
"time_budget": 1,
"metric": "roc_auc_ovr",
"task": "classification",
"log_file_name": "test/roc_auc_ovr.log",
"log_training_metric": True,
"n_jobs": 1,
"sample_weight": np.ones(len(y_train)),
"eval_method": "holdout",
"model_history": True
}
X_train, y_train = load_iris(return_X_y=True)
automl_experiment.fit(
X_train=X_train, y_train=y_train, **automl_settings)
def test_roc_auc_ovo(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"time_budget": 1,
"metric": "roc_auc_ovo",
"task": "classification",
"log_file_name": "test/roc_auc_ovo.log",
@ -438,6 +441,11 @@ class TestAutoML(unittest.TestCase):
log_file_name=automl_settings["log_file_name"],
X_train=X_train, y_train=y_train,
train_full=True, time_budget=1)
automl_experiment.retrain_from_log(
task="regression",
log_file_name=automl_settings["log_file_name"],
X_train=X_train, y_train=y_train,
train_full=True, time_budget=0)
def test_sparse_matrix_classification(self):
automl_experiment = AutoML()
@ -565,13 +573,14 @@ class TestAutoML(unittest.TestCase):
except ImportError:
return
def test_parallel_xgboost_random(self):
def test_parallel_xgboost_others(self):
# use random search as the hpo_method
self.test_parallel_xgboost(hpo_method='random')
def test_random_out_of_memory(self):
automl_experiment = AutoML()
automl_experiment.add_learner(learner_name='large_lgbm', learner_class=MyLargeLGBM)
automl_experiment.add_learner(
learner_name='large_lgbm', learner_class=MyLargeLGBM)
automl_settings = {
"time_budget": 2,
"metric": 'ap',
@ -620,13 +629,13 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_sparse_matrix_regression_cv(self):
def test_sparse_matrix_regression_holdout(self):
X_train = scipy.sparse.random(8, 100)
y_train = np.random.uniform(size=8)
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
'eval_method': 'cv',
"time_budget": 1,
'eval_method': 'holdout',
"task": 'regression',
"log_file_name": "test/sparse_regression.log",
"n_jobs": 1,


@ -21,6 +21,7 @@ def test_forecast_automl(budget=5):
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"label": ('ds', 'y'),
}
'''The main flaml automl API'''
try:


@ -1,7 +1,7 @@
from openml.exceptions import OpenMLServerException
def test_automl(budget=5, dataset_format='dataframe'):
def test_automl(budget=5, dataset_format='dataframe', hpo_method=None):
from flaml.data import load_openml_dataset
try:
X_train, X_test, y_train, y_test = load_openml_dataset(
@ -18,6 +18,7 @@ def test_automl(budget=5, dataset_format='dataframe'):
"task": 'classification', # task type
"log_file_name": 'airlines_experiment.log', # flaml log file
"seed": 7654321, # random seed
'hpo_method': hpo_method
}
'''The main flaml automl API'''
automl.fit(X_train=X_train, y_train=y_train, **settings)
@ -52,7 +53,7 @@ def test_automl(budget=5, dataset_format='dataframe'):
def test_automl_array():
test_automl(5, 'array')
test_automl(5, 'array', 'bs')
def test_mlflow():
@ -81,8 +82,11 @@ def test_mlflow():
mlflow.set_experiment("flaml")
with mlflow.start_run():
'''The main flaml automl API'''
automl.fit(X_train=X_train, y_train=y_train, **settings)
automl.fit(
X_train=X_train, y_train=y_train, **settings)
# subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"])
automl._mem_thres = 0
print(automl.trainable(automl.points_to_evaluate[0]))
if __name__ == "__main__":


@ -41,6 +41,7 @@ class TestLogging(unittest.TestCase):
}
X_train, y_train = load_boston(return_X_y=True)
n = len(y_train) >> 1
print(automl.model, automl.classes_, automl.predict(X_train))
automl.fit(X_train=X_train[:n], y_train=y_train[:n],
X_val=X_train[n:], y_val=y_train[n:],
**automl_settings)
@ -81,6 +82,8 @@ class TestLogging(unittest.TestCase):
time_budget_s=1, num_samples=-1)
print(min(trial.last_result["val_loss"]
for trial in analysis.trials))
config = analysis.trials[-1].last_result['config']['ml']
automl._state._train_with_config(config['learner'], config)
# Check if the log buffer is populated.
self.assertTrue(len(buf.getvalue()) > 0)


@ -16,9 +16,9 @@ class TestTrainingLog(unittest.TestCase):
filename = os.path.join(d, path)
# Run a simple job.
automl_experiment = AutoML()
automl = AutoML()
automl_settings = {
"time_budget": 2,
"time_budget": 1,
"metric": 'mse',
"task": 'regression',
"log_file_name": filename,
@ -29,10 +29,12 @@ class TestTrainingLog(unittest.TestCase):
"train_time_limit": 0.01,
"verbose": 3,
"ensemble": True,
"keep_search_state": True,
}
X_train, y_train = load_boston(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
automl._state._train_with_config(
automl.best_estimator, automl.best_config)
# Check if the training log file is populated.
self.assertTrue(os.path.exists(filename))
@ -44,8 +46,10 @@ class TestTrainingLog(unittest.TestCase):
self.assertGreater(count, 0)
automl_settings["log_file_name"] = None
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
automl._selected.update(None, 0)
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, max_iter=0)
def test_illfilename(self):
try:


@ -76,7 +76,7 @@ def test_simple(method=None):
print(analysis.trials[-1])
def _test_optuna():
def test_optuna():
test_simple(method="optuna")

test/tune/test_sample.py (new file, +18 lines)

@ -0,0 +1,18 @@
from flaml.tune.sample import (
BaseSampler, PolynomialExpansionSet, Domain,
uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform, lograndint, qlograndint)
def test_sampler():
print(randn().sample(size=2))
print(PolynomialExpansionSet(), BaseSampler())
print(qrandn(2, 10, 2).sample(size=2))
c = choice([1, 2])
print(c.domain_str, len(c), c.is_valid(3))
i = randint(1, 10)
print(i.domain_str, i.is_valid(10))
d = Domain()
print(d.domain_str, d.is_function())
d.default_sampler_cls = BaseSampler
print(d.get_sampler())

test/tune/test_searcher.py (new file, +126 lines)

@ -0,0 +1,126 @@
from flaml.searcher.blendsearch import CFO
import numpy as np
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune import sample
except (ImportError, AssertionError):
from flaml.tune import sample
from flaml.searcher.suggestion import OptunaSearch, Searcher, ConcurrencyLimiter
from flaml.searcher.blendsearch import BlendSearch
def define_search_space(trial):
trial.suggest_float("a", 6, 8)
trial.suggest_float("b", 1e-4, 1e-2, log=True)
def test_searcher():
searcher = Searcher()
searcher = Searcher(metric=['m1', 'm2'], mode=['max', 'min'])
searcher.set_search_properties(None, None, None)
searcher.suggest = searcher.on_pause = searcher.on_unpause = lambda _: {}
searcher.on_trial_complete = lambda trial_id, result, error: None
searcher = ConcurrencyLimiter(searcher, max_concurrent=2, batch=True)
searcher.suggest("t1")
searcher.suggest("t2")
searcher.on_pause("t1")
searcher.on_unpause("t1")
searcher.suggest("t3")
searcher.on_trial_complete("t1", {})
searcher.on_trial_complete("t2", {})
searcher.set_state({})
print(searcher.get_state())
import optuna
config = {
"a": optuna.distributions.UniformDistribution(6, 8),
"b": optuna.distributions.LogUniformDistribution(1e-4, 1e-2),
}
searcher = OptunaSearch(
config, points_to_evaluate=[{"a": 6, "b": 1e-3}],
evaluated_rewards=[{'m': 2}], metric='m', mode='max'
)
config = {
"a": sample.uniform(6, 8),
"b": sample.loguniform(1e-4, 1e-2)
}
searcher = OptunaSearch(
config, points_to_evaluate=[{"a": 6, "b": 1e-3}],
evaluated_rewards=[{'m': 2}], metric='m', mode='max'
)
searcher = OptunaSearch(
define_search_space, points_to_evaluate=[{"a": 6, "b": 1e-3}],
# evaluated_rewards=[{'m': 2}], metric='m', mode='max'
mode='max'
)
searcher = OptunaSearch()
# searcher.set_search_properties('m', 'min', define_search_space)
searcher.set_search_properties('m', 'min', config)
searcher.suggest('t1')
searcher.on_trial_complete('t1', None, False)
searcher.suggest('t2')
searcher.on_trial_complete('t2', None, True)
searcher.suggest('t3')
searcher.on_trial_complete('t3', {'m': np.nan})
searcher.save('test/tune/optuna.pickle')
searcher.restore('test/tune/optuna.pickle')
searcher = BlendSearch(
metric="m",
global_search_alg=searcher, metric_constraints=[("c", "<", 1)])
searcher.set_search_properties(metric="m2", config=config)
searcher.set_search_properties(config={"time_budget_s": 0})
c = searcher.suggest('t1')
searcher.on_trial_complete("t1", {"config": c}, True)
c = searcher.suggest('t2')
searcher.on_trial_complete(
"t2", {"config": c, "m2": 1, "c": 2, "time_total_s": 1})
config1 = config.copy()
config1['_choice_'] = 0
searcher._expand_admissible_region(
lower={'root': [{'a': 0.5}, {'a': 0.4}]},
upper={'root': [{'a': 0.9}, {'a': 0.8}]},
space={'root': config1},
)
searcher = CFO(
metric='m', mode='min', space=config,
points_to_evaluate=[{'a': 7, 'b': 1e-3}, {'a': 6, 'b': 3e-4}],
evaluated_rewards=[1, 1])
searcher.suggest("t1")
searcher.suggest("t2")
searcher.on_trial_result('t3', {})
c = searcher.generate_parameters(1)
searcher.receive_trial_result(1, c, {'reward': 0})
searcher.update_search_space(
{
"a": {
"_value": [1, 2],
"_type": "choice",
},
"b": {
"_value": [1, 3],
"_type": "randint",
},
"c": {
"_value": [.1, 3],
"_type": "uniform",
},
"d": {
"_value": [2, 8, 2],
"_type": "quniform",
},
"e": {
"_value": [2, 8],
"_type": "loguniform",
},
"f": {
"_value": [2, 8, 2],
"_type": "qloguniform",
},
"g": {
"_value": [0, 2],
"_type": "normal",
},
"h": {
"_value": [0, 2, 2],
"_type": "qnormal",
},
}
)


@ -15,7 +15,7 @@ import xgboost as xgb
import logging
logger = logging.getLogger(__name__)
os.makedirs('logs', exist_ok=True)
logger.addHandler(logging.FileHandler('logs/tune_xgboost.log'))
logger.addHandler(logging.FileHandler('logs/tune.log'))
logger.setLevel(logging.INFO)
@ -223,12 +223,22 @@ def test_nested():
logger.info(f"BlendSearch exp best config: {best_trial.config}")
logger.info(f"BlendSearch exp best result: {best_trial.last_result}")
points_to_evaluate = [
{"b": .99, "cost_related": {"a": 3}},
{"b": .99, "cost_related": {"a": 2}},
]
analysis = tune.run(
simple_func,
config=search_space,
low_cost_partial_config={
"cost_related": {"a": 1}
},
points_to_evaluate=points_to_evaluate,
evaluated_rewards=[
(config["cost_related"]["a"] - 4)**2
+ (config["b"] - config["cost_related"]["a"])**2
for config in points_to_evaluate
],
metric="obj",
mode="min",
metric_constraints=[("ab", "<=", 4)],