* iter per learner

* code cleanup
This commit is contained in:
Chi Wang 2021-04-08 09:29:55 -07:00 committed by GitHub
parent b7a91e0385
commit 97a7c114ee
40 changed files with 1829 additions and 2122 deletions

View File

@ -6,4 +6,3 @@ import logging
# Set the root logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

View File

@ -12,11 +12,13 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold
from sklearn.utils import shuffle
import pandas as pd
import os, contextlib
import os
import contextlib
from .ml import compute_estimator, train_estimator, get_estimator_class, \
get_classification_objective
from .config import (MIN_SAMPLE_TRAIN, MEM_THRES, RANDOM_SEED,
from .config import (
MIN_SAMPLE_TRAIN, MEM_THRES, RANDOM_SEED,
SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS,
SAMPLE_MULTIPLY_FACTOR)
from .data import concat
@ -31,21 +33,20 @@ logger_formatter = logging.Formatter(
try:
import mlflow
except:
except ImportError:
mlflow = None
class SearchState:
@property
def search_space(self):
return self._search_space_domain
@property
def estimated_cost4improvement(self):
return max(self.time_best_found-self.time_best_found_old,
self.total_time_used-self.time_best_found)
return max(self.time_best_found - self.time_best_found_old,
self.total_time_used - self.time_best_found)
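For intuition, this is the larger of the time between the last two improvements and the time since the last improvement; a small sketch with illustrative numbers (not from any real run):
time_best_found_old = 30.0  # seconds into the search when the previous best was found
time_best_found = 50.0      # seconds into the search when the current best was found
total_time_used = 80.0      # total search time spent on this learner so far
eci = max(time_best_found - time_best_found_old,  # 20s between the last two improvements
          total_time_used - time_best_found)      # 30s since the last improvement
# eci == 30.0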
def __init__(self, learner_class, data_size, task):
self.init_eci = learner_class.cost_relative2lgbm()
@ -59,7 +60,7 @@ class SearchState:
for name, space in search_space.items():
assert 'domain' in space
self._search_space_domain[name] = space['domain']
if 'init_value' in space:
if 'init_value' in space:
self.init_config[name] = space['init_value']
if 'low_cost_init_value' in space:
self.low_cost_partial_config[name] = space[
@ -67,7 +68,7 @@ class SearchState:
if 'cat_hp_cost' in space:
self.cat_hp_cost[name] = space['cat_hp_cost']
self._hp_names = list(self._search_space_domain.keys())
self.search_alg = None
self.search_alg = None
self.best_loss = self.best_loss_old = np.inf
self.total_time_used = 0
self.total_iter = 0
@ -75,21 +76,20 @@ class SearchState:
self.time_best_found = 0
self.time2eval_best = 0
self.time2eval_best_old = 0
self.trained_estimator = None
self.update_count = 0
self.trained_estimator = None
self.sample_size = None
self.trial_time = 0
def update(self, analysis, time_used, save_model_history=False):
if not analysis.trials: return
self.update_count += 1
if not analysis.trials:
return
result = analysis.trials[-1].last_result
if result:
config = result['config']
# logger.info(config)
if config and 'FLAML_sample_size' in config:
if config and 'FLAML_sample_size' in config:
self.sample_size = config['FLAML_sample_size']
else: self.sample_size = self.data_size
else:
self.sample_size = self.data_size
obj = result['val_loss']
train_loss = result['train_loss']
time2eval = result['time2eval']
@ -101,27 +101,28 @@ class SearchState:
self.trial_time = time2eval
self.total_time_used += time_used
self.total_iter += 1
if self.base_eci is None:
self.base_eci = time_used
if (obj is not None) and (self.best_loss is None or obj<self.best_loss):
self.best_loss_old = self.best_loss if self.best_loss < float(
'inf') else 2*obj
if (obj is not None) and (self.best_loss is None or obj < self.best_loss):
self.best_loss_old = self.best_loss if self.best_loss < np.inf \
else 2 * obj
self.best_loss = obj
self.time_best_found_old = self.time_best_found
self.time_best_found = self.total_time_used
self.time_best_found_old = self.time_best_found
self.time_best_found = self.total_time_used
self.iter_best_found = self.total_iter
self.best_config = config
self.best_config_sample_size = self.sample_size
self.best_config_train_time = time_used
if time2eval:
if time2eval:
self.time2eval_best_old = self.time2eval_best
self.time2eval_best = time2eval
if self.trained_estimator and trained_estimator and \
self.trained_estimator!= trained_estimator and \
self.trained_estimator != trained_estimator and \
not save_model_history:
self.trained_estimator.cleanup()
if trained_estimator: self.trained_estimator = trained_estimator
if trained_estimator:
self.trained_estimator = trained_estimator
self.train_loss, self.val_loss, self.config = train_loss, obj, config
def get_hist_config_sig(self, sample_size, config):
@ -132,13 +133,12 @@ class SearchState:
def est_retrain_time(self, retrain_sample_size):
assert self.best_config_sample_size is not None, \
'need to first get best_config_sample_size'
return (self.time2eval_best*
retrain_sample_size/self.best_config_sample_size)
return (self.time2eval_best * retrain_sample_size
/ self.best_config_sample_size)
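The estimate assumes training time scales roughly linearly with sample size; a sketch with illustrative numbers:
time2eval_best = 12.0            # seconds to evaluate the best config on the sampled data
best_config_sample_size = 25000  # rows used for that evaluation
retrain_sample_size = 100000     # rows for retraining on the full data
est = time2eval_best * retrain_sample_size / best_config_sample_size  # 48.0 seconds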
class AutoMLState:
def _prepare_sample_train_data(self, sample_size):
full_size = len(self.y_train)
sampled_weight = None
@ -152,8 +152,8 @@ class AutoMLState:
if weight is not None:
sampled_weight = weight[:sample_size]
else:
sampled_X_train, sampled_y_train = concat(self.X_train,
self.X_val), np.concatenate([self.y_train, self.y_val])
sampled_X_train = concat(self.X_train, self.X_val)
sampled_y_train = np.concatenate([self.y_train, self.y_val])
weight = self.fit_kwargs.get('sample_weight')
if weight is not None:
sampled_weight = np.concatenate([weight, self.weight_val])
@ -165,54 +165,60 @@ class AutoMLState:
compute_start_time = time.time()
if 'FLAML_sample_size' in config_w_resource:
sample_size = int(config_w_resource['FLAML_sample_size'])
else: sample_size = self.data_size
else:
sample_size = self.data_size
sampled_X_train, sampled_y_train, sampled_weight = \
self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = self.fit_kwargs['sample_weight']
self.fit_kwargs['sample_weight'] = sampled_weight
else: weight = None
else:
weight = None
config = config_w_resource.copy()
if 'FLAML_sample_size' in config: del config['FLAML_sample_size']
if 'FLAML_sample_size' in config:
del config['FLAML_sample_size']
time_left = self.time_budget - self.time_from_start
budget = time_left if sample_size == self.data_size else \
time_left/2*sample_size/self.data_size
time_left / 2 * sample_size / self.data_size
trained_estimator, val_loss, train_loss, time2eval, _ = \
compute_estimator(sampled_X_train,
sampled_y_train,
self.X_val,
self.y_val,
self.weight_val,
budget,
self.kf,
config,
self.task,
estimator,
self.eval_method,
self.metric,
self.best_loss,
self.n_jobs,
self.learner_classes.get(estimator),
self.log_training_metric,
self.fit_kwargs)
compute_estimator(
sampled_X_train,
sampled_y_train,
self.X_val,
self.y_val,
self.weight_val,
budget,
self.kf,
config,
self.task,
estimator,
self.eval_method,
self.metric,
self.best_loss,
self.n_jobs,
self.learner_classes.get(estimator),
self.log_training_metric,
self.fit_kwargs)
result = {
'total_time': time.time()-compute_start_time,
'time2eval': time2eval,
'train_loss': train_loss,
'val_loss': val_loss,
'trained_estimator': trained_estimator,}
'total_time': time.time() - compute_start_time,
'time2eval': time2eval,
'train_loss': train_loss,
'val_loss': val_loss,
'trained_estimator': trained_estimator
}
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
tune.report(**result)
if sampled_weight is not None:
self.fit_kwargs['sample_weight'] = weight
def _train_with_config(self, estimator, config_w_resource,
sample_size=None):
def _train_with_config(
self, estimator, config_w_resource, sample_size=None
):
config = config_w_resource.copy()
if 'FLAML_sample_size' in config:
if not sample_size: sample_size = config['FLAML_sample_size']
if 'FLAML_sample_size' in config:
if not sample_size:
sample_size = config['FLAML_sample_size']
del config['FLAML_sample_size']
assert sample_size is not None
sampled_X_train, sampled_y_train, sampled_weight = \
@ -220,9 +226,10 @@ class AutoMLState:
if sampled_weight is not None:
weight = self.fit_kwargs['sample_weight']
self.fit_kwargs['sample_weight'] = sampled_weight
else: weight = None
budget = None if self.time_budget is None else (self.time_budget -
self.time_from_start)
else:
weight = None
budget = None if self.time_budget is None else (
self.time_budget - self.time_from_start)
estimator, train_time = train_estimator(
sampled_X_train,
sampled_y_train,
@ -276,7 +283,7 @@ class AutoML:
'''A dictionary of iter->(estimator, config, time),
storing the best estimator, config, and the time when the best
model is updated each time.
'''
'''
return self._config_history
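A hypothetical illustration of the structure (all values are made up):
# {iteration: (best estimator, best config, wall-clock time when it became the best)}
config_history = {
    0: ('lgbm', {'n_estimators': 4, 'num_leaves': 4}, 1.2),
    7: ('xgboost', {'n_estimators': 32, 'max_leaves': 16}, 15.8),
}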
@property
@ -294,7 +301,7 @@ class AutoML:
Args:
estimator_name: a str of the estimator's name
Returns:
An object with `predict()` and `predict_proba()` methods (the latter for
classification), storing the best trained model for estimator_name.
@ -307,18 +314,18 @@ class AutoML:
@property
def best_estimator(self):
'''A string indicating the best estimator found.'''
'''A string indicating the best estimator found.'''
return self._best_estimator
@property
def best_iteration(self):
'''An integer of the iteration number where the best
config is found.'''
config is found.'''
return self._best_iteration
@property
def best_config(self):
'''A dictionary of the best configuration.'''
'''A dictionary of the best configuration.'''
return self._search_states[self._best_estimator].best_config
@property
@ -335,8 +342,8 @@ class AutoML:
@property
def classes_(self):
'''A list of n_classes elements for class labels.'''
if self._label_transformer:
'''A list of n_classes elements for class labels.'''
if self._label_transformer:
return self._label_transformer.classes_.tolist()
if self._trained_estimator:
return self._trained_estimator.model.classes_.tolist()
@ -358,7 +365,8 @@ class AutoML:
return None
X_test = self._preprocess(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1: y_pred = y_pred.flatten()
if y_pred.ndim > 1:
y_pred = y_pred.flatten()
if self._label_transformer:
return self._label_transformer.inverse_transform(pd.Series(
y_pred))
@ -381,7 +389,7 @@ class AutoML:
return proba
def _preprocess(self, X):
if issparse(X):
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X)
@ -390,10 +398,8 @@ class AutoML:
def _validate_data(self, X_train_all, y_train_all, dataframe, label,
X_val=None, y_val=None):
if X_train_all is not None and y_train_all is not None:
if not (isinstance(X_train_all, np.ndarray) or
issparse(X_train_all) or
isinstance(X_train_all, pd.DataFrame)
):
if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
or isinstance(X_train_all, pd.DataFrame)):
raise ValueError(
"X_train_all must be a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
@ -407,14 +413,14 @@ class AutoML:
y_train_all = y_train_all.flatten()
if X_train_all.shape[0] != y_train_all.shape[0]:
raise ValueError(
"# rows in X_train must match length of y_train.")
"# rows in X_train must match length of y_train.")
self._df = isinstance(X_train_all, pd.DataFrame)
self._nrow, self._ndim = X_train_all.shape
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
if not isinstance(dataframe, pd.DataFrame):
raise ValueError("dataframe must be a pandas DataFrame")
if not label in dataframe.columns:
if label not in dataframe.columns:
raise ValueError("label must a column name in dataframe")
self._df = True
X = dataframe.drop(columns=label)
@ -422,8 +428,8 @@ class AutoML:
y = dataframe[label]
else:
raise ValueError(
"either X_train_all+y_train_all or dataframe+label need to be provided")
if issparse(X_train_all):
"either X_train+y_train or dataframe+label are required")
if issparse(X_train_all):
self._transformer = self._label_transformer = False
self._X_train_all, self._y_train_all = X, y
else:
@ -432,12 +438,10 @@ class AutoML:
self._X_train_all, self._y_train_all = \
self._transformer.fit_transform(X, y, self._state.task)
self._label_transformer = self._transformer.label_transformer
self._sample_weight_full = self._state.fit_kwargs.get('sample_weight')
self._sample_weight_full = self._state.fit_kwargs.get('sample_weight')
if X_val is not None and y_val is not None:
if not (isinstance(X_val, np.ndarray) or
issparse(X_val) or
isinstance(X_val, pd.DataFrame)
):
if not (isinstance(X_val, np.ndarray) or issparse(X_val)
or isinstance(X_val, pd.DataFrame)):
raise ValueError(
"X_val must be None, a numpy array, a pandas dataframe, "
"or Scipy sparse matrix.")
@ -452,8 +456,7 @@ class AutoML:
if isinstance(y_val, np.ndarray):
y_val = y_val.flatten()
if X_val.shape[0] != y_val.shape[0]:
raise ValueError(
"# rows in X_val must match length of y_val.")
raise ValueError("# rows in X_val must match length of y_val.")
if self._transformer:
self._state.X_val = self._transformer.transform(X_val)
else:
@ -470,14 +473,14 @@ class AutoML:
split_ratio,
n_splits):
X_val, y_val = self._state.X_val, self._state.y_val
if issparse(X_val):
if issparse(X_val):
X_val = X_val.tocsr()
X_train_all, y_train_all = \
self._X_train_all, self._y_train_all
if issparse(X_train_all):
if issparse(X_train_all):
X_train_all = X_train_all.tocsr()
if self._state.task != 'regression' and self._state.fit_kwargs.get(
'sample_weight') is None:
'sample_weight') is None:
# logger.info(f"label {pd.unique(y_train_all)}")
label_set, counts = np.unique(y_train_all, return_counts=True)
# augment rare classes
@ -534,7 +537,7 @@ class AutoML:
X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
y_rest = y_train_all.iloc[rest] if isinstance(
y_train_all, pd.Series) else y_train_all[rest]
stratify = y_rest if self._split_type=='stratified' else \
stratify = y_rest if self._split_type == 'stratified' else \
None
if 'sample_weight' in self._state.fit_kwargs:
X_train, X_val, y_train, y_val, weight_train, weight_val = \
@ -556,21 +559,17 @@ class AutoML:
stratify=stratify,
random_state=RANDOM_SEED)
X_train = concat(X_first, X_train)
y_train = concat(label_set,
y_train) if self._df else np.concatenate(
y_train = concat(
label_set, y_train) if self._df else np.concatenate(
[label_set, y_train])
X_val = concat(X_first, X_val)
y_val = concat(label_set,
y_val) if self._df else np.concatenate([label_set, y_val])
y_val = concat(label_set, y_val) if self._df else \
np.concatenate([label_set, y_val])
_, y_train_counts_elements = np.unique(y_train,
return_counts=True)
return_counts=True)
_, y_val_counts_elements = np.unique(y_val,
return_counts=True)
logger.debug(
f"""{self._split_type} split for y_train \
{y_train_counts_elements}, \
y_val {y_val_counts_elements}""")
elif eval_method == 'holdout' and self._state.task == 'regression':
return_counts=True)
elif eval_method == 'holdout' and self._state.task == 'regression':
if 'sample_weight' in self._state.fit_kwargs:
X_train, X_val, y_train, y_val, self._state.fit_kwargs[
'sample_weight'], self._state.weight_val = \
@ -587,8 +586,10 @@ class AutoML:
test_size=split_ratio,
random_state=RANDOM_SEED)
self._state.data_size = X_train.shape[0]
if X_val is None: self.data_size_full = self._state.data_size
else: self.data_size_full = self._state.data_size + X_val.shape[0]
if X_val is None:
self.data_size_full = self._state.data_size
else:
self.data_size_full = self._state.data_size + X_val.shape[0]
self._state.X_train, self._state.y_train, self._state.X_val, \
self._state.y_val = (X_train, y_train, X_val, y_val)
if self._split_type == "stratified":
@ -596,15 +597,15 @@ class AutoML:
assert y_train_all.size >= n_splits, (
f"{n_splits}-fold cross validation"
f" requires input data with at least {n_splits} examples.")
assert y_train_all.size >= 2*n_splits, (
assert y_train_all.size >= 2 * n_splits, (
f"{n_splits}-fold cross validation with metric=r2 "
f"requires input data with at least {n_splits*2} examples.")
self._state.kf = RepeatedStratifiedKFold(n_splits=n_splits,
n_repeats=1, random_state=RANDOM_SEED)
self._state.kf = RepeatedStratifiedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
else:
logger.info("Using RepeatedKFold")
self._state.kf = RepeatedKFold(n_splits=n_splits, n_repeats=1,
random_state=RANDOM_SEED)
self._state.kf = RepeatedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
def add_learner(self,
learner_name,
@ -624,7 +625,7 @@ class AutoML:
log_file_name: A string of the log file name
record_id: An integer of the record ID in the file,
0 corresponds to the first trial
task: A string of the task type,
task: A string of the task type,
'binary', 'multi', or 'regression'
Returns:
@ -638,8 +639,7 @@ class AutoML:
estimator, _ = train_estimator(
None, None, config, task, estimator,
estimator_class=self._state.learner_classes.get(estimator)
)
estimator_class=self._state.learner_classes.get(estimator))
return estimator
def retrain_from_log(self,
@ -724,7 +724,8 @@ class AutoML:
self._trained_estimator = Estimator()
self._trained_estimator.model = None
return training_duration
if not best: return
if not best:
return
best_estimator = best.learner
best_config = best.config
sample_size = len(self._y_train_all) if train_full \
@ -756,7 +757,8 @@ class AutoML:
return training_duration
def _decide_eval_method(self, time_budget):
if self._state.X_val is not None: return 'holdout'
if self._state.X_val is not None:
return 'holdout'
nrow, dim = self._nrow, self._ndim
if nrow * dim / 0.9 < SMALL_LARGE_THRES * (
time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD:
@ -824,7 +826,7 @@ class AutoML:
n_jobs: An integer of the number of threads for training
log_file_name: A string of the log file name
estimator_list: A list of strings for estimator names, or 'auto'
e.g.,
e.g.,
.. code-block:: python
@ -863,12 +865,12 @@ class AutoML:
self._state.fit_kwargs = fit_kwargs
self._state.weight_val = sample_weight_val
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self._search_states = {} #key: estimator name; value: SearchState
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
self._learner_selector = learner_selector
old_level = logger.getEffectiveLevel()
self.verbose = verbose
if verbose==0:
if verbose == 0:
logger.setLevel(logging.WARNING)
if self._state.task == 'classification':
self._state.task = get_classification_objective(
@ -884,11 +886,11 @@ class AutoML:
# Add the console handler.
_ch = logging.StreamHandler()
_ch.setFormatter(logger_formatter)
logger.addHandler(_ch)
logger.addHandler(_ch)
logger.info("Evaluation method: {}".format(eval_method))
self._retrain_full = retrain_full and (eval_method == 'holdout' and
self._state.X_val is None)
self._retrain_full = retrain_full and (
eval_method == 'holdout' and self._state.X_val is None)
self._prepare_data(eval_method, split_ratio, n_splits)
self._sample = sample and eval_method != 'cv' and (
MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
@ -911,18 +913,17 @@ class AutoML:
if 'auto' == estimator_list:
estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
if 'regression' != self._state.task:
estimator_list += ['lrl1',]
# add learner using add_learner() api
estimator_list += ['lrl1']
for estimator_name in estimator_list:
if estimator_name not in self._state.learner_classes:
self.add_learner(estimator_name,
get_estimator_class(self._state.task, estimator_name))
self.add_learner(
estimator_name,
get_estimator_class(self._state.task, estimator_name))
# set up learner search space
for estimator_name in estimator_list:
estimator_class = self._state.learner_classes[estimator_name]
self._search_states[estimator_name] = SearchState(
learner_class=estimator_class,
self._search_states[estimator_name] = SearchState(
learner_class=estimator_class,
data_size=self._state.data_size, task=self._state.task,
)
logger.info("List of ML learners in AutoML Run: {}".format(
@ -941,7 +942,7 @@ class AutoML:
self._state.n_jobs = n_jobs
self._search()
logger.info("fit succeeded")
if verbose==0:
if verbose == 0:
logger.setLevel(old_level)
def _search(self):
@ -953,18 +954,19 @@ class AutoML:
self._best_iteration = 0
self._model_history = {}
self._config_history = {}
self._max_iter_per_learner = 1000000 # TODO
self._iter_per_learner = dict([(e,0) for e in self.estimator_list])
self._max_iter_per_learner = 1000000 # TODO
self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
self._fullsize_reached = False
self._trained_estimator = None
self._best_estimator = None
self._retrained_config = {}
est_retrain_time = next_trial_time = 0
best_config_sig = None
# use ConcurrencyLimiter to limit the amount of concurrency when
# use ConcurrencyLimiter to limit the amount of concurrency when
# using a search algorithm
better = True # whether we find a better model in one trial
if self._ensemble: self.best_model = {}
better = True # whether we find a better model in one trial
if self._ensemble:
self.best_model = {}
try:
from ray.tune.suggest import ConcurrencyLimiter
except ImportError:
@ -978,7 +980,8 @@ class AutoML:
from .searcher.suggestion import OptunaSearch as SearchAlgo
elif 'bs' == self._hpo_method:
from flaml import BlendSearch as SearchAlgo
else: raise NotImplementedError
else:
raise NotImplementedError
for self._track_iter in range(self._max_iter):
if self._estimator_index is None:
@ -987,18 +990,18 @@ class AutoML:
estimator = self._select_estimator(self.estimator_list)
if not estimator:
break
logger.info(f"iteration {self._track_iter}"
f" current learner {estimator}")
logger.info(
f"iteration {self._track_iter}, current learner {estimator}")
search_state = self._search_states[estimator]
self._state.time_from_start = time.time()-self._start_time_flag
time_left = self._state.time_budget-self._state.time_from_start
self._state.time_from_start = time.time() - self._start_time_flag
time_left = self._state.time_budget - self._state.time_from_start
budget_left = time_left if not self._retrain_full or better or (
not self.best_estimator) or self._search_states[
self.best_estimator].sample_size<self._state.data_size \
else time_left - est_retrain_time
self.best_estimator].sample_size < self._state.data_size \
else time_left - est_retrain_time
if not search_state.search_alg:
search_state.training_function = partial(
AutoMLState._compute_with_config_base,
AutoMLState._compute_with_config_base,
self._state, estimator)
search_space = search_state.search_space
if self._sample:
@ -1008,14 +1011,13 @@ class AutoML:
else:
prune_attr = min_resource = max_resource = None
learner_class = self._state.learner_classes.get(estimator)
if 'grid' == self._hpo_method: # for synthetic exp only
if 'grid' == self._hpo_method: # for synthetic exp only
points_to_evaluate = []
space = search_space
keys = list(space.keys())
domain0 = space[keys[0]]
domain1 = space[keys[1]]
for x1 in range(domain0.lower, domain0.upper+1):
for x2 in range(domain1.lower, domain1.upper+1):
domain0, domain1 = space[keys[0]], space[keys[1]]
for x1 in range(domain0.lower, domain0.upper + 1):
for x2 in range(domain1.lower, domain1.upper + 1):
points_to_evaluate.append({
keys[0]: x1,
keys[1]: x2,
@ -1023,27 +1025,27 @@ class AutoML:
self._max_iter_per_learner = len(points_to_evaluate)
low_cost_partial_config = None
else:
points_to_evaluate=[search_state.init_config]
points_to_evaluate = [search_state.init_config]
low_cost_partial_config = search_state.low_cost_partial_config
if self._hpo_method in ('bs', 'cfo', 'grid'):
algo = SearchAlgo(metric='val_loss', mode='min',
space=search_space,
points_to_evaluate=points_to_evaluate,
algo = SearchAlgo(
metric='val_loss', mode='min', space=search_space,
points_to_evaluate=points_to_evaluate,
low_cost_partial_config=low_cost_partial_config,
cat_hp_cost=search_state.cat_hp_cost,
prune_attr=prune_attr,
min_resource=min_resource,
max_resource=max_resource,
resources_per_trial={"cpu": self._state.n_jobs,
"mem": self._mem_thres},
"mem": self._mem_thres},
mem_size=learner_class.size)
else:
algo = SearchAlgo(metric='val_loss', mode='min',
space=search_space,
points_to_evaluate=points_to_evaluate,
else:
algo = SearchAlgo(
metric='val_loss', mode='min', space=search_space,
points_to_evaluate=points_to_evaluate,
)
search_state.search_alg = ConcurrencyLimiter(algo,
max_concurrent=1)
max_concurrent=1)
else:
search_space = None
if self._hpo_method in ('bs', 'cfo'):
@ -1053,25 +1055,23 @@ class AutoML:
},
)
start_run_time = time.time()
# warnings.filterwarnings("ignore")
analysis = tune.run(search_state.training_function,
analysis = tune.run(
search_state.training_function,
search_alg=search_state.search_alg,
time_budget_s=budget_left,
verbose=max(self.verbose-1,0), #local_dir='logs/tune_results',
use_ray=False,
)
# warnings.resetwarnings()
time_used = time.time()-start_run_time
verbose=max(self.verbose - 1, 0),
use_ray=False)
time_used = time.time() - start_run_time
better = False
if analysis.trials:
search_state.update(analysis, time_used = time_used,
save_model_history = self._save_model_history)
if analysis.trials:
search_state.update(analysis, time_used=time_used,
save_model_history=self._save_model_history)
if self._estimator_index is None:
eci_base = search_state.init_eci
self._eci.append(search_state.estimated_cost4improvement)
for e in self.estimator_list[1:]:
self._eci.append(
self._search_states[e].init_eci/eci_base*self._eci[0])
self._eci.append(self._search_states[e].init_eci
/ eci_base * self._eci[0])
self._estimator_index = 0
self._state.time_from_start = time.time() - self._start_time_flag
# logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
@ -1081,7 +1081,7 @@ class AutoML:
self._fullsize_reached = True
if search_state.best_loss < self._state.best_loss:
best_config_sig = estimator + search_state.get_hist_config_sig(
self.data_size_full,
self.data_size_full,
search_state.best_config)
self._state.best_loss = search_state.best_loss
self._best_estimator = estimator
@ -1101,100 +1101,102 @@ class AutoML:
self._trained_estimator = search_state.trained_estimator
self._best_iteration = self._track_iter
better = True
next_trial_time = search_state.time2eval_best
next_trial_time = search_state.time2eval_best
if better or self._log_type == 'all':
self._training_log.append(self._iter_per_learner[estimator],
search_state.train_loss,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
search_state.config,
search_state.best_loss,
search_state.best_config,
estimator,
search_state.sample_size)
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
search_state.config,
search_state.best_loss,
search_state.best_config,
estimator,
search_state.sample_size)
if mlflow is not None and mlflow.active_run():
with mlflow.start_run(nested=True) as run:
with mlflow.start_run(nested=True):
mlflow.log_metric('iter_counter',
self._iter_per_learner[estimator])
self._iter_per_learner[estimator])
mlflow.log_param('train_loss',
search_state.train_loss)
search_state.train_loss)
mlflow.log_metric('trial_time',
search_state.trial_time)
search_state.trial_time)
mlflow.log_metric('total_search_time',
self._state.time_from_start)
self._state.time_from_start)
mlflow.log_metric('validation_loss',
search_state.val_loss)
search_state.val_loss)
mlflow.log_param('config',
search_state.config)
search_state.config)
mlflow.log_param('learner',
estimator)
estimator)
mlflow.log_param('sample_size',
search_state.sample_size)
search_state.sample_size)
mlflow.log_metric('best_validation_loss',
search_state.best_loss)
search_state.best_loss)
mlflow.log_param('best_config',
search_state.best_config)
search_state.best_config)
mlflow.log_param('best_learner',
self._best_estimator)
self._best_estimator)
logger.info(
" at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format(
self._state.time_from_start,
estimator,
search_state.best_loss,
self._best_estimator,
self._state.best_loss))
" at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format(
self._state.time_from_start,
estimator,
search_state.best_loss,
self._best_estimator,
self._state.best_loss))
else:
logger.info(f"no enough budget for learner {estimator}")
if self._estimator_index is not None:
self.estimator_list.remove(estimator)
self._estimator_index -= 1
if self._retrain_full and best_config_sig and not better and (
self._search_states[self._best_estimator].sample_size ==
self._state.data_size) and (est_retrain_time <=
self._state.time_budget - self._state.time_from_start <=
est_retrain_time + next_trial_time):
self._trained_estimator, retrain_time = \
self._state._train_with_config(
self._best_estimator,
self._search_states[self._best_estimator].best_config,
self.data_size_full)
self._search_states[
self._best_estimator].sample_size == self._state.data_size
) and (est_retrain_time
<= self._state.time_budget - self._state.time_from_start
<= est_retrain_time + next_trial_time):
self._trained_estimator, \
retrain_time = self._state._train_with_config(
self._best_estimator,
self._search_states[self._best_estimator].best_config,
self.data_size_full)
logger.info("retrain {} for {:.1f}s".format(
estimator, retrain_time,))
estimator, retrain_time))
self._retrained_config[best_config_sig] = retrain_time
est_retrain_time = 0
self._state.time_from_start = time.time() - self._start_time_flag
if (self._state.time_from_start >= self._state.time_budget or
not self.estimator_list):
if (self._state.time_from_start >= self._state.time_budget
or not self.estimator_list):
break
if self._ensemble and self._best_estimator:
time_left = self._state.time_budget -self._state.time_from_start
time_left = self._state.time_budget - self._state.time_from_start
time_ensemble = self._search_states[
self._best_estimator].time2eval_best
if time_left < time_ensemble < 2*time_left:
if time_left < time_ensemble < 2 * time_left:
break
if self._search_states[estimator].time2eval_best > \
self._state.time_budget-self._state.time_from_start:
self._iter_per_learner[estimator] = self._max_iter_per_learner
# Add a checkpoint for the current best config to the log.
self._training_log.checkpoint()
if self._best_estimator:
self._selected = self._search_states[self._best_estimator]
self._trained_estimator = self._selected.trained_estimator
self.modelcount = sum(search_state.total_iter
for search_state in self._search_states.values())
if self._trained_estimator:
self.modelcount = sum(
search_state.total_iter
for search_state in self._search_states.values())
if self._trained_estimator:
logger.info(f'selected model: {self._trained_estimator.model}')
if self._ensemble:
search_states = list(x for x in self._search_states.items()
if x[1].trained_estimator)
search_states.sort(key=lambda x:x[1].best_loss)
estimators = [(x[0],x[1].trained_estimator) for x in search_states[
:2]]
estimators += [(x[0],x[1].trained_estimator) for x in search_states[
2:] if x[1].best_loss<4*self._selected.best_loss]
if x[1].trained_estimator)
search_states.sort(key=lambda x: x[1].best_loss)
estimators = [(x[0], x[1].trained_estimator)
for x in search_states[:2]]
estimators += [
(x[0], x[1].trained_estimator) for x in search_states[2:]
if x[1].best_loss < 4 * self._selected.best_loss]
logger.info(estimators)
if len(estimators)<=1: return
if len(estimators) <= 1:
return
if self._state.task != "regression":
from sklearn.ensemble import StackingClassifier as Stacker
for e in estimators:
@ -1202,14 +1204,13 @@ class AutoML:
else:
from sklearn.ensemble import StackingRegressor as Stacker
best_m = self._trained_estimator
stacker = Stacker(estimators, best_m,
n_jobs=self._state.n_jobs,
passthrough=True)
stacker = Stacker(estimators, best_m, n_jobs=self._state.n_jobs,
passthrough=True)
if self._sample_weight_full is not None:
self._state.fit_kwargs[
'sample_weight'] = self._sample_weight_full
stacker.fit(self._X_train_all, self._y_train_all,
**self._state.fit_kwargs)
**self._state.fit_kwargs)
logger.info(f'ensemble: {stacker}')
self._trained_estimator = stacker
self._trained_estimator.model = stacker
@ -1233,42 +1234,35 @@ class AutoML:
inv = []
untried_exists = False
for i, estimator in enumerate(estimator_list):
if estimator in self._search_states and self._search_states[
estimator].sample_size: # sample_size=none meaning no result
if estimator in self._search_states and (
self._search_states[estimator].sample_size
): # sample_size=none meaning no result
search_state = self._search_states[estimator]
if self._iter_per_learner[estimator]>=self._max_iter_per_learner:
if (self._search_states[estimator].time2eval_best
> self._state.time_budget - self._state.time_from_start
or self._iter_per_learner[estimator]
>= self._max_iter_per_learner):
inv.append(0)
continue
eci_search_state = search_state.estimated_cost4improvement
estimated_cost = search_state.estimated_cost4improvement
if search_state.sample_size < self._state.data_size:
eci_search_state = min(eci_search_state,
search_state.time2eval_best * min(SAMPLE_MULTIPLY_FACTOR,
self._state.data_size/search_state.sample_size))
estimated_cost = min(
estimated_cost,
search_state.time2eval_best * min(
SAMPLE_MULTIPLY_FACTOR,
self._state.data_size / search_state.sample_size))
gap = search_state.best_loss - self._state.best_loss
if gap > 0 and not self._ensemble:
delta_loss = (search_state.best_loss_old -
search_state.best_loss) or \
search_state.best_loss
delta_time = (search_state.total_time_used -
search_state.time_best_found_old) or 1e-10
delta_loss = (search_state.best_loss_old
- search_state.best_loss) or search_state.best_loss
delta_time = (search_state.total_time_used
- search_state.time_best_found_old) or 1e-10
speed = delta_loss / delta_time
try:
estimated_cost = 2*gap/speed
except ZeroDivisionError:
warnings.warn("ZeroDivisionError "
"speed: {0}, "
"old_best_loss: {1}, "
"new_best_loss: {2}"
.format(speed,
search_state.best_loss_old,
search_state.best_loss))
estimated_cost = 0.0
estimated_cost = max(estimated_cost, eci_search_state)
else:
estimated_cost = eci_search_state
if speed:
estimated_cost = max(2 * gap / speed, estimated_cost)
if estimated_cost == 0:
estimated_cost = 1e-10
inv.append(1/estimated_cost)
inv.append(1 / estimated_cost)
else:
estimated_cost = self._eci[i]
inv.append(0)
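To make the selection heuristic concrete, a sketch of the catch-up estimate for a lagging learner (all numbers illustrative):
gap = 0.20 - 0.15   # this learner's best loss minus the global best loss
speed = 0.02 / 10   # loss reduction of its last improvement divided by the time that took
estimated_cost = max(2 * gap / speed, 30.0)  # 30.0 stands in for its estimated_cost4improvement; result 50.0
# The learner is then proposed with weight proportional to 1 / estimated_cost.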

View File

@ -1,12 +1,12 @@
'''!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
* Licensed under the MIT License.
'''
N_SPLITS = 5
RANDOM_SEED = 1
SPLIT_RATIO = 0.1
MEM_THRES = 4*(1024**3)
MEM_THRES = 4 * (1024 ** 3)
SMALL_LARGE_THRES = 10000000
MIN_SAMPLE_TRAIN = 10000
CV_HOLDOUT_THRESHOLD = 100000

View File

@ -1,6 +1,6 @@
'''!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
* Licensed under the MIT License.
'''
import numpy as np
@ -10,7 +10,7 @@ from .training_log import training_log_reader
def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
'''Load dataset from OpenML.
'''Load dataset from OpenML.
If the file is not cached locally, download it from OpenML.
@ -23,7 +23,7 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
X_train: A 2d numpy array of training data
X_test: A 2d numpy array of test data
y_train: A 1d numpy array of labels for training data
y_test: A 1d numpy array of labels for test data
y_test: A 1d numpy array of labels for test data
'''
import os
import openml
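A minimal usage sketch, assuming the function is importable as flaml.data.load_openml_dataset (dataset_id 1169 is only an example; any OpenML dataset id works):
from flaml.data import load_openml_dataset  # assumed import path for the function above
X_train, X_test, y_train, y_test = load_openml_dataset(
    dataset_id=1169, data_dir='./')  # downloads and caches the dataset locally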
@ -58,9 +58,9 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
def load_openml_task(task_id, data_dir):
'''Load task from OpenML.
'''Load task from OpenML.
Use the first fold of the task.
Use the first fold of the task.
If the file is not cached locally, download it from OpenML.
Args:
@ -71,7 +71,7 @@ def load_openml_task(task_id, data_dir):
X_train: A 2d numpy array of training data
X_test: A 2d numpy array of test data
y_train: A 1d numpy array of labels for training data
y_test: A 1d numpy array of labels for test data
y_test: A 1d numpy array of labels for test data
'''
import os
import openml
@ -115,12 +115,12 @@ def get_output_from_log(filename, time_budget):
Returns:
training_time_list: A list of the finished time of each logged iter
best_error_list:
best_error_list:
A list of the best validation error after each logged iter
error_list: A list of the validation error of each logged iter
config_list:
config_list:
A list of the estimator, sample size and config of each logged iter
logged_metric_list: A list of the logged metric of each logged iter
logged_metric_list: A list of the logged metric of each logged iter
'''
best_config = None
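A usage sketch following the signature and return values documented above (the import path, log file name, and budget are placeholders):
from flaml.data import get_output_from_log  # assumed import path for the function above
time_history, best_error_history, error_history, config_history, metric_history = \
    get_output_from_log(filename='automl.log', time_budget=60)
# best_error_history can then be plotted against time_history to inspect the learning curve.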
@ -186,7 +186,6 @@ class DataTransformer:
'''transform X, y
'''
def fit_transform(self, X, y, task):
if isinstance(X, pd.DataFrame):
X = X.copy()
@ -223,17 +222,18 @@ class DataTransformer:
X_num = X[num_columns]
if drop and np.issubdtype(X_num.columns.dtype, np.integer):
X_num.columns = range(X_num.shape[1])
else: drop = False
else:
drop = False
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
self.transformer = ColumnTransformer([(
'continuous',
SimpleImputer(missing_values=np.nan, strategy='median'),
X_num.columns)])
X_num.columns)])
X[num_columns] = self.transformer.fit_transform(X_num)
self._cat_columns, self._num_columns = cat_columns, num_columns
self._drop = drop
if task == 'regression':
self.label_transformer = None
else:

View File

@ -1,15 +1,19 @@
'''!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
* Licensed under the MIT License.
'''
from .model import *
import time
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score
import numpy as np
f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
from .model import (
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
ExtraTreeEstimator, KNeighborsEstimator)
import logging
logger = logging.getLogger(__name__)
@ -18,7 +22,6 @@ logger = logging.getLogger(__name__)
def get_estimator_class(task, estimator_name):
''' When adding a new learner, add a corresponding elif branch here '''
if 'xgboost' in estimator_name:
if 'regression' in task:
estimator_class = XGBoostEstimator
@ -31,7 +34,7 @@ def get_estimator_class(task, estimator_name):
elif 'lrl1' in estimator_name:
estimator_class = LRL1Classifier
elif 'lrl2' in estimator_name:
estimator_class = LRL2Classifier
estimator_class = LRL2Classifier
elif 'catboost' in estimator_name:
estimator_class = CatBoostEstimator
elif 'extra_tree' in estimator_name:
@ -39,22 +42,24 @@ def get_estimator_class(task, estimator_name):
elif 'kneighbor' in estimator_name:
estimator_class = KNeighborsEstimator
else:
raise ValueError(estimator_name + ' is not a built-in learner. '
raise ValueError(
estimator_name + ' is not a built-in learner. '
'Please use AutoML.add_learner() to add a customized learner.')
return estimator_class
def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None,
sample_weight=None):
def sklearn_metric_loss_score(
metric_name, y_predict, y_true, labels=None, sample_weight=None
):
'''Loss using the specified metric
Args:
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss',
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'log_loss',
'f1', 'ap'
y_predict: A 1d or 2d numpy array of the predictions which can be
used to calculate the metric. E.g., 2d for log_loss and 1d
for others.
for others.
y_true: A 1d numpy array of the true labels
labels: A 1d numpy array of the unique labels
sample_weight: A 1d numpy array of the sample weight
@ -66,69 +71,69 @@ def sklearn_metric_loss_score(metric_name, y_predict, y_true, labels=None,
if 'r2' in metric_name:
score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'rmse':
score = np.sqrt(mean_squared_error(y_true, y_predict,
sample_weight=sample_weight))
score = np.sqrt(mean_squared_error(
y_true, y_predict, sample_weight=sample_weight))
elif metric_name == 'mae':
score = mean_absolute_error(y_true, y_predict,
sample_weight=sample_weight)
score = mean_absolute_error(
y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'mse':
score = mean_squared_error(y_true, y_predict,
sample_weight=sample_weight)
score = mean_squared_error(
y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'accuracy':
score = 1.0 - accuracy_score(y_true, y_predict,
sample_weight=sample_weight)
score = 1.0 - accuracy_score(
y_true, y_predict, sample_weight=sample_weight)
elif 'roc_auc' in metric_name:
score = 1.0 - roc_auc_score(y_true, y_predict,
sample_weight=sample_weight)
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight)
elif 'log_loss' in metric_name:
score = log_loss(y_true, y_predict, labels=labels,
sample_weight=sample_weight)
score = log_loss(
y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif 'f1' in metric_name:
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
elif 'ap' in metric_name:
score = 1 - average_precision_score(y_true, y_predict,
sample_weight=sample_weight)
score = 1 - average_precision_score(
y_true, y_predict, sample_weight=sample_weight)
else:
raise ValueError(metric_name+' is not a built-in metric, '
'currently built-in metrics are: '
'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. '
'please pass a customized metric function to AutoML.fit(metric=func)')
raise ValueError(
metric_name + ' is not a built-in metric, '
'currently built-in metrics are: '
'r2, rmse, mae, mse, accuracy, roc_auc, log_loss, f1, ap. '
'please pass a customized metric function to AutoML.fit(metric=func)')
return score
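A small usage sketch, assuming the function is importable as flaml.ml.sklearn_metric_loss_score; built-in metrics are returned as losses, so a maximizing metric such as accuracy comes back as 1 - score (toy arrays):
from flaml.ml import sklearn_metric_loss_score  # assumed import path
y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]
loss = sklearn_metric_loss_score('accuracy', y_pred, y_true)  # 1 - 0.75 = 0.25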
def get_y_pred(estimator, X, eval_metric, obj):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[:,
1] if y_pred_classes.ndim>1 else y_pred_classes
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[
:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc']:
y_pred = estimator.predict_proba(X)
else:
try:
y_pred = estimator.predict(X)
except:
logger.debug("prediction failed. Using a constant predictor.")
y_pred = np.ones(X.shape[0])
y_pred = estimator.predict(X)
return y_pred
def get_test_loss(estimator, X_train, y_train, X_test, y_test, weight_test,
eval_metric, obj, labels=None, budget=None, train_loss=False, fit_kwargs={}):
def get_test_loss(
estimator, X_train, y_train, X_test, y_test, weight_test,
eval_metric, obj, labels=None, budget=None, train_loss=False, fit_kwargs={}
):
start = time.time()
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
if isinstance(eval_metric, str):
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
labels, weight_test)
if train_loss != False:
labels, weight_test)
if train_loss is not False:
test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
train_loss = sklearn_metric_loss_score(eval_metric, test_pred_y,
y_train, labels, fit_kwargs.get('sample_weight'))
else: # customized metric function
train_loss = sklearn_metric_loss_score(
eval_metric, test_pred_y,
y_train, labels, fit_kwargs.get('sample_weight'))
else: # customized metric function
test_loss, train_loss = eval_metric(
X_test, y_test, estimator, labels, X_train, y_train,
weight_test, fit_kwargs.get('sample_weight'))
train_time = time.time()-start
train_time = time.time() - start
return test_loss, train_time, train_loss
@ -137,9 +142,11 @@ def train_model(estimator, X_train, y_train, budget, fit_kwargs={}):
return train_time
def evaluate_model(estimator, X_train, y_train, X_val, y_val, weight_val,
budget, kf, task, eval_method, eval_metric, best_val_loss, train_loss=False,
fit_kwargs={}):
def evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val,
budget, kf, task, eval_method, eval_metric, best_val_loss, train_loss=False,
fit_kwargs={}
):
if 'holdout' in eval_method:
val_loss, train_loss, train_time = evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val, weight_val, budget,
@ -147,33 +154,37 @@ def evaluate_model(estimator, X_train, y_train, X_val, y_val, weight_val,
fit_kwargs=fit_kwargs)
else:
val_loss, train_loss, train_time = evaluate_model_CV(
estimator, X_train, y_train, budget, kf, task,
estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, train_loss=train_loss,
fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time
def evaluate_model_holdout(estimator, X_train, y_train, X_val, y_val,
weight_val, budget, task, eval_metric, best_val_loss, train_loss=False,
fit_kwargs={}):
def evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val,
weight_val, budget, task, eval_metric, best_val_loss, train_loss=False,
fit_kwargs={}
):
val_loss, train_time, train_loss = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric,
task, budget = budget, train_loss=train_loss, fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time
task, budget=budget, train_loss=train_loss, fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time
def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss, train_loss=False, fit_kwargs={}):
def evaluate_model_CV(
estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss, train_loss=False, fit_kwargs={}
):
start_time = time.time()
total_val_loss = total_train_loss = 0
train_time = 0
valid_fold_num = 0
n = kf.get_n_splits()
X_train_split, y_train_split = X_train_all, y_train_all
if task=='regression':
if task == 'regression':
labels = None
else:
labels = np.unique(y_train_all)
labels = np.unique(y_train_all)
if isinstance(kf, RepeatedStratifiedKFold):
kf = kf.split(X_train_split, y_train_split)
@ -181,7 +192,7 @@ def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
kf = kf.split(X_train_split)
rng = np.random.RandomState(2020)
val_loss_list = []
budget_per_train = budget / (n+1)
budget_per_train = budget / (n + 1)
if 'sample_weight' in fit_kwargs:
weight = fit_kwargs['sample_weight']
weight_val = None
@ -207,24 +218,27 @@ def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
train_index], weight[val_index]
val_loss_i, train_time_i, train_loss_i = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val,
eval_metric, task, labels, budget_per_train,
eval_metric, task, labels, budget_per_train,
train_loss=train_loss, fit_kwargs=fit_kwargs)
if weight is not None:
fit_kwargs['sample_weight'] = weight
valid_fold_num += 1
total_val_loss += val_loss_i
if train_loss != False:
if total_train_loss != 0: total_train_loss += train_loss_i
else: total_train_loss = train_loss_i
if train_loss is not False:
if total_train_loss != 0:
total_train_loss += train_loss_i
else:
total_train_loss = train_loss_i
train_time += train_time_i
if valid_fold_num == n:
val_loss_list.append(total_val_loss/valid_fold_num)
val_loss_list.append(total_val_loss / valid_fold_num)
total_val_loss = valid_fold_num = 0
elif time.time() - start_time >= budget:
val_loss_list.append(total_val_loss/valid_fold_num)
val_loss_list.append(total_val_loss / valid_fold_num)
break
val_loss = np.max(val_loss_list)
if train_loss != False: train_loss = total_train_loss/n
if train_loss is not False:
train_loss = total_train_loss / n
budget -= time.time() - start_time
if val_loss < best_val_loss and budget > budget_per_train:
estimator.cleanup()
@ -232,15 +246,17 @@ def evaluate_model_CV(estimator, X_train_all, y_train_all, budget, kf,
return val_loss, train_loss, train_time
def compute_estimator(X_train, y_train, X_val, y_val, weight_val, budget, kf,
config_dic, task, estimator_name, eval_method, eval_metric,
best_val_loss = np.Inf, n_jobs=1, estimator_class=None, train_loss=False,
fit_kwargs = {}):
def compute_estimator(
X_train, y_train, X_val, y_val, weight_val, budget, kf,
config_dic, task, estimator_name, eval_method, eval_metric,
best_val_loss=np.Inf, n_jobs=1, estimator_class=None, train_loss=False,
fit_kwargs={}
):
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(
task, estimator_name)
estimator = estimator_class(
**config_dic, task = task, n_jobs=n_jobs)
**config_dic, task=task, n_jobs=n_jobs)
val_loss, train_loss, train_time = evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task,
eval_method, eval_metric, best_val_loss, train_loss=train_loss,
@ -249,16 +265,17 @@ def compute_estimator(X_train, y_train, X_val, y_val, weight_val, budget, kf,
return estimator, val_loss, train_loss, train_time, all_time
def train_estimator(X_train, y_train, config_dic, task,
estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}):
def train_estimator(
X_train, y_train, config_dic, task,
estimator_name, n_jobs=1, estimator_class=None, budget=None, fit_kwargs={}
):
start_time = time.time()
estimator_class = estimator_class or get_estimator_class(task,
estimator_name)
estimator = estimator_class(**config_dic, task = task,
n_jobs=n_jobs)
estimator_class = estimator_class or get_estimator_class(
task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if X_train is not None:
train_time = train_model(estimator, X_train, y_train, budget,
fit_kwargs)
train_time = train_model(
estimator, X_train, y_train, budget, fit_kwargs)
else:
estimator = estimator.estimator_class(**estimator.params)
train_time = time.time() - start_time

View File

@ -1,6 +1,6 @@
'''!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
* Licensed under the MIT License.
'''
import numpy as np
@ -24,13 +24,13 @@ class BaseEstimator:
Typical example:
XGBoostEstimator: for regression
XGBoostSklearnEstimator: for classification
LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
for both regression and classification
LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
for both regression and classification
'''
def __init__(self, task = 'binary:logistic', **params):
def __init__(self, task='binary:logistic', **params):
'''Constructor
Args:
task: A string of the task type, one of
'binary:logistic', 'multi:softmax', 'regression'
@ -43,8 +43,8 @@ class BaseEstimator:
if '_estimator_type' in params:
self._estimator_type = params['_estimator_type']
else:
self._estimator_type = "regressor" if task=='regression' \
else "classifier"
self._estimator_type = "regressor" if task == 'regression' \
else "classifier"
def get_params(self, deep=False):
params = self.params.copy()
@ -58,7 +58,7 @@ class BaseEstimator:
return self._model.classes_
@property
def n_features_in_(self):
def n_features_in_(self):
return self.model.n_features_in_
@property
@ -70,7 +70,7 @@ class BaseEstimator:
def _preprocess(self, X):
return X
def _fit(self, X_train, y_train, **kwargs):
def _fit(self, X_train, y_train, **kwargs):
current_time = time.time()
X_train = self._preprocess(X_train)
@ -82,7 +82,7 @@ class BaseEstimator:
def fit(self, X_train, y_train, budget=None, **kwargs):
'''Train the model from given training data
Args:
X_train: A numpy array of training data in shape n*m
y_train: A numpy array of labels in shape n*1
@ -95,14 +95,14 @@ class BaseEstimator:
def predict(self, X_test):
'''Predict label from features
Args:
X_test: A numpy array of featurized instances, shape n*m
Returns:
A numpy array of shape n*1.
A numpy array of shape n*1.
Each element is the label for an instance
'''
'''
X_test = self._preprocess(X_test)
return self._model.predict(X_test)
@ -127,25 +127,26 @@ class BaseEstimator:
X_test = self._preprocess(X_test)
return self._model.predict_proba(X_test)
def cleanup(self): pass
def cleanup(self):
pass
@classmethod
def search_space(cls, **params):
def search_space(cls, **params):
'''[required method] search space
Returns:
A dictionary of the search space.
A dictionary of the search space.
Each key is the name of a hyperparameter, and the value is a dict with
its domain and optional init_value and cat_hp_cost,
e.g.,
its domain and optional init_value and cat_hp_cost,
e.g.,
{'domain': tune.randint(lower=1, upper=10), 'init_value': 1}
'''
return {}
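As a sketch of how a customized learner might override this method (the class name, hyperparameter names, and bounds below are illustrative, not part of the library):
from flaml import tune  # assumed import; the built-in spaces above use the same tune domains
from flaml.model import BaseEstimator  # assumed import path for the class defined above

class MyLearner(BaseEstimator):
    @classmethod
    def search_space(cls, data_size, **params):
        # one integer and one float hyperparameter; names and bounds are illustrative
        return {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=int(data_size), q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
                'init_value': 0.1,
            },
        }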
@classmethod
def size(cls, config):
def size(cls, config):
'''[optional method] memory size of the estimator in bytes
Args:
config - the dict of the hyperparameter config
@ -163,7 +164,6 @@ class BaseEstimator:
class SKLearnEstimator(BaseEstimator):
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
X = X.copy()
@ -174,63 +174,64 @@ class SKLearnEstimator(BaseEstimator):
class LGBMEstimator(BaseEstimator):
@classmethod
def search_space(cls, data_size, **params):
upper = min(32768,int(data_size))
def search_space(cls, data_size, **params):
upper = min(32768, int(data_size))
return {
'n_estimators': {
'domain': tune.qloguniform(lower=4, upper=upper, q=1),
'init_value': 4,
'low_cost_init_value': 4,
},
'max_leaves': {
'num_leaves': {
'domain': tune.qloguniform(lower=4, upper=upper, q=1),
'init_value': 4,
'low_cost_init_value': 4,
},
'min_data_in_leaf': {
'min_child_samples': {
'domain': tune.qloguniform(lower=2, upper=2**7, q=1),
'init_value': 20,
},
'learning_rate': {
'domain': tune.loguniform(lower=1/1024, upper=1.0),
'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
'init_value': 0.1,
},
'subsample': {
'domain': tune.uniform(lower=0.1, upper=1.0),
'init_value': 1.0,
},
},
'log_max_bin': {
'domain': tune.qloguniform(lower=3, upper=10, q=1),
'init_value': 8,
},
},
'colsample_bytree': {
'domain': tune.uniform(lower=0.01, upper=1.0),
'init_value': 1.0,
},
},
'reg_alpha': {
'domain': tune.loguniform(lower=1/1024, upper=1024),
'init_value': 1/1024,
},
'domain': tune.loguniform(lower=1 / 1024, upper=1024),
'init_value': 1 / 1024,
},
'reg_lambda': {
'domain': tune.loguniform(lower=1/1024, upper=1024),
'domain': tune.loguniform(lower=1 / 1024, upper=1024),
'init_value': 1.0,
},
},
}
@classmethod
def size(cls, config):
max_leaves = int(round(config['max_leaves']))
num_leaves = int(round(config.get('num_leaves') or config['max_leaves']))
n_estimators = int(round(config['n_estimators']))
return (max_leaves*3 + (max_leaves-1)*4 + 1.0)*n_estimators*8
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
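For a rough sense of scale, an illustrative config (numbers not from the source):
num_leaves, n_estimators = 128, 100
size_bytes = (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8  # 714,400 bytes, ~0.7 MB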
def __init__(self, task='binary:logistic', n_jobs=1,
n_estimators=2, max_leaves=2, min_data_in_leaf=20, learning_rate=0.1,
subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
colsample_bytree=1.0, log_max_bin=8, **params):
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
colsample_bytree=1.0, log_max_bin=8, **params
):
super().__init__(task, **params)
# Default: regression for LGBMRegressor,
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
if 'regression' in task:
objective = 'regression'
@ -238,21 +239,22 @@ class LGBMEstimator(BaseEstimator):
objective = 'binary'
elif 'multi' in task:
objective = 'multiclass'
else: objective = 'regression'
else:
objective = 'regression'
self.params = {
"n_estimators": int(round(n_estimators)),
"max_leaves": int(round(max_leaves)),
"num_leaves": int(round(num_leaves)),
'objective': params.get("objective", objective),
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_data_in_leaf': int(round(min_data_in_leaf)),
'colsample_bytree':float(colsample_bytree),
'min_child_samples': int(round(min_child_samples)),
'colsample_bytree': float(colsample_bytree),
'subsample': float(subsample),
}
self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
1<<int(round(log_max_bin)))-1
1 << int(round(log_max_bin))) - 1
if 'regression' in task:
self.estimator_class = LGBMRegressor
else:
@ -261,33 +263,35 @@ class LGBMEstimator(BaseEstimator):
self._train_size = 0
def _preprocess(self, X):
if not isinstance(X, pd.DataFrame) and issparse(
X) and np.issubdtype(X.dtype, np.integer):
if not isinstance(X, pd.DataFrame) and issparse(X) and np.issubdtype(
X.dtype, np.integer):
X = X.astype(float)
return X
def fit(self, X_train, y_train, budget=None, **kwargs):
start_time = time.time()
n_iter = self.params["n_estimators"]
if (not self._time_per_iter or
abs(self._train_size-X_train.shape[0])>4) and budget is not None:
if (not self._time_per_iter or abs(
self._train_size - X_train.shape[0]) > 4) and budget is not None:
self.params["n_estimators"] = 1
self._t1 = self._fit(X_train, y_train, **kwargs)
if self._t1 >= budget:
if self._t1 >= budget:
self.params["n_estimators"] = n_iter
return self._t1
self.params["n_estimators"] = 4
self._t2 = self._fit(X_train, y_train, **kwargs)
self._time_per_iter = (self._t2 - self._t1)/(
self.params["n_estimators"]-1) if self._t2 > self._t1 \
self._time_per_iter = (self._t2 - self._t1) / (
self.params["n_estimators"] - 1) if self._t2 > self._t1 \
else self._t1 if self._t1 else 0.001
self._train_size = X_train.shape[0]
if self._t1+self._t2>=budget or n_iter==self.params["n_estimators"]:
if self._t1 + self._t2 >= budget or n_iter == self.params[
"n_estimators"]:
self.params["n_estimators"] = n_iter
return time.time() - start_time
if budget is not None:
self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
start_time-self._t1)/self._time_per_iter+1))
self.params["n_estimators"] = min(n_iter, int(
(budget - time.time() + start_time - self._t1)
/ self._time_per_iter + 1))
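# Illustrative numbers (hypothetical): with budget=60s, 10s already elapsed,
# _t1=1s and _time_per_iter=0.5s, this caps n_estimators at
# int((60 - 10 - 1) / 0.5 + 1) = 99 rounds, never exceeding the original n_iter.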
if self.params["n_estimators"] > 0:
self._fit(X_train, y_train, **kwargs)
self.params["n_estimators"] = n_iter
@ -298,10 +302,9 @@ class LGBMEstimator(BaseEstimator):
class XGBoostEstimator(SKLearnEstimator):
''' not using sklearn API, used for regression '''
@classmethod
def search_space(cls, data_size, **params):
upper = min(32768,int(data_size))
def search_space(cls, data_size, **params):
upper = min(32768, int(data_size))
return {
'n_estimators': {
'domain': tune.qloguniform(lower=4, upper=upper, q=1),
@ -318,31 +321,31 @@ class XGBoostEstimator(SKLearnEstimator):
'init_value': 1,
},
'learning_rate': {
'domain': tune.loguniform(lower=1/1024, upper=1.0),
'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
'init_value': 0.1,
},
'subsample': {
'domain': tune.uniform(lower=0.1, upper=1.0),
'init_value': 1.0,
},
},
'colsample_bylevel': {
'domain': tune.uniform(lower=0.01, upper=1.0),
'init_value': 1.0,
},
},
'colsample_bytree': {
'domain': tune.uniform(lower=0.01, upper=1.0),
'init_value': 1.0,
},
},
'reg_alpha': {
'domain': tune.loguniform(lower=1/1024, upper=1024),
'init_value': 1/1024,
},
'domain': tune.loguniform(lower=1 / 1024, upper=1024),
'init_value': 1 / 1024,
},
'reg_lambda': {
'domain': tune.loguniform(lower=1/1024, upper=1024),
'domain': tune.loguniform(lower=1 / 1024, upper=1024),
'init_value': 1.0,
},
},
}
@classmethod
def size(cls, config):
return LGBMEstimator.size(config)
@ -351,10 +354,12 @@ class XGBoostEstimator(SKLearnEstimator):
def cost_relative2lgbm(cls):
return 1.6
def __init__(self, task='regression', all_thread=False, n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1,
def __init__(
self, task='regression', all_thread=False, n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1,
learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
colsample_bytree=1.0, tree_method='auto', **params):
colsample_bytree=1.0, tree_method='auto', **params
):
super().__init__(task, **params)
self._n_estimators = int(round(n_estimators))
self._max_leaves = int(round(max_leaves))
@ -372,8 +377,8 @@ class XGBoostEstimator(SKLearnEstimator):
'min_child_weight': float(min_child_weight),
'booster': params.get('booster', 'gbtree'),
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree':float(colsample_bytree),
}
'colsample_bytree': float(colsample_bytree),
}
if all_thread:
del self.params['nthread']
@ -383,7 +388,7 @@ class XGBoostEstimator(SKLearnEstimator):
return params
def fit(self, X_train, y_train, budget=None, **kwargs):
start_time = time.time()
start_time = time.time()
if not issparse(X_train):
self.params['tree_method'] = 'hist'
X_train = self._preprocess(X_train)
@ -392,8 +397,8 @@ class XGBoostEstimator(SKLearnEstimator):
'sample_weight'])
else:
dtrain = xgb.DMatrix(X_train, label=y_train)
if self._max_leaves>0:
if self._max_leaves > 0:
self._model = xgb.train(self.params, dtrain, self._n_estimators)
del dtrain
train_time = time.time() - start_time
@ -411,37 +416,38 @@ class XGBoostEstimator(SKLearnEstimator):
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
''' using sklearn API, used for classification '''
@classmethod
def search_space(cls, data_size, **params):
def search_space(cls, data_size, **params):
return XGBoostEstimator.search_space(data_size)
@classmethod
def cost_relative2lgbm(cls):
return XGBoostEstimator.cost_relative2lgbm()
def __init__(self, task='binary:logistic', n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0,
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=4, max_leaves=4, subsample=1.0,
min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
**params):
colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
**params
):
super().__init__(task, **params)
self.params = {
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
'grow_policy': params.get("grow_policy", 'lossguide'),
'tree_method': tree_method,
'verbosity': 0,
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'subsample': float(subsample),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_weight': float(min_child_weight),
'booster': params.get('booster', 'gbtree'),
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
'grow_policy': params.get("grow_policy", 'lossguide'),
'tree_method': tree_method,
'verbosity': 0,
'n_jobs': n_jobs,
'learning_rate': float(learning_rate),
'subsample': float(subsample),
'reg_alpha': float(reg_alpha),
'reg_lambda': float(reg_lambda),
'min_child_weight': float(min_child_weight),
'booster': params.get('booster', 'gbtree'),
'colsample_bylevel': float(colsample_bylevel),
'colsample_bytree': float(colsample_bytree),
}
if 'regression' in task:
@ -455,13 +461,12 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
if issparse(X_train):
self.params['tree_method'] = 'auto'
return super().fit(X_train, y_train, budget, **kwargs)
class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
@classmethod
def search_space(cls, data_size, task, **params):
def search_space(cls, data_size, task, **params):
upper = min(2048, int(data_size))
space = {
'n_estimators': {
@ -489,13 +494,15 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
def cost_relative2lgbm(cls):
return 2.0
def __init__(self, task = 'binary:logistic', n_jobs = 1,
n_estimators = 4, max_features = 1.0, criterion = 'gini', **params):
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=4, max_features=1.0, criterion='gini', **params
):
super().__init__(task, **params)
self.params = {
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
'max_features': float(max_features),
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
'max_features': float(max_features),
}
if 'regression' in task:
self.estimator_class = RandomForestRegressor
@ -512,12 +519,11 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
class ExtraTreeEstimator(RandomForestEstimator):
@classmethod
def cost_relative2lgbm(cls):
return 1.9
def __init__(self, task = 'binary:logistic', **params):
def __init__(self, task='binary:logistic', **params):
super().__init__(task, **params)
if 'regression' in task:
self.estimator_class = ExtraTreesRegressor
@ -527,9 +533,8 @@ class ExtraTreeEstimator(RandomForestEstimator):
class LRL1Classifier(SKLearnEstimator):
@classmethod
def search_space(cls, **params):
def search_space(cls, **params):
return {
'C': {
'domain': tune.loguniform(lower=0.03125, upper=32768.0),
@ -541,8 +546,10 @@ class LRL1Classifier(SKLearnEstimator):
def cost_relative2lgbm(cls):
return 160
def __init__(self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
**params):
def __init__(
self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
**params
):
super().__init__(task, **params)
self.params = {
'penalty': params.get("penalty", 'l1'),
@ -553,25 +560,25 @@ class LRL1Classifier(SKLearnEstimator):
}
if 'regression' in task:
self.estimator_class = None
print('LR does not support regression task')
raise NotImplementedError
raise NotImplementedError('LR does not support regression task')
else:
self.estimator_class = LogisticRegression
class LRL2Classifier(SKLearnEstimator):
@classmethod
def search_space(cls, **params):
def search_space(cls, **params):
return LRL1Classifier.search_space(**params)
@classmethod
def cost_relative2lgbm(cls):
return 25
def __init__(self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
**params):
def __init__(
self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
**params
):
super().__init__(task, **params)
self.params = {
'penalty': params.get("penalty", 'l2'),
@ -582,21 +589,19 @@ class LRL2Classifier(SKLearnEstimator):
}
if 'regression' in task:
self.estimator_class = None
print('LR does not support regression task')
raise NotImplementedError
raise NotImplementedError('LR does not support regression task')
else:
self.estimator_class = LogisticRegression
class CatBoostEstimator(BaseEstimator):
_time_per_iter = None
_train_size = 0
@classmethod
def search_space(cls, data_size, **params):
upper = max(min(round(1500000/data_size),150), 11)
def search_space(cls, data_size, **params):
upper = max(min(round(1500000 / data_size), 150), 11)
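# The upper bound shrinks with data size but stays within [11, 150]:
# e.g., data_size=10,000 -> 150; data_size=100,000 -> 15;
# data_size=1,000,000 -> round(1.5)=2, clipped up to 11.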
return {
'early_stopping_rounds': {
'domain': tune.qloguniform(lower=10, upper=upper, q=1),
@ -613,18 +618,20 @@ class CatBoostEstimator(BaseEstimator):
def size(cls, config):
n_estimators = 8192
max_leaves = 64
return (max_leaves*3 + (max_leaves-1)*4 + 1.0)*n_estimators*8
return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8
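# Presumably a conservative worst-case estimate: tree count and leaf count are
# not searched for CatBoost, so the fixed defaults (8192 trees, 64 leaves) are
# plugged into the same bytes-per-tree formula used by LGBMEstimator.size.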
@classmethod
def cost_relative2lgbm(cls):
return 15
def __init__(self, task = 'binary:logistic', n_jobs=1,
n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params):
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
):
super().__init__(task, **params)
self.params = {
"early_stopping_rounds": int(round(early_stopping_rounds)),
"n_estimators": n_estimators,
"n_estimators": n_estimators,
'learning_rate': learning_rate,
'thread_count': n_jobs,
'verbose': params.get('verbose', False),
@ -650,65 +657,69 @@ class CatBoostEstimator(BaseEstimator):
include='category').columns)
else:
cat_features = []
if (not CatBoostEstimator._time_per_iter or
abs(CatBoostEstimator._train_size-len(y_train))>4) and budget:
if (not CatBoostEstimator._time_per_iter or abs(
CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel.fit(X_train, y_train,
cat_features=cat_features, **kwargs)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._t1 = time.time() - start_time
if CatBoostEstimator._t1 >= budget:
if CatBoostEstimator._t1 >= budget:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
return CatBoostEstimator._t1
self.params["n_estimators"] = 4
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel.fit(X_train, y_train,
cat_features=cat_features, **kwargs)
CatBoostEstimator._time_per_iter = (time.time() - start_time -
CatBoostEstimator._t1)/(self.params["n_estimators"]-1)
if CatBoostEstimator._time_per_iter <= 0:
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._time_per_iter = (
time.time() - start_time - CatBoostEstimator._t1) / (
self.params["n_estimators"] - 1)
if CatBoostEstimator._time_per_iter <= 0:
CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
CatBoostEstimator._train_size = len(y_train)
if time.time()-start_time>=budget or n_iter==self.params[
"n_estimators"]:
if time.time() - start_time >= budget or n_iter == self.params[
"n_estimators"]:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
return time.time()-start_time
return time.time() - start_time
if budget:
train_times = 1
self.params["n_estimators"] = min(n_iter, int((budget-time.time()+
start_time-CatBoostEstimator._t1)/train_times/
CatBoostEstimator._time_per_iter+1))
train_times = 1
self.params["n_estimators"] = min(n_iter, int(
(budget - time.time() + start_time - CatBoostEstimator._t1)
/ train_times / CatBoostEstimator._time_per_iter + 1))
self._model = CatBoostEstimator._smallmodel
if self.params["n_estimators"] > 0:
l = max(int(len(y_train)*0.9), len(y_train)-1000)
X_tr, y_tr = X_train[:l], y_train[:l]
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
X_tr, y_tr = X_train[:n], y_train[:n]
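# The last min(10%, 1000) rows are held out as the eval_set for early
# stopping, e.g., 100,000 training rows -> n=99,000 and a 1,000-row eval set.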
if 'sample_weight' in kwargs:
weight = kwargs['sample_weight']
if weight is not None: kwargs['sample_weight'] = weight[:l]
else: weight = None
if weight is not None:
kwargs['sample_weight'] = weight[:n]
else:
weight = None
from catboost import Pool
model = self.estimator_class(**self.params)
model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=Pool(
data=X_train[l:], label=y_train[l:], cat_features=cat_features),
**kwargs)
if weight is not None: kwargs['sample_weight'] = weight
# print(self.params["n_estimators"], model.get_best_iteration())
model.fit(
X_tr, y_tr, cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:],
cat_features=cat_features),
**kwargs) # model.get_best_iteration()
if weight is not None:
kwargs['sample_weight'] = weight
self._model = model
self.params["n_estimators"] = n_iter
train_time = time.time() - start_time
# print(budget, train_time)
return train_time
class KNeighborsEstimator(BaseEstimator):
@classmethod
def search_space(cls, data_size, **params):
upper = min(512, int(data_size/2))
def search_space(cls, data_size, **params):
upper = min(512, int(data_size / 2))
return {
'n_neighbors': {
'domain': tune.qloguniform(lower=1, upper=upper, q=1),
@ -721,10 +732,11 @@ class KNeighborsEstimator(BaseEstimator):
def cost_relative2lgbm(cls):
return 30
def __init__(self, task='binary:logistic', n_jobs=1,
n_neighbors=5, **params):
def __init__(
self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
):
super().__init__(task, **params)
self.params= {
self.params = {
'n_neighbors': int(round(n_neighbors)),
'weights': params.get('weights', 'distance'),
'n_jobs': n_jobs,
@ -739,10 +751,8 @@ class KNeighborsEstimator(BaseEstimator):
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
cat_columns = X.select_dtypes(['category']).columns
# print(X.dtypes)
# print(cat_columns)
if X.shape[1] == len(cat_columns):
raise ValueError(
"kneighbor requires at least one numeric feature")
X = X.drop(cat_columns, axis=1)
"kneighbor requires at least one numeric feature")
X = X.drop(cat_columns, axis=1)
return X

View File

@ -1,2 +1,2 @@
from .blendsearch import CFO, BlendSearch, BlendSearchTuner
from .flow2 import FLOW2
from .flow2 import FLOW2

View File

@ -3,7 +3,7 @@
* Licensed under the MIT License. See LICENSE file in the
* project root for license information.
'''
from typing import Dict, Optional, List, Tuple
from typing import Dict, Optional, List, Tuple, Callable
import numpy as np
import time
import pickle
@ -26,7 +26,7 @@ class BlendSearch(Searcher):
'''class for BlendSearch algorithm
'''
cost_attr = "time_total_s" # cost attribute in result
cost_attr = "time_total_s" # cost attribute in result
def __init__(self,
metric: Optional[str] = None,
@ -41,7 +41,7 @@ class BlendSearch(Searcher):
reduction_factor: Optional[float] = None,
resources_per_trial: Optional[dict] = None,
global_search_alg: Optional[Searcher] = None,
mem_size = None,
mem_size: Callable[[dict], float] = None,
seed: Optional[int] = 20):
'''Constructor
@ -50,35 +50,35 @@ class BlendSearch(Searcher):
mode: A string in ['min', 'max'] to specify the objective as
minimization or maximization.
space: A dictionary to specify the search space.
points_to_evaluate: Initial parameter suggestions to be run first.
low_cost_partial_config: A dictionary from a subset of
points_to_evaluate: Initial parameter suggestions to be run first.
low_cost_partial_config: A dictionary from a subset of
controlled dimensions to the initial low-cost values.
e.g.,
e.g.,
.. code-block:: python
{'n_estimators': 4, 'max_leaves': 4}
cat_hp_cost: A dictionary from a subset of categorical dimensions
to the relative cost of each choice.
to the relative cost of each choice.
e.g.,
.. code-block:: python
{'tree_method': [1, 1, 2]}
i.e., the relative cost of the
i.e., the relative cost of the
three choices of 'tree_method' is 1, 1 and 2 respectively.
prune_attr: A string of the attribute used for pruning.
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
When prune_attr is in space, it is a hyperparameter, e.g.,
'n_iters', and the best value is unknown.
When prune_attr is not in space, it is a resource dimension,
When prune_attr is not in space, it is a resource dimension,
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
min_resource: A float of the minimal resource to use for the
min_resource: A float of the minimal resource to use for the
prune_attr; only valid if prune_attr is not in space.
max_resource: A float of the maximal resource to use for the
max_resource: A float of the maximal resource to use for the
prune_attr; only valid if prune_attr is not in space.
reduction_factor: A float of the reduction factor used for
incremental pruning.
@ -101,14 +101,15 @@ class BlendSearch(Searcher):
self._gs = GlobalSearch(space=space, metric=metric, mode=mode)
else:
self._gs = None
self._ls = LocalSearch(init_config, metric, mode, cat_hp_cost, space,
prune_attr, min_resource, max_resource, reduction_factor, seed)
self._ls = LocalSearch(
init_config, metric, mode, cat_hp_cost, space,
prune_attr, min_resource, max_resource, reduction_factor, seed)
self._resources_per_trial = resources_per_trial
self._mem_size = mem_size
self._mem_threshold = resources_per_trial.get(
'mem') if resources_per_trial else None
self._init_search()
def set_search_properties(self,
metric: Optional[str] = None,
mode: Optional[str] = None,
@ -119,8 +120,10 @@ class BlendSearch(Searcher):
if 'metric_target' in config:
self._metric_target = config.get('metric_target')
else:
if metric: self._metric = metric
if mode: self._mode = mode
if metric:
self._metric = metric
if mode:
self._mode = mode
self._ls.set_search_properties(metric, mode, config)
if self._gs is not None:
self._gs.set_search_properties(metric, mode, config)
@ -134,22 +137,22 @@ class BlendSearch(Searcher):
self._search_thread_pool = {
# id: int -> thread: SearchThread
0: SearchThread(self._ls.mode, self._gs)
}
self._thread_count = 1 # total # threads created
}
self._thread_count = 1 # total # threads created
self._init_used = self._ls.init_config is None
self._trial_proposed_by = {} # trial_id: str -> thread_id: int
self._trial_proposed_by = {} # trial_id: str -> thread_id: int
self._ls_bound_min = self._ls.normalize(self._ls.init_config)
self._ls_bound_max = self._ls_bound_min.copy()
self._gs_admissible_min = self._ls_bound_min.copy()
self._gs_admissible_max = self._ls_bound_max.copy()
self._result = {} # config_signature: tuple -> result: Dict
self._result = {} # config_signature: tuple -> result: Dict
self._deadline = np.inf
def save(self, checkpoint_path: str):
save_object = self
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(save_object, outputFile)
def restore(self, checkpoint_path: str):
with open(checkpoint_path, "rb") as inputFile:
state = pickle.load(inputFile)
@ -180,24 +183,23 @@ class BlendSearch(Searcher):
''' search thread updater and cleaner
'''
thread_id = self._trial_proposed_by.get(trial_id)
if thread_id in self._search_thread_pool:
if thread_id in self._search_thread_pool:
self._search_thread_pool[thread_id].on_trial_complete(
trial_id, result, error)
trial_id, result, error)
del self._trial_proposed_by[trial_id]
# if not thread_id: logger.info(f"result {result}")
if result:
config = {}
for key, value in result.items():
if key.startswith('config/'):
config[key[7:]] = value
if error: # remove from result cache
if error: # remove from result cache
del self._result[self._ls.config_signature(config)]
else: # add to result cache
else: # add to result cache
self._result[self._ls.config_signature(config)] = result
# update target metric if improved
if (result[self._metric]-self._metric_target)*self._ls.metric_op<0:
if (result[self._metric] - self._metric_target) * self._ls.metric_op < 0:
self._metric_target = result[self._metric]
if not thread_id and self._create_condition(result):
if not thread_id and self._create_condition(result):
# thread creator
self._search_thread_pool[self._thread_count] = SearchThread(
self._ls.mode,
@ -206,14 +208,12 @@ class BlendSearch(Searcher):
)
thread_id = self._thread_count
self._thread_count += 1
self._update_admissible_region(config, self._ls_bound_min,
self._ls_bound_max)
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max)
# reset admissible region to ls bounding box
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
# cleaner
# logger.info(f"thread {thread_id} in search thread pool="
# f"{thread_id in self._search_thread_pool}")
if thread_id and thread_id in self._search_thread_pool:
# local search thread
self._clean(thread_id)
@ -231,9 +231,11 @@ class BlendSearch(Searcher):
def _create_condition(self, result: Dict) -> bool:
''' create thread condition
'''
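# Sketch of the condition below: with fewer than two threads, always create;
# otherwise a new local search thread is spawned only when the reported
# (sign-adjusted) objective beats the median of the best objectives found by
# the existing local search threads.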
if len(self._search_thread_pool) < 2: return True
obj_median = np.median([thread.obj_best1 for id, thread in
self._search_thread_pool.items() if id])
if len(self._search_thread_pool) < 2:
return True
obj_median = np.median(
[thread.obj_best1 for id, thread in self._search_thread_pool.items()
if id])
return result[self._metric] * self._ls.metric_op < obj_median
def _clean(self, thread_id: int):
@ -243,21 +245,19 @@ class BlendSearch(Searcher):
assert thread_id
todelete = set()
for id in self._search_thread_pool:
if id and id!=thread_id:
if id and id != thread_id:
if self._inferior(id, thread_id):
todelete.add(id)
for id in self._search_thread_pool:
if id and id!=thread_id:
if id and id != thread_id:
if self._inferior(thread_id, id):
todelete.add(thread_id)
break
# logger.info(f"thead {thread_id}.converged="
# f"{self._search_thread_pool[thread_id].converged}")
break
if self._search_thread_pool[thread_id].converged:
todelete.add(thread_id)
for key in self._ls_bound_max:
self._ls_bound_max[key] += self._ls.STEPSIZE
self._ls_bound_min[key] -= self._ls.STEPSIZE
self._ls_bound_min[key] -= self._ls.STEPSIZE
for id in todelete:
del self._search_thread_pool[id]
@ -266,15 +266,20 @@ class BlendSearch(Searcher):
'''
t1 = self._search_thread_pool[id1]
t2 = self._search_thread_pool[id2]
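# Reading of the checks below: thread id1 is not considered inferior if its
# best objective already beats id2's second best, or if it runs at a lower
# resource; it is inferior only when id2's incumbent can "reach" id1's
# incumbent (see FLOW2.reach).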
if t1.obj_best1 < t2.obj_best2: return False
elif t1.resource and t1.resource < t2.resource: return False
elif t2.reach(t1): return True
else: return False
if t1.obj_best1 < t2.obj_best2:
return False
elif t1.resource and t1.resource < t2.resource:
return False
elif t2.reach(t1):
return True
return False
def on_trial_result(self, trial_id: str, result: Dict):
if trial_id not in self._trial_proposed_by: return
if trial_id not in self._trial_proposed_by:
return
thread_id = self._trial_proposed_by[trial_id]
if not thread_id in self._search_thread_pool: return
if thread_id not in self._search_thread_pool:
return
self._search_thread_pool[thread_id].on_trial_result(trial_id, result)
def suggest(self, trial_id: str) -> Optional[Dict]:
@ -282,92 +287,85 @@ class BlendSearch(Searcher):
'''
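# High-level flow, roughly: until the init config(s) have been tried, propose
# a completed init config (thread 0); afterwards pick the highest-priority
# thread, fall back to random search or the backup thread when its proposal
# should be skipped, and track admissible regions so global-search proposals
# stay near the local-search bounding box.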
if self._init_used and not self._points_to_evaluate:
choice, backup = self._select_thread()
# print(f"choice={choice}, backup={backup}")
if choice < 0: return None # timeout
if choice < 0: # timeout
return None
self._use_rs = False
config = self._search_thread_pool[choice].suggest(trial_id)
# preliminary check; does not validate the config itself
skip = self._should_skip(choice, trial_id, config)
if skip:
if choice:
# print(f"skipping choice={choice}, config={config}")
if choice:
return None
# use rs when BO fails to suggest a config
self._use_rs = True
for _, generated in generate_variants(
{'config': self._ls.space}):
for _, generated in generate_variants({'config': self._ls.space}):
config = generated['config']
break # get one random config
# logger.debug(f"random config {config}")
break # get one random config
skip = self._should_skip(choice, trial_id, config)
if skip: return None
if choice or self._valid(config):
if skip:
return None
if choice or self._valid(config):
# LS or valid or no backup choice
self._trial_proposed_by[trial_id] = choice
if not choice: print(config)
else: # invalid config proposed by GS
# if not self._use_rs:
# self._search_thread_pool[choice].on_trial_complete(
# trial_id, {}, error=True) # tell GS there is an error
else: # invalid config proposed by GS
self._use_rs = False
if choice == backup:
# use CFO's init point
init_config = self._ls.init_config
config = self._ls.complete_config(init_config,
self._ls_bound_min, self._ls_bound_max)
config = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max)
self._trial_proposed_by[trial_id] = choice
else:
config = self._search_thread_pool[backup].suggest(trial_id)
skip = self._should_skip(backup, trial_id, config)
if skip:
if skip:
return None
self._trial_proposed_by[trial_id] = backup
choice = backup
if not choice: # global search
if self._ls._resource:
# TODO: add resource to config proposed by GS, min or median?
if not choice: # global search
if self._ls._resource:
# TODO: min or median?
config[self._ls.prune_attr] = self._ls.min_resource
# temporarily relax admissible region for parallel proposals
self._update_admissible_region(config, self._gs_admissible_min,
self._gs_admissible_max)
self._update_admissible_region(
config, self._gs_admissible_min, self._gs_admissible_max)
else:
self._update_admissible_region(config, self._ls_bound_min,
self._ls_bound_max)
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max)
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
self._result[self._ls.config_signature(config)] = {}
else: # use init config
# print("use init config")
else: # use init config
init_config = self._points_to_evaluate.pop(
0) if self._points_to_evaluate else self._ls.init_config
config = self._ls.complete_config(init_config,
self._ls_bound_min, self._ls_bound_max)
# logger.info(f"reset config to {config}")
config = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max)
config_signature = self._ls.config_signature(config)
result = self._result.get(config_signature)
if result: # tried before
# self.on_trial_complete(trial_id, result)
if result: # tried before
return None
elif result is None: # not tried before
elif result is None: # not tried before
self._result[config_signature] = {}
else: return None # running but no result yet
else: # running but no result yet
return None
self._init_used = True
self._trial_proposed_by[trial_id] = 0
# logger.info(f"config={config}")
return config
def _should_skip(self, choice, trial_id, config) -> bool:
''' if config is None or config's result is known or above mem threshold
return True; otherwise return False
'''
if config is None: return True
if config is None:
return True
config_signature = self._ls.config_signature(config)
exists = config_signature in self._result
# check mem constraint
if not exists and self._mem_threshold and self._mem_size(
config)>self._mem_threshold:
config) > self._mem_threshold:
self._result[config_signature] = {
self._metric:np.inf*self._ls.metric_op, 'time_total_s':1}
self._metric: np.inf * self._ls.metric_op, 'time_total_s': 1
}
exists = True
if exists:
if not self._use_rs:
@ -381,7 +379,7 @@ class BlendSearch(Searcher):
# else:
# # tell the thread there is an error
# self._search_thread_pool[choice].on_trial_complete(
# trial_id, {}, error=True)
# trial_id, {}, error=True)
return True
return False
@ -390,19 +388,21 @@ class BlendSearch(Searcher):
'''
# update priority
min_eci = self._deadline - time.time()
if min_eci <= 0: return -1, -1
if min_eci <= 0:
return -1, -1
max_speed = 0
for thread in self._search_thread_pool.values():
if thread.speed > max_speed: max_speed = thread.speed
for thread in self._search_thread_pool.values():
for thread in self._search_thread_pool.values():
if thread.speed > max_speed:
max_speed = thread.speed
for thread in self._search_thread_pool.values():
thread.update_eci(self._metric_target, max_speed)
if thread.eci < min_eci: min_eci = thread.eci
if thread.eci < min_eci:
min_eci = thread.eci
for thread in self._search_thread_pool.values():
thread.update_priority(min_eci)
top_thread_id = backup_thread_id = 0
priority1 = priority2 = self._search_thread_pool[0].priority
# print(f"priority of thread 0={priority1}, obj_best1={self._search_thread_pool[0].obj_best1}")
for thread_id, thread in self._search_thread_pool.items():
# if thread_id:
# print(
@ -411,7 +411,7 @@ class BlendSearch(Searcher):
# f"thread {thread_id}.can_suggest={thread.can_suggest}")
if thread_id and thread.can_suggest:
priority = thread.priority
if priority > priority1:
if priority > priority1:
priority1 = priority
top_thread_id = thread_id
if priority > priority2 or backup_thread_id == 0:
@ -426,30 +426,29 @@ class BlendSearch(Searcher):
for key in self._gs_admissible_min:
if key in config:
value = normalized_config[key]
# logger.info(
# f"{key},{value},{self._admissible_min[key]},{self._admissible_max[key]}")
if value+self._ls.STEPSIZE<self._gs_admissible_min[
key] or value>self._gs_admissible_max[key]+self._ls.STEPSIZE:
if value + self._ls.STEPSIZE < self._gs_admissible_min[key] \
or value > self._gs_admissible_max[key] + self._ls.STEPSIZE:
return False
return True
try:
from ray.tune import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
except ImportError:
from ..tune.sample import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
try:
from nni.tuner import Tuner as NNITuner
from nni.utils import extract_scalar_reward
try:
from ray.tune import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
except:
from ..tune.sample import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
class BlendSearchTuner(BlendSearch, NNITuner):
'''Tuner class for NNI
'''
def receive_trial_result(self, parameter_id, parameters, value,
**kwargs):
**kwargs):
'''
Receive trial's final result.
parameter_id: int
@ -458,10 +457,10 @@ try:
'''
result = {}
for key, value in parameters.items():
result['config/'+key] = value
result['config/' + key] = value
reward = extract_scalar_reward(value)
result[self._metric] = reward
# if nni does not report training cost,
# if nni does not report training cost,
# using sequence as an approximation.
# if no sequence, using a constant 1
result[self.cost_attr] = value.get(self.cost_attr, value.get(
@ -473,7 +472,7 @@ try:
'''
Returns a set of trial (hyper-)parameters, as a serializable object
parameter_id: int
'''
'''
return self.suggest(str(parameter_id))
...
@ -491,7 +490,7 @@ try:
if _type == 'choice':
config[key] = choice(v)
elif _type == 'randint':
config[key] = randint(v[0], v[1]-1)
config[key] = randint(v[0], v[1] - 1)
elif _type == 'uniform':
config[key] = uniform(v[0], v[1])
elif _type == 'quniform':
@ -506,14 +505,15 @@ try:
config[key] = qrandn(v[1], v[2], v[3])
else:
raise ValueError(
f'unsupported type in search_space {_type}')
f'unsupported type in search_space {_type}')
self._ls.set_search_properties(None, None, config)
if self._gs is not None:
self._gs.set_search_properties(None, None, config)
self._init_search()
except:
class BlendSearchTuner(BlendSearch): pass
except ImportError:
class BlendSearchTuner(BlendSearch):
pass
class CFO(BlendSearchTuner):
@ -524,7 +524,7 @@ class CFO(BlendSearchTuner):
def suggest(self, trial_id: str) -> Optional[Dict]:
# Number of threads is 1 or 2. Thread 0 is a vacuous thread
assert len(self._search_thread_pool)<3, len(self._search_thread_pool)
assert len(self._search_thread_pool) < 3, len(self._search_thread_pool)
if len(self._search_thread_pool) < 2:
# When a local search thread converges, the number of threads is 1
# and the search needs to be restarted
@ -533,7 +533,8 @@ class CFO(BlendSearchTuner):
def _select_thread(self) -> Tuple:
for key in self._search_thread_pool:
if key: return key, key
if key:
return key, key
def _create_condition(self, result: Dict) -> bool:
''' create thread condition
@ -542,19 +543,15 @@ class CFO(BlendSearchTuner):
def create_next(client):
'''A stateless API for HPO
''' functional API for HPO
'''
state = client.get_state()
setting = client.get_settings_dict()
if state is None:
# first time call
try:
from ray.tune import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
from ray.tune.trial import Trial
except:
from ..tune.sample import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
except ImportError:
from ..tune.trial import Trial
method = setting.get('method', 'BlendSearch')
mode = client.get_optimization_mode()
@ -575,7 +572,7 @@ def create_next(client):
space[key] = randint(value["min_val"], value["max_val"])
elif t == 'quantized_continuous':
space[key] = quniform(value["min_val"], value["max_val"],
value["step"])
value["step"])
init_config = setting.get('init_config', None)
if init_config:
points_to_evaluate = [init_config]
@ -588,12 +585,12 @@ def create_next(client):
elif method == 'CFO':
Algo = CFO
algo = Algo(
mode=mode,
metric=metric,
mode=mode,
metric=metric,
space=space,
points_to_evaluate=points_to_evaluate,
cat_hp_cost=cat_hp_cost,
)
)
time_budget_s = setting.get('time_budget_s', None)
if time_budget_s:
algo._deadline = time_budget_s + time.time()
@ -616,7 +613,7 @@ def create_next(client):
result[algo.metric] = trial.metrics[algo.metric].values[-1]
result[algo.cost_attr] = (end_time - trial.start_time).total_seconds()
for key, value in trial.hp_sample.items():
result['config/'+key] = value
result['config/' + key] = value
algo.on_trial_complete(trial_id, result=result)
# propose new trial
trial_id = Trial.generate_id()

View File

@ -44,32 +44,32 @@ class FLOW2(Searcher):
Args:
init_config: a dictionary of a partial or full initial config,
e.g. from a subset of controlled dimensions
to the initial low-cost values.
e.g. {'epochs':1}
to the initial low-cost values.
e.g. {'epochs': 1}
metric: A string of the metric name to optimize for.
mode: A string in ['min', 'max'] to specify the objective as
minimization or maximization.
cat_hp_cost: A dictionary from a subset of categorical dimensions
to the relative cost of each choice.
to the relative cost of each choice.
e.g.,
.. code-block:: python
{'tree_method': [1, 1, 2]}
i.e., the relative cost of the
i.e., the relative cost of the
three choices of 'tree_method' is 1, 1 and 2 respectively.
space: A dictionary to specify the search space.
prune_attr: A string of the attribute used for pruning.
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
When prune_attr is in space, it is a hyperparameter, e.g.,
'n_iters', and the best value is unknown.
When prune_attr is not in space, it is a resource dimension,
When prune_attr is not in space, it is a resource dimension,
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
min_resource: A float of the minimal resource to use for the
min_resource: A float of the minimal resource to use for the
prune_attr; only valid if prune_attr is not in space.
max_resource: A float of the maximal resource to use for the
max_resource: A float of the maximal resource to use for the
prune_attr; only valid if prune_attr is not in space.
resource_multiple_factor: A float of the multiplicative factor
used for increasing resource.
@ -98,7 +98,7 @@ class FLOW2(Searcher):
"For cost-frugal search, "
"consider providing init values for cost-related hps via "
"'init_config'."
)
)
self.init_config = init_config
self.best_config = flatten_dict(init_config)
self.cat_hp_cost = cat_hp_cost
@ -114,11 +114,11 @@ class FLOW2(Searcher):
def _init_search(self):
self._tunable_keys = []
self._bounded_keys = []
# choices of numeric values. integer encoding.
# choices of numeric values. integer encoding.
# value: (ordered list of choices,
# dict from choice to index in the ordered list)
self._ordered_choice_hp = {}
# choices with given cost. integer encoding.
self._ordered_choice_hp = {}
# choices with given cost. integer encoding.
# value: (array of choices ordered by cost,
# dict from choice to index in the ordered array)
self._ordered_cat_hp = {}
@ -126,8 +126,8 @@ class FLOW2(Searcher):
self._unordered_cat_hp = {}
self._cat_hp_cost = {}
for key, domain in self.space.items():
assert not (isinstance(domain, dict) and 'grid_search' in domain
), key+"'s domain is grid search which is not supported in FLOW2."
assert not (isinstance(domain, dict) and 'grid_search' in domain), \
f"{key}'s domain is grid search, not supported in FLOW^2."
if callable(getattr(domain, 'get_sampler', None)):
self._tunable_keys.append(key)
sampler = domain.get_sampler()
@ -145,53 +145,50 @@ class FLOW2(Searcher):
if cat_hp_cost and key in cat_hp_cost:
cost = np.array(cat_hp_cost[key])
ind = np.argsort(cost)
l = np.array(domain.categories)[ind]
ordered = np.array(domain.categories)[ind]
cost = self._cat_hp_cost[key] = cost[ind]
d = {}
for i, choice in enumerate(l):
for i, choice in enumerate(ordered):
d[choice] = i
self._ordered_cat_hp[key] = (l, d)
# self._step_lb = min(self._step_lb, 1.0/len(l))
self._ordered_cat_hp[key] = (ordered, d)
elif all(isinstance(x, int) or isinstance(x, float)
for x in domain.categories):
l = sorted(domain.categories)
for x in domain.categories):
ordered = sorted(domain.categories)
d = {}
for i, choice in enumerate(l):
for i, choice in enumerate(ordered):
d[choice] = i
self._ordered_choice_hp[key] = (l, d)
# self._step_lb = min(self._step_lb, 1.0/len(l))
self._ordered_choice_hp[key] = (ordered, d)
else:
self._unordered_cat_hp[key] = l = len(domain.categories)
# self._step_lb = min(self._step_lb, 1.0/l)
self._unordered_cat_hp[key] = len(domain.categories)
if str(sampler) != 'Normal':
self._bounded_keys.append(key)
self._space_keys = list(self.space.keys())
if (self.prune_attr and self.prune_attr not in self.space and
self.max_resource):
if (self.prune_attr and self.prune_attr not in self.space
and self.max_resource):
self._space_keys.append(self.prune_attr)
self.min_resource = self.min_resource or self._min_resource()
self._resource = self._round(self.min_resource)
# logger.info(min_resource)
# logger.info(max_resource)
# logger.info(self._resource)
else: self._resource = None
else:
self._resource = None
self.incumbent = {}
self.incumbent = self.normalize(self.best_config) # flattened
self.incumbent = self.normalize(self.best_config) # flattened
self.best_obj = self.cost_incumbent = None
self.dim = len(self._tunable_keys) # total # tunable dimensions
self._direction_tried = None
self._direction_tried = None
self._num_complete4incumbent = self._cost_complete4incumbent = 0
self._num_allowed4incumbent = 2 * self.dim
self._proposed_by = {} # trial_id: int -> incumbent: Dict
self.step = self.STEPSIZE * np.sqrt(self.dim)
lb = self.step_lower_bound
if lb > self.step: self.step = lb * 2
if lb > self.step:
self.step = lb * 2
# upper bound
self.step_ub = np.sqrt(self.dim)
if self.step > self.step_ub: self.step = self.step_ub
if self.step > self.step_ub:
self.step = self.step_ub
# maximal # consecutive no improvements
self.dir = 2**(self.dim)
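# e.g., with 5 tunable dimensions dir = 2**5 = 32: roughly, the step size is
# only shrunk after that many trials proposed around the incumbent complete
# without improving it (see on_trial_complete).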
self._configs = {} # dict from trial_id to config
self._configs = {} # dict from trial_id to config
self._K = 0
self._iter_best_config = self.trial_count = 1
self._reset_times = 0
@ -202,24 +199,26 @@ class FLOW2(Searcher):
def step_lower_bound(self) -> float:
step_lb = self._step_lb
for key in self._tunable_keys:
if key not in self.best_config: continue
if key not in self.best_config:
continue
domain = self.space[key]
sampler = domain.get_sampler()
if isinstance(sampler, sample.Quantized):
sampler_inner = sampler.get_sampler()
if str(sampler_inner) == 'LogUniform':
step_lb = min(step_lb,
np.log(1.0+sampler.q/self.best_config[key])/
np.log(domain.upper/domain.lower))
elif isinstance(domain, sample.Integer) and str(
sampler) == 'LogUniform':
step_lb = min(step_lb,
np.log(1.0+1.0/self.best_config[key])/
np.log(domain.upper/domain.lower))
if np.isinf(step_lb): step_lb = self.STEP_LOWER_BOUND
else: step_lb *= np.sqrt(self.dim)
step_lb = min(
step_lb, np.log(1.0 + sampler.q / self.best_config[key])
/ np.log(domain.upper / domain.lower))
elif isinstance(domain, sample.Integer) and str(sampler) == 'LogUniform':
step_lb = min(
step_lb, np.log(1.0 + 1.0 / self.best_config[key])
/ np.log(domain.upper / domain.lower))
if np.isinf(step_lb):
step_lb = self.STEP_LOWER_BOUND
else:
step_lb *= np.sqrt(self.dim)
return step_lb
@property
def resource(self) -> float:
return self._resource
@ -236,60 +235,61 @@ class FLOW2(Searcher):
return self.max_resource
return resource
def rand_vector_gaussian(self, dim, std = 1.0):
def rand_vector_gaussian(self, dim, std=1.0):
vec = self._random.normal(0, std, dim)
return vec
def complete_config(self, partial_config: Dict,
lower: Optional[Dict] = None, upper: Optional[Dict] = None) -> Dict:
def complete_config(
self, partial_config: Dict,
lower: Optional[Dict] = None, upper: Optional[Dict] = None
) -> Dict:
''' generate a complete config from the partial config input
add minimal resource to config if available
'''
if self._reset_times and partial_config==self.init_config:
if self._reset_times and partial_config == self.init_config:
# not the first time to complete init_config, use random gaussian
normalized = self.normalize(partial_config)
for key in normalized:
# don't change unordered cat choice
# don't change unordered cat choice
if key not in self._unordered_cat_hp:
if upper and lower:
u, l = upper[key], lower[key]
gauss_std = u-l or self.STEPSIZE
up, low = upper[key], lower[key]
gauss_std = up - low or self.STEPSIZE
# allowed bound
u += self.STEPSIZE
l -= self.STEPSIZE
up += self.STEPSIZE
low -= self.STEPSIZE
elif key in self._bounded_keys:
u, l, gauss_std = 1, 0, 1.0
else: u, l, gauss_std = np.Inf, -np.Inf, 1.0
up, low, gauss_std = 1, 0, 1.0
else:
up, low, gauss_std = np.Inf, -np.Inf, 1.0
if key in self._bounded_keys:
u = min(u, 1)
l = max(l, 0)
up = min(up, 1)
low = max(low, 0)
delta = self.rand_vector_gaussian(1, gauss_std)[0]
normalized[key] = max(l, min(u, normalized[key] + delta))
normalized[key] = max(low, min(up, normalized[key] + delta))
# use best config for unordered cat choice
config = self.denormalize(normalized)
else:
# first time init_config, or other configs, take as is
config = partial_config.copy()
if partial_config == self.init_config: self._reset_times += 1
if partial_config == self.init_config:
self._reset_times += 1
config = flatten_dict(config)
for key, value in self.space.items():
if key not in config:
config[key] = value
# logger.debug(f'before random {config}')
for _, generated in generate_variants({'config': config}):
config = generated['config']
break
# logger.debug(f'after random {config}')
if self._resource:
config[self.prune_attr] = self.min_resource
return unflatten_dict(config)
def create(self, init_config: Dict, obj: float, cost: float) -> Searcher:
flow2 = FLOW2(init_config, self.metric, self.mode, self._cat_hp_cost,
unflatten_dict(self.space), self.prune_attr,
self.min_resource, self.max_resource,
self.resource_multiple_factor, self._seed+1)
unflatten_dict(self.space), self.prune_attr,
self.min_resource, self.max_resource,
self.resource_multiple_factor, self._seed + 1)
flow2.best_obj = obj * self.metric_op # minimize internally
flow2.cost_incumbent = cost
return flow2
@ -309,16 +309,17 @@ class FLOW2(Searcher):
# normalize categorical
if key in self._ordered_cat_hp:
l, d = self._ordered_cat_hp[key]
config_norm[key] = (d[value]+0.5)/len(l) # center
config_norm[key] = (d[value] + 0.5) / len(l)
elif key in self._ordered_choice_hp:
l, d = self._ordered_choice_hp[key]
config_norm[key] = (d[value]+0.5)/len(l) # center
config_norm[key] = (d[value] + 0.5) / len(l)
elif key in self.incumbent:
config_norm[key] = self.incumbent[
key] if value == self.best_config[
key] else (self.incumbent[
key]+1)%self._unordered_cat_hp[key]
else: config_norm[key] = 0
key] + 1) % self._unordered_cat_hp[key]
else:
config_norm[key] = 0
continue
# Uniform/LogUniform/Normal/Base
sampler = domain.get_sampler()
@ -326,11 +327,11 @@ class FLOW2(Searcher):
# sampler is sample.Quantized
sampler = sampler.get_sampler()
if str(sampler) == 'LogUniform':
config_norm[key] = np.log(
value/domain.lower)/np.log(domain.upper/domain.lower)
config_norm[key] = np.log(value / domain.lower) / np.log(
domain.upper / domain.lower)
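# This maps the domain onto [0, 1] on a log scale, e.g., for
# loguniform(1/1024, 1024) a value of 1.0 normalizes to
# log(1024) / log(1024**2) = 0.5.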
elif str(sampler) == 'Uniform':
config_norm[key] = (
value-domain.lower)/(domain.upper-domain.lower)
value - domain.lower) / (domain.upper - domain.lower)
elif str(sampler) == 'Normal':
# N(mean, sd) -> N(0,1)
config_norm[key] = (value - sampler.mean) / sampler.sd
@ -338,7 +339,6 @@ class FLOW2(Searcher):
# TODO? elif str(sampler) == 'Base': # sample.Function._CallSampler
# e.g., {test: sample_from(lambda spec: randn(10, 2).sample() * 0.01)}
config_norm[key] = value
# print(key+"'s value is not normalized")
else: # prune_attr
config_norm[key] = value
return config_norm
@ -359,19 +359,19 @@ class FLOW2(Searcher):
if key in self._ordered_cat_hp:
l, _ = self._ordered_cat_hp[key]
n = len(l)
config_denorm[key] = l[min(n-1,int(np.floor(value*n)))]
config_denorm[key] = l[min(n - 1, int(np.floor(value * n)))]
elif key in self._ordered_choice_hp:
l, _ = self._ordered_choice_hp[key]
n = len(l)
config_denorm[key] = l[min(n-1,int(np.floor(value*n)))]
config_denorm[key] = l[min(n - 1, int(np.floor(value * n)))]
else:
assert key in self.incumbent
if round(value) == self.incumbent[key]:
config_denorm[key] = self.best_config[key]
else: # ****random value each time!****
config_denorm[key] = self._random.choice([x
for x in domain.categories
if x!=self.best_config[key]])
else: # ****random value each time!****
config_denorm[key] = self._random.choice(
[x for x in domain.categories
if x != self.best_config[key]])
continue
# Uniform/LogUniform/Normal/Base
sampler = domain.get_sampler()
@ -381,10 +381,10 @@ class FLOW2(Searcher):
# Handle Log/Uniform
if str(sampler) == 'LogUniform':
config_denorm[key] = (
domain.upper/domain.lower)**value*domain.lower
domain.upper / domain.lower) ** value * domain.lower
elif str(sampler) == 'Uniform':
config_denorm[key] = value * (
domain.upper-domain.lower) + domain.lower
domain.upper - domain.lower) + domain.lower
elif str(sampler) == 'Normal':
# denormalization for 'Normal'
config_denorm[key] = value * sampler.sd + sampler.mean
@ -398,8 +398,6 @@ class FLOW2(Searcher):
# Handle int (4.6 -> 5)
if isinstance(domain, sample.Integer):
config_denorm[key] = int(round(config_denorm[key]))
# Handle int (4.6 -> 4)
# config_denorm[key] = domain.cast(config_denorm[key])
else: # prune_attr
config_denorm[key] = value
return config_denorm
@ -431,7 +429,7 @@ class FLOW2(Searcher):
self.trial_count += 1
if not error and result:
obj = result.get(self._metric)
if obj:
if obj:
obj *= self.metric_op
if self.best_obj is None or obj < self.best_obj:
self.best_obj, self.best_config = obj, self._configs[
@ -444,10 +442,11 @@ class FLOW2(Searcher):
self._cost_complete4incumbent = 0
self._num_allowed4incumbent = 2 * self.dim
self._proposed_by.clear()
if self._K > 0:
if self._K > 0:
# self._oldK must have been set when self._K>0
self.step *= np.sqrt(self._K/self._oldK)
if self.step > self.step_ub: self.step = self.step_ub
self.step *= np.sqrt(self._K / self._oldK)
if self.step > self.step_ub:
self.step = self.step_ub
self._iter_best_config = self.trial_count
return
proposed_by = self._proposed_by.get(trial_id)
@ -456,31 +455,30 @@ class FLOW2(Searcher):
self._num_complete4incumbent += 1
cost = result.get(
self.cost_attr) if result else self._trial_cost.get(trial_id)
if cost: self._cost_complete4incumbent += cost
if self._num_complete4incumbent >= 2*self.dim and \
self._num_allowed4incumbent == 0:
if cost:
self._cost_complete4incumbent += cost
if self._num_complete4incumbent >= 2 * self.dim and \
self._num_allowed4incumbent == 0:
self._num_allowed4incumbent = 2
if self._num_complete4incumbent == self.dir and (not self._resource
or self._resource == self.max_resource):
# check stuck condition if using max resource
if self._num_complete4incumbent == self.dir and (
not self._resource or self._resource == self.max_resource):
# check stuck condition if using max resource
if self.step >= self.step_lower_bound:
# decrease step size
self._oldK = self._K if self._K else self._iter_best_config
self._K = self.trial_count+1
self.step *= np.sqrt(self._oldK/self._K)
# logger.info(f"step={self.step}, lb={self.step_lower_bound}")
self._K = self.trial_count + 1
self.step *= np.sqrt(self._oldK / self._K)
self._num_complete4incumbent -= 2
if self._num_allowed4incumbent < 2:
self._num_allowed4incumbent = 2
# elif proposed_by: # proposed by older incumbent
# del self._proposed_by[trial_id]
# elif proposed_by: del self._proposed_by[trial_id]
def on_trial_result(self, trial_id: str, result: Dict):
''' early update of incumbent
'''
if result:
obj = result.get(self._metric)
if obj:
if obj:
obj *= self.metric_op
if self.best_obj is None or obj < self.best_obj:
self.best_obj = obj
@ -503,7 +501,7 @@ class FLOW2(Searcher):
def rand_vector_unit_sphere(self, dim) -> np.ndarray:
vec = self._random.normal(0, 1, dim)
mag = np.linalg.norm(vec)
return vec/mag
return vec / mag
def suggest(self, trial_id: str) -> Optional[Dict]:
''' suggest a new config, one of the following cases:
@ -513,8 +511,8 @@ class FLOW2(Searcher):
'''
if self._num_complete4incumbent > 0 and self.cost_incumbent and \
self._resource and self._resource < self.max_resource and (
self._cost_complete4incumbent >=
self.cost_incumbent * self.resource_multiple_factor):
self._cost_complete4incumbent
>= self.cost_incumbent * self.resource_multiple_factor):
# consider increasing resource using sum eval cost of complete
# configs
self._resource = self._round(
@ -529,7 +527,7 @@ class FLOW2(Searcher):
if self._direction_tried is not None:
# return negative direction
for i, key in enumerate(self._tunable_keys):
move[key] -= self._direction_tried[i]
move[key] -= self._direction_tried[i]
self._direction_tried = None
# propose a new direction
self._direction_tried = self.rand_vector_unit_sphere(
@ -548,7 +546,8 @@ class FLOW2(Searcher):
for key in self._bounded_keys:
value = config[key]
config[key] = max(0, min(1, value))
if self._resource: config[self.prune_attr] = self._resource
if self._resource:
config[self.prune_attr] = self._resource
@property
def can_suggest(self) -> bool:
@ -583,22 +582,23 @@ class FLOW2(Searcher):
def converged(self) -> bool:
''' return whether the local search has converged
'''
if self._num_complete4incumbent < self.dir-2: return False
if self._num_complete4incumbent < self.dir - 2:
return False
# check stepsize after enough configs are completed
return self.step < self.step_lower_bound
def reach(self, other: Searcher) -> bool:
''' whether the incumbent can reach the incumbent of other
'''
config1, config2 = self.best_config, other.best_config
config1, config2 = self.best_config, other.best_config
incumbent1, incumbent2 = self.incumbent, other.incumbent
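# Sketch of the test below: the other incumbent is reachable only if it does
# not use less resource, agrees on every unordered categorical choice, and its
# normalized tunable values lie within one step (Euclidean distance) of ours.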
if self._resource and config1[self.prune_attr]>config2[self.prune_attr]:
if self._resource and config1[self.prune_attr] > config2[self.prune_attr]:
# resource will not decrease
return False
for key in self._unordered_cat_hp:
# unordered cat choice is hard to reach by chance
if config1[key] != config2[key]: return False
delta = np.array([incumbent1[key]-incumbent2[key]
for key in self._tunable_keys])
if config1[key] != config2[key]:
return False
delta = np.array(
[incumbent1[key] - incumbent2[key] for key in self._tunable_keys])
return np.linalg.norm(delta) <= self.step

View File

@ -22,24 +22,24 @@ class SearchThread:
cost_attr = 'time_total_s'
eps = 1e-10
def __init__(self, mode: str = "min",
def __init__(self, mode: str = "min",
search_alg: Optional[Searcher] = None):
''' When search_alg is omitted, use local search FLOW2
'''
self._search_alg = search_alg
self._is_ls = isinstance(search_alg, FLOW2)
self._mode = mode
self._metric_op = 1 if mode=='min' else -1
self._metric_op = 1 if mode == 'min' else -1
self.cost_best = self.cost_last = self.cost_total = self.cost_best1 = \
getattr(search_alg, 'cost_incumbent', 0)
self.cost_best2 = 0
self.obj_best1 = self.obj_best2 = getattr(
search_alg, 'best_obj', np.inf) # inherently minimize
# eci: expected cost for improvement
search_alg, 'best_obj', np.inf) # inherently minimize
# eci: estimated cost for improvement
self.eci = self.cost_best
self.priority = self.speed = 0
self._init_config = True
self._init_config = True
def suggest(self, trial_id: str) -> Optional[Dict]:
''' use the suggest() of the underlying search algorithm
'''
@ -48,9 +48,9 @@ class SearchThread:
else:
try:
config = self._search_alg.suggest(trial_id)
except:
except FloatingPointError:
logger.warning(
f'The global search method raises error. '
'The global search method raises FloatingPointError. '
'Ignoring for this iteration.')
config = None
return config
@ -60,33 +60,36 @@ class SearchThread:
self.priority = eci * self.speed - self.obj_best1
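# Read roughly as: the negated objective this thread is projected to reach
# after spending `eci` more cost at its current improvement speed; the thread
# with the highest priority is picked next in BlendSearch._select_thread.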
def update_eci(self, metric_target: float,
max_speed: Optional[float] = np.inf):
# calculate eci: expected cost for improvement over metric_target;
max_speed: Optional[float] = np.inf):
# calculate eci: estimated cost for improvement over metric_target
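# Sketch: start from the cost spent since (or between) the last improvements,
# and, if this thread has not yet matched the target, raise it to at least
# twice the cost needed to close the gap at the current speed.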
best_obj = metric_target * self._metric_op
if not self.speed: self.speed = max_speed
if not self.speed:
self.speed = max_speed
self.eci = max(self.cost_total - self.cost_best1,
self.cost_best1 - self.cost_best2)
self.cost_best1 - self.cost_best2)
if self.obj_best1 > best_obj and self.speed > 0:
self.eci = max(self.eci, 2*(self.obj_best1-best_obj)/self.speed)
self.eci = max(self.eci, 2 * (self.obj_best1 - best_obj) / self.speed)
def _update_speed(self):
# calculate speed; use 0 for invalid speed temporarily
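# i.e., speed = (improvement from the second-best to the best objective) per
# unit of cost spent since the second-best result was found.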
if self.obj_best2 > self.obj_best1:
if self.obj_best2 > self.obj_best1:
self.speed = (self.obj_best2 - self.obj_best1) / (
self.cost_total - self.cost_best2 + self.eps)
else: self.speed = 0
else:
self.speed = 0
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
error: bool = False):
''' update the statistics of the thread
'''
if not self._search_alg: return
if not hasattr(self._search_alg, '_ot_trials') or (not error and
trial_id in self._search_alg._ot_trials):
if not self._search_alg:
return
if not hasattr(self._search_alg, '_ot_trials') or (
not error and trial_id in self._search_alg._ot_trials):
# optuna doesn't handle error
if self._is_ls or not self._init_config:
self._search_alg.on_trial_complete(trial_id, result, error)
else:
else:
# init config is not proposed by self._search_alg
# under this thread
self._init_config = False
@ -94,8 +97,6 @@ class SearchThread:
if self.cost_attr in result:
self.cost_last = result[self.cost_attr]
self.cost_total += self.cost_last
# if not isinstance(self._search_alg, FLOW2):
# logger.info(f"result.metric{result[self._search_alg.metric]}")
if self._search_alg.metric in result:
obj = result[self._search_alg.metric] * self._metric_op
if obj < self.obj_best1:
@ -106,14 +107,14 @@ class SearchThread:
self.obj_best1 = obj
self.cost_best = self.cost_last
self._update_speed()
def on_trial_result(self, trial_id: str, result: Dict):
''' TODO update the statistics of the thread with partial result?
'''
# print('[SearchThread] on trial result')
if not self._search_alg: return
if not self._search_alg:
return
if not hasattr(self._search_alg, '_ot_trials') or (
trial_id in self._search_alg._ot_trials):
trial_id in self._search_alg._ot_trials):
self._search_alg.on_trial_result(trial_id, result)
if self.cost_attr in result and self.cost_last < result[self.cost_attr]:
self.cost_last = result[self.cost_attr]
@ -137,4 +138,3 @@ class SearchThread:
''' whether the thread can suggest new configs
'''
return self._search_alg.can_suggest

View File

@ -21,6 +21,11 @@ import logging
import os
import time
from typing import Dict, Optional, Union, List, Tuple
import pickle
from .variant_generator import parse_spec_vars
from ..tune.sample import Categorical, Domain, Float, Integer, LogUniform, \
Quantized, Uniform
from ..tune.trial import flatten_dict, unflatten_dict
logger = logging.getLogger(__name__)
@ -72,7 +77,7 @@ def log_once(key):
return False
else:
return False
class Searcher:
"""Abstract class for wrapping suggesting algorithms.
@ -407,12 +412,6 @@ class ConcurrencyLimiter(Searcher):
return self.searcher.set_search_properties(metric, mode, config)
import pickle
from .variant_generator import parse_spec_vars
from ..tune.sample import Categorical, Domain, Float, Integer, LogUniform, \
Quantized, Uniform
from ..tune.trial import flatten_dict, unflatten_dict
try:
import optuna as ot
from optuna.samplers import BaseSampler
@ -689,4 +688,4 @@ class OptunaSearch(Searcher):
for path, domain in domain_vars
]
return values
return values

View File

@ -66,8 +66,8 @@ def unflatten_dict(dt, delimiter="/"):
item = item.setdefault(k, dict_type())
item[path[-1]] = val
return out
class TuneError(Exception):
"""General error class raised by ray.tune."""
pass
@ -433,4 +433,4 @@ class _UnresolvedAccessGuard(dict):
class RecursiveDependencyError(Exception):
def __init__(self, msg: str):
Exception.__init__(self, msg)
Exception.__init__(self, msg)

View File

@ -1,6 +1,6 @@
'''!
* Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
* Licensed under the MIT License.
'''
import json
@ -118,7 +118,7 @@ class TrainingLogWriter(object):
def close(self):
self.file.close()
self.file = None # for pickle
self.file = None # for pickle
class TrainingLogReader(object):
@ -142,7 +142,7 @@ class TrainingLogReader(object):
def close(self):
self.file.close()
self.file = None # for pickle
self.file = None # for pickle
def get_record(self, record_id) -> TrainingLogRecord:
if self.file is None:

View File

@ -1,7 +1,7 @@
try:
from ray.tune import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
except:
qrandn, loguniform, qloguniform)
except ImportError:
from .sample import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
from .tune import run, report
qrandn, loguniform, qloguniform)
from .tune import run, report

View File

@ -532,4 +532,4 @@ def qrandn(mean: float, sd: float, q: float):
q (float): Quantization number. The result will be rounded to an
integer increment of this value.
"""
return Float(None, None).normal(mean, sd).quantized(q)
return Float(None, None).normal(mean, sd).quantized(q)
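
For illustration, a search-space entry built with this sampler might look as follows; the parameter name "width" is hypothetical.

from flaml import tune

# values drawn from N(mean=10, sd=2), rounded to the nearest multiple of 0.2
config = {"width": tune.qrandn(10, 2, 0.2)}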

View File

@ -121,8 +121,7 @@ class Trial:
self.metric_analysis[metric]["min"] = min(
value, self.metric_analysis[metric]["min"])
self.metric_analysis[metric]["avg"] = 1 / step * (
value +
(step - 1) * self.metric_analysis[metric]["avg"])
value + (step - 1) * self.metric_analysis[metric]["avg"])
self.metric_analysis[metric]["last"] = value
for n in self.n_steps:
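
The "avg" update above is the standard incremental mean, avg_step = (value + (step - 1) * avg_prev) / step. A quick sanity check of that arithmetic on a made-up sequence:

values = [3.0, 5.0, 10.0]
avg = 0.0
for step, value in enumerate(values, start=1):
    avg = 1 / step * (value + (step - 1) * avg)
assert abs(avg - sum(values) / len(values)) < 1e-12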

View File

@ -6,7 +6,7 @@
from typing import Optional
try:
from ray.tune.trial import Trial
except:
except ImportError:
from .trial import Trial
import logging
logger = logging.getLogger(__name__)
@ -16,14 +16,15 @@ class Nologger():
'''Logger without logging
'''
def on_result(self, result): pass
def on_result(self, result):
pass
class SimpleTrial(Trial):
'''A simple trial class
'''
def __init__(self, config, trial_id = None):
def __init__(self, config, trial_id=None):
self.trial_id = Trial.generate_id() if trial_id is None else trial_id
self.config = config or {}
self.status = Trial.PENDING
@ -46,17 +47,16 @@ class BaseTrialRunner:
Note that the caller usually should not mutate trial state directly.
"""
def __init__(self,
search_alg = None,
scheduler = None,
metric: Optional[str] = None,
mode: Optional[str] = 'min'):
def __init__(self,
search_alg=None, scheduler=None,
metric: Optional[str] = None,
mode: Optional[str] = 'min'):
self._search_alg = search_alg
self._scheduler_alg = scheduler
self._scheduler_alg = scheduler
self._trials = []
self._metric = metric
self._mode = mode
def get_trials(self):
"""Returns the list of trials managed by this TrialRunner.
@ -81,22 +81,22 @@ class BaseTrialRunner:
self._search_alg.on_trial_result(trial.trial_id, result)
if self._scheduler_alg:
decision = self._scheduler_alg.on_trial_result(self, trial, result)
if decision == "STOP": trial.set_status(Trial.TERMINATED)
elif decision == "PAUSE": trial.set_status(Trial.PAUSED)
if decision == "STOP":
trial.set_status(Trial.TERMINATED)
elif decision == "PAUSE":
trial.set_status(Trial.PAUSED)
def stop_trial(self, trial):
"""Stops trial.
"""
if not trial.status in [Trial.ERROR, Trial.TERMINATED]:
if trial.status not in [Trial.ERROR, Trial.TERMINATED]:
if self._scheduler_alg:
self._scheduler_alg.on_trial_complete(self,
trial.trial_id, trial.last_result)
self._search_alg.on_trial_complete(
trial.trial_id, trial.last_result)
self._scheduler_alg.on_trial_complete(
self, trial.trial_id, trial.last_result)
self._search_alg.on_trial_complete(trial.trial_id, trial.last_result)
trial.set_status(Trial.TERMINATED)
else:
if self._scheduler_alg:
self._scheduler_alg.on_trial_remove(self, trial)
elif self._scheduler_alg:
self._scheduler_alg.on_trial_remove(self, trial)
class SequentialTrialRunner(BaseTrialRunner):
@ -112,10 +112,11 @@ class SequentialTrialRunner(BaseTrialRunner):
"""
trial_id = Trial.generate_id()
config = self._search_alg.suggest(trial_id)
if config:
if config:
trial = SimpleTrial(config, trial_id)
self.add_trial(trial)
trial.set_status(Trial.RUNNING)
else: trial = None
else:
trial = None
self.running_trial = trial
return trial
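
Putting the runner pieces together, the sequential backend is driven by a loop like the sketch below, using the classes defined in this file; budget_left, my_search_alg and train are placeholders, and results are reported from inside the training function via flaml.tune.report.

runner = SequentialTrialRunner(search_alg=my_search_alg, scheduler=None,
                               metric='loss', mode='min')
while budget_left():
    trial = runner.step()         # ask the searcher for a config; may return None
    if trial:
        train(trial.config)       # user code calls flaml.tune.report(...) inside
        runner.stop_trial(trial)  # notify searcher/scheduler, mark TERMINATED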

View File

@ -3,13 +3,13 @@
* Licensed under the MIT License. See LICENSE file in the
* project root for license information.
'''
from typing import Optional, Union, List
import datetime, time
from typing import Optional, Union, List, Callable
import datetime
import time
try:
from ray.tune.analysis import ExperimentAnalysis as EA
except:
except ImportError:
from .analysis import ExperimentAnalysis as EA
import logging
logger = logging.getLogger(__name__)
@ -28,7 +28,7 @@ class ExperimentAnalysis(EA):
def __init__(self, trials, metric, mode):
try:
super().__init__(self, None, trials, metric, mode)
except:
except (TypeError, ValueError):
self.trials = trials
self.default_metric = metric
self.default_mode = mode
@ -59,7 +59,7 @@ def report(_metric=None, **kwargs):
},
metric='metric2minimize', mode='min',
num_samples=1000000, time_budget_s=60, use_ray=False)
print(analysis.trials[-1].last_result)
Args:
@ -78,7 +78,8 @@ def report(_metric=None, **kwargs):
result = kwargs
if _verbose == 2:
logger.info(f"result: {kwargs}")
if _metric: result['_default_anonymous_metric'] = _metric
if _metric:
result['_default_anonymous_metric'] = _metric
trial = _runner.running_trial
if _running_trial == trial:
_training_iteration += 1
@ -88,14 +89,15 @@ def report(_metric=None, **kwargs):
result["training_iteration"] = _training_iteration
result['config'] = trial.config
for key, value in trial.config.items():
result['config/'+key] = value
result['config/' + key] = value
_runner.process_trial_result(_runner.running_trial, result)
result['time_total_s'] = trial.last_update_time - trial.start_time
if _verbose > 2:
logger.info(f"result: {result}")
if _runner.running_trial.is_finished():
return None
else: return True
else:
return True
def run(training_function,
@ -111,14 +113,13 @@ def run(training_function,
max_resource: Optional[float] = None,
reduction_factor: Optional[float] = None,
report_intermediate_result: Optional[bool] = False,
search_alg = None,
verbose: Optional[int] = 2,
search_alg=None,
verbose: Optional[int] = 2,
local_dir: Optional[str] = None,
num_samples: Optional[int] = 1,
resources_per_trial: Optional[dict] = None,
mem_size = None,
use_ray: Optional[bool] = False,
):
mem_size: Callable[[dict], float] = None,
use_ray: Optional[bool] = False):
'''The trigger for HPO.
Example:
@ -142,53 +143,53 @@ def run(training_function,
},
metric='metric2minimize', mode='min',
num_samples=-1, time_budget_s=60, use_ray=False)
print(analysis.trials[-1].last_result)
Args:
training_function: A user-defined training function.
training_function: A user-defined training function.
config: A dictionary to specify the search space.
points_to_evaluate: A list of initial hyperparameter
configurations to run first.
low_cost_partial_config: A dictionary from a subset of
low_cost_partial_config: A dictionary from a subset of
controlled dimensions to the initial low-cost values.
e.g.,
e.g.,
.. code-block:: python
{'n_estimators': 4, 'max_leaves': 4}
cat_hp_cost: A dictionary from a subset of categorical dimensions
to the relative cost of each choice.
to the relative cost of each choice.
e.g.,
.. code-block:: python
{'tree_method': [1, 1, 2]}
i.e., the relative cost of the
i.e., the relative cost of the
three choices of 'tree_method' is 1, 1 and 2, respectively.
metric: A string of the metric name to optimize for.
mode: A string in ['min', 'max'] to specify the objective as
minimization or maximization.
time_budget_s: A float of the time budget in seconds.
prune_attr: A string of the attribute used for pruning.
prune_attr: A string of the attribute used for pruning.
Not necessarily in space.
When prune_attr is in space, it is a hyperparameter, e.g.,
When prune_attr is in space, it is a hyperparameter, e.g.,
'n_iters', and the best value is unknown.
When prune_attr is not in space, it is a resource dimension,
When prune_attr is not in space, it is a resource dimension,
e.g., 'sample_size', and the peak performance is assumed
to be at the max_resource.
min_resource: A float of the minimal resource to use for the
min_resource: A float of the minimal resource to use for the
prune_attr; only valid if prune_attr is not in space.
max_resource: A float of the maximal resource to use for the
max_resource: A float of the maximal resource to use for the
prune_attr; only valid if prune_attr is not in space.
reduction_factor: A float of the reduction factor used for incremental
pruning.
report_intermediate_result: A boolean of whether intermediate results
are reported. If so, early stopping and pruning can be used.
search_alg: An instance of BlendSearch as the search algorithm
to be used. The same instance can be used for iterative tuning.
to be used. The same instance can be used for iterative tuning.
e.g.,
.. code-block:: python
@ -201,7 +202,7 @@ def run(training_function,
analysis = tune.run(compute_with_config,
search_alg=algo, use_ray=False)
print(analysis.trials[-1].last_result)
verbose: 0, 1, 2, or 3. Verbosity mode for ray if ray backend is used.
0 = silent, 1 = only status updates, 2 = status and brief trial
results, 3 = status and detailed trial results. Defaults to 2.
@ -215,7 +216,7 @@ def run(training_function,
mem_size: A function to estimate the memory size for a given config.
It is used to skip configs which do not fit in memory.
use_ray: A boolean of whether to use ray as the backend
'''
'''
global _use_ray
global _verbose
if not use_ray:
@ -224,8 +225,8 @@ def run(training_function,
import os
if local_dir:
os.makedirs(local_dir, exist_ok=True)
logger.addHandler(logging.FileHandler(local_dir+'/tune_'+str(
datetime.datetime.now()).replace(':', '-')+'.log'))
logger.addHandler(logging.FileHandler(local_dir + '/tune_' + str(
datetime.datetime.now()).replace(':', '-') + '.log'))
elif not logger.handlers:
# Add the console handler.
_ch = logging.StreamHandler()
@ -233,8 +234,8 @@ def run(training_function,
'[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
'%m-%d %H:%M:%S')
_ch.setFormatter(logger_formatter)
logger.addHandler(_ch)
if verbose<=2:
logger.addHandler(_ch)
if verbose <= 2:
logger.setLevel(logging.INFO)
else:
logger.setLevel(logging.DEBUG)
@ -243,55 +244,54 @@ def run(training_function,
if search_alg is None:
from ..searcher.blendsearch import BlendSearch
search_alg = BlendSearch(metric=metric, mode=mode,
points_to_evaluate=points_to_evaluate,
low_cost_partial_config=low_cost_partial_config,
cat_hp_cost=cat_hp_cost,
space=config, prune_attr=prune_attr,
min_resource=min_resource,
max_resource=max_resource,
reduction_factor=reduction_factor,
resources_per_trial=resources_per_trial,
mem_size=mem_size)
search_alg = BlendSearch(
metric=metric, mode=mode, space=config,
points_to_evaluate=points_to_evaluate,
low_cost_partial_config=low_cost_partial_config,
cat_hp_cost=cat_hp_cost,
prune_attr=prune_attr,
min_resource=min_resource, max_resource=max_resource,
reduction_factor=reduction_factor,
resources_per_trial=resources_per_trial,
mem_size=mem_size)
if time_budget_s:
search_alg.set_search_properties(metric, mode, config={
'time_budget_s':time_budget_s})
'time_budget_s': time_budget_s})
scheduler = None
if report_intermediate_result:
params = {}
# the scheduler uses prune_attr as its resource (time) dimension
if prune_attr: params['time_attr'] = prune_attr
if max_resource: params['max_t'] = max_resource
if min_resource: params['grace_period'] = min_resource
if reduction_factor: params['reduction_factor'] = reduction_factor
if prune_attr:
params['time_attr'] = prune_attr
if max_resource:
params['max_t'] = max_resource
if min_resource:
params['grace_period'] = min_resource
if reduction_factor:
params['reduction_factor'] = reduction_factor
try:
from ray.tune.schedulers import ASHAScheduler
scheduler = ASHAScheduler(**params)
except:
scheduler = None
else:
scheduler = None
except ImportError:
pass
if use_ray:
try:
from ray import tune
except:
except ImportError:
raise ImportError("Failed to import ray tune. "
"Please install ray[tune] or set use_ray=False")
"Please install ray[tune] or set use_ray=False")
_use_ray = True
return tune.run(training_function,
metric=metric,
mode=mode,
search_alg=search_alg,
scheduler=scheduler,
time_budget_s=time_budget_s,
verbose=verbose,
local_dir=local_dir,
num_samples=num_samples,
resources_per_trial=resources_per_trial
)
metric=metric, mode=mode,
search_alg=search_alg,
scheduler=scheduler,
time_budget_s=time_budget_s,
verbose=verbose, local_dir=local_dir,
num_samples=num_samples,
resources_per_trial=resources_per_trial)
# simple sequential run without using tune.run() from ray
time_start = time.time()
time_start = time.time()
_use_ray = False
if scheduler:
scheduler.set_search_properties(metric=metric, mode=mode)
@ -302,10 +302,10 @@ def run(training_function,
scheduler=scheduler,
metric=metric,
mode=mode,
)
)
num_trials = 0
while time.time()-time_start<time_budget_s and (
num_samples<0 or num_trials<num_samples):
while time.time() - time_start < time_budget_s and (
num_samples < 0 or num_trials < num_samples):
trial_to_run = _runner.step()
if trial_to_run:
num_trials += 1
@ -313,4 +313,4 @@ def run(training_function,
logger.info(f'trial {num_trials} config: {trial_to_run.config}')
training_function(trial_to_run.config)
_runner.stop_trial(trial_to_run)
return ExperimentAnalysis(_runner.get_trials(), metric=metric, mode=mode)
return ExperimentAnalysis(_runner.get_trials(), metric=metric, mode=mode)
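
A minimal end-to-end use of this sequential backend, in the spirit of the docstring example above; the objective and search space here are made up for illustration.

from flaml import tune

def evaluate_config(config):
    # toy objective: minimized near x=50, y=0.7
    tune.report(metric2minimize=(config['x'] - 50) ** 2 + (config['y'] - 0.7) ** 2)

analysis = tune.run(
    evaluate_config,
    config={
        'x': tune.qloguniform(lower=1, upper=100, q=1),
        'y': tune.uniform(lower=0, upper=1),
    },
    low_cost_partial_config={'x': 1},
    metric='metric2minimize', mode='min',
    num_samples=-1, time_budget_s=10, use_ray=False)
print(analysis.trials[-1].last_result)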

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -474,7 +474,7 @@
"args = TrainingArguments(\n",
" output_dir='output',\n",
" do_eval=True,\n",
" )"
")"
]
},
{
@ -569,7 +569,7 @@
" flaml.tune.report(\n",
" loss=eval_output[\"eval_loss\"],\n",
" matthews_correlation=eval_output[\"eval_matthews_correlation\"],\n",
" )"
" )"
]
},
{
@ -599,7 +599,7 @@
" \"adam_epsilon\": flaml.tune.loguniform(1e-9, 1e-7),\n",
" \"adam_beta1\": flaml.tune.uniform(0.8, 0.99),\n",
" \"adam_beta2\": flaml.tune.loguniform(98e-2, 9999e-4),\n",
" }"
"}"
]
},
{

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -54,7 +54,7 @@ setuptools.setup(
],
"ray": [
"ray[tune]==1.2.0",
"pyyaml<5.3.1",
"pyyaml<5.3.1",
],
"azureml": [
"azureml-mlflow",
@ -66,7 +66,7 @@ setuptools.setup(
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Operating System :: OS Independent",
],
python_requires=">=3.6",
)

View File

@ -2,6 +2,7 @@
'''
import time
import numpy as np
import os
try:
import ray
@ -15,6 +16,7 @@ try:
Trainer,
TrainingArguments,
)
import flaml
MODEL_CHECKPOINT = "microsoft/deberta-base"
task_to_keys = {
"cola": ("sentence", None),
@ -27,9 +29,9 @@ try:
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
max_seq_length=128
overwrite_cache=False
pad_to_max_length=True
max_seq_length = 128
overwrite_cache = False
pad_to_max_length = True
padding = "max_length"
TASK = "qnli"
@ -46,19 +48,17 @@ try:
examples[sentence1_key], examples[sentence2_key])
)
return tokenizer(*args, padding=padding, max_length=max_seq_length,
truncation=True)
truncation=True)
except:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
import logging
logger = logging.getLogger(__name__)
import os
os.makedirs('logs', exist_ok=True)
logger.addHandler(logging.FileHandler('logs/tune_deberta.log'))
logger.setLevel(logging.INFO)
import flaml
def train_deberta(config: dict):
@ -76,7 +76,6 @@ def train_deberta(config: dict):
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_CHECKPOINT, num_labels=NUM_LABELS
)
@ -109,7 +108,7 @@ def train_deberta(config: dict):
flaml.tune.report(
loss=eval_output["eval_loss"],
accuracy=eval_output["eval_accuracy"],
)
)
try:
from azureml.core import Run
@ -117,10 +116,12 @@ def train_deberta(config: dict):
run.log('accuracy', eval_output["eval_accuracy"])
run.log('loss', eval_output["eval_loss"])
run.log('config', config)
except: pass
except ImportError:
pass
def _test_deberta(method='BlendSearch'):
max_num_epoch = 100
num_samples = -1
time_budget_s = 3600

View File

@ -2,6 +2,8 @@
'''
import time
import numpy as np
import logging
import os
try:
import ray
@ -15,6 +17,7 @@ try:
Trainer,
TrainingArguments,
)
import flaml
MODEL_CHECKPOINT = "distilbert-base-uncased"
TASK = "cola"
NUM_LABELS = 2
@ -26,20 +29,18 @@ try:
# Define tokenize method
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
def tokenize(examples):
return tokenizer(examples[COLUMN_NAME], truncation=True)
except:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
import logging
logger = logging.getLogger(__name__)
import os
os.makedirs('logs', exist_ok=True)
logger.addHandler(logging.FileHandler('logs/tune_distilbert.log'))
logger.setLevel(logging.INFO)
import flaml
def train_distilbert(config: dict):
@ -87,11 +88,11 @@ def train_distilbert(config: dict):
flaml.tune.report(
loss=eval_output["eval_loss"],
matthews_correlation=eval_output["eval_matthews_correlation"],
)
)
def _test_distillbert(method='BlendSearch'):
max_num_epoch = 64
num_samples = -1
time_budget_s = 3600

View File

@ -2,6 +2,7 @@
'''
import time
import numpy as np
import os
try:
import ray
@ -15,6 +16,7 @@ try:
Trainer,
TrainingArguments,
)
import flaml
MODEL_CHECKPOINT = "google/electra-base-discriminator"
task_to_keys = {
"cola": ("sentence", None),
@ -27,9 +29,9 @@ try:
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
max_seq_length=128
overwrite_cache=False
pad_to_max_length=True
max_seq_length = 128
overwrite_cache = False
pad_to_max_length = True
padding = "max_length"
TASK = "qnli"
@ -46,19 +48,17 @@ try:
examples[sentence1_key], examples[sentence2_key])
)
return tokenizer(*args, padding=padding, max_length=max_seq_length,
truncation=True)
truncation=True)
except:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
import logging
logger = logging.getLogger(__name__)
import os
os.makedirs('logs', exist_ok=True)
logger.addHandler(logging.FileHandler('logs/tune_electra.log'))
logger.setLevel(logging.INFO)
import flaml
def train_electra(config: dict):
@ -76,7 +76,6 @@ def train_electra(config: dict):
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_CHECKPOINT, num_labels=NUM_LABELS
)
@ -109,7 +108,7 @@ def train_electra(config: dict):
flaml.tune.report(
loss=eval_output["eval_loss"],
accuracy=eval_output["eval_accuracy"],
)
)
try:
from azureml.core import Run
@ -117,10 +116,12 @@ def train_electra(config: dict):
run.log('accuracy', eval_output["eval_accuracy"])
run.log('loss', eval_output["eval_loss"])
run.log('config', config)
except: pass
except ImportError:
pass
def _test_electra(method='BlendSearch'):
max_num_epoch = 9
num_samples = -1
time_budget_s = 3600
@ -247,4 +248,4 @@ def _test_electra_bohb():
if __name__ == "__main__":
_test_electra()
_test_electra()

View File

@ -2,6 +2,7 @@
'''
import time
import numpy as np
import os
try:
import ray
@ -15,6 +16,7 @@ try:
Trainer,
TrainingArguments,
)
import flaml
MODEL_CHECKPOINT = "roberta-base"
task_to_keys = {
"cola": ("sentence", None),
@ -27,9 +29,9 @@ try:
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
max_seq_length=128
overwrite_cache=False
pad_to_max_length=True
max_seq_length = 128
overwrite_cache = False
pad_to_max_length = True
padding = "max_length"
TASK = "qnli"
@ -46,19 +48,17 @@ try:
examples[sentence1_key], examples[sentence2_key])
)
return tokenizer(*args, padding=padding, max_length=max_seq_length,
truncation=True)
truncation=True)
except:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
import logging
logger = logging.getLogger(__name__)
import os
os.makedirs('logs', exist_ok=True)
logger.addHandler(logging.FileHandler('logs/tune_roberta.log'))
logger.setLevel(logging.INFO)
import flaml
def train_roberta(config: dict):
@ -76,7 +76,6 @@ def train_roberta(config: dict):
predictions = np.argmax(predictions, axis=1)
return metric.compute(predictions=predictions, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_CHECKPOINT, num_labels=NUM_LABELS
)
@ -109,7 +108,7 @@ def train_roberta(config: dict):
flaml.tune.report(
loss=eval_output["eval_loss"],
accuracy=eval_output["eval_accuracy"],
)
)
try:
from azureml.core import Run
@ -117,10 +116,12 @@ def train_roberta(config: dict):
run.log('accuracy', eval_output["eval_accuracy"])
run.log('loss', eval_output["eval_loss"])
run.log('config', config)
except: pass
except ImportError:
pass
def _test_roberta(method='BlendSearch'):
max_num_epoch = 100
num_samples = -1
time_budget_s = 3600
@ -248,4 +249,3 @@ def _test_roberta_bohb():
if __name__ == "__main__":
_test_roberta()

View File

@ -2,6 +2,6 @@ from flaml.searcher.blendsearch import BlendSearchTuner as BST
class BlendSearchTuner(BST):
# for best performance pass low cost initial parameters here
def __init__(self, low_cost_partial_config={"hidden_size":128}):
# for best performance pass low cost initial parameters here
def __init__(self, low_cost_partial_config={"hidden_size": 128}):
super().__init__(low_cost_partial_config=low_cost_partial_config)

View File

@ -27,7 +27,7 @@ class Net(nn.Module):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, hidden_size)
self.fc1 = nn.Linear(4 * 4 * 50, hidden_size)
self.fc2 = nn.Linear(hidden_size, 10)
def forward(self, x):
@ -35,7 +35,7 @@ class Net(nn.Module):
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = x.view(-1, 4 * 4 * 50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
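
The 4 * 4 * 50 flatten size follows from the 28x28 MNIST input: conv(5) -> 24, pool(2) -> 12, conv(5) -> 8, pool(2) -> 4, with 50 output channels. A quick shape check (assumes torch is installed):

import torch
import torch.nn.functional as F
from torch import nn

x = torch.zeros(1, 1, 28, 28)                       # one MNIST-sized image
x = F.max_pool2d(nn.Conv2d(1, 20, 5, 1)(x), 2, 2)   # -> (1, 20, 12, 12)
x = F.max_pool2d(nn.Conv2d(20, 50, 5, 1)(x), 2, 2)  # -> (1, 50, 4, 4)
assert x.shape == (1, 50, 4, 4)                     # flattens to 4 * 4 * 50 = 800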
@ -151,7 +151,6 @@ def get_params():
parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
help='how many batches to wait before logging training status')
args, _ = parser.parse_known_args()
return args

View File

@ -15,5 +15,5 @@ config = ScriptRunConfig(
exp = Experiment(ws, 'test-electra')
run = exp.submit(config)
print(run.get_portal_url()) # link to ml.azure.com
run.wait_for_completion(show_output=True)
print(run.get_portal_url()) # link to ml.azure.com
run.wait_for_completion(show_output=True)

View File

@ -14,10 +14,9 @@ from flaml import tune
class MyRegularizedGreedyForest(SKLearnEstimator):
def __init__(self, task = 'binary:logistic', n_jobs = 1, max_leaf = 4,
n_iter = 1, n_tree_search = 1, opt_interval = 1, learning_rate = 1.0,
min_samples_leaf = 1, **params):
def __init__(self, task='binary:logistic', n_jobs=1, max_leaf=4,
n_iter=1, n_tree_search=1, opt_interval=1, learning_rate=1.0,
min_samples_leaf=1, **params):
super().__init__(task, **params)
@ -34,24 +33,24 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
'n_tree_search': int(round(n_tree_search)),
'opt_interval': int(round(opt_interval)),
'learning_rate': learning_rate,
'min_samples_leaf':int(round(min_samples_leaf))
}
'min_samples_leaf': int(round(min_samples_leaf))
}
@classmethod
def search_space(cls, data_size, task):
space = {
'max_leaf': {'domain': tune.qloguniform(
lower = 4, upper = data_size, q = 1), 'init_value': 4},
'n_iter': {'domain': tune.qloguniform(
lower = 1, upper = data_size, q = 1), 'init_value': 1},
'n_tree_search': {'domain': tune.qloguniform(
lower = 1, upper = 32768, q = 1), 'init_value': 1},
'opt_interval': {'domain': tune.qloguniform(
lower = 1, upper = 10000, q = 1), 'init_value': 100},
'learning_rate': {'domain': tune.loguniform(
lower = 0.01, upper = 20.0)},
'min_samples_leaf': {'domain': tune.qloguniform(
lower = 1, upper = 20, q = 1), 'init_value': 20},
'max_leaf': {'domain': tune.qloguniform(
lower=4, upper=data_size, q=1), 'init_value': 4},
'n_iter': {'domain': tune.qloguniform(
lower=1, upper=data_size, q=1), 'init_value': 1},
'n_tree_search': {'domain': tune.qloguniform(
lower=1, upper=32768, q=1), 'init_value': 1},
'opt_interval': {'domain': tune.qloguniform(
lower=1, upper=10000, q=1), 'init_value': 100},
'learning_rate': {'domain': tune.loguniform(
lower=0.01, upper=20.0)},
'min_samples_leaf': {'domain': tune.qloguniform(
lower=1, upper=20, q=1), 'init_value': 20},
}
return space
@ -59,22 +58,22 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
def size(cls, config):
max_leaves = int(round(config['max_leaf']))
n_estimators = int(round(config['n_iter']))
return (max_leaves*3 + (max_leaves-1)*4 + 1.0)*n_estimators*8
return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8
@classmethod
def cost_relative2lgbm(cls):
return 1.0
return 1.0
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
weight_test=None, weight_train=None):
weight_test=None, weight_train=None):
from sklearn.metrics import log_loss
y_pred = estimator.predict_proba(X_test)
test_loss = log_loss(y_test, y_pred, labels=labels,
sample_weight=weight_test)
sample_weight=weight_test)
y_pred = estimator.predict_proba(X_train)
train_loss = log_loss(y_train, y_pred, labels=labels,
sample_weight=weight_train)
sample_weight=weight_train)
alpha = 0.5
return test_loss * (1 + alpha) - alpha * train_loss, [test_loss, train_loss]
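
A custom metric with this signature is passed to AutoML.fit via the metric argument. The exact fit call in this test file is outside the hunks shown, so the snippet below is a hedged sketch rather than a copy of the test:

from sklearn.datasets import load_wine
from flaml import AutoML

X_train, y_train = load_wine(return_X_y=True)
automl = AutoML()
automl.fit(X_train=X_train, y_train=y_train, metric=custom_metric,
           task='classification', time_budget=10, n_jobs=1)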
@ -83,43 +82,42 @@ class TestAutoML(unittest.TestCase):
def test_custom_learner(self):
automl = AutoML()
automl.add_learner(learner_name = 'RGF',
learner_class = MyRegularizedGreedyForest)
automl.add_learner(learner_name='RGF',
learner_class=MyRegularizedGreedyForest)
X_train, y_train = load_wine(return_X_y=True)
settings = {
"time_budget": 10, # total running time in seconds
"estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
"task": 'classification', # task type
"sample": True, # whether to subsample training data
"time_budget": 10, # total running time in seconds
"estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
"task": 'classification', # task type
"sample": True, # whether to subsample training data
"log_file_name": "test/wine.log",
"log_training_metric": True, # whether to log training metric
"log_training_metric": True, # whether to log training metric
"n_jobs": 1,
}
'''The main flaml automl API'''
automl.fit(X_train = X_train, y_train = y_train, **settings)
automl.fit(X_train=X_train, y_train=y_train, **settings)
# print the best model found for RGF
print(automl.best_model_for_estimator("RGF"))
def test_ensemble(self):
automl = AutoML()
automl.add_learner(learner_name = 'RGF',
learner_class = MyRegularizedGreedyForest)
automl.add_learner(learner_name='RGF',
learner_class=MyRegularizedGreedyForest)
X_train, y_train = load_wine(return_X_y=True)
settings = {
"time_budget": 10, # total running time in seconds
# "estimator_list": ['lgbm', 'xgboost'],
"estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
"task": 'classification', # task type
"sample": True, # whether to subsample training data
"time_budget": 10, # total running time in seconds
"estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
"task": 'classification', # task type
"sample": True, # whether to subsample training data
"log_file_name": "test/wine.log",
"log_training_metric": True, # whether to log training metric
"log_training_metric": True, # whether to log training metric
"ensemble": True,
"n_jobs": 1,
}
'''The main flaml automl API'''
automl.fit(X_train = X_train, y_train = y_train, **settings)
automl.fit(X_train=X_train, y_train=y_train, **settings)
def test_dataframe(self):
self.test_classification(True)
@ -210,7 +208,7 @@ class TestAutoML(unittest.TestCase):
"model_history": True
}
X_train, y_train = load_boston(return_X_y=True)
n = int(len(y_train)*9//10)
n = int(len(y_train) * 9 // 10)
automl_experiment.fit(X_train=X_train[:n], y_train=y_train[:n],
X_val=X_train[n:], y_val=y_train[n:],
**automl_settings)

View File

@ -39,8 +39,8 @@ class TestLogging(unittest.TestCase):
X_train, y_train = load_boston(return_X_y=True)
n = len(y_train) >> 1
automl.fit(X_train=X_train[:n], y_train=y_train[:n],
X_val=X_train[n:], y_val=y_train[n:],
**automl_settings)
X_val=X_train[n:], y_val=y_train[n:],
**automl_settings)
# Check if the log buffer is populated.
self.assertTrue(len(buf.getvalue()) > 0)
@ -48,4 +48,4 @@ class TestLogging(unittest.TestCase):
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
print(automl.__version__)
print(automl.__version__)

View File

@ -9,23 +9,6 @@ logger = logging.getLogger(__name__)
logger.addHandler(logging.FileHandler('test/tune_pytorch_cifar10.log'))
# __load_data_begin__
def load_data(data_dir="./data"):
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
trainset = torchvision.datasets.CIFAR10(
root=data_dir, train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(
root=data_dir, train=False, download=True, transform=transform)
return trainset, testset
# __load_data_end__
try:
import torch
import torch.nn as nn
@ -35,9 +18,9 @@ try:
import torchvision
import torchvision.transforms as transforms
# __net_begin__
class Net(nn.Module):
def __init__(self, l1=120, l2=84):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
@ -79,7 +62,7 @@ def load_data(data_dir="test/data"):
# __train_begin__
def train_cifar(config, checkpoint_dir=None, data_dir=None):
if not "l1" in config:
if "l1" not in config:
logger.warning(config)
net = Net(2 ** config["l1"], 2 ** config["l2"])
@ -200,8 +183,9 @@ def _test_accuracy(net, device="cpu"):
# __main_begin__
def cifar10_main(method='BlendSearch', num_samples=10, max_num_epochs=100,
gpus_per_trial=2):
def cifar10_main(
method='BlendSearch', num_samples=10, max_num_epochs=100, gpus_per_trial=2
):
data_dir = os.path.abspath("test/data")
load_data(data_dir) # Download data for all trials before starting the run
if method == 'BlendSearch':
@ -214,15 +198,15 @@ def cifar10_main(method='BlendSearch', num_samples=10, max_num_epochs=100,
"l2": tune.randint(2, 8),
"lr": tune.loguniform(1e-4, 1e-1),
"num_epochs": tune.qloguniform(1, max_num_epochs, q=1),
"batch_size": tune.randint(1, 4)#tune.choice([2, 4, 8, 16])
"batch_size": tune.randint(1, 4)
}
else:
config = {
"l1": tune.randint(2, 9),
"l2": tune.randint(2, 9),
"lr": tune.loguniform(1e-4, 1e-1),
"num_epochs": tune.qloguniform(1, max_num_epochs+1, q=1),
"batch_size": tune.randint(1, 5)#tune.choice([2, 4, 8, 16])
"num_epochs": tune.qloguniform(1, max_num_epochs + 1, q=1),
"batch_size": tune.randint(1, 5)
}
import ray
time_budget_s = 3600
@ -274,7 +258,7 @@ def cifar10_main(method='BlendSearch', num_samples=10, max_num_epochs=100,
from ray.tune.schedulers import ASHAScheduler
scheduler = ASHAScheduler(
max_t=max_num_epochs,
grace_period=1)
grace_period=1)
result = tune.run(
tune.with_parameters(train_cifar, data_dir=data_dir),
resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
@ -297,7 +281,7 @@ def cifar10_main(method='BlendSearch', num_samples=10, max_num_epochs=100,
best_trial.metric_analysis["accuracy"]["max"]))
best_trained_model = Net(2**best_trial.config["l1"],
2**best_trial.config["l2"])
2**best_trial.config["l2"])
device = "cpu"
if torch.cuda.is_available():
device = "cuda:0"
@ -315,8 +299,8 @@ def cifar10_main(method='BlendSearch', num_samples=10, max_num_epochs=100,
# __main_end__
gpus_per_trial=0#.5
num_samples=500
gpus_per_trial = 0 # 0.5 on GPU server
num_samples = 500
def _test_cifar10_bs():
@ -325,27 +309,27 @@ def _test_cifar10_bs():
def _test_cifar10_cfo():
cifar10_main('CFO',
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
def _test_cifar10_optuna():
cifar10_main('Optuna',
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
def _test_cifar10_asha():
cifar10_main('ASHA',
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
def _test_cifar10_bohb():
cifar10_main('BOHB',
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
def _test_cifar10_nevergrad():
cifar10_main('Nevergrad',
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
num_samples=num_samples, gpus_per_trial=gpus_per_trial)
if __name__ == "__main__":

View File

@ -24,7 +24,7 @@ def _test(split_type):
X, y = fetch_openml(name=dataset, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
pred = automl.predict(X_test)
@ -32,6 +32,7 @@ def _test(split_type):
print(acc)
def _test_uniform():
_test(split_type="uniform")

View File

@ -23,12 +23,12 @@ class TestTrainingLog(unittest.TestCase):
"task": 'regression',
"log_file_name": filename,
"log_training_metric": True,
"mem_thres": 1024*1024,
"mem_thres": 1024 * 1024,
"n_jobs": 1,
"model_history": True,
"verbose": 2,
}
X_train, y_train = load_boston(return_X_y=True)
X_train, y_train = load_boston(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)

View File

@ -57,7 +57,7 @@ def _test_xgboost(method='BlendSearch'):
}
max_iter = 10
for num_samples in [128]:
time_budget_s = 60 #None
time_budget_s = 60
for n_cpu in [8]:
start_time = time.time()
ray.init(num_cpus=n_cpu, num_gpus=0)
@ -79,7 +79,7 @@ def _test_xgboost(method='BlendSearch'):
# You can add "gpu": 0.1 to allocate GPUs
resources_per_trial={"cpu": 1},
local_dir='logs/',
num_samples=num_samples*n_cpu,
num_samples=num_samples * n_cpu,
time_budget_s=time_budget_s,
use_ray=True)
else:
@ -112,7 +112,7 @@ def _test_xgboost(method='BlendSearch'):
algo = NevergradSearch(optimizer=ng.optimizers.OnePlusOne)
elif 'ZOOpt' == method:
from ray.tune.suggest.zoopt import ZOOptSearch
algo = ZOOptSearch(budget=num_samples*n_cpu)
algo = ZOOptSearch(budget=num_samples * n_cpu)
elif 'Ax' == method:
from ray.tune.suggest.ax import AxSearch
algo = AxSearch()
@ -132,7 +132,8 @@ def _test_xgboost(method='BlendSearch'):
# You can add "gpu": 0.1 to allocate GPUs
resources_per_trial={"cpu": 1},
config=search_space, local_dir='logs/',
num_samples=num_samples*n_cpu, time_budget_s=time_budget_s,
num_samples=num_samples * n_cpu,
time_budget_s=time_budget_s,
scheduler=scheduler, search_alg=algo)
ray.shutdown()
# # Load the best model checkpoint
@ -140,7 +141,7 @@ def _test_xgboost(method='BlendSearch'):
# best_bst = xgb.Booster()
# best_bst.load_model(os.path.join(analysis.best_checkpoint,
# "model.xgb"))
best_trial = analysis.get_best_trial("eval-logloss","min","all")
best_trial = analysis.get_best_trial("eval-logloss", "min", "all")
accuracy = 1. - best_trial.metric_analysis["eval-error"]["min"]
logloss = best_trial.metric_analysis["eval-logloss"]["min"]
logger.info(f"method={method}")
@ -162,14 +163,14 @@ def test_nested():
}
def simple_func(config):
tune.report(
metric=(config["cost_related"]["a"]-4)**2 * (config["b"]-0.7)**2)
tune.report(metric=(config["cost_related"]["a"] - 4)**2
* (config["b"] - 0.7)**2)
analysis = tune.run(
tune.run(
simple_func,
config=search_space,
low_cost_partial_config={
"cost_related": {"a": 1,}
"cost_related": {"a": 1}
},
metric="metric",
mode="min",

View File

@ -4,7 +4,6 @@ import flaml
class TestVersion(unittest.TestCase):
def test_version(self):
self.assertTrue(hasattr(flaml, '__version__'))
self.assertTrue(len(flaml.__version__) > 0)

View File

@ -15,7 +15,7 @@ class XGBoost2D(XGBoostSklearnEstimator):
@classmethod
def search_space(cls, data_size, task):
upper = min(32768,int(data_size))
upper = min(32768, int(data_size))
return {
'n_estimators': {
'domain': tune.qloguniform(lower=4, upper=upper, q=1),
@ -30,29 +30,25 @@ class XGBoost2D(XGBoostSklearnEstimator):
def test_simple(method=None):
automl = AutoML()
automl.add_learner(learner_name = 'XGBoost2D',
learner_class = XGBoost2D)
automl.add_learner(learner_name='XGBoost2D',
learner_class=XGBoost2D)
automl_settings = {
"estimator_list": ['XGBoost2D'],
# "metric": 'accuracy',
"task": 'classification',
"log_file_name": f"test/xgboost2d_{dataset}_{method}.log",
# "model_history": True,
# "log_training_metric": True,
# "split_type": split_type,
"n_jobs": 1,
"hpo_method": method,
"log_type": "all",
"time_budget": 3#6000,
"time_budget": 3
}
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except:
except FileNotFoundError:
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

View File

@ -15,7 +15,7 @@ class XGBoost2D(XGBoostSklearnEstimator):
@classmethod
def search_space(cls, data_size, task):
upper = min(32768,int(data_size))
upper = min(32768, int(data_size))
return {
'n_estimators': {
'domain': tune.qloguniform(lower=4, upper=upper, q=1),
@ -30,19 +30,14 @@ class XGBoost2D(XGBoostSklearnEstimator):
def _test_simple(method=None, size_ratio=1.0):
automl = AutoML()
automl.add_learner(learner_name = 'XGBoost2D',
learner_class = XGBoost2D)
automl.add_learner(learner_name='XGBoost2D',
learner_class=XGBoost2D)
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except:
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
X, y = fetch_openml(name=dataset, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=42)
random_state=42)
final_size = int(len(y_train)*size_ratio)
final_size = int(len(y_train) * size_ratio)
X_train = X_train[:final_size]
y_train = y_train[:final_size]
automl_settings = {
@ -62,14 +57,17 @@ def _test_simple(method=None, size_ratio=1.0):
def _test_grid_1():
_test_simple(method="grid", size_ratio=1.0/3.0)
_test_simple(method="grid", size_ratio=1.0 / 3.0)
def _test_grid_2():
_test_simple(method="grid", size_ratio=2.0/3.0)
_test_simple(method="grid", size_ratio=2.0 / 3.0)
def _test_grid_4():
_test_simple(method="grid", size_ratio=0.5)
def _test_grid_3():
_test_simple(method="grid", size_ratio=1.0)