From 549a0dfb5392642a634ad1b10cd809509a11b4c2 Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Wed, 3 Nov 2021 19:08:23 -0700
Subject: [PATCH] limit time and memory consumption (#264)

* limit time and memory

* separate tests

* lrl1 can't be limited by limit_resource

* free memory when possible

* passthrough=False when ensemble fails; retrain when trained_estimator is None

* use callback for resource limit

* handle lower version of xgb with no callback

* free mem ratio

* reduce verbosity

* retrain_final when max_iter==1

* remove trained_estimator from result

* model_history

* wheel

* retrain time as best_config_train_time

* ci: libomp version for xgboost on macos

* limit_resource not working in windows

* test pickle load

* mute forecaster

* notebook update

* check hard

* preventive callback

* add use_ray
---
 .github/workflows/python-package.yml        |    8 +-
 flaml/automl.py                             |  150 +-
 flaml/data.py                               |    5 +-
 flaml/model.py                              |  495 +++++--
 notebook/flaml_automl.ipynb                 | 1429 +++++++++----------
 setup.py                                    |   10 +-
 test/test_classification.py                 |  323 +++++
 test/{test_automl.py => test_multiclass.py} |  517 +------
 test/test_notebook_example.py               |    2 +-
 test/test_python_log.py                     |    6 +
 test/test_regression.py                     |  221 +++
 test/test_training_log.py                   |    1 +
 12 files changed, 1761 insertions(+), 1406 deletions(-)
 create mode 100644 test/test_classification.py
 rename test/{test_automl.py => test_multiclass.py} (52%)
 create mode 100644 test/test_regression.py

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3367931d5a..499c114991 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -24,9 +24,11 @@ jobs:
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python-version }}
-    - name: If mac, install libomp to facilitate lgbm install
+    - name: If mac, install libomp to facilitate lgbm and xgboost install
       if: matrix.os == 'macOS-latest'
       run: |
+        # remove libomp version constraint after xgboost works with libomp>11.1.0
+        wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
         brew install libomp
         export CC=/usr/bin/clang
         export CXX=/usr/bin/clang++
@@ -36,7 +38,7 @@
         export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
     - name: Install packages and dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip wheel
         pip install -e .[test]
     - name: If linux or mac, install ray
       if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'
@@ -65,7 +67,7 @@
       with:
         file: ./coverage.xml
         flags: unittests
-    
+
   docs:
 
     runs-on: ubuntu-latest
diff --git a/flaml/automl.py b/flaml/automl.py
index 07f7f41dd1..f77329e3c8 100644
--- a/flaml/automl.py
+++ b/flaml/automl.py
@@ -248,7 +248,7 @@ class AutoMLState:
             "wall_clock_time": time.time() - self._start_time_flag,
             "metric_for_logging": metric_for_logging,
             "val_loss": val_loss,
-            "trained_estimator": trained_estimator,
+            "trained_estimator": trained_estimator if self.save_model_history else None,
         }
         if sampled_weight is not None:
             self.fit_kwargs["sample_weight"] = weight
@@ -403,9 +403,10 @@ class AutoML:
 
     @property
    def best_config_train_time(self):
-        """A float of the seconds taken by training the
-        best config."""
-        return self._search_states[self._best_estimator].best_config_train_time
+        """A float of the seconds taken by training the best config."""
+        return getattr(
+            
self._search_states[self._best_estimator], "best_config_train_time", None + ) @property def classes_(self): @@ -529,8 +530,9 @@ class AutoML: self._nrow, self._ndim = X_train_all.shape if self._state.task == TS_FORECAST: X_train_all = pd.DataFrame(X_train_all) - assert X_train_all[X_train_all.columns[0]].dtype.name == 'datetime64[ns]', ( - f"For '{TS_FORECAST}' task, the first column must contain timestamp values.") + assert ( + X_train_all[X_train_all.columns[0]].dtype.name == "datetime64[ns]" + ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values." X, y = X_train_all, y_train_all elif dataframe is not None and label is not None: assert isinstance( @@ -539,8 +541,9 @@ class AutoML: assert label in dataframe.columns, "label must a column name in dataframe" self._df = True if self._state.task == TS_FORECAST: - assert dataframe[dataframe.columns[0]].dtype.name == 'datetime64[ns]', ( - f"For '{TS_FORECAST}' task, the first column must contain timestamp values.") + assert ( + dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]" + ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values." X = dataframe.drop(columns=label) self._nrow, self._ndim = X.shape y = dataframe[label] @@ -584,7 +587,9 @@ class AutoML: else: self._state.X_val = X_val if self._label_transformer: - self._state.y_val = self._label_transformer.transform(y_val, self._state.task) + self._state.y_val = self._label_transformer.transform( + y_val, self._state.task + ) else: self._state.y_val = y_val else: @@ -1064,7 +1069,8 @@ class AutoML: return "holdout" nrow, dim = self._nrow, self._ndim if ( - nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600) + time_budget is None + or nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD ): # time allows or sampling can be used and cv is necessary @@ -1301,6 +1307,7 @@ class AutoML: append_log=False, auto_augment=True, min_sample_size=MIN_SAMPLE_TRAIN, + use_ray=False, **fit_kwargs, ): """Find a model for a given task @@ -1414,7 +1421,9 @@ class AutoML: In the following code example, we get starting_points from the automl_experiment and use them in the new_automl_experiment. e.g., + .. code-block:: python + from flaml import AutoML automl_experiment = AutoML() X_train, y_train = load_iris(return_X_y=True) @@ -1440,6 +1449,10 @@ class AutoML: augment rare classes. min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample size when sample=True. + use_ray: boolean, default=False | Whether to use ray to run the training + in separate processes. This can be used to prevent OOM for large + datasets, but will incur more overhead in time. Only use it if + you run into OOM failures. **fit_kwargs: Other key word arguments to pass to fit() function of the searched learners, such as sample_weight. Include period as a key word argument for 'ts_forecast' task. 
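
A minimal sketch of the new `use_ray` option documented in the docstring above. The dataset loader and time budget here are illustrative (borrowed from the docstring's own starting_points example), not part of the patch:

```python
# Hypothetical usage of the `use_ray` flag this patch adds to AutoML.fit():
# each trial runs in a separate Ray process, so an out-of-memory trial is
# killed without taking down the driver. Requires `pip install flaml[ray]`,
# and is only worth the extra overhead if you actually hit OOM failures.
from flaml import AutoML
from sklearn.datasets import load_iris

X_train, y_train = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(
    X_train,
    y_train,
    task="classification",
    time_budget=60,  # illustrative budget in seconds
    use_ray=True,    # new in this patch; also implied by n_concurrent_trials > 1
)
```

Note that the patch sets `self._use_ray = use_ray or self._n_concurrent_trials > 1`, so parallel search always goes through ray regardless of this flag.
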
@@ -1483,8 +1496,10 @@ class AutoML: ) self._retrain_final = ( retrain_full is True - and (eval_method == "holdout" and self._state.X_val is None) - or (eval_method == "cv") + and eval_method == "holdout" + and self._state.X_val is None + or eval_method == "cv" + or max_iter == 1 ) self._auto_augment = auto_augment self._min_sample_size = min_sample_size @@ -1564,7 +1579,7 @@ class AutoML: logger.info("List of ML learners in AutoML Run: {}".format(estimator_list)) self.estimator_list = estimator_list self._hpo_method = hpo_method or ("cfo" if n_concurrent_trials == 1 else "bs") - self._state.time_budget = time_budget + self._state.time_budget = time_budget or 1e10 self._active_estimators = estimator_list.copy() self._ensemble = ensemble self._max_iter = max_iter @@ -1573,10 +1588,11 @@ class AutoML: self._state.train_time_limit = train_time_limit self._log_type = log_type self.split_ratio = split_ratio - self._save_model_history = model_history + self._state.save_model_history = model_history self._state.n_jobs = n_jobs self._n_concurrent_trials = n_concurrent_trials self._early_stop = early_stop + self._use_ray = use_ray or self._n_concurrent_trials > 1 if log_file_name: with training_log_writer(log_file_name, append_log) as save_helper: self._training_log = save_helper @@ -1627,7 +1643,7 @@ class AutoML: from ray.tune.suggest import ConcurrencyLimiter except (ImportError, AssertionError): raise ImportError( - "n_concurrent_trial > 1 requires installation of ray. " + "n_concurrent_trial>1 or use_ray=True requires installation of ray. " "Please run pip install flaml[ray]" ) if self._hpo_method in ("cfo", "grid"): @@ -1693,7 +1709,8 @@ class AutoML: resources_per_trial=resources_per_trial, time_budget_s=self._state.time_budget, num_samples=self._max_iter, - verbose=self.verbose, + verbose=max(self.verbose - 3, 0), + raise_on_failed_trial=False, ) # logger.info([trial.last_result for trial in analysis.trials]) trials = sorted( @@ -1712,7 +1729,7 @@ class AutoML: config = result["config"] estimator = config.get("ml", config)["learner"] search_state = self._search_states[estimator] - search_state.update(result, 0, self._save_model_history) + search_state.update(result, 0, self._state.save_model_history) if result["wall_clock_time"] is not None: self._state.time_from_start = result["wall_clock_time"] if search_state.sample_size == self._state.data_size: @@ -1727,7 +1744,7 @@ class AutoML: config, self._time_taken_best_iter, ) - if self._save_model_history: + if self._state.save_model_history: self._model_history[ _track_iter ] = search_state.trained_estimator @@ -1902,7 +1919,7 @@ class AutoML: search_state.update( result, time_used=time_used, - save_model_history=self._save_model_history, + save_model_history=self._state.save_model_history, ) if self._estimator_index is None: # update init eci estimate @@ -1945,18 +1962,27 @@ class AutoML: search_state.best_config, self._state.time_from_start, ) - if self._save_model_history: + if self._state.save_model_history: self._model_history[ self._track_iter ] = search_state.trained_estimator elif self._trained_estimator: del self._trained_estimator self._trained_estimator = None - self._trained_estimator = search_state.trained_estimator + if not self._retrain_final: + self._trained_estimator = search_state.trained_estimator self._best_iteration = self._track_iter self._time_taken_best_iter = self._state.time_from_start better = True next_trial_time = search_state.time2eval_best + if search_state.trained_estimator and not ( + 
self._state.save_model_history or self._ensemble + ): + # free RAM + if search_state.trained_estimator != self._trained_estimator: + search_state.trained_estimator.cleanup() + del search_state.trained_estimator + search_state.trained_estimator = None if better or self._log_type == "all": if self._training_log: self._training_log.append( @@ -2049,7 +2075,9 @@ class AutoML: logger.info( "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time) ) - self._retrained_config[best_config_sig] = retrain_time + self._retrained_config[ + best_config_sig + ] = state.best_config_train_time = retrain_time est_retrain_time = 0 self._state.time_from_start = time.time() - self._start_time_flag if ( @@ -2083,7 +2111,7 @@ class AutoML: self._selected = None self.modelcount = 0 - if self._n_concurrent_trials == 1: + if not self._use_ray: self._search_sequential() else: self._search_parallel() @@ -2103,12 +2131,29 @@ class AutoML: "regression", ): search_states = list( - x for x in self._search_states.items() if x[1].trained_estimator + x for x in self._search_states.items() if x[1].best_config ) search_states.sort(key=lambda x: x[1].best_loss) - estimators = [(x[0], x[1].trained_estimator) for x in search_states[:2]] + estimators = [ + ( + x[0], + x[1].learner_class( + task=self._state.task, + n_jobs=self._state.n_jobs, + **x[1].best_config, + ), + ) + for x in search_states[:2] + ] estimators += [ - (x[0], x[1].trained_estimator) + ( + x[0], + x[1].learner_class( + task=self._state.task, + n_jobs=self._state.n_jobs, + **x[1].best_config, + ), + ) for x in search_states[2:] if x[1].best_loss < 4 * self._selected.best_loss ] @@ -2135,19 +2180,49 @@ class AutoML: ) if self._sample_weight_full is not None: self._state.fit_kwargs["sample_weight"] = self._sample_weight_full - stacker.fit( - self._X_train_all, self._y_train_all, **self._state.fit_kwargs - ) - logger.info(f"ensemble: {stacker}") - self._trained_estimator = stacker - self._trained_estimator.model = stacker + for e in estimators: + e[1].__class__.init() + try: + stacker.fit( + self._X_train_all, self._y_train_all, **self._state.fit_kwargs + ) + logger.info(f"ensemble: {stacker}") + self._trained_estimator = stacker + self._trained_estimator.model = stacker + except ValueError as e: + if passthrough: + logger.warning( + "Using passthrough=False for ensemble because the data contain categorical features." 
+ ) + stacker = Stacker( + estimators, + final_estimator, + n_jobs=self._state.n_jobs, + passthrough=False, + ) + stacker.fit( + self._X_train_all, + self._y_train_all, + **self._state.fit_kwargs, + ) + logger.info(f"ensemble: {stacker}") + self._trained_estimator = stacker + self._trained_estimator.model = stacker + else: + raise e elif self._retrain_final: # reset time budget for retraining - self._state.time_from_start -= self._state.time_budget - if self._state.task == TS_FORECAST or ( - self._state.time_budget - self._state.time_from_start - > self._selected.est_retrain_time(self.data_size_full) - and self._selected.best_config_sample_size == self._state.data_size + if self._max_iter > 1: + self._state.time_from_start -= self._state.time_budget + if ( + self._state.task == TS_FORECAST + or self._trained_estimator is None + or ( + self._state.time_budget - self._state.time_from_start + > self._selected.est_retrain_time(self.data_size_full) + and self._selected.best_config_sample_size + == self._state.data_size + ) ): state = self._search_states[self._best_estimator] ( @@ -2163,6 +2238,7 @@ class AutoML: self._best_estimator, retrain_time ) ) + state.best_config_train_time = retrain_time if self._trained_estimator: logger.info(f"retrained model: {self._trained_estimator.model}") else: diff --git a/flaml/data.py b/flaml/data.py index 5e87f2bd38..bc0c9eb951 100644 --- a/flaml/data.py +++ b/flaml/data.py @@ -275,9 +275,8 @@ class DataTransformer: X[column] = X[column].map(datetime.toordinal) datetime_columns.append(column) del tmp_dt - else: - X[column] = X[column].fillna(np.nan) - num_columns.append(column) + X[column] = X[column].fillna(np.nan) + num_columns.append(column) X = X[cat_columns + num_columns] if task == TS_FORECAST: X.insert(0, TS_TIMESTAMP_COL, ds_col) diff --git a/flaml/model.py b/flaml/model.py index c00d958af9..4f5d5cfd7f 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -2,20 +2,67 @@ * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. """ - +from contextlib import contextmanager +from functools import partial +import signal +import os +from typing import Callable, List import numpy as np import time from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier from sklearn.linear_model import LogisticRegression +from sklearn.dummy import DummyClassifier, DummyRegressor from scipy.sparse import issparse import pandas as pd -from . import tune -from .data import group_counts, CLASSIFICATION, TS_FORECAST, TS_TIMESTAMP_COL, TS_VALUE_COL - import logging +from . 
import tune +from .data import ( + group_counts, + CLASSIFICATION, + TS_FORECAST, + TS_TIMESTAMP_COL, + TS_VALUE_COL, +) + +try: + import psutil +except ImportError: + psutil = None +try: + import resource +except ImportError: + resource = None logger = logging.getLogger("flaml.automl") +FREE_MEM_RATIO = 0.2 + + +def TimeoutHandler(sig, frame): + raise TimeoutError(sig, frame) + + +@contextmanager +def limit_resource(memory_limit, time_limit): + if memory_limit > 0: + soft, hard = resource.getrlimit(resource.RLIMIT_AS) + if soft < 0 and (hard < 0 or memory_limit <= hard) or memory_limit < soft: + resource.setrlimit(resource.RLIMIT_AS, (memory_limit, hard)) + main_thread = False + if time_limit is not None: + try: + signal.signal(signal.SIGALRM, TimeoutHandler) + signal.alarm(int(time_limit) or 1) + main_thread = True + except ValueError: + pass + try: + yield + finally: + if main_thread: + signal.alarm(0) + if memory_limit > 0: + resource.setrlimit(resource.RLIMIT_AS, (soft, hard)) class BaseEstimator: @@ -112,7 +159,35 @@ class BaseEstimator: Returns: train_time: A float of the training time in seconds """ - return self._fit(X_train, y_train, **kwargs) + if ( + getattr(self, "limit_resource", None) + and resource is not None + and (budget is not None or psutil is not None) + ): + start_time = time.time() + mem = psutil.virtual_memory() if psutil is not None else None + try: + with limit_resource( + mem.available * (1 - FREE_MEM_RATIO) + + psutil.Process(os.getpid()).memory_info().rss + if mem is not None + else -1, + budget, + ): + train_time = self._fit(X_train, y_train, **kwargs) + except (MemoryError, TimeoutError) as e: + logger.warning(f"{e.__class__} {e}") + if self._task in CLASSIFICATION: + model = DummyClassifier() + else: + model = DummyRegressor() + X_train = self._preprocess(X_train) + model.fit(X_train, y_train) + self._model = model + train_time = time.time() - start_time + else: + train_time = self._fit(X_train, y_train, **kwargs) + return train_time def predict(self, X_test): """Predict label from features @@ -223,6 +298,9 @@ class SKLearnEstimator(BaseEstimator): class LGBMEstimator(BaseEstimator): + ITER_HP = "n_estimators" + HAS_CALLBACK = True + @classmethod def search_space(cls, data_size, **params): upper = min(32768, int(data_size)) @@ -297,6 +375,8 @@ class LGBMEstimator(BaseEstimator): self.estimator_class = LGBMClassifier self._time_per_iter = None self._train_size = 0 + self._mem_per_iter = 1 + self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None def _preprocess(self, X): if ( @@ -316,50 +396,111 @@ class LGBMEstimator(BaseEstimator): def fit(self, X_train, y_train, budget=None, **kwargs): start_time = time.time() - n_iter = self.params["n_estimators"] + deadline = start_time + budget if budget else np.inf + n_iter = self.params[self.ITER_HP] trained = False - if ( - (not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4) - and budget is not None - and n_iter > 1 - ): - self.params["n_estimators"] = 1 - self._t1 = self._fit(X_train, y_train, **kwargs) - if self._t1 >= budget or n_iter == 1: - # self.params["n_estimators"] = n_iter - return self._t1 - self.params["n_estimators"] = min(n_iter, 4) - self._t2 = self._fit(X_train, y_train, **kwargs) - self._time_per_iter = ( - (self._t2 - self._t1) / (self.params["n_estimators"] - 1) - if self._t2 > self._t1 - else self._t1 - if self._t1 - else 0.001 - ) - self._train_size = X_train.shape[0] - if self._t1 + self._t2 >= budget or n_iter == self.params["n_estimators"]: - # 
self.params["n_estimators"] = n_iter - return time.time() - start_time - trained = True - if budget is not None and n_iter > 1: - max_iter = min( - n_iter, - int( - (budget - time.time() + start_time - self._t1) / self._time_per_iter - + 1 - ), - ) - if trained and max_iter <= self.params["n_estimators"]: - return time.time() - start_time - self.params["n_estimators"] = max_iter - if self.params["n_estimators"] > 0: - self._fit(X_train, y_train, **kwargs) + if not self.HAS_CALLBACK: + mem0 = psutil.virtual_memory().available if psutil is not None else 1 + if ( + ( + not self._time_per_iter + or abs(self._train_size - X_train.shape[0]) > 4 + ) + and budget is not None + or self._mem_per_iter <= 1 + and psutil is not None + ) and n_iter > 1: + self.params[self.ITER_HP] = 1 + self._t1 = self._fit(X_train, y_train, **kwargs) + if budget is not None and self._t1 >= budget or n_iter == 1: + # self.params[self.ITER_HP] = n_iter + return self._t1 + mem1 = psutil.virtual_memory().available if psutil is not None else 1 + self._mem1 = mem0 - mem1 + self.params[self.ITER_HP] = min(n_iter, 4) + self._t2 = self._fit(X_train, y_train, **kwargs) + mem2 = psutil.virtual_memory().available if psutil is not None else 1 + self._mem2 = max(mem0 - mem2, self._mem1) + # if self._mem1 <= 0: + # self._mem_per_iter = self._mem2 / (self.params[self.ITER_HP] + 1) + # elif self._mem2 <= 0: + # self._mem_per_iter = self._mem1 + # else: + self._mem_per_iter = min( + self._mem1, self._mem2 / self.params[self.ITER_HP] + ) + if self._mem_per_iter <= 1 and psutil is not None: + n_iter = self.params[self.ITER_HP] + self._time_per_iter = ( + (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1) + if self._t2 > self._t1 + else self._t1 + if self._t1 + else 0.001 + ) + self._train_size = X_train.shape[0] + if ( + budget is not None + and self._t1 + self._t2 >= budget + or n_iter == self.params[self.ITER_HP] + ): + # self.params[self.ITER_HP] = n_iter + return time.time() - start_time + trained = True + # logger.debug(mem0) + # logger.debug(self._mem_per_iter) + if n_iter > 1: + max_iter = min( + n_iter, + int( + (budget - time.time() + start_time - self._t1) + / self._time_per_iter + + 1 + ) + if budget is not None + else n_iter, + int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter) + if psutil is not None + else n_iter, + ) + if trained and max_iter <= self.params[self.ITER_HP]: + return time.time() - start_time + self.params[self.ITER_HP] = max_iter + if self.params[self.ITER_HP] > 0: + if self.HAS_CALLBACK: + self._fit( + X_train, y_train, callbacks=self._callbacks(start_time, deadline), **kwargs + ) + best_iteration = ( + self._model.get_booster().best_iteration + if isinstance(self, XGBoostSklearnEstimator) + else self._model.best_iteration_ + ) + if best_iteration is not None: + self._model.set_params(n_estimators=best_iteration + 1) + else: + self._fit(X_train, y_train, **kwargs) else: - self.params["n_estimators"] = self._model.n_estimators + self.params[self.ITER_HP] = self._model.n_estimators train_time = time.time() - start_time return train_time + def _callbacks(self, start_time, deadline) -> List[Callable]: + return [partial(self._callback, start_time, deadline)] + + def _callback(self, start_time, deadline, env) -> None: + from lightgbm.callback import EarlyStopException + + now = time.time() + if env.iteration == 0: + self._time_per_iter = now - start_time + if now + self._time_per_iter > deadline: + raise EarlyStopException(env.iteration, env.evaluation_result_list) + if psutil is not None: + mem = 
psutil.virtual_memory() + if mem.available / mem.total < FREE_MEM_RATIO: + raise EarlyStopException(env.iteration, env.evaluation_result_list) + class XGBoostEstimator(SKLearnEstimator): """not using sklearn API, used for regression""" @@ -439,6 +580,7 @@ class XGBoostEstimator(SKLearnEstimator): import xgboost as xgb start_time = time.time() + deadline = start_time + budget if budget else np.inf if issparse(X_train): self.params["tree_method"] = "auto" else: @@ -456,9 +598,20 @@ class XGBoostEstimator(SKLearnEstimator): if "objective" in self.params: del self.params["objective"] _n_estimators = self.params.pop("n_estimators") - self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj) + callbacks = XGBoostEstimator._callbacks(start_time, deadline) + if callbacks: + self._model = xgb.train( + self.params, + dtrain, + _n_estimators, + obj=obj, + callbacks=callbacks, + ) + self.params["n_estimators"] = self._model.best_iteration + 1 + else: + self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj) + self.params["n_estimators"] = _n_estimators self.params["objective"] = objective - self.params["n_estimators"] = _n_estimators del dtrain train_time = time.time() - start_time return train_time @@ -471,6 +624,28 @@ class XGBoostEstimator(SKLearnEstimator): dtest = xgb.DMatrix(X_test) return super().predict(dtest) + @classmethod + def _callbacks(cls, start_time, deadline): + try: + from xgboost.callback import TrainingCallback + except ImportError: # for xgboost<1.3 + return None + + class ResourceLimit(TrainingCallback): + def after_iteration(self, model, epoch, evals_log) -> bool: + now = time.time() + if epoch == 0: + self._time_per_iter = now - start_time + if now + self._time_per_iter > deadline: + return True + if psutil is not None: + mem = psutil.virtual_memory() + if mem.available / mem.total < FREE_MEM_RATIO: + return True + return False + + return [ResourceLimit()] + class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): """using sklearn API, used for classification""" @@ -513,8 +688,13 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator): self.params["tree_method"] = "auto" return super().fit(X_train, y_train, budget, **kwargs) + def _callbacks(self, start_time, deadline) -> List[Callable]: + return XGBoostEstimator._callbacks(start_time, deadline) + class RandomForestEstimator(SKLearnEstimator, LGBMEstimator): + HAS_CALLBACK = False + @classmethod def search_space(cls, data_size, task, **params): data_size = int(data_size) @@ -607,6 +787,8 @@ class LRL1Classifier(SKLearnEstimator): class LRL2Classifier(SKLearnEstimator): + limit_resource = True + @classmethod def search_space(cls, **params): return LRL1Classifier.search_space(**params) @@ -629,8 +811,7 @@ class LRL2Classifier(SKLearnEstimator): class CatBoostEstimator(BaseEstimator): - _time_per_iter = None - _train_size = 0 + ITER_HP = "n_estimators" @classmethod def search_space(cls, data_size, **params): @@ -661,11 +842,6 @@ class CatBoostEstimator(BaseEstimator): def cost_relative2lgbm(cls): return 15 - @classmethod - def init(cls): - CatBoostEstimator._time_per_iter = None - CatBoostEstimator._train_size = 0 - def _preprocess(self, X): if isinstance(X, pd.DataFrame): cat_columns = X.select_dtypes(include=["category"]).columns @@ -719,87 +895,36 @@ class CatBoostEstimator(BaseEstimator): import shutil start_time = time.time() + deadline = start_time + budget if budget else np.inf train_dir = f"catboost_{str(start_time)}" - n_iter = self.params["n_estimators"] X_train = 
self._preprocess(X_train) if isinstance(X_train, pd.DataFrame): cat_features = list(X_train.select_dtypes(include="category").columns) else: cat_features = [] - # from catboost import CatBoostError - # try: - trained = False - if ( - ( - not CatBoostEstimator._time_per_iter - or abs(CatBoostEstimator._train_size - len(y_train)) > 4 - ) - and budget - and n_iter > 4 - ): - # measure the time per iteration - self.params["n_estimators"] = 1 - CatBoostEstimator._smallmodel = self.estimator_class( - train_dir=train_dir, **self.params - ) - CatBoostEstimator._smallmodel.fit( - X_train, y_train, cat_features=cat_features, **kwargs - ) - CatBoostEstimator._t1 = time.time() - start_time - if CatBoostEstimator._t1 >= budget or n_iter == 1: - # self.params["n_estimators"] = n_iter - self._model = CatBoostEstimator._smallmodel - shutil.rmtree(train_dir, ignore_errors=True) - return CatBoostEstimator._t1 - self.params["n_estimators"] = min(n_iter, 4) - CatBoostEstimator._smallmodel = self.estimator_class( - train_dir=train_dir, **self.params - ) - CatBoostEstimator._smallmodel.fit( - X_train, y_train, cat_features=cat_features, **kwargs - ) - CatBoostEstimator._time_per_iter = ( - time.time() - start_time - CatBoostEstimator._t1 - ) / (self.params["n_estimators"] - 1) - if CatBoostEstimator._time_per_iter <= 0: - CatBoostEstimator._time_per_iter = CatBoostEstimator._t1 - CatBoostEstimator._train_size = len(y_train) - if ( - time.time() - start_time >= budget - or n_iter == self.params["n_estimators"] - ): - # self.params["n_estimators"] = n_iter - self._model = CatBoostEstimator._smallmodel - shutil.rmtree(train_dir, ignore_errors=True) - return time.time() - start_time - trained = True - if budget and n_iter > 4: - train_times = 1 - max_iter = min( - n_iter, - int( - (budget - time.time() + start_time - CatBoostEstimator._t1) - / train_times - / CatBoostEstimator._time_per_iter - + 1 - ), - ) - self._model = CatBoostEstimator._smallmodel - if trained and max_iter <= self.params["n_estimators"]: - return time.time() - start_time - self.params["n_estimators"] = max_iter - if self.params["n_estimators"] > 0: - n = max(int(len(y_train) * 0.9), len(y_train) - 1000) - X_tr, y_tr = X_train[:n], y_train[:n] - if "sample_weight" in kwargs: - weight = kwargs["sample_weight"] - if weight is not None: - kwargs["sample_weight"] = weight[:n] - else: - weight = None - from catboost import Pool + n = max(int(len(y_train) * 0.9), len(y_train) - 1000) + X_tr, y_tr = X_train[:n], y_train[:n] + if "sample_weight" in kwargs: + weight = kwargs["sample_weight"] + if weight is not None: + kwargs["sample_weight"] = weight[:n] + else: + weight = None + from catboost import Pool, __version__ - model = self.estimator_class(train_dir=train_dir, **self.params) + model = self.estimator_class(train_dir=train_dir, **self.params) + if __version__ >= "0.26": + model.fit( + X_tr, + y_tr, + cat_features=cat_features, + eval_set=Pool( + data=X_train[n:], label=y_train[n:], cat_features=cat_features + ), + callbacks=CatBoostEstimator._callbacks(start_time, deadline), + **kwargs, + ) + else: model.fit( X_tr, y_tr, @@ -808,18 +933,32 @@ class CatBoostEstimator(BaseEstimator): data=X_train[n:], label=y_train[n:], cat_features=cat_features ), **kwargs, - ) # model.get_best_iteration() - shutil.rmtree(train_dir, ignore_errors=True) - if weight is not None: - kwargs["sample_weight"] = weight - self._model = model - else: - self.params["n_estimators"] = self._model.tree_count_ - # except CatBoostError: - # self._model = None + ) + 
shutil.rmtree(train_dir, ignore_errors=True) + if weight is not None: + kwargs["sample_weight"] = weight + self._model = model + self.params[self.ITER_HP] = self._model.tree_count_ train_time = time.time() - start_time return train_time + @classmethod + def _callbacks(cls, start_time, deadline): + class ResourceLimit: + def after_iteration(self, info) -> bool: + now = time.time() + if info.iteration == 1: + self._time_per_iter = now - start_time + if now + self._time_per_iter > deadline: + return False + if psutil is not None: + mem = psutil.virtual_memory() + if mem.available / mem.total < FREE_MEM_RATIO: + return False + return True # can continue + + return [ResourceLimit()] + class KNeighborsEstimator(BaseEstimator): @classmethod @@ -919,7 +1058,8 @@ class Prophet(SKLearnEstimator): model = Prophet(**self.params) for regressor in cols: model.add_regressor(regressor) - model.fit(train_df) + with suppress_stdout_stderr(): + model.fit(train_df) train_time = time.time() - current_time self._model = model return train_time @@ -984,15 +1124,21 @@ class ARIMA(Prophet): regressors = cols if regressors: model = ARIMA_estimator( - train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=( - self.params["p"], self.params["d"], self.params["q"]), - enforce_stationarity=False, enforce_invertibility=False) + train_df[[TS_VALUE_COL]], + exog=train_df[regressors], + order=(self.params["p"], self.params["d"], self.params["q"]), + enforce_stationarity=False, + enforce_invertibility=False, + ) else: model = ARIMA_estimator( - train_df, order=( - self.params["p"], self.params["d"], self.params["q"]), - enforce_stationarity=False, enforce_invertibility=False) - model = model.fit() + train_df, + order=(self.params["p"], self.params["d"], self.params["q"]), + enforce_stationarity=False, + enforce_invertibility=False, + ) + with suppress_stdout_stderr(): + model = model.fit() train_time = time.time() - current_time self._model = model return train_time @@ -1010,7 +1156,9 @@ class ARIMA(Prophet): regressors = list(X_test) regressors.remove(TS_TIMESTAMP_COL) X_test = self._preprocess(X_test) - forecast = self._model.predict(start=start, end=end, exog=X_test[regressors]) + forecast = self._model.predict( + start=start, end=end, exog=X_test[regressors] + ) else: forecast = self._model.predict(start=start, end=end) else: @@ -1077,25 +1225,64 @@ class SARIMAX(ARIMA): regressors.remove(TS_VALUE_COL) if regressors: model = SARIMAX_estimator( - train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=( - self.params["p"], self.params["d"], self.params["q"]), + train_df[[TS_VALUE_COL]], + exog=train_df[regressors], + order=(self.params["p"], self.params["d"], self.params["q"]), seasonality_order=( self.params["P"], self.params["D"], self.params["Q"], - self.params["s"]), - enforce_stationarity=False, enforce_invertibility=False) + self.params["s"], + ), + enforce_stationarity=False, + enforce_invertibility=False, + ) else: model = SARIMAX_estimator( - train_df, order=( - self.params["p"], self.params["d"], self.params["q"]), + train_df, + order=(self.params["p"], self.params["d"], self.params["q"]), seasonality_order=( self.params["P"], self.params["D"], self.params["Q"], - self.params["s"]), - enforce_stationarity=False, enforce_invertibility=False) - model = model.fit() + self.params["s"], + ), + enforce_stationarity=False, + enforce_invertibility=False, + ) + with suppress_stdout_stderr(): + model = model.fit() train_time = time.time() - current_time self._model = model return train_time + + +class 
suppress_stdout_stderr(object): + """ + A context manager for doing a "deep suppression" of stdout and stderr in + Python, i.e. will suppress all print, even if the print originates in a + compiled C/Fortran sub-function. + This will not suppress raised exceptions, since exceptions are printed + to stderr just before a script exits, and after the context manager has + exited. + + """ + + def __init__(self): + # Open a pair of null files + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + # Save the actual stdout (1) and stderr (2) file descriptors. + self.save_fds = (os.dup(1), os.dup(2)) + + def __enter__(self): + # Assign the null pointers to stdout and stderr. + os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) + + def __exit__(self, *_): + # Re-assign the real stdout/stderr back to (1) and (2) + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) + # Close the null files + os.close(self.null_fds[0]) + os.close(self.null_fds[1]) diff --git a/notebook/flaml_automl.ipynb b/notebook/flaml_automl.ipynb index 51a630679e..affadc81a4 100644 --- a/notebook/flaml_automl.ipynb +++ b/notebook/flaml_automl.ipynb @@ -2,6 +2,11 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved. \n", "\n", @@ -25,50 +30,47 @@ "```bash\n", "pip install flaml[notebook]\n", "```" - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "!pip install flaml[notebook];\r\n", - "# from v0.6.6, catboost is made an optional dependency to build conda package.\r\n", - "# to install catboost, you can uncomment and run:\r\n", - "# !pip install flaml[catboost]" - ], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "!pip install flaml[notebook];\n", + "# from v0.6.6, catboost is made an optional dependency to build conda package.\n", + "# to install catboost without installing the notebook option, you can run:\n", + "# !pip install flaml[catboost]" + ] }, { "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "## 2. Classification Example\n", "### Load data and preprocess\n", "\n", "Download [Airlines dataset](https://www.openml.org/d/1169) from OpenML. The task is to predict whether a given flight will be delayed, given the information of the scheduled departure." - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": 1, - "source": [ - "from flaml.data import load_openml_dataset\r\n", - "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" - ], + "metadata": { + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "load dataset from ./openml_ds1169.pkl\n", "Dataset name: airlines\n", @@ -77,235 +79,61 @@ ] } ], - "metadata": { - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - } + "source": [ + "from flaml.data import load_openml_dataset\n", + "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" + ] }, { "cell_type": "markdown", - "source": [ - "### Run FLAML\r\n", - "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. 
All these arguments have default values which will be used if users do not provide them. For example, the default classifiers are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. " - ], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "source": [ + "### Run FLAML\n", + "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default classifiers are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. " + ] }, { "cell_type": "code", "execution_count": 2, - "source": [ - "''' import AutoML class from flaml package '''\r\n", - "from flaml import AutoML\r\n", - "automl = AutoML()" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "outputs": [], + "source": [ + "''' import AutoML class from flaml package '''\n", + "from flaml import AutoML\n", + "automl = AutoML()" + ] }, { "cell_type": "code", "execution_count": 3, - "source": [ - "settings = {\r\n", - " \"time_budget\": 240, # total running time in seconds\r\n", - " \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\r\n", - " # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\r\n", - " \"task\": 'classification', # task type\r\n", - " \"log_file_name\": 'airlines_experiment.log', # flaml log file\r\n", - " \"seed\": 7654321, # random seed\r\n", - "}" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "outputs": [], + "source": [ + "settings = {\n", + " \"time_budget\": 240, # total running time in seconds\n", + " \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\n", + " # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\n", + " \"task\": 'classification', # task type\n", + " \"log_file_name\": 'airlines_experiment.log', # flaml log file\n", + " \"seed\": 7654321, # random seed\n", + "}" + ] }, { "cell_type": "code", "execution_count": 4, - "source": [ - "'''The main flaml automl API'''\r\n", - "automl.fit(X_train=X_train, y_train=y_train, **settings)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[flaml.automl: 10-08 15:12:49] {1458} INFO - Data split method: stratified\n", - "[flaml.automl: 10-08 15:12:49] {1462} INFO - Evaluation method: holdout\n", - "[flaml.automl: 10-08 15:12:49] {1510} INFO - Minimizing error metric: 1-accuracy\n", - "[flaml.automl: 10-08 15:12:49] {1547} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']\n", - "[flaml.automl: 10-08 15:12:49] {1777} INFO - iteration 0, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:50] {1894} INFO - Estimated sufficient time budget=318171s. 
Estimated necessary time budget=5298s.\n", - "[flaml.automl: 10-08 15:12:50] {1966} INFO - at 1.8s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:12:50] {1777} INFO - iteration 1, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:51] {1966} INFO - at 2.4s,\testimator lgbm's best error=0.3759,\tbest estimator lgbm's best error=0.3759\n", - "[flaml.automl: 10-08 15:12:51] {1777} INFO - iteration 2, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:53] {1966} INFO - at 4.7s,\testimator lgbm's best error=0.3759,\tbest estimator lgbm's best error=0.3759\n", - "[flaml.automl: 10-08 15:12:53] {1777} INFO - iteration 3, current learner xgboost\n", - "[flaml.automl: 10-08 15:12:53] {1966} INFO - at 5.0s,\testimator xgboost's best error=0.3787,\tbest estimator lgbm's best error=0.3759\n", - "[flaml.automl: 10-08 15:12:53] {1777} INFO - iteration 4, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:54] {1966} INFO - at 5.1s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", - "[flaml.automl: 10-08 15:12:54] {1777} INFO - iteration 5, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:54] {1966} INFO - at 5.3s,\testimator lgbm's best error=0.3588,\tbest estimator lgbm's best error=0.3588\n", - "[flaml.automl: 10-08 15:12:54] {1777} INFO - iteration 6, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:54] {1966} INFO - at 5.4s,\testimator lgbm's best error=0.3588,\tbest estimator lgbm's best error=0.3588\n", - "[flaml.automl: 10-08 15:12:54] {1777} INFO - iteration 7, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:54] {1966} INFO - at 5.9s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:54] {1777} INFO - iteration 8, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:55] {1966} INFO - at 6.2s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:55] {1777} INFO - iteration 9, current learner xgboost\n", - "[flaml.automl: 10-08 15:12:55] {1966} INFO - at 6.3s,\testimator xgboost's best error=0.3649,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:55] {1777} INFO - iteration 10, current learner xgboost\n", - "[flaml.automl: 10-08 15:12:55] {1966} INFO - at 6.4s,\testimator xgboost's best error=0.3649,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:55] {1777} INFO - iteration 11, current learner xgboost\n", - "[flaml.automl: 10-08 15:12:55] {1966} INFO - at 6.6s,\testimator xgboost's best error=0.3649,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:55] {1777} INFO - iteration 12, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:56] {1966} INFO - at 7.7s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:56] {1777} INFO - iteration 13, current learner xgboost\n", - "[flaml.automl: 10-08 15:12:56] {1966} INFO - at 7.8s,\testimator xgboost's best error=0.3629,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:56] {1777} INFO - iteration 14, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:58] {1966} INFO - at 9.2s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:58] {1777} INFO - iteration 15, current learner extra_tree\n", - "[flaml.automl: 10-08 15:12:58] {1966} INFO - at 9.4s,\testimator extra_tree's best 
error=0.3773,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:58] {1777} INFO - iteration 16, current learner extra_tree\n", - "[flaml.automl: 10-08 15:12:58] {1966} INFO - at 9.5s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:58] {1777} INFO - iteration 17, current learner rf\n", - "[flaml.automl: 10-08 15:12:58] {1966} INFO - at 9.7s,\testimator rf's best error=0.3765,\tbest estimator lgbm's best error=0.3555\n", - "[flaml.automl: 10-08 15:12:58] {1777} INFO - iteration 18, current learner lgbm\n", - "[flaml.automl: 10-08 15:12:59] {1966} INFO - at 10.7s,\testimator lgbm's best error=0.3542,\tbest estimator lgbm's best error=0.3542\n", - "[flaml.automl: 10-08 15:12:59] {1777} INFO - iteration 19, current learner rf\n", - "[flaml.automl: 10-08 15:12:59] {1966} INFO - at 10.9s,\testimator rf's best error=0.3724,\tbest estimator lgbm's best error=0.3542\n", - "[flaml.automl: 10-08 15:12:59] {1777} INFO - iteration 20, current learner rf\n", - "[flaml.automl: 10-08 15:13:00] {1966} INFO - at 11.1s,\testimator rf's best error=0.3724,\tbest estimator lgbm's best error=0.3542\n", - "[flaml.automl: 10-08 15:13:00] {1777} INFO - iteration 21, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:00] {1966} INFO - at 11.2s,\testimator xgboost's best error=0.3629,\tbest estimator lgbm's best error=0.3542\n", - "[flaml.automl: 10-08 15:13:00] {1777} INFO - iteration 22, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:02] {1966} INFO - at 13.2s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:02] {1777} INFO - iteration 23, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:03] {1966} INFO - at 14.1s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:03] {1777} INFO - iteration 24, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:03] {1966} INFO - at 14.2s,\testimator xgboost's best error=0.3612,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:03] {1777} INFO - iteration 25, current learner extra_tree\n", - "[flaml.automl: 10-08 15:13:03] {1966} INFO - at 14.4s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:03] {1777} INFO - iteration 26, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:07] {1966} INFO - at 19.0s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:07] {1777} INFO - iteration 27, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:08] {1966} INFO - at 19.1s,\testimator xgboost's best error=0.3612,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:08] {1777} INFO - iteration 28, current learner extra_tree\n", - "[flaml.automl: 10-08 15:13:08] {1966} INFO - at 19.3s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:08] {1777} INFO - iteration 29, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:08] {1966} INFO - at 19.5s,\testimator xgboost's best error=0.3612,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:08] {1777} INFO - iteration 30, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:09] {1966} INFO - at 20.9s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:09] {1777} 
INFO - iteration 31, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:11] {1966} INFO - at 22.5s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", - "[flaml.automl: 10-08 15:13:11] {1777} INFO - iteration 32, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:23] {1966} INFO - at 34.3s,\testimator lgbm's best error=0.3404,\tbest estimator lgbm's best error=0.3404\n", - "[flaml.automl: 10-08 15:13:23] {1777} INFO - iteration 33, current learner extra_tree\n", - "[flaml.automl: 10-08 15:13:23] {1966} INFO - at 34.5s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3404\n", - "[flaml.automl: 10-08 15:13:23] {1777} INFO - iteration 34, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:44] {1966} INFO - at 55.1s,\testimator lgbm's best error=0.3343,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:44] {1777} INFO - iteration 35, current learner rf\n", - "[flaml.automl: 10-08 15:13:44] {1966} INFO - at 55.2s,\testimator rf's best error=0.3724,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:44] {1777} INFO - iteration 36, current learner extra_tree\n", - "[flaml.automl: 10-08 15:13:44] {1966} INFO - at 55.4s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:44] {1777} INFO - iteration 37, current learner rf\n", - "[flaml.automl: 10-08 15:13:44] {1966} INFO - at 55.5s,\testimator rf's best error=0.3724,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:44] {1777} INFO - iteration 38, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:44] {1966} INFO - at 55.7s,\testimator xgboost's best error=0.3612,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:44] {1777} INFO - iteration 39, current learner rf\n", - "[flaml.automl: 10-08 15:13:44] {1966} INFO - at 56.0s,\testimator rf's best error=0.3719,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:44] {1777} INFO - iteration 40, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:45] {1966} INFO - at 56.3s,\testimator xgboost's best error=0.3600,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:45] {1777} INFO - iteration 41, current learner extra_tree\n", - "[flaml.automl: 10-08 15:13:45] {1966} INFO - at 56.5s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:45] {1777} INFO - iteration 42, current learner lgbm\n", - "[flaml.automl: 10-08 15:13:56] {1966} INFO - at 67.1s,\testimator lgbm's best error=0.3343,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:56] {1777} INFO - iteration 43, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:56] {1966} INFO - at 67.4s,\testimator xgboost's best error=0.3558,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:56] {1777} INFO - iteration 44, current learner extra_tree\n", - "[flaml.automl: 10-08 15:13:56] {1966} INFO - at 67.5s,\testimator extra_tree's best error=0.3757,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:56] {1777} INFO - iteration 45, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:56] {1966} INFO - at 67.8s,\testimator xgboost's best error=0.3558,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:56] {1777} INFO - iteration 46, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:57] {1966} 
INFO - at 68.3s,\testimator xgboost's best error=0.3558,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:57] {1777} INFO - iteration 47, current learner xgboost\n", - "[flaml.automl: 10-08 15:13:57] {1966} INFO - at 68.6s,\testimator xgboost's best error=0.3558,\tbest estimator lgbm's best error=0.3343\n", - "[flaml.automl: 10-08 15:13:57] {1777} INFO - iteration 48, current learner lgbm\n", - "[flaml.automl: 10-08 15:14:08] {1966} INFO - at 79.8s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:14:08] {1777} INFO - iteration 49, current learner xgboost\n", - "[flaml.automl: 10-08 15:14:11] {1966} INFO - at 82.2s,\testimator xgboost's best error=0.3544,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:14:11] {1777} INFO - iteration 50, current learner extra_tree\n", - "[flaml.automl: 10-08 15:14:11] {1966} INFO - at 82.4s,\testimator extra_tree's best error=0.3753,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:14:11] {1777} INFO - iteration 51, current learner lgbm\n", - "[flaml.automl: 10-08 15:14:34] {1966} INFO - at 105.2s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:14:34] {1777} INFO - iteration 52, current learner lgbm\n", - "[flaml.automl: 10-08 15:14:43] {1966} INFO - at 114.1s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:14:43] {1777} INFO - iteration 53, current learner lgbm\n", - "[flaml.automl: 10-08 15:15:04] {1966} INFO - at 135.7s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:15:04] {1777} INFO - iteration 54, current learner lgbm\n", - "[flaml.automl: 10-08 15:15:09] {1966} INFO - at 140.0s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:15:09] {1777} INFO - iteration 55, current learner xgboost\n", - "[flaml.automl: 10-08 15:15:12] {1966} INFO - at 143.5s,\testimator xgboost's best error=0.3494,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:15:12] {1777} INFO - iteration 56, current learner lgbm\n", - "[flaml.automl: 10-08 15:16:03] {1966} INFO - at 194.7s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:03] {1777} INFO - iteration 57, current learner rf\n", - "[flaml.automl: 10-08 15:16:03] {1966} INFO - at 194.9s,\testimator rf's best error=0.3717,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:03] {1777} INFO - iteration 58, current learner xgboost\n", - "[flaml.automl: 10-08 15:16:05] {1966} INFO - at 196.9s,\testimator xgboost's best error=0.3494,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:05] {1777} INFO - iteration 59, current learner lgbm\n", - "[flaml.automl: 10-08 15:16:45] {1966} INFO - at 236.4s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:45] {1777} INFO - iteration 60, current learner xgboost\n", - "[flaml.automl: 10-08 15:16:47] {1966} INFO - at 238.8s,\testimator xgboost's best error=0.3494,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:47] {1777} INFO - iteration 61, current learner rf\n", - "[flaml.automl: 10-08 15:16:47] {1966} INFO - at 238.8s,\testimator rf's best error=0.3717,\tbest estimator lgbm's best error=0.3296\n", - 
"[flaml.automl: 10-08 15:16:47] {1777} INFO - iteration 62, current learner rf\n", - "[flaml.automl: 10-08 15:16:47] {1966} INFO - at 238.9s,\testimator rf's best error=0.3717,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:47] {1777} INFO - iteration 63, current learner lrl1\n", - "/home/dmx/miniconda2/envs/test/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:328: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", - " warnings.warn(\"The max_iter was reached which means \"\n", - "[flaml.automl: 10-08 15:16:48] {1966} INFO - at 239.2s,\testimator lrl1's best error=0.4339,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:48] {1777} INFO - iteration 64, current learner lrl1\n", - "/home/dmx/miniconda2/envs/test/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:328: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", - " warnings.warn(\"The max_iter was reached which means \"\n", - "[flaml.automl: 10-08 15:16:48] {1966} INFO - at 239.5s,\testimator lrl1's best error=0.4339,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:48] {1777} INFO - iteration 65, current learner rf\n", - "[flaml.automl: 10-08 15:16:48] {1966} INFO - at 239.6s,\testimator rf's best error=0.3717,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:48] {1777} INFO - iteration 66, current learner rf\n", - "[flaml.automl: 10-08 15:16:48] {1966} INFO - at 239.6s,\testimator rf's best error=0.3717,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:48] {1777} INFO - iteration 67, current learner extra_tree\n", - "[flaml.automl: 10-08 15:16:48] {1966} INFO - at 239.7s,\testimator extra_tree's best error=0.3753,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:48] {1777} INFO - iteration 68, current learner rf\n", - "[flaml.automl: 10-08 15:16:48] {1966} INFO - at 239.8s,\testimator rf's best error=0.3717,\tbest estimator lgbm's best error=0.3296\n", - "[flaml.automl: 10-08 15:16:48] {2073} INFO - selected model: LGBMClassifier(colsample_bytree=0.7263265270618353,\n", - " learning_rate=0.19240592731562936, max_bin=511,\n", - " min_child_samples=101, n_estimators=334, num_leaves=50,\n", - " reg_alpha=0.042474252908075376, reg_lambda=0.44574701224719,\n", - " verbose=-1)\n", - "[flaml.automl: 10-08 15:16:59] {2136} INFO - retrain lgbm for 10.5s\n", - "[flaml.automl: 10-08 15:16:59] {2142} INFO - retrained model: LGBMClassifier(colsample_bytree=0.7263265270618353,\n", - " learning_rate=0.19240592731562936, max_bin=511,\n", - " min_child_samples=101, n_estimators=334, num_leaves=50,\n", - " reg_alpha=0.042474252908075376, reg_lambda=0.44574701224719,\n", - " verbose=-1)\n", - "[flaml.automl: 10-08 15:16:59] {1571} INFO - fit succeeded\n", - "[flaml.automl: 10-08 15:16:59] {1572} INFO - Time taken to find the best model: 79.82886719703674\n" - ] - } - ], "metadata": { "slideshow": { "slide_type": "slide" @@ -313,106 +141,263 @@ "tags": [ "outputPrepend" ] - } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[flaml.automl: 11-02 19:22:53] {1483} INFO - Data split method: stratified\n", + "[flaml.automl: 11-02 19:22:53] {1487} INFO - Evaluation method: holdout\n", + "[flaml.automl: 11-02 19:22:53] {1537} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 11-02 19:22:54] {1574} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 
'xgboost', 'extra_tree', 'lrl1']\n", + "[flaml.automl: 11-02 19:22:54] {1816} INFO - iteration 0, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:58] {1933} INFO - Estimated sufficient time budget=1538089s. Estimated necessary time budget=27916s.\n", + "[flaml.automl: 11-02 19:22:58] {2013} INFO - at 5.6s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", + "[flaml.automl: 11-02 19:22:58] {1816} INFO - iteration 1, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:58] {2013} INFO - at 5.7s,\testimator lgbm's best error=0.3759,\tbest estimator lgbm's best error=0.3759\n", + "[flaml.automl: 11-02 19:22:58] {1816} INFO - iteration 2, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:58] {2013} INFO - at 5.8s,\testimator lgbm's best error=0.3759,\tbest estimator lgbm's best error=0.3759\n", + "[flaml.automl: 11-02 19:22:58] {1816} INFO - iteration 3, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:58] {2013} INFO - at 5.8s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:22:58] {1816} INFO - iteration 4, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:58] {2013} INFO - at 6.0s,\testimator lgbm's best error=0.3588,\tbest estimator lgbm's best error=0.3588\n", + "[flaml.automl: 11-02 19:22:58] {1816} INFO - iteration 5, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:58] {2013} INFO - at 6.1s,\testimator lgbm's best error=0.3588,\tbest estimator lgbm's best error=0.3588\n", + "[flaml.automl: 11-02 19:22:58] {1816} INFO - iteration 6, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:59] {2013} INFO - at 6.5s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", + "[flaml.automl: 11-02 19:22:59] {1816} INFO - iteration 7, current learner lgbm\n", + "[flaml.automl: 11-02 19:22:59] {2013} INFO - at 6.7s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", + "[flaml.automl: 11-02 19:22:59] {1816} INFO - iteration 8, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:00] {2013} INFO - at 7.8s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", + "[flaml.automl: 11-02 19:23:00] {1816} INFO - iteration 9, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:02] {2013} INFO - at 9.2s,\testimator lgbm's best error=0.3555,\tbest estimator lgbm's best error=0.3555\n", + "[flaml.automl: 11-02 19:23:02] {1816} INFO - iteration 10, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:02] {2013} INFO - at 10.0s,\testimator lgbm's best error=0.3542,\tbest estimator lgbm's best error=0.3542\n", + "[flaml.automl: 11-02 19:23:02] {1816} INFO - iteration 11, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:04] {2013} INFO - at 12.2s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:04] {1816} INFO - iteration 12, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:05] {2013} INFO - at 13.1s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:05] {1816} INFO - iteration 13, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:10] {2013} INFO - at 18.1s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:10] {1816} INFO - iteration 14, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:12] {2013} INFO - at 19.3s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best 
error=0.3507\n", + "[flaml.automl: 11-02 19:23:12] {1816} INFO - iteration 15, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:12] {2013} INFO - at 19.7s,\testimator xgboost's best error=0.3787,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:12] {1816} INFO - iteration 16, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:13] {2013} INFO - at 20.8s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:13] {1816} INFO - iteration 17, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:13] {2013} INFO - at 20.9s,\testimator xgboost's best error=0.3649,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:13] {1816} INFO - iteration 18, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:13] {2013} INFO - at 21.0s,\testimator xgboost's best error=0.3649,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:13] {1816} INFO - iteration 19, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:13] {2013} INFO - at 21.1s,\testimator xgboost's best error=0.3649,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:13] {1816} INFO - iteration 20, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:13] {2013} INFO - at 21.2s,\testimator xgboost's best error=0.3629,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:13] {1816} INFO - iteration 21, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:18] {2013} INFO - at 25.9s,\testimator lgbm's best error=0.3507,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:18] {1816} INFO - iteration 22, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:18] {2013} INFO - at 26.0s,\testimator xgboost's best error=0.3629,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:18] {1816} INFO - iteration 23, current learner xgboost\n", + "[flaml.automl: 11-02 19:23:18] {2013} INFO - at 26.1s,\testimator xgboost's best error=0.3629,\tbest estimator lgbm's best error=0.3507\n", + "[flaml.automl: 11-02 19:23:18] {1816} INFO - iteration 24, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:29] {2013} INFO - at 36.7s,\testimator lgbm's best error=0.3404,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:29] {1816} INFO - iteration 25, current learner extra_tree\n", + "[flaml.automl: 11-02 19:23:29] {2013} INFO - at 36.8s,\testimator extra_tree's best error=0.3788,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:29] {1816} INFO - iteration 26, current learner extra_tree\n", + "[flaml.automl: 11-02 19:23:29] {2013} INFO - at 36.9s,\testimator extra_tree's best error=0.3788,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:29] {1816} INFO - iteration 27, current learner extra_tree\n", + "[flaml.automl: 11-02 19:23:29] {2013} INFO - at 37.0s,\testimator extra_tree's best error=0.3774,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:29] {1816} INFO - iteration 28, current learner rf\n", + "[flaml.automl: 11-02 19:23:29] {2013} INFO - at 37.2s,\testimator rf's best error=0.3765,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:29] {1816} INFO - iteration 29, current learner rf\n", + "[flaml.automl: 11-02 19:23:30] {2013} INFO - at 37.4s,\testimator rf's best error=0.3721,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:30] {1816} INFO - iteration 30, 
current learner rf\n", + "[flaml.automl: 11-02 19:23:30] {2013} INFO - at 37.6s,\testimator rf's best error=0.3721,\tbest estimator lgbm's best error=0.3404\n", + "[flaml.automl: 11-02 19:23:30] {1816} INFO - iteration 31, current learner lgbm\n", + "[flaml.automl: 11-02 19:23:51] {2013} INFO - at 59.1s,\testimator lgbm's best error=0.3343,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:23:51] {1816} INFO - iteration 32, current learner rf\n", + "[flaml.automl: 11-02 19:23:52] {2013} INFO - at 59.3s,\testimator rf's best error=0.3721,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:23:52] {1816} INFO - iteration 33, current learner extra_tree\n", + "[flaml.automl: 11-02 19:23:52] {2013} INFO - at 59.4s,\testimator extra_tree's best error=0.3774,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:23:52] {1816} INFO - iteration 34, current learner lgbm\n", + "[flaml.automl: 11-02 19:24:03] {2013} INFO - at 70.8s,\testimator lgbm's best error=0.3343,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:03] {1816} INFO - iteration 35, current learner rf\n", + "[flaml.automl: 11-02 19:24:03] {2013} INFO - at 70.9s,\testimator rf's best error=0.3721,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:03] {1816} INFO - iteration 36, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:03] {2013} INFO - at 71.2s,\testimator extra_tree's best error=0.3768,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:03] {1816} INFO - iteration 37, current learner rf\n", + "[flaml.automl: 11-02 19:24:04] {2013} INFO - at 71.3s,\testimator rf's best error=0.3687,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:04] {1816} INFO - iteration 38, current learner xgboost\n", + "[flaml.automl: 11-02 19:24:04] {2013} INFO - at 71.5s,\testimator xgboost's best error=0.3629,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:04] {1816} INFO - iteration 39, current learner rf\n", + "[flaml.automl: 11-02 19:24:04] {2013} INFO - at 71.7s,\testimator rf's best error=0.3687,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:04] {1816} INFO - iteration 40, current learner xgboost\n", + "[flaml.automl: 11-02 19:24:04] {2013} INFO - at 72.0s,\testimator xgboost's best error=0.3605,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:04] {1816} INFO - iteration 41, current learner xgboost\n", + "[flaml.automl: 11-02 19:24:05] {2013} INFO - at 72.2s,\testimator xgboost's best error=0.3586,\tbest estimator lgbm's best error=0.3343\n", + "[flaml.automl: 11-02 19:24:05] {1816} INFO - iteration 42, current learner lgbm\n", + "[flaml.automl: 11-02 19:24:16] {2013} INFO - at 84.2s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:16] {1816} INFO - iteration 43, current learner xgboost\n", + "[flaml.automl: 11-02 19:24:17] {2013} INFO - at 84.5s,\testimator xgboost's best error=0.3586,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:17] {1816} INFO - iteration 44, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:17] {2013} INFO - at 84.6s,\testimator extra_tree's best error=0.3751,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:17] {1816} INFO - iteration 45, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:17] {2013} INFO - at 84.8s,\testimator 
extra_tree's best error=0.3751,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:17] {1816} INFO - iteration 46, current learner rf\n", + "[flaml.automl: 11-02 19:24:17] {2013} INFO - at 84.9s,\testimator rf's best error=0.3687,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:17] {1816} INFO - iteration 47, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:17] {2013} INFO - at 85.1s,\testimator extra_tree's best error=0.3751,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:17] {1816} INFO - iteration 48, current learner lgbm\n", + "[flaml.automl: 11-02 19:24:40] {2013} INFO - at 107.3s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:40] {1816} INFO - iteration 49, current learner xgboost\n", + "[flaml.automl: 11-02 19:24:40] {2013} INFO - at 107.5s,\testimator xgboost's best error=0.3585,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:40] {1816} INFO - iteration 50, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:40] {2013} INFO - at 107.8s,\testimator extra_tree's best error=0.3635,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:40] {1816} INFO - iteration 51, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:40] {2013} INFO - at 107.9s,\testimator extra_tree's best error=0.3635,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:40] {1816} INFO - iteration 52, current learner extra_tree\n", + "[flaml.automl: 11-02 19:24:40] {2013} INFO - at 108.1s,\testimator extra_tree's best error=0.3635,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:40] {1816} INFO - iteration 53, current learner lgbm\n", + "[flaml.automl: 11-02 19:24:48] {2013} INFO - at 115.6s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:24:48] {1816} INFO - iteration 54, current learner lgbm\n", + "[flaml.automl: 11-02 19:25:08] {2013} INFO - at 136.2s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:25:08] {1816} INFO - iteration 55, current learner extra_tree\n", + "[flaml.automl: 11-02 19:25:09] {2013} INFO - at 136.4s,\testimator extra_tree's best error=0.3635,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:25:09] {1816} INFO - iteration 56, current learner rf\n", + "[flaml.automl: 11-02 19:25:09] {2013} INFO - at 136.5s,\testimator rf's best error=0.3687,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:25:09] {1816} INFO - iteration 57, current learner xgboost\n", + "[flaml.automl: 11-02 19:25:09] {2013} INFO - at 136.9s,\testimator xgboost's best error=0.3585,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:25:09] {1816} INFO - iteration 58, current learner extra_tree\n", + "[flaml.automl: 11-02 19:25:09] {2013} INFO - at 137.2s,\testimator extra_tree's best error=0.3615,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:25:09] {1816} INFO - iteration 59, current learner lgbm\n", + "[flaml.automl: 11-02 19:25:13] {2013} INFO - at 140.8s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n", + "[flaml.automl: 11-02 19:25:13] {1816} INFO - iteration 60, current learner xgboost\n", + "[flaml.automl: 11-02 19:25:13] {2013} INFO - at 141.1s,\testimator xgboost's best error=0.3585,\tbest estimator lgbm's 
best error=0.3296\n",
+ "[flaml.automl: 11-02 19:25:13] {1816} INFO - iteration 61, current learner lgbm\n",
+ "[flaml.automl: 11-02 19:26:04] {2013} INFO - at 192.1s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n",
+ "[flaml.automl: 11-02 19:26:04] {1816} INFO - iteration 62, current learner extra_tree\n",
+ "[flaml.automl: 11-02 19:26:05] {2013} INFO - at 192.3s,\testimator extra_tree's best error=0.3615,\tbest estimator lgbm's best error=0.3296\n",
+ "[flaml.automl: 11-02 19:26:05] {1816} INFO - iteration 63, current learner extra_tree\n",
+ "[flaml.automl: 11-02 19:26:05] {2013} INFO - at 192.5s,\testimator extra_tree's best error=0.3615,\tbest estimator lgbm's best error=0.3296\n",
+ "[flaml.automl: 11-02 19:26:05] {1816} INFO - iteration 64, current learner lgbm\n",
+ "[flaml.automl: 11-02 19:27:03] {2013} INFO - at 250.7s,\testimator lgbm's best error=0.3296,\tbest estimator lgbm's best error=0.3296\n",
+ "[flaml.automl: 11-02 19:27:14] {2230} INFO - retrain lgbm for 11.0s\n",
+ "[flaml.automl: 11-02 19:27:14] {2237} INFO - retrained model: LGBMClassifier(colsample_bytree=0.7263265270618353,\n",
+ "               learning_rate=0.19240592731562967, max_bin=511,\n",
+ "               min_child_samples=101, n_estimators=334, num_leaves=50,\n",
+ "               reg_alpha=0.042474252908075376, reg_lambda=0.44574701224719,\n",
+ "               verbose=-1)\n",
+ "[flaml.automl: 11-02 19:27:14] {1598} INFO - fit succeeded\n",
+ "[flaml.automl: 11-02 19:27:14] {1599} INFO - Time taken to find the best model: 84.20605731010437\n"
+ ]
+ }
+ ],
+ "source": [
+ "'''The main flaml automl API'''\n",
+ "automl.fit(X_train=X_train, y_train=y_train, **settings)"
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "### Best model and metric"
- ],
"metadata": {
"slideshow": {
"slide_type": "slide"
}
- }
+ },
+ "source": [
+ "### Best model and metric"
+ ]
},
{
"cell_type": "code",
"execution_count": 5,
- "source": [
- "''' retrieve best config and best learner'''\r\n",
- "print('Best ML leaner:', automl.best_estimator)\r\n",
- "print('Best hyperparmeter config:', automl.best_config)\r\n",
- "print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\r\n",
- "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"
- ],
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Best ML leaner: lgbm\n",
- "Best hyperparmeter config: {'n_estimators': 334, 'num_leaves': 50, 'min_child_samples': 101, 'learning_rate': 0.19240592731562936, 'log_max_bin': 9, 'colsample_bytree': 0.7263265270618353, 'reg_alpha': 0.042474252908075376, 'reg_lambda': 0.44574701224719, 'FLAML_sample_size': 364083}\n",
- "Best accuracy on validation data: 0.6704\n",
- "Training duration of best run: 11.24 s\n"
- ]
- }
- ],
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
- }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best ML learner: lgbm\n",
+ "Best hyperparameter config: {'n_estimators': 334, 'num_leaves': 50, 'min_child_samples': 101, 'learning_rate': 0.19240592731562967, 'log_max_bin': 9, 'colsample_bytree': 0.7263265270618353, 'reg_alpha': 0.042474252908075376, 'reg_lambda': 0.44574701224719, 'FLAML_sample_size': 364083}\n",
+ "Best accuracy on validation data: 0.6704\n",
+ "Training duration of best run: 10.96 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "'''retrieve best config and best learner'''\n",
+ "print('Best ML learner:', automl.best_estimator)\n",
+ "print('Best hyperparameter config:', automl.best_config)\n",
+ "print('Best 
accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\n", + "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))" + ] }, { "cell_type": "code", "execution_count": 6, - "source": [ - "automl.model.estimator" - ], + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "LGBMClassifier(colsample_bytree=0.7263265270618353,\n", - " learning_rate=0.19240592731562936, max_bin=511,\n", + " learning_rate=0.19240592731562967, max_bin=511,\n", " min_child_samples=101, n_estimators=334, num_leaves=50,\n", " reg_alpha=0.042474252908075376, reg_lambda=0.44574701224719,\n", " verbose=-1)" ] }, + "execution_count": 6, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + "source": [ + "automl.model.estimator" + ] }, { "cell_type": "code", "execution_count": 7, - "source": [ - "''' pickle and save the automl object '''\r\n", - "import pickle\r\n", - "with open('automl.pkl', 'wb') as f:\r\n", - " pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "outputs": [], + "source": [ + "'''pickle and save the automl object'''\n", + "import pickle\n", + "with open('automl.pkl', 'wb') as f:\n", + " pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)\n", + "'''load pickled automl object'''\n", + "with open('automl.pkl', 'rb') as f:\n", + " automl = pickle.load(f)" + ] }, { "cell_type": "code", "execution_count": 8, - "source": [ - "''' compute predictions of testing dataset ''' \r\n", - "y_pred = automl.predict(X_test)\r\n", - "print('Predicted labels', y_pred)\r\n", - "print('True labels', y_test)\r\n", - "y_pred_proba = automl.predict_proba(X_test)[:,1]" - ], + "metadata": { + "slideshow": { + "slide_type": "slide" + }, + "tags": [] + }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Predicted labels ['1' '0' '1' ... 
'1' '0' '0']\n", "True labels 118331 0\n", @@ -431,205 +416,203 @@ ] } ], - "metadata": { - "slideshow": { - "slide_type": "slide" - }, - "tags": [] - } + "source": [ + "'''compute predictions of testing dataset''' \n", + "y_pred = automl.predict(X_test)\n", + "print('Predicted labels', y_pred)\n", + "print('True labels', y_test)\n", + "y_pred_proba = automl.predict_proba(X_test)[:,1]" + ] }, { "cell_type": "code", "execution_count": 9, - "source": [ - "''' compute different metric values on testing dataset'''\r\n", - "from flaml.ml import sklearn_metric_loss_score\r\n", - "print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\r\n", - "print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\r\n", - "print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "accuracy = 0.6713287750470908\n", - "roc_auc = 0.7249878990284184\n", - "log_loss = 0.6035815508574605\n" - ] - } - ], "metadata": { "slideshow": { "slide_type": "slide" }, "tags": [] - } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy = 0.6713287750470908\n", + "roc_auc = 0.7249878990284184\n", + "log_loss = 0.6035815508574554\n" + ] + } + ], + "source": [ + "''' compute different metric values on testing dataset'''\n", + "from flaml.ml import sklearn_metric_loss_score\n", + "print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\n", + "print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\n", + "print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))" + ] }, { "cell_type": "markdown", - "source": [ - "See Section 4 for an accuracy comparison with default LightGBM and XGBoost.\n", - "\n", - "### Log history" - ], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "source": [ + "See Section 4 for an accuracy comparison with default LightGBM and XGBoost.\n", + "\n", + "### Log history" + ] }, { "cell_type": "code", "execution_count": 10, - "source": [ - "from flaml.data import get_output_from_log\r\n", - "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\r\n", - " get_output_from_log(filename=settings['log_file_name'], time_budget=240)\r\n", - "for config in config_history:\r\n", - " print(config)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0, 'FLAML_sample_size': 10000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.22841390623808822, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242716, 'reg_lambda': 7.624911621832711, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 14, 'min_child_samples': 15, 
'learning_rate': 0.22841390623808822, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242716, 'reg_lambda': 7.624911621832711, 'FLAML_sample_size': 10000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 25, 'min_child_samples': 12, 'learning_rate': 0.5082200481556802, 'log_max_bin': 8, 'colsample_bytree': 0.9696263001275751, 'reg_alpha': 0.0028107036379524425, 'reg_lambda': 3.716898117989413, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 25, 'min_child_samples': 12, 'learning_rate': 0.5082200481556802, 'log_max_bin': 8, 'colsample_bytree': 0.9696263001275751, 'reg_alpha': 0.0028107036379524425, 'reg_lambda': 3.716898117989413, 'FLAML_sample_size': 10000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 23, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.22841390623808822, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242718, 'reg_lambda': 7.624911621832699, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 23, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.22841390623808822, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242718, 'reg_lambda': 7.624911621832699, 'FLAML_sample_size': 10000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357095, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357095, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 10000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 40000, 'Current Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357095, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 40000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357095, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 40000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 40000, 'Current Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813866, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749616, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 40000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813866, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749616, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 40000}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813866, 
'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749616, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 364083}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813866, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749616, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 364083}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 654, 'num_leaves': 27, 'min_child_samples': 61, 'learning_rate': 0.0705835177602005, 'log_max_bin': 10, 'colsample_bytree': 0.8629551479851468, 'reg_alpha': 0.016562972790870267, 'reg_lambda': 0.25883390536609663, 'FLAML_sample_size': 364083}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 654, 'num_leaves': 27, 'min_child_samples': 61, 'learning_rate': 0.0705835177602005, 'log_max_bin': 10, 'colsample_bytree': 0.8629551479851468, 'reg_alpha': 0.016562972790870267, 'reg_lambda': 0.25883390536609663, 'FLAML_sample_size': 364083}}\n", - "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 334, 'num_leaves': 50, 'min_child_samples': 101, 'learning_rate': 0.19240592731562936, 'log_max_bin': 9, 'colsample_bytree': 0.7263265270618353, 'reg_alpha': 0.042474252908075376, 'reg_lambda': 0.44574701224719, 'FLAML_sample_size': 364083}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 334, 'num_leaves': 50, 'min_child_samples': 101, 'learning_rate': 0.19240592731562936, 'log_max_bin': 9, 'colsample_bytree': 0.7263265270618353, 'reg_alpha': 0.042474252908075376, 'reg_lambda': 0.44574701224719, 'FLAML_sample_size': 364083}}\n" - ] - } - ], "metadata": { "slideshow": { "slide_type": "subslide" }, "tags": [] - } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0, 'FLAML_sample_size': 10000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.2284139062380884, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242716, 'reg_lambda': 7.624911621832711, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 4, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.2284139062380884, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242716, 'reg_lambda': 7.624911621832711, 'FLAML_sample_size': 10000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 4, 'num_leaves': 25, 'min_child_samples': 12, 'learning_rate': 0.5082200481556807, 'log_max_bin': 8, 'colsample_bytree': 0.9696263001275751, 'reg_alpha': 0.0028107036379524425, 'reg_lambda': 3.716898117989413, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 
4, 'num_leaves': 25, 'min_child_samples': 12, 'learning_rate': 0.5082200481556807, 'log_max_bin': 8, 'colsample_bytree': 0.9696263001275751, 'reg_alpha': 0.0028107036379524425, 'reg_lambda': 3.716898117989413, 'FLAML_sample_size': 10000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 23, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.2284139062380884, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242718, 'reg_lambda': 7.624911621832699, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 23, 'num_leaves': 14, 'min_child_samples': 15, 'learning_rate': 0.2284139062380884, 'log_max_bin': 9, 'colsample_bytree': 1.0, 'reg_alpha': 0.0014700173967242718, 'reg_lambda': 7.624911621832699, 'FLAML_sample_size': 10000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 10000, 'Current Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357107, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 10000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357107, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 10000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 40000, 'Current Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357107, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 40000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 101, 'num_leaves': 12, 'min_child_samples': 24, 'learning_rate': 0.07647794276357107, 'log_max_bin': 10, 'colsample_bytree': 1.0, 'reg_alpha': 0.001749539645587163, 'reg_lambda': 4.373760956394571, 'FLAML_sample_size': 40000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 40000, 'Current Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813889, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749608, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 40000}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813889, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749608, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 40000}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813889, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749608, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 364083}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 361, 'num_leaves': 11, 'min_child_samples': 32, 'learning_rate': 0.13528717598813889, 'log_max_bin': 9, 'colsample_bytree': 0.9851977789068981, 'reg_alpha': 0.0038372002422749608, 'reg_lambda': 0.25113531892556773, 'FLAML_sample_size': 364083}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 654, 
'num_leaves': 27, 'min_child_samples': 61, 'learning_rate': 0.07058351776020065, 'log_max_bin': 10, 'colsample_bytree': 0.8629551479851468, 'reg_alpha': 0.016562972790870267, 'reg_lambda': 0.25883390536609663, 'FLAML_sample_size': 364083}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 654, 'num_leaves': 27, 'min_child_samples': 61, 'learning_rate': 0.07058351776020065, 'log_max_bin': 10, 'colsample_bytree': 0.8629551479851468, 'reg_alpha': 0.016562972790870267, 'reg_lambda': 0.25883390536609663, 'FLAML_sample_size': 364083}}\n", + "{'Current Learner': 'lgbm', 'Current Sample': 364083, 'Current Hyper-parameters': {'n_estimators': 334, 'num_leaves': 50, 'min_child_samples': 101, 'learning_rate': 0.19240592731562967, 'log_max_bin': 9, 'colsample_bytree': 0.7263265270618353, 'reg_alpha': 0.042474252908075376, 'reg_lambda': 0.44574701224719, 'FLAML_sample_size': 364083}, 'Best Learner': 'lgbm', 'Best Hyper-parameters': {'n_estimators': 334, 'num_leaves': 50, 'min_child_samples': 101, 'learning_rate': 0.19240592731562967, 'log_max_bin': 9, 'colsample_bytree': 0.7263265270618353, 'reg_alpha': 0.042474252908075376, 'reg_lambda': 0.44574701224719, 'FLAML_sample_size': 364083}}\n" + ] + } + ], + "source": [ + "from flaml.data import get_output_from_log\n", + "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", + " get_output_from_log(filename=settings['log_file_name'], time_budget=240)\n", + "for config in config_history:\n", + " print(config)" + ] }, { "cell_type": "code", "execution_count": 11, - "source": [ - "import matplotlib.pyplot as plt\r\n", - "import numpy as np\r\n", - "\r\n", - "plt.title('Learning Curve')\r\n", - "plt.xlabel('Wall Clock Time (s)')\r\n", - "plt.ylabel('Validation Accuracy')\r\n", - "plt.scatter(time_history, 1 - np.array(valid_loss_history))\r\n", - "plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\r\n", - "plt.show()" - ], + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "outputs": [ { - "output_type": "display_data", "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3dfZhdVXn38e+PIcAIhgETaDIBgg9JNBpNcEQRUaDSRB4lkSKCfUFsidaXamlDSa2UYil4Re2jV1NtsEqxgkAaxoiRkQJCRSCZEMzL4GAICDNBE0JGEEeSTO7nj71P2DnZMzkJs+ecmfP7XNe55uy1197nnjkzc5+11t5rKSIwMzMrd0C1AzAzs9rkBGFmZrmcIMzMLJcThJmZ5XKCMDOzXE4QZmaWywnCbD9IOlVSZ7XjMCuSE4QNO5KekPSuasYQEf8bEVOKOr+kmZLulfS8pM2S7pF0dlGvZ5bHCcIsh6SGKr72ucAtwPXABOBo4HLgvftxLkny37ntF//i2Igh6QBJl0l6TNIWSTdLOjKz/xZJv5T06/TT+esy+66T9FVJyyS9AJyetlT+RtLq9JibJB2S1j9NUlfm+H7rpvsvlfS0pI2S/lxSSDoh53sQ8CXgcxHx9Yj4dUTsjIh7IuLitM4Vkv4rc8zE9HwHpts/knSVpPuA3wLzJLWXvc5fSVqaPj9Y0hckPSnpV5K+JqnxZb4dNgI4QdhI8klgDvBOYDywFViY2f8DYBJwFPAQ8O2y4z8IXAW8EvhxWnYeMAs4HngD8KEBXj+3rqRZwCXAu4ATgNMGOMcU4Bhg8QB1KvEnwFyS7+VrwBRJkzL7PwjckD6/BpgMTE/jayZpsVidc4KwkeSjwGcioisiXgSuAM4tfbKOiG9ExPOZfW+UdHjm+O9GxH3pJ/bfpWVfiYiNEfEs8D2Sf6L96a/uecA3I2JdRPw2fe3+vCr9+nSl33Q/rktfb0dE/Br4LnABQJooXgMsTVssc4G/iohnI+J54J+B81/m69sI4ARhI8lxwK2SeiT1AI8AfcDRkhokXZN2Pz0HPJEeMyZz/FM55/xl5vlvgcMGeP3+6o4vO3fe65RsSb+OG6BOJcpf4wbSBEHSemhNk9VY4BXAyszP7fa03OqcE4SNJE8B746IpszjkIjoJvmnOJukm+dwYGJ6jDLHFzW18dMkg80lxwxQt5Pk+/jDAeq8QPJPveT3cuqUfy93AGMlTSdJFKXupWeAXuB1mZ/Z4RExUCK0OuEEYcPVKEmHZB4HkvS1XyXpOABJYyXNTuu/EniR5BP6K0i6UYbKzcBFkl4r6RXAZ/urGMn8+5cAn5V0kaTR6eD72yUtSqs9DLxD0rFpF9n8vQUQEdtJroxaABxJkjCIiJ3AtcC/SDoKQFKzpJn7/d3aiOEEYcPVMpJPvqXHFcCXgaXADyU9DzwAvCWtfz3wC6Ab6Ej3DYmI+AHwFeBuYH3mtV/sp/5i4APAh4GNwK+AfyIZRyAi7gBuAlYDK4HbKgzlBpIW1C0RsSNT/reluNLut/8hGSy3OicvGGQ2tCS9FlgLHFz2j9qsprgFYTYEJL0vvd/gCODzwPecHKzWOUGYDY2PAJuAx0iurPqL6oZjtnfuYjIzs1xuQZiZWa4Dqx3AYBkzZkxMnDix2mGYmQ0rK1eufCYicm+MHDEJYuLEibS3t++9opmZ7SLpF/3tcxeTmZnlcoIwM7NcThBmZpbLCcLMzHI5QZiZWa4RcxWTmVm9aV3VzYK2Tjb29DK+qZF5M6cwZ0bzoJ3fCcLMbBhqXdXN/CVr6N3eB0B3Ty/zl6wBGLQkUWgXk6RZkjolrZd0WT91zpPUIWmdpBvSstMlPZx5/E7SnCJjNTMbTha0de5KDiW92/tY0NY5aK9RWAtCUgPJgvFnAl3ACklLI6IjU2cSyWInp0TE1tKCJRFxN+l6vpKOJJmr/odFxWpmNtxs7Ondp/L9UWQL4iRgfURsiIhtwHdIlnzMuhhYGBFbASJiU855zgV+kK6fa2ZmwPimxn0q3x9FJohmdl84vSsty5oMTJZ0n6QHJM3KOc/5wI15LyBprqR2Se2bN28elKDNzIaDeTOn0DiqYbeyxlENzJs5eIsBVvsy1wOBScBpJAupXyupqbRT0jhgGtCWd3BELIqIlohoGTs2d64pM7MRac6MZq4+ZxoHNST/xpubGrn6nGnD5iqmbuCYzPaEtCyrC3gwXVD9cUmPkiSMFen+84Bb0/1mZpYxZ0YzNy5/EoCbPnLyoJ+/yBbECmCSpOMlHUTSVbS0rE4rSesBSWNIupw2ZPZfQD/dS2ZmVqzCEkS63u4nSLqHHgFujoh1kq6UdHZarQ3YIqkDuBuYFxFbACRNJGmB3FNUjGZm1r9Cb5SLiGXAsrKyyzPPA7gkfZQf+wR7DmqbmdkQqfYgtZmZ1SgnCDMzy+UEYWZmuZwgzMwslxOEmZnlcoIwM7NcThBmZpbLCcLMzHI5QZiZWS4vOWpm+6Xo9ZCt+pwgzGyfDcV6yFZ9ThBmts/6Ww/50sWrd00/bUOj4+nnmDpudCHn9hiEme2z/tY93ta3c4gjsanjRjN7ejGtNrcgzGyfjW9qpDsnSTQ3NRaycI1Vh1sQZrbPhmI9ZKs+tyDMbJ+VBqIvXbyabX07afZVTCOSE4SZ7Zei10O26nMXk5mZ5XKCMDOzXE4QZmaWywnCzMxyOUGYmVkuJwgzM8vlBGFmZrmcIMzMLJcThJmZ5XKCMDOzXE4QZmaWywnCzMxyOUGYmVkuJwgzM8vlBGFmZrkKTRCSZknqlLRe0mX91DlPUoekdZJuyJQfK+mHkh5J908sMlYzM9tdYQsGSWoAFgJnAl3ACklLI6IjU2cSMB84JSK2Sjoqc4rrgasi4g5JhwFeDd3MbAgVuaLcScD6iNgAIOk7wGygI1PnYmBhRGwFiIhNad2pwIERcUda/psC47Q61bqqmwVtnWzs6WW8l8w020ORXUzNwFOZ7a60LGsyMFnSfZIekDQrU94jaYmkVZIWpC2S3UiaK6ldUvvmzZsL+SZsZGpd1c38JWvo7uklgO6eXuYvWUPrqu5qh2ZWM6q9JvWBwCTgNGACcK+kaWn5qcAM4EngJuBDwH9kD46IRcAigJaWlhiqoG34W9DWSe/2vt3Kerf3ceni1bvWWba963j6OaaOG13tMKwgRbYguoFjMtsT0rKsLmBpRGyPiMeBR0kSRhfwcERsiIgdQCtwYoGxWp3Z2NObW76tz0Nd+2LquNHMnu5uuZGqyBbECmCSpONJEsP5wAfL6rQCFwDflDSGpGtpA9ADNEkaGxGbgTOA9gJjtTozvqmR7pwk0dzUyE0fObkKEZnVnsJaEOkn/08AbcAjwM0RsU7SlZLOTqu1AVskdQB3A/MiYktE9AF/A9wpaQ0g4NqiYrX6M2/mFBpH7T6s1TiqgXkzp1QpIrPao4iR0XXf0tIS7e1uZFjlWld1c+ni1Wzr20mzr2KyOiVpZUS05O2r9iC1WdXMmdG8a0Da3Upme/JUG2
ZmlmuvCULSq4YiEDMzqy2VtCAekHSLpLMkqfCIzMysJlSSICaT3Iz2J8DPJf2zpMnFhmVmZtW21wQRiTsi4gKSuZMuBJZLukeSR/bMzEaovV7FlI5B/DFJC+JXwCeBpcB04Bbg+CIDNDOz6qjkMtf7gW8BcyKiK1PeLulrxYRlZmbVVkmCmBL93E0XEZ8f5HjMzKxGVDJI/UNJTaUNSUdIaiswJjMzqwGVJIixEdFT2kgX9zlqgPpmZjYCVJIg+iQdW9qQdBwwMiZwMjOzflUyBvEZ4MeS7iGZVfVUYG6hUZmZWdXtNUFExO2STgTemhZ9OiKeKTYsMzOrtkpnc+0DNgGHAFMlERH3FheWmZlVWyU3yv058CmSJUMfJmlJ3E+yypuZmY1QlQxSfwp4M/CLiDgdmEGyJKiZmY1glSSI30XE7wAkHRwRPwO8LqOZ2QhXyRhEV3qjXCtwh6StwC+KDcvMzKqtkquY3pc+vULS3cDhwO2FRmVmZlU3YIKQ1ACsi4jXAETEPUMSlZmZVd2AYxAR0Qd0Zu+kNjOz+lDJGMQRwDpJy4EXSoURcXZhUZmZWdVVkiA+W3gUZmZWcyoZpPa4g5lZHarkTurneWn21oOAUcALETG6yMDMzKy6KmlBvLL0XJKA2bw0cZ+ZmY1QldxJvUskWoGZBcVjZmY1opIupnMymwcALcDvCovIzMxqQiVXMb0383wH8ARJN5OZmY1glYxBXDQUgZiZWW3Z6xiEpP9MJ+srbR8h6RvFhmVmZtVWySD1GyJi1/oPEbGVZE2IvZI0S1KnpPWSLuunznmSOiStk3RDprxP0sPpY2klr2dmZoOnkjGIAyQdkSYGJB1ZyXHpRH8LgTOBLmCFpKUR0ZGpMwmYD5wSEVslHZU5RW9ETN+H78XMzAZRJQnii8D9km5Jt98PXFXBcScB6yNiA4Ck75AMbndk6lwMLCwln4jYVGngZmZWrEoGqa+X1M5La1Cfk20FDKAZeCqz3QW8pazOZABJ9wENwBURUVpr4pD0dXcA16T3X+xG0lxgLsCxx9bHhLOtq7pZ0NbJxp5exjc1Mm/mFObMaK52WGY2AlXSVfRWkjUh/jXdHi3pLRHx4CC9/iTgNGACcK+kaemYx3ER0S3p1cBdktZExGPZgyNiEbAIoKWlJRjhWld1M3/JGnq39wHQ3dPL/CVrAJwkzGzQVdLF9FXgxMz2b3LK8nQDx2S2J6RlWV3AgxGxHXhc0qMkCWNFRHQDRMQGST8iGRh/jDq2oK1zV3Io6d3ex6WLV3Pj8ierFNXw1vH0c0wd52nFzPJUchWTImLXp/OI2ElliWUFMEnS8ZIOAs4Hyq9GaiVpPSBpDEmX04b0UtqDM+WnsPvYRV3a2NObW76tb+cQRzJyTB03mtnT3foyy1PJP/oNkv6SpNUA8DFgw94Oiogdkj4BtJGML3wjItZJuhJoj4il6b4/kNQB9AHzImKLpLcB/y5pJ0kSu6bCcY8RbXxTI905SaK5qZGbPnJyFSIys5FMmcZBfoXk0tOvkAxSB3An8KmI2Fx8eJVraWmJ9vb2aodRqPIxCIDGUQ1cfc40j0GY2X6RtDIiWvL2VXIV0yaS7qHSyRqB9wC39HuQFaKUBC5dvJptfTtp9lVMZlagSrqYSje9zQQuILnx7cc4QVTFnBnNuwak3a1kZkUaMEFIeifwQeAsYDnJYPGrI+K3QxCbmZlVUb8JQlIX8CTJ4PTfRMTzkh53cjAzqw8DXea6GBgPfAB4r6RDeWltajMzG+H6TRAR8WngeJK5mE4DOoGx6eyrhw1NeGZmVi0D3iiXrkF9d0TMJUkWF5BMuPfEEMRmZmZVVNFVTADpdBi3Abell7qamdkIVnGCyIqI/DkfbL95llYzqzX7lSBscHmWVjOrRU4QNWBfZ2n1DKRmNhQqWQ9iMjAPOC5bPyLO6Pcg2yf7OkurZyA1s6FQSQviFuBrwLUkM67aIPMsrWZWiypZD2JHRHw1IpZHxMrSo/DI6si8mVNoHNWwW1njqAbmzZxSpYjMzCprQXxP0seAW4EXS4UR8WxhUQ0jg3H1kWdpNbNaVEmCuDD9Oi9TFsCrBz+c4WUwrz7yLK1mVmsqWQ/i+KEIZDga7DWifXWSmdWSSq5iGgX8BfCOtOhHwL+nd1bXtcFeI9pXJ5lZLamki+mrwCjg39LtP0nL/ryooIYLX31kZiNZJVcxvTkiLoyIu9LHRcCbiw5sOPDVR2Y2klWSIPok/Z/ShqRX4/shgGRg+epzpnFQQ/JjbG5q5OpzpvnqIzMbESrpYpoH3C1pAyCSO6ovKjSqYcRXH5nZSFXJVUx3SpoElPpNOiPixYGOMTOz4W+gNanPiIi7JJ1TtusESUTEkoJjMzOzKhqoBfFO4C7gvTn7AnCCMDMbwfpNEBHxD+nTKyPi8ew+Sb55zsxshKvkKqb/zilbPNiBmJlZbRloDOI1wOuAw8vGIUYDhxQdWC0rn6DvkFEHMOawg6sdlpnZoBpoDGIK8B6gid3HIZ4HLi4yqFqWN0HfAapyUGZmBRhoDOK7wHclnRwR9w9hTDUtb4K+nQFPPZs/L5OZ2XBVyY1yqyR9nKS7aVfXUkR8uLCoathgT9BnZlarKhmk/hbwe8BM4B5gAkk3U10a39SYW97cT7mZ2XBVSYI4ISI+C7wQEf8J/F/gLZWcXNIsSZ2S1ku6rJ8650nqkLRO0g1l+0ZL6pL0r5W83lDwBH1mVi8q6WIqrfvQI+n1wC+Bo/Z2kKQGYCFwJtAFrJC0NCI6MnUmAfOBUyJiq6Ty834OuLeCGIeMlwc1s3pRSYJYJOkI4LPAUuAw4PIKjjsJWB8RGwAkfQeYDXRk6lwMLIyIrQARsam0Q9KbgKOB24GWCl5vyHiCPjOrB5VM1vf19Ok97Ns61M3AU5ntLvbsmpoMIOk+oAG4IiJul3QA8EXgj4F39fcCkuYCcwGOPfbYfQjNzMz2ZqAb5S4Z6MCI+NIgvf4k4DSSwe97JU0jSQzLIqJL6v8mg4hYBCwCaGlpiUGIx8zMUgO1IF6Zfp1CsoLc0nT7vcDyCs7dDRyT2Z6QlmV1AQ+m61s/LulRkoRxMnCqpI+RdGkdJOk3EZE70G1mZoNvoBvl/hFA0r3AiRHxfLp9BfD9Cs69ApiUTuzXDZwPfLCsTitwAfBNSWNIupw2RMQflSpI+hDQ4uRgZja0KrnM9WhgW2Z7W1o2oIjYAXwCaAMeAW6OiHWSrpR0dlqtDdgiqQO4G5gXEVv25RswM7NiVHIV0/XAckm3pttzgOsqOXlELAOWlZVdnnkewCXpo79zXFfp65mZ2eCp5CqmqyT9ADg1LbooIlYVG5aZmVXbQFcxjY6I5yQdCTyRPkr7joyIZ4sPz8zMqmWgFsQNJNN9ryRZYrRE6fa+3BNhZmbDzEBXMb0n/erlRc3M6tBAXUwnDnRgRDw0+OGYmVmtGKiL6YsD7AvgjEGOxczMashAXUynD2UgZmZWWyq5D4J0mu+p7L6i3PVFBWVmZtW31wQh6R9IJtObSnLT27uBH5PcQ
GdmZiNUJVNtnAv8PvDLiLgIeCNweKFRmZlZ1VWSIHojYiewQ9JoYBO7z9Jad1pXdbPqyR4efPxZTrnmLlpXlU9Sa2Y2/FUyBtEuqQm4luSmud8A9xcaVQ1rXdXN/CVr2Na3E4Dunl7mL1kD4GVHzWxE6bcFIWmhpFMi4mMR0RMRXyNZX/rCtKupLi1o66R3e99uZb3b+1jQ1lmliMzMijFQC+JR4AuSxgE3Azd6kj7Y2NO7T+VmZsNVvy2IiPhyRJwMvBPYAnxD0s8k/YOkyUMWYY0Z39S4T+VmZsPVXgepI+IXEfH5iJhBsvrbHJIFgOrSvJlTaBzVsFtZ46gG5s2cUqWIzMyKsdcEIelASe+V9G3gB0AncE7hkdWoOTOaufqcaRzUkPzompsaufqcaR6gNrMRZ6DJ+s4kaTGcBSwHvgPMjYgXhii2mjVnRjM3Ln8SgJs+cnKVozEzK8ZAg9TzSdaE+OuI2DpE8ZiZWY0YaLI+z9ZqZlbHKrmT2szM6pAThJmZ5XKCMDOzXE4QZmaWywnCzMxyOUGYmVkuJwgzM8vlBGFmZrmcIMzMLJcThJmZ5XKCMDOzXE4QZmaWq9AEIWmWpE5J6yVd1k+d8yR1SFon6Ya07DhJD0l6OC3/aJFxmpnZngaa7vtlkdQALATOBLqAFZKWRkRHps4kkmnFT4mIrZKOSnc9DZwcES9KOgxYmx67sah4zcxsd0W2IE4C1kfEhojYRrLg0OyyOhcDC0vrTUTEpvTrtoh4Ma1zcMFxmplZjiL/8TYDT2W2u9KyrMnAZEn3SXpA0qzSDknHSFqdnuPzea0HSXMltUtq37x5cwHfgplZ/ar2J/MDgUnAaSTLm14rqQkgIp6KiDcAJwAXSjq6/OCIWBQRLRHRMnbs2CEM28xs5CsyQXQDx2S2J6RlWV3A0ojYHhGPA4+SJIxd0pbDWuDUAmM1M7MyRSaIFcAkScdLOgg4H1haVqeVpPWApDEkXU4bJE2Q1JiWHwG8HegsMFYzMytTWIKIiB3AJ4A24BHg5ohYJ+lKSWen1dqALZI6gLuBeRGxBXgt8KCknwL3AF+IiDVFxWpmZnsq7DJXgIhYBiwrK7s88zyAS9JHts4dwBuKjM3MzAZW7UFqMzOrUU4QZmaWywnCzMxyOUGYmVkuJwgzM8vlBGFmZrmcIMzMLJcThJmZ5XKCMDOzXE4QZmaWywnCzMxyOUGYmVkuJwgzM8vlBGFmZrmcIMzMLFeh60EMJ62rulnQ1snGnl7GNzUyb+YU5sxornZYZmZV4wRBkhzmL1lD7/Y+ALp7epm/JFnAzknCzOqVEwSwoK1zV3Io6d3ex6WLV3Pj8idzj+l4+jmmjhs9FOGZmVWFxyCAjT29ueXb+nb2e8zUcaOZPd2tCzMbudyCAMY3NdKdkySamxq56SMnVyEiM7PqcwsCmDdzCo2jGnYraxzVwLyZU6oUkZlZ9bkFwUsD0ZcuXs22vp00+yomMzMniJI5M5p3DUi7W8nMzAlit/sfRjUcwDFHNlY7JDOzmlDXYxCl+x+6e3oJkquWHn/mBVpXdVc7NDOzqqvrBJF3/8POSMrNzOpdXSeI/u5/6K/czKye1HWCGN+UP97QX7mZWT2p6wTh+x/MzPpX11cxle5z8CyuZmZ7qusEAUmScEIwM9tTXXcxmZlZ/wpNEJJmSeqUtF7SZf3UOU9Sh6R1km5Iy6ZLuj8tWy3pA0XGaWZmeyqsi0lSA7AQOBPoAlZIWhoRHZk6k4D5wCkRsVXSUemu3wJ/GhE/lzQeWCmpLSJ6iorXzMx2V2QL4iRgfURsiIhtwHeA2WV1LgYWRsRWgIjYlH59NCJ+nj7fCGwCxhYYq5mZlSkyQTQDT2W2u9KyrMnAZEn3SXpA0qzyk0g6CTgIeCxn31xJ7ZLaN2/ePIihm5lZta9iOhCYBJwGTADulTSt1JUkaRzwLeDCiNhjebeIWAQsSutulvSLvbzeGOCZwQt/UDm2/ePY9o9j2z8jMbbj+ttRZILoBo7JbE9Iy7K6gAcjYjvwuKRHSRLGCkmjge8Dn4mIB/b2YhGx1y4oSe0R0VLpNzCUHNv+cWz7x7Htn3qLrcguphXAJEnHSzoIOB9YWlanlaT1gKQxJF1OG9L6twLXR8TiAmM0M7N+FJYgImIH8AmgDXgEuDki1km6UtLZabU2YIukDuBuYF5EbAHOA94BfEjSw+ljelGxmpnZngodg4iIZcCysrLLM88DuCR9ZOv8F/BfBYS0qIBzDhbHtn8c2/5xbPunrmJT8j/azMxsd55qw8zMcjlBmJlZrrpIEJXMCTXE8XxD0iZJazNlR0q6Q9LP069HVCGuYyTdnZkb61M1FNshkpZL+mka2z+m5cdLejB9b29Kr4CrCkkNklZJuq2WYpP0hKQ16cUe7WlZ1d/TNI4mSYsl/UzSI5JOroXYJE3JXCDzsKTnJH26FmJL4/ur9O9graQb07+PQf99G/EJIjMn1LuBqcAFkqZWNyquA8rvGr8MuDMiJgF3pttDbQfw1xExFXgr8PH0Z1ULsb0InBERbwSmA7MkvRX4PPAvEXECsBX4syrEVvIpkiv2SmopttMjYnrmOvlaeE8BvgzcHhGvAd5I8vOremwR0Zn+vKYDbyKZH+7WWohNUjPwl0BLRLweaCC5jWDwf98iYkQ/gJOBtsz2fGB+DcQ1EVib2e4ExqXPxwGdNRDjd0kmW6yp2IBXAA8BbyG5c/TAvPd6iGOaQPIP4wzgNkA1FNsTwJiysqq/p8DhwOOkF8vUUmxl8fwBcF+txMZL0xgdSXIl6m3AzCJ+30Z8C4LK5oSqBUdHxNPp818CR1czGEkTgRnAg9RIbGkXzsMkkzfeQTI/V08k99xAdd/b/wdcCpSmhHkVtRNbAD+UtFLS3LSsFt7T44HNwDfTrrmvSzq0RmLLOh+4MX1e9dgiohv4AvAk8DTwa2AlBfy+1UOCGHYi+QhQteuPJR0G/Dfw6Yh4LruvmrFFRF8kTf4JJLMFv6YacZST9B5gU0SsrHYs/Xh7RJxI0s36cUnvyO6s4nt6IHAi8NWImAG8QFmXTQ38LRwEnA3cUr6vWrGl4x6zSRLseOBQ9uyyHhT1kCAqmROqFvwqnZywNEnhpmoEIWkUSXL4dkQsqaXYSiKZzPFukmZ0k6TSDZ/Vem9PAc6W9ATJtPZnkPSt10JspU+cRDKd/q0kybUW3tMuoCsiHky3F5MkjFqIreTdwEMR8at0uxZiexfweERsjmQeuyUkv4OD/vtWDwmikjmhasFS4ML0+YUk/f9DSpKA/wAeiYgv1VhsYyU1pc8bScZGHiFJFOdWM7aImB8REyJiIsnv110R8Ue1EJukQyW9svScpD99LTXwnkbEL4GnJE1Ji34f6KiF2DIu4KXuJaiN2J4E3irpFenfbOnnNvi/b9Uc/BnCQZ2zgEdJ+qw/UwPx3EjSd7id5FPUn5H0Wd8J/Bz4H+DIKsT1dpIm82rg4fRxVo3E9gZgVRrbWuDytPzVwHJgPUk3wMFVfm9PA26rldjSGH6aPtaVfv9r
4T1N45gOtKfvaytwRA3FdiiwBTg8U1Yrsf0j8LP0b+FbwMFF/L55qg0zM8tVD11MZma2H5wgzMwslxOEmZnlcoIwM7NcThBmZpbLCcKGBUn/IunTme02SV/PbH9R0iX5R4Ok6ySdmz7/kaQ9FneXNErSNelMnQ9Jul/Su9N9TyhZN31f4971uv3sX5jOFtohqTcze+i5kpaV7v0YTJLGlWac7Wf/QZLuzdx0ZXXKCcKGi/uAtwFIOgAYA7wus/9twE9e5mt8jmQCttdHMjXFHOCVL/OcA4qIj0cyfchZwGORziAaEYsj4qxI7hofbJcA1w4Q0zaSa/0/UMBr2zDiBGHDxU9IptaAJDGsBZ6XdISkg4HXAg9JulzSinSe/EXpnaZ7JekVwMXAJyPiRYCI+FVE3JxT95L0/GvLWjV/Kmm1kjUrvpVz3OfSFkVDhTE9IWmMpFVa5x0AAALySURBVIlK1ku4TtKjkr4t6V2S7ktbOyel9Q9VstbI8nTyu9n9nPoPgdvTY16X1n84jX1SWqcV+KNK4rSRy01IGxYiYqOkHZKOJWkt3E8yW+XJJLNZromIbZL+NSKuBEj/Sb8H+F4FL3EC8GSUTU5YTtKbgItIphoX8KCke4BtwN8Db4uIZyQdWXbcApLWyEWxf3enngC8H/gwyfQxHyS58/1s4O9IWjufIZnm48Np19RySf8TES9k4jge2FpKgsBHgS9HxLfTqWhKyWst8Ob9iNNGELcgbDj5CUlyKCWI+zPb96V1TleyqtYakknzXpd3opfh7cCtEfFCRPyGZKK0U9PXuiUingGIiGczx3yWZLqGj+5ncoBkcrY1EbGTZMqMO9NzrSFZWwSSeZYuUzIl+o+AQ4Bjy84zjmSK7ZL7gb+T9LfAcRHRm8bfB2wrzeNk9ckJwoaT0jjENJJPuA+QtCDeBvxE0iHAvwHnRsQ0kn72Qyo893rgWEmjBz3q5BP/m8pbFfvoxczznZntnbzUEyDgDzPjGMdGRHaFO4BeMj+TiLiBpBXSCyyTdEam7sHA715GzDbMOUHYcPITki6jZyNZG+JZoIkkSfyEl/7xPaNkTYt+rx4qFxG/JZnJ9stpV0tpBtn3l1X9X2BOOpPmocD70rK7gPdLelV6bDYZ3A5cA3y/4E/kbcAnS+Mukmbk1HmUl1ocSHo1sCEivkIy++cb0vJXAc9EMp201SknCBtO1pBcvfRAWdmvI+KZ9Iqfa0laF20kn9z3xd+TdL90SFpLspRj+YJJD5GsKb6cZLW9r0fEqohYB1wF3CPpp8CXyo67JY1taTpdeRE+B4wCVktal27vJh2PeEzSCWnRecDatFvq9cD1afnpwPcLitOGCc/malZnJL0PeFNE/P0AdZYAl0XEo0MXmdUaX8VkVmci4tZSV1ietIut1cnB3IIwM7NcHoMwM7NcThBmZpbLCcLMzHI5QZiZWS4nCDMzy/X/AUdLzLp17HVCAAAAAElFTkSuQmCC", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAgAElEQVR4nO3dfZhdVXn38e+PIcCIhgETaDIBgg9JFI0mOKKIKFBpoo9CpIhgaxFborVaLW0oqRURS8Ervjx6NdUGqxQrCKRhjBgZERAqQt4I5mVwMCQYZoImhIwgjiSZ3M8fe5+wc9gzORNmzzkz5/e5rnPl7LXX3vuel8x91lp7r6WIwMzMrNwB1Q7AzMxqkxOEmZnlcoIwM7NcThBmZpbLCcLMzHI5QZiZWS4nCLP9IOlUSR3VjsOsSE4QNuxIekzS26sZQ0T8b0RMKer8kmZIulfSM5K2SrpH0llFXc8sjxOEWQ5JDVW89rnALcD1wATgKOBy4N37cS5J8v9z2y/+xbERQ9IBki6T9KikbZJulnREZv8tkn4t6bfpp/NXZ/ZdJ+lrkpZIehY4PW2p/IOk1ekxN0k6JK1/mqTOzPF91k33XyrpCUmbJf2VpJB0fM7XIOBLwOci4hsR8duI2B0R90TExWmdKyT9d+aYien5Dky3fyLpKkn3Ab8H5khaUXadv5O0OH1/sKQvSNok6TeSvi6p8UX+OGwEcIKwkeTjwCzgbcB4YDswP7P/h8Ak4EjgQeA7Zce/H7gKeBnw07TsPGAmcBzwWuCD/Vw/t66kmcAlwNuB44HT+jnHFOBoYGE/dSrxAWA2ydfydWCKpEmZ/e8HbkjfXwNMBqal8TWTtFiszjlB2EjyEeBTEdEZEc8BVwDnlj5ZR8Q3I+KZzL7XSTosc/z3IuK+9BP7H9Kyr0bE5oh4Cvg+yR/RvvRV9zzgWxGxLiJ+n167Ly9P/32i0i+6D9el19sVEb8FvgdcAJAmilcCi9MWy2zg7yLiqYh4BvhX4PwXeX0bAZwgbCQ5FrhVUrekbuBhoBc4SlKDpGvS7qengcfSY8Zkjn8855y/zrz/PfDSfq7fV93xZefOu07JtvTfcf3UqUT5NW4gTRAkrYfWNFmNBV4CrMx8325Py63OOUHYSPI48I6IaMq8DomILpI/imeTdPMcBkxMj1Hm+KKmNn6CZLC55Oh+6naQfB1/2k+dZ0n+qJf8UU6d8q/lDmCspGkkiaLUvfQk0AO8OvM9Oywi+kuEViecIGy4GiXpkMzrQJK+9qskHQsgaayks9P6LwOeI/mE/hKSbpShcjNwkaRXSXoJ8Om+KkYy//4lwKclXSRpdDr4/hZJC9JqDwFvlXRM2kU2d18BRMROkjuj5gFHkCQMImI3cC3wZUlHAkhqljRjv79aGzGcIGy4WkLyybf0ugL4CrAY+JGkZ4AHgDem9a8HfgV0Ae3pviERET8EvgrcDazPXPu5PuovBN4HfAjYDPwG+BeScQQi4g7gJmA1sBK4rcJQbiBpQd0SEbsy5f9YiivtfvsxyWC51Tl5wSCzoSXpVcBa4OCyP9RmNcUtCLMhIOk96fMGhwOfB77v5GC1zgnCbGh8GNgCPEpyZ9VfVzccs31zF5OZmeVyC8LMzHIdWO0ABsuYMWNi4sSJ1Q7DzGxYWbly5ZMRkftg5IhJEBMnTmTFihX7rmhmZntI+lVf+9zFZGZmuZwgzMwslxOEmZnlcoIwM7NcThBmZpZrxNzFZGZWb1pXdTGvrYPN3T2Mb2pkzowpzJrePGjnd4IwMxuGWld1MXfRGnp29gLQ1d3D3EVrAAYtSRTaxSRppqQOSeslXdZHnfMktUtaJ+mGtOx0SQ9lXn+QNKvIWM3MhpN5bR17kkNJz85e5rV1DNo1CmtBSGogWTD+TKATWC5pcUS0Z+pMIlns5JSI2F5asCQi7iZdz1fSESRz1f+oqFjNzIabzd09AyrfH0W2IE4C1kfEhojYAXyXZMnHrIuB+RGxHSAituSc51zgh+n6uWZm
BoxvahxQ+f4oMkE0s/fC6Z1pWdZkYLKk+yQ9IGlmznnOB27Mu4Ck2ZJWSFqxdevWQQnazGw4mDNjCo2jGvYqaxzVwJwZg7cYYLVvcz0QmAScRrKQ+rWSmko7JY0DpgJteQdHxIKIaImIlrFjc+eaMjMbkWZNb+bqc6ZyUEPyZ7y5qZGrz5k6bO5i6gKOzmxPSMuyOoGl6YLqGyU9QpIwlqf7zwNuTfebmVnGrOnN3LhsEwA3ffjkQT9/kS2I5cAkScdJOoikq2hxWZ1WktYDksaQdDltyOy/gD66l8zMrFiFJYh0vd2PkXQPPQzcHBHrJF0p6ay0WhuwTVI7cDcwJyK2AUiaSNICuaeoGM3MrG+FPigXEUuAJWVll2feB3BJ+io/9jFeOKhtZmZDpNqD1GZmVqOcIMzMLJcThJmZ5XKCMDOzXE4QZmaWywnCzMxyOUGYmVkuJwgzM8vlBGFmZrm85KiZ7Zei10O26nOCMLMBG4r1kK36nCDMbMD6Wg/50oWr90w/bUOj/YmnOWHc6ELO7TEIMxuwvtY93tG7e4gjsRPGjebsacW02tyCMLMBG9/USFdOkmhuaixk4RqrDrcgzGzAhmI9ZKs+tyDMbMBKA9GXLlzNjt7dNPsuphHJCcLM9kvR6yFb9bmLyczMcjlBmJlZLicIMzPL5QRhZma5nCDMzCyXE4SZmeVygjAzs1xOEGZmlssJwszMcjlBmJlZLicIMzPL5QRhZma5nCDMzCyXE4SZmeVygjAzs1yFJghJMyV1SFov6bI+6pwnqV3SOkk3ZMqPkfQjSQ+n+ycWGauZme2tsAWDJDUA84EzgU5guaTFEdGeqTMJmAucEhHbJR2ZOcX1wFURcYeklwJeDd3MbAgVuaLcScD6iNgAIOm7wNlAe6bOxcD8iNgOEBFb0ronAAdGxB1p+e8KjNNGsNZVXcxr62Bzdw/jvSym2YAU2cXUDDye2e5My7ImA5Ml3SfpAUkzM+XdkhZJWiVpXtoi2Yuk2ZJWSFqxdevWQr4IG75aV3Uxd9Eaurp7CKCru4e5i9bQuqqr2qGZDQvVXpP6QGAScBowAbhX0tS0/FRgOrAJuAn4IPCf2YMjYgGwAKClpSWGKmgbHua1ddCzs3evsp6dvVy6cPWetZTtxWl/4mlOGDe62mFYQYpsQXQBR2e2J6RlWZ3A4ojYGREbgUdIEkYn8FBEbIiIXUArcGKBsdoItLm7J7d8R6+HswbLCeNGc/Y0d9mNVEW2IJYDkyQdR5IYzgfeX1anFbgA+JakMSRdSxuAbqBJ0tiI2AqcAawoMFYbgcY3NdKVkySamxq56cMnVyEis+GlsBZE+sn/Y0Ab8DBwc0Ssk3SlpLPSam3ANkntwN3AnIjYFhG9wD8Ad0paAwi4tqhYbWSaM2MKjaP2HrpqHNXAnBlTqhSR2fCiiJHRdd/S0hIrVriRYXtrXdXFpQtXs6N3N82+i8nsBSStjIiWvH3VHqQ2K9Ss6c17BqTdrWQ2MJ5qw8zMcu0zQUh6+VAEYmZmtaWSFsQDkm6R9E5JKjwiMzOrCZUkiMkkD6N9APilpH+VNLnYsMzMrNr2mSAicUdEXEAyd9KFwDJJ90jyqJ+Z2Qi1z7uY0jGIPydpQfwG+DiwGJgG3AIcV2SAZmZWHZXc5no/8G1gVkR0ZspXSPp6MWGZmVm1VZIgpkQfT9NFxOcHOR4zM6sRlQxS/0hSU2lD0uGS2gqMyczMakAlCWJsRHSXNtLFfY7sp76ZmY0AlSSIXknHlDYkHQuMjAmczMysT5WMQXwK+Kmke0hmVT0VmF1oVGZmVnX7TBARcbukE4E3pUWfjIgniw3LzMyqrdLZXHuBLcAhwAmSiIh7iwvLzMyqrZIH5f4K+ATJkqEPkbQk7idZ5c3MzEaoSgapPwG8AfhVRJwOTCdZEtTMzEawShLEHyLiDwCSDo6IXwBes9HMbISrZAyiM31QrhW4Q9J24FfFhmVmZtVWyV1M70nfXiHpbuAw4PZCozIzs6rrN0FIagDWRcQrASLiniGJyszMqq7fMYiI6AU6sk9Sm5lZfahkDOJwYJ2kZcCzpcKIOKuwqMzMrOoqSRCfLjwKMzOrOZUMUnvcwcysDlXyJPUzPD9760HAKODZiBhdZGBmZlZdlbQgXlZ6L0nA2Tw/cZ+ZmY1QlTxJvUckWoEZBcVjZmY1opIupnMymwcALcAfCovIzMxqQiV3Mb07834X8BhJN5OZmY1glYxBXDQUgZiZWW3Z5xiEpP9KJ+srbR8u6ZvFhmVmZtVWySD1ayNiz/oPEbGdZE2IfZI0U1KHpPWSLuujznmS2iWtk3RDprxX0kPpa3El1zMzs8FTyRjEAZIOTxMDko6o5Lh0or/5wJlAJ7Bc0uKIaM/UmQTMBU6JiO2Sjsycoicipg3gazEzs0FUSYL4InC/pFvS7fcCV1Vw3EnA+ojYACDpuySD2+2ZOhcD80vJJyK2VBq4mZkVq5JB6uslreD5NajPybYC+tEMPJ7Z7gTeWFZnMoCk+4AG4IqIKK01cUh63V3ANenzF3uRNBuYDXDMMbU74Wzrqi7mtXWwubuH8U2NzJkxhVnTm6sdlplZvyrpKnoTyZoQ/5Zuj5b0xohYOkjXnwScBkwA7pU0NR3zODYiuiS9ArhL0pqIeDR7cEQsABYAtLS0BDWodVUXcxetoWdnLwBd3T3MXbQGwEnCzGpaJV1MXwNOzGz/LqcsTxdwdGZ7QlqW1QksjYidwEZJj5AkjOUR0QUQERsk/YRkYPxRhpl5bR17kkNJz85eLl24mhuXbapSVPWl/YmnOWGcpw4zG6hK7mJSROz5dB4Ru6kssSwHJkk6TtJBwPlA+d1IrSStBySNIely2pDeSntwpvwU9h67GDY2d/fklu/o3T3EkdSvE8aN5uxpbq2ZDVQlf+g3SPpbklYDwEeBDfs6KCJ2SfoY0EYyvvDNiFgn6UpgRUQsTvf9iaR2oBeYExHbJL0Z+A9Ju0mS2DUVjnvUnPFNjXTlJInmpkZu+vDJVYjIzKwyyjQO8iskt55+lWSQOoA7gU9ExNbiw6tcS0tLrFixotphvED5GARA46gGrj5nqscgzKzqJK2MiJa8fZXcxbSFpHuodLJG4F3ALX0eZHuUksClC1ezo3c3zb6LycyGiUq6mEoPvc0ALiB58O2nOEFUbNb05j0D0u5WMrPhot8EIeltwPuBdwLLSAaLXxERvx+C2MzMrIr6TBCSOoFNJIPT/xARz0ja6ORgZlYf+rvNdSEwHngf8G5Jh/L82tRmZjbC9ZkgIuKTwHEkczGdBnQAY9PZV186NOGZmVm19PugXLoG9d0RMZskWVxAMuHeY0MQm5mZVVFFdzEBpNNh3Abclt7qamZmI1jFCSIrIvLnj6hTnq3VzEai/UoQ9jzP1mpmI5UTxItU6WytnlHUzIabStaDmAzMAY7N1o+IM/o8qI5UOlurZxQ1s+GmkhbELcDXgWtJZly1DM/WamYjVSXrQeyKiK9FxLKIWFl6FR7ZMDFnxhQaRzXsVdY4qoE5M6ZUKSI
…[base64-encoded PNG data for the learning-curve plot omitted]…AAAAASUVORK5CYII=",
      "text/plain": [
       ""
      ]
     },
     "metadata": {
      "needs_background": "light"
-    }
+    },
+    "output_type": "display_data"
    }
   ],
-  "metadata": {
-   "slideshow": {
-    "slide_type": "slide"
-   }
-  }
+  "source": [
+   "import matplotlib.pyplot as plt\n",
+   "import numpy as np\n",
+   "\n",
+   "plt.title('Learning Curve')\n",
+   "plt.xlabel('Wall Clock Time (s)')\n",
+   "plt.ylabel('Validation Accuracy')\n",
+   "plt.scatter(time_history, 1 - np.array(valid_loss_history))\n",
+   "plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n",
+   "plt.show()"
+  ]
 },
 {
  "cell_type": "markdown",
+  "metadata": {},
  "source": [
   "## 3. Comparison with alternatives\n"
-  ],
-  "metadata": {}
+  ]
 },
 {
  "cell_type": "markdown",
+  "metadata": {},
  "source": [
   "### Default LightGBM"
-  ],
-  "metadata": {}
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 12,
-  "source": [
-   "from lightgbm import LGBMClassifier\r\n",
-   "lgbm = LGBMClassifier()"
-  ],
+  "metadata": {},
  "outputs": [],
-  "metadata": {}
+  "source": [
+   "from lightgbm import LGBMClassifier\n",
+   "lgbm = LGBMClassifier()"
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 13,
-  "source": [
-   "lgbm.fit(X_train, y_train)"
-  ],
+  "metadata": {},
  "outputs": [
   {
-    "output_type": "execute_result",
    "data": {
     "text/plain": [
      "LGBMClassifier()"
     ]
    },
+    "execution_count": 13,
    "metadata": {},
-    "execution_count": 13
+    "output_type": "execute_result"
   }
  ],
-  "metadata": {}
+  "source": [
+   "lgbm.fit(X_train, y_train)"
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 14,
+  "metadata": {},
+  "outputs": [],
  "source": [
   "y_pred_lgbm = lgbm.predict(X_test)"
-  ],
-  "outputs": [],
-  "metadata": {}
+  ]
 },
 {
  "cell_type": "markdown",
+  "metadata": {},
  "source": [
   "### Default XGBoost"
-  ],
-  "metadata": {}
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 15,
-  "source": [
-   "from xgboost import XGBClassifier\r\n",
-   "xgb = XGBClassifier()\r\n",
-   "cat_columns = X_train.select_dtypes(include=['category']).columns\r\n",
-   "X = X_train.copy()\r\n",
-   "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\r\n"
-  ],
+  "metadata": {},
  "outputs": [],
-  "metadata": {}
+  "source": [
+   "from xgboost import XGBClassifier\n",
+   "xgb = XGBClassifier()\n",
+   "cat_columns = X_train.select_dtypes(include=['category']).columns\n",
+   "X = X_train.copy()\n",
+   "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n"
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 16,
-  "source": [
-   "xgb.fit(X, y_train)"
-  ],
+  "metadata": {},
  "outputs": [
   {
-    "output_type": "execute_result",
    "data": {
     "text/plain": [
      "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
@@ -642,35 +625,34 @@
      "              tree_method='exact', validate_parameters=1, verbosity=None)"
     ]
    },
+    "execution_count": 16,
    "metadata": {},
-    "execution_count": 16
+    "output_type": "execute_result"
   }
  ],
-  "metadata": {}
+  "source": [
+   "xgb.fit(X, y_train)"
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 17,
-  "source": [
-   "X = X_test.copy()\r\n",
-   "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\r\n",
-   "y_pred_xgb = xgb.predict(X)"
-  ],
+  "metadata": {},
  "outputs": [],
-  "metadata": {}
+  "source": [
+   "X = X_test.copy()\n",
+   "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n",
+   "y_pred_xgb = xgb.predict(X)"
+  ]
 },
 {
  "cell_type": "code",
  "execution_count": 18,
-  "source": [
-   "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test))\r\n",
-   "print('default lgbm accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_lgbm, y_test))\r\n",
-   "print('flaml (4min) 
accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))" - ], + "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "default xgboost accuracy = 0.6676060098186078\n", "default lgbm accuracy = 0.6602346380315323\n", @@ -678,34 +660,43 @@ ] } ], - "metadata": {} + "source": [ + "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test))\n", + "print('default lgbm accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_lgbm, y_test))\n", + "print('flaml (4min) accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))" + ] }, { "cell_type": "markdown", - "source": [ - "## 4. Customized Learner" - ], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "source": [ + "## 4. Customized Learner" + ] }, { "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, "source": [ "Some experienced automl users may have a preferred model to tune or may already have a reasonably by-hand-tuned model before launching the automl experiment. They need to select optimal configurations for the customized model mixed with standard built-in learners. \n", "\n", "FLAML can easily incorporate customized/new learners (preferably with sklearn API) provided by users in a real-time manner, as demonstrated below." - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### Example of Regularized Greedy Forest\n", "\n", @@ -716,243 +707,222 @@ "* choose initial value corresponding to low cost for cost-related hyperparameters (e.g., initial value for max_leaf and n_iter should be small)\n", "\n", "In this example, the above information for RGF is wrapped in a python class called *MyRegularizedGreedyForest* that exposes the hyperparameters." - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": 19, - "source": [ - "''' SKLearnEstimator is the super class for a sklearn learner '''\r\n", - "from flaml.model import SKLearnEstimator\r\n", - "from flaml import tune\r\n", - "from flaml.data import CLASSIFICATION\r\n", - "\r\n", - "\r\n", - "class MyRegularizedGreedyForest(SKLearnEstimator):\r\n", - " def __init__(self, task='binary', **config):\r\n", - " '''Constructor\r\n", - " \r\n", - " Args:\r\n", - " task: A string of the task type, one of\r\n", - " 'binary', 'multi', 'regression'\r\n", - " config: A dictionary containing the hyperparameter names\r\n", - " and 'n_jobs' as keys. n_jobs is the number of parallel threads.\r\n", - " '''\r\n", - "\r\n", - " super().__init__(task, **config)\r\n", - "\r\n", - " '''task=binary or multi for classification task'''\r\n", - " if task in CLASSIFICATION:\r\n", - " from rgf.sklearn import RGFClassifier\r\n", - "\r\n", - " self.estimator_class = RGFClassifier\r\n", - " else:\r\n", - " from rgf.sklearn import RGFRegressor\r\n", - " \r\n", - " self.estimator_class = RGFRegressor\r\n", - "\r\n", - " @classmethod\r\n", - " def search_space(cls, data_size, task):\r\n", - " '''[required method] search space\r\n", - "\r\n", - " Returns:\r\n", - " A dictionary of the search space. 
\r\n", - " Each key is the name of a hyperparameter, and value is a dict with\r\n", - " its domain (required) and low_cost_init_value, init_value,\r\n", - " cat_hp_cost (if applicable).\r\n", - " e.g.,\r\n", - " {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\r\n", - " '''\r\n", - " space = { \r\n", - " 'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\r\n", - " 'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\r\n", - " 'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\r\n", - " 'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\r\n", - " 'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\r\n", - " 'min_samples_leaf': {'domain': tune.lograndint(lower=1, upper=20), 'init_value': 20},\r\n", - " }\r\n", - " return space\r\n", - "\r\n", - " @classmethod\r\n", - " def size(cls, config):\r\n", - " '''[optional method] memory size of the estimator in bytes\r\n", - " \r\n", - " Args:\r\n", - " config - the dict of the hyperparameter config\r\n", - "\r\n", - " Returns:\r\n", - " A float of the memory size required by the estimator to train the\r\n", - " given config\r\n", - " '''\r\n", - " max_leaves = int(round(config['max_leaf']))\r\n", - " n_estimators = int(round(config['n_iter']))\r\n", - " return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8\r\n", - "\r\n", - " @classmethod\r\n", - " def cost_relative2lgbm(cls):\r\n", - " '''[optional method] relative cost compared to lightgbm\r\n", - " '''\r\n", - " return 1.0\r\n" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "outputs": [], + "source": [ + "''' SKLearnEstimator is the super class for a sklearn learner '''\n", + "from flaml.model import SKLearnEstimator\n", + "from flaml import tune\n", + "from flaml.data import CLASSIFICATION\n", + "\n", + "\n", + "class MyRegularizedGreedyForest(SKLearnEstimator):\n", + " def __init__(self, task='binary', **config):\n", + " '''Constructor\n", + " \n", + " Args:\n", + " task: A string of the task type, one of\n", + " 'binary', 'multi', 'regression'\n", + " config: A dictionary containing the hyperparameter names\n", + " and 'n_jobs' as keys. n_jobs is the number of parallel threads.\n", + " '''\n", + "\n", + " super().__init__(task, **config)\n", + "\n", + " '''task=binary or multi for classification task'''\n", + " if task in CLASSIFICATION:\n", + " from rgf.sklearn import RGFClassifier\n", + "\n", + " self.estimator_class = RGFClassifier\n", + " else:\n", + " from rgf.sklearn import RGFRegressor\n", + " \n", + " self.estimator_class = RGFRegressor\n", + "\n", + " @classmethod\n", + " def search_space(cls, data_size, task):\n", + " '''[required method] search space\n", + "\n", + " Returns:\n", + " A dictionary of the search space. 
\n",
+    "            Each key is the name of a hyperparameter, and value is a dict with\n",
+    "            its domain (required) and low_cost_init_value, init_value,\n",
+    "            cat_hp_cost (if applicable).\n",
+    "            e.g.,\n",
+    "            {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\n",
+    "        '''\n",
+    "        space = {\n",
+    "            'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\n",
+    "            'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\n",
+    "            'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\n",
+    "            'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\n",
+    "            'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\n",
+    "            'min_samples_leaf': {'domain': tune.lograndint(lower=1, upper=20), 'init_value': 20},\n",
+    "        }\n",
+    "        return space\n",
+    "\n",
+    "    @classmethod\n",
+    "    def size(cls, config):\n",
+    "        '''[optional method] memory size of the estimator in bytes\n",
+    "        \n",
+    "        Args:\n",
+    "            config - the dict of the hyperparameter config\n",
+    "\n",
+    "        Returns:\n",
+    "            A float of the memory size required by the estimator to train the\n",
+    "            given config\n",
+    "        '''\n",
+    "        max_leaves = int(round(config['max_leaf']))\n",
+    "        n_estimators = int(round(config['n_iter']))\n",
+    "        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8\n",
+    "\n",
+    "    @classmethod\n",
+    "    def cost_relative2lgbm(cls):\n",
+    "        '''[optional method] relative cost compared to lightgbm\n",
+    "        '''\n",
+    "        return 1.0\n"
+   ]
  },
  {
   "cell_type": "markdown",
+  "metadata": {
+   "slideshow": {
+    "slide_type": "slide"
+   }
+  },
   "source": [
    "### Add Customized Learner and Run FLAML AutoML\n",
    "\n",
    "After adding RGF into the list of learners, we run automl by tuning hyperparameters of RGF as well as the default learners. 
" - ], - "metadata": { - "slideshow": { - "slide_type": "slide" - } - } + ] }, { "cell_type": "code", "execution_count": 20, - "source": [ - "automl = AutoML()\r\n", - "automl.add_learner(learner_name='RGF', learner_class=MyRegularizedGreedyForest)" - ], - "outputs": [], "metadata": { "slideshow": { "slide_type": "slide" } - } + }, + "outputs": [], + "source": [ + "automl = AutoML()\n", + "automl.add_learner(learner_name='RGF', learner_class=MyRegularizedGreedyForest)" + ] }, { "cell_type": "code", "execution_count": 21, - "source": [ - "settings = {\r\n", - " \"time_budget\": 10, # total running time in seconds\r\n", - " \"metric\": 'accuracy', \r\n", - " \"estimator_list\": ['RGF', 'lgbm', 'rf', 'xgboost'], # list of ML learners\r\n", - " \"task\": 'classification', # task type \r\n", - " \"log_file_name\": 'airlines_experiment_custom_learner.log', # flaml log file \r\n", - " \"log_training_metric\": True, # whether to log training metric\r\n", - "}\r\n", - "\r\n", - "automl.fit(X_train = X_train, y_train = y_train, **settings)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[flaml.automl: 10-08 15:17:57] {1458} INFO - Data split method: stratified\n", - "[flaml.automl: 10-08 15:17:57] {1462} INFO - Evaluation method: holdout\n", - "[flaml.automl: 10-08 15:17:57] {1510} INFO - Minimizing error metric: 1-accuracy\n", - "[flaml.automl: 10-08 15:17:57] {1547} INFO - List of ML learners in AutoML Run: ['RGF', 'lgbm', 'rf', 'xgboost']\n", - "[flaml.automl: 10-08 15:17:57] {1777} INFO - iteration 0, current learner RGF\n", - "/home/dmx/miniconda2/envs/test/lib/python3.8/site-packages/rgf/utils.py:224: UserWarning: Cannot find FastRGF executable files. FastRGF estimators will be unavailable for usage.\n", - " warnings.warn(\"Cannot find FastRGF executable files. \"\n", - "[flaml.automl: 10-08 15:17:59] {1894} INFO - Estimated sufficient time budget=718418s. 
Estimated necessary time budget=718s.\n", - "[flaml.automl: 10-08 15:17:59] {1966} INFO - at 2.8s,\testimator RGF's best error=0.3787,\tbest estimator RGF's best error=0.3787\n", - "[flaml.automl: 10-08 15:17:59] {1777} INFO - iteration 1, current learner RGF\n", - "[flaml.automl: 10-08 15:18:00] {1966} INFO - at 4.1s,\testimator RGF's best error=0.3787,\tbest estimator RGF's best error=0.3787\n", - "[flaml.automl: 10-08 15:18:00] {1777} INFO - iteration 2, current learner RGF\n", - "[flaml.automl: 10-08 15:18:02] {1966} INFO - at 5.2s,\testimator RGF's best error=0.3787,\tbest estimator RGF's best error=0.3787\n", - "[flaml.automl: 10-08 15:18:02] {1777} INFO - iteration 3, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:02] {1966} INFO - at 5.3s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:02] {1777} INFO - iteration 4, current learner RGF\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 6.5s,\testimator RGF's best error=0.3787,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 5, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 6.6s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 6, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 6.7s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 7, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 6.8s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 8, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 6.8s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 9, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 6.9s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 10, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:03] {1966} INFO - at 7.1s,\testimator lgbm's best error=0.3765,\tbest estimator lgbm's best error=0.3765\n", - "[flaml.automl: 10-08 15:18:03] {1777} INFO - iteration 11, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:04] {1966} INFO - at 7.3s,\testimator lgbm's best error=0.3765,\tbest estimator lgbm's best error=0.3765\n", - "[flaml.automl: 10-08 15:18:04] {1777} INFO - iteration 12, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:04] {1966} INFO - at 7.5s,\testimator lgbm's best error=0.3765,\tbest estimator lgbm's best error=0.3765\n", - "[flaml.automl: 10-08 15:18:04] {1777} INFO - iteration 13, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:04] {1966} INFO - at 7.7s,\testimator lgbm's best error=0.3750,\tbest estimator lgbm's best error=0.3750\n", - "[flaml.automl: 10-08 15:18:04] {1777} INFO - iteration 14, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:04] {1966} INFO - at 7.9s,\testimator lgbm's best error=0.3750,\tbest estimator lgbm's best error=0.3750\n", - "[flaml.automl: 10-08 15:18:04] {1777} INFO - iteration 15, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:04] {1966} INFO - at 8.1s,\testimator lgbm's best error=0.3604,\tbest estimator lgbm's best error=0.3604\n", - 
"[flaml.automl: 10-08 15:18:04] {1777} INFO - iteration 16, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:05] {1966} INFO - at 8.3s,\testimator lgbm's best error=0.3604,\tbest estimator lgbm's best error=0.3604\n", - "[flaml.automl: 10-08 15:18:05] {1777} INFO - iteration 17, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:05] {1966} INFO - at 8.6s,\testimator lgbm's best error=0.3604,\tbest estimator lgbm's best error=0.3604\n", - "[flaml.automl: 10-08 15:18:05] {1777} INFO - iteration 18, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:05] {1966} INFO - at 8.8s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:05] {1777} INFO - iteration 19, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:05] {1966} INFO - at 8.9s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:05] {1777} INFO - iteration 20, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:05] {1966} INFO - at 9.0s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:05] {1777} INFO - iteration 21, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:05] {1966} INFO - at 9.2s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:05] {1777} INFO - iteration 22, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.3s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 23, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.4s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 24, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.6s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 25, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.7s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 26, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.8s,\testimator lgbm's best error=0.3600,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 27, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.9s,\testimator xgboost's best error=0.3787,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 28, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 9.9s,\testimator xgboost's best error=0.3787,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 29, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 10.0s,\testimator xgboost's best error=0.3765,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {1777} INFO - iteration 30, current learner rf\n", - "[flaml.automl: 10-08 15:18:06] {1966} INFO - at 10.0s,\testimator rf's best error=0.3787,\tbest estimator lgbm's best error=0.3600\n", - "[flaml.automl: 10-08 15:18:06] {2073} INFO - selected model: LGBMClassifier(colsample_bytree=0.868332929662737,\n", - " 
learning_rate=0.5372172315260287, max_bin=255,\n", - " min_child_samples=24, n_estimators=4, num_leaves=23,\n", - " reg_alpha=0.006958608037974516, reg_lambda=0.07314321471228555,\n", - " verbose=-1)\n", - "[flaml.automl: 10-08 15:18:06] {2144} INFO - not retraining because the time budget is too small.\n", - "[flaml.automl: 10-08 15:18:06] {1571} INFO - fit succeeded\n", - "[flaml.automl: 10-08 15:18:06] {1572} INFO - Time taken to find the best model: 8.79496955871582\n", - "[flaml.automl: 10-08 15:18:06] {1583} WARNING - Time taken to find the best model is 88% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.\n" - ] - } - ], "metadata": { "slideshow": { "slide_type": "slide" }, "tags": [] - } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[flaml.automl: 11-02 19:28:10] {1483} INFO - Data split method: stratified\n", + "[flaml.automl: 11-02 19:28:10] {1487} INFO - Evaluation method: holdout\n", + "[flaml.automl: 11-02 19:28:10] {1537} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 11-02 19:28:10] {1574} INFO - List of ML learners in AutoML Run: ['RGF', 'lgbm', 'rf', 'xgboost']\n", + "[flaml.automl: 11-02 19:28:10] {1816} INFO - iteration 0, current learner RGF\n", + "/home/dmx/miniconda2/envs/blend/lib/python3.8/site-packages/rgf/utils.py:225: UserWarning: Cannot find FastRGF executable files. FastRGF estimators will be unavailable for usage.\n", + " warnings.warn(\"Cannot find FastRGF executable files. \"\n", + "[flaml.automl: 11-02 19:28:12] {1933} INFO - Estimated sufficient time budget=838163s. Estimated necessary time budget=838s.\n", + "[flaml.automl: 11-02 19:28:12] {2013} INFO - at 3.4s,\testimator RGF's best error=0.3840,\tbest estimator RGF's best error=0.3840\n", + "[flaml.automl: 11-02 19:28:12] {1816} INFO - iteration 1, current learner RGF\n", + "[flaml.automl: 11-02 19:28:14] {2013} INFO - at 4.6s,\testimator RGF's best error=0.3840,\tbest estimator RGF's best error=0.3840\n", + "[flaml.automl: 11-02 19:28:14] {1816} INFO - iteration 2, current learner RGF\n", + "[flaml.automl: 11-02 19:28:15] {2013} INFO - at 5.8s,\testimator RGF's best error=0.3840,\tbest estimator RGF's best error=0.3840\n", + "[flaml.automl: 11-02 19:28:15] {1816} INFO - iteration 3, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:15] {2013} INFO - at 5.9s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", + "[flaml.automl: 11-02 19:28:15] {1816} INFO - iteration 4, current learner RGF\n", + "[flaml.automl: 11-02 19:28:16] {2013} INFO - at 7.1s,\testimator RGF's best error=0.3840,\tbest estimator lgbm's best error=0.3777\n", + "[flaml.automl: 11-02 19:28:16] {1816} INFO - iteration 5, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:16] {2013} INFO - at 7.2s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", + "[flaml.automl: 11-02 19:28:16] {1816} INFO - iteration 6, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:16] {2013} INFO - at 7.3s,\testimator lgbm's best error=0.3777,\tbest estimator lgbm's best error=0.3777\n", + "[flaml.automl: 11-02 19:28:16] {1816} INFO - iteration 7, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:16] {2013} INFO - at 7.3s,\testimator lgbm's best error=0.3690,\tbest estimator lgbm's best error=0.3690\n", + "[flaml.automl: 11-02 19:28:16] {1816} INFO - iteration 8, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:16] {2013} INFO - at 
7.4s,\testimator lgbm's best error=0.3690,\tbest estimator lgbm's best error=0.3690\n", + "[flaml.automl: 11-02 19:28:16] {1816} INFO - iteration 9, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:16] {2013} INFO - at 7.5s,\testimator lgbm's best error=0.3690,\tbest estimator lgbm's best error=0.3690\n", + "[flaml.automl: 11-02 19:28:16] {1816} INFO - iteration 10, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 7.6s,\testimator lgbm's best error=0.3690,\tbest estimator lgbm's best error=0.3690\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 11, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 7.7s,\testimator lgbm's best error=0.3690,\tbest estimator lgbm's best error=0.3690\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 12, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 7.9s,\testimator lgbm's best error=0.3650,\tbest estimator lgbm's best error=0.3650\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 13, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 8.0s,\testimator lgbm's best error=0.3650,\tbest estimator lgbm's best error=0.3650\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 14, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 8.2s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 15, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 8.4s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 16, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:17] {2013} INFO - at 8.5s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:17] {1816} INFO - iteration 17, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:18] {2013} INFO - at 8.7s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:18] {1816} INFO - iteration 18, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:18] {2013} INFO - at 8.8s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:18] {1816} INFO - iteration 19, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:18] {2013} INFO - at 9.0s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:18] {1816} INFO - iteration 20, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:18] {2013} INFO - at 9.1s,\testimator lgbm's best error=0.3644,\tbest estimator lgbm's best error=0.3644\n", + "[flaml.automl: 11-02 19:28:18] {1816} INFO - iteration 21, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:19] {2013} INFO - at 10.4s,\testimator lgbm's best error=0.3630,\tbest estimator lgbm's best error=0.3630\n", + "[flaml.automl: 11-02 19:28:22] {2230} INFO - retrain lgbm for 2.3s\n", + "[flaml.automl: 11-02 19:28:22] {2237} INFO - retrained model: LGBMClassifier(colsample_bytree=0.832361601243933,\n", + " learning_rate=0.1125645711212136, max_bin=1023,\n", + " min_child_samples=13, n_estimators=43, num_leaves=20,\n", + " reg_alpha=0.0018874193214614083, reg_lambda=0.3799695712161002,\n", + " verbose=-1)\n", + "[flaml.automl: 11-02 19:28:22] {1598} INFO - fit succeeded\n", + "[flaml.automl: 11-02 19:28:22] {1599} INFO - Time taken 
to find the best model: 10.391593217849731\n", + "[flaml.automl: 11-02 19:28:22] {1610} WARNING - Time taken to find the best model is 104% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.\n" + ] + } + ], + "source": [ + "settings = {\n", + " \"time_budget\": 10, # total running time in seconds\n", + " \"metric\": 'accuracy', \n", + " \"estimator_list\": ['RGF', 'lgbm', 'rf', 'xgboost'], # list of ML learners\n", + " \"task\": 'classification', # task type \n", + " \"log_file_name\": 'airlines_experiment_custom_learner.log', # flaml log file \n", + " \"log_training_metric\": True, # whether to log training metric\n", + "}\n", + "\n", + "automl.fit(X_train = X_train, y_train = y_train, **settings)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 5. Customized Metric\n", "\n", "It's also easy to customize the optimization metric. As an example, we demonstrate with a custom metric function which combines training loss and test loss as the final loss to minimize." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 22, + "metadata": {}, + "outputs": [], "source": [ "def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,\n", " weight_test=None, weight_train=None, config=None,\n", @@ -974,20 +944,127 @@ " # two elements are returned:\n", " # the first element is the metric to minimize as a float number,\n", " # the second element is a dictionary of the metrics to log" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We can then pass this custom metric function to automl's `fit` method." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[flaml.automl: 11-02 19:28:22] {1483} INFO - Data split method: stratified\n", + "[flaml.automl: 11-02 19:28:22] {1487} INFO - Evaluation method: holdout\n", + "[flaml.automl: 11-02 19:28:23] {1537} INFO - Minimizing error metric: customized metric\n", + "[flaml.automl: 11-02 19:28:23] {1574} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']\n", + "[flaml.automl: 11-02 19:28:23] {1816} INFO - iteration 0, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:23] {1933} INFO - Estimated sufficient time budget=53001s. 
Estimated necessary time budget=962s.\n", + "[flaml.automl: 11-02 19:28:23] {2013} INFO - at 1.2s,\testimator lgbm's best error=0.6647,\tbest estimator lgbm's best error=0.6647\n", + "[flaml.automl: 11-02 19:28:23] {1816} INFO - iteration 1, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:23] {2013} INFO - at 1.4s,\testimator lgbm's best error=0.6647,\tbest estimator lgbm's best error=0.6647\n", + "[flaml.automl: 11-02 19:28:23] {1816} INFO - iteration 2, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:23] {2013} INFO - at 1.4s,\testimator lgbm's best error=0.6491,\tbest estimator lgbm's best error=0.6491\n", + "[flaml.automl: 11-02 19:28:23] {1816} INFO - iteration 3, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:23] {2013} INFO - at 1.6s,\testimator xgboost's best error=0.6672,\tbest estimator lgbm's best error=0.6491\n", + "[flaml.automl: 11-02 19:28:23] {1816} INFO - iteration 4, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:23] {2013} INFO - at 1.7s,\testimator lgbm's best error=0.6423,\tbest estimator lgbm's best error=0.6423\n", + "[flaml.automl: 11-02 19:28:23] {1816} INFO - iteration 5, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 1.8s,\testimator lgbm's best error=0.6423,\tbest estimator lgbm's best error=0.6423\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 6, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 1.9s,\testimator lgbm's best error=0.6423,\tbest estimator lgbm's best error=0.6423\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 7, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.0s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 8, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.1s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 9, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.2s,\testimator xgboost's best error=0.6672,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 10, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.3s,\testimator xgboost's best error=0.6500,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 11, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.4s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 12, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.5s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 13, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:24] {2013} INFO - at 2.6s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:24] {1816} INFO - iteration 14, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 2.8s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 15, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 2.9s,\testimator xgboost's best 
error=0.6413,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 16, current learner extra_tree\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 3.1s,\testimator extra_tree's best error=0.6599,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 17, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 3.2s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 18, current learner extra_tree\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 3.4s,\testimator extra_tree's best error=0.6457,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 19, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 3.5s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 20, current learner extra_tree\n", + "[flaml.automl: 11-02 19:28:25] {2013} INFO - at 3.6s,\testimator extra_tree's best error=0.6457,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:25] {1816} INFO - iteration 21, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:26] {2013} INFO - at 3.8s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", + "[flaml.automl: 11-02 19:28:26] {1816} INFO - iteration 22, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:26] {2013} INFO - at 4.0s,\testimator lgbm's best error=0.6335,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:26] {1816} INFO - iteration 23, current learner rf\n", + "[flaml.automl: 11-02 19:28:26] {2013} INFO - at 4.2s,\testimator rf's best error=0.6477,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:26] {1816} INFO - iteration 24, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:26] {2013} INFO - at 4.4s,\testimator xgboost's best error=0.6393,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:26] {1816} INFO - iteration 25, current learner extra_tree\n", + "[flaml.automl: 11-02 19:28:26] {2013} INFO - at 4.5s,\testimator extra_tree's best error=0.6457,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:26] {1816} INFO - iteration 26, current learner rf\n", + "[flaml.automl: 11-02 19:28:26] {2013} INFO - at 4.7s,\testimator rf's best error=0.6446,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:26] {1816} INFO - iteration 27, current learner extra_tree\n", + "[flaml.automl: 11-02 19:28:27] {2013} INFO - at 4.9s,\testimator extra_tree's best error=0.6439,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:27] {1816} INFO - iteration 28, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:27] {2013} INFO - at 5.1s,\testimator xgboost's best error=0.6342,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:27] {1816} INFO - iteration 29, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:27] {2013} INFO - at 5.4s,\testimator xgboost's best error=0.6342,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:27] {1816} INFO - iteration 30, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:27] {2013} INFO - at 5.5s,\testimator lgbm's best error=0.6335,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 
19:28:27] {1816} INFO - iteration 31, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:27] {2013} INFO - at 5.7s,\testimator lgbm's best error=0.6335,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:27] {1816} INFO - iteration 32, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:28] {2013} INFO - at 5.9s,\testimator lgbm's best error=0.6335,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:28] {1816} INFO - iteration 33, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:28] {2013} INFO - at 6.2s,\testimator xgboost's best error=0.6342,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:28] {1816} INFO - iteration 34, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:28] {2013} INFO - at 6.3s,\testimator lgbm's best error=0.6335,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:28] {1816} INFO - iteration 35, current learner rf\n", + "[flaml.automl: 11-02 19:28:28] {2013} INFO - at 6.5s,\testimator rf's best error=0.6446,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:28] {1816} INFO - iteration 36, current learner extra_tree\n", + "[flaml.automl: 11-02 19:28:28] {2013} INFO - at 6.6s,\testimator extra_tree's best error=0.6439,\tbest estimator lgbm's best error=0.6335\n", + "[flaml.automl: 11-02 19:28:28] {1816} INFO - iteration 37, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:29] {2013} INFO - at 6.9s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", + "[flaml.automl: 11-02 19:28:29] {1816} INFO - iteration 38, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:29] {2013} INFO - at 7.1s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", + "[flaml.automl: 11-02 19:28:29] {1816} INFO - iteration 39, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:30] {2013} INFO - at 7.9s,\testimator lgbm's best error=0.6328,\tbest estimator lgbm's best error=0.6328\n", + "[flaml.automl: 11-02 19:28:30] {1816} INFO - iteration 40, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:30] {2013} INFO - at 8.1s,\testimator xgboost's best error=0.6330,\tbest estimator lgbm's best error=0.6328\n", + "[flaml.automl: 11-02 19:28:30] {1816} INFO - iteration 41, current learner xgboost\n", + "[flaml.automl: 11-02 19:28:30] {2013} INFO - at 8.4s,\testimator xgboost's best error=0.6330,\tbest estimator lgbm's best error=0.6328\n", + "[flaml.automl: 11-02 19:28:30] {1816} INFO - iteration 42, current learner lgbm\n", + "[flaml.automl: 11-02 19:28:31] {2013} INFO - at 9.4s,\testimator lgbm's best error=0.6241,\tbest estimator lgbm's best error=0.6241\n", + "[flaml.automl: 11-02 19:28:31] {1816} INFO - iteration 43, current learner catboost\n", + "[flaml.automl: 11-02 19:28:32] {2013} INFO - at 10.7s,\testimator catboost's best error=0.6410,\tbest estimator lgbm's best error=0.6241\n", + "[flaml.automl: 11-02 19:28:33] {2230} INFO - retrain lgbm for 1.0s\n", + "[flaml.automl: 11-02 19:28:33] {2237} INFO - retrained model: LGBMClassifier(learning_rate=1.0, max_bin=1023, min_child_samples=13,\n", + " n_estimators=9, num_leaves=16, reg_alpha=0.0036546217826270403,\n", + " reg_lambda=6.081586897506841, verbose=-1)\n", + "[flaml.automl: 11-02 19:28:33] {1598} INFO - fit succeeded\n", + "[flaml.automl: 11-02 19:28:33] {1599} INFO - Time taken to find the best model: 9.365035772323608\n", + "[flaml.automl: 11-02 19:28:33] {1610} WARNING - Time taken to find the best 
model is 94% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.\n" + ] + } + ], "source": [ "automl = AutoML()\n", "settings = {\n", @@ -998,140 +1075,16 @@ "}\n", "\n", "automl.fit(X_train = X_train, y_train = y_train, **settings)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[flaml.automl: 10-08 15:18:07] {1458} INFO - Data split method: stratified\n", - "[flaml.automl: 10-08 15:18:07] {1462} INFO - Evaluation method: holdout\n", - "[flaml.automl: 10-08 15:18:07] {1510} INFO - Minimizing error metric: customized metric\n", - "[flaml.automl: 10-08 15:18:07] {1547} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']\n", - "[flaml.automl: 10-08 15:18:07] {1777} INFO - iteration 0, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:07] {1894} INFO - Estimated sufficient time budget=33595s. Estimated necessary time budget=559s.\n", - "[flaml.automl: 10-08 15:18:07] {1966} INFO - at 1.0s,\testimator lgbm's best error=0.6647,\tbest estimator lgbm's best error=0.6647\n", - "[flaml.automl: 10-08 15:18:07] {1777} INFO - iteration 1, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.1s,\testimator lgbm's best error=0.6647,\tbest estimator lgbm's best error=0.6647\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 2, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.2s,\testimator lgbm's best error=0.6491,\tbest estimator lgbm's best error=0.6491\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 3, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.2s,\testimator xgboost's best error=0.6672,\tbest estimator lgbm's best error=0.6491\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 4, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.4s,\testimator lgbm's best error=0.6423,\tbest estimator lgbm's best error=0.6423\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 5, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.5s,\testimator lgbm's best error=0.6423,\tbest estimator lgbm's best error=0.6423\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 6, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.6s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 7, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.7s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 8, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 1.9s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 9, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:08] {1966} INFO - at 2.0s,\testimator xgboost's best error=0.6672,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:08] {1777} INFO - iteration 10, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 2.1s,\testimator xgboost's best error=0.6500,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 11, current learner extra_tree\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 
2.3s,\testimator extra_tree's best error=0.6536,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 12, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 2.4s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 13, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 2.5s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 14, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 2.6s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 15, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 2.8s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 16, current learner extra_tree\n", - "[flaml.automl: 10-08 15:18:09] {1966} INFO - at 3.0s,\testimator extra_tree's best error=0.6446,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:09] {1777} INFO - iteration 17, current learner rf\n", - "[flaml.automl: 10-08 15:18:10] {1966} INFO - at 3.2s,\testimator rf's best error=0.6470,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:10] {1777} INFO - iteration 18, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:10] {1966} INFO - at 3.4s,\testimator lgbm's best error=0.6400,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:10] {1777} INFO - iteration 19, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:10] {1966} INFO - at 3.6s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:10] {1777} INFO - iteration 20, current learner rf\n", - "[flaml.automl: 10-08 15:18:10] {1966} INFO - at 3.8s,\testimator rf's best error=0.6411,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:10] {1777} INFO - iteration 21, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:10] {1966} INFO - at 4.0s,\testimator xgboost's best error=0.6413,\tbest estimator lgbm's best error=0.6400\n", - "[flaml.automl: 10-08 15:18:10] {1777} INFO - iteration 22, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:11] {1966} INFO - at 4.4s,\testimator lgbm's best error=0.6358,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 10-08 15:18:11] {1777} INFO - iteration 23, current learner rf\n", - "[flaml.automl: 10-08 15:18:11] {1966} INFO - at 4.6s,\testimator rf's best error=0.6411,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 10-08 15:18:11] {1777} INFO - iteration 24, current learner extra_tree\n", - "[flaml.automl: 10-08 15:18:11] {1966} INFO - at 4.7s,\testimator extra_tree's best error=0.6446,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 10-08 15:18:11] {1777} INFO - iteration 25, current learner extra_tree\n", - "[flaml.automl: 10-08 15:18:11] {1966} INFO - at 4.9s,\testimator extra_tree's best error=0.6446,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 10-08 15:18:11] {1777} INFO - iteration 26, current learner rf\n", - "[flaml.automl: 10-08 15:18:12] {1966} INFO - at 5.1s,\testimator rf's best error=0.6411,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 
10-08 15:18:12] {1777} INFO - iteration 27, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:12] {1966} INFO - at 5.3s,\testimator xgboost's best error=0.6393,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 10-08 15:18:12] {1777} INFO - iteration 28, current learner extra_tree\n", - "[flaml.automl: 10-08 15:18:12] {1966} INFO - at 5.4s,\testimator extra_tree's best error=0.6436,\tbest estimator lgbm's best error=0.6358\n", - "[flaml.automl: 10-08 15:18:12] {1777} INFO - iteration 29, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:12] {1966} INFO - at 5.7s,\testimator xgboost's best error=0.6342,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:12] {1777} INFO - iteration 30, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:12] {1966} INFO - at 6.0s,\testimator lgbm's best error=0.6351,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:12] {1777} INFO - iteration 31, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:13] {1966} INFO - at 6.3s,\testimator lgbm's best error=0.6351,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:13] {1777} INFO - iteration 32, current learner rf\n", - "[flaml.automl: 10-08 15:18:13] {1966} INFO - at 6.4s,\testimator rf's best error=0.6411,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:13] {1777} INFO - iteration 33, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:13] {1966} INFO - at 6.7s,\testimator xgboost's best error=0.6342,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:13] {1777} INFO - iteration 34, current learner lgbm\n", - "[flaml.automl: 10-08 15:18:13] {1966} INFO - at 6.9s,\testimator lgbm's best error=0.6351,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:13] {1777} INFO - iteration 35, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:14] {1966} INFO - at 7.1s,\testimator xgboost's best error=0.6342,\tbest estimator xgboost's best error=0.6342\n", - "[flaml.automl: 10-08 15:18:14] {1777} INFO - iteration 36, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:14] {1966} INFO - at 7.3s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", - "[flaml.automl: 10-08 15:18:14] {1777} INFO - iteration 37, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:14] {1966} INFO - at 7.5s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", - "[flaml.automl: 10-08 15:18:14] {1777} INFO - iteration 38, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:14] {1966} INFO - at 7.8s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", - "[flaml.automl: 10-08 15:18:14] {1777} INFO - iteration 39, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:14] {1966} INFO - at 8.0s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", - "[flaml.automl: 10-08 15:18:14] {1777} INFO - iteration 40, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:15] {1966} INFO - at 8.2s,\testimator xgboost's best error=0.6330,\tbest estimator xgboost's best error=0.6330\n", - "[flaml.automl: 10-08 15:18:15] {1777} INFO - iteration 41, current learner xgboost\n", - "[flaml.automl: 10-08 15:18:17] {1966} INFO - at 10.1s,\testimator xgboost's best error=0.6290,\tbest estimator xgboost's best error=0.6290\n", - "[flaml.automl: 10-08 15:18:17] {2073} INFO - selected model: 
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,\n", - " colsample_bynode=1, colsample_bytree=0.7942569492674472, gamma=0,\n", - " gpu_id=-1, grow_policy='lossguide', importance_type='gain',\n", - " interaction_constraints='', learning_rate=0.6413547778096401,\n", - " max_delta_step=0, max_depth=0, max_leaves=17,\n", - " min_child_weight=13.753540541999772, missing=nan,\n", - " monotone_constraints='()', n_estimators=4, n_jobs=-1,\n", - " num_parallel_tree=1, random_state=0,\n", - " reg_alpha=0.016714365103792518, reg_lambda=0.4874780682949813,\n", - " scale_pos_weight=1, subsample=1.0, tree_method='hist',\n", - " use_label_encoder=False, validate_parameters=1, verbosity=0)\n", - "[flaml.automl: 10-08 15:18:18] {2136} INFO - retrain xgboost for 1.8s\n", - "[flaml.automl: 10-08 15:18:18] {2142} INFO - retrained model: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1.0,\n", - " colsample_bynode=1, colsample_bytree=0.7942569492674472, gamma=0,\n", - " gpu_id=-1, grow_policy='lossguide', importance_type='gain',\n", - " interaction_constraints='', learning_rate=0.6413547778096401,\n", - " max_delta_step=0, max_depth=0, max_leaves=17,\n", - " min_child_weight=13.753540541999772, missing=nan,\n", - " monotone_constraints='()', n_estimators=4, n_jobs=-1,\n", - " num_parallel_tree=1, random_state=0,\n", - " reg_alpha=0.016714365103792518, reg_lambda=0.4874780682949813,\n", - " scale_pos_weight=1, subsample=1.0, tree_method='hist',\n", - " use_label_encoder=False, validate_parameters=1, verbosity=0)\n", - "[flaml.automl: 10-08 15:18:18] {1571} INFO - fit succeeded\n", - "[flaml.automl: 10-08 15:18:18] {1572} INFO - Time taken to find the best model: 10.063513994216919\n", - "[flaml.automl: 10-08 15:18:18] {1583} WARNING - Time taken to find the best model is 101% of the provided time budget and not all estimators' hyperparameter search converged. 
Consider increasing the time budget.\n" - ] - } - ], - "metadata": {} + ] } ], "metadata": { "interpreter": { - "hash": "ea9f131eb1b7663628f6445553ba215a834e2f0b4d18774746f0f47938ce4671" + "hash": "e9d36fc5b7c3dd4177ff1b60184dd696c0acc18150a44682abca4d769811bd46" }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.0 64-bit ('test': conda)" + "display_name": "Python 3.8.0 64-bit ('blend': conda)", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1148,4 +1101,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/setup.py b/setup.py index 2768e99615..01ff683e4b 100644 --- a/setup.py +++ b/setup.py @@ -38,16 +38,16 @@ setuptools.setup( "notebook": [ "openml==0.10.2", "jupyter", - "matplotlib==3.2.0", + "matplotlib", "rgf-python", + "catboost>=0.26", ], "test": [ "flake8>=3.8.4", "pytest>=6.1.1", "coverage>=5.3", "pre-commit", - "xgboost<1.3", - "catboost>=0.23", + "catboost>=0.26", "rgf-python", "optuna==2.8.0", "vowpalwabbit", @@ -58,8 +58,9 @@ setuptools.setup( "datasets==1.4.1", "azure-storage-blob", "statsmodels>=0.12.2", + "psutil==5.8.0", ], - "catboost": ["catboost>=0.23"], + "catboost": ["catboost>=0.26"], "blendsearch": ["optuna==2.8.0"], "ray": [ "ray[tune]==1.6.0", @@ -83,6 +84,7 @@ setuptools.setup( ], "ts_forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"], "forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"], + "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], }, classifiers=[ "Programming Language :: Python :: 3", diff --git a/test/test_classification.py b/test/test_classification.py new file mode 100644 index 0000000000..6e05154314 --- /dev/null +++ b/test/test_classification.py @@ -0,0 +1,323 @@ +import unittest +import numpy as np +import scipy.sparse +from sklearn.datasets import load_breast_cancer +import pandas as pd +from datetime import datetime +from flaml import AutoML +from flaml.model import LGBMEstimator +from flaml import tune + + +class MyLargeLGBM(LGBMEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + "num_leaves": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + } + + +class TestClassification(unittest.TestCase): + def test_preprocess(self): + automl = AutoML() + X = pd.DataFrame( + { + "f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14], + "f2": [ + 3.0, + 16.0, + 10.0, + 12.0, + 3.0, + 14.0, + 11.0, + 12.0, + 5.0, + 14.0, + 20.0, + 16.0, + 15.0, + 11.0, + ], + "f3": [ + "a", + "b", + "a", + "c", + "c", + "b", + "b", + "b", + "b", + "a", + "b", + 1.0, + 1.0, + "a", + ], + "f4": [ + True, + True, + False, + True, + True, + False, + False, + False, + True, + True, + False, + False, + True, + True, + ], + } + ) + y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) + + automl = AutoML() + automl_settings = { + "time_budget": 6, + "task": "classification", + "n_jobs": 1, + "estimator_list": ["catboost", "lrl2"], + "eval_method": "cv", + "n_splits": 3, + "metric": "accuracy", + "log_training_metric": True, + # "verbose": 4, + "ensemble": True, + } + automl.fit(X, y, **automl_settings) + + automl = AutoML() + automl_settings = { + "time_budget": 2, + "task": "classification", + "n_jobs": 1, + "estimator_list": ["lrl2", "kneighbor"], + "eval_method": "cv", + "n_splits": 3, + "metric": "accuracy", + "log_training_metric": True, + "verbose": 4, + 
"ensemble": True, + } + automl.fit(X, y, **automl_settings) + + automl = AutoML() + automl_settings = { + "time_budget": 3, + "task": "classification", + "n_jobs": 1, + "estimator_list": ["xgboost", "catboost", "kneighbor"], + "eval_method": "cv", + "n_splits": 3, + "metric": "accuracy", + "log_training_metric": True, + # "verbose": 4, + "ensemble": True, + } + automl.fit(X, y, **automl_settings) + + automl = AutoML() + automl_settings = { + "time_budget": 3, + "task": "classification", + "n_jobs": 1, + "estimator_list": ["lgbm", "catboost", "kneighbor"], + "eval_method": "cv", + "n_splits": 3, + "metric": "accuracy", + "log_training_metric": True, + # "verbose": 4, + "ensemble": True, + } + automl.fit(X, y, **automl_settings) + + def test_binary(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 1, + "task": "binary", + "log_file_name": "test/breast_cancer.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + } + X_train, y_train = load_breast_cancer(return_X_y=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + _ = automl_experiment.predict(X_train) + + def test_datetime_columns(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "log_file_name": "test/datetime_columns.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + } + fake_df = pd.DataFrame( + { + "A": [ + datetime(1900, 2, 3), + datetime(1900, 3, 4), + datetime(1900, 3, 4), + datetime(1900, 3, 4), + datetime(1900, 7, 2), + datetime(1900, 8, 9), + ], + "B": [ + datetime(1900, 1, 1), + datetime(1900, 1, 1), + datetime(1900, 1, 1), + datetime(1900, 1, 1), + datetime(1900, 1, 1), + datetime(1900, 1, 1), + ], + "year_A": [ + datetime(1900, 1, 2), + datetime(1900, 8, 1), + datetime(1900, 1, 4), + datetime(1900, 6, 1), + datetime(1900, 1, 5), + datetime(1900, 4, 1), + ], + } + ) + y = np.array([0, 1, 0, 1, 0, 0]) + automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings) + _ = automl_experiment.predict(fake_df) + + def test_sparse_matrix_xgboost(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 3, + "metric": "ap", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "estimator_list": ["xgboost"], + "log_type": "all", + "n_jobs": 1, + } + X_train = scipy.sparse.eye(900000) + y_train = np.random.randint(2, size=900000) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + def test_ray_classification(self): + from sklearn.datasets import make_classification + + X, y = make_classification(1000, 10) + automl = AutoML() + try: + automl.fit(X, y, time_budget=10, task="classification", use_ray=True) + automl.fit( + X, y, time_budget=10, task="classification", n_concurrent_trials=2 + ) + except ImportError: + return + + def test_parallel_xgboost(self, hpo_method=None): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 10, + "metric": "ap", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "estimator_list": ["xgboost"], + "log_type": "all", + "n_jobs": 1, + "n_concurrent_trials": 2, + "hpo_method": hpo_method, + } + X_train = scipy.sparse.eye(900000) + y_train = np.random.randint(2, size=900000) + try: + 
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + except ImportError: + return + + def test_parallel_xgboost_others(self): + # use random search as the hpo_method + self.test_parallel_xgboost(hpo_method="random") + + def test_random_skip_oom(self): + automl_experiment = AutoML() + automl_experiment.add_learner( + learner_name="large_lgbm", learner_class=MyLargeLGBM + ) + automl_settings = { + "time_budget": 2, + "task": "classification", + "log_file_name": "test/sparse_classification_oom.log", + "estimator_list": ["large_lgbm"], + "log_type": "all", + "n_jobs": 1, + "hpo_method": "random", + "n_concurrent_trials": 2, + } + X_train = scipy.sparse.eye(900000) + y_train = np.random.randint(2, size=900000) + + try: + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + except ImportError: + print("skipping concurrency test as ray is not installed") + return + + def test_sparse_matrix_lr(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 3, + "metric": "f1", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "estimator_list": ["lrl1", "lrl2"], + "log_type": "all", + "n_jobs": 1, + } + X_train = scipy.sparse.random(3000, 3000, density=0.1) + y_train = np.random.randint(2, size=3000) + automl_experiment.fit( + X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings + ) + automl_settings["time_budget"] = 5 + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_automl.py b/test/test_multiclass.py similarity index 52% rename from test/test_automl.py rename to test/test_multiclass.py index 6714dd85a4..4160e348f4 100644 --- a/test/test_automl.py +++ b/test/test_multiclass.py @@ -1,21 +1,12 @@ import unittest - import numpy as np import scipy.sparse -from sklearn.datasets import ( - fetch_california_housing, - load_iris, - load_wine, - load_breast_cancer, -) +from sklearn.datasets import load_iris, load_wine -import pandas as pd -from datetime import datetime from flaml import AutoML from flaml.data import CLASSIFICATION, get_output_from_log - -from flaml.model import LGBMEstimator, SKLearnEstimator, XGBoostEstimator +from flaml.model import LGBMEstimator, XGBoostSklearnEstimator, SKLearnEstimator from flaml import tune from flaml.training_log import training_log_reader @@ -72,26 +63,21 @@ class MyRegularizedGreedyForest(SKLearnEstimator): return 1.0 -def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight - grad = preds - labels - hess = preds * (1.0 - preds) - return grad, hess - - -class MyXGB1(XGBoostEstimator): - """XGBoostEstimator with logregobj as the objective function""" - - def 
__init__(self, **config): - super().__init__(objective=logregobj, **config) - - -class MyXGB2(XGBoostEstimator): - """XGBoostEstimator with 'reg:squarederror' as the objective function""" - - def __init__(self, **config): - super().__init__(objective="reg:squarederror", **config) +class MyLargeXGB(XGBoostSklearnEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + "max_leaves": { + "domain": tune.lograndint(lower=4, upper=3276), + "init_value": 3276, + "low_cost_init_value": 4, + }, + } class MyLargeLGBM(LGBMEstimator): @@ -104,8 +90,8 @@ class MyLargeLGBM(LGBMEstimator): "low_cost_init_value": 4, }, "num_leaves": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, + "domain": tune.lograndint(lower=4, upper=3276), + "init_value": 3276, "low_cost_init_value": 4, }, } @@ -141,7 +127,7 @@ def custom_metric( } -class TestAutoML(unittest.TestCase): +class TestMultiClass(unittest.TestCase): def test_custom_learner(self): automl = AutoML() automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) @@ -185,123 +171,6 @@ class TestAutoML(unittest.TestCase): """The main flaml automl API""" automl.fit(X_train=X_train, y_train=y_train, **settings) - def test_preprocess(self): - automl = AutoML() - X = pd.DataFrame( - { - "f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14], - "f2": [ - 3.0, - 16.0, - 10.0, - 12.0, - 3.0, - 14.0, - 11.0, - 12.0, - 5.0, - 14.0, - 20.0, - 16.0, - 15.0, - 11.0, - ], - "f3": [ - "a", - "b", - "a", - "c", - "c", - "b", - "b", - "b", - "b", - "a", - "b", - 1.0, - 1.0, - "a", - ], - "f4": [ - True, - True, - False, - True, - True, - False, - False, - False, - True, - True, - False, - False, - True, - True, - ], - } - ) - y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - - automl = AutoML() - automl_settings = { - "time_budget": 6, - "task": "classification", - "n_jobs": 1, - "estimator_list": ["catboost", "lrl2"], - "eval_method": "cv", - "n_splits": 3, - "metric": "accuracy", - "log_training_metric": True, - "verbose": 4, - "ensemble": True, - } - automl.fit(X, y, **automl_settings) - - automl = AutoML() - automl_settings = { - "time_budget": 2, - "task": "classification", - "n_jobs": 1, - "estimator_list": ["lrl2", "kneighbor"], - "eval_method": "cv", - "n_splits": 3, - "metric": "accuracy", - "log_training_metric": True, - "verbose": 4, - "ensemble": True, - } - automl.fit(X, y, **automl_settings) - - automl = AutoML() - automl_settings = { - "time_budget": 3, - "task": "classification", - "n_jobs": 1, - "estimator_list": ["xgboost", "catboost", "kneighbor"], - "eval_method": "cv", - "n_splits": 3, - "metric": "accuracy", - "log_training_metric": True, - "verbose": 4, - "ensemble": True, - } - automl.fit(X, y, **automl_settings) - - automl = AutoML() - automl_settings = { - "time_budget": 3, - "task": "classification", - "n_jobs": 1, - "estimator_list": ["lgbm", "catboost", "kneighbor"], - "eval_method": "cv", - "n_splits": 3, - "metric": "accuracy", - "log_training_metric": True, - "verbose": 4, - "ensemble": True, - } - automl.fit(X, y, **automl_settings) - def test_dataframe(self): self.test_classification(True) @@ -348,20 +217,6 @@ class TestAutoML(unittest.TestCase): ) print(metric_history) - def test_binary(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 1, - "task": "binary", - "log_file_name": "test/breast_cancer.log", - 
"log_training_metric": True, - "n_jobs": 1, - "model_history": True, - } - X_train, y_train = load_breast_cancer(return_X_y=True) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - _ = automl_experiment.predict(X_train) - def test_classification(self, as_frame=False): automl_experiment = AutoML() automl_settings = { @@ -401,47 +256,6 @@ class TestAutoML(unittest.TestCase): print(automl_experiment.model) print(automl_experiment.predict_proba(X_train)[:5]) - def test_datetime_columns(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 2, - "log_file_name": "test/datetime_columns.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - } - fake_df = pd.DataFrame( - { - "A": [ - datetime(1900, 2, 3), - datetime(1900, 3, 4), - datetime(1900, 3, 4), - datetime(1900, 3, 4), - datetime(1900, 7, 2), - datetime(1900, 8, 9), - ], - "B": [ - datetime(1900, 1, 1), - datetime(1900, 1, 1), - datetime(1900, 1, 1), - datetime(1900, 1, 1), - datetime(1900, 1, 1), - datetime(1900, 1, 1), - ], - "year_A": [ - datetime(1900, 1, 2), - datetime(1900, 8, 1), - datetime(1900, 1, 4), - datetime(1900, 6, 1), - datetime(1900, 1, 5), - datetime(1900, 4, 1), - ], - } - ) - y = np.array([0, 1, 0, 1, 0, 0]) - automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings) - _ = automl_experiment.predict(fake_df) - def test_micro_macro_f1(self): automl_experiment_micro = AutoML() automl_experiment_macro = AutoML() @@ -501,50 +315,6 @@ class TestAutoML(unittest.TestCase): X_train, y_train = load_iris(return_X_y=True) automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - def test_regression(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 2, - "task": "regression", - "log_file_name": "test/california.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - } - X_train, y_train = fetch_california_housing(return_X_y=True) - n = int(len(y_train) * 9 // 10) - automl_experiment.fit( - X_train=X_train[:n], - y_train=y_train[:n], - X_val=X_train[n:], - y_val=y_train[n:], - **automl_settings - ) - assert automl_experiment._state.eval_method == "holdout" - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - print(get_output_from_log(automl_settings["log_file_name"], 1)) - automl_experiment.retrain_from_log( - task="regression", - log_file_name=automl_settings["log_file_name"], - X_train=X_train, - y_train=y_train, - train_full=True, - time_budget=1, - ) - automl_experiment.retrain_from_log( - task="regression", - log_file_name=automl_settings["log_file_name"], - X_train=X_train, - y_train=y_train, - train_full=True, - time_budget=0, - ) - def test_sparse_matrix_classification(self): automl_experiment = AutoML() automl_settings = { @@ -567,236 +337,51 @@ class TestAutoML(unittest.TestCase): print(automl_experiment.best_iteration) print(automl_experiment.best_estimator) - def test_sparse_matrix_regression(self): - X_train = scipy.sparse.random(300, 900, density=0.0001) - y_train = np.random.uniform(size=300) - X_val = scipy.sparse.random(100, 900, density=0.0001) - y_val = np.random.uniform(size=100) - automl_experiment = AutoML() - automl_settings = { - "time_budget": 2, - "metric": "mae", - "task": "regression", - "log_file_name": "test/sparse_regression.log", - "n_jobs": 1, - "model_history": 
True, - "keep_search_state": True, - "verbose": 0, - "early_stop": True, - } - automl_experiment.fit( - X_train=X_train, - y_train=y_train, - X_val=X_val, - y_val=y_val, - **automl_settings - ) - assert automl_experiment._state.X_val.shape == X_val.shape - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - print(automl_experiment.best_config) - print(automl_experiment.best_loss) - print(automl_experiment.best_config_train_time) - - def test_sparse_matrix_xgboost(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 3, - "metric": "ap", - "task": "classification", - "log_file_name": "test/sparse_classification.log", - "estimator_list": ["xgboost"], - "log_type": "all", - "n_jobs": 1, - } - X_train = scipy.sparse.eye(900000) - y_train = np.random.randint(2, size=900000) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - - def test_parallel(self, hpo_method=None): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 10, - "task": "regression", - "log_file_name": "test/california.log", - "log_type": "all", - "n_jobs": 1, - "n_concurrent_trials": 10, - "hpo_method": hpo_method, - } - X_train, y_train = fetch_california_housing(return_X_y=True) - try: - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - except ImportError: - return - - def test_parallel_classification(self): - from sklearn.datasets import make_classification - - X, y = make_classification(1000, 10) - automl = AutoML() - try: - automl.fit( - X, y, time_budget=10, task="classification", n_concurrent_trials=2 - ) - except ImportError: - return - - def test_parallel_xgboost(self, hpo_method=None): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 10, - "metric": "ap", - "task": "classification", - "log_file_name": "test/sparse_classification.log", - "estimator_list": ["xgboost"], - "log_type": "all", - "n_jobs": 1, - "n_concurrent_trials": 2, - "hpo_method": hpo_method, - } - X_train = scipy.sparse.eye(900000) - y_train = np.random.randint(2, size=900000) - try: - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - except ImportError: - return - - def test_parallel_xgboost_others(self): - # use random search as the hpo_method - self.test_parallel_xgboost(hpo_method="random") - - def test_random_out_of_memory(self): + def _test_memory_limit(self): automl_experiment = AutoML() automl_experiment.add_learner( learner_name="large_lgbm", learner_class=MyLargeLGBM ) automl_settings = { - "time_budget": 2, - "metric": "ap", + "time_budget": None, "task": "classification", - "log_file_name": 
"test/sparse_classification_oom.log", + "log_file_name": "test/classification_oom.log", "estimator_list": ["large_lgbm"], "log_type": "all", - "n_jobs": 1, - "n_concurrent_trials": 2, "hpo_method": "random", } + X_train, y_train = load_iris(return_X_y=True, as_frame=True) - X_train = scipy.sparse.eye(900000) - y_train = np.random.randint(2, size=900000) - try: - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - except ImportError: - return - - def test_sparse_matrix_lr(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 2, - "metric": "f1", - "task": "classification", - "log_file_name": "test/sparse_classification.log", - "estimator_list": ["lrl1", "lrl2"], - "log_type": "all", - "n_jobs": 1, - } - X_train = scipy.sparse.random(3000, 900, density=0.1) - y_train = np.random.randint(2, size=3000) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - - def test_sparse_matrix_regression_holdout(self): - X_train = scipy.sparse.random(8, 100) - y_train = np.random.uniform(size=8) - automl_experiment = AutoML() - automl_settings = { - "time_budget": 1, - "eval_method": "holdout", - "task": "regression", - "log_file_name": "test/sparse_regression.log", - "n_jobs": 1, - "model_history": True, - "metric": "mse", - "sample_weight": np.ones(len(y_train)), - "early_stop": True, - } - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - - def test_regression_xgboost(self): - X_train = scipy.sparse.random(300, 900, density=0.0001) - y_train = np.random.uniform(size=300) - X_val = scipy.sparse.random(100, 900, density=0.0001) - y_val = np.random.uniform(size=100) - automl_experiment = AutoML() - automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1) - automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2) - automl_settings = { - "time_budget": 2, - "estimator_list": ["my_xgb1", "my_xgb2"], - "task": "regression", - "log_file_name": "test/regression_xgboost.log", - "n_jobs": 1, - "model_history": True, - "keep_search_state": True, - "early_stop": True, - } automl_experiment.fit( - X_train=X_train, - y_train=y_train, - X_val=X_val, - y_val=y_val, - **automl_settings + X_train=X_train, y_train=y_train, max_iter=1, **automl_settings ) - assert automl_experiment._state.X_val.shape == X_val.shape - print(automl_experiment.predict(X_train)) print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.model_history) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - print(automl_experiment.best_config) - print(automl_experiment.best_loss) - print(automl_experiment.best_config_train_time) + + def test_time_limit(self): + automl_experiment = AutoML() + automl_experiment.add_learner( + 
learner_name="large_lgbm", learner_class=MyLargeLGBM + ) + automl_experiment.add_learner( + learner_name="large_xgb", learner_class=MyLargeXGB + ) + automl_settings = { + "time_budget": 0.5, + "task": "classification", + "log_file_name": "test/classification_timeout.log", + "estimator_list": ["catboost"], + "log_type": "all", + "hpo_method": "random", + } + X_train, y_train = load_iris(return_X_y=True, as_frame=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.model.params) + automl_settings["estimator_list"] = ["large_xgb"] + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.model) + automl_settings["estimator_list"] = ["large_lgbm"] + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.model) def test_fit_w_starting_point(self, as_frame=True): automl_experiment = AutoML() diff --git a/test/test_notebook_example.py b/test/test_notebook_example.py index e3fc207615..7ef6722969 100644 --- a/test/test_notebook_example.py +++ b/test/test_notebook_example.py @@ -60,7 +60,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None): valid_loss_history, config_history, metric_history, - ) = get_output_from_log(filename=settings["log_file_name"], time_budget=60) + ) = get_output_from_log(filename=settings["log_file_name"], time_budget=6) for config in config_history: print(config) print(automl.prune_attr) diff --git a/test/test_python_log.py b/test/test_python_log.py index 20ac61c18c..73581d5392 100644 --- a/test/test_python_log.py +++ b/test/test_python_log.py @@ -113,3 +113,9 @@ class TestLogging(unittest.TestCase): with open("automl.pkl", "wb") as f: pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL) print(automl.__version__) + pred1 = automl.predict(X_train) + with open("automl.pkl", "rb") as f: + automl = pickle.load(f) + pred2 = automl.predict(X_train) + delta = pred1 - pred2 + assert max(delta) == 0 and min(delta) == 0 diff --git a/test/test_regression.py b/test/test_regression.py new file mode 100644 index 0000000000..e33110bf2a --- /dev/null +++ b/test/test_regression.py @@ -0,0 +1,221 @@ +import unittest +import numpy as np +import scipy.sparse +from sklearn.datasets import ( + fetch_california_housing, +) + +from flaml import AutoML +from flaml.data import get_output_from_log +from flaml.model import XGBoostEstimator + + +def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight + grad = preds - labels + hess = preds * (1.0 - preds) + return grad, hess + + +class MyXGB1(XGBoostEstimator): + """XGBoostEstimator with logregobj as the objective function""" + + def __init__(self, **config): + super().__init__(objective=logregobj, **config) + + +class MyXGB2(XGBoostEstimator): + """XGBoostEstimator with 'reg:squarederror' as the objective function""" + + def __init__(self, **config): + super().__init__(objective="reg:squarederror", **config) + + +class TestRegression(unittest.TestCase): + def test_regression(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "task": "regression", + "log_file_name": "test/california.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + } + X_train, y_train = fetch_california_housing(return_X_y=True) + n = int(len(y_train) * 9 // 10) + automl_experiment.fit( + X_train=X_train[:n], + y_train=y_train[:n], + X_val=X_train[n:], + y_val=y_train[n:], + **automl_settings + ) + 
assert automl_experiment._state.eval_method == "holdout" + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + print(get_output_from_log(automl_settings["log_file_name"], 1)) + automl_experiment.retrain_from_log( + task="regression", + log_file_name=automl_settings["log_file_name"], + X_train=X_train, + y_train=y_train, + train_full=True, + time_budget=1, + ) + automl_experiment.retrain_from_log( + task="regression", + log_file_name=automl_settings["log_file_name"], + X_train=X_train, + y_train=y_train, + train_full=True, + time_budget=0, + ) + + def test_sparse_matrix_classification(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": "auto", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "split_type": "uniform", + "n_jobs": 1, + "model_history": True, + } + X_train = scipy.sparse.random(1554, 21, dtype=int) + y_train = np.random.randint(3, size=1554) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.predict_proba(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + def test_sparse_matrix_regression(self): + X_train = scipy.sparse.random(300, 900, density=0.0001) + y_train = np.random.uniform(size=300) + X_val = scipy.sparse.random(100, 900, density=0.0001) + y_val = np.random.uniform(size=100) + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": "mae", + "task": "regression", + "log_file_name": "test/sparse_regression.log", + "n_jobs": 1, + "model_history": True, + "keep_search_state": True, + "verbose": 0, + "early_stop": True, + } + automl_experiment.fit( + X_train=X_train, + y_train=y_train, + X_val=X_val, + y_val=y_val, + **automl_settings + ) + assert automl_experiment._state.X_val.shape == X_val.shape + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + print(automl_experiment.best_config) + print(automl_experiment.best_loss) + print(automl_experiment.best_config_train_time) + + def test_parallel(self, hpo_method=None): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 10, + "task": "regression", + "log_file_name": "test/california.log", + "log_type": "all", + "n_jobs": 1, + "n_concurrent_trials": 10, + "hpo_method": hpo_method, + } + X_train, y_train = fetch_california_housing(return_X_y=True) + try: + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + except ImportError: + return + + def test_sparse_matrix_regression_holdout(self): + X_train = scipy.sparse.random(8, 100) + y_train = np.random.uniform(size=8) + automl_experiment = AutoML() + automl_settings = { + "time_budget": 1, + "eval_method": "holdout", + "task": "regression", + "log_file_name": 
"test/sparse_regression.log", + "n_jobs": 1, + "model_history": True, + "metric": "mse", + "sample_weight": np.ones(len(y_train)), + "early_stop": True, + } + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + def test_regression_xgboost(self): + X_train = scipy.sparse.random(300, 900, density=0.0001) + y_train = np.random.uniform(size=300) + X_val = scipy.sparse.random(100, 900, density=0.0001) + y_val = np.random.uniform(size=100) + automl_experiment = AutoML() + automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1) + automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2) + automl_settings = { + "time_budget": 2, + "estimator_list": ["my_xgb1", "my_xgb2"], + "task": "regression", + "log_file_name": "test/regression_xgboost.log", + "n_jobs": 1, + "model_history": True, + "keep_search_state": True, + "early_stop": True, + } + automl_experiment.fit( + X_train=X_train, + y_train=y_train, + X_val=X_val, + y_val=y_val, + **automl_settings + ) + assert automl_experiment._state.X_val.shape == X_val.shape + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.model_history) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + print(automl_experiment.best_config) + print(automl_experiment.best_loss) + print(automl_experiment.best_config_train_time) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_training_log.py b/test/test_training_log.py index ac8fba1702..de1be2f351 100644 --- a/test/test_training_log.py +++ b/test/test_training_log.py @@ -30,6 +30,7 @@ class TestTrainingLog(unittest.TestCase): # "ensemble": True, "keep_search_state": True, "estimator_list": estimator_list, + "model_history": True, } X_train, y_train = fetch_california_housing(return_X_y=True) automl.fit(X_train=X_train, y_train=y_train, **automl_settings)