mirror of https://github.com/microsoft/autogen.git
limit time and memory consumption (#264)
* limit time and memory
* separate tests
* lrl1 can't be limited by limit_resource
* free memory when possible
* passthrough=False when ensemble fails; retrain when trained_estimator is None
* use callback for resource limit
* handle lower version of xgb with no callback
* free mem ratio
* reduce verbosity
* retrain_final when max_iter==1
* remove trained_estimator from result
* model_history
* wheel
* retrain time as best_config_train_time
* ci: libomp version for xgboost on macos
* limit_resource not working in windows
* test pickle load
* mute forecaster
* notebook update
* check hard
* preventive callback
* add use_ray
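As a quick illustration of the new `use_ray` option (a hedged sketch; the dataset and time budget are illustrative, not taken from this commit):

.. code-block:: python

    # Hedged sketch: use_ray=True runs each trial in a separate ray process,
    # which can prevent OOM for large datasets at some time overhead.
    # Requires `pip install flaml[ray]`, as the error message in this diff notes.
    from sklearn.datasets import load_iris
    from flaml import AutoML

    X_train, y_train = load_iris(return_X_y=True)
    automl = AutoML()
    automl.fit(
        X_train=X_train,
        y_train=y_train,
        task="classification",
        time_budget=10,
        use_ray=True,
    )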
This commit is contained in:
parent 6c66cd67f7
commit 549a0dfb53
@@ -24,9 +24,11 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: If mac, install libomp to facilitate lgbm install
- name: If mac, install libomp to facilitate lgbm and xgboost install
if: matrix.os == 'macOS-latest'
run: |
# remove libomp version constraint after xgboost works with libomp>11.1.0
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
brew install libomp
export CC=/usr/bin/clang
export CXX=/usr/bin/clang++

@@ -36,7 +38,7 @@ jobs:
export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
- name: Install packages and dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade pip wheel
pip install -e .[test]
- name: If linux or mac, install ray
if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'

@@ -65,7 +67,7 @@ jobs:
with:
file: ./coverage.xml
flags: unittests

docs:

runs-on: ubuntu-latest
150 flaml/automl.py
|
@ -248,7 +248,7 @@ class AutoMLState:
|
|||
"wall_clock_time": time.time() - self._start_time_flag,
|
||||
"metric_for_logging": metric_for_logging,
|
||||
"val_loss": val_loss,
|
||||
"trained_estimator": trained_estimator,
|
||||
"trained_estimator": trained_estimator if self.save_model_history else None,
|
||||
}
|
||||
if sampled_weight is not None:
|
||||
self.fit_kwargs["sample_weight"] = weight
|
||||
|
@@ -403,9 +403,10 @@ class AutoML:

    @property
    def best_config_train_time(self):
        """A float of the seconds taken by training the
        best config."""
        return self._search_states[self._best_estimator].best_config_train_time
        """A float of the seconds taken by training the best config."""
        return getattr(
            self._search_states[self._best_estimator], "best_config_train_time", None
        )

    @property
    def classes_(self):

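Continuing the fit() sketch near the top of this page (hedged, illustrative), the reworked property now degrades gracefully:

.. code-block:: python

    # Prints the seconds spent training the best config, or None if no
    # estimator was trained (e.g. the time budget was exhausted first).
    print(automl.best_config_train_time)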
@ -529,8 +530,9 @@ class AutoML:
|
|||
self._nrow, self._ndim = X_train_all.shape
|
||||
if self._state.task == TS_FORECAST:
|
||||
X_train_all = pd.DataFrame(X_train_all)
|
||||
assert X_train_all[X_train_all.columns[0]].dtype.name == 'datetime64[ns]', (
|
||||
f"For '{TS_FORECAST}' task, the first column must contain timestamp values.")
|
||||
assert (
|
||||
X_train_all[X_train_all.columns[0]].dtype.name == "datetime64[ns]"
|
||||
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
|
||||
X, y = X_train_all, y_train_all
|
||||
elif dataframe is not None and label is not None:
|
||||
assert isinstance(
|
||||
|
@ -539,8 +541,9 @@ class AutoML:
|
|||
assert label in dataframe.columns, "label must a column name in dataframe"
|
||||
self._df = True
|
||||
if self._state.task == TS_FORECAST:
|
||||
assert dataframe[dataframe.columns[0]].dtype.name == 'datetime64[ns]', (
|
||||
f"For '{TS_FORECAST}' task, the first column must contain timestamp values.")
|
||||
assert (
|
||||
dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
|
||||
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
|
||||
X = dataframe.drop(columns=label)
|
||||
self._nrow, self._ndim = X.shape
|
||||
y = dataframe[label]
|
||||
|
@ -584,7 +587,9 @@ class AutoML:
|
|||
else:
|
||||
self._state.X_val = X_val
|
||||
if self._label_transformer:
|
||||
self._state.y_val = self._label_transformer.transform(y_val, self._state.task)
|
||||
self._state.y_val = self._label_transformer.transform(
|
||||
y_val, self._state.task
|
||||
)
|
||||
else:
|
||||
self._state.y_val = y_val
|
||||
else:
|
||||
|
@ -1064,7 +1069,8 @@ class AutoML:
|
|||
return "holdout"
|
||||
nrow, dim = self._nrow, self._ndim
|
||||
if (
|
||||
nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600)
|
||||
time_budget is None
|
||||
or nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600)
|
||||
and nrow < CV_HOLDOUT_THRESHOLD
|
||||
):
|
||||
# time allows or sampling can be used and cv is necessary
|
||||
|
@ -1301,6 +1307,7 @@ class AutoML:
|
|||
append_log=False,
|
||||
auto_augment=True,
|
||||
min_sample_size=MIN_SAMPLE_TRAIN,
|
||||
use_ray=False,
|
||||
**fit_kwargs,
|
||||
):
|
||||
"""Find a model for a given task
|
||||
|
@ -1414,7 +1421,9 @@ class AutoML:
|
|||
In the following code example, we get starting_points from the
|
||||
automl_experiment and use them in the new_automl_experiment.
|
||||
e.g.,
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from flaml import AutoML
|
||||
automl_experiment = AutoML()
|
||||
X_train, y_train = load_iris(return_X_y=True)
|
||||
|
@ -1440,6 +1449,10 @@ class AutoML:
|
|||
augment rare classes.
|
||||
min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
|
||||
size when sample=True.
|
||||
use_ray: boolean, default=False | Whether to use ray to run the training
|
||||
in separate processes. This can be used to prevent OOM for large
|
||||
datasets, but will incur more overhead in time. Only use it if
|
||||
you run into OOM failures.
|
||||
**fit_kwargs: Other key word arguments to pass to fit() function of
|
||||
the searched learners, such as sample_weight. Include period as
|
||||
a key word argument for 'ts_forecast' task.
|
||||
|
@ -1483,8 +1496,10 @@ class AutoML:
|
|||
)
|
||||
self._retrain_final = (
|
||||
retrain_full is True
|
||||
and (eval_method == "holdout" and self._state.X_val is None)
|
||||
or (eval_method == "cv")
|
||||
and eval_method == "holdout"
|
||||
and self._state.X_val is None
|
||||
or eval_method == "cv"
|
||||
or max_iter == 1
|
||||
)
|
||||
self._auto_augment = auto_augment
|
||||
self._min_sample_size = min_sample_size
|
||||
|
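Since Python's `and` binds tighter than `or`, the new expression above groups as in this hedged restatement (an illustrative helper, not part of the diff):

.. code-block:: python

    def should_retrain_final(retrain_full, eval_method, X_val, max_iter):
        # Equivalent grouping of the condition assigned to self._retrain_final:
        # retrain on the full data when holdout is used without a user-provided
        # validation set, when cross-validation is used, or when max_iter == 1.
        return (
            (retrain_full is True and eval_method == "holdout" and X_val is None)
            or eval_method == "cv"
            or max_iter == 1
        )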
@ -1564,7 +1579,7 @@ class AutoML:
|
|||
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
|
||||
self.estimator_list = estimator_list
|
||||
self._hpo_method = hpo_method or ("cfo" if n_concurrent_trials == 1 else "bs")
|
||||
self._state.time_budget = time_budget
|
||||
self._state.time_budget = time_budget or 1e10
|
||||
self._active_estimators = estimator_list.copy()
|
||||
self._ensemble = ensemble
|
||||
self._max_iter = max_iter
|
||||
|
@ -1573,10 +1588,11 @@ class AutoML:
|
|||
self._state.train_time_limit = train_time_limit
|
||||
self._log_type = log_type
|
||||
self.split_ratio = split_ratio
|
||||
self._save_model_history = model_history
|
||||
self._state.save_model_history = model_history
|
||||
self._state.n_jobs = n_jobs
|
||||
self._n_concurrent_trials = n_concurrent_trials
|
||||
self._early_stop = early_stop
|
||||
self._use_ray = use_ray or self._n_concurrent_trials > 1
|
||||
if log_file_name:
|
||||
with training_log_writer(log_file_name, append_log) as save_helper:
|
||||
self._training_log = save_helper
|
||||
|
@ -1627,7 +1643,7 @@ class AutoML:
|
|||
from ray.tune.suggest import ConcurrencyLimiter
|
||||
except (ImportError, AssertionError):
|
||||
raise ImportError(
|
||||
"n_concurrent_trial > 1 requires installation of ray. "
|
||||
"n_concurrent_trial>1 or use_ray=True requires installation of ray. "
|
||||
"Please run pip install flaml[ray]"
|
||||
)
|
||||
if self._hpo_method in ("cfo", "grid"):
|
||||
|
@ -1693,7 +1709,8 @@ class AutoML:
|
|||
resources_per_trial=resources_per_trial,
|
||||
time_budget_s=self._state.time_budget,
|
||||
num_samples=self._max_iter,
|
||||
verbose=self.verbose,
|
||||
verbose=max(self.verbose - 3, 0),
|
||||
raise_on_failed_trial=False,
|
||||
)
|
||||
# logger.info([trial.last_result for trial in analysis.trials])
|
||||
trials = sorted(
|
||||
|
@ -1712,7 +1729,7 @@ class AutoML:
|
|||
config = result["config"]
|
||||
estimator = config.get("ml", config)["learner"]
|
||||
search_state = self._search_states[estimator]
|
||||
search_state.update(result, 0, self._save_model_history)
|
||||
search_state.update(result, 0, self._state.save_model_history)
|
||||
if result["wall_clock_time"] is not None:
|
||||
self._state.time_from_start = result["wall_clock_time"]
|
||||
if search_state.sample_size == self._state.data_size:
|
||||
|
@ -1727,7 +1744,7 @@ class AutoML:
|
|||
config,
|
||||
self._time_taken_best_iter,
|
||||
)
|
||||
if self._save_model_history:
|
||||
if self._state.save_model_history:
|
||||
self._model_history[
|
||||
_track_iter
|
||||
] = search_state.trained_estimator
|
||||
|
@ -1902,7 +1919,7 @@ class AutoML:
|
|||
search_state.update(
|
||||
result,
|
||||
time_used=time_used,
|
||||
save_model_history=self._save_model_history,
|
||||
save_model_history=self._state.save_model_history,
|
||||
)
|
||||
if self._estimator_index is None:
|
||||
# update init eci estimate
|
||||
|
@ -1945,18 +1962,27 @@ class AutoML:
|
|||
search_state.best_config,
|
||||
self._state.time_from_start,
|
||||
)
|
||||
if self._save_model_history:
|
||||
if self._state.save_model_history:
|
||||
self._model_history[
|
||||
self._track_iter
|
||||
] = search_state.trained_estimator
|
||||
elif self._trained_estimator:
|
||||
del self._trained_estimator
|
||||
self._trained_estimator = None
|
||||
self._trained_estimator = search_state.trained_estimator
|
||||
if not self._retrain_final:
|
||||
self._trained_estimator = search_state.trained_estimator
|
||||
self._best_iteration = self._track_iter
|
||||
self._time_taken_best_iter = self._state.time_from_start
|
||||
better = True
|
||||
next_trial_time = search_state.time2eval_best
|
||||
if search_state.trained_estimator and not (
|
||||
self._state.save_model_history or self._ensemble
|
||||
):
|
||||
# free RAM
|
||||
if search_state.trained_estimator != self._trained_estimator:
|
||||
search_state.trained_estimator.cleanup()
|
||||
del search_state.trained_estimator
|
||||
search_state.trained_estimator = None
|
||||
if better or self._log_type == "all":
|
||||
if self._training_log:
|
||||
self._training_log.append(
|
||||
|
@ -2049,7 +2075,9 @@ class AutoML:
|
|||
logger.info(
|
||||
"retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
|
||||
)
|
||||
self._retrained_config[best_config_sig] = retrain_time
|
||||
self._retrained_config[
|
||||
best_config_sig
|
||||
] = state.best_config_train_time = retrain_time
|
||||
est_retrain_time = 0
|
||||
self._state.time_from_start = time.time() - self._start_time_flag
|
||||
if (
|
||||
|
@ -2083,7 +2111,7 @@ class AutoML:
|
|||
self._selected = None
|
||||
self.modelcount = 0
|
||||
|
||||
if self._n_concurrent_trials == 1:
|
||||
if not self._use_ray:
|
||||
self._search_sequential()
|
||||
else:
|
||||
self._search_parallel()
|
||||
|
@ -2103,12 +2131,29 @@ class AutoML:
|
|||
"regression",
|
||||
):
|
||||
search_states = list(
|
||||
x for x in self._search_states.items() if x[1].trained_estimator
|
||||
x for x in self._search_states.items() if x[1].best_config
|
||||
)
|
||||
search_states.sort(key=lambda x: x[1].best_loss)
|
||||
estimators = [(x[0], x[1].trained_estimator) for x in search_states[:2]]
|
||||
estimators = [
|
||||
(
|
||||
x[0],
|
||||
x[1].learner_class(
|
||||
task=self._state.task,
|
||||
n_jobs=self._state.n_jobs,
|
||||
**x[1].best_config,
|
||||
),
|
||||
)
|
||||
for x in search_states[:2]
|
||||
]
|
||||
estimators += [
|
||||
(x[0], x[1].trained_estimator)
|
||||
(
|
||||
x[0],
|
||||
x[1].learner_class(
|
||||
task=self._state.task,
|
||||
n_jobs=self._state.n_jobs,
|
||||
**x[1].best_config,
|
||||
),
|
||||
)
|
||||
for x in search_states[2:]
|
||||
if x[1].best_loss < 4 * self._selected.best_loss
|
||||
]
|
||||
|
@ -2135,19 +2180,49 @@ class AutoML:
|
|||
)
|
||||
if self._sample_weight_full is not None:
|
||||
self._state.fit_kwargs["sample_weight"] = self._sample_weight_full
|
||||
stacker.fit(
|
||||
self._X_train_all, self._y_train_all, **self._state.fit_kwargs
|
||||
)
|
||||
logger.info(f"ensemble: {stacker}")
|
||||
self._trained_estimator = stacker
|
||||
self._trained_estimator.model = stacker
|
||||
for e in estimators:
|
||||
e[1].__class__.init()
|
||||
try:
|
||||
stacker.fit(
|
||||
self._X_train_all, self._y_train_all, **self._state.fit_kwargs
|
||||
)
|
||||
logger.info(f"ensemble: {stacker}")
|
||||
self._trained_estimator = stacker
|
||||
self._trained_estimator.model = stacker
|
||||
except ValueError as e:
|
||||
if passthrough:
|
||||
logger.warning(
|
||||
"Using passthrough=False for ensemble because the data contain categorical features."
|
||||
)
|
||||
stacker = Stacker(
|
||||
estimators,
|
||||
final_estimator,
|
||||
n_jobs=self._state.n_jobs,
|
||||
passthrough=False,
|
||||
)
|
||||
stacker.fit(
|
||||
self._X_train_all,
|
||||
self._y_train_all,
|
||||
**self._state.fit_kwargs,
|
||||
)
|
||||
logger.info(f"ensemble: {stacker}")
|
||||
self._trained_estimator = stacker
|
||||
self._trained_estimator.model = stacker
|
||||
else:
|
||||
raise e
|
||||
elif self._retrain_final:
|
||||
# reset time budget for retraining
|
||||
self._state.time_from_start -= self._state.time_budget
|
||||
if self._state.task == TS_FORECAST or (
|
||||
self._state.time_budget - self._state.time_from_start
|
||||
> self._selected.est_retrain_time(self.data_size_full)
|
||||
and self._selected.best_config_sample_size == self._state.data_size
|
||||
if self._max_iter > 1:
|
||||
self._state.time_from_start -= self._state.time_budget
|
||||
if (
|
||||
self._state.task == TS_FORECAST
|
||||
or self._trained_estimator is None
|
||||
or (
|
||||
self._state.time_budget - self._state.time_from_start
|
||||
> self._selected.est_retrain_time(self.data_size_full)
|
||||
and self._selected.best_config_sample_size
|
||||
== self._state.data_size
|
||||
)
|
||||
):
|
||||
state = self._search_states[self._best_estimator]
|
||||
(
|
||||
|
@ -2163,6 +2238,7 @@ class AutoML:
|
|||
self._best_estimator, retrain_time
|
||||
)
|
||||
)
|
||||
state.best_config_train_time = retrain_time
|
||||
if self._trained_estimator:
|
||||
logger.info(f"retrained model: {self._trained_estimator.model}")
|
||||
else:
|
||||
|
|
|
@ -275,9 +275,8 @@ class DataTransformer:
|
|||
X[column] = X[column].map(datetime.toordinal)
|
||||
datetime_columns.append(column)
|
||||
del tmp_dt
|
||||
else:
|
||||
X[column] = X[column].fillna(np.nan)
|
||||
num_columns.append(column)
|
||||
X[column] = X[column].fillna(np.nan)
|
||||
num_columns.append(column)
|
||||
X = X[cat_columns + num_columns]
|
||||
if task == TS_FORECAST:
|
||||
X.insert(0, TS_TIMESTAMP_COL, ds_col)
|
||||
|
|
495 flaml/model.py
|
@ -2,20 +2,67 @@
|
|||
* Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License.
|
||||
"""
|
||||
|
||||
from contextlib import contextmanager
|
||||
from functools import partial
|
||||
import signal
|
||||
import os
|
||||
from typing import Callable, List
|
||||
import numpy as np
|
||||
import time
|
||||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
||||
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||||
from scipy.sparse import issparse
|
||||
import pandas as pd
|
||||
from . import tune
|
||||
from .data import group_counts, CLASSIFICATION, TS_FORECAST, TS_TIMESTAMP_COL, TS_VALUE_COL
|
||||
|
||||
import logging
|
||||
from . import tune
|
||||
from .data import (
|
||||
group_counts,
|
||||
CLASSIFICATION,
|
||||
TS_FORECAST,
|
||||
TS_TIMESTAMP_COL,
|
||||
TS_VALUE_COL,
|
||||
)
|
||||
|
||||
try:
|
||||
import psutil
|
||||
except ImportError:
|
||||
psutil = None
|
||||
try:
|
||||
import resource
|
||||
except ImportError:
|
||||
resource = None
|
||||

logger = logging.getLogger("flaml.automl")
FREE_MEM_RATIO = 0.2


def TimeoutHandler(sig, frame):
    raise TimeoutError(sig, frame)


@contextmanager
def limit_resource(memory_limit, time_limit):
    if memory_limit > 0:
        soft, hard = resource.getrlimit(resource.RLIMIT_AS)
        if soft < 0 and (hard < 0 or memory_limit <= hard) or memory_limit < soft:
            resource.setrlimit(resource.RLIMIT_AS, (memory_limit, hard))
    main_thread = False
    if time_limit is not None:
        try:
            signal.signal(signal.SIGALRM, TimeoutHandler)
            signal.alarm(int(time_limit) or 1)
            main_thread = True
        except ValueError:
            pass
    try:
        yield
    finally:
        if main_thread:
            signal.alarm(0)
        if memory_limit > 0:
            resource.setrlimit(resource.RLIMIT_AS, (soft, hard))

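A hedged usage sketch of the pattern BaseEstimator.fit applies with this context manager (POSIX-only, since it relies on the resource and signal modules; the model, data, and limits below are illustrative):

.. code-block:: python

    # Cap address space (bytes) and wall-clock time (seconds) for one
    # training call; a breach surfaces as MemoryError or TimeoutError.
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from flaml.model import limit_resource  # defined in this diff

    X, y = load_iris(return_X_y=True)
    try:
        with limit_resource(memory_limit=2 * 1024 ** 3, time_limit=60):
            LogisticRegression().fit(X, y)
    except (MemoryError, TimeoutError) as e:
        print(f"training aborted: {e!r}")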
class BaseEstimator:
|
||||
|
@ -112,7 +159,35 @@ class BaseEstimator:
|
|||
Returns:
|
||||
train_time: A float of the training time in seconds
|
||||
"""
|
||||
return self._fit(X_train, y_train, **kwargs)
|
||||
if (
|
||||
getattr(self, "limit_resource", None)
|
||||
and resource is not None
|
||||
and (budget is not None or psutil is not None)
|
||||
):
|
||||
start_time = time.time()
|
||||
mem = psutil.virtual_memory() if psutil is not None else None
|
||||
try:
|
||||
with limit_resource(
|
||||
mem.available * (1 - FREE_MEM_RATIO)
|
||||
+ psutil.Process(os.getpid()).memory_info().rss
|
||||
if mem is not None
|
||||
else -1,
|
||||
budget,
|
||||
):
|
||||
train_time = self._fit(X_train, y_train, **kwargs)
|
||||
except (MemoryError, TimeoutError) as e:
|
||||
logger.warning(f"{e.__class__} {e}")
|
||||
if self._task in CLASSIFICATION:
|
||||
model = DummyClassifier()
|
||||
else:
|
||||
model = DummyRegressor()
|
||||
X_train = self._preprocess(X_train)
|
||||
model.fit(X_train, y_train)
|
||||
self._model = model
|
||||
train_time = time.time() - start_time
|
||||
else:
|
||||
train_time = self._fit(X_train, y_train, **kwargs)
|
||||
return train_time
|
||||
|
||||
def predict(self, X_test):
|
||||
"""Predict label from features
|
||||
|
@ -223,6 +298,9 @@ class SKLearnEstimator(BaseEstimator):
|
|||
|
||||
|
||||
class LGBMEstimator(BaseEstimator):
|
||||
ITER_HP = "n_estimators"
|
||||
HAS_CALLBACK = True
|
||||
|
||||
@classmethod
|
||||
def search_space(cls, data_size, **params):
|
||||
upper = min(32768, int(data_size))
|
||||
|
@ -297,6 +375,8 @@ class LGBMEstimator(BaseEstimator):
|
|||
self.estimator_class = LGBMClassifier
|
||||
self._time_per_iter = None
|
||||
self._train_size = 0
|
||||
self._mem_per_iter = 1
|
||||
self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None
|
||||
|
||||
def _preprocess(self, X):
|
||||
if (
|
||||
|
@ -316,50 +396,111 @@ class LGBMEstimator(BaseEstimator):
|
|||
|
||||
def fit(self, X_train, y_train, budget=None, **kwargs):
|
||||
start_time = time.time()
|
||||
n_iter = self.params["n_estimators"]
|
||||
deadline = start_time + budget if budget else np.inf
|
||||
n_iter = self.params[self.ITER_HP]
|
||||
trained = False
|
||||
if (
|
||||
(not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4)
|
||||
and budget is not None
|
||||
and n_iter > 1
|
||||
):
|
||||
self.params["n_estimators"] = 1
|
||||
self._t1 = self._fit(X_train, y_train, **kwargs)
|
||||
if self._t1 >= budget or n_iter == 1:
|
||||
# self.params["n_estimators"] = n_iter
|
||||
return self._t1
|
||||
self.params["n_estimators"] = min(n_iter, 4)
|
||||
self._t2 = self._fit(X_train, y_train, **kwargs)
|
||||
self._time_per_iter = (
|
||||
(self._t2 - self._t1) / (self.params["n_estimators"] - 1)
|
||||
if self._t2 > self._t1
|
||||
else self._t1
|
||||
if self._t1
|
||||
else 0.001
|
||||
)
|
||||
self._train_size = X_train.shape[0]
|
||||
if self._t1 + self._t2 >= budget or n_iter == self.params["n_estimators"]:
|
||||
# self.params["n_estimators"] = n_iter
|
||||
return time.time() - start_time
|
||||
trained = True
|
||||
if budget is not None and n_iter > 1:
|
||||
max_iter = min(
|
||||
n_iter,
|
||||
int(
|
||||
(budget - time.time() + start_time - self._t1) / self._time_per_iter
|
||||
+ 1
|
||||
),
|
||||
)
|
||||
if trained and max_iter <= self.params["n_estimators"]:
|
||||
return time.time() - start_time
|
||||
self.params["n_estimators"] = max_iter
|
||||
if self.params["n_estimators"] > 0:
|
||||
self._fit(X_train, y_train, **kwargs)
|
||||
if not self.HAS_CALLBACK:
|
||||
mem0 = psutil.virtual_memory().available if psutil is not None else 1
|
||||
if (
|
||||
(
|
||||
not self._time_per_iter
|
||||
or abs(self._train_size - X_train.shape[0]) > 4
|
||||
)
|
||||
and budget is not None
|
||||
or self._mem_per_iter <= 1
|
||||
and psutil is not None
|
||||
) and n_iter > 1:
|
||||
self.params[self.ITER_HP] = 1
|
||||
self._t1 = self._fit(X_train, y_train, **kwargs)
|
||||
if budget is not None and self._t1 >= budget or n_iter == 1:
|
||||
# self.params[self.ITER_HP] = n_iter
|
||||
return self._t1
|
||||
mem1 = psutil.virtual_memory().available if psutil is not None else 1
|
||||
self._mem1 = mem0 - mem1
|
||||
self.params[self.ITER_HP] = min(n_iter, 4)
|
||||
self._t2 = self._fit(X_train, y_train, **kwargs)
|
||||
mem2 = psutil.virtual_memory().available if psutil is not None else 1
|
||||
self._mem2 = max(mem0 - mem2, self._mem1)
|
||||
# if self._mem1 <= 0:
|
||||
# self._mem_per_iter = self._mem2 / (self.params[self.ITER_HP] + 1)
|
||||
# elif self._mem2 <= 0:
|
||||
# self._mem_per_iter = self._mem1
|
||||
# else:
|
||||
self._mem_per_iter = min(
|
||||
self._mem1, self._mem2 / self.params[self.ITER_HP]
|
||||
)
|
||||
if self._mem_per_iter <= 1 and psutil is not None:
|
||||
n_iter = self.params[self.ITER_HP]
|
||||
self._time_per_iter = (
|
||||
(self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
|
||||
if self._t2 > self._t1
|
||||
else self._t1
|
||||
if self._t1
|
||||
else 0.001
|
||||
)
|
||||
self._train_size = X_train.shape[0]
|
||||
if (
|
||||
budget is not None
|
||||
and self._t1 + self._t2 >= budget
|
||||
or n_iter == self.params[self.ITER_HP]
|
||||
):
|
||||
# self.params[self.ITER_HP] = n_iter
|
||||
return time.time() - start_time
|
||||
trained = True
|
||||
# logger.debug(mem0)
|
||||
# logger.debug(self._mem_per_iter)
|
||||
if n_iter > 1:
|
||||
max_iter = min(
|
||||
n_iter,
|
||||
int(
|
||||
(budget - time.time() + start_time - self._t1)
|
||||
/ self._time_per_iter
|
||||
+ 1
|
||||
)
|
||||
if budget is not None
|
||||
else n_iter,
|
||||
int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter)
|
||||
if psutil is not None
|
||||
else n_iter,
|
||||
)
|
||||
if trained and max_iter <= self.params[self.ITER_HP]:
|
||||
return time.time() - start_time
|
||||
self.params[self.ITER_HP] = max_iter
|
||||
if self.params[self.ITER_HP] > 0:
|
||||
if self.HAS_CALLBACK:
|
||||
self._fit(
|
||||
X_train, y_train, callbacks=self._callbacks(start_time, deadline), **kwargs
|
||||
)
|
||||
best_iteration = (
|
||||
self._model.get_booster().best_iteration
|
||||
if isinstance(self, XGBoostSklearnEstimator)
|
||||
else self._model.best_iteration_
|
||||
)
|
||||
if best_iteration is not None:
|
||||
self._model.set_params(n_estimators=best_iteration + 1)
|
||||
else:
|
||||
self._fit(X_train, y_train, **kwargs)
|
||||
else:
|
||||
self.params["n_estimators"] = self._model.n_estimators
|
||||
self.params[self.ITER_HP] = self._model.n_estimators
|
||||
train_time = time.time() - start_time
|
||||
return train_time

    def _callbacks(self, start_time, deadline) -> List[Callable]:
        return [partial(self._callback, start_time, deadline)]

    def _callback(self, start_time, deadline, env) -> None:
        from lightgbm.callback import EarlyStopException

        now = time.time()
        if env.iteration == 0:
            self._time_per_iter = now - start_time
        if now + self._time_per_iter > deadline:
            raise EarlyStopException(env.iteration, env.evaluation_result_list)
        if psutil is not None:
            mem = psutil.virtual_memory()
            if mem.available / mem.total < FREE_MEM_RATIO:
                raise EarlyStopException(env.iteration, env.evaluation_result_list)

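For context, a hedged standalone sketch of this LightGBM callback pattern outside of flaml (assuming a lightgbm version that exposes EarlyStopException, as the code above does; dataset and budget are illustrative):

.. code-block:: python

    # Stop boosting once the next iteration would overrun a wall-clock deadline.
    import time
    from functools import partial

    from lightgbm import LGBMClassifier
    from lightgbm.callback import EarlyStopException
    from sklearn.datasets import load_breast_cancer

    def stop_at_deadline(start_time, deadline, env):
        now = time.time()
        if env.iteration == 0:
            stop_at_deadline.time_per_iter = now - start_time
        if now + stop_at_deadline.time_per_iter > deadline:
            raise EarlyStopException(env.iteration, env.evaluation_result_list)

    X, y = load_breast_cancer(return_X_y=True)
    start = time.time()
    LGBMClassifier(n_estimators=1000).fit(
        X, y, callbacks=[partial(stop_at_deadline, start, start + 1)]
    )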
class XGBoostEstimator(SKLearnEstimator):
|
||||
"""not using sklearn API, used for regression"""
|
||||
|
@ -439,6 +580,7 @@ class XGBoostEstimator(SKLearnEstimator):
|
|||
import xgboost as xgb
|
||||
|
||||
start_time = time.time()
|
||||
deadline = start_time + budget if budget else np.inf
|
||||
if issparse(X_train):
|
||||
self.params["tree_method"] = "auto"
|
||||
else:
|
||||
|
@ -456,9 +598,20 @@ class XGBoostEstimator(SKLearnEstimator):
|
|||
if "objective" in self.params:
|
||||
del self.params["objective"]
|
||||
_n_estimators = self.params.pop("n_estimators")
|
||||
self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj)
|
||||
callbacks = XGBoostEstimator._callbacks(start_time, deadline)
|
||||
if callbacks:
|
||||
self._model = xgb.train(
|
||||
self.params,
|
||||
dtrain,
|
||||
_n_estimators,
|
||||
obj=obj,
|
||||
callbacks=callbacks,
|
||||
)
|
||||
self.params["n_estimators"] = self._model.best_iteration + 1
|
||||
else:
|
||||
self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj)
|
||||
self.params["n_estimators"] = _n_estimators
|
||||
self.params["objective"] = objective
|
||||
self.params["n_estimators"] = _n_estimators
|
||||
del dtrain
|
||||
train_time = time.time() - start_time
|
||||
return train_time
|
||||
|
@ -471,6 +624,28 @@ class XGBoostEstimator(SKLearnEstimator):
|
|||
dtest = xgb.DMatrix(X_test)
|
||||
return super().predict(dtest)

    @classmethod
    def _callbacks(cls, start_time, deadline):
        try:
            from xgboost.callback import TrainingCallback
        except ImportError:  # for xgboost<1.3
            return None

        class ResourceLimit(TrainingCallback):
            def after_iteration(self, model, epoch, evals_log) -> bool:
                now = time.time()
                if epoch == 0:
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return True
                if psutil is not None:
                    mem = psutil.virtual_memory()
                    if mem.available / mem.total < FREE_MEM_RATIO:
                        return True
                return False

        return [ResourceLimit()]

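A hedged standalone sketch of the same xgboost>=1.3 TrainingCallback mechanism (returning True from after_iteration stops boosting; the data and deadline are illustrative):

.. code-block:: python

    import time

    import numpy as np
    import xgboost as xgb
    from xgboost.callback import TrainingCallback

    class Deadline(TrainingCallback):
        def __init__(self, deadline):
            self.deadline = deadline

        def after_iteration(self, model, epoch, evals_log) -> bool:
            return time.time() > self.deadline  # True stops training

    X = np.random.rand(100, 5)
    y = np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X, label=y)
    xgb.train(
        {"objective": "binary:logistic"},
        dtrain,
        num_boost_round=1000,
        callbacks=[Deadline(time.time() + 1)],
    )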
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
|
||||
"""using sklearn API, used for classification"""
|
||||
|
@ -513,8 +688,13 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
|
|||
self.params["tree_method"] = "auto"
|
||||
return super().fit(X_train, y_train, budget, **kwargs)
|
||||
|
||||
def _callbacks(self, start_time, deadline) -> List[Callable]:
|
||||
return XGBoostEstimator._callbacks(start_time, deadline)
|
||||
|
||||
|
||||
class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
|
||||
HAS_CALLBACK = False
|
||||
|
||||
@classmethod
|
||||
def search_space(cls, data_size, task, **params):
|
||||
data_size = int(data_size)
|
||||
|
@ -607,6 +787,8 @@ class LRL1Classifier(SKLearnEstimator):
|
|||
|
||||
|
||||
class LRL2Classifier(SKLearnEstimator):
|
||||
limit_resource = True
|
||||
|
||||
@classmethod
|
||||
def search_space(cls, **params):
|
||||
return LRL1Classifier.search_space(**params)
|
||||
|
@ -629,8 +811,7 @@ class LRL2Classifier(SKLearnEstimator):
|
|||
|
||||
|
||||
class CatBoostEstimator(BaseEstimator):
|
||||
_time_per_iter = None
|
||||
_train_size = 0
|
||||
ITER_HP = "n_estimators"
|
||||
|
||||
@classmethod
|
||||
def search_space(cls, data_size, **params):
|
||||
|
@ -661,11 +842,6 @@ class CatBoostEstimator(BaseEstimator):
|
|||
def cost_relative2lgbm(cls):
|
||||
return 15
|
||||
|
||||
@classmethod
|
||||
def init(cls):
|
||||
CatBoostEstimator._time_per_iter = None
|
||||
CatBoostEstimator._train_size = 0
|
||||
|
||||
def _preprocess(self, X):
|
||||
if isinstance(X, pd.DataFrame):
|
||||
cat_columns = X.select_dtypes(include=["category"]).columns
|
||||
|
@ -719,87 +895,36 @@ class CatBoostEstimator(BaseEstimator):
|
|||
import shutil
|
||||
|
||||
start_time = time.time()
|
||||
deadline = start_time + budget if budget else np.inf
|
||||
train_dir = f"catboost_{str(start_time)}"
|
||||
n_iter = self.params["n_estimators"]
|
||||
X_train = self._preprocess(X_train)
|
||||
if isinstance(X_train, pd.DataFrame):
|
||||
cat_features = list(X_train.select_dtypes(include="category").columns)
|
||||
else:
|
||||
cat_features = []
|
||||
# from catboost import CatBoostError
|
||||
# try:
|
||||
trained = False
|
||||
if (
|
||||
(
|
||||
not CatBoostEstimator._time_per_iter
|
||||
or abs(CatBoostEstimator._train_size - len(y_train)) > 4
|
||||
)
|
||||
and budget
|
||||
and n_iter > 4
|
||||
):
|
||||
# measure the time per iteration
|
||||
self.params["n_estimators"] = 1
|
||||
CatBoostEstimator._smallmodel = self.estimator_class(
|
||||
train_dir=train_dir, **self.params
|
||||
)
|
||||
CatBoostEstimator._smallmodel.fit(
|
||||
X_train, y_train, cat_features=cat_features, **kwargs
|
||||
)
|
||||
CatBoostEstimator._t1 = time.time() - start_time
|
||||
if CatBoostEstimator._t1 >= budget or n_iter == 1:
|
||||
# self.params["n_estimators"] = n_iter
|
||||
self._model = CatBoostEstimator._smallmodel
|
||||
shutil.rmtree(train_dir, ignore_errors=True)
|
||||
return CatBoostEstimator._t1
|
||||
self.params["n_estimators"] = min(n_iter, 4)
|
||||
CatBoostEstimator._smallmodel = self.estimator_class(
|
||||
train_dir=train_dir, **self.params
|
||||
)
|
||||
CatBoostEstimator._smallmodel.fit(
|
||||
X_train, y_train, cat_features=cat_features, **kwargs
|
||||
)
|
||||
CatBoostEstimator._time_per_iter = (
|
||||
time.time() - start_time - CatBoostEstimator._t1
|
||||
) / (self.params["n_estimators"] - 1)
|
||||
if CatBoostEstimator._time_per_iter <= 0:
|
||||
CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
|
||||
CatBoostEstimator._train_size = len(y_train)
|
||||
if (
|
||||
time.time() - start_time >= budget
|
||||
or n_iter == self.params["n_estimators"]
|
||||
):
|
||||
# self.params["n_estimators"] = n_iter
|
||||
self._model = CatBoostEstimator._smallmodel
|
||||
shutil.rmtree(train_dir, ignore_errors=True)
|
||||
return time.time() - start_time
|
||||
trained = True
|
||||
if budget and n_iter > 4:
|
||||
train_times = 1
|
||||
max_iter = min(
|
||||
n_iter,
|
||||
int(
|
||||
(budget - time.time() + start_time - CatBoostEstimator._t1)
|
||||
/ train_times
|
||||
/ CatBoostEstimator._time_per_iter
|
||||
+ 1
|
||||
),
|
||||
)
|
||||
self._model = CatBoostEstimator._smallmodel
|
||||
if trained and max_iter <= self.params["n_estimators"]:
|
||||
return time.time() - start_time
|
||||
self.params["n_estimators"] = max_iter
|
||||
if self.params["n_estimators"] > 0:
|
||||
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
|
||||
X_tr, y_tr = X_train[:n], y_train[:n]
|
||||
if "sample_weight" in kwargs:
|
||||
weight = kwargs["sample_weight"]
|
||||
if weight is not None:
|
||||
kwargs["sample_weight"] = weight[:n]
|
||||
else:
|
||||
weight = None
|
||||
from catboost import Pool
|
||||
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
|
||||
X_tr, y_tr = X_train[:n], y_train[:n]
|
||||
if "sample_weight" in kwargs:
|
||||
weight = kwargs["sample_weight"]
|
||||
if weight is not None:
|
||||
kwargs["sample_weight"] = weight[:n]
|
||||
else:
|
||||
weight = None
|
||||
from catboost import Pool, __version__
|
||||
|
||||
model = self.estimator_class(train_dir=train_dir, **self.params)
|
||||
model = self.estimator_class(train_dir=train_dir, **self.params)
|
||||
if __version__ >= "0.26":
|
||||
model.fit(
|
||||
X_tr,
|
||||
y_tr,
|
||||
cat_features=cat_features,
|
||||
eval_set=Pool(
|
||||
data=X_train[n:], label=y_train[n:], cat_features=cat_features
|
||||
),
|
||||
callbacks=CatBoostEstimator._callbacks(start_time, deadline),
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
model.fit(
|
||||
X_tr,
|
||||
y_tr,
|
||||
|
@ -808,18 +933,32 @@ class CatBoostEstimator(BaseEstimator):
|
|||
data=X_train[n:], label=y_train[n:], cat_features=cat_features
|
||||
),
|
||||
**kwargs,
|
||||
) # model.get_best_iteration()
|
||||
shutil.rmtree(train_dir, ignore_errors=True)
|
||||
if weight is not None:
|
||||
kwargs["sample_weight"] = weight
|
||||
self._model = model
|
||||
else:
|
||||
self.params["n_estimators"] = self._model.tree_count_
|
||||
# except CatBoostError:
|
||||
# self._model = None
|
||||
)
|
||||
shutil.rmtree(train_dir, ignore_errors=True)
|
||||
if weight is not None:
|
||||
kwargs["sample_weight"] = weight
|
||||
self._model = model
|
||||
self.params[self.ITER_HP] = self._model.tree_count_
|
||||
train_time = time.time() - start_time
|
||||
return train_time

    @classmethod
    def _callbacks(cls, start_time, deadline):
        class ResourceLimit:
            def after_iteration(self, info) -> bool:
                now = time.time()
                if info.iteration == 1:
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return False
                if psutil is not None:
                    mem = psutil.virtual_memory()
                    if mem.available / mem.total < FREE_MEM_RATIO:
                        return False
                return True  # can continue

        return [ResourceLimit()]

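A hedged standalone sketch of the catboost>=0.26 callback protocol used above (after_iteration returning False stops training; the version floor matches the bump to "catboost>=0.26" in setup.py below, and the data and deadline are illustrative):

.. code-block:: python

    import time

    import numpy as np
    from catboost import CatBoostClassifier

    class Deadline:
        def __init__(self, deadline):
            self.deadline = deadline

        def after_iteration(self, info) -> bool:
            return time.time() <= self.deadline  # False stops training

    X = np.random.rand(200, 5)
    y = np.random.randint(2, size=200)
    CatBoostClassifier(n_estimators=1000, verbose=False).fit(
        X, y, callbacks=[Deadline(time.time() + 1)]
    )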
class KNeighborsEstimator(BaseEstimator):
|
||||
@classmethod
|
||||
|
@ -919,7 +1058,8 @@ class Prophet(SKLearnEstimator):
|
|||
model = Prophet(**self.params)
|
||||
for regressor in cols:
|
||||
model.add_regressor(regressor)
|
||||
model.fit(train_df)
|
||||
with suppress_stdout_stderr():
|
||||
model.fit(train_df)
|
||||
train_time = time.time() - current_time
|
||||
self._model = model
|
||||
return train_time
|
||||
|
@ -984,15 +1124,21 @@ class ARIMA(Prophet):
|
|||
regressors = cols
|
||||
if regressors:
|
||||
model = ARIMA_estimator(
|
||||
train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=(
|
||||
self.params["p"], self.params["d"], self.params["q"]),
|
||||
enforce_stationarity=False, enforce_invertibility=False)
|
||||
train_df[[TS_VALUE_COL]],
|
||||
exog=train_df[regressors],
|
||||
order=(self.params["p"], self.params["d"], self.params["q"]),
|
||||
enforce_stationarity=False,
|
||||
enforce_invertibility=False,
|
||||
)
|
||||
else:
|
||||
model = ARIMA_estimator(
|
||||
train_df, order=(
|
||||
self.params["p"], self.params["d"], self.params["q"]),
|
||||
enforce_stationarity=False, enforce_invertibility=False)
|
||||
model = model.fit()
|
||||
train_df,
|
||||
order=(self.params["p"], self.params["d"], self.params["q"]),
|
||||
enforce_stationarity=False,
|
||||
enforce_invertibility=False,
|
||||
)
|
||||
with suppress_stdout_stderr():
|
||||
model = model.fit()
|
||||
train_time = time.time() - current_time
|
||||
self._model = model
|
||||
return train_time
|
||||
|
@ -1010,7 +1156,9 @@ class ARIMA(Prophet):
|
|||
regressors = list(X_test)
|
||||
regressors.remove(TS_TIMESTAMP_COL)
|
||||
X_test = self._preprocess(X_test)
|
||||
forecast = self._model.predict(start=start, end=end, exog=X_test[regressors])
|
||||
forecast = self._model.predict(
|
||||
start=start, end=end, exog=X_test[regressors]
|
||||
)
|
||||
else:
|
||||
forecast = self._model.predict(start=start, end=end)
|
||||
else:
|
||||
|
@ -1077,25 +1225,64 @@ class SARIMAX(ARIMA):
|
|||
regressors.remove(TS_VALUE_COL)
|
||||
if regressors:
|
||||
model = SARIMAX_estimator(
|
||||
train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=(
|
||||
self.params["p"], self.params["d"], self.params["q"]),
|
||||
train_df[[TS_VALUE_COL]],
|
||||
exog=train_df[regressors],
|
||||
order=(self.params["p"], self.params["d"], self.params["q"]),
|
||||
seasonality_order=(
|
||||
self.params["P"],
|
||||
self.params["D"],
|
||||
self.params["Q"],
|
||||
self.params["s"]),
|
||||
enforce_stationarity=False, enforce_invertibility=False)
|
||||
self.params["s"],
|
||||
),
|
||||
enforce_stationarity=False,
|
||||
enforce_invertibility=False,
|
||||
)
|
||||
else:
|
||||
model = SARIMAX_estimator(
|
||||
train_df, order=(
|
||||
self.params["p"], self.params["d"], self.params["q"]),
|
||||
train_df,
|
||||
order=(self.params["p"], self.params["d"], self.params["q"]),
|
||||
seasonality_order=(
|
||||
self.params["P"],
|
||||
self.params["D"],
|
||||
self.params["Q"],
|
||||
self.params["s"]),
|
||||
enforce_stationarity=False, enforce_invertibility=False)
|
||||
model = model.fit()
|
||||
self.params["s"],
|
||||
),
|
||||
enforce_stationarity=False,
|
||||
enforce_invertibility=False,
|
||||
)
|
||||
with suppress_stdout_stderr():
|
||||
model = model.fit()
|
||||
train_time = time.time() - current_time
|
||||
self._model = model
|
||||
return train_time
|
||||


class suppress_stdout_stderr(object):
    """
    A context manager for doing a "deep suppression" of stdout and stderr in
    Python, i.e. will suppress all print, even if the print originates in a
    compiled C/Fortran sub-function.
    This will not suppress raised exceptions, since exceptions are printed
    to stderr just before a script exits, and after the context manager has
    exited.

    """

    def __init__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        # Assign the null pointers to stdout and stderr.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2)
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        # Close the null files
        os.close(self.null_fds[0])
        os.close(self.null_fds[1])
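A hedged usage sketch of the context manager above (this diff uses it to silence noisy C-level output from the Prophet/statsmodels fits; the import path assumes the class stays in flaml/model.py):

.. code-block:: python

    import os

    from flaml.model import suppress_stdout_stderr

    with suppress_stdout_stderr():
        os.write(1, b"this never reaches the terminal\n")
    print("back to normal output")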
|
File diff suppressed because one or more lines are too long
10 setup.py
|
@ -38,16 +38,16 @@ setuptools.setup(
|
|||
"notebook": [
|
||||
"openml==0.10.2",
|
||||
"jupyter",
|
||||
"matplotlib==3.2.0",
|
||||
"matplotlib",
|
||||
"rgf-python",
|
||||
"catboost>=0.26",
|
||||
],
|
||||
"test": [
|
||||
"flake8>=3.8.4",
|
||||
"pytest>=6.1.1",
|
||||
"coverage>=5.3",
|
||||
"pre-commit",
|
||||
"xgboost<1.3",
|
||||
"catboost>=0.23",
|
||||
"catboost>=0.26",
|
||||
"rgf-python",
|
||||
"optuna==2.8.0",
|
||||
"vowpalwabbit",
|
||||
|
@ -58,8 +58,9 @@ setuptools.setup(
|
|||
"datasets==1.4.1",
|
||||
"azure-storage-blob",
|
||||
"statsmodels>=0.12.2",
|
||||
"psutil==5.8.0",
|
||||
],
|
||||
"catboost": ["catboost>=0.23"],
|
||||
"catboost": ["catboost>=0.26"],
|
||||
"blendsearch": ["optuna==2.8.0"],
|
||||
"ray": [
|
||||
"ray[tune]==1.6.0",
|
||||
|
@ -83,6 +84,7 @@ setuptools.setup(
|
|||
],
|
||||
"ts_forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
|
||||
"forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
|
||||
"benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"],
|
||||
},
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
|
|
|
@ -0,0 +1,323 @@
|
|||
import unittest
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from flaml import AutoML
|
||||
from flaml.model import LGBMEstimator
|
||||
from flaml import tune
|
||||
|
||||
|
||||
class MyLargeLGBM(LGBMEstimator):
|
||||
@classmethod
|
||||
def search_space(cls, **params):
|
||||
return {
|
||||
"n_estimators": {
|
||||
"domain": tune.lograndint(lower=4, upper=32768),
|
||||
"init_value": 32768,
|
||||
"low_cost_init_value": 4,
|
||||
},
|
||||
"num_leaves": {
|
||||
"domain": tune.lograndint(lower=4, upper=32768),
|
||||
"init_value": 32768,
|
||||
"low_cost_init_value": 4,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class TestClassification(unittest.TestCase):
|
||||
def test_preprocess(self):
|
||||
automl = AutoML()
|
||||
X = pd.DataFrame(
|
||||
{
|
||||
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
|
||||
"f2": [
|
||||
3.0,
|
||||
16.0,
|
||||
10.0,
|
||||
12.0,
|
||||
3.0,
|
||||
14.0,
|
||||
11.0,
|
||||
12.0,
|
||||
5.0,
|
||||
14.0,
|
||||
20.0,
|
||||
16.0,
|
||||
15.0,
|
||||
11.0,
|
||||
],
|
||||
"f3": [
|
||||
"a",
|
||||
"b",
|
||||
"a",
|
||||
"c",
|
||||
"c",
|
||||
"b",
|
||||
"b",
|
||||
"b",
|
||||
"b",
|
||||
"a",
|
||||
"b",
|
||||
1.0,
|
||||
1.0,
|
||||
"a",
|
||||
],
|
||||
"f4": [
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
False,
|
||||
False,
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
False,
|
||||
True,
|
||||
True,
|
||||
],
|
||||
}
|
||||
)
|
||||
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 6,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["catboost", "lrl2"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
# "verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["lrl2", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
"verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 3,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["xgboost", "catboost", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
# "verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 3,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["lgbm", "catboost", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
# "verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
def test_binary(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 1,
|
||||
"task": "binary",
|
||||
"log_file_name": "test/breast_cancer.log",
|
||||
"log_training_metric": True,
|
||||
"n_jobs": 1,
|
||||
"model_history": True,
|
||||
}
|
||||
X_train, y_train = load_breast_cancer(return_X_y=True)
|
||||
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
|
||||
_ = automl_experiment.predict(X_train)
|
||||
|
||||
def test_datetime_columns(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
"log_file_name": "test/datetime_columns.log",
|
||||
"log_training_metric": True,
|
||||
"n_jobs": 1,
|
||||
"model_history": True,
|
||||
}
|
||||
fake_df = pd.DataFrame(
|
||||
{
|
||||
"A": [
|
||||
datetime(1900, 2, 3),
|
||||
datetime(1900, 3, 4),
|
||||
datetime(1900, 3, 4),
|
||||
datetime(1900, 3, 4),
|
||||
datetime(1900, 7, 2),
|
||||
datetime(1900, 8, 9),
|
||||
],
|
||||
"B": [
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
],
|
||||
"year_A": [
|
||||
datetime(1900, 1, 2),
|
||||
datetime(1900, 8, 1),
|
||||
datetime(1900, 1, 4),
|
||||
datetime(1900, 6, 1),
|
||||
datetime(1900, 1, 5),
|
||||
datetime(1900, 4, 1),
|
||||
],
|
||||
}
|
||||
)
|
||||
y = np.array([0, 1, 0, 1, 0, 0])
|
||||
automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
|
||||
_ = automl_experiment.predict(fake_df)
|
||||
|
||||
def test_sparse_matrix_xgboost(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 3,
|
||||
"metric": "ap",
|
||||
"task": "classification",
|
||||
"log_file_name": "test/sparse_classification.log",
|
||||
"estimator_list": ["xgboost"],
|
||||
"log_type": "all",
|
||||
"n_jobs": 1,
|
||||
}
|
||||
X_train = scipy.sparse.eye(900000)
|
||||
y_train = np.random.randint(2, size=900000)
|
||||
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
|
||||
print(automl_experiment.predict(X_train))
|
||||
print(automl_experiment.model)
|
||||
print(automl_experiment.config_history)
|
||||
print(automl_experiment.model_history)
|
||||
print(automl_experiment.best_iteration)
|
||||
print(automl_experiment.best_estimator)
|
||||
|
||||
def test_ray_classification(self):
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
X, y = make_classification(1000, 10)
|
||||
automl = AutoML()
|
||||
try:
|
||||
automl.fit(X, y, time_budget=10, task="classification", use_ray=True)
|
||||
automl.fit(
|
||||
X, y, time_budget=10, task="classification", n_concurrent_trials=2
|
||||
)
|
||||
except ImportError:
|
||||
return
|
||||
|
||||
def test_parallel_xgboost(self, hpo_method=None):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 10,
|
||||
"metric": "ap",
|
||||
"task": "classification",
|
||||
"log_file_name": "test/sparse_classification.log",
|
||||
"estimator_list": ["xgboost"],
|
||||
"log_type": "all",
|
||||
"n_jobs": 1,
|
||||
"n_concurrent_trials": 2,
|
||||
"hpo_method": hpo_method,
|
||||
}
|
||||
X_train = scipy.sparse.eye(900000)
|
||||
y_train = np.random.randint(2, size=900000)
|
||||
try:
|
||||
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
|
||||
print(automl_experiment.predict(X_train))
|
||||
print(automl_experiment.model)
|
||||
print(automl_experiment.config_history)
|
||||
print(automl_experiment.model_history)
|
||||
print(automl_experiment.best_iteration)
|
||||
print(automl_experiment.best_estimator)
|
||||
except ImportError:
|
||||
return
|
||||
|
||||
def test_parallel_xgboost_others(self):
|
||||
# use random search as the hpo_method
|
||||
self.test_parallel_xgboost(hpo_method="random")
|
||||
|
||||
def test_random_skip_oom(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_experiment.add_learner(
|
||||
learner_name="large_lgbm", learner_class=MyLargeLGBM
|
||||
)
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
"task": "classification",
|
||||
"log_file_name": "test/sparse_classification_oom.log",
|
||||
"estimator_list": ["large_lgbm"],
|
||||
"log_type": "all",
|
||||
"n_jobs": 1,
|
||||
"hpo_method": "random",
|
||||
"n_concurrent_trials": 2,
|
||||
}
|
||||
X_train = scipy.sparse.eye(900000)
|
||||
y_train = np.random.randint(2, size=900000)
|
||||
|
||||
try:
|
||||
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
|
||||
print(automl_experiment.predict(X_train))
|
||||
print(automl_experiment.model)
|
||||
print(automl_experiment.config_history)
|
||||
print(automl_experiment.model_history)
|
||||
print(automl_experiment.best_iteration)
|
||||
print(automl_experiment.best_estimator)
|
||||
except ImportError:
|
||||
print("skipping concurrency test as ray is not installed")
|
||||
return
|
||||
|
||||
def test_sparse_matrix_lr(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 3,
|
||||
"metric": "f1",
|
||||
"task": "classification",
|
||||
"log_file_name": "test/sparse_classification.log",
|
||||
"estimator_list": ["lrl1", "lrl2"],
|
||||
"log_type": "all",
|
||||
"n_jobs": 1,
|
||||
}
|
||||
X_train = scipy.sparse.random(3000, 3000, density=0.1)
|
||||
y_train = np.random.randint(2, size=3000)
|
||||
automl_experiment.fit(
|
||||
X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings
|
||||
)
|
||||
automl_settings["time_budget"] = 5
|
||||
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
|
||||
print(automl_experiment.predict(X_train))
|
||||
print(automl_experiment.model)
|
||||
print(automl_experiment.config_history)
|
||||
print(automl_experiment.model_history)
|
||||
print(automl_experiment.best_iteration)
|
||||
print(automl_experiment.best_estimator)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
|
@ -1,21 +1,12 @@
|
|||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
from sklearn.datasets import (
|
||||
fetch_california_housing,
|
||||
load_iris,
|
||||
load_wine,
|
||||
load_breast_cancer,
|
||||
)
|
||||
from sklearn.datasets import load_iris, load_wine
|
||||
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
|
||||
from flaml import AutoML
|
||||
from flaml.data import CLASSIFICATION, get_output_from_log
|
||||
|
||||
from flaml.model import LGBMEstimator, SKLearnEstimator, XGBoostEstimator
|
||||
from flaml.model import LGBMEstimator, XGBoostSklearnEstimator, SKLearnEstimator
|
||||
from flaml import tune
|
||||
from flaml.training_log import training_log_reader
|
||||
|
||||
|
@ -72,26 +63,21 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
|
|||
return 1.0
|
||||
|
||||
|
||||
def logregobj(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight
|
||||
grad = preds - labels
|
||||
hess = preds * (1.0 - preds)
|
||||
return grad, hess
|
||||
|
||||
|
||||
class MyXGB1(XGBoostEstimator):
|
||||
"""XGBoostEstimator with logregobj as the objective function"""
|
||||
|
||||
def __init__(self, **config):
|
||||
super().__init__(objective=logregobj, **config)
|
||||
|
||||
|
||||
class MyXGB2(XGBoostEstimator):
|
||||
"""XGBoostEstimator with 'reg:squarederror' as the objective function"""
|
||||
|
||||
def __init__(self, **config):
|
||||
super().__init__(objective="reg:squarederror", **config)
|
||||
class MyLargeXGB(XGBoostSklearnEstimator):
|
||||
@classmethod
|
||||
def search_space(cls, **params):
|
||||
return {
|
||||
"n_estimators": {
|
||||
"domain": tune.lograndint(lower=4, upper=32768),
|
||||
"init_value": 32768,
|
||||
"low_cost_init_value": 4,
|
||||
},
|
||||
"max_leaves": {
|
||||
"domain": tune.lograndint(lower=4, upper=3276),
|
||||
"init_value": 3276,
|
||||
"low_cost_init_value": 4,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class MyLargeLGBM(LGBMEstimator):
|
||||
|
@ -104,8 +90,8 @@ class MyLargeLGBM(LGBMEstimator):
|
|||
"low_cost_init_value": 4,
|
||||
},
|
||||
"num_leaves": {
|
||||
"domain": tune.lograndint(lower=4, upper=32768),
|
||||
"init_value": 32768,
|
||||
"domain": tune.lograndint(lower=4, upper=3276),
|
||||
"init_value": 3276,
|
||||
"low_cost_init_value": 4,
|
||||
},
|
||||
}
|
||||
|
@ -141,7 +127,7 @@ def custom_metric(
|
|||
}
|
||||
|
||||
|
||||
class TestAutoML(unittest.TestCase):
|
||||
class TestMultiClass(unittest.TestCase):
|
||||
def test_custom_learner(self):
|
||||
automl = AutoML()
|
||||
automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
|
||||
|
@ -185,123 +171,6 @@ class TestAutoML(unittest.TestCase):
|
|||
"""The main flaml automl API"""
|
||||
automl.fit(X_train=X_train, y_train=y_train, **settings)
|
||||
|
||||
def test_preprocess(self):
|
||||
automl = AutoML()
|
||||
X = pd.DataFrame(
|
||||
{
|
||||
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
|
||||
"f2": [
|
||||
3.0,
|
||||
16.0,
|
||||
10.0,
|
||||
12.0,
|
||||
3.0,
|
||||
14.0,
|
||||
11.0,
|
||||
12.0,
|
||||
5.0,
|
||||
14.0,
|
||||
20.0,
|
||||
16.0,
|
||||
15.0,
|
||||
11.0,
|
||||
],
|
||||
"f3": [
|
||||
"a",
|
||||
"b",
|
||||
"a",
|
||||
"c",
|
||||
"c",
|
||||
"b",
|
||||
"b",
|
||||
"b",
|
||||
"b",
|
||||
"a",
|
||||
"b",
|
||||
1.0,
|
||||
1.0,
|
||||
"a",
|
||||
],
|
||||
"f4": [
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
False,
|
||||
False,
|
||||
True,
|
||||
True,
|
||||
False,
|
||||
False,
|
||||
True,
|
||||
True,
|
||||
],
|
||||
}
|
||||
)
|
||||
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 6,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["catboost", "lrl2"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
"verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["lrl2", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
"verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 3,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["xgboost", "catboost", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
"verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
automl = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 3,
|
||||
"task": "classification",
|
||||
"n_jobs": 1,
|
||||
"estimator_list": ["lgbm", "catboost", "kneighbor"],
|
||||
"eval_method": "cv",
|
||||
"n_splits": 3,
|
||||
"metric": "accuracy",
|
||||
"log_training_metric": True,
|
||||
"verbose": 4,
|
||||
"ensemble": True,
|
||||
}
|
||||
automl.fit(X, y, **automl_settings)
|
||||
|
||||
def test_dataframe(self):
|
||||
self.test_classification(True)
|
||||
|
||||
|
@ -348,20 +217,6 @@ class TestAutoML(unittest.TestCase):
|
|||
)
|
||||
print(metric_history)
|
||||
|
||||
def test_binary(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 1,
|
||||
"task": "binary",
|
||||
"log_file_name": "test/breast_cancer.log",
|
||||
"log_training_metric": True,
|
||||
"n_jobs": 1,
|
||||
"model_history": True,
|
||||
}
|
||||
X_train, y_train = load_breast_cancer(return_X_y=True)
|
||||
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
|
||||
_ = automl_experiment.predict(X_train)
|
||||
|
||||
def test_classification(self, as_frame=False):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
|
@ -401,47 +256,6 @@ class TestAutoML(unittest.TestCase):
|
|||
print(automl_experiment.model)
|
||||
print(automl_experiment.predict_proba(X_train)[:5])
|
||||
|
||||
def test_datetime_columns(self):
|
||||
automl_experiment = AutoML()
|
||||
automl_settings = {
|
||||
"time_budget": 2,
|
||||
"log_file_name": "test/datetime_columns.log",
|
||||
"log_training_metric": True,
|
||||
"n_jobs": 1,
|
||||
"model_history": True,
|
||||
}
|
||||
fake_df = pd.DataFrame(
|
||||
{
|
||||
"A": [
|
||||
datetime(1900, 2, 3),
|
||||
datetime(1900, 3, 4),
|
||||
datetime(1900, 3, 4),
|
||||
datetime(1900, 3, 4),
|
||||
datetime(1900, 7, 2),
|
||||
datetime(1900, 8, 9),
|
||||
],
|
||||
"B": [
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
datetime(1900, 1, 1),
|
||||
],
|
||||
"year_A": [
|
||||
datetime(1900, 1, 2),
|
||||
datetime(1900, 8, 1),
|
||||
datetime(1900, 1, 4),
|
||||
datetime(1900, 6, 1),
|
||||
datetime(1900, 1, 5),
|
||||
datetime(1900, 4, 1),
|
||||
],
|
||||
}
|
||||
)
|
||||
y = np.array([0, 1, 0, 1, 0, 0])
|
||||
automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
|
||||
_ = automl_experiment.predict(fake_df)
|
||||
|
||||
def test_micro_macro_f1(self):
|
||||
automl_experiment_micro = AutoML()
|
||||
automl_experiment_macro = AutoML()
|
||||
|
@ -501,50 +315,6 @@ class TestAutoML(unittest.TestCase):
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)

    def test_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "regression",
            "log_file_name": "test/california.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        n = int(len(y_train) * 9 // 10)
        automl_experiment.fit(
            X_train=X_train[:n],
            y_train=y_train[:n],
            X_val=X_train[n:],
            y_val=y_train[n:],
            **automl_settings
        )
        assert automl_experiment._state.eval_method == "holdout"
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=1,
        )
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=0,
        )

    def test_sparse_matrix_classification(self):
        automl_experiment = AutoML()
        automl_settings = {
@ -567,236 +337,51 @@ class TestAutoML(unittest.TestCase):
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "mae",
            "task": "regression",
            "log_file_name": "test/sparse_regression.log",
            "n_jobs": 1,
            "model_history": True,
            "keep_search_state": True,
            "verbose": 0,
            "early_stop": True,
        }
        automl_experiment.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
        )
        assert automl_experiment._state.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

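    # A 900000 x 900000 sparse identity matrix below serves as a large, highly sparse input for this smoke test.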
    def test_sparse_matrix_xgboost(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

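    # The parallel tests below request multiple concurrent trials; when the optional
    # distributed backend they need is not installed, the except ImportError branch skips them.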
    def test_parallel(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "task": "regression",
            "log_file_name": "test/california.log",
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 10,
            "hpo_method": hpo_method,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_parallel_classification(self):
        from sklearn.datasets import make_classification

        X, y = make_classification(1000, 10)
        automl = AutoML()
        try:
            automl.fit(
                X, y, time_budget=10, task="classification", n_concurrent_trials=2
            )
        except ImportError:
            return

    def test_parallel_xgboost(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": hpo_method,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_parallel_xgboost_others(self):
        # use random search as the hpo_method
        self.test_parallel_xgboost(hpo_method="random")

    def test_random_out_of_memory(self):
    def _test_memory_limit(self):
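        # The leading underscore keeps unittest from collecting this memory-limit check automatically.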
        automl_experiment = AutoML()
        automl_experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
            "time_budget": 2,
            "metric": "ap",
            "time_budget": None,
            "task": "classification",
            "log_file_name": "test/sparse_classification_oom.log",
            "log_file_name": "test/classification_oom.log",
            "estimator_list": ["large_lgbm"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": "random",
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=True)

        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_sparse_matrix_lr(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "f1",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["lrl1", "lrl2"],
            "log_type": "all",
            "n_jobs": 1,
        }
        X_train = scipy.sparse.random(3000, 900, density=0.1)
        y_train = np.random.randint(2, size=3000)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression_holdout(self):
        X_train = scipy.sparse.random(8, 100)
        y_train = np.random.uniform(size=8)
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "eval_method": "holdout",
            "task": "regression",
            "log_file_name": "test/sparse_regression.log",
            "n_jobs": 1,
            "model_history": True,
            "metric": "mse",
            "sample_weight": np.ones(len(y_train)),
            "early_stop": True,
        }
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_regression_xgboost(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment = AutoML()
        automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
        automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
        automl_settings = {
            "time_budget": 2,
            "estimator_list": ["my_xgb1", "my_xgb2"],
            "task": "regression",
            "log_file_name": "test/regression_xgboost.log",
            "n_jobs": 1,
            "model_history": True,
            "keep_search_state": True,
            "early_stop": True,
        }
        automl_experiment.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
            X_train=X_train, y_train=y_train, max_iter=1, **automl_settings
        )
        assert automl_experiment._state.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

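    # MyLargeXGB and MyLargeLGBM are deliberately expensive learners; the 0.5-second
    # budget below checks that the search still returns under a tight time limit.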
    def test_time_limit(self):
        automl_experiment = AutoML()
        automl_experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_experiment.add_learner(
            learner_name="large_xgb", learner_class=MyLargeXGB
        )
        automl_settings = {
            "time_budget": 0.5,
            "task": "classification",
            "log_file_name": "test/classification_timeout.log",
            "estimator_list": ["catboost"],
            "log_type": "all",
            "hpo_method": "random",
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=True)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.model.params)
        automl_settings["estimator_list"] = ["large_xgb"]
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.model)
        automl_settings["estimator_list"] = ["large_lgbm"]
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.model)

    def test_fit_w_starting_point(self, as_frame=True):
        automl_experiment = AutoML()
@ -60,7 +60,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=60)
    ) = get_output_from_log(filename=settings["log_file_name"], time_budget=6)
    for config in config_history:
        print(config)
    print(automl.prune_attr)
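For reference, a minimal sketch of replaying a search log with get_output_from_log, using only the call signature visible above; the leading elements of the returned tuple are not shown in this fragment and are left elided, and the log file name is one produced by the tests above:

from flaml.data import get_output_from_log

*_, valid_loss_history, config_history, metric_history = get_output_from_log(
    filename="test/california.log", time_budget=6
)
for config in config_history:
    print(config)
print(valid_loss_history)
print(metric_history)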
@ -113,3 +113,9 @@ class TestLogging(unittest.TestCase):
        with open("automl.pkl", "wb") as f:
            pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
        print(automl.__version__)
        pred1 = automl.predict(X_train)
        with open("automl.pkl", "rb") as f:
            automl = pickle.load(f)
        pred2 = automl.predict(X_train)
        delta = pred1 - pred2
        assert max(delta) == 0 and min(delta) == 0
@ -0,0 +1,221 @@
import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import (
    fetch_california_housing,
)

from flaml import AutoML
from flaml.data import get_output_from_log
from flaml.model import XGBoostEstimator

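# For binary labels y and raw margin z, the logistic loss log(1 + exp(z)) - y * z
# has gradient sigmoid(z) - y and hessian sigmoid(z) * (1 - sigmoid(z));
# logregobj returns exactly this (grad, hess) pair, the form xgboost expects
# from a custom objective.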
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # transform raw leaf weight
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


class MyXGB1(XGBoostEstimator):
    """XGBoostEstimator with logregobj as the objective function"""

    def __init__(self, **config):
        super().__init__(objective=logregobj, **config)


class MyXGB2(XGBoostEstimator):
    """XGBoostEstimator with 'reg:squarederror' as the objective function"""

    def __init__(self, **config):
        super().__init__(objective="reg:squarederror", **config)


class TestRegression(unittest.TestCase):
    def test_regression(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "regression",
            "log_file_name": "test/california.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        n = int(len(y_train) * 9 // 10)
        automl_experiment.fit(
            X_train=X_train[:n],
            y_train=y_train[:n],
            X_val=X_train[n:],
            y_val=y_train[n:],
            **automl_settings
        )
        assert automl_experiment._state.eval_method == "holdout"
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=1,
        )
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=0,
        )

    def test_sparse_matrix_classification(self):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "auto",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "split_type": "uniform",
            "n_jobs": 1,
            "model_history": True,
        }
        X_train = scipy.sparse.random(1554, 21, dtype=int)
        y_train = np.random.randint(3, size=1554)
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_sparse_matrix_regression(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": "mae",
            "task": "regression",
            "log_file_name": "test/sparse_regression.log",
            "n_jobs": 1,
            "model_history": True,
            "keep_search_state": True,
            "verbose": 0,
            "early_stop": True,
        }
        automl_experiment.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
        )
        assert automl_experiment._state.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)

    def test_parallel(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "task": "regression",
            "log_file_name": "test/california.log",
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 10,
            "hpo_method": hpo_method,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        try:
            automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.model_history)
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return

    def test_sparse_matrix_regression_holdout(self):
        X_train = scipy.sparse.random(8, 100)
        y_train = np.random.uniform(size=8)
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 1,
            "eval_method": "holdout",
            "task": "regression",
            "log_file_name": "test/sparse_regression.log",
            "n_jobs": 1,
            "model_history": True,
            "metric": "mse",
            "sample_weight": np.ones(len(y_train)),
            "early_stop": True,
        }
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)

    def test_regression_xgboost(self):
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment = AutoML()
        automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
        automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
        automl_settings = {
            "time_budget": 2,
            "estimator_list": ["my_xgb1", "my_xgb2"],
            "task": "regression",
            "log_file_name": "test/regression_xgboost.log",
            "n_jobs": 1,
            "model_history": True,
            "keep_search_state": True,
            "early_stop": True,
        }
        automl_experiment.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings
        )
        assert automl_experiment._state.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)


if __name__ == "__main__":
    unittest.main()
@ -30,6 +30,7 @@ class TestTrainingLog(unittest.TestCase):
            # "ensemble": True,
            "keep_search_state": True,
            "estimator_list": estimator_list,
            "model_history": True,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)