limit time and memory consumption (#264)

* limit time and memory

* separate tests

* lrl1 can't be limited by limit_resource

* free memory when possible

* passthrough=False when ensemble fails;
retrain when trained_estimator is None

* use callback for resource limit

* handle older xgboost versions with no callback support

* free mem ratio

* reduce verbosity

* retrain_final when max_iter==1

* remove trained_estimator from result

* model_history

* wheel

* retrain time as best_config_train_time

* ci: libomp version for xgboost on macos

* limit_resource not working on Windows

* test pickle load

* mute forecaster

* notebook update

* check hard

* preventive callback

* add use_ray
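
The new use_ray option can be exercised as in the sketch below (the dataset and time budget are illustrative; they mirror the test added in this commit):

from flaml import AutoML
from sklearn.datasets import make_classification

X, y = make_classification(1000, 10)
automl = AutoML()
# use_ray=True runs trials via ray in separate processes, which contains
# OOM failures for large datasets at the cost of some time overhead
automl.fit(X, y, time_budget=10, task="classification", use_ray=True)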
Chi Wang 2021-11-03 19:08:23 -07:00 committed by GitHub
parent 6c66cd67f7
commit 549a0dfb53
12 changed files with 1761 additions and 1406 deletions


@@ -24,9 +24,11 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: If mac, install libomp to facilitate lgbm install
- name: If mac, install libomp to facilitate lgbm and xgboost install
if: matrix.os == 'macOS-latest'
run: |
# remove libomp version constraint after xgboost works with libomp>11.1.0
wget https://raw.githubusercontent.com/Homebrew/homebrew-core/679923b4eb48a8dc7ecc1f05d06063cd79b3fc00/Formula/libomp.rb -O $(find $(brew --repository) -name libomp.rb)
brew install libomp
export CC=/usr/bin/clang
export CXX=/usr/bin/clang++
@@ -36,7 +38,7 @@ jobs:
export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp"
- name: Install packages and dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade pip wheel
pip install -e .[test]
- name: If linux or mac, install ray
if: (matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest') && matrix.python-version != '3.9'


@@ -248,7 +248,7 @@ class AutoMLState:
"wall_clock_time": time.time() - self._start_time_flag,
"metric_for_logging": metric_for_logging,
"val_loss": val_loss,
"trained_estimator": trained_estimator,
"trained_estimator": trained_estimator if self.save_model_history else None,
}
if sampled_weight is not None:
self.fit_kwargs["sample_weight"] = weight
@@ -403,9 +403,10 @@ class AutoML:
@property
def best_config_train_time(self):
"""A float of the seconds taken by training the
best config."""
return self._search_states[self._best_estimator].best_config_train_time
"""A float of the seconds taken by training the best config."""
return getattr(
self._search_states[self._best_estimator], "best_config_train_time", None
)
@property
def classes_(self):
@@ -529,8 +530,9 @@ class AutoML:
self._nrow, self._ndim = X_train_all.shape
if self._state.task == TS_FORECAST:
X_train_all = pd.DataFrame(X_train_all)
assert X_train_all[X_train_all.columns[0]].dtype.name == 'datetime64[ns]', (
f"For '{TS_FORECAST}' task, the first column must contain timestamp values.")
assert (
X_train_all[X_train_all.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
X, y = X_train_all, y_train_all
elif dataframe is not None and label is not None:
assert isinstance(
@@ -539,8 +541,9 @@ class AutoML:
assert label in dataframe.columns, "label must be a column name in dataframe"
self._df = True
if self._state.task == TS_FORECAST:
assert dataframe[dataframe.columns[0]].dtype.name == 'datetime64[ns]', (
f"For '{TS_FORECAST}' task, the first column must contain timestamp values.")
assert (
dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
X = dataframe.drop(columns=label)
self._nrow, self._ndim = X.shape
y = dataframe[label]
@@ -584,7 +587,9 @@ class AutoML:
else:
self._state.X_val = X_val
if self._label_transformer:
self._state.y_val = self._label_transformer.transform(y_val, self._state.task)
self._state.y_val = self._label_transformer.transform(
y_val, self._state.task
)
else:
self._state.y_val = y_val
else:
@@ -1064,7 +1069,8 @@ class AutoML:
return "holdout"
nrow, dim = self._nrow, self._ndim
if (
nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600)
time_budget is None
or nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600)
and nrow < CV_HOLDOUT_THRESHOLD
):
# time allows or sampling can be used and cv is necessary
@@ -1301,6 +1307,7 @@ class AutoML:
append_log=False,
auto_augment=True,
min_sample_size=MIN_SAMPLE_TRAIN,
use_ray=False,
**fit_kwargs,
):
"""Find a model for a given task
@@ -1414,7 +1421,9 @@ class AutoML:
In the following code example, we get starting_points from the
automl_experiment and use them in the new_automl_experiment.
e.g.,
.. code-block:: python
from flaml import AutoML
automl_experiment = AutoML()
X_train, y_train = load_iris(return_X_y=True)
@@ -1440,6 +1449,10 @@ class AutoML:
augment rare classes.
min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
size when sample=True.
use_ray: boolean, default=False | Whether to use ray to run the training
in separate processes. This can be used to prevent OOM for large
datasets, but will incur more overhead in time. Only use it if
you run into OOM failures.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight. Include period as
a key word argument for 'ts_forecast' task.
@@ -1483,8 +1496,10 @@ class AutoML:
)
self._retrain_final = (
retrain_full is True
and (eval_method == "holdout" and self._state.X_val is None)
or (eval_method == "cv")
and eval_method == "holdout"
and self._state.X_val is None
or eval_method == "cv"
or max_iter == 1
)
self._auto_augment = auto_augment
self._min_sample_size = min_sample_size
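
Since and binds tighter than or in Python, the retrain condition above triggers a final retraining in three cases: retrain_full is requested under holdout evaluation with no user-supplied validation set, cross-validation is used, or max_iter == 1 (the "retrain_final when max_iter==1" bullet in the commit message).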
@@ -1564,7 +1579,7 @@ class AutoML:
logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
self.estimator_list = estimator_list
self._hpo_method = hpo_method or ("cfo" if n_concurrent_trials == 1 else "bs")
self._state.time_budget = time_budget
self._state.time_budget = time_budget or 1e10
self._active_estimators = estimator_list.copy()
self._ensemble = ensemble
self._max_iter = max_iter
@@ -1573,10 +1588,11 @@ class AutoML:
self._state.train_time_limit = train_time_limit
self._log_type = log_type
self.split_ratio = split_ratio
self._save_model_history = model_history
self._state.save_model_history = model_history
self._state.n_jobs = n_jobs
self._n_concurrent_trials = n_concurrent_trials
self._early_stop = early_stop
self._use_ray = use_ray or self._n_concurrent_trials > 1
if log_file_name:
with training_log_writer(log_file_name, append_log) as save_helper:
self._training_log = save_helper
@@ -1627,7 +1643,7 @@ class AutoML:
from ray.tune.suggest import ConcurrencyLimiter
except (ImportError, AssertionError):
raise ImportError(
"n_concurrent_trial > 1 requires installation of ray. "
"n_concurrent_trial>1 or use_ray=True requires installation of ray. "
"Please run pip install flaml[ray]"
)
if self._hpo_method in ("cfo", "grid"):
@@ -1693,7 +1709,8 @@ class AutoML:
resources_per_trial=resources_per_trial,
time_budget_s=self._state.time_budget,
num_samples=self._max_iter,
verbose=self.verbose,
verbose=max(self.verbose - 3, 0),
raise_on_failed_trial=False,
)
# logger.info([trial.last_result for trial in analysis.trials])
trials = sorted(
@@ -1712,7 +1729,7 @@ class AutoML:
config = result["config"]
estimator = config.get("ml", config)["learner"]
search_state = self._search_states[estimator]
search_state.update(result, 0, self._save_model_history)
search_state.update(result, 0, self._state.save_model_history)
if result["wall_clock_time"] is not None:
self._state.time_from_start = result["wall_clock_time"]
if search_state.sample_size == self._state.data_size:
@@ -1727,7 +1744,7 @@ class AutoML:
config,
self._time_taken_best_iter,
)
if self._save_model_history:
if self._state.save_model_history:
self._model_history[
_track_iter
] = search_state.trained_estimator
@@ -1902,7 +1919,7 @@ class AutoML:
search_state.update(
result,
time_used=time_used,
save_model_history=self._save_model_history,
save_model_history=self._state.save_model_history,
)
if self._estimator_index is None:
# update init eci estimate
@@ -1945,18 +1962,27 @@ class AutoML:
search_state.best_config,
self._state.time_from_start,
)
if self._save_model_history:
if self._state.save_model_history:
self._model_history[
self._track_iter
] = search_state.trained_estimator
elif self._trained_estimator:
del self._trained_estimator
self._trained_estimator = None
if not self._retrain_final:
self._trained_estimator = search_state.trained_estimator
self._best_iteration = self._track_iter
self._time_taken_best_iter = self._state.time_from_start
better = True
next_trial_time = search_state.time2eval_best
if search_state.trained_estimator and not (
self._state.save_model_history or self._ensemble
):
# free RAM
if search_state.trained_estimator != self._trained_estimator:
search_state.trained_estimator.cleanup()
del search_state.trained_estimator
search_state.trained_estimator = None
if better or self._log_type == "all":
if self._training_log:
self._training_log.append(
@@ -2049,7 +2075,9 @@ class AutoML:
logger.info(
"retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
)
self._retrained_config[best_config_sig] = retrain_time
self._retrained_config[
best_config_sig
] = state.best_config_train_time = retrain_time
est_retrain_time = 0
self._state.time_from_start = time.time() - self._start_time_flag
if (
@@ -2083,7 +2111,7 @@ class AutoML:
self._selected = None
self.modelcount = 0
if self._n_concurrent_trials == 1:
if not self._use_ray:
self._search_sequential()
else:
self._search_parallel()
@@ -2103,12 +2131,29 @@ class AutoML:
"regression",
):
search_states = list(
x for x in self._search_states.items() if x[1].trained_estimator
x for x in self._search_states.items() if x[1].best_config
)
search_states.sort(key=lambda x: x[1].best_loss)
estimators = [(x[0], x[1].trained_estimator) for x in search_states[:2]]
estimators = [
(
x[0],
x[1].learner_class(
task=self._state.task,
n_jobs=self._state.n_jobs,
**x[1].best_config,
),
)
for x in search_states[:2]
]
estimators += [
(x[0], x[1].trained_estimator)
(
x[0],
x[1].learner_class(
task=self._state.task,
n_jobs=self._state.n_jobs,
**x[1].best_config,
),
)
for x in search_states[2:]
if x[1].best_loss < 4 * self._selected.best_loss
]
@@ -2135,19 +2180,49 @@ class AutoML:
)
if self._sample_weight_full is not None:
self._state.fit_kwargs["sample_weight"] = self._sample_weight_full
for e in estimators:
e[1].__class__.init()
try:
stacker.fit(
self._X_train_all, self._y_train_all, **self._state.fit_kwargs
)
logger.info(f"ensemble: {stacker}")
self._trained_estimator = stacker
self._trained_estimator.model = stacker
except ValueError as e:
if passthrough:
logger.warning(
"Using passthrough=False for ensemble because the data contain categorical features."
)
stacker = Stacker(
estimators,
final_estimator,
n_jobs=self._state.n_jobs,
passthrough=False,
)
stacker.fit(
self._X_train_all,
self._y_train_all,
**self._state.fit_kwargs,
)
logger.info(f"ensemble: {stacker}")
self._trained_estimator = stacker
self._trained_estimator.model = stacker
else:
raise e
elif self._retrain_final:
# reset time budget for retraining
if self._max_iter > 1:
self._state.time_from_start -= self._state.time_budget
if self._state.task == TS_FORECAST or (
if (
self._state.task == TS_FORECAST
or self._trained_estimator is None
or (
self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)
and self._selected.best_config_sample_size == self._state.data_size
and self._selected.best_config_sample_size
== self._state.data_size
)
):
state = self._search_states[self._best_estimator]
(
@@ -2163,6 +2238,7 @@ class AutoML:
self._best_estimator, retrain_time
)
)
state.best_config_train_time = retrain_time
if self._trained_estimator:
logger.info(f"retrained model: {self._trained_estimator.model}")
else:


@@ -275,7 +275,6 @@ class DataTransformer:
X[column] = X[column].map(datetime.toordinal)
datetime_columns.append(column)
del tmp_dt
else:
X[column] = X[column].fillna(np.nan)
num_columns.append(column)
X = X[cat_columns + num_columns]


@@ -2,20 +2,67 @@
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
"""
from contextlib import contextmanager
from functools import partial
import signal
import os
from typing import Callable, List
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier, DummyRegressor
from scipy.sparse import issparse
import pandas as pd
from . import tune
from .data import group_counts, CLASSIFICATION, TS_FORECAST, TS_TIMESTAMP_COL, TS_VALUE_COL
import logging
from . import tune
from .data import (
group_counts,
CLASSIFICATION,
TS_FORECAST,
TS_TIMESTAMP_COL,
TS_VALUE_COL,
)
try:
import psutil
except ImportError:
psutil = None
try:
import resource
except ImportError:
resource = None
logger = logging.getLogger("flaml.automl")
FREE_MEM_RATIO = 0.2
def TimeoutHandler(sig, frame):
raise TimeoutError(sig, frame)
@contextmanager
def limit_resource(memory_limit, time_limit):
if memory_limit > 0:
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
if soft < 0 and (hard < 0 or memory_limit <= hard) or memory_limit < soft:
resource.setrlimit(resource.RLIMIT_AS, (memory_limit, hard))
main_thread = False
if time_limit is not None:
try:
signal.signal(signal.SIGALRM, TimeoutHandler)
signal.alarm(int(time_limit) or 1)
main_thread = True
except ValueError:
pass
try:
yield
finally:
if main_thread:
signal.alarm(0)
if memory_limit > 0:
resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
class BaseEstimator:
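
A sketch of how limit_resource is meant to be used (POSIX only, since it relies on the resource module and SIGALRM; train() is a placeholder for any expensive call):

memory_limit = 2 * 1024 ** 3  # cap the address space at ~2 GB
time_limit = 60  # seconds until TimeoutHandler raises TimeoutError
try:
    with limit_resource(memory_limit, time_limit):
        train()  # placeholder; allocations beyond the cap raise MemoryError
except (MemoryError, TimeoutError) as e:
    print(f"aborted: {e!r}")  # caller can fall back, e.g. to a dummy model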
@@ -112,7 +159,35 @@ class BaseEstimator:
Returns:
train_time: A float of the training time in seconds
"""
return self._fit(X_train, y_train, **kwargs)
if (
getattr(self, "limit_resource", None)
and resource is not None
and (budget is not None or psutil is not None)
):
start_time = time.time()
mem = psutil.virtual_memory() if psutil is not None else None
try:
with limit_resource(
mem.available * (1 - FREE_MEM_RATIO)
+ psutil.Process(os.getpid()).memory_info().rss
if mem is not None
else -1,
budget,
):
train_time = self._fit(X_train, y_train, **kwargs)
except (MemoryError, TimeoutError) as e:
logger.warning(f"{e.__class__} {e}")
if self._task in CLASSIFICATION:
model = DummyClassifier()
else:
model = DummyRegressor()
X_train = self._preprocess(X_train)
model.fit(X_train, y_train)
self._model = model
train_time = time.time() - start_time
else:
train_time = self._fit(X_train, y_train, **kwargs)
return train_time
def predict(self, X_test):
"""Predict label from features
@@ -223,6 +298,9 @@ class SKLearnEstimator(BaseEstimator):
class LGBMEstimator(BaseEstimator):
ITER_HP = "n_estimators"
HAS_CALLBACK = True
@classmethod
def search_space(cls, data_size, **params):
upper = min(32768, int(data_size))
@@ -297,6 +375,8 @@ class LGBMEstimator(BaseEstimator):
self.estimator_class = LGBMClassifier
self._time_per_iter = None
self._train_size = 0
self._mem_per_iter = 1
self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None
def _preprocess(self, X):
if (
@@ -316,50 +396,111 @@ class LGBMEstimator(BaseEstimator):
def fit(self, X_train, y_train, budget=None, **kwargs):
start_time = time.time()
n_iter = self.params["n_estimators"]
deadline = start_time + budget if budget else np.inf
n_iter = self.params[self.ITER_HP]
trained = False
if not self.HAS_CALLBACK:
mem0 = psutil.virtual_memory().available if psutil is not None else 1
if (
(not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4)
(
not self._time_per_iter
or abs(self._train_size - X_train.shape[0]) > 4
)
and budget is not None
and n_iter > 1
):
self.params["n_estimators"] = 1
or self._mem_per_iter <= 1
and psutil is not None
) and n_iter > 1:
self.params[self.ITER_HP] = 1
self._t1 = self._fit(X_train, y_train, **kwargs)
if self._t1 >= budget or n_iter == 1:
# self.params["n_estimators"] = n_iter
if budget is not None and self._t1 >= budget or n_iter == 1:
# self.params[self.ITER_HP] = n_iter
return self._t1
self.params["n_estimators"] = min(n_iter, 4)
mem1 = psutil.virtual_memory().available if psutil is not None else 1
self._mem1 = mem0 - mem1
self.params[self.ITER_HP] = min(n_iter, 4)
self._t2 = self._fit(X_train, y_train, **kwargs)
mem2 = psutil.virtual_memory().available if psutil is not None else 1
self._mem2 = max(mem0 - mem2, self._mem1)
# if self._mem1 <= 0:
# self._mem_per_iter = self._mem2 / (self.params[self.ITER_HP] + 1)
# elif self._mem2 <= 0:
# self._mem_per_iter = self._mem1
# else:
self._mem_per_iter = min(
self._mem1, self._mem2 / self.params[self.ITER_HP]
)
if self._mem_per_iter <= 1 and psutil is not None:
n_iter = self.params[self.ITER_HP]
self._time_per_iter = (
(self._t2 - self._t1) / (self.params["n_estimators"] - 1)
(self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
if self._t2 > self._t1
else self._t1
if self._t1
else 0.001
)
self._train_size = X_train.shape[0]
if self._t1 + self._t2 >= budget or n_iter == self.params["n_estimators"]:
# self.params["n_estimators"] = n_iter
if (
budget is not None
and self._t1 + self._t2 >= budget
or n_iter == self.params[self.ITER_HP]
):
# self.params[self.ITER_HP] = n_iter
return time.time() - start_time
trained = True
if budget is not None and n_iter > 1:
# logger.debug(mem0)
# logger.debug(self._mem_per_iter)
if n_iter > 1:
max_iter = min(
n_iter,
int(
(budget - time.time() + start_time - self._t1) / self._time_per_iter
(budget - time.time() + start_time - self._t1)
/ self._time_per_iter
+ 1
),
)
if trained and max_iter <= self.params["n_estimators"]:
if budget is not None
else n_iter,
int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter)
if psutil is not None
else n_iter,
)
if trained and max_iter <= self.params[self.ITER_HP]:
return time.time() - start_time
self.params["n_estimators"] = max_iter
if self.params["n_estimators"] > 0:
self.params[self.ITER_HP] = max_iter
if self.params[self.ITER_HP] > 0:
if self.HAS_CALLBACK:
self._fit(
X_train, y_train, callbacks=self._callbacks(start_time, deadline), **kwargs
)
best_iteration = (
self._model.get_booster().best_iteration
if isinstance(self, XGBoostSklearnEstimator)
else self._model.best_iteration_
)
if best_iteration is not None:
self._model.set_params(n_estimators=best_iteration + 1)
else:
self._fit(X_train, y_train, **kwargs)
else:
self.params["n_estimators"] = self._model.n_estimators
self.params[self.ITER_HP] = self._model.n_estimators
train_time = time.time() - start_time
return train_time
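
In the no-callback branch above, iterations are budgeted explicitly: a 1-iteration fit measures t1, a min(n_iter, 4)-iteration fit measures t2, the time per iteration is estimated as (t2 - t1) / (iters - 1) and the memory per iteration as min(mem1, mem2 / iters) from psutil's available-memory deltas; n_estimators is then capped by both the remaining time budget and (1 - FREE_MEM_RATIO) of the initially available memory.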
def _callbacks(self, start_time, deadline) -> List[Callable]:
return [partial(self._callback, start_time, deadline)]
def _callback(self, start_time, deadline, env) -> None:
from lightgbm.callback import EarlyStopException
now = time.time()
if env.iteration == 0:
self._time_per_iter = now - start_time
if now + self._time_per_iter > deadline:
raise EarlyStopException(env.iteration, env.evaluation_result_list)
if psutil is not None:
mem = psutil.virtual_memory()
if mem.available / mem.total < FREE_MEM_RATIO:
raise EarlyStopException(env.iteration, env.evaluation_result_list)
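
The _callback above follows LightGBM's callback protocol: it runs after every boosting round, and raising EarlyStopException ends training gracefully while keeping the trees built so far. A standalone sketch of the same pattern (all names are illustrative):

import time
from lightgbm.callback import EarlyStopException

def deadline_callback(deadline):
    def _cb(env):
        # env.iteration is the 0-based round just finished
        if time.time() > deadline:
            raise EarlyStopException(env.iteration, env.evaluation_result_list)
    return _cb

# hypothetical usage with the sklearn API:
# LGBMClassifier(n_estimators=10000).fit(
#     X, y, callbacks=[deadline_callback(time.time() + 60)])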
class XGBoostEstimator(SKLearnEstimator):
"""not using sklearn API, used for regression"""
@@ -439,6 +580,7 @@ class XGBoostEstimator(SKLearnEstimator):
import xgboost as xgb
start_time = time.time()
deadline = start_time + budget if budget else np.inf
if issparse(X_train):
self.params["tree_method"] = "auto"
else:
@@ -456,9 +598,20 @@ class XGBoostEstimator(SKLearnEstimator):
if "objective" in self.params:
del self.params["objective"]
_n_estimators = self.params.pop("n_estimators")
callbacks = XGBoostEstimator._callbacks(start_time, deadline)
if callbacks:
self._model = xgb.train(
self.params,
dtrain,
_n_estimators,
obj=obj,
callbacks=callbacks,
)
self.params["n_estimators"] = self._model.best_iteration + 1
else:
self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj)
self.params["objective"] = objective
self.params["n_estimators"] = _n_estimators
self.params["objective"] = objective
del dtrain
train_time = time.time() - start_time
return train_time
@@ -471,6 +624,28 @@ class XGBoostEstimator(SKLearnEstimator):
dtest = xgb.DMatrix(X_test)
return super().predict(dtest)
@classmethod
def _callbacks(cls, start_time, deadline):
try:
from xgboost.callback import TrainingCallback
except ImportError: # for xgboost<1.3
return None
class ResourceLimit(TrainingCallback):
def after_iteration(self, model, epoch, evals_log) -> bool:
now = time.time()
if epoch == 0:
self._time_per_iter = now - start_time
if now + self._time_per_iter > deadline:
return True
if psutil is not None:
mem = psutil.virtual_memory()
if mem.available / mem.total < FREE_MEM_RATIO:
return True
return False
return [ResourceLimit()]
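
With the xgboost>=1.3 callback API used here, returning True from after_iteration stops boosting (older versions lack TrainingCallback, hence the ImportError fallback returning None). A minimal standalone version of the same idea:

import time
import xgboost as xgb

class Deadline(xgb.callback.TrainingCallback):
    def __init__(self, deadline):
        self.deadline = deadline

    def after_iteration(self, model, epoch, evals_log):
        # True asks xgboost to stop after this round
        return time.time() > self.deadline

# hypothetical usage:
# xgb.train(params, dtrain, num_boost_round=10000,
#           callbacks=[Deadline(time.time() + 60)])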
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
"""using sklearn API, used for classification"""
@@ -513,8 +688,13 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
self.params["tree_method"] = "auto"
return super().fit(X_train, y_train, budget, **kwargs)
def _callbacks(self, start_time, deadline) -> List[Callable]:
return XGBoostEstimator._callbacks(start_time, deadline)
class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
HAS_CALLBACK = False
@classmethod
def search_space(cls, data_size, task, **params):
data_size = int(data_size)
@@ -607,6 +787,8 @@ class LRL1Classifier(SKLearnEstimator):
class LRL2Classifier(SKLearnEstimator):
limit_resource = True
@classmethod
def search_space(cls, **params):
return LRL1Classifier.search_space(**params)
@@ -629,8 +811,7 @@
class CatBoostEstimator(BaseEstimator):
_time_per_iter = None
_train_size = 0
ITER_HP = "n_estimators"
@classmethod
def search_space(cls, data_size, **params):
@@ -661,11 +842,6 @@
def cost_relative2lgbm(cls):
return 15
@classmethod
def init(cls):
CatBoostEstimator._time_per_iter = None
CatBoostEstimator._train_size = 0
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
cat_columns = X.select_dtypes(include=["category"]).columns
@@ -719,76 +895,13 @@
import shutil
start_time = time.time()
deadline = start_time + budget if budget else np.inf
train_dir = f"catboost_{str(start_time)}"
n_iter = self.params["n_estimators"]
X_train = self._preprocess(X_train)
if isinstance(X_train, pd.DataFrame):
cat_features = list(X_train.select_dtypes(include="category").columns)
else:
cat_features = []
# from catboost import CatBoostError
# try:
trained = False
if (
(
not CatBoostEstimator._time_per_iter
or abs(CatBoostEstimator._train_size - len(y_train)) > 4
)
and budget
and n_iter > 4
):
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator._smallmodel = self.estimator_class(
train_dir=train_dir, **self.params
)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs
)
CatBoostEstimator._t1 = time.time() - start_time
if CatBoostEstimator._t1 >= budget or n_iter == 1:
# self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
shutil.rmtree(train_dir, ignore_errors=True)
return CatBoostEstimator._t1
self.params["n_estimators"] = min(n_iter, 4)
CatBoostEstimator._smallmodel = self.estimator_class(
train_dir=train_dir, **self.params
)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs
)
CatBoostEstimator._time_per_iter = (
time.time() - start_time - CatBoostEstimator._t1
) / (self.params["n_estimators"] - 1)
if CatBoostEstimator._time_per_iter <= 0:
CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
CatBoostEstimator._train_size = len(y_train)
if (
time.time() - start_time >= budget
or n_iter == self.params["n_estimators"]
):
# self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
shutil.rmtree(train_dir, ignore_errors=True)
return time.time() - start_time
trained = True
if budget and n_iter > 4:
train_times = 1
max_iter = min(
n_iter,
int(
(budget - time.time() + start_time - CatBoostEstimator._t1)
/ train_times
/ CatBoostEstimator._time_per_iter
+ 1
),
)
self._model = CatBoostEstimator._smallmodel
if trained and max_iter <= self.params["n_estimators"]:
return time.time() - start_time
self.params["n_estimators"] = max_iter
if self.params["n_estimators"] > 0:
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
X_tr, y_tr = X_train[:n], y_train[:n]
if "sample_weight" in kwargs:
@@ -797,9 +910,21 @@ class CatBoostEstimator(BaseEstimator):
kwargs["sample_weight"] = weight[:n]
else:
weight = None
from catboost import Pool
from catboost import Pool, __version__
model = self.estimator_class(train_dir=train_dir, **self.params)
if __version__ >= "0.26":
model.fit(
X_tr,
y_tr,
cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:], cat_features=cat_features
),
callbacks=CatBoostEstimator._callbacks(start_time, deadline),
**kwargs,
)
else:
model.fit(
X_tr,
y_tr,
@@ -808,18 +933,32 @@
data=X_train[n:], label=y_train[n:], cat_features=cat_features
),
**kwargs,
) # model.get_best_iteration()
)
shutil.rmtree(train_dir, ignore_errors=True)
if weight is not None:
kwargs["sample_weight"] = weight
self._model = model
else:
self.params["n_estimators"] = self._model.tree_count_
# except CatBoostError:
# self._model = None
self.params[self.ITER_HP] = self._model.tree_count_
train_time = time.time() - start_time
return train_time
@classmethod
def _callbacks(cls, start_time, deadline):
class ResourceLimit:
def after_iteration(self, info) -> bool:
now = time.time()
if info.iteration == 1:
self._time_per_iter = now - start_time
if now + self._time_per_iter > deadline:
return False
if psutil is not None:
mem = psutil.virtual_memory()
if mem.available / mem.total < FREE_MEM_RATIO:
return False
return True # can continue
return [ResourceLimit()]
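
Note the inverted convention relative to xgboost: a CatBoost callback's after_iteration returns True to continue training and False to stop, so this ResourceLimit returns False once the deadline or the FREE_MEM_RATIO threshold is hit. User callbacks require catboost>=0.26, which is why fit() branches on __version__ above and why the catboost pins in setup.py below are raised to 0.26.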
class KNeighborsEstimator(BaseEstimator):
@classmethod
@@ -919,6 +1058,7 @@ class Prophet(SKLearnEstimator):
model = Prophet(**self.params)
for regressor in cols:
model.add_regressor(regressor)
with suppress_stdout_stderr():
model.fit(train_df)
train_time = time.time() - current_time
self._model = model
@@ -984,14 +1124,20 @@ class ARIMA(Prophet):
regressors = cols
if regressors:
model = ARIMA_estimator(
train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=(
self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False, enforce_invertibility=False)
train_df[[TS_VALUE_COL]],
exog=train_df[regressors],
order=(self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False,
enforce_invertibility=False,
)
else:
model = ARIMA_estimator(
train_df, order=(
self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False, enforce_invertibility=False)
train_df,
order=(self.params["p"], self.params["d"], self.params["q"]),
enforce_stationarity=False,
enforce_invertibility=False,
)
with suppress_stdout_stderr():
model = model.fit()
train_time = time.time() - current_time
self._model = model
@@ -1010,7 +1156,9 @@ class ARIMA(Prophet):
regressors = list(X_test)
regressors.remove(TS_TIMESTAMP_COL)
X_test = self._preprocess(X_test)
forecast = self._model.predict(start=start, end=end, exog=X_test[regressors])
forecast = self._model.predict(
start=start, end=end, exog=X_test[regressors]
)
else:
forecast = self._model.predict(start=start, end=end)
else:
@@ -1077,25 +1225,64 @@ class SARIMAX(ARIMA):
regressors.remove(TS_VALUE_COL)
if regressors:
model = SARIMAX_estimator(
train_df[[TS_VALUE_COL]], exog=train_df[regressors], order=(
self.params["p"], self.params["d"], self.params["q"]),
train_df[[TS_VALUE_COL]],
exog=train_df[regressors],
order=(self.params["p"], self.params["d"], self.params["q"]),
seasonality_order=(
self.params["P"],
self.params["D"],
self.params["Q"],
self.params["s"]),
enforce_stationarity=False, enforce_invertibility=False)
self.params["s"],
),
enforce_stationarity=False,
enforce_invertibility=False,
)
else:
model = SARIMAX_estimator(
train_df, order=(
self.params["p"], self.params["d"], self.params["q"]),
train_df,
order=(self.params["p"], self.params["d"], self.params["q"]),
seasonality_order=(
self.params["P"],
self.params["D"],
self.params["Q"],
self.params["s"]),
enforce_stationarity=False, enforce_invertibility=False)
self.params["s"],
),
enforce_stationarity=False,
enforce_invertibility=False,
)
with suppress_stdout_stderr():
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
class suppress_stdout_stderr(object):
"""
A context manager for doing a "deep suppression" of stdout and stderr in
Python, i.e. will suppress all print, even if the print originates in a
compiled C/Fortran sub-function.
This will not suppress raised exceptions, since exceptions are printed
to stderr just before a script exits, and after the context manager has
exited.
"""
def __init__(self):
# Open a pair of null files
self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
# Save the actual stdout (1) and stderr (2) file descriptors.
self.save_fds = (os.dup(1), os.dup(2))
def __enter__(self):
# Assign the null pointers to stdout and stderr.
os.dup2(self.null_fds[0], 1)
os.dup2(self.null_fds[1], 2)
def __exit__(self, *_):
# Re-assign the real stdout/stderr back to (1) and (2)
os.dup2(self.save_fds[0], 1)
os.dup2(self.save_fds[1], 2)
# Close the null files
os.close(self.null_fds[0])
os.close(self.null_fds[1])
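
Because suppress_stdout_stderr swaps the underlying file descriptors rather than sys.stdout/sys.stderr, it also silences output from compiled extensions; a quick sketch:

import os

with suppress_stdout_stderr():
    print("hidden", flush=True)  # Python-level write to fd 1 goes to /dev/null
    os.write(2, b"hidden\n")     # C-level write to fd 2 is swallowed too
print("visible again")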

File diff suppressed because one or more lines are too long


@@ -38,16 +38,16 @@ setuptools.setup(
"notebook": [
"openml==0.10.2",
"jupyter",
"matplotlib==3.2.0",
"matplotlib",
"rgf-python",
"catboost>=0.26",
],
"test": [
"flake8>=3.8.4",
"pytest>=6.1.1",
"coverage>=5.3",
"pre-commit",
"xgboost<1.3",
"catboost>=0.23",
"catboost>=0.26",
"rgf-python",
"optuna==2.8.0",
"vowpalwabbit",
@@ -58,8 +58,9 @@ setuptools.setup(
"datasets==1.4.1",
"azure-storage-blob",
"statsmodels>=0.12.2",
"psutil==5.8.0",
],
"catboost": ["catboost>=0.23"],
"catboost": ["catboost>=0.26"],
"blendsearch": ["optuna==2.8.0"],
"ray": [
"ray[tune]==1.6.0",
@@ -83,6 +84,7 @@ setuptools.setup(
],
"ts_forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
"forecast": ["prophet>=1.0.1", "statsmodels>=0.12.2"],
"benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"],
},
classifiers=[
"Programming Language :: Python :: 3",

test/test_classification.py (new file, 323 lines)

@@ -0,0 +1,323 @@
import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import load_breast_cancer
import pandas as pd
from datetime import datetime
from flaml import AutoML
from flaml.model import LGBMEstimator
from flaml import tune
class MyLargeLGBM(LGBMEstimator):
@classmethod
def search_space(cls, **params):
return {
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=32768),
"init_value": 32768,
"low_cost_init_value": 4,
},
"num_leaves": {
"domain": tune.lograndint(lower=4, upper=32768),
"init_value": 32768,
"low_cost_init_value": 4,
},
}
class TestClassification(unittest.TestCase):
def test_preprocess(self):
automl = AutoML()
X = pd.DataFrame(
{
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
"f2": [
3.0,
16.0,
10.0,
12.0,
3.0,
14.0,
11.0,
12.0,
5.0,
14.0,
20.0,
16.0,
15.0,
11.0,
],
"f3": [
"a",
"b",
"a",
"c",
"c",
"b",
"b",
"b",
"b",
"a",
"b",
1.0,
1.0,
"a",
],
"f4": [
True,
True,
False,
True,
True,
False,
False,
False,
True,
True,
False,
False,
True,
True,
],
}
)
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
automl = AutoML()
automl_settings = {
"time_budget": 6,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["catboost", "lrl2"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
# "verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["lrl2", "kneighbor"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 3,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["xgboost", "catboost", "kneighbor"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
# "verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 3,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["lgbm", "catboost", "kneighbor"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
# "verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
def test_binary(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 1,
"task": "binary",
"log_file_name": "test/breast_cancer.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
X_train, y_train = load_breast_cancer(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
_ = automl_experiment.predict(X_train)
def test_datetime_columns(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"log_file_name": "test/datetime_columns.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
fake_df = pd.DataFrame(
{
"A": [
datetime(1900, 2, 3),
datetime(1900, 3, 4),
datetime(1900, 3, 4),
datetime(1900, 3, 4),
datetime(1900, 7, 2),
datetime(1900, 8, 9),
],
"B": [
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
],
"year_A": [
datetime(1900, 1, 2),
datetime(1900, 8, 1),
datetime(1900, 1, 4),
datetime(1900, 6, 1),
datetime(1900, 1, 5),
datetime(1900, 4, 1),
],
}
)
y = np.array([0, 1, 0, 1, 0, 0])
automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
_ = automl_experiment.predict(fake_df)
def test_sparse_matrix_xgboost(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 3,
"metric": "ap",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["xgboost"],
"log_type": "all",
"n_jobs": 1,
}
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_ray_classification(self):
from sklearn.datasets import make_classification
X, y = make_classification(1000, 10)
automl = AutoML()
try:
automl.fit(X, y, time_budget=10, task="classification", use_ray=True)
automl.fit(
X, y, time_budget=10, task="classification", n_concurrent_trials=2
)
except ImportError:
return
def test_parallel_xgboost(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
"metric": "ap",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["xgboost"],
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 2,
"hpo_method": hpo_method,
}
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_parallel_xgboost_others(self):
# use random search as the hpo_method
self.test_parallel_xgboost(hpo_method="random")
def test_random_skip_oom(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_settings = {
"time_budget": 2,
"task": "classification",
"log_file_name": "test/sparse_classification_oom.log",
"estimator_list": ["large_lgbm"],
"log_type": "all",
"n_jobs": 1,
"hpo_method": "random",
"n_concurrent_trials": 2,
}
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
print("skipping concurrency test as ray is not installed")
return
def test_sparse_matrix_lr(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 3,
"metric": "f1",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["lrl1", "lrl2"],
"log_type": "all",
"n_jobs": 1,
}
X_train = scipy.sparse.random(3000, 3000, density=0.1)
y_train = np.random.randint(2, size=3000)
automl_experiment.fit(
X_train=X_train, y_train=y_train, train_time_limit=1, **automl_settings
)
automl_settings["time_budget"] = 5
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
if __name__ == "__main__":
unittest.main()


@@ -1,21 +1,12 @@
import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import (
fetch_california_housing,
load_iris,
load_wine,
load_breast_cancer,
)
from sklearn.datasets import load_iris, load_wine
import pandas as pd
from datetime import datetime
from flaml import AutoML
from flaml.data import CLASSIFICATION, get_output_from_log
from flaml.model import LGBMEstimator, SKLearnEstimator, XGBoostEstimator
from flaml.model import LGBMEstimator, XGBoostSklearnEstimator, SKLearnEstimator
from flaml import tune
from flaml.training_log import training_log_reader
@@ -72,26 +63,21 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
return 1.0
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight
grad = preds - labels
hess = preds * (1.0 - preds)
return grad, hess
class MyXGB1(XGBoostEstimator):
"""XGBoostEstimator with logregobj as the objective function"""
def __init__(self, **config):
super().__init__(objective=logregobj, **config)
class MyXGB2(XGBoostEstimator):
"""XGBoostEstimator with 'reg:squarederror' as the objective function"""
def __init__(self, **config):
super().__init__(objective="reg:squarederror", **config)
class MyLargeXGB(XGBoostSklearnEstimator):
@classmethod
def search_space(cls, **params):
return {
"n_estimators": {
"domain": tune.lograndint(lower=4, upper=32768),
"init_value": 32768,
"low_cost_init_value": 4,
},
"max_leaves": {
"domain": tune.lograndint(lower=4, upper=3276),
"init_value": 3276,
"low_cost_init_value": 4,
},
}
class MyLargeLGBM(LGBMEstimator):
@@ -104,8 +90,8 @@ class MyLargeLGBM(LGBMEstimator):
"low_cost_init_value": 4,
},
"num_leaves": {
"domain": tune.lograndint(lower=4, upper=32768),
"init_value": 32768,
"domain": tune.lograndint(lower=4, upper=3276),
"init_value": 3276,
"low_cost_init_value": 4,
},
}
@@ -141,7 +127,7 @@ def custom_metric(
}
class TestAutoML(unittest.TestCase):
class TestMultiClass(unittest.TestCase):
def test_custom_learner(self):
automl = AutoML()
automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
@@ -185,123 +171,6 @@ class TestAutoML(unittest.TestCase):
"""The main flaml automl API"""
automl.fit(X_train=X_train, y_train=y_train, **settings)
def test_preprocess(self):
automl = AutoML()
X = pd.DataFrame(
{
"f1": [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
"f2": [
3.0,
16.0,
10.0,
12.0,
3.0,
14.0,
11.0,
12.0,
5.0,
14.0,
20.0,
16.0,
15.0,
11.0,
],
"f3": [
"a",
"b",
"a",
"c",
"c",
"b",
"b",
"b",
"b",
"a",
"b",
1.0,
1.0,
"a",
],
"f4": [
True,
True,
False,
True,
True,
False,
False,
False,
True,
True,
False,
False,
True,
True,
],
}
)
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
automl = AutoML()
automl_settings = {
"time_budget": 6,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["catboost", "lrl2"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["lrl2", "kneighbor"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 3,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["xgboost", "catboost", "kneighbor"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 3,
"task": "classification",
"n_jobs": 1,
"estimator_list": ["lgbm", "catboost", "kneighbor"],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 4,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
def test_dataframe(self):
self.test_classification(True)
@@ -348,20 +217,6 @@ class TestAutoML(unittest.TestCase):
)
print(metric_history)
def test_binary(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 1,
"task": "binary",
"log_file_name": "test/breast_cancer.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
X_train, y_train = load_breast_cancer(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
_ = automl_experiment.predict(X_train)
def test_classification(self, as_frame=False):
automl_experiment = AutoML()
automl_settings = {
@@ -401,47 +256,6 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.model)
print(automl_experiment.predict_proba(X_train)[:5])
def test_datetime_columns(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"log_file_name": "test/datetime_columns.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
fake_df = pd.DataFrame(
{
"A": [
datetime(1900, 2, 3),
datetime(1900, 3, 4),
datetime(1900, 3, 4),
datetime(1900, 3, 4),
datetime(1900, 7, 2),
datetime(1900, 8, 9),
],
"B": [
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
datetime(1900, 1, 1),
],
"year_A": [
datetime(1900, 1, 2),
datetime(1900, 8, 1),
datetime(1900, 1, 4),
datetime(1900, 6, 1),
datetime(1900, 1, 5),
datetime(1900, 4, 1),
],
}
)
y = np.array([0, 1, 0, 1, 0, 0])
automl_experiment.fit(X_train=fake_df, y_train=y, **automl_settings)
_ = automl_experiment.predict(fake_df)
def test_micro_macro_f1(self):
automl_experiment_micro = AutoML()
automl_experiment_macro = AutoML()
@@ -501,50 +315,6 @@ class TestAutoML(unittest.TestCase):
X_train, y_train = load_iris(return_X_y=True)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
def test_regression(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"task": "regression",
"log_file_name": "test/california.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
n = int(len(y_train) * 9 // 10)
automl_experiment.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings
)
assert automl_experiment._state.eval_method == "holdout"
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(get_output_from_log(automl_settings["log_file_name"], 1))
automl_experiment.retrain_from_log(
task="regression",
log_file_name=automl_settings["log_file_name"],
X_train=X_train,
y_train=y_train,
train_full=True,
time_budget=1,
)
automl_experiment.retrain_from_log(
task="regression",
log_file_name=automl_settings["log_file_name"],
X_train=X_train,
y_train=y_train,
train_full=True,
time_budget=0,
)
def test_sparse_matrix_classification(self):
automl_experiment = AutoML()
automl_settings = {
@@ -567,236 +337,51 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_sparse_matrix_regression(self):
X_train = scipy.sparse.random(300, 900, density=0.0001)
y_train = np.random.uniform(size=300)
X_val = scipy.sparse.random(100, 900, density=0.0001)
y_val = np.random.uniform(size=100)
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": "mae",
"task": "regression",
"log_file_name": "test/sparse_regression.log",
"n_jobs": 1,
"model_history": True,
"keep_search_state": True,
"verbose": 0,
"early_stop": True,
}
automl_experiment.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
assert automl_experiment._state.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(automl_experiment.best_config)
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
def test_sparse_matrix_xgboost(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 3,
"metric": "ap",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["xgboost"],
"log_type": "all",
"n_jobs": 1,
}
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_parallel(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
"task": "regression",
"log_file_name": "test/california.log",
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 10,
"hpo_method": hpo_method,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_parallel_classification(self):
from sklearn.datasets import make_classification
X, y = make_classification(1000, 10)
automl = AutoML()
try:
automl.fit(
X, y, time_budget=10, task="classification", n_concurrent_trials=2
)
except ImportError:
return
def test_parallel_xgboost(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
"metric": "ap",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["xgboost"],
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 2,
"hpo_method": hpo_method,
}
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_parallel_xgboost_others(self):
# use random search as the hpo_method
self.test_parallel_xgboost(hpo_method="random")
def test_random_out_of_memory(self):
def _test_memory_limit(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_settings = {
"time_budget": 2,
"metric": "ap",
"time_budget": None,
"task": "classification",
"log_file_name": "test/sparse_classification_oom.log",
"log_file_name": "test/classification_oom.log",
"estimator_list": ["large_lgbm"],
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 2,
"hpo_method": "random",
}
X_train, y_train = load_iris(return_X_y=True, as_frame=True)
X_train = scipy.sparse.eye(900000)
y_train = np.random.randint(2, size=900000)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_sparse_matrix_lr(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": "f1",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"estimator_list": ["lrl1", "lrl2"],
"log_type": "all",
"n_jobs": 1,
}
X_train = scipy.sparse.random(3000, 900, density=0.1)
y_train = np.random.randint(2, size=3000)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_sparse_matrix_regression_holdout(self):
X_train = scipy.sparse.random(8, 100)
y_train = np.random.uniform(size=8)
automl_experiment = AutoML()
automl_settings = {
"time_budget": 1,
"eval_method": "holdout",
"task": "regression",
"log_file_name": "test/sparse_regression.log",
"n_jobs": 1,
"model_history": True,
"metric": "mse",
"sample_weight": np.ones(len(y_train)),
"early_stop": True,
}
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_regression_xgboost(self):
X_train = scipy.sparse.random(300, 900, density=0.0001)
y_train = np.random.uniform(size=300)
X_val = scipy.sparse.random(100, 900, density=0.0001)
y_val = np.random.uniform(size=100)
automl_experiment = AutoML()
automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
automl_settings = {
"time_budget": 2,
"estimator_list": ["my_xgb1", "my_xgb2"],
"task": "regression",
"log_file_name": "test/regression_xgboost.log",
"n_jobs": 1,
"model_history": True,
"keep_search_state": True,
"early_stop": True,
}
automl_experiment.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
X_train=X_train, y_train=y_train, max_iter=1, **automl_settings
)
assert automl_experiment._state.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(automl_experiment.best_config)
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
def test_time_limit(self):
automl_experiment = AutoML()
automl_experiment.add_learner(
learner_name="large_lgbm", learner_class=MyLargeLGBM
)
automl_experiment.add_learner(
learner_name="large_xgb", learner_class=MyLargeXGB
)
automl_settings = {
"time_budget": 0.5,
"task": "classification",
"log_file_name": "test/classification_timeout.log",
"estimator_list": ["catboost"],
"log_type": "all",
"hpo_method": "random",
}
X_train, y_train = load_iris(return_X_y=True, as_frame=True)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.model.params)
automl_settings["estimator_list"] = ["large_xgb"]
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.model)
automl_settings["estimator_list"] = ["large_lgbm"]
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.model)
def test_fit_w_starting_point(self, as_frame=True):
automl_experiment = AutoML()


@@ -60,7 +60,7 @@ def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
valid_loss_history,
config_history,
metric_history,
) = get_output_from_log(filename=settings["log_file_name"], time_budget=60)
) = get_output_from_log(filename=settings["log_file_name"], time_budget=6)
for config in config_history:
print(config)
print(automl.prune_attr)


@@ -113,3 +113,9 @@ class TestLogging(unittest.TestCase):
with open("automl.pkl", "wb") as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
print(automl.__version__)
pred1 = automl.predict(X_train)
with open("automl.pkl", "rb") as f:
automl = pickle.load(f)
pred2 = automl.predict(X_train)
delta = pred1 - pred2
assert max(delta) == 0 and min(delta) == 0

test/test_regression.py (new file, 221 lines)

@@ -0,0 +1,221 @@
import unittest
import numpy as np
import scipy.sparse
from sklearn.datasets import (
fetch_california_housing,
)
from flaml import AutoML
from flaml.data import get_output_from_log
from flaml.model import XGBoostEstimator
def logregobj(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds)) # transform raw leaf weight
grad = preds - labels
hess = preds * (1.0 - preds)
return grad, hess
class MyXGB1(XGBoostEstimator):
"""XGBoostEstimator with logregobj as the objective function"""
def __init__(self, **config):
super().__init__(objective=logregobj, **config)
class MyXGB2(XGBoostEstimator):
"""XGBoostEstimator with 'reg:squarederror' as the objective function"""
def __init__(self, **config):
super().__init__(objective="reg:squarederror", **config)
class TestRegression(unittest.TestCase):
def test_regression(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"task": "regression",
"log_file_name": "test/california.log",
"log_training_metric": True,
"n_jobs": 1,
"model_history": True,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
n = int(len(y_train) * 9 // 10)
automl_experiment.fit(
X_train=X_train[:n],
y_train=y_train[:n],
X_val=X_train[n:],
y_val=y_train[n:],
**automl_settings
)
assert automl_experiment._state.eval_method == "holdout"
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(get_output_from_log(automl_settings["log_file_name"], 1))
automl_experiment.retrain_from_log(
task="regression",
log_file_name=automl_settings["log_file_name"],
X_train=X_train,
y_train=y_train,
train_full=True,
time_budget=1,
)
automl_experiment.retrain_from_log(
task="regression",
log_file_name=automl_settings["log_file_name"],
X_train=X_train,
y_train=y_train,
train_full=True,
time_budget=0,
)
def test_sparse_matrix_classification(self):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": "auto",
"task": "classification",
"log_file_name": "test/sparse_classification.log",
"split_type": "uniform",
"n_jobs": 1,
"model_history": True,
}
X_train = scipy.sparse.random(1554, 21, dtype=int)
y_train = np.random.randint(3, size=1554)
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.classes_)
print(automl_experiment.predict_proba(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_sparse_matrix_regression(self):
X_train = scipy.sparse.random(300, 900, density=0.0001)
y_train = np.random.uniform(size=300)
X_val = scipy.sparse.random(100, 900, density=0.0001)
y_val = np.random.uniform(size=100)
automl_experiment = AutoML()
automl_settings = {
"time_budget": 2,
"metric": "mae",
"task": "regression",
"log_file_name": "test/sparse_regression.log",
"n_jobs": 1,
"model_history": True,
"keep_search_state": True,
"verbose": 0,
"early_stop": True,
}
automl_experiment.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
assert automl_experiment._state.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(automl_experiment.best_config)
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
def test_parallel(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
"task": "regression",
"log_file_name": "test/california.log",
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 10,
"hpo_method": hpo_method,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_sparse_matrix_regression_holdout(self):
X_train = scipy.sparse.random(8, 100)
y_train = np.random.uniform(size=8)
automl_experiment = AutoML()
automl_settings = {
"time_budget": 1,
"eval_method": "holdout",
"task": "regression",
"log_file_name": "test/sparse_regression.log",
"n_jobs": 1,
"model_history": True,
"metric": "mse",
"sample_weight": np.ones(len(y_train)),
"early_stop": True,
}
automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_regression_xgboost(self):
X_train = scipy.sparse.random(300, 900, density=0.0001)
y_train = np.random.uniform(size=300)
X_val = scipy.sparse.random(100, 900, density=0.0001)
y_val = np.random.uniform(size=100)
automl_experiment = AutoML()
automl_experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
automl_experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)
automl_settings = {
"time_budget": 2,
"estimator_list": ["my_xgb1", "my_xgb2"],
"task": "regression",
"log_file_name": "test/regression_xgboost.log",
"n_jobs": 1,
"model_history": True,
"keep_search_state": True,
"early_stop": True,
}
automl_experiment.fit(
X_train=X_train,
y_train=y_train,
X_val=X_val,
y_val=y_val,
**automl_settings
)
assert automl_experiment._state.X_val.shape == X_val.shape
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
print(automl_experiment.best_config)
print(automl_experiment.best_loss)
print(automl_experiment.best_config_train_time)
if __name__ == "__main__":
unittest.main()


@@ -30,6 +30,7 @@ class TestTrainingLog(unittest.TestCase):
# "ensemble": True,
"keep_search_state": True,
"estimator_list": estimator_list,
"model_history": True,
}
X_train, y_train = fetch_california_housing(return_X_y=True)
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)