add max_depth to xgboost search space (#282)

* add max_depth to xgboost search space

* notebook update

* two learners for xgboost (max_depth or max_leaves)
This commit is contained in:
Chi Wang 2021-11-22 21:17:48 -08:00 committed by GitHub
parent d937b03e42
commit ea6d28d7bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 1539 additions and 904 deletions

View File

@@ -79,7 +79,9 @@ class SearchState:
self.learner_class = learner_class
search_space = learner_class.search_space(data_size=data_size, task=task)
for name, space in search_space.items():
assert "domain" in space
assert (
"domain" in space
), f"{name}'s domain is missing in the search space spec {space}"
self._search_space_domain[name] = space["domain"]
if "init_value" in space:
self.init_config[name] = space["init_value"]
@@ -434,7 +436,7 @@ class AutoML(BaseEstimator):
.. code-block:: python
['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']
time_budget: A float number of the time budget in seconds.
Use -1 if no time limit.
@@ -1659,7 +1661,7 @@ class AutoML(BaseEstimator):
.. code-block:: python
['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']
time_budget: A float number of the time budget in seconds.
Use -1 if no time limit.
@@ -1939,16 +1941,29 @@ class AutoML(BaseEstimator):
except ImportError:
estimator_list = ["arima", "sarimax"]
elif self._state.task == "rank":
estimator_list = ["lgbm", "xgboost"]
estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
elif _is_nlp_task(self._state.task):
estimator_list = ["transformer"]
else:
try:
import catboost
estimator_list = ["lgbm", "rf", "catboost", "xgboost", "extra_tree"]
estimator_list = [
"lgbm",
"rf",
"catboost",
"xgboost",
"extra_tree",
"xgb_limitdepth",
]
except ImportError:
estimator_list = ["lgbm", "rf", "xgboost", "extra_tree"]
estimator_list = [
"lgbm",
"rf",
"xgboost",
"extra_tree",
"xgb_limitdepth",
]
if "regression" != self._state.task:
estimator_list += ["lrl1"]
for estimator_name in estimator_list:

View File

@@ -20,6 +20,7 @@ from sklearn.metrics import (
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostSklearnEstimator,
XGBoostLimitDepthEstimator,
RandomForestEstimator,
LGBMEstimator,
LRL1Classifier,
@@ -42,6 +43,8 @@ def get_estimator_class(task, estimator_name):
# when adding a new learner, need to add an elif branch
if "xgboost" == estimator_name:
estimator_class = XGBoostSklearnEstimator
elif "xgb_limitdepth" == estimator_name:
estimator_class = XGBoostLimitDepthEstimator
elif "rf" == estimator_name:
estimator_class = RandomForestEstimator
elif "lgbm" == estimator_name:

View File

@@ -625,7 +625,13 @@ class LGBMEstimator(BaseEstimator):
@classmethod
def size(cls, config):
num_leaves = int(round(config.get("num_leaves") or config["max_leaves"]))
num_leaves = int(
round(
config.get("num_leaves")
or config.get("max_leaves")
or 1 << config["max_depth"]
)
)
n_estimators = int(round(config["n_estimators"]))
return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8
@@ -794,6 +800,10 @@ class XGBoostEstimator(SKLearnEstimator):
"init_value": 4,
"low_cost_init_value": 4,
},
"max_depth": {
"domain": tune.choice([0, 6, 12]),
"init_value": 0,
},
"min_child_weight": {
"domain": tune.loguniform(lower=0.001, upper=128),
"init_value": 1,
@@ -834,11 +844,12 @@ class XGBoostEstimator(SKLearnEstimator):
def config2params(cls, config: dict) -> dict:
params = config.copy()
params["max_depth"] = params.get("max_depth", 0)
params["grow_policy"] = params.get("grow_policy", "lossguide")
params["booster"] = params.get("booster", "gbtree")
max_depth = params["max_depth"] = params.get("max_depth", 0)
if max_depth == 0:
params["grow_policy"] = params.get("grow_policy", "lossguide")
params["tree_method"] = params.get("tree_method", "hist")
# params["booster"] = params.get("booster", "gbtree")
params["use_label_encoder"] = params.get("use_label_encoder", False)
params["tree_method"] = params.get("tree_method", "hist")
if "n_jobs" in config:
params["nthread"] = params.pop("n_jobs")
return params
@@ -923,24 +934,25 @@ class XGBoostEstimator(SKLearnEstimator):
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
"""The class for tuning XGBoost (for classification), using sklearn API."""
"""The class for tuning XGBoost with unlimited depth, using sklearn API."""
@classmethod
def search_space(cls, data_size, **params):
return XGBoostEstimator.search_space(data_size)
space = XGBoostEstimator.search_space(data_size)
space.pop("max_depth")
return space
@classmethod
def cost_relative2lgbm(cls):
return XGBoostEstimator.cost_relative2lgbm()
def config2params(cls, config: dict) -> dict:
# TODO: test
params = config.copy()
params["max_depth"] = 0
params["grow_policy"] = params.get("grow_policy", "lossguide")
params["booster"] = params.get("booster", "gbtree")
max_depth = params["max_depth"] = params.get("max_depth", 0)
if max_depth == 0:
params["grow_policy"] = params.get("grow_policy", "lossguide")
params["tree_method"] = params.get("tree_method", "hist")
params["use_label_encoder"] = params.get("use_label_encoder", False)
params["tree_method"] = params.get("tree_method", "hist")
return params
def __init__(
@@ -968,6 +980,28 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
return XGBoostEstimator._callbacks(start_time, deadline)
class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
    """The class for tuning XGBoost with limited depth, using sklearn API."""

    @classmethod
    def search_space(cls, data_size, **params):
        """Build the hyperparameter search space for the depth-limited learner.

        Starts from the generic XGBoost space, drops the leaf-count dimension,
        and replaces it with a bounded ``max_depth`` dimension.
        """
        space = XGBoostEstimator.search_space(data_size)
        # Depth, not leaf count, is the tuned complexity knob for this learner.
        space.pop("max_leaves")
        # Upper bound grows logarithmically with data size, capped at 16
        # and never below 6.
        depth_cap = max(6, int(np.log2(data_size)))
        space["max_depth"] = {
            "domain": tune.randint(lower=1, upper=min(depth_cap, 16)),
            "init_value": 6,
            "low_cost_init_value": 1,
        }
        # Initial values for the depth-limited variant (presumably aligned
        # with xgboost's own defaults — verify against upstream docs).
        space["learning_rate"]["init_value"] = 0.3
        space["n_estimators"]["init_value"] = 10
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        # Estimated per-trial cost relative to LightGBM, used for scheduling.
        return 64
class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
"""The class for tuning Random Forest."""

View File

@@ -129,11 +129,11 @@ class FLOW2(Searcher):
sampler = sampler.get_sampler()
if str(sampler) == "Uniform":
self._step_lb = min(
self._step_lb, q / (domain.upper - domain.lower)
self._step_lb, q / (domain.upper - domain.lower + 1)
)
elif isinstance(domain, sample.Integer) and str(sampler) == "Uniform":
self._step_lb = min(
self._step_lb, 1.0 / (domain.upper - 1 - domain.lower)
self._step_lb, 1.0 / (domain.upper - domain.lower)
)
if isinstance(domain, sample.Categorical):
if not domain.ordered:

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long