add max_depth to xgboost search space (#282)

* add max_depth to xgboost search space
* notebook update
* two learners for xgboost (max_depth or max_leaves)
2021-11-22 21:17:48 -08:00 · 2021-11-22 21:17:48 -08:00 · ea6d28d7bd
parent d937b03e42
commit ea6d28d7bd
6 changed files with 1539 additions and 904 deletions
--- a/flaml/automl.py
+++ b/flaml/automl.py
@ -79,7 +79,9 @@ class SearchState:
        self.learner_class = learner_class
        search_space = learner_class.search_space(data_size=data_size, task=task)
        for name, space in search_space.items():
-            assert "domain" in space
+            assert (
+                "domain" in space
+            ), f"{name}'s domain is missing in the search space spec {space}"
            self._search_space_domain[name] = space["domain"]
            if "init_value" in space:
                self.init_config[name] = space["init_value"]
@ -434,7 +436,7 @@ class AutoML(BaseEstimator):

                .. code-block:: python

-                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
+                    ['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']

            time_budget: A float number of the time budget in seconds.
                Use -1 if no time limit.
@ -1659,7 +1661,7 @@ class AutoML(BaseEstimator):

                .. code-block:: python

-                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']
+                    ['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']

            time_budget: A float number of the time budget in seconds.
                Use -1 if no time limit.
@ -1939,16 +1941,29 @@ class AutoML(BaseEstimator):
                except ImportError:
                    estimator_list = ["arima", "sarimax"]
            elif self._state.task == "rank":
-                estimator_list = ["lgbm", "xgboost"]
+                estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
            elif _is_nlp_task(self._state.task):
                estimator_list = ["transformer"]
            else:
                try:
                    import catboost

-                    estimator_list = ["lgbm", "rf", "catboost", "xgboost", "extra_tree"]
+                    estimator_list = [
+                        "lgbm",
+                        "rf",
+                        "catboost",
+                        "xgboost",
+                        "extra_tree",
+                        "xgb_limitdepth",
+                    ]
                except ImportError:
-                    estimator_list = ["lgbm", "rf", "xgboost", "extra_tree"]
+                    estimator_list = [
+                        "lgbm",
+                        "rf",
+                        "xgboost",
+                        "extra_tree",
+                        "xgb_limitdepth",
+                    ]
                if "regression" != self._state.task:
                    estimator_list += ["lrl1"]
        for estimator_name in estimator_list:
--- a/flaml/ml.py
+++ b/flaml/ml.py
@ -20,6 +20,7 @@ from sklearn.metrics import (
 from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
 from .model import (
    XGBoostSklearnEstimator,
+    XGBoostLimitDepthEstimator,
    RandomForestEstimator,
    LGBMEstimator,
    LRL1Classifier,
@ -42,6 +43,8 @@ def get_estimator_class(task, estimator_name):
    # when adding a new learner, need to add an elif branch
    if "xgboost" == estimator_name:
        estimator_class = XGBoostSklearnEstimator
+    elif "xgb_limitdepth" == estimator_name:
+        estimator_class = XGBoostLimitDepthEstimator
    elif "rf" == estimator_name:
        estimator_class = RandomForestEstimator
    elif "lgbm" == estimator_name:
--- a/flaml/model.py
+++ b/flaml/model.py
@ -625,7 +625,13 @@ class LGBMEstimator(BaseEstimator):

    @classmethod
    def size(cls, config):
-        num_leaves = int(round(config.get("num_leaves") or config["max_leaves"]))
+        num_leaves = int(
+            round(
+                config.get("num_leaves")
+                or config.get("max_leaves")
+                or 1 << config["max_depth"]
+            )
+        )
        n_estimators = int(round(config["n_estimators"]))
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

@ -794,6 +800,10 @@ class XGBoostEstimator(SKLearnEstimator):
                "init_value": 4,
                "low_cost_init_value": 4,
            },
+            "max_depth": {
+                "domain": tune.choice([0, 6, 12]),
+                "init_value": 0,
+            },
            "min_child_weight": {
                "domain": tune.loguniform(lower=0.001, upper=128),
                "init_value": 1,
@ -834,11 +844,12 @@ class XGBoostEstimator(SKLearnEstimator):

    def config2params(cls, config: dict) -> dict:
        params = config.copy()
-        params["max_depth"] = params.get("max_depth", 0)
-        params["grow_policy"] = params.get("grow_policy", "lossguide")
-        params["booster"] = params.get("booster", "gbtree")
+        max_depth = params["max_depth"] = params.get("max_depth", 0)
+        if max_depth == 0:
+            params["grow_policy"] = params.get("grow_policy", "lossguide")
+            params["tree_method"] = params.get("tree_method", "hist")
+        # params["booster"] = params.get("booster", "gbtree")
        params["use_label_encoder"] = params.get("use_label_encoder", False)
-        params["tree_method"] = params.get("tree_method", "hist")
        if "n_jobs" in config:
            params["nthread"] = params.pop("n_jobs")
        return params
@ -923,24 +934,25 @@ class XGBoostEstimator(SKLearnEstimator):


 class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
-    """The class for tuning XGBoost (for classification), using sklearn API."""
+    """The class for tuning XGBoost with unlimited depth, using sklearn API."""

    @classmethod
    def search_space(cls, data_size, **params):
-        return XGBoostEstimator.search_space(data_size)
+        space = XGBoostEstimator.search_space(data_size)
+        space.pop("max_depth")
+        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return XGBoostEstimator.cost_relative2lgbm()

    def config2params(cls, config: dict) -> dict:
-        # TODO: test
        params = config.copy()
-        params["max_depth"] = 0
-        params["grow_policy"] = params.get("grow_policy", "lossguide")
-        params["booster"] = params.get("booster", "gbtree")
+        max_depth = params["max_depth"] = params.get("max_depth", 0)
+        if max_depth == 0:
+            params["grow_policy"] = params.get("grow_policy", "lossguide")
+            params["tree_method"] = params.get("tree_method", "hist")
        params["use_label_encoder"] = params.get("use_label_encoder", False)
-        params["tree_method"] = params.get("tree_method", "hist")
        return params

    def __init__(
@ -968,6 +980,28 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
        return XGBoostEstimator._callbacks(start_time, deadline)


+class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
+    """The class for tuning XGBoost with limited depth, using sklearn API."""
+
+    @classmethod
+    def search_space(cls, data_size, **params):
+        space = XGBoostEstimator.search_space(data_size)
+        space.pop("max_leaves")
+        upper = max(6, int(np.log2(data_size)))
+        space["max_depth"] = {
+            "domain": tune.randint(lower=1, upper=min(upper, 16)),
+            "init_value": 6,
+            "low_cost_init_value": 1,
+        }
+        space["learning_rate"]["init_value"] = 0.3
+        space["n_estimators"]["init_value"] = 10
+        return space
+
+    @classmethod
+    def cost_relative2lgbm(cls):
+        return 64
+
+
 class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
    """The class for tuning Random Forest."""

--- a/flaml/searcher/flow2.py
+++ b/flaml/searcher/flow2.py
@ -129,11 +129,11 @@ class FLOW2(Searcher):
                    sampler = sampler.get_sampler()
                    if str(sampler) == "Uniform":
                        self._step_lb = min(
-                            self._step_lb, q / (domain.upper - domain.lower)
+                            self._step_lb, q / (domain.upper - domain.lower + 1)
                        )
                elif isinstance(domain, sample.Integer) and str(sampler) == "Uniform":
                    self._step_lb = min(
-                        self._step_lb, 1.0 / (domain.upper - 1 - domain.lower)
+                        self._step_lb, 1.0 / (domain.upper - domain.lower)
                    )
                if isinstance(domain, sample.Categorical):
                    if not domain.ordered:
--- a/notebook/flaml_automl.ipynb
+++ b/notebook/flaml_automl.ipynb
--- a/notebook/flaml_xgboost.ipynb
+++ b/notebook/flaml_xgboost.ipynb