remove catboost training dir; ensemble api; blendsearch for hierarchical space; ranking task; forecast improvement (#178)

* remove catboost training dir

* close #48

* bs for hierarchical space. close #85

* retrain for hierarchical space

* clean ml (#180)

Co-authored-by: Qingyun Wu <qxw5138@psu.edu>

* support ranking task

* examples

* cv shuffle

* forecast api and implementation cleaner

* period constraints

* delete groups after fit
Chi Wang 2021-09-01 16:25:04 -07:00 committed by GitHub
parent 1bc8786dcb
commit 6ab0730793
21 changed files with 1399 additions and 1126 deletions


@ -65,7 +65,7 @@ tune.run(train_with_config, config={…}, low_cost_partial_config={…}, time_bu
## Advantages
* For classification and regression tasks, find quality models with lower computational resources.
* For common machine learning tasks like classification and regression, find quality models with small computational resources.
* Users can choose their desired customizability: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), full customization (arbitrary training and evaluation code).
* Allow human guidance in hyperparameter tuning to respect prior knowledge of certain subspaces while still being able to explore other subspaces. Read more about the
hyperparameter optimization methods
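As a hedged sketch of the "medium customization" level mentioned above, the snippet below passes a custom metric function using the extended signature this commit introduces (the extra `config` and `groups_test`/`groups_train` arguments); the dataset and the logged values are illustrative assumptions, not part of the diff.
```python
from flaml import AutoML
from sklearn.datasets import load_iris
from sklearn.metrics import log_loss

def custom_metric(X_test, y_test, estimator, labels,
                  X_train, y_train, weight_test=None, weight_train=None,
                  config=None, groups_test=None, groups_train=None):
    # first return value is minimized; the dict is logged alongside it
    val_loss = log_loss(y_test, estimator.predict_proba(X_test), labels=labels)
    train_loss = log_loss(y_train, estimator.predict_proba(X_train), labels=labels)
    return val_loss, {"val_loss": val_loss, "train_loss": train_loss}

X, y = load_iris(return_X_y=True)
automl = AutoML()
automl.fit(X, y, task='classification', metric=custom_metric, time_budget=10)
```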
@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
## Examples
A basic classification example.
- A basic classification example.
```python
from flaml import AutoML
@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
print(automl.model)
```
A basic regression example.
- A basic regression example.
```python
from flaml import AutoML
@ -123,6 +123,39 @@ print(automl.predict(X_train))
print(automl.model)
```
- Time series forecasting.
```python
# pip install flaml[forecast]
import numpy as np
from flaml import AutoML
X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
y_train = np.random.random(size=72)
automl = AutoML()
automl.fit(X_train=X_train[:72],  # a single column of timestamp
           y_train=y_train,  # value for each timestamp
           period=12,  # time horizon to forecast, e.g., 12 months
           task='forecast', time_budget=15,  # time budget in seconds
           log_file_name="test/forecast.log",
           )
print(automl.predict(X_train[72:]))
```
- Learning to rank.
```python
from sklearn.datasets import fetch_openml
from flaml import AutoML
X_train, y_train = fetch_openml(name="credit-g", return_X_y=True)
# not a real learning to rank dataset
groups = [200] * 4 + [100] * 2  # group counts
automl = AutoML()
automl.fit(
    X_train, y_train, groups=groups,
    task='rank', time_budget=10,  # in seconds
)
```
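As noted in the `groups` docstring further down this diff, `groups` may be given either as per-sample group labels or as group counts; a minimal sketch of the equivalence (array values are illustrative), mirroring how `_validate_data` expands counts into labels:
```python
import numpy as np

counts = [200] * 4 + [100] * 2  # group counts, as in the example above
# equivalent per-sample group labels: one label per training example
labels = np.concatenate([[i] * c for i, c in enumerate(counts)])
assert len(labels) == sum(counts)
```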
More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/).
## Documentation


@ -10,7 +10,7 @@ from functools import partial
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold, GroupKFold, TimeSeriesSplit
RepeatedKFold, GroupKFold, TimeSeriesSplit, GroupShuffleSplit
from sklearn.utils import shuffle
import pandas as pd
import logging
@ -94,13 +94,13 @@ class SearchState:
else:
self.sample_size = self.data_size
obj = result['val_loss']
train_loss = result['train_loss']
metric_for_logging = result['metric_for_logging']
time2eval = result['time_total_s']
trained_estimator = result['trained_estimator']
del result['trained_estimator'] # free up RAM
else:
obj, time2eval, trained_estimator = np.inf, 0.0, None
train_loss = config = None
metric_for_logging = config = None
self.trial_time = time2eval
self.total_time_used += time_used
self.total_iter += 1
@ -126,7 +126,8 @@ class SearchState:
self.trained_estimator.cleanup()
if trained_estimator:
self.trained_estimator = trained_estimator
self.train_loss, self.val_loss, self.config = train_loss, obj, config
self.metric_for_logging, self.val_loss, self.config = \
metric_for_logging, obj, config
def get_hist_config_sig(self, sample_size, config):
config_values = tuple([config[k] for k in self._hp_names])
@ -144,7 +145,7 @@ class AutoMLState:
def _prepare_sample_train_data(self, sample_size):
full_size = len(self.y_train)
sampled_weight = None
sampled_weight = groups = None
if sample_size <= full_size:
if isinstance(self.X_train, pd.DataFrame):
sampled_X_train = self.X_train.iloc[:sample_size]
@ -154,12 +155,16 @@ class AutoMLState:
weight = self.fit_kwargs.get('sample_weight')
if weight is not None:
sampled_weight = weight[:sample_size]
if self.groups is not None:
groups = self.groups[:sample_size]
else:
sampled_X_train = self.X_train_all
sampled_y_train = self.y_train_all
if 'sample_weight' in self.fit_kwargs:
sampled_weight = self.sample_weight_all
return sampled_X_train, sampled_y_train, sampled_weight
if self.groups is not None:
groups = self.groups_all
return sampled_X_train, sampled_y_train, sampled_weight, groups
def _compute_with_config_base(self,
estimator,
@ -168,13 +173,15 @@ class AutoMLState:
sample_size = int(config_w_resource['FLAML_sample_size'])
else:
sample_size = self.data_size
sampled_X_train, sampled_y_train, sampled_weight = \
sampled_X_train, sampled_y_train, sampled_weight, groups = \
self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = self.fit_kwargs['sample_weight']
self.fit_kwargs['sample_weight'] = sampled_weight
else:
weight = None
if groups is not None:
self.fit_kwargs['groups'] = groups
config = config_w_resource.copy()
if 'FLAML_sample_size' in config:
del config['FLAML_sample_size']
@ -182,13 +189,14 @@ class AutoMLState:
budget = time_left if sample_size == self.data_size else \
time_left / 2 * sample_size / self.data_size
trained_estimator, val_loss, train_loss, _, pred_time = \
trained_estimator, val_loss, metric_for_logging, _, pred_time = \
compute_estimator(
sampled_X_train,
sampled_y_train,
self.X_val,
self.y_val,
self.weight_val,
self.groups_val,
min(budget, self.train_time_limit),
self.kf,
config,
@ -204,7 +212,7 @@ class AutoMLState:
result = {
'pred_time': pred_time,
'wall_clock_time': time.time() - self._start_time_flag,
'train_loss': train_loss,
'metric_for_logging': metric_for_logging,
'val_loss': val_loss,
'trained_estimator': trained_estimator
}
@ -216,19 +224,23 @@ class AutoMLState:
def _train_with_config(
self, estimator, config_w_resource, sample_size=None
):
config = config_w_resource.copy()
if not sample_size:
sample_size = config_w_resource['FLAML_sample_size']
config = config_w_resource.get('ml', config_w_resource).copy()
if 'FLAML_sample_size' in config:
if not sample_size:
sample_size = config['FLAML_sample_size']
del config['FLAML_sample_size']
if "learner" in config:
del config['learner']
assert sample_size is not None
sampled_X_train, sampled_y_train, sampled_weight = \
sampled_X_train, sampled_y_train, sampled_weight, groups = \
self._prepare_sample_train_data(sample_size)
if sampled_weight is not None:
weight = self.fit_kwargs['sample_weight']
self.fit_kwargs['sample_weight'] = sampled_weight
else:
weight = None
if groups is not None:
self.fit_kwargs['groups'] = groups
budget = None if self.time_budget is None else (
self.time_budget - self.time_from_start)
estimator, train_time = train_estimator(
@ -368,18 +380,18 @@ class AutoML:
return self._trained_estimator.classes_.tolist()
return None
def predict(self, X_test, freq=None):
def predict(self, X_test):
'''Predict label from features.
Args:
X_test: A numpy array of featurized instances, shape n * m,
or a pandas dataframe with one column with timestamp values
for 'forecasting' task.
freq: str or pandas offset, default=None | The frequency of the
time-series.
or for 'forecasting' task:
a pandas dataframe with one column of timestamp values
or an integer n for the predict steps (only valid when
the estimator is arima or sarimax).
Returns:
A numpy array of shape n * 1 - - each element is a predicted class
An array-like of shape n * 1 -- each element is a predicted
label for an instance.
'''
if self._trained_estimator is None:
@ -387,13 +399,7 @@ class AutoML:
"No estimator is trained. Please run fit with enough budget.")
return None
X_test = self._preprocess(X_test)
if self._state.task == 'forecast':
X_test_df = pd.DataFrame(X_test)
X_test_col = list(X_test.columns)[0]
X_test_df = X_test_df.rename(columns={X_test_col: 'ds'})
y_pred = self._trained_estimator.predict(X_test_df, freq=freq)
else:
y_pred = self._trained_estimator.predict(X_test)
y_pred = self._trained_estimator.predict(X_test)
if y_pred.ndim > 1 and isinstance(y_pred, np.ndarray):
y_pred = y_pred.flatten()
if self._label_transformer:
@ -418,14 +424,20 @@ class AutoML:
return proba
def _preprocess(self, X):
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X)
if isinstance(X, int):
return X
if self._state.task == 'forecast':
X = pd.DataFrame(X)
X = X.rename(columns={X.columns[0]: 'ds'})
else:
if issparse(X):
X = X.tocsr()
if self._transformer:
X = self._transformer.transform(X)
return X
def _validate_data(self, X_train_all, y_train_all, dataframe, label,
X_val=None, y_val=None):
X_val=None, y_val=None, groups_val=None, groups=None):
if self._state.task == 'forecast':
if dataframe is not None and label is not None:
dataframe = dataframe.copy()
@ -433,13 +445,11 @@ class AutoML:
elif dataframe is not None:
if ('ds' not in dataframe) or ('y' not in dataframe):
raise ValueError(
'For forecasting task, Dataframe must have columns "ds" and "y" '
'with the dates and values respectively.'
)
'For forecasting task, dataframe must have columns "ds" and "y" '
'with the dates and values respectively.')
elif (X_train_all is not None) and (y_train_all is not None):
dataframe = pd.DataFrame(X_train_all)
time_col = list(dataframe.columns)[0]
dataframe = dataframe.rename(columns={time_col: 'ds'})
dataframe = dataframe.rename(columns={dataframe.columns[0]: 'ds'})
dataframe['y'] = pd.Series(y_train_all)
X_train_all = None
y_train_all = None
@ -515,12 +525,23 @@ class AutoML:
self._state.y_val = y_val
else:
self._state.X_val = self._state.y_val = None
if groups is not None and len(groups) != self._nrow:
# groups is given as group counts
self._state.groups = np.concatenate(
[[i] * c for i, c in enumerate(groups)])
assert len(self._state.groups) == self._nrow, \
"the sum of group counts must match the number of examples"
self._state.groups_val = np.concatenate(
[[i] * c for i, c in enumerate(groups_val)]
) if groups_val is not None else None
else:
self._state.groups_val = groups_val
self._state.groups = groups
def _prepare_data(self,
eval_method,
split_ratio,
n_splits,
period=None):
n_splits):
X_val, y_val = self._state.X_val, self._state.y_val
if issparse(X_val):
X_val = X_val.tocsr()
@ -564,25 +585,25 @@ class AutoML:
random_state=RANDOM_SEED)
self._state.fit_kwargs[
'sample_weight'] = self._state.sample_weight_all
elif hasattr(self._state, 'groups') and self._state.groups is not None:
X_train_all, y_train_all, self._state.groups = shuffle(
X_train_all, y_train_all, self._state.groups,
random_state=RANDOM_SEED)
else:
X_train_all, y_train_all = shuffle(
X_train_all, y_train_all, random_state=RANDOM_SEED)
if self._df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
if self._df:
X_train_all.reset_index(drop=True, inplace=True)
if isinstance(y_train_all, pd.Series):
y_train_all.reset_index(drop=True, inplace=True)
X_train, y_train = X_train_all, y_train_all
if X_val is None:
self._state.groups_all = self._state.groups
if X_val is None and eval_method == 'holdout':
# if eval_method = holdout, make holdout data
if eval_method == 'holdout' and self._split_type == 'time':
if 'period' in self._state.fit_kwargs:
if self._split_type == 'time':
if self._state.task == 'forecast':
num_samples = X_train_all.shape[0]
split_idx = num_samples - self._state.fit_kwargs.get('period')
period = self._state.fit_kwargs['period']
assert period < num_samples, (
f"period={period}>#examples={num_samples}")
split_idx = num_samples - period
X_train = X_train_all[:split_idx]
y_train = y_train_all[:split_idx]
X_val = X_train_all[split_idx:]
@ -603,7 +624,21 @@ class AutoML:
y_train_all,
test_size=split_ratio,
shuffle=False)
elif self._state.task != 'regression' and eval_method == 'holdout':
elif self._state.task == 'rank':
gss = GroupShuffleSplit(n_splits=1, test_size=split_ratio,
random_state=RANDOM_SEED)
for train_idx, val_idx in gss.split(X_train_all, y_train_all,
self._state.groups):
if self._df:
X_train, X_val = X_train_all.iloc[
train_idx], X_train_all.iloc[val_idx]
else:
X_train, X_val = X_train_all[
train_idx], X_train_all[val_idx]
y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
self._state.groups, self._state.groups_val = self._state.groups[
train_idx], self._state.groups[val_idx]
elif self._state.task != 'regression':
# for classification, make sure the labels are complete in both
# training and validation data
label_set, first = np.unique(y_train_all, return_index=True)
@ -617,8 +652,7 @@ class AutoML:
X_first = X_train_all.iloc[first] if self._df else X_train_all[
first]
X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
y_rest = y_train_all.iloc[rest] if isinstance(
y_train_all, pd.Series) else y_train_all[rest]
y_rest = y_train_all[rest]
stratify = y_rest if self._split_type == 'stratified' else \
None
if 'sample_weight' in self._state.fit_kwargs:
@ -647,7 +681,7 @@ class AutoML:
X_val = concat(X_first, X_val)
y_val = concat(label_set, y_val) if self._df else \
np.concatenate([label_set, y_val])
elif eval_method == 'holdout' and self._state.task == 'regression':
elif self._state.task == 'regression':
if 'sample_weight' in self._state.fit_kwargs:
X_train, X_val, y_train, y_val, self._state.fit_kwargs[
'sample_weight'], self._state.weight_val = \
@ -669,16 +703,16 @@ class AutoML:
self._state.y_val = (X_train, y_train, X_val, y_val)
self._state.X_train_all = X_train_all
self._state.y_train_all = y_train_all
if hasattr(self._state, 'groups') and self._state.groups is not None:
logger.info("Using GroupKFold")
assert len(self._state.groups) == y_train_all.size, \
if self._split_type == 'group':
# logger.info("Using GroupKFold")
assert len(self._state.groups_all) == y_train_all.size, \
"the length of groups must match the number of examples"
assert len(np.unique(self._state.groups)) >= n_splits, \
assert len(np.unique(self._state.groups_all)) >= n_splits, \
"the number of groups must be equal or larger than n_splits"
self._state.kf = GroupKFold(n_splits)
self._state.kf.groups = self._state.groups
self._state.kf.groups = self._state.groups_all
elif self._split_type == "stratified":
logger.info("Using StratifiedKFold")
# logger.info("Using StratifiedKFold")
assert y_train_all.size >= n_splits, (
f"{n_splits}-fold cross validation"
f" requires input data with at least {n_splits} examples.")
@ -688,14 +722,22 @@ class AutoML:
self._state.kf = RepeatedStratifiedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
elif self._split_type == "time":
logger.info("Using TimeSeriesSplit")
# logger.info("Using TimeSeriesSplit")
if self._state.task == 'forecast':
period = self._state.fit_kwargs['period']
if period * (n_splits + 1) > y_train_all.size:
n_splits = int(y_train_all.size / period - 1)
assert n_splits >= 2, (
f"cross validation for forecasting period={period}"
f" requires input data with at least {3 * period} examples.")
logger.info(
f"Using nsplits={n_splits} due to data size limit.")
self._state.kf = TimeSeriesSplit(
n_splits=n_splits, test_size=self._state.fit_kwargs.get('period'))
n_splits=n_splits, test_size=period)
else:
self._state.kf = TimeSeriesSplit(n_splits=n_splits)
else:
logger.info("Using RepeatedKFold")
# logger.info("Using RepeatedKFold")
self._state.kf = RepeatedKFold(
n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
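A quick worked example of the forecast fold adjustment above, with illustrative numbers: 72 training samples and a forecast period of 20 cannot support the requested 5 folds, so n_splits is reduced.
```python
# illustrative numbers only
n_samples, period, n_splits = 72, 20, 5
if period * (n_splits + 1) > n_samples:      # 20 * 6 = 120 > 72
    n_splits = int(n_samples / period - 1)   # -> 2, still >= 2 so CV proceeds
```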
@ -745,7 +787,8 @@ class AutoML:
eval_method='auto',
split_ratio=SPLIT_RATIO,
n_splits=N_SPLITS,
split_type="stratified",
split_type=None,
groups=None,
n_jobs=1,
train_best=True,
train_full=False,
@ -754,31 +797,51 @@ class AutoML:
'''Retrain from log file
Args:
time_budget: A float number of the time budget in seconds
log_file_name: A string of the log file name
X_train: A numpy array of training data in shape n*m
y_train: A numpy array of labels in shape n*1
dataframe: A dataframe of training data including label column.
For 'forecast' task, dataframe must be specified and should
have two columns: timestamp and value.
label: A str of the label column name for 'classification' or
'regression' task, e.g., 'label';
or a tuple of strings for timestamp and value columns for
'forecasting' task, e.g., ('timestamp', 'value').
Note: If X_train and y_train are provided,
dataframe and label are ignored;
If not, dataframe and label must be provided.
time_budget: A float number of the time budget in seconds.
task: A string of the task type, e.g.,
'classification', 'regression'
'classification', 'regression', 'forecast', 'rank'.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout']
split_ratio: A float of the validation data percentage for holdout
n_splits: An integer of the number of folds for cross-validation
n_jobs: An integer of the number of threads for training
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
n_splits: An integer of the number of folds for cross-validation.
split_type: str or None, default=None | the data split type.
For classification tasks, valid choices are [
None, 'stratified', 'uniform', 'time']. None -> stratified.
For regression tasks, valid choices are [None, 'uniform', 'time'].
None -> uniform.
For time series forecasting, must be None or 'time'.
For ranking task, must be None or 'group'.
groups: None or array-like | Group labels (with matching length to
y_train) or group counts (with sum equal to length of y_train)
for training data.
n_jobs: An integer of the number of threads for training.
train_best: A boolean of whether to train the best config in the
time budget; if false, train the last config in the budget
time budget; if false, train the last config in the budget.
train_full: A boolean of whether to train on the full data. If true,
eval_method and sample_size in the log file will be ignored
eval_method and sample_size in the log file will be ignored.
record_id: the ID of the training log record from which the model will
be retrained. By default `record_id = -1` which means this will be
ignored. `record_id = 0` corresponds to the first trial, and
when `record_id >= 0`, `time_budget` will be ignored.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight
the searched learners, such as sample_weight.
'''
self._state.task = task
self._state.fit_kwargs = fit_kwargs
self._validate_data(X_train, y_train, dataframe, label)
self._validate_data(X_train, y_train, dataframe, label, groups=groups)
logger.info('log file name {}'.format(log_file_name))
@ -829,24 +892,17 @@ class AutoML:
# Partially copied from fit() function
# Initialize some attributes required for retrain_from_log
self._state.task = task
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
self._split_type = "time"
self._decide_split_type(split_type)
if record_id >= 0:
eval_method = 'cv'
elif eval_method == 'auto':
eval_method = self._decide_eval_method(time_budget)
self.modelcount = 0
self._prepare_data(eval_method, split_ratio, n_splits)
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs['period'])
self._state.time_budget = None
self._state.n_jobs = n_jobs
self._trained_estimator = self._state._train_with_config(
@ -854,6 +910,26 @@ class AutoML:
logger.info('retrain from log succeeded')
return training_duration
def _decide_split_type(self, split_type):
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in [None, "stratified", "uniform", "time"]
self._split_type = split_type or "stratified"
elif self._state.task == 'regression':
assert split_type in [None, "uniform", "time"]
self._split_type = split_type or "uniform"
elif self._state.task == 'forecast':
assert split_type in [None, "time"]
self._split_type = "time"
assert isinstance(self._state.fit_kwargs.get('period'), int), (
"missing a required integer 'period' for forecast.")
elif self._state.task == 'rank':
assert self._state.groups is not None, \
'groups must be specified for ranking task.'
assert split_type in [None, "group"]
self._split_type = 'group'
def _decide_eval_method(self, time_budget):
if self._state.X_val is not None:
return 'holdout'
@ -1020,7 +1096,7 @@ class AutoML:
else:
return {'pred_time': 0,
'wall_clock_time': None,
'train_loss': np.inf,
'metric_for_logging': np.inf,
'val_loss': np.inf,
'trained_estimator': None
}
@ -1065,10 +1141,11 @@ class AutoML:
X_val=None,
y_val=None,
sample_weight_val=None,
groups_val=None,
groups=None,
verbose=1,
retrain_full=True,
split_type="stratified",
split_type=None,
learner_selector='sample',
hpo_method=None,
starting_points={},
@ -1104,14 +1181,15 @@ class AutoML:
def custom_metric(
X_test, y_test, estimator, labels,
X_train, y_train, weight_test=None, weight_train=None
X_train, y_train, weight_test=None, weight_train=None,
config=None, groups_test=None, groups_train=None,
):
return metric_to_minimize, metrics_to_log
which returns a float number as the minimization objective,
and a tuple of floats or a dictionary as the metrics to log.
task: A string of the task type, e.g.,
'classification', 'regression', 'forecast'.
'classification', 'regression', 'forecast', 'rank'.
n_jobs: An integer of the number of threads for training.
log_file_name: A string of the log file name.
estimator_list: A list of strings for estimator names, or 'auto'
@ -1125,6 +1203,10 @@ class AutoML:
max_iter: An integer of the maximal number of iterations.
sample: A boolean of whether to sample the training data during
search.
ensemble: boolean or dict | default=False. Whether to perform
ensemble after search. Can be a dict with keys 'passthrough'
and 'final_estimator' to specify the passthrough and
final_estimator in the stacker.
eval_method: A string of resampling strategy, one of
['auto', 'cv', 'holdout'].
split_ratio: A float of the validation data percentage for holdout.
@ -1144,9 +1226,13 @@ class AutoML:
X_val: None or a numpy array or a pandas dataframe of validation data.
y_val: None or a numpy array or a pandas series of validation labels.
sample_weight_val: None or a numpy array of the sample weight of
validation data.
groups: None or an array-like of shape (n,) | Group labels for the
samples used while splitting the dataset into train/valid set.
validation data of the same shape as y_val.
groups_val: None or array-like | group labels (with matching length
to y_val) or group counts (with sum equal to length of y_val)
for validation data. Need to be consistent with groups.
groups: None or array-like | Group labels (with matching length to
y_train) or group counts (with sum equal to length of y_train)
for training data.
verbose: int, default=1 | Controls the verbosity, higher means more
messages.
retrain_full: bool or str, default=True | whether to retrain the
@ -1154,6 +1240,13 @@ class AutoML:
True - retrain only after search finishes; False - no retraining;
'budget' - do best effort to retrain without violating the time
budget.
split_type: str or None, default=None | the data split type.
For classification tasks, valid choices are [
None, 'stratified', 'uniform', 'time']. None -> stratified.
For regression tasks, valid choices are [None, 'uniform', 'time'].
None -> uniform.
For time series forecasting, must be None or 'time'.
For ranking task, must be None or 'group'.
hpo_method: str or None, default=None | The hyperparameter
optimization method. When it is None, CFO is used.
No need to set when using flaml's default search space or using
@ -1182,9 +1275,9 @@ class AutoML:
self._state.log_training_metric = log_training_metric
self._state.fit_kwargs = fit_kwargs
self._state.weight_val = sample_weight_val
self._state.groups = groups
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val,
groups_val, groups)
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
if seed is not None:
@ -1194,24 +1287,7 @@ class AutoML:
self.verbose = verbose
if verbose == 0:
logger.setLevel(logging.WARNING)
if self._state.task == 'classification':
self._state.task = get_classification_objective(
len(np.unique(self._y_train_all)))
assert split_type in ["stratified", "uniform", "time"]
self._split_type = split_type
elif self._state.task == 'regression':
if split_type in ["uniform", "time"]:
self._split_type = split_type
else:
self._split_type = "uniform"
elif self._state.task == 'forecast':
if split_type is not None and split_type != 'time':
raise ValueError(
"split_type must be 'time' when task is 'forecast'.")
self._split_type = "time"
if self._state.fit_kwargs.get('period') is None:
raise TypeError(
"missing 1 required argument for 'forecast' task: 'period'.")
self._decide_split_type(split_type)
if eval_method == 'auto' or self._state.X_val is not None:
eval_method = self._decide_eval_method(time_budget)
self._state.eval_method = eval_method
@ -1227,12 +1303,8 @@ class AutoML:
self._retrain_final = retrain_full is True and (
eval_method == 'holdout' and self._state.X_val is None) or (
eval_method == 'cv')
if self._state.task != 'forecast':
self._prepare_data(eval_method, split_ratio, n_splits)
else:
self._prepare_data(eval_method, split_ratio, n_splits,
period=self._state.fit_kwargs['period'])
self._sample = sample and eval_method != 'cv' and (
self._prepare_data(eval_method, split_ratio, n_splits)
self._sample = sample and task != 'rank' and eval_method != 'cv' and (
MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
if 'auto' == metric:
if 'binary' in self._state.task:
@ -1241,11 +1313,13 @@ class AutoML:
metric = 'log_loss'
elif self._state.task == 'forecast':
metric = 'mape'
elif self._state.task == 'rank':
metric = 'ndcg'
else:
metric = 'r2'
self._state.metric = metric
if metric in ['r2', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
'f1', 'ap', 'micro_f1', 'macro_f1']:
'f1', 'ap', 'micro_f1', 'macro_f1', 'ndcg']:
error_metric = f"1-{metric}"
elif isinstance(metric, str):
error_metric = metric
@ -1256,6 +1330,8 @@ class AutoML:
if 'auto' == estimator_list:
if self._state.task == 'forecast':
estimator_list = ['fbprophet', 'arima', 'sarimax']
elif self._state.task == 'rank':
estimator_list = ['lgbm', 'xgboost']
else:
estimator_list = [
'lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
@ -1278,7 +1354,9 @@ class AutoML:
logger.info("List of ML learners in AutoML Run: {}".format(
estimator_list))
self.estimator_list = estimator_list
self._hpo_method = hpo_method or 'cfo'
self._hpo_method = hpo_method or (
'cfo' if n_concurrent_trials == 1 or len(estimator_list) == 1
else 'bs')
self._state.time_budget = time_budget
self._active_estimators = estimator_list.copy()
self._ensemble = ensemble
@ -1315,7 +1393,8 @@ class AutoML:
del self._X_train_all, self._y_train_all, self._state.kf
del self._state.X_train, self._state.X_train_all, self._state.X_val
del self._state.y_train, self._state.y_train_all, self._state.y_val
del self._sample_weight_full, self._state.fit_kwargs, self._state.groups
del self._sample_weight_full, self._state.fit_kwargs
del self._state.groups, self._state.groups_all, self._state.groups_val
for state in self._search_states.values():
if state.trained_estimator:
del state.trained_estimator
@ -1363,8 +1442,7 @@ class AutoML:
del p[k]
search_alg = SearchAlgo(max_concurrent=self._n_concurrent_trials,
points_to_evaluate=points_to_evaluate
)
points_to_evaluate=points_to_evaluate)
else:
search_alg = SearchAlgo(
metric='val_loss',
@ -1387,7 +1465,8 @@ class AutoML:
analysis = ray.tune.run(
self.trainable, search_alg=search_alg, config=self.search_space,
metric='val_loss', mode='min', resources_per_trial=resources_per_trial,
time_budget_s=self._state.time_budget, num_samples=self._max_iter)
time_budget_s=self._state.time_budget, num_samples=self._max_iter,
verbose=self.verbose)
# logger.info([trial.last_result for trial in analysis.trials])
trials = sorted((trial for trial in analysis.trials if trial.last_result
and trial.last_result['wall_clock_time'] is not None),
@ -1421,7 +1500,7 @@ class AutoML:
if (better or self._log_type == 'all') and self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
@ -1591,7 +1670,7 @@ class AutoML:
if self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.metric_for_logging,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
@ -1604,8 +1683,8 @@ class AutoML:
with mlflow.start_run(nested=True):
mlflow.log_metric('iter_counter',
self._iter_per_learner[estimator])
mlflow.log_param('train_loss',
search_state.train_loss)
mlflow.log_param('metric_for_logging',
search_state.metric_for_logging)
mlflow.log_metric('trial_time',
search_state.trial_time)
mlflow.log_metric('wall_clock_time',
@ -1702,7 +1781,9 @@ class AutoML:
for search_state in self._search_states.values())
if self._trained_estimator:
logger.info(f'selected model: {self._trained_estimator.model}')
if self._ensemble:
if self._ensemble and self._state.task in (
'binary:logistic', 'multi:softmax', 'regression',
):
search_states = list(x for x in self._search_states.items()
if x[1].trained_estimator)
search_states.sort(key=lambda x: x[1].best_loss)
@ -1714,15 +1795,20 @@ class AutoML:
logger.info(estimators)
if len(estimators) <= 1:
return
if self._state.task != "regression":
if self._state.task in ('binary:logistic', 'multi:softmax'):
from sklearn.ensemble import StackingClassifier as Stacker
for e in estimators:
e[1]._estimator_type = 'classifier'
else:
from sklearn.ensemble import StackingRegressor as Stacker
best_m = self._trained_estimator
stacker = Stacker(estimators, best_m, n_jobs=self._state.n_jobs,
passthrough=True)
if isinstance(self._ensemble, dict):
final_estimator = self._ensemble.get(
'final_estimator', self._trained_estimator)
passthrough = self._ensemble.get('passthrough', True)
else:
final_estimator = self._trained_estimator
passthrough = True
stacker = Stacker(
estimators, final_estimator, n_jobs=self._state.n_jobs,
passthrough=passthrough)
if self._sample_weight_full is not None:
self._state.fit_kwargs[
'sample_weight'] = self._sample_weight_full
@ -1734,9 +1820,11 @@ class AutoML:
elif self._retrain_final:
# reset time budget for retraining
self._state.time_from_start -= self._state.time_budget
if (self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)) \
and self._selected.best_config_sample_size == self._state.data_size:
if self._state.task == 'forecast' or (
self._state.time_budget - self._state.time_from_start
> self._selected.est_retrain_time(self.data_size_full)
and self._selected.best_config_sample_size == self._state.data_size
):
self._trained_estimator, \
retrain_time = self._state._train_with_config(
self._best_estimator,
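For reference, a hedged usage sketch of the dict form of `ensemble` documented earlier in this file (the dataset and final estimator choice here are illustrative assumptions):
```python
from flaml import AutoML
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
automl = AutoML()
automl.fit(
    X, y, task='classification', time_budget=60,
    # dict form: pick the stacker's final_estimator and whether the original
    # features are passed through to it (defaults: best searched model, True)
    ensemble={'final_estimator': LogisticRegression(), 'passthrough': False},
)
```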


@ -146,7 +146,7 @@ def get_output_from_log(filename, time_budget):
config = record.config
learner = record.learner.split('_')[0]
sample_size = record.sample_size
train_loss = record.logged_metric
metric = record.logged_metric
if time_used < time_budget and np.isfinite(val_loss):
if val_loss < best_val_loss:
@ -156,7 +156,7 @@ def get_output_from_log(filename, time_budget):
best_config_list.append(best_config)
search_time_list.append(time_used)
best_error_list.append(best_val_loss)
logged_metric_list.append(train_loss)
logged_metric_list.append(metric)
error_list.append(val_loss)
config_list.append({"Current Learner": learner,
"Current Sample": sample_size,
@ -242,8 +242,12 @@ class DataTransformer:
X[cat_columns] = X[cat_columns].astype('category')
if num_columns:
X_num = X[num_columns]
if drop and np.issubdtype(X_num.columns.dtype, np.integer):
if np.issubdtype(X_num.columns.dtype, np.integer) and (
drop or min(X_num.columns) != 0
or max(X_num.columns) != X_num.shape[1] - 1
):
X_num.columns = range(X_num.shape[1])
drop = True
else:
drop = False
from sklearn.impute import SimpleImputer
@ -257,12 +261,12 @@ class DataTransformer:
cat_columns, num_columns, datetime_columns
self._drop = drop
if task == 'regression':
self.label_transformer = None
else:
if task in ('binary:logistic', 'multi:softmax'):
from sklearn.preprocessing import LabelEncoder
self.label_transformer = LabelEncoder()
y = self.label_transformer.fit_transform(y)
else:
self.label_transformer = None
return X, y
def transform(self, X):
@ -302,3 +306,8 @@ class DataTransformer:
X_num.columns = range(X_num.shape[1])
X[num_columns] = self.transformer.transform(X_num)
return X
def group_counts(groups):
_, i, c = np.unique(groups, return_counts=True, return_index=True)
return c[np.argsort(i)]
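A small illustrative check of the new `group_counts` helper; it returns counts in order of first appearance of each group label (example values are made up):
```python
import numpy as np
from flaml.data import group_counts

print(group_counts(np.array(['b', 'b', 'a', 'a', 'a', 'c'])))  # -> [2 3 1]
```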


@ -4,17 +4,17 @@
'''
import time
from joblib.externals.cloudpickle.cloudpickle import instance
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, \
accuracy_score, mean_absolute_error, log_loss, average_precision_score, \
f1_score, mean_absolute_percentage_error
f1_score, mean_absolute_percentage_error, ndcg_score
from sklearn.model_selection import RepeatedStratifiedKFold, GroupKFold, TimeSeriesSplit
from .model import (
XGBoostEstimator, XGBoostSklearnEstimator, RandomForestEstimator,
LGBMEstimator, LRL1Classifier, LRL2Classifier, CatBoostEstimator,
ExtraTreeEstimator, KNeighborsEstimator, FBProphet, ARIMA, SARIMAX)
from .data import group_counts
import logging
logger = logging.getLogger(__name__)
@ -56,26 +56,29 @@ def get_estimator_class(task, estimator_name):
def sklearn_metric_loss_score(
metric_name, y_predict, y_true, labels=None, sample_weight=None
metric_name, y_predict, y_true, labels=None, sample_weight=None,
groups=None,
):
'''Loss using the specified metric
Args:
metric_name: A string of the metric name, one of
'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'micro_f1', 'macro_f1'
'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg',
'micro_f1', 'macro_f1'.
y_predict: A 1d or 2d numpy array of the predictions which can be
used to calculate the metric. E.g., 2d for log_loss and 1d
for others.
y_true: A 1d numpy array of the true labels
labels: A 1d numpy array of the unique labels
sample_weight: A 1d numpy array of the sample weight
y_true: A 1d numpy array of the true labels.
labels: A 1d numpy array of the unique labels.
sample_weight: A 1d numpy array of the sample weight.
groups: A 1d numpy array of the group labels.
Returns:
score: A float number of the loss, the lower the better
score: A float number of the loss, the lower the better.
'''
metric_name = metric_name.lower()
if 'r2' in metric_name:
if 'r2' == metric_name:
score = 1.0 - r2_score(y_true, y_predict, sample_weight=sample_weight)
elif metric_name == 'rmse':
score = np.sqrt(mean_squared_error(
@ -98,26 +101,40 @@ def sklearn_metric_loss_score(
elif metric_name == 'roc_auc_ovo':
score = 1.0 - roc_auc_score(
y_true, y_predict, sample_weight=sample_weight, multi_class='ovo')
elif 'log_loss' in metric_name:
elif 'log_loss' == metric_name:
score = log_loss(
y_true, y_predict, labels=labels, sample_weight=sample_weight)
elif 'mape' in metric_name:
elif 'mape' == metric_name:
try:
score = mean_absolute_percentage_error(
y_true, y_predict)
except ValueError:
return np.inf
elif 'micro_f1' in metric_name:
elif 'micro_f1' == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='micro')
elif 'macro_f1' in metric_name:
elif 'macro_f1' == metric_name:
score = 1 - f1_score(
y_true, y_predict, sample_weight=sample_weight, average='macro')
elif 'f1' in metric_name:
elif 'f1' == metric_name:
score = 1 - f1_score(y_true, y_predict, sample_weight=sample_weight)
elif 'ap' in metric_name:
elif 'ap' == metric_name:
score = 1 - average_precision_score(
y_true, y_predict, sample_weight=sample_weight)
elif 'ndcg' in metric_name:
if '@' in metric_name:
k = int(metric_name.split('@', 1)[-1])
counts = group_counts(groups)
score = 0
psum = 0
for c in counts:
score -= ndcg_score(np.asarray([y_true[psum:psum + c]]),
np.asarray([y_predict[psum:psum + c]]), k=k)
psum += c
score /= len(counts)
score += 1
else:
score = 1 - ndcg_score([y_true], [y_predict])
else:
raise ValueError(
metric_name + ' is not a built-in metric, '
@ -128,92 +145,60 @@ def sklearn_metric_loss_score(
return score
def get_y_pred(estimator, X, eval_metric, obj, freq=None):
def get_y_pred(estimator, X, eval_metric, obj):
if eval_metric in ['roc_auc', 'ap'] and 'binary' in obj:
y_pred_classes = estimator.predict_proba(X)
y_pred = y_pred_classes[
:, 1] if y_pred_classes.ndim > 1 else y_pred_classes
elif eval_metric in ['log_loss', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo']:
y_pred = estimator.predict_proba(X)
elif eval_metric == 'mape':
y_pred = estimator.predict(X, freq=freq)
else:
y_pred = estimator.predict(X)
return y_pred
def get_test_loss(
estimator, X_train, y_train, X_test, y_test, weight_test,
eval_metric, obj, labels=None, budget=None, log_training_metric=False, fit_kwargs={}
):
def get_test_loss(config, estimator, X_train, y_train, X_test, y_test, weight_test,
groups_test, eval_metric, obj, labels=None, budget=None,
log_training_metric=False, fit_kwargs={}):
start = time.time()
# if groups_test is not None:
# fit_kwargs['groups_val'] = groups_test
# fit_kwargs['X_val'] = X_test
# fit_kwargs['y_val'] = y_test
estimator.fit(X_train, y_train, budget, **fit_kwargs)
if isinstance(eval_metric, str):
pred_start = time.time()
test_pred_y = get_y_pred(estimator, X_test, eval_metric, obj)
pred_time = (time.time() - pred_start) / X_test.shape[0]
test_loss = sklearn_metric_loss_score(eval_metric, test_pred_y, y_test,
labels, weight_test)
labels, weight_test, groups_test)
if log_training_metric:
test_pred_y = get_y_pred(estimator, X_train, eval_metric, obj)
train_loss = sklearn_metric_loss_score(
eval_metric, test_pred_y,
y_train, labels, fit_kwargs.get('sample_weight'))
metric_for_logging = sklearn_metric_loss_score(
eval_metric, test_pred_y, y_train, labels,
fit_kwargs.get('sample_weight'), fit_kwargs.get('groups'))
else:
train_loss = None
metric_for_logging = None
else: # customized metric function
test_loss, metrics = eval_metric(
X_test, y_test, estimator, labels, X_train, y_train,
weight_test, fit_kwargs.get('sample_weight'))
X_test, y_test, estimator, labels, X_train, y_train, weight_test,
fit_kwargs.get('sample_weight'), config, groups_test,
fit_kwargs.get('groups'))
if isinstance(metrics, dict):
pred_time = metrics.get('pred_time', 0)
train_loss = metrics
metric_for_logging = metrics
train_time = time.time() - start
return test_loss, train_time, train_loss, pred_time
return test_loss, metric_for_logging, train_time, pred_time
def train_model(estimator, X_train, y_train, budget, fit_kwargs={}):
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
return train_time
def evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val,
budget, kf, task, eval_method, eval_metric, best_val_loss, log_training_metric=False,
fit_kwargs={}
):
if 'holdout' in eval_method:
val_loss, train_loss, train_time, pred_time = evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val, weight_val, budget,
task, eval_metric, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
else:
val_loss, train_loss, train_time, pred_time = evaluate_model_CV(
estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time, pred_time
def evaluate_model_holdout(
estimator, X_train, y_train, X_val, y_val,
weight_val, budget, task, eval_metric, log_training_metric=False,
fit_kwargs={}
):
val_loss, train_time, train_loss, pred_time = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val, eval_metric,
task, budget=budget, log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
return val_loss, train_loss, train_time, pred_time
def evaluate_model_CV(
estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss, log_training_metric=False, fit_kwargs={}
):
def evaluate_model_CV(config, estimator, X_train_all, y_train_all, budget, kf,
task, eval_metric, best_val_loss,
log_training_metric=False, fit_kwargs={}):
start_time = time.time()
total_val_loss = 0
total_train_loss = None
train_loss = None
total_metric = None
metric = None
train_time = pred_time = 0
valid_fold_num = total_fold_num = 0
n = kf.get_n_splits()
@ -222,15 +207,19 @@ def evaluate_model_CV(
labels = np.unique(y_train_all)
else:
labels = None
groups = None
shuffle = True
if isinstance(kf, RepeatedStratifiedKFold):
kf = kf.split(X_train_split, y_train_split)
elif isinstance(kf, GroupKFold):
kf = kf.split(X_train_split, y_train_split, kf.groups)
groups = kf.groups
kf = kf.split(X_train_split, y_train_split, groups)
shuffle = False
elif isinstance(kf, TimeSeriesSplit) and task == 'forecast':
y_train_all = pd.DataFrame(y_train_all, columns=['y'])
train = X_train_all.join(y_train_all)
kf = kf.split(train)
shuffle = False
elif isinstance(kf, TimeSeriesSplit):
kf = kf.split(X_train_split, y_train_split)
else:
@ -244,7 +233,7 @@ def evaluate_model_CV(
else:
weight = weight_val = None
for train_index, val_index in kf:
if not isinstance(kf, TimeSeriesSplit):
if shuffle:
train_index = rng.permutation(train_index)
if isinstance(X_train_all, pd.DataFrame):
X_train, X_val = X_train_split.iloc[
@ -252,19 +241,19 @@ def evaluate_model_CV(
else:
X_train, X_val = X_train_split[
train_index], X_train_split[val_index]
if isinstance(y_train_all, pd.Series):
y_train, y_val = y_train_split.iloc[
train_index], y_train_split.iloc[val_index]
else:
y_train, y_val = y_train_split[
train_index], y_train_split[val_index]
y_train, y_val = y_train_split[train_index], y_train_split[val_index]
estimator.cleanup()
if weight is not None:
fit_kwargs['sample_weight'], weight_val = weight[
train_index], weight[val_index]
val_loss_i, train_time_i, train_loss_i, pred_time_i = get_test_loss(
estimator, X_train, y_train, X_val, y_val, weight_val,
eval_metric, task, labels, budget_per_train,
if groups is not None:
fit_kwargs['groups'] = groups[train_index]
groups_val = groups[val_index]
else:
groups_val = None
val_loss_i, metric_i, train_time_i, pred_time_i = get_test_loss(
config, estimator, X_train, y_train, X_val, y_val, weight_val,
groups_val, eval_metric, task, labels, budget_per_train,
log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
if weight is not None:
fit_kwargs['sample_weight'] = weight
@ -272,16 +261,16 @@ def evaluate_model_CV(
total_fold_num += 1
total_val_loss += val_loss_i
if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_train_loss, list):
total_train_loss = [
total_train_loss[i] + v for i, v in enumerate(train_loss_i)]
elif isinstance(total_train_loss, dict):
total_train_loss = {
k: total_train_loss[k] + v for k, v in train_loss_i.items()}
elif total_train_loss is not None:
total_train_loss += train_loss_i
if isinstance(total_metric, list):
total_metric = [
total_metric[i] + v for i, v in enumerate(metric_i)]
elif isinstance(total_metric, dict):
total_metric = {
k: total_metric[k] + v for k, v in metric_i.items()}
elif total_metric is not None:
total_metric += metric_i
else:
total_train_loss = train_loss_i
total_metric = metric_i
train_time += train_time_i
pred_time += pred_time_i
if valid_fold_num == n:
@ -293,22 +282,22 @@ def evaluate_model_CV(
val_loss = np.max(val_loss_list)
n = total_fold_num
if log_training_metric or not isinstance(eval_metric, str):
if isinstance(total_train_loss, list):
train_loss = [v / n for v in total_train_loss]
elif isinstance(total_train_loss, dict):
train_loss = {k: v / n for k, v in total_train_loss.items()}
if isinstance(total_metric, list):
metric = [v / n for v in total_metric]
elif isinstance(total_metric, dict):
metric = {k: v / n for k, v in total_metric.items()}
else:
train_loss = total_train_loss / n
metric = total_metric / n
pred_time /= n
# budget -= time.time() - start_time
# if val_loss < best_val_loss and budget > budget_per_train:
# estimator.cleanup()
# estimator.fit(X_train_all, y_train_all, budget, **fit_kwargs)
return val_loss, train_loss, train_time, pred_time
return val_loss, metric, train_time, pred_time
def compute_estimator(
X_train, y_train, X_val, y_val, weight_val, budget, kf,
X_train, y_train, X_val, y_val, weight_val, groups_val, budget, kf,
config_dic, task, estimator_name, eval_method, eval_metric,
best_val_loss=np.Inf, n_jobs=1, estimator_class=None, log_training_metric=False,
fit_kwargs={}
@ -317,11 +306,17 @@ def compute_estimator(
task, estimator_name)
estimator = estimator_class(
**config_dic, task=task, n_jobs=n_jobs)
val_loss, train_loss, train_time, pred_time = evaluate_model(
estimator, X_train, y_train, X_val, y_val, weight_val, budget, kf, task,
eval_method, eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
return estimator, val_loss, train_loss, train_time, pred_time
if 'holdout' in eval_method:
val_loss, metric_for_logging, train_time, pred_time = get_test_loss(
config_dic, estimator, X_train, y_train, X_val, y_val, weight_val,
groups_val, eval_metric, task, budget=budget,
log_training_metric=log_training_metric, fit_kwargs=fit_kwargs)
else:
val_loss, metric_for_logging, train_time, pred_time = evaluate_model_CV(
config_dic, estimator, X_train, y_train, budget, kf, task,
eval_metric, best_val_loss, log_training_metric=log_training_metric,
fit_kwargs=fit_kwargs)
return estimator, val_loss, metric_for_logging, train_time, pred_time
def train_estimator(
@ -333,8 +328,7 @@ def train_estimator(
task, estimator_name)
estimator = estimator_class(**config_dic, task=task, n_jobs=n_jobs)
if X_train is not None:
train_time = train_model(
estimator, X_train, y_train, budget, fit_kwargs)
train_time = estimator.fit(X_train, y_train, budget, **fit_kwargs)
else:
estimator = estimator.estimator_class(**estimator.params)
train_time = time.time() - start_time
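To illustrate the new grouped `ndcg@k` branch of `sklearn_metric_loss_score`, a minimal sketch with made-up relevance labels and scores for two query groups of three items each:
```python
import numpy as np
from flaml.ml import sklearn_metric_loss_score

y_true = np.array([3, 2, 0, 1, 0, 2])               # relevance labels
y_pred = np.array([0.9, 0.1, 0.2, 0.3, 0.2, 0.8])   # model scores
groups = np.array([0, 0, 0, 1, 1, 1])               # two query groups
# returns 1 - mean NDCG@2 over the groups (lower is better)
loss = sklearn_metric_loss_score('ndcg@2', y_pred, y_true, groups=groups)
print(loss)
```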


@ -3,16 +3,18 @@
* Licensed under the MIT License.
'''
import warnings
import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
from lightgbm import LGBMClassifier, LGBMRegressor, LGBMRanker
from scipy.sparse import issparse
import pandas as pd
from . import tune
from .data import group_counts
import logging
@ -45,8 +47,8 @@ class BaseEstimator:
self._estimator_type = params['_estimator_type']
del self.params['_estimator_type']
else:
self._estimator_type = "regressor" if task == 'regression' \
else "classifier"
self._estimator_type = "classifier" if task in (
'binary:logistic', 'multi:softmax') else "regressor"
def get_params(self, deep=False):
params = self.params.copy()
@ -81,6 +83,18 @@ class BaseEstimator:
def _fit(self, X_train, y_train, **kwargs):
current_time = time.time()
if 'groups' in kwargs:
kwargs = kwargs.copy()
if self._task == 'rank':
kwargs['group'] = group_counts(kwargs['groups'])
# groups_val = kwargs.get('groups_val')
# if groups_val is not None:
# kwargs['eval_group'] = [group_counts(groups_val)]
# kwargs['eval_set'] = [
# (kwargs['X_val'], kwargs['y_val'])]
# kwargs['verbose'] = False
# del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val']
del kwargs['groups']
X_train = self._preprocess(X_train)
model = self.estimator_class(**self.params)
model.fit(X_train, y_train, **kwargs)
@ -255,12 +269,14 @@ class LGBMEstimator(BaseEstimator):
if "objective" not in self.params:
# Default: regression for LGBMRegressor,
# binary or multiclass for LGBMClassifier
if 'regression' in task:
if 'regression' == task:
objective = 'regression'
elif 'binary' in task:
objective = 'binary'
elif 'multi' in task:
objective = 'multiclass'
elif 'rank' == task:
objective = 'lambdarank'
else:
objective = 'regression'
self.params["objective"] = objective
@ -276,8 +292,10 @@ class LGBMEstimator(BaseEstimator):
self.params['verbose'] = -1
# if "subsample_freq" not in self.params:
# self.params['subsample_freq'] = 1
if 'regression' in task:
if 'regression' == task:
self.estimator_class = LGBMRegressor
elif 'rank' == task:
self.estimator_class = LGBMRanker
else:
self.estimator_class = LGBMClassifier
self._time_per_iter = None
@ -488,8 +506,10 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
'use_label_encoder': params.get('use_label_encoder', False),
})
if 'regression' in task:
if 'regression' == task:
self.estimator_class = xgb.XGBRegressor
elif 'rank' == task:
self.estimator_class = xgb.XGBRanker
else:
self.estimator_class = xgb.XGBClassifier
self._time_per_iter = None
@ -716,7 +736,9 @@ class CatBoostEstimator(BaseEstimator):
return params
def fit(self, X_train, y_train, budget=None, **kwargs):
import shutil
start_time = time.time()
train_dir = f'catboost_{str(start_time)}'
n_iter = self.params["n_estimators"]
X_train = self._preprocess(X_train)
if isinstance(X_train, pd.DataFrame):
@ -730,16 +752,19 @@ class CatBoostEstimator(BaseEstimator):
CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel = self.estimator_class(
train_dir=train_dir, **self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._t1 = time.time() - start_time
if CatBoostEstimator._t1 >= budget:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
shutil.rmtree(train_dir, ignore_errors=True)
return CatBoostEstimator._t1
self.params["n_estimators"] = 4
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel = self.estimator_class(
train_dir=train_dir, **self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._time_per_iter = (
@ -752,6 +777,7 @@ class CatBoostEstimator(BaseEstimator):
"n_estimators"]:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
shutil.rmtree(train_dir, ignore_errors=True)
return time.time() - start_time
if budget:
train_times = 1
@ -769,13 +795,14 @@ class CatBoostEstimator(BaseEstimator):
else:
weight = None
from catboost import Pool
model = self.estimator_class(**self.params)
model = self.estimator_class(train_dir=train_dir, **self.params)
model.fit(
X_tr, y_tr, cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:],
cat_features=cat_features),
**kwargs) # model.get_best_iteration()
shutil.rmtree(train_dir, ignore_errors=True)
if weight is not None:
kwargs['sample_weight'] = weight
self._model = model
@ -862,44 +889,43 @@ class FBProphet(BaseEstimator):
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
def __init__(self, task='forecast', **params):
if 'n_jobs' in params:
params.pop('n_jobs')
super().__init__(task, **params)
def _join(self, X_train, y_train):
assert 'ds' in X_train, (
'Dataframe for training forecast model must have column'
' "ds" with the dates in X_train.')
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
return train_df
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
def fit(self, X_train, y_train, budget=None, **kwargs):
from prophet import Prophet
current_time = time.time()
train_df = self._join(X_train, y_train)
model = Prophet(**self.params).fit(train_df)
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
def predict(self, X_test):
if isinstance(X_test, int):
raise ValueError(
"predict() with steps is only supported for arima/sarimax."
" For FBProphet, pass a dataframe with a date colum named ds.")
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
future = self._model.make_future_dataframe(periods=X_test, freq=freq)
forecast = self._model.predict(future)
elif isinstance(X_test, pd.DataFrame):
forecast = self._model.predict(X_test)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds) or"
"X_test(int number of periods)+freq are required.")
forecast = self._model.predict(X_test)
return forecast['yhat']
else:
warnings.warn(
"Estimator is not fit yet. Please run fit() before predict().")
return np.ones(X_test.shape[0])
class ARIMA(BaseEstimator):
class ARIMA(FBProphet):
@classmethod
def search_space(cls, **params):
space = {
@ -921,55 +947,45 @@ class ARIMA(BaseEstimator):
}
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
def _join(self, X_train, y_train):
train_df = super()._join(X_train, y_train)
train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)
return train_df
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
def fit(self, X_train, y_train, budget=None, **kwargs):
from statsmodels.tsa.arima.model import ARIMA as ARIMA_estimator
import warnings
warnings.filterwarnings("ignore")
current_time = time.time()
model = ARIMA_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
enforce_stationarity=False,
enforce_invertibility=False)
train_df = self._join(X_train, y_train)
model = ARIMA_estimator(
train_df, order=(
self.params['p'], self.params['d'], self.params['q']),
enforce_stationarity=False, enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
def predict(self, X_test):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
if isinstance(X_test, int):
forecast = self._model.forecast(steps=X_test)
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
start = X_test.iloc[0, 0]
end = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start, end=end)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds) or"
"X_test(int number of periods)+freq are required.")
"X_test needs to be either a pd.Dataframe with dates as column ds)"
" or an int number of periods for predict().")
return forecast
else:
return np.ones(X_test.shape[0])
return np.ones(X_test if isinstance(X_test, int)
else X_test.shape[0])
class SARIMAX(BaseEstimator):
class SARIMAX(ARIMA):
@classmethod
def search_space(cls, **params):
space = {
@ -1011,47 +1027,17 @@ class SARIMAX(BaseEstimator):
return space
def fit(self, X_train, y_train, budget=None, **kwargs):
y_train = pd.DataFrame(y_train, columns=['y'])
train_df = X_train.join(y_train)
if ('ds' not in train_df) or ('y' not in train_df):
raise ValueError(
'Dataframe for training forecast model must have columns "ds" and "y" with the dates and '
'values respectively.'
)
train_df.index = pd.to_datetime(train_df['ds'])
train_df = train_df.drop('ds', axis=1)
if 'n_jobs' in self.params:
self.params.pop('n_jobs')
from statsmodels.tsa.statespace.sarimax import SARIMAX as SARIMAX_estimator
current_time = time.time()
model = SARIMAX_estimator(train_df,
order=(self.params['p'], self.params['d'], self.params['q']),
seasonal_order=(self.params['P'], self.params['D'], self.params['Q'], self.params['s']),
enforce_stationarity=False,
enforce_invertibility=False)
train_df = self._join(X_train, y_train)
model = SARIMAX_estimator(
train_df, order=(
self.params['p'], self.params['d'], self.params['q']),
seasonal_order=(
self.params['P'], self.params['D'], self.params['Q'],
self.params['s']),
enforce_stationarity=False, enforce_invertibility=False)
model = model.fit()
train_time = time.time() - current_time
self._model = model
return train_time
def predict(self, X_test, freq=None):
if self._model is not None:
if isinstance(X_test, int) and freq is not None:
forecast = self._model.forecast(steps=X_test).to_frame().reset_index()
elif isinstance(X_test, pd.DataFrame):
start_date = X_test.iloc[0, 0]
end_date = X_test.iloc[-1, 0]
forecast = self._model.predict(start=start_date, end=end_date)
else:
raise ValueError(
"either X_test(pd.Dataframe with dates for predictions, column ds)"
"or X_test(int number of periods)+freq are required.")
return forecast
else:
return np.ones(X_test.shape[0])

View File

@ -8,19 +8,20 @@ import numpy as np
import time
import pickle
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest import Searcher
from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch
from ray.tune.utils.util import flatten_dict
from ray.tune.utils.util import unflatten_dict
except (ImportError, AssertionError):
from .suggestion import Searcher
from .suggestion import OptunaSearch as GlobalSearch
from .variant_generator import flatten_dict
from ..tune.trial import unflatten_dict
from .search_thread import SearchThread
from .flow2 import FLOW2
from ..tune.space import add_cost_to_space, normalize # TODO: , define_by_run_func
from ..tune.space import add_cost_to_space, indexof, normalize, define_by_run_func
import logging
logger = logging.getLogger(__name__)
@ -133,9 +134,8 @@ class BlendSearch(Searcher):
if global_search_alg is not None:
self._gs = global_search_alg
elif getattr(self, '__name__', None) != 'CFO':
gs_space = space
# TODO: when define_by_run is supported
# gs_space = define_by_run_func(space)
from functools import partial
gs_space = partial(define_by_run_func, space=space)
try:
gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
if experimental:
@ -198,7 +198,10 @@ class BlendSearch(Searcher):
# reset search when metric or mode changed
self._ls.set_search_properties(metric, mode)
if self._gs is not None:
self._gs.set_search_properties(metric, mode)
self._gs = GlobalSearch(
space=self._gs._space, metric=metric, mode=mode,
sampler=self._gs._sampler)
self._gs.space = self._ls.space
self._init_search()
if config:
if 'time_budget_s' in config:
@ -312,9 +315,11 @@ class BlendSearch(Searcher):
self._expand_admissible_region(
self._ls_bound_min, self._ls_bound_max,
self._subspace.get(trial_id, self._ls.space))
if self._gs is not None and self._experimental:
# TODO: key match for hierarchical space
self._gs.add_evaluated_point(flatten_dict(config), objective)
# if self._gs is not None and self._experimental:
# # TODO: recover when supported
# converted = convert_key(config, self._gs.space)
# logger.info(converted)
# self._gs.add_evaluated_point(converted, objective)
elif metric_constraint_satisfied and self._create_condition(
result):
# thread creator
@ -339,7 +344,6 @@ class BlendSearch(Searcher):
del self._subspace[trial_id]
def _create_thread(self, config, result, space):
# logger.info(f"create local search thread from {config}")
self._search_thread_pool[self._thread_count] = SearchThread(
self._ls.mode,
self._ls.create(
@ -349,26 +353,29 @@ class BlendSearch(Searcher):
)
self._thread_count += 1
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max, space)
unflatten_dict(config), self._ls_bound_min, self._ls_bound_max, space,
self._ls.space)
def _update_admissible_region(
self, config, admissible_min, admissible_max, space: Dict = {}
self, config, admissible_min, admissible_max, subspace: Dict = {},
space: Dict = {}
):
# update admissible region
normalized_config = normalize(config, space, config, {})
normalized_config = normalize(config, subspace, config, {})
for key in admissible_min:
value = normalized_config[key]
if isinstance(admissible_max[key], list):
choice = space[key]['_choice_']
domain = space[key]
choice = indexof(domain, value)
self._update_admissible_region(
value,
admissible_min[key][choice], admissible_max[key][choice],
space[key]
subspace[key], domain[choice]
)
elif isinstance(value, dict):
self._update_admissible_region(
value,
admissible_min[key], admissible_max[key], space[key])
value, admissible_min[key], admissible_max[key],
subspace[key], space[key])
else:
if value > admissible_max[key]:
admissible_max[key] = value
@ -514,7 +521,8 @@ class BlendSearch(Searcher):
return None
use_rs = 1
if choice or self._valid(
config, space, self._gs_admissible_min, self._gs_admissible_max):
config, self._ls.space, space, self._gs_admissible_min,
self._gs_admissible_max):
# LS or valid or no backup choice
self._trial_proposed_by[trial_id] = choice
self._search_thread_pool[choice].running += use_rs
@ -542,10 +550,11 @@ class BlendSearch(Searcher):
# temporarily relax admissible region for parallel proposals
self._update_admissible_region(
config, self._gs_admissible_min, self._gs_admissible_max,
space)
space, self._ls.space)
else:
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max, space)
config, self._ls_bound_min, self._ls_bound_max, space,
self._ls.space)
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
signature = self._ls.config_signature(config, space)
@ -632,11 +641,6 @@ class BlendSearch(Searcher):
top_thread_id = backup_thread_id = 0
priority1 = priority2 = self._search_thread_pool[0].priority
for thread_id, thread in self._search_thread_pool.items():
# if thread_id:
# print(
# f"priority of thread {thread_id}={thread.priority}")
# logger.debug(
# f"thread {thread_id}.can_suggest={thread.can_suggest}")
if thread_id and thread.can_suggest:
priority = thread.priority
if priority > priority1:
@ -647,21 +651,29 @@ class BlendSearch(Searcher):
backup_thread_id = thread_id
return top_thread_id, backup_thread_id
def _valid(self, config: Dict, space: Dict, lower: Dict, upper: Dict) -> bool:
def _valid(self, config: Dict, space: Dict, subspace: Dict,
lower: Dict, upper: Dict) -> bool:
''' config validator
'''
normalized_config = normalize(config, space, config, {})
normalized_config = normalize(config, subspace, config, {})
for key, lb in lower.items():
if key in config:
value = normalized_config[key]
if isinstance(lb, list):
subspace = space[key]['_choice_']
domain = space[key]
index = indexof(domain, value)
nestedspace = subspace[key]
lb = lb[index]
ub = upper[key][index]
elif isinstance(lb, dict):
subspace = space[key]
nestedspace = subspace[key]
domain = space[key]
ub = upper[key]
else:
subspace = None
if subspace:
valid = self._valid(value, subspace, lb, upper[key])
nestedspace = None
if nestedspace:
valid = self._valid(
value, domain, nestedspace, lb, ub)
if not valid:
return False
elif (value + self._ls.STEPSIZE < lower[key]
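
The blendsearch changes above (define-by-run global search space, recursive admissible-region updates, and the hierarchical config validator) are what enable tuning over hierarchical search spaces, where a categorical choice carries its own nested sub-space. A minimal sketch of such a space with flaml.tune; the parameter names and the trainable are hypothetical placeholders, not code from this diff:

```python
from flaml import tune

def train(config):
    # config["model"] is the dict sampled from the nested choice below
    model = config["model"]
    # ... train something with model-specific hyperparameters ...
    tune.report(val_loss=0.1)  # placeholder objective

space = {
    # hierarchical space: each category carries its own sub-space
    "model": tune.choice([
        {"name": "lgbm", "n_estimators": tune.randint(4, 1000)},
        {"name": "rf", "max_leaves": tune.randint(4, 100)},
    ]),
    "learning_rate": tune.loguniform(1e-4, 1.0),
}

analysis = tune.run(
    train, config=space, metric="val_loss", mode="min",
    time_budget_s=10, num_samples=-1)  # BlendSearch is used when no search_alg is passed
```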

View File

@ -543,8 +543,9 @@ class FLOW2(Searcher):
return False
for key in self._unordered_cat_hp:
# unordered cat choice is hard to reach by chance
if config1[key] != config2[key]:
if config1[key] != config2.get(key):
return False
delta = np.array(
[incumbent1[key] - incumbent2[key] for key in self._tunable_keys])
[incumbent1[key] - incumbent2.get(key, np.inf)
for key in self._tunable_keys])
return np.linalg.norm(delta) <= self.step

View File

@ -12,7 +12,7 @@ try:
except (ImportError, AssertionError):
from .suggestion import Searcher
from .flow2 import FLOW2
from ..tune.space import (add_cost_to_space, unflatten_hierarchical)
from ..tune.space import unflatten_hierarchical
import logging
logger = logging.getLogger(__name__)
@ -46,10 +46,6 @@ class SearchThread:
self.cost_attr = cost_attr
if search_alg:
self.space = self._space = search_alg.space # unflattened space
# TODO: remove when define_by_run is supported
if not isinstance(self._search_alg, FLOW2):
# remember const config
self._const = add_cost_to_space(self.space, {}, {})
@classmethod
def set_eps(cls, time_budget_s):
@ -63,8 +59,6 @@ class SearchThread:
else:
try:
config = self._search_alg.suggest(trial_id)
# TODO: remove when define_by_run is supported
config.update(self._const)
config, self.space = unflatten_hierarchical(config, self._space)
except FloatingPointError:
logger.warning(

View File

@ -17,9 +17,12 @@ This source file is adapted here because ray does not fully support Windows.
Copyright (c) Microsoft Corporation.
'''
import time
import functools
import warnings
import copy
import logging
from typing import Any, Dict, Optional, Union, List, Tuple
from typing import Any, Dict, Optional, Union, List, Tuple, Callable
import pickle
from .variant_generator import parse_spec_vars
from ..tune.sample import Categorical, Domain, Float, Integer, LogUniform, \
@ -332,13 +335,16 @@ class ConcurrencyLimiter(Searcher):
try:
import optuna as ot
from optuna.trial import TrialState as OptunaTrialState
from optuna.distributions import BaseDistribution as OptunaDistribution
from optuna.samplers import BaseSampler
from optuna.trial import TrialState as OptunaTrialState
from optuna.trial import Trial as OptunaTrial
except ImportError:
ot = None
OptunaTrialState = None
OptunaDistribution = None
BaseSampler = None
OptunaTrialState = None
OptunaTrial = None
# (Optional) Default (anonymous) metric when using tune.report(x)
DEFAULT_METRIC = "_metric"
@ -346,6 +352,78 @@ DEFAULT_METRIC = "_metric"
# (Auto-filled) The index of this training iteration.
TRAINING_ITERATION = "training_iteration"
# print a warning if define by run function takes longer than this to execute
DEFINE_BY_RUN_WARN_THRESHOLD_S = 1 # 1 is arbitrary
def validate_warmstart(parameter_names: List[str],
points_to_evaluate: List[Union[List, Dict]],
evaluated_rewards: List,
validate_point_name_lengths: bool = True):
"""Generic validation of a Searcher's warm start functionality.
Raises exceptions in case of type and length mismatches between
parameters.
If ``validate_point_name_lengths`` is False, the equality of lengths
between ``points_to_evaluate`` and ``parameter_names`` will not be
validated.
"""
if points_to_evaluate:
if not isinstance(points_to_evaluate, list):
raise TypeError(
"points_to_evaluate expected to be a list, got {}.".format(
type(points_to_evaluate)))
for point in points_to_evaluate:
if not isinstance(point, (dict, list)):
raise TypeError(
f"points_to_evaluate expected to include list or dict, "
f"got {point}.")
if validate_point_name_lengths and (
not len(point) == len(parameter_names)):
raise ValueError("Dim of point {}".format(point)
+ " and parameter_names {}".format(
parameter_names) + " do not match.")
if points_to_evaluate and evaluated_rewards:
if not isinstance(evaluated_rewards, list):
raise TypeError(
"evaluated_rewards expected to be a list, got {}.".format(
type(evaluated_rewards)))
if not len(evaluated_rewards) == len(points_to_evaluate):
raise ValueError(
"Dim of evaluated_rewards {}".format(evaluated_rewards)
+ " and points_to_evaluate {}".format(points_to_evaluate)
+ " do not match.")
class _OptunaTrialSuggestCaptor:
"""Utility to capture returned values from Optuna's suggest_ methods.
This will wrap around the ``optuna.Trial`` object and decorate all
``suggest_`` callables with a function capturing the returned value,
which will be saved in the ``captured_values`` dict.
"""
def __init__(self, ot_trial: OptunaTrial) -> None:
self.ot_trial = ot_trial
self.captured_values: Dict[str, Any] = {}
def _get_wrapper(self, func: Callable) -> Callable:
@functools.wraps(func)
def wrapper(*args, **kwargs):
# name is always the first arg for suggest_ methods
name = kwargs.get("name", args[0])
ret = func(*args, **kwargs)
self.captured_values[name] = ret
return ret
return wrapper
def __getattr__(self, item_name: str) -> Any:
item = getattr(self.ot_trial, item_name)
if item_name.startswith("suggest_") and callable(item):
return self._get_wrapper(item)
return item
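
A short sketch (assuming optuna >= 2.6 for the ask interface) of what the captor does: it proxies a trial and records every value returned by a suggest_ call under its parameter name, which is how sampled values from a define-by-run function are merged into the suggested config below:

```python
import optuna

study = optuna.create_study()
ot_trial = study.ask()
captor = _OptunaTrialSuggestCaptor(ot_trial)
lr = captor.suggest_float("lr", 1e-4, 1e-1, log=True)
depth = captor.suggest_int("depth", 2, 10)
print(captor.captured_values)  # e.g. {'lr': 0.0031, 'depth': 7}
```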
class OptunaSearch(Searcher):
"""A wrapper around Optuna to provide trial suggestions.
@ -355,16 +433,20 @@ class OptunaSearch(Searcher):
This Searcher is a thin wrapper around Optuna's search algorithms.
You can pass any Optuna sampler, which will be used to generate
hyperparameter suggestions.
Please note that this wrapper does not support define-by-run, so the
search space will be configured before running the optimization. You will
also need to use a Tune trainable (e.g. using the function API) with
this wrapper.
For defining the search space, use ``ray.tune.suggest.optuna.param``
(see example).
Args:
space (list): Hyperparameter search space definition for Optuna's
sampler. This is a list, and samples for the parameters will
be obtained in order.
space (dict|Callable): Hyperparameter search space definition for
Optuna's sampler. This can be either a :class:`dict` with
parameter names as keys and ``optuna.distributions`` as values,
or a Callable - in which case, it should be a define-by-run
function using ``optuna.trial`` to obtain the hyperparameter
values. The function should return either a :class:`dict` of
constant values with names as keys, or None.
For more information, see https://optuna.readthedocs.io\
/en/stable/tutorial/10_key_features/002_configurations.html.
.. warning::
No actual computation should take place in the define-by-run
function. Instead, put the training logic inside the function
or class trainable passed to ``tune.run``.
metric (str): The training result objective value attribute. If None
but a mode was passed, the anonymous metric `_metric` will be used
per default.
@ -411,15 +493,28 @@ class OptunaSearch(Searcher):
metric="loss",
mode="min")
tune.run(trainable, search_alg=optuna_search)
# Equivalent Optuna define-by-run function approach:
def define_search_space(trial: optuna.Trial):
trial.suggest_float("a", 6, 8)
trial.suggest_float("b", 1e-4, 1e-2, log=True)
# training logic goes into trainable, this is just
# for search space definition
optuna_search = OptunaSearch(
define_search_space,
metric="loss",
mode="min")
tune.run(trainable, search_alg=optuna_search)
.. versionadded:: 0.8.8
"""
def __init__(self,
space: Optional[Union[Dict, List[Tuple]]] = None,
space: Optional[Union[Dict[str, "OptunaDistribution"], List[
Tuple], Callable[["OptunaTrial"], Optional[Dict[
str, Any]]]]] = None,
metric: Optional[str] = None,
mode: Optional[str] = None,
points_to_evaluate: Optional[List[Dict]] = None,
sampler: Optional[BaseSampler] = None,
sampler: Optional["BaseSampler"] = None,
seed: Optional[int] = None,
evaluated_rewards: Optional[List] = None):
assert ot is not None, (
@ -490,6 +585,11 @@ class OptunaSearch(Searcher):
load_if_exists=True)
if self._points_to_evaluate:
validate_warmstart(
self._space,
self._points_to_evaluate,
self._evaluated_rewards,
validate_point_name_lengths=not callable(self._space))
if self._evaluated_rewards:
for point, reward in zip(self._points_to_evaluate,
self._evaluated_rewards):
@ -512,6 +612,37 @@ class OptunaSearch(Searcher):
self._setup_study(mode)
return True
def _suggest_from_define_by_run_func(
self, func: Callable[["OptunaTrial"], Optional[Dict[str, Any]]],
ot_trial: "OptunaTrial") -> Dict:
captor = _OptunaTrialSuggestCaptor(ot_trial)
time_start = time.time()
ret = func(captor)
time_taken = time.time() - time_start
if time_taken > DEFINE_BY_RUN_WARN_THRESHOLD_S:
warnings.warn(
"Define-by-run function passed in the `space` argument "
f"took {time_taken} seconds to "
"run. Ensure that actual computation, training takes "
"place inside Tune's train functions or Trainables "
"passed to `tune.run`.")
if ret is not None:
if not isinstance(ret, dict):
raise TypeError(
"The return value of the define-by-run function "
"passed in the `space` argument should be "
"either None or a `dict` with `str` keys. "
f"Got {type(ret)}.")
if not all(isinstance(k, str) for k in ret.keys()):
raise TypeError(
"At least one of the keys in the dict returned by the "
"define-by-run function passed in the `space` argument "
"was not a `str`.")
return {
**captor.captured_values,
**ret
} if ret else captor.captured_values
def suggest(self, trial_id: str) -> Optional[Dict]:
if not self._space:
raise RuntimeError(
@ -538,6 +669,14 @@ class OptunaSearch(Searcher):
ot_trial, fn)(*args, **kwargs)
for (fn, args, kwargs) in self._space
}
elif callable(self._space):
if trial_id not in self._ot_trials:
self._ot_trials[trial_id] = self._ot_study.ask()
ot_trial = self._ot_trials[trial_id]
params = self._suggest_from_define_by_run_func(
self._space, ot_trial)
else:
# Use Optuna ask interface (since version 2.6.0)
if trial_id not in self._ot_trials:

View File

@ -26,6 +26,9 @@ def define_by_run_func(
for key, domain in space.items():
if path:
key = path + '/' + key
if isinstance(domain, dict):
config.update(define_by_run_func(trial, domain, key))
continue
if not isinstance(domain, sample.Domain):
config[key] = domain
continue
@ -57,7 +60,7 @@ def define_by_run_func(
trial.suggest_int(
key, domain.lower,
domain.upper - int(bool(not quantize)),
step=quantize or 1, log=True)
log=True)
elif isinstance(sampler, sample.Uniform):
# Upper bound should be inclusive for quantization and
# exclusive otherwise
@ -76,7 +79,7 @@ def define_by_run_func(
if isinstance(choice, dict):
key += f":{index}"
# the suffix needs to be removed from the final config
config[key] = define_by_run_func(trial, choice, key)
config.update(define_by_run_func(trial, choice, key))
else:
raise ValueError(
"Optuna search does not support parameters of type "
@ -87,6 +90,32 @@ def define_by_run_func(
return config
def convert_key(
conf: Dict, space: Dict, path: str = ""
) -> Optional[Dict[str, Any]]:
"""Convert config keys to define-by-run keys.
Returns:
A dict with converted keys.
"""
config = {}
for key, domain in space.items():
value = conf[key]
if path:
key = path + '/' + key
if isinstance(domain, dict):
config.update(convert_key(conf[key], domain, key))
elif isinstance(domain, sample.Categorical):
index = indexof(domain, value)
config[key + '_choice_'] = index
if isinstance(value, dict):
key += f":{index}"
config.update(convert_key(value, domain.categories[index], key))
else:
config[key] = value
return config
def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
'''unflatten hierarchical config'''
hier = {}
@ -101,12 +130,18 @@ def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
hier[true_key], subspace[true_key] = unflatten_hierarchical(
value, space[true_key][choice])
else:
if key.endswith("_choice_"):
key = key[:-8]
domain = space.get(key)
if domain is not None:
subspace[key] = domain
if isinstance(domain, sample.Domain):
sampler = domain.sampler
if isinstance(sampler, sample.Quantized):
if isinstance(domain, sample.Categorical):
value = domain.categories[value]
if isinstance(value, dict):
continue
elif isinstance(sampler, sample.Quantized):
q = sampler.q
sampler = sampler.sampler
if isinstance(sampler, sample.LogUniform):
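
The helpers above share one flattening convention for hierarchical configs: the index chosen from a categorical is recorded under '<key>_choice_', and parameters of the chosen sub-space are recorded under '<key>:<index>/<subkey>'. An illustrative pair of configs (hypothetical keys and values) of the kind that define_by_run_func/convert_key produce and unflatten_hierarchical maps back:

```python
# flattened, define-by-run style keys
flat_config = {
    "model_choice_": 0,           # index of the category chosen for "model"
    "model:0/n_estimators": 120,  # parameter inside the chosen sub-space
    "learning_rate": 0.1,
}
# the corresponding hierarchical config seen by the trainable / local search
hier_config = {
    "model": {"n_estimators": 120},
    "learning_rate": 0.1,
}
```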

File diff suppressed because one or more lines are too long

View File

@ -124,7 +124,8 @@
"source": [
"settings = {\n",
" \"time_budget\": 60, # total running time in seconds\n",
" \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']\n",
" \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\n",
" # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\n",
" \"estimator_list\": ['lgbm', 'rf', 'xgboost'], # list of ML learners\n",
" \"task\": 'classification', # task type \n",
" \"sample\": False, # whether to subsample training data\n",
@ -265,7 +266,7 @@
"execution_count": null,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename = settings['log_file_name'], time_budget = 60)\n",
"\n",
"for config in config_history:\n",

View File

@ -104,10 +104,7 @@
" \"metric\": 'mape', # primary metric for validation: 'mape' is generally used for forecast tasks\n",
" \"task\": 'forecast', # task type\n",
" \"log_file_name\": 'CO2_forecast.log', # flaml log file\n",
" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
" # \"estimator_list\": [\"sarimax\"],\n",
" # \"verbose\": 3,\n",
" \"split_type\": 'time' # for foretask task, 'split_type' has to be 'time'\n",
" \"eval_method\": \"holdout\", # validation method can be chosen from ['auto', 'holdout', 'cv']\n",
"}"
],
"outputs": [],
@ -1355,7 +1352,7 @@
"execution_count": 11,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=300)\n",
"\n",
"for config in config_history:\n",

View File

@ -445,7 +445,7 @@
"execution_count": 11,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n",
"\n",
"for config in config_history:\n",

View File

@ -362,7 +362,7 @@
"execution_count": 10,
"source": [
"from flaml.data import get_output_from_log\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \\\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n",
"\n",
"for config in config_history:\n",

View File

@ -62,7 +62,7 @@ setuptools.setup(
"optuna==2.8.0"
],
"ray": [
"ray[tune]==1.5.1",
"ray[tune]==1.6.0",
"pyyaml<5.3.1",
],
"azureml": [
@ -75,7 +75,7 @@ setuptools.setup(
"vowpalwabbit",
],
"nlp": [
"ray[tune]>=1.5.1",
"ray[tune]>=1.6.0",
"transformers",
"datasets==1.4.1",
"tensorboardX<=2.2",

View File

@ -111,7 +111,8 @@ class MyLargeLGBM(LGBMEstimator):
def custom_metric(X_test, y_test, estimator, labels, X_train, y_train,
weight_test=None, weight_train=None):
weight_test=None, weight_train=None, config=None,
groups_test=None, groups_train=None):
from sklearn.metrics import log_loss
import time
start = time.time()
@ -162,7 +163,10 @@ class TestAutoML(unittest.TestCase):
"sample": True, # whether to subsample training data
"log_file_name": "test/wine.log",
"log_training_metric": True, # whether to log training metric
"ensemble": True,
"ensemble": {
"final_estimator": MyRegularizedGreedyForest(),
"passthrough": False,
},
"n_jobs": 1,
}
@ -274,9 +278,9 @@ class TestAutoML(unittest.TestCase):
task='multi')
print(estimator)
time_history, best_valid_loss_history, valid_loss_history, \
config_history, train_loss_history = get_output_from_log(
config_history, metric_history = get_output_from_log(
filename=automl_settings['log_file_name'], time_budget=6)
print(train_loss_history)
print(metric_history)
def test_classification(self, as_frame=False):
automl_experiment = AutoML()
@ -496,6 +500,30 @@ class TestAutoML(unittest.TestCase):
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
def test_parallel(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {
"time_budget": 10,
"task": 'regression',
"log_file_name": "test/boston.log",
"log_type": "all",
"n_jobs": 1,
"n_concurrent_trials": 2,
"hpo_method": hpo_method,
}
X_train, y_train = load_boston(return_X_y=True)
try:
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
print(automl_experiment.predict(X_train))
print(automl_experiment.model)
print(automl_experiment.config_history)
print(automl_experiment.model_history)
print(automl_experiment.best_iteration)
print(automl_experiment.best_estimator)
except ImportError:
return
def test_parallel_xgboost(self, hpo_method=None):
automl_experiment = AutoML()
automl_settings = {

View File

@ -1,20 +1,19 @@
def test_forecast_automl_df(budget=5):
import numpy as np
from flaml import AutoML
def test_forecast_automl(budget=5):
# using dataframe
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
data = data.rename(columns={'index': 'ds', 'co2': 'y'})
data = sm.datasets.co2.load_pandas().data['co2'].resample('MS').mean()
data = data.fillna(data.bfill()).to_frame().reset_index().rename(
columns={'index': 'ds', 'co2': 'y'})
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]
X_test = data[split_idx:]['ds'].to_frame()
y_test = data[split_idx:]['y'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
df = data[:split_idx]
X_test = data[split_idx:]['ds']
y_test = data[split_idx:]['y']
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
@ -22,13 +21,14 @@ def test_forecast_automl_df(budget=5):
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(dataframe=X_train, **settings, period=time_horizon, freq='M')
automl.fit(dataframe=df, **settings, period=time_horizon)
except ImportError:
automl.fit(dataframe=X_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
print("not using FBProphet due to ImportError")
automl.fit(dataframe=df, **settings, estimator_list=[
'arima', 'sarimax'], period=time_horizon)
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
@ -47,7 +47,7 @@ def test_forecast_automl_df(budget=5):
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
@ -55,65 +55,46 @@ def test_forecast_automl_df(budget=5):
print(automl.max_resource)
print(automl.min_resource)
def test_forecast_automl_Xy(budget=5):
# using X_train and y_train
import statsmodels.api as sm
data = sm.datasets.co2.load_pandas()
data = data.data
data = data['co2'].resample('MS').mean()
data = data.fillna(data.bfill())
data = data.to_frame().reset_index()
num_samples = data.shape[0]
time_horizon = 12
split_idx = num_samples - time_horizon
X_train = data[:split_idx]['index'].to_frame()
y_train = data[:split_idx]['co2']
X_test = data[split_idx:]['index'].to_frame()
y_test = data[split_idx:]['co2'].to_frame()
''' import AutoML class from flaml package '''
from flaml import AutoML
X_train = df['ds']
y_train = df['y']
automl = AutoML()
settings = {
"time_budget": budget, # total running time in seconds
"metric": 'mape', # primary metric
"task": 'forecast', # task type
"log_file_name": 'CO2_forecast.log', # flaml log file
"eval_method": "holdout",
"split_type": 'time'
}
'''The main flaml automl API'''
try:
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon, freq='M')
automl.fit(X_train=X_train, y_train=y_train, **settings, period=time_horizon)
except ImportError:
automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=['arima', 'sarimax'], period=time_horizon, freq='M')
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
print(f'Best mape on validation data: {automl.best_loss}')
print(f'Training duration of best run: {automl.best_config_train_time}s')
print(automl.model.estimator)
''' pickle and save the automl object '''
import pickle
with open('automl.pkl', 'wb') as f:
pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
''' compute predictions of testing dataset '''
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('mape', '=', sklearn_metric_loss_score('mape', y_pred, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=budget)
for config in config_history:
print(config)
print(automl.prune_attr)
print(automl.max_resource)
print(automl.min_resource)
print("not using FBProphet due to ImportError")
automl.fit(X_train=X_train, y_train=y_train, **settings, estimator_list=[
'arima', 'sarimax'], period=time_horizon)
def test_numpy():
X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
y_train = np.random.random(size=72)
automl = AutoML()
try:
automl.fit(
X_train=X_train[:60], # a single column of timestamp
y_train=y_train, # value for each timestamp
period=12, # time horizon to forecast, e.g., 12 months
task='forecast', time_budget=3, # time budget in seconds
log_file_name="test/forecast.log")
print(automl.predict(X_train[60:]))
print(automl.predict(12))
except ValueError:
print("ValueError for FBProphet is raised as expected.")
except ImportError:
print("not using FBProphet due to ImportError")
automl = AutoML()
automl.fit(
X_train=X_train[:72], # a single column of timestamp
y_train=y_train, # value for each timestamp
period=12, # time horizon to forecast, e.g., 12 months
task='forecast', time_budget=1, # time budget in seconds
estimator_list=['arima', 'sarimax'],
log_file_name="test/forecast.log")
print(automl.predict(X_train[72:]))
# an alternative way to specify predict steps for arima/sarimax
print(automl.predict(12))
if __name__ == "__main__":
test_forecast_automl_df(60)
test_forecast_automl_Xy(60)
test_forecast_automl(60)

View File

@ -42,7 +42,7 @@ def test_automl(budget=5, dataset_format='dataframe'):
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
get_output_from_log(filename=settings['log_file_name'], time_budget=60)
for config in config_history:
print(config)

View File

@ -62,11 +62,11 @@ class TestLogging(unittest.TestCase):
config = automl.best_config.copy()
config['learner'] = automl.best_estimator
automl.trainable({"ml": config})
from flaml import tune, CFO
from flaml import tune, BlendSearch
from flaml.automl import size
from functools import partial
search_alg = CFO(
metric='val_loss',
search_alg = BlendSearch(
metric='val_loss', mode='min',
space=automl.search_space,
low_cost_partial_config=automl.low_cost_partial_config,
points_to_evaluate=automl.points_to_evaluate,

View File

@ -74,5 +74,41 @@ def test_groups():
automl.fit(X, y, **automl_settings)
def test_rank():
from sklearn.externals._arff import ArffException
try:
X, y = fetch_openml(name=dataset, return_X_y=True)
except (ArffException, ValueError):
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
y = y.cat.codes
import numpy as np
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "rank",
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"eval_method": "cv",
"groups": np.array( # group labels
[0] * 200 + [1] * 200 + [2] * 200 + [3] * 200 + [4] * 100 + [5] * 100),
"learner_selector": "roundrobin",
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": "rank",
"metric": "ndcg@5", # 5 can be replaced by any number
"log_file_name": "test/{}.log".format(dataset),
"model_history": True,
"groups": [200] * 4 + [100] * 2, # alternative way: group counts
# "estimator_list": ['lgbm', 'xgboost'], # list of ML learners
"learner_selector": "roundrobin",
}
automl.fit(X, y, **automl_settings)
if __name__ == "__main__":
unittest.main()