'''!
 * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the
 * project root for license information.
'''
import time
import warnings
from functools import partial
import numpy as np
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
    RepeatedKFold
from sklearn.utils import shuffle
import pandas as pd
import os
import contextlib

from .ml import compute_estimator, train_estimator, get_estimator_class, \
    get_classification_objective
from .config import (
    MIN_SAMPLE_TRAIN, MEM_THRES, RANDOM_SEED,
    SMALL_LARGE_THRES, CV_HOLDOUT_THRESHOLD, SPLIT_RATIO, N_SPLITS,
    SAMPLE_MULTIPLY_FACTOR)
from .data import concat
from . import tune
from .training_log import training_log_reader, training_log_writer

import logging
logger = logging.getLogger(__name__)
logger_formatter = logging.Formatter(
    '[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
    '%m-%d %H:%M:%S')

try:
    import mlflow
except ImportError:
    mlflow = None


class SearchState:

    @property
    def search_space(self):
        return self._search_space_domain

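    # An interpretive note: the estimated cost for improvement (ECI) is the
    # larger of (a) the time spent between the last two best-loss updates and
    # (b) the time elapsed since the current best was found; _search() uses
    # it to decide which learner to try next.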
    @property
    def estimated_cost4improvement(self):
        return max(self.time_best_found - self.time_best_found_old,
                   self.total_time_used - self.time_best_found)

    def __init__(self, learner_class, data_size, task):
        self.init_eci = learner_class.cost_relative2lgbm()
        self._search_space_domain = {}
        self.init_config = {}
        self.low_cost_partial_config = {}
        self.cat_hp_cost = {}
        self.data_size = data_size
        search_space = learner_class.search_space(
            data_size=data_size, task=task)
        for name, space in search_space.items():
            assert 'domain' in space
            self._search_space_domain[name] = space['domain']
            if 'init_value' in space:
                self.init_config[name] = space['init_value']
            if 'low_cost_init_value' in space:
                self.low_cost_partial_config[name] = space[
                    'low_cost_init_value']
            if 'cat_hp_cost' in space:
                self.cat_hp_cost[name] = space['cat_hp_cost']
        self._hp_names = list(self._search_space_domain.keys())
        self.search_alg = None
        self.best_loss = self.best_loss_old = np.inf
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
        self.time_best_found = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

    def update(self, analysis, time_used, save_model_history=False):
        if not analysis.trials:
            return
        result = analysis.trials[-1].last_result
        if result:
            config = result['config']
            if config and 'FLAML_sample_size' in config:
                self.sample_size = config['FLAML_sample_size']
            else:
                self.sample_size = self.data_size
            obj = result['val_loss']
            train_loss = result['train_loss']
            time2eval = result['time2eval']
            trained_estimator = result['trained_estimator']
        else:
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            train_loss = config = None
        self.trial_time = time2eval
        self.total_time_used += time_used
        self.total_iter += 1

        if self.base_eci is None:
            self.base_eci = time_used
        if (obj is not None) and (self.best_loss is None or obj < self.best_loss):
            self.best_loss_old = self.best_loss if self.best_loss < np.inf \
                else 2 * obj
            self.best_loss = obj
            self.time_best_found_old = self.time_best_found
            self.time_best_found = self.total_time_used
            self.iter_best_found = self.total_iter
            self.best_config = config
            self.best_config_sample_size = self.sample_size
            self.best_config_train_time = time_used
            if time2eval:
                self.time2eval_best_old = self.time2eval_best
                self.time2eval_best = time2eval
            if self.trained_estimator and trained_estimator and \
                    self.trained_estimator != trained_estimator and \
                    not save_model_history:
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
        self.train_loss, self.val_loss, self.config = train_loss, obj, config

    def get_hist_config_sig(self, sample_size, config):
        config_values = tuple([config[k] for k in self._hp_names])
        config_sig = str(sample_size) + '_' + str(config_values)
        return config_sig

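    # An interpretive note: assume retraining time scales linearly with the
    # number of training instances, extrapolating from the best config's
    # evaluation time.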
    def est_retrain_time(self, retrain_sample_size):
        assert self.best_config_sample_size is not None, \
            'need to first get best_config_sample_size'
        return (self.time2eval_best * retrain_sample_size
                / self.best_config_sample_size)


class AutoMLState:

    def _prepare_sample_train_data(self, sample_size):
        full_size = len(self.y_train)
        sampled_weight = None
        if sample_size <= full_size:
            if isinstance(self.X_train, pd.DataFrame):
                sampled_X_train = self.X_train.iloc[:sample_size]
            else:
                sampled_X_train = self.X_train[:sample_size]
            sampled_y_train = self.y_train[:sample_size]
            weight = self.fit_kwargs.get('sample_weight')
            if weight is not None:
                sampled_weight = weight[:sample_size]
        else:
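            # sample_size exceeds the training set size: train on the full
            # data, i.e., training plus holdout validation data (used when
            # retraining the best config on all available data)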
            sampled_X_train = concat(self.X_train, self.X_val)
            sampled_y_train = np.concatenate([self.y_train, self.y_val])
            weight = self.fit_kwargs.get('sample_weight')
            if weight is not None:
                sampled_weight = np.concatenate([weight, self.weight_val])
        return sampled_X_train, sampled_y_train, sampled_weight

    def _compute_with_config_base(self,
                                  estimator,
                                  config_w_resource):
        compute_start_time = time.time()
        if 'FLAML_sample_size' in config_w_resource:
            sample_size = int(config_w_resource['FLAML_sample_size'])
        else:
            sample_size = self.data_size
        sampled_X_train, sampled_y_train, sampled_weight = \
            self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = self.fit_kwargs['sample_weight']
            self.fit_kwargs['sample_weight'] = sampled_weight
        else:
            weight = None
        config = config_w_resource.copy()
        if 'FLAML_sample_size' in config:
            del config['FLAML_sample_size']
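        # An interpretive note: when training on a subsample, cap the trial's
        # budget at half the remaining time, scaled by the sampling fraction,
        # so that small-sample trials cannot exhaust the overall time budget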
        time_left = self.time_budget - self.time_from_start
        budget = time_left if sample_size == self.data_size else \
            time_left / 2 * sample_size / self.data_size

        trained_estimator, val_loss, train_loss, time2eval, _ = \
            compute_estimator(
                sampled_X_train,
                sampled_y_train,
                self.X_val,
                self.y_val,
                self.weight_val,
                budget,
                self.kf,
                config,
                self.task,
                estimator,
                self.eval_method,
                self.metric,
                self.best_loss,
                self.n_jobs,
                self.learner_classes.get(estimator),
                self.log_training_metric,
                self.fit_kwargs)
        result = {
            'total_time': time.time() - compute_start_time,
            'time2eval': time2eval,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'trained_estimator': trained_estimator
        }
        with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
            tune.report(**result)
        if sampled_weight is not None:
            self.fit_kwargs['sample_weight'] = weight

    def _train_with_config(
        self, estimator, config_w_resource, sample_size=None
    ):
        config = config_w_resource.copy()
        if 'FLAML_sample_size' in config:
            if not sample_size:
                sample_size = config['FLAML_sample_size']
            del config['FLAML_sample_size']
        assert sample_size is not None
        sampled_X_train, sampled_y_train, sampled_weight = \
            self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = self.fit_kwargs['sample_weight']
            self.fit_kwargs['sample_weight'] = sampled_weight
        else:
            weight = None
        budget = None if self.time_budget is None else (
            self.time_budget - self.time_from_start)
        estimator, train_time = train_estimator(
            sampled_X_train,
            sampled_y_train,
            config,
            self.task,
            estimator,
            self.n_jobs,
            self.learner_classes.get(estimator),
            budget,
            self.fit_kwargs)
        if sampled_weight is not None:
            self.fit_kwargs['sample_weight'] = weight
        return estimator, train_time


class AutoML:
    '''The AutoML class

    Example:

        .. code-block:: python

            automl = AutoML()
            automl_settings = {
                "time_budget": 60,
                "metric": 'accuracy',
                "task": 'classification',
                "log_file_name": 'test/mylog.log',
            }
            automl.fit(X_train=X_train, y_train=y_train,
                       **automl_settings)

    '''

    from .version import __version__

    def __init__(self):
        self._track_iter = 0
        self._state = AutoMLState()
        self._state.learner_classes = {}

    @property
    def model_history(self):
        '''A dictionary of iter->model, storing the models when
        the best model is updated each time.
        '''
        return self._model_history

    @property
    def config_history(self):
        '''A dictionary of iter->(estimator, config, time),
        storing the best estimator, config, and the time when the best
        model is updated each time.
        '''
        return self._config_history

    @property
    def model(self):
        '''An object with `predict()` and `predict_proba()` method (for
        classification), storing the best trained model.
        '''
        if self._trained_estimator:
            return self._trained_estimator.model
        else:
            return None

    def best_model_for_estimator(self, estimator_name):
        '''Return the best model found for a particular estimator

        Args:
            estimator_name: a str of the estimator's name

        Returns:
            An object with `predict()` and `predict_proba()` method (for
            classification), storing the best trained model for estimator_name.
        '''
        if estimator_name in self._search_states:
            state = self._search_states[estimator_name]
            if hasattr(state, 'trained_estimator'):
                return state.trained_estimator.model
        return None

    @property
    def best_estimator(self):
        '''A string indicating the best estimator found.'''
        return self._best_estimator

    @property
    def best_iteration(self):
        '''An integer of the iteration number where the best
        config is found.'''
        return self._best_iteration

    @property
    def best_config(self):
        '''A dictionary of the best configuration.'''
        return self._search_states[self._best_estimator].best_config

    @property
    def best_loss(self):
        '''A float of the best loss found.'''
        return self._state.best_loss

    @property
    def best_config_train_time(self):
        '''A float of the seconds taken by training the
        best config.'''
        return self._search_states[self._best_estimator].best_config_train_time

    @property
    def classes_(self):
        '''A list of n_classes elements for class labels.'''
        if self._label_transformer:
            return self._label_transformer.classes_.tolist()
        if self._trained_estimator:
            return self._trained_estimator.model.classes_.tolist()
        return None

    def predict(self, X_test):
        '''Predict label from features.

        Args:
            X_test: A numpy array of featurized instances, shape n * m.

        Returns:
            A numpy array of shape n * 1 -- each element is a predicted class
            label for an instance.
        '''
        if self._trained_estimator is None:
            warnings.warn(
                "No estimator is trained. Please run fit with enough budget.")
            return None
        X_test = self._preprocess(X_test)
        y_pred = self._trained_estimator.predict(X_test)
        if y_pred.ndim > 1:
            y_pred = y_pred.flatten()
        if self._label_transformer:
            return self._label_transformer.inverse_transform(pd.Series(
                y_pred))
        else:
            return y_pred

    def predict_proba(self, X_test):
        '''Predict the probability of each class from features, only works for
        classification problems.

        Args:
            X_test: A numpy array of featurized instances, shape n * m.

        Returns:
            A numpy array of shape n * c. c is the # classes. Each element at
            (i, j) is the probability for instance i to be in class j.
        '''
        X_test = self._preprocess(X_test)
        proba = self._trained_estimator.predict_proba(X_test)
        return proba

    def _preprocess(self, X):
        if issparse(X):
            X = X.tocsr()
        if self._transformer:
            X = self._transformer.transform(X)
        return X

    def _validate_data(self, X_train_all, y_train_all, dataframe, label,
                       X_val=None, y_val=None):
        if X_train_all is not None and y_train_all is not None:
            if not (isinstance(X_train_all, np.ndarray) or issparse(X_train_all)
                    or isinstance(X_train_all, pd.DataFrame)):
                raise ValueError(
                    "X_train_all must be a numpy array, a pandas dataframe, "
                    "or Scipy sparse matrix.")
            if not (isinstance(y_train_all, np.ndarray)
                    or isinstance(y_train_all, pd.Series)):
                raise ValueError(
                    "y_train_all must be a numpy array or a pandas series.")
            if X_train_all.size == 0 or y_train_all.size == 0:
                raise ValueError("Input data must not be empty.")
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            if X_train_all.shape[0] != y_train_all.shape[0]:
                raise ValueError(
                    "# rows in X_train must match length of y_train.")
            self._df = isinstance(X_train_all, pd.DataFrame)
            self._nrow, self._ndim = X_train_all.shape
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            if not isinstance(dataframe, pd.DataFrame):
                raise ValueError("dataframe must be a pandas DataFrame")
            if label not in dataframe.columns:
                raise ValueError("label must be a column name in dataframe")
            self._df = True
            X = dataframe.drop(columns=label)
            self._nrow, self._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError(
                "either X_train+y_train or dataframe+label are required")
        if issparse(X_train_all):
            self._transformer = self._label_transformer = False
            self._X_train_all, self._y_train_all = X, y
        else:
            from .data import DataTransformer
            self._transformer = DataTransformer()
            self._X_train_all, self._y_train_all = \
                self._transformer.fit_transform(X, y, self._state.task)
            self._label_transformer = self._transformer.label_transformer
        self._sample_weight_full = self._state.fit_kwargs.get('sample_weight')
        if X_val is not None and y_val is not None:
            if not (isinstance(X_val, np.ndarray) or issparse(X_val)
                    or isinstance(X_val, pd.DataFrame)):
                raise ValueError(
                    "X_val must be None, a numpy array, a pandas dataframe, "
                    "or Scipy sparse matrix.")
            if not (isinstance(y_val, np.ndarray)
                    or isinstance(y_val, pd.Series)):
                raise ValueError(
                    "y_val must be None, a numpy array or a pandas series.")
            if X_val.size == 0 or y_val.size == 0:
                raise ValueError(
                    "Validation data are expected to be nonempty. "
                    "Use None for X_val and y_val if no validation data.")
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            if X_val.shape[0] != y_val.shape[0]:
                raise ValueError("# rows in X_val must match length of y_val.")
            if self._transformer:
                self._state.X_val = self._transformer.transform(X_val)
            else:
                self._state.X_val = X_val
            if self._label_transformer:
                self._state.y_val = self._label_transformer.transform(y_val)
            else:
                self._state.y_val = y_val
        else:
            self._state.X_val = self._state.y_val = None

    def _prepare_data(self,
                      eval_method,
                      split_ratio,
                      n_splits):
        X_val, y_val = self._state.X_val, self._state.y_val
        if issparse(X_val):
            X_val = X_val.tocsr()
        X_train_all, y_train_all = \
            self._X_train_all, self._y_train_all
        if issparse(X_train_all):
            X_train_all = X_train_all.tocsr()
        if self._state.task != 'regression' and self._state.fit_kwargs.get(
                'sample_weight') is None:
            # logger.info(f"label {pd.unique(y_train_all)}")
            label_set, counts = np.unique(y_train_all, return_counts=True)
            # augment rare classes by duplicating their instances until each
            # class has at least rare_threshold examples
            rare_threshold = 20
            rare = counts < rare_threshold
            rare_label, rare_counts = label_set[rare], counts[rare]
            for i, label in enumerate(rare_label):
                count = rare_count = rare_counts[i]
                rare_index = y_train_all == label
                n = len(y_train_all)
                while count < rare_threshold:
                    if self._df:
                        X_train_all = concat(X_train_all,
                                             X_train_all.iloc[:n].loc[rare_index])
                    else:
                        X_train_all = concat(X_train_all,
                                             X_train_all[:n][rare_index, :])
                    if isinstance(y_train_all, pd.Series):
                        y_train_all = concat(y_train_all,
                                             y_train_all.iloc[:n].loc[rare_index])
                    else:
                        y_train_all = np.concatenate([y_train_all,
                                                      y_train_all[:n][rare_index]])
                    count += rare_count
                logger.debug(
                    f"class {label} augmented from {rare_count} to {count}")
        if 'sample_weight' in self._state.fit_kwargs:
            X_train_all, y_train_all, self._state.fit_kwargs[
                'sample_weight'] = shuffle(
                X_train_all, y_train_all,
                self._state.fit_kwargs['sample_weight'],
                random_state=RANDOM_SEED)
        else:
            X_train_all, y_train_all = shuffle(
                X_train_all, y_train_all, random_state=RANDOM_SEED)
        if self._df:
            X_train_all.reset_index(drop=True, inplace=True)
            if isinstance(y_train_all, pd.Series):
                y_train_all.reset_index(drop=True, inplace=True)

        X_train, y_train = X_train_all, y_train_all
        if X_val is None:
            if self._state.task != 'regression' and eval_method == 'holdout':
                label_set, first = np.unique(y_train_all, return_index=True)
                rest = []
                last = 0
                first.sort()
                for i in range(len(first)):
                    rest.extend(range(last, first[i]))
                    last = first[i] + 1
                rest.extend(range(last, len(y_train_all)))
                X_first = X_train_all.iloc[first] if self._df else X_train_all[
                    first]
                X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
                y_rest = y_train_all.iloc[rest] if isinstance(
                    y_train_all, pd.Series) else y_train_all[rest]
                stratify = y_rest if self._split_type == 'stratified' else \
                    None
                if 'sample_weight' in self._state.fit_kwargs:
                    X_train, X_val, y_train, y_val, weight_train, weight_val = \
                        train_test_split(
                            X_rest,
                            y_rest,
                            self._state.fit_kwargs['sample_weight'][rest],
                            test_size=split_ratio,
                            random_state=RANDOM_SEED)
                    weight1 = self._state.fit_kwargs['sample_weight'][first]
                    self._state.weight_val = concat(weight1, weight_val)
                    self._state.fit_kwargs['sample_weight'] = concat(
                        weight1, weight_train)
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_rest,
                        y_rest,
                        test_size=split_ratio,
                        stratify=stratify,
                        random_state=RANDOM_SEED)
                X_train = concat(X_first, X_train)
                y_train = concat(
                    label_set, y_train) if self._df else np.concatenate(
                    [label_set, y_train])
                X_val = concat(X_first, X_val)
                y_val = concat(label_set, y_val) if self._df else \
                    np.concatenate([label_set, y_val])
                _, y_train_counts_elements = np.unique(y_train,
                                                       return_counts=True)
                _, y_val_counts_elements = np.unique(y_val,
                                                     return_counts=True)
            elif eval_method == 'holdout' and self._state.task == 'regression':
                if 'sample_weight' in self._state.fit_kwargs:
                    X_train, X_val, y_train, y_val, self._state.fit_kwargs[
                        'sample_weight'], self._state.weight_val = \
                        train_test_split(
                            X_train_all,
                            y_train_all,
                            self._state.fit_kwargs['sample_weight'],
                            test_size=split_ratio,
                            random_state=RANDOM_SEED)
                else:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X_train_all,
                        y_train_all,
                        test_size=split_ratio,
                        random_state=RANDOM_SEED)
        self._state.data_size = X_train.shape[0]
        if X_val is None:
            self.data_size_full = self._state.data_size
        else:
            self.data_size_full = self._state.data_size + X_val.shape[0]
        self._state.X_train, self._state.y_train, self._state.X_val, \
            self._state.y_val = (X_train, y_train, X_val, y_val)
        if self._split_type == "stratified":
            logger.info("Using StratifiedKFold")
            assert y_train_all.size >= n_splits, (
                f"{n_splits}-fold cross validation"
                f" requires input data with at least {n_splits} examples.")
            assert y_train_all.size >= 2 * n_splits, (
                f"{n_splits}-fold cross validation with metric=r2 "
                f"requires input data with at least {n_splits*2} examples.")
            self._state.kf = RepeatedStratifiedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)
        else:
            logger.info("Using RepeatedKFold")
            self._state.kf = RepeatedKFold(
                n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED)

    def add_learner(self,
                    learner_name,
                    learner_class):
        '''Add a customized learner

        Args:
            learner_name: A string of the learner's name
            learner_class: A subclass of flaml.model.BaseEstimator
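
        Example (a hedged sketch; `MyLearner` and `my_param` are illustrative,
        and only the hooks this module actually calls are shown):

        .. code-block:: python

            from flaml import tune
            from flaml.model import BaseEstimator

            class MyLearner(BaseEstimator):

                @classmethod
                def search_space(cls, data_size, task):
                    # keys consumed by SearchState: 'domain', 'init_value',
                    # 'low_cost_init_value' and, optionally, 'cat_hp_cost'
                    return {'my_param': {'domain': tune.loguniform(1e-3, 1),
                                         'init_value': 0.1,
                                         'low_cost_init_value': 0.1}}

                @classmethod
                def cost_relative2lgbm(cls):
                    # training cost relative to LightGBM, used as the
                    # learner's initial ECI
                    return 1.0

            automl = AutoML()
            automl.add_learner('my_learner', MyLearner)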
        '''
        self._state.learner_classes[learner_name] = learner_class

    def get_estimator_from_log(self, log_file_name, record_id, task):
        '''Get the estimator from log file

        Args:
            log_file_name: A string of the log file name
            record_id: An integer of the record ID in the file,
                0 corresponds to the first trial
            task: A string of the task type,
                'binary', 'multi', or 'regression'

        Returns:
            An estimator object for the given configuration
        '''
        with training_log_reader(log_file_name) as reader:
            record = reader.get_record(record_id)
            estimator = record.learner
            config = record.config

        estimator, _ = train_estimator(
            None, None, config, task, estimator,
            estimator_class=self._state.learner_classes.get(estimator))
        return estimator

    def retrain_from_log(self,
                         log_file_name,
                         X_train=None,
                         y_train=None,
                         dataframe=None,
                         label=None,
                         time_budget=0,
                         task='classification',
                         eval_method='auto',
                         split_ratio=SPLIT_RATIO,
                         n_splits=N_SPLITS,
                         split_type="stratified",
                         n_jobs=1,
                         train_best=True,
                         train_full=False,
                         record_id=-1,
                         **fit_kwargs):
        '''Retrain from log file

        Args:
            time_budget: A float number of the time budget in seconds
            log_file_name: A string of the log file name
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            task: A string of the task type, e.g.,
                'classification', 'regression'
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout']
            split_ratio: A float of the validation data percentage for holdout
            n_splits: An integer of the number of folds for cross-validation
            n_jobs: An integer of the number of threads for training
            train_best: A boolean of whether to train the best config in the
                time budget; if false, train the last config in the budget
            train_full: A boolean of whether to train on the full data. If true,
                eval_method and sample_size in the log file will be ignored
            record_id: the ID of the training log record from which the model
                will be retrained. By default `record_id = -1` which means this
                will be ignored. `record_id = 0` corresponds to the first
                trial, and when `record_id >= 0`, `time_budget` will be ignored.
            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight
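
        Example (a hedged usage sketch; the log file path is illustrative):

        .. code-block:: python

            automl = AutoML()
            # replay the search logged by a previous fit() call, then
            # retrain the best config found within the first 60 seconds
            automl.retrain_from_log('test/mylog.log',
                                    X_train=X_train, y_train=y_train,
                                    task='classification', time_budget=60)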
        '''
        self._state.task = task
        self._state.fit_kwargs = fit_kwargs
        self._validate_data(X_train, y_train, dataframe, label)

        logger.info('log file name {}'.format(log_file_name))

        best_config = None
        best_val_loss = float('+inf')
        best_estimator = None
        sample_size = None
        time_used = 0.0
        training_duration = 0
        best = None
        with training_log_reader(log_file_name) as reader:
            if record_id >= 0:
                best = reader.get_record(record_id)
            else:
                for record in reader.records():
                    time_used = record.total_search_time
                    if time_used > time_budget:
                        break
                    training_duration = time_used
                    val_loss = record.validation_loss
                    if val_loss <= best_val_loss or not train_best:
                        if val_loss == best_val_loss and train_best:
                            size = record.sample_size
                            if size > sample_size:
                                best = record
                                best_val_loss = val_loss
                                sample_size = size
                        else:
                            best = record
                            size = record.sample_size
                            best_val_loss = val_loss
                            sample_size = size
                if not training_duration:
                    from .model import BaseEstimator as Estimator
                    self._trained_estimator = Estimator()
                    self._trained_estimator.model = None
                    return training_duration
        if not best:
            return
        best_estimator = best.learner
        best_config = best.config
        sample_size = len(self._y_train_all) if train_full \
            else best.sample_size

        logger.info(
            'estimator = {}, config = {}, #training instances = {}'.format(
                best_estimator, best_config, sample_size))
        # Partially copied from fit() function
        # Initialize some attributes required for retrain_from_log
        self._state.task = task
        if self._state.task == 'classification':
            self._state.task = get_classification_objective(
                len(np.unique(self._y_train_all)))
            assert split_type in ["stratified", "uniform"]
            self._split_type = split_type
        else:
            self._split_type = "uniform"
        if record_id >= 0:
            eval_method = 'cv'
        elif eval_method == 'auto':
            eval_method = self._decide_eval_method(time_budget)
        self.modelcount = 0
        self._prepare_data(eval_method, split_ratio, n_splits)
        self._state.time_budget = None
        self._state.n_jobs = n_jobs
        self._trained_estimator = self._state._train_with_config(
            best_estimator, best_config, sample_size)[0]
        return training_duration

    def _decide_eval_method(self, time_budget):
        if self._state.X_val is not None:
            return 'holdout'
        nrow, dim = self._nrow, self._ndim
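        # An interpretive note: cv is affordable only when the data volume,
        # scaled against the hourly time budget, stays under SMALL_LARGE_THRES
        # and the number of rows is below CV_HOLDOUT_THRESHOLD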
        if nrow * dim / 0.9 < SMALL_LARGE_THRES * (
                time_budget / 3600) and nrow < CV_HOLDOUT_THRESHOLD:
            # time allows or sampling can be used and cv is necessary
            return 'cv'
        else:
            return 'holdout'

    def fit(self,
            X_train=None,
            y_train=None,
            dataframe=None,
            label=None,
            metric='auto',
            task='classification',
            n_jobs=-1,
            log_file_name='default.log',
            estimator_list='auto',
            time_budget=60,
            max_iter=1000000,
            sample=True,
            ensemble=False,
            eval_method='auto',
            log_type='better',
            model_history=False,
            split_ratio=SPLIT_RATIO,
            n_splits=N_SPLITS,
            log_training_metric=False,
            mem_thres=MEM_THRES,
            X_val=None,
            y_val=None,
            sample_weight_val=None,
            retrain_full=True,
            split_type="stratified",
            learner_selector='sample',
            hpo_method=None,
            verbose=1,
            **fit_kwargs):
        '''Find a model for a given task

        Args:
            X_train: A numpy array or a pandas dataframe of training data in
                shape (n, m)
            y_train: A numpy array or a pandas series of labels in shape (n,)
            dataframe: A dataframe of training data including label column
            label: A str of the label column name
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'f1', 'micro_f1', 'macro_f1',
                'log_loss', 'mae', 'mse', 'r2';
                if passing a customized metric function, the function needs to
                have the following signature:

                .. code-block:: python

                    def custom_metric(X_test, y_test, estimator, labels,
                                      X_train, y_train,
                                      weight_test=None, weight_train=None):
                        return metric_to_minimize, metrics_to_log

                which returns a float number as the minimization objective,
                and a tuple of floats as the metrics to log
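
                For example, a hedged sketch of a customized metric that
                minimizes validation log loss (scikit-learn, already a
                dependency of this module, is assumed):

                .. code-block:: python

                    from sklearn.metrics import log_loss

                    def custom_metric(X_test, y_test, estimator, labels,
                                      X_train, y_train,
                                      weight_test=None, weight_train=None):
                        # the estimator passed in supports predict_proba
                        # for classification tasks
                        y_pred = estimator.predict_proba(X_test)
                        test_loss = log_loss(y_test, y_pred, labels=labels,
                                             sample_weight=weight_test)
                        return test_loss, (test_loss,)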
            task: A string of the task type, e.g.,
                'classification', 'regression'
            n_jobs: An integer of the number of threads for training
            log_file_name: A string of the log file name
            estimator_list: A list of strings for estimator names, or 'auto'
                e.g.,

                .. code-block:: python

                    ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree']

            time_budget: A float number of the time budget in seconds
            max_iter: An integer of the maximal number of iterations
            sample: A boolean of whether to sample the training data during
                search
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout']
            split_ratio: A float of the validation data percentage for holdout
            n_splits: An integer of the number of folds for cross-validation
            log_type: A string of the log type, one of
                ['better', 'all']
                'better' only logs configs with better loss than previous iters
                'all' logs all the tried configs
            model_history: A boolean of whether to keep the history of best
                models in the history property. Make sure memory is large
                enough if setting to True.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes
            X_val: None | a numpy array or a pandas dataframe of validation data
            y_val: None | a numpy array or a pandas series of validation labels
            sample_weight_val: None | a numpy array of the sample weight of
                validation data
            verbose: int, default=1 | Controls the verbosity, higher means more
                messages
            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight
        '''
        self._start_time_flag = time.time()
        self._state.task = task
        self._state.log_training_metric = log_training_metric
        self._state.fit_kwargs = fit_kwargs
        self._state.weight_val = sample_weight_val
        self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
        self._search_states = {}  # key: estimator name; value: SearchState
        self._random = np.random.RandomState(RANDOM_SEED)
        self._learner_selector = learner_selector
        old_level = logger.getEffectiveLevel()
        self.verbose = verbose
        if verbose == 0:
            logger.setLevel(logging.WARNING)
        if self._state.task == 'classification':
            self._state.task = get_classification_objective(
                len(np.unique(self._y_train_all)))
            assert split_type in ["stratified", "uniform"]
            self._split_type = split_type
        else:
            self._split_type = "uniform"
        if eval_method == 'auto' or self._state.X_val is not None:
            eval_method = self._decide_eval_method(time_budget)
        self._state.eval_method = eval_method
        if (not mlflow or not mlflow.active_run()) and not logger.handlers:
            # Add the console handler.
            _ch = logging.StreamHandler()
            _ch.setFormatter(logger_formatter)
            logger.addHandler(_ch)
        logger.info("Evaluation method: {}".format(eval_method))

        self._retrain_full = retrain_full and (
            eval_method == 'holdout' and self._state.X_val is None)
        self._prepare_data(eval_method, split_ratio, n_splits)
        self._sample = sample and eval_method != 'cv' and (
            MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
        if 'auto' == metric:
            if 'binary' in self._state.task:
                metric = 'roc_auc'
            elif 'multi' in self._state.task:
                metric = 'log_loss'
            else:
                metric = 'r2'
        self._state.metric = metric
        if metric in ['r2', 'accuracy', 'roc_auc', 'f1', 'ap', 'micro_f1', 'macro_f1']:
            error_metric = f"1-{metric}"
        elif isinstance(metric, str):
            error_metric = metric
        else:
            error_metric = 'customized metric'
        logger.info(f'Minimizing error metric: {error_metric}')

        if 'auto' == estimator_list:
            estimator_list = ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree']
            if 'regression' != self._state.task:
                estimator_list += ['lrl1']
        for estimator_name in estimator_list:
            if estimator_name not in self._state.learner_classes:
                self.add_learner(
                    estimator_name,
                    get_estimator_class(self._state.task, estimator_name))
        # set up learner search space
        for estimator_name in estimator_list:
            estimator_class = self._state.learner_classes[estimator_name]
            self._search_states[estimator_name] = SearchState(
                learner_class=estimator_class,
                data_size=self._state.data_size, task=self._state.task,
            )
        logger.info("List of ML learners in AutoML Run: {}".format(
            estimator_list))
        self._hpo_method = hpo_method or 'cfo'
        with training_log_writer(log_file_name) as save_helper:
            self._training_log = save_helper
            self._state.time_budget = time_budget
            self.estimator_list = estimator_list
            self._ensemble = ensemble
            self._max_iter = max_iter
            self._mem_thres = mem_thres
            self._log_type = log_type
            self.split_ratio = split_ratio
            self._save_model_history = model_history
            self._state.n_jobs = n_jobs
            self._search()
            logger.info("fit succeeded")
        if verbose == 0:
            logger.setLevel(old_level)

    def _search(self):
        # initialize the search_states
        self._eci = []
        self._state.best_loss = float('+inf')
        self._state.time_from_start = 0
        self._estimator_index = None
        self._best_iteration = 0
        self._model_history = {}
        self._config_history = {}
        self._max_iter_per_learner = 1000000  # TODO
        self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
        self._fullsize_reached = False
        self._trained_estimator = None
        self._best_estimator = None
        self._retrained_config = {}
        est_retrain_time = next_trial_time = 0
        best_config_sig = None
        # use ConcurrencyLimiter to limit the amount of concurrency when
        # using a search algorithm
        better = True  # whether we find a better model in one trial
        if self._ensemble:
            self.best_model = {}
        try:
            from ray.tune.suggest import ConcurrencyLimiter
        except ImportError:
            from .searcher.suggestion import ConcurrencyLimiter
        if self._hpo_method in ('cfo', 'grid'):
            from flaml import CFO as SearchAlgo
        elif 'optuna' == self._hpo_method:
            try:
                from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
            except ImportError:
                from .searcher.suggestion import OptunaSearch as SearchAlgo
        elif 'bs' == self._hpo_method:
            from flaml import BlendSearch as SearchAlgo
        else:
            raise NotImplementedError

        for self._track_iter in range(self._max_iter):
            if self._estimator_index is None:
                estimator = self.estimator_list[0]
            else:
                estimator = self._select_estimator(self.estimator_list)
                if not estimator:
                    break
            logger.info(
                f"iteration {self._track_iter}, current learner {estimator}")
            search_state = self._search_states[estimator]
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
            budget_left = time_left if not self._retrain_full or better or (
                not self.best_estimator) or self._search_states[
                self.best_estimator].sample_size < self._state.data_size \
                else time_left - est_retrain_time
            if not search_state.search_alg:
                search_state.training_function = partial(
                    AutoMLState._compute_with_config_base,
                    self._state, estimator)
                search_space = search_state.search_space
                if self._sample:
                    prune_attr = 'FLAML_sample_size'
                    min_resource = MIN_SAMPLE_TRAIN
                    max_resource = self._state.data_size
                else:
                    prune_attr = min_resource = max_resource = None
                learner_class = self._state.learner_classes.get(estimator)
                if 'grid' == self._hpo_method:  # for synthetic exp only
                    points_to_evaluate = []
                    space = search_space
                    keys = list(space.keys())
                    domain0, domain1 = space[keys[0]], space[keys[1]]
                    for x1 in range(domain0.lower, domain0.upper + 1):
                        for x2 in range(domain1.lower, domain1.upper + 1):
                            points_to_evaluate.append({
                                keys[0]: x1,
                                keys[1]: x2,
                            })
                    self._max_iter_per_learner = len(points_to_evaluate)
                    low_cost_partial_config = None
                else:
                    points_to_evaluate = [search_state.init_config]
                    low_cost_partial_config = search_state.low_cost_partial_config
                if self._hpo_method in ('bs', 'cfo', 'grid'):
                    algo = SearchAlgo(
                        metric='val_loss', mode='min', space=search_space,
                        points_to_evaluate=points_to_evaluate,
                        low_cost_partial_config=low_cost_partial_config,
                        cat_hp_cost=search_state.cat_hp_cost,
                        prune_attr=prune_attr,
                        min_resource=min_resource,
                        max_resource=max_resource,
                        resources_per_trial={"cpu": self._state.n_jobs,
                                             "mem": self._mem_thres},
                        mem_size=learner_class.size)
                else:
                    algo = SearchAlgo(
                        metric='val_loss', mode='min', space=search_space,
                        points_to_evaluate=points_to_evaluate,
                    )
                search_state.search_alg = ConcurrencyLimiter(algo,
                                                             max_concurrent=1)
            else:
                search_space = None
                if self._hpo_method in ('bs', 'cfo'):
                    search_state.search_alg.searcher.set_search_properties(
                        config={
                            'metric_target': self._state.best_loss,
                        },
                    )
            start_run_time = time.time()
            analysis = tune.run(
                search_state.training_function,
                search_alg=search_state.search_alg,
                time_budget_s=budget_left,
                verbose=max(self.verbose - 1, 0),
                use_ray=False)
            time_used = time.time() - start_run_time
            better = False
            if analysis.trials:
                search_state.update(analysis, time_used=time_used,
                                    save_model_history=self._save_model_history)
                if self._estimator_index is None:
                    eci_base = search_state.init_eci
                    self._eci.append(search_state.estimated_cost4improvement)
                    for e in self.estimator_list[1:]:
                        self._eci.append(self._search_states[e].init_eci
                                         / eci_base * self._eci[0])
                    self._estimator_index = 0
                self._state.time_from_start = time.time() - self._start_time_flag
                # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
                if search_state.sample_size == self._state.data_size:
                    self._iter_per_learner[estimator] += 1
                    if not self._fullsize_reached:
                        self._fullsize_reached = True
                if search_state.best_loss < self._state.best_loss:
                    best_config_sig = estimator + search_state.get_hist_config_sig(
                        self.data_size_full,
                        search_state.best_config)
                    self._state.best_loss = search_state.best_loss
                    self._best_estimator = estimator
                    est_retrain_time = search_state.est_retrain_time(
                        self.data_size_full) if (
                        best_config_sig not in self._retrained_config) else 0
                    self._config_history[self._track_iter] = (
                        estimator,
                        search_state.best_config,
                        self._state.time_from_start)
                    if self._save_model_history:
                        self._model_history[
                            self._track_iter] = search_state.trained_estimator.model
                    elif self._trained_estimator:
                        del self._trained_estimator
                        self._trained_estimator = None
                    self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = self._track_iter
                    better = True
                    next_trial_time = search_state.time2eval_best
                if better or self._log_type == 'all':
                    self._training_log.append(
                        self._iter_per_learner[estimator],
                        search_state.train_loss,
                        search_state.trial_time,
                        self._state.time_from_start,
                        search_state.val_loss,
                        search_state.config,
                        search_state.best_loss,
                        search_state.best_config,
                        estimator,
                        search_state.sample_size)
                    if mlflow is not None and mlflow.active_run():
                        with mlflow.start_run(nested=True):
                            mlflow.log_metric('iter_counter',
                                              self._iter_per_learner[estimator])
                            mlflow.log_param('train_loss',
                                             search_state.train_loss)
                            mlflow.log_metric('trial_time',
                                              search_state.trial_time)
                            mlflow.log_metric('total_search_time',
                                              self._state.time_from_start)
                            mlflow.log_metric('validation_loss',
                                              search_state.val_loss)
                            mlflow.log_param('config',
                                             search_state.config)
                            mlflow.log_param('learner',
                                             estimator)
                            mlflow.log_param('sample_size',
                                             search_state.sample_size)
                            mlflow.log_metric('best_validation_loss',
                                              search_state.best_loss)
                            mlflow.log_param('best_config',
                                             search_state.best_config)
                            mlflow.log_param('best_learner',
                                             self._best_estimator)
                logger.info(
                    " at {:.1f}s,\tbest {}'s error={:.4f},\tbest {}'s error={:.4f}".format(
                        self._state.time_from_start,
                        estimator,
                        search_state.best_loss,
                        self._best_estimator,
                        self._state.best_loss))
            else:
                logger.info(f"not enough budget for learner {estimator}")
                if self._estimator_index is not None:
                    self.estimator_list.remove(estimator)
                    self._estimator_index -= 1
            if self._retrain_full and best_config_sig and not better and (
                self._search_states[
                    self._best_estimator].sample_size == self._state.data_size
            ) and (est_retrain_time
                    <= self._state.time_budget - self._state.time_from_start
                    <= est_retrain_time + next_trial_time):
                self._trained_estimator, \
                    retrain_time = self._state._train_with_config(
                        self._best_estimator,
                        self._search_states[self._best_estimator].best_config,
                        self.data_size_full)
                logger.info("retrain {} for {:.1f}s".format(
                    estimator, retrain_time))
                self._retrained_config[best_config_sig] = retrain_time
                est_retrain_time = 0
            self._state.time_from_start = time.time() - self._start_time_flag
            if (self._state.time_from_start >= self._state.time_budget
                    or not self.estimator_list):
                break
            if self._ensemble and self._best_estimator:
                time_left = self._state.time_budget - self._state.time_from_start
                time_ensemble = self._search_states[
                    self._best_estimator].time2eval_best
                if time_left < time_ensemble < 2 * time_left:
                    break
        # Add a checkpoint for the current best config to the log.
        self._training_log.checkpoint()
        if self._best_estimator:
            self._selected = self._search_states[self._best_estimator]
            self._trained_estimator = self._selected.trained_estimator
            self.modelcount = sum(
                search_state.total_iter
                for search_state in self._search_states.values())
            if self._trained_estimator:
                logger.info(f'selected model: {self._trained_estimator.model}')
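            # Ensemble (an interpretive note): stack the two best learners,
            # plus any other learner whose best loss is within 4x of the
            # overall best, with the single best model as the final estimator.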
            if self._ensemble:
                search_states = list(x for x in self._search_states.items()
                                     if x[1].trained_estimator)
                search_states.sort(key=lambda x: x[1].best_loss)
                estimators = [(x[0], x[1].trained_estimator)
                              for x in search_states[:2]]
                estimators += [
                    (x[0], x[1].trained_estimator) for x in search_states[2:]
                    if x[1].best_loss < 4 * self._selected.best_loss]
                logger.info(estimators)
                if len(estimators) <= 1:
                    return
                if self._state.task != "regression":
                    from sklearn.ensemble import StackingClassifier as Stacker
                    for e in estimators:
                        e[1]._estimator_type = 'classifier'
                else:
                    from sklearn.ensemble import StackingRegressor as Stacker
                best_m = self._trained_estimator
                stacker = Stacker(estimators, best_m, n_jobs=self._state.n_jobs,
                                  passthrough=True)
                if self._sample_weight_full is not None:
                    self._state.fit_kwargs[
                        'sample_weight'] = self._sample_weight_full
                stacker.fit(self._X_train_all, self._y_train_all,
                            **self._state.fit_kwargs)
                logger.info(f'ensemble: {stacker}')
                self._trained_estimator = stacker
                self._trained_estimator.model = stacker
        else:
            self._selected = self._trained_estimator = None
            self.modelcount = 0

    def __del__(self):
        if hasattr(self, '_trained_estimator') and self._trained_estimator \
                and hasattr(self._trained_estimator, 'cleanup'):
            self._trained_estimator.cleanup()
            del self._trained_estimator

    def _select_estimator(self, estimator_list):
        if self._learner_selector == 'roundrobin':
            self._estimator_index += 1
            if self._estimator_index == len(estimator_list):
                self._estimator_index = 0
            return estimator_list[self._estimator_index]
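        # An interpretive note: estimate each learner's cost for improvement
        # (ECI); if an untried learner has the smallest ECI, pick it greedily,
        # otherwise sample a tried learner with probability proportional to
        # 1/ECI (the inv list).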
        min_estimated_cost, selected = np.inf, None
        inv = []
        untried_exists = False
        for i, estimator in enumerate(estimator_list):
            if estimator in self._search_states and (
                self._search_states[estimator].sample_size
            ):  # sample_size=None means no result yet
                search_state = self._search_states[estimator]
                if (self._search_states[estimator].time2eval_best
                        > self._state.time_budget - self._state.time_from_start
                        or self._iter_per_learner[estimator]
                        >= self._max_iter_per_learner):
                    inv.append(0)
                    continue
                estimated_cost = search_state.estimated_cost4improvement
                if search_state.sample_size < self._state.data_size:
                    estimated_cost = min(
                        estimated_cost,
                        search_state.time2eval_best * min(
                            SAMPLE_MULTIPLY_FACTOR,
                            self._state.data_size / search_state.sample_size))
                gap = search_state.best_loss - self._state.best_loss
                if gap > 0 and not self._ensemble:
                    delta_loss = (search_state.best_loss_old
                                  - search_state.best_loss) or search_state.best_loss
                    delta_time = (search_state.total_time_used
                                  - search_state.time_best_found_old) or 1e-10
                    speed = delta_loss / delta_time
                    if speed:
                        estimated_cost = max(2 * gap / speed, estimated_cost)
                if estimated_cost == 0:
                    estimated_cost = 1e-10
                inv.append(1 / estimated_cost)
            else:
                estimated_cost = self._eci[i]
                inv.append(0)
                untried_exists = True
            if estimated_cost < min_estimated_cost:
                min_estimated_cost = estimated_cost
                selected = estimator
        if untried_exists or not selected:
            state = self._search_states.get(selected)
            if not (state and state.sample_size):
                return selected
        s = sum(inv)
        p = self._random.rand()
        q = 0
        for i in range(len(inv)):
            if inv[i]:
                q += inv[i] / s
                if p < q:
                    return estimator_list[i]